TGUI  1.5
Loading...
Searching...
No Matches
Utf.hpp
1
2//
3// TGUI - Texus' Graphical User Interface
4// Copyright (C) 2012-2024 Bruno Van de Velde (vdv_b@tgui.eu)
5//
6// This software is provided 'as-is', without any express or implied warranty.
7// In no event will the authors be held liable for any damages arising from the use of this software.
8//
9// Permission is granted to anyone to use this software for any purpose,
10// including commercial applications, and to alter it and redistribute it freely,
11// subject to the following restrictions:
12//
13// 1. The origin of this software must not be misrepresented;
14// you must not claim that you wrote the original software.
15// If you use this software in a product, an acknowledgment
16// in the product documentation would be appreciated but is not required.
17//
18// 2. Altered source versions must be plainly marked as such,
19// and must not be misrepresented as being the original software.
20//
21// 3. This notice may not be removed or altered from any source distribution.
22//
24
25#ifndef TGUI_UTF_HPP
26#define TGUI_UTF_HPP
27
28#include <TGUI/Config.hpp>
29
30#if !TGUI_EXPERIMENTAL_USE_STD_MODULE
31 #include <cstdint>
32 #include <string>
33 #include <array>
34#endif
35
37
38TGUI_MODULE_EXPORT namespace tgui
39{
40 namespace utf
41 {
/// @brief Encodes a single UTF-32 codepoint as UTF-8 and appends the resulting bytes to a string
///
/// @param input       Codepoint to encode
/// @param outStrUtf8  String to which the encoded bytes are appended
///
/// Codepoints that cannot be represented in well-formed UTF-8 are skipped without appending anything:
/// values above U+10FFFF and the entire surrogate range U+D800..U+DFFF.
template <typename CharT> // CharT is either char or char8_t
void encodeCharUtf8(char32_t input, std::basic_string<CharT>& outStrUtf8)
{
    // ASCII characters are stored as a single, identical byte
    if (input < 128)
    {
        outStrUtf8.push_back(static_cast<CharT>(input));
        return;
    }

    // Only encode valid codepoints. Both high (D800-DBFF) and low (DC00-DFFF) surrogates are rejected,
    // a lone surrogate of either kind has no well-formed UTF-8 representation.
    if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDFFF)))
        return;

    // Determine the length of the sequence and the marker bits of its leading byte
    std::size_t bytesToWrite;
    std::uint8_t firstByteMask;
    if (input < 0x800)
    {
        bytesToWrite = 2;
        firstByteMask = 0xC0;
    }
    else if (input < 0x10000)
    {
        bytesToWrite = 3;
        firstByteMask = 0xE0;
    }
    else
    {
        bytesToWrite = 4;
        firstByteMask = 0xF0;
    }

    // Fill the continuation bytes from last to first: each one holds the lowest 6 bits
    // of what is left of the codepoint, tagged with the 10xxxxxx marker.
    std::array<CharT, 4> bytes{};
    for (std::size_t i = bytesToWrite - 1; i > 0; --i)
    {
        bytes[i] = static_cast<CharT>((input | 0x80) & 0xBF);
        input >>= 6;
    }

    // The remaining high bits go into the leading byte together with the length marker
    bytes[0] = static_cast<CharT>(input | firstByteMask);

    // Add the used part of the buffer to the output
    outStrUtf8.append(bytes.begin(), bytes.begin() + static_cast<std::ptrdiff_t>(bytesToWrite));
}
89
/// @brief Decodes one UTF-8 encoded character and appends the resulting codepoint to a UTF-32 string
///
/// @param inputCharIt  Iterator to the first byte of the character to decode
/// @param inputEndIt   Iterator to the end of the input
/// @param outStrUtf32  String to which the decoded codepoint is appended
///
/// @return Iterator to the byte following the decoded character, or inputEndIt when the input was
///         empty or ended in the middle of a character (nothing is appended in those cases)
///
/// The sequence length is derived from the leading byte only, the trailing bytes are not validated.
template <typename CharIt> // CharIt is an iterator for a string containing either char or char8_t
CharIt decodeCharUtf8(CharIt inputCharIt, CharIt inputEndIt, std::u32string& outStrUtf32)
{
    // There is nothing to decode in an empty range, and the end iterator must not be dereferenced
    if (inputCharIt == inputEndIt)
        return inputEndIt;

    // ASCII characters are copied as-is
    const std::uint8_t leadByte = static_cast<std::uint8_t>(*inputCharIt);
    if (leadByte < 128)
    {
        outStrUtf32.push_back(static_cast<char32_t>(leadByte));
        return ++inputCharIt;
    }

    // Some useful precomputed data.
    // offsetsMap[n] is the sum of the marker bits (leading byte and n trailing bytes) that gets
    // accumulated together with the payload and has to be subtracted again afterwards.
    static const std::uint32_t offsetsMap[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };

    // trailingMap[leadByte - 128] is the number of bytes that follow the leading byte
    static const std::uint8_t trailingMap[128] =
    {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
    };

    // Decode the character
    const std::uint8_t trailingBytes = trailingMap[leadByte - 128];
    const std::uint32_t offset = offsetsMap[trailingBytes];
    const auto remainingBytes = std::distance(inputCharIt, inputEndIt) - 1;
    if (remainingBytes >= static_cast<decltype(remainingBytes)>(trailingBytes))
    {
        // Accumulate every byte except the last one, making room for 6 more bits each time
        char32_t outputChar = 0;
        for (std::uint8_t i = 0; i < trailingBytes; ++i)
        {
            outputChar += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt));
            ++inputCharIt;
            outputChar <<= 6;
        }

        // Add the last byte and strip the marker bits that were accumulated along the way
        outputChar += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt));
        ++inputCharIt;
        outputChar -= offset;
        outStrUtf32.push_back(outputChar);
    }
    else // Incomplete character
        inputCharIt = inputEndIt;

    return inputCharIt;
}
138
139#if defined(__cpp_lib_char8_t) && (__cpp_lib_char8_t >= 201811L)
145 TGUI_NODISCARD inline std::u8string convertUtf32toUtf8(const std::u32string& strUtf32)
146 {
147 std::u8string outStrUtf8;
148 outStrUtf8.reserve(strUtf32.length() + 1);
149 for (const char32_t& codepoint : strUtf32)
150 encodeCharUtf8(codepoint, outStrUtf8);
151
152 return outStrUtf8;
153 }
154#endif
155
162 template <typename CharIt>
163 TGUI_NODISCARD std::u32string convertUtf8toUtf32(CharIt inputBegin, CharIt inputEnd)
164 {
165 std::u32string outStrUtf32;
166 outStrUtf32.reserve(static_cast<std::size_t>((inputEnd - inputBegin) + 1));
167
168 auto it = inputBegin;
169 while (it < inputEnd)
170 it = decodeCharUtf8(it, inputEnd, outStrUtf32);
171
172 return outStrUtf32;
173 }
174
/// @brief Converts a range of UTF-16 code units into a UTF-32 string
///
/// @param inputBegin  Iterator to the first code unit of the input
/// @param inputEnd    Iterator to the end of the input
///
/// @return String containing the decoded codepoints
///
/// A high surrogate that is not followed by a low surrogate is dropped. The code unit that follows it
/// is not discarded with it, it is decoded on its own. Code units outside of the high surrogate range
/// (including a low surrogate that isn't preceded by a high surrogate) are copied unchanged.
template <typename U16CharIt>
TGUI_NODISCARD std::u32string convertUtf16toUtf32(U16CharIt inputBegin, U16CharIt inputEnd)
{
    std::u32string outStrUtf32;
    outStrUtf32.reserve(static_cast<std::size_t>((inputEnd - inputBegin) + 1));

    auto it = inputBegin;
    while (it < inputEnd)
    {
        const char16_t first = *it++;

        // Copy the character if it isn't the start of a surrogate pair
        if ((first < 0xD800) || (first > 0xDBFF))
        {
            outStrUtf32.push_back(static_cast<char32_t>(first));
            continue;
        }

        // We need another code unit to complete the pair, a high surrogate at the very end is dropped
        if (it == inputEnd)
            break;

        // Only consume the second code unit when it really completes the pair. If it doesn't, the high
        // surrogate is dropped and the next iteration handles the second code unit as a new character.
        const char16_t second = *it;
        if ((second >= 0xDC00) && (second <= 0xDFFF))
        {
            ++it;
            outStrUtf32.push_back(((static_cast<char32_t>(first) - 0xD800) << 10) + (static_cast<char32_t>(second) - 0xDC00) + 0x0010000);
        }
    }

    return outStrUtf32;
}
210
/// @brief Converts a range of wide characters into a UTF-32 string
///
/// @param inputBegin  Iterator to the first wide character of the input
/// @param inputEnd    Iterator to the end of the input
///
/// @return String in which every input element has been cast to a char32_t
template <typename WCharIt>
TGUI_NODISCARD std::u32string convertWidetoUtf32(WCharIt inputBegin, WCharIt inputEnd)
{
    std::u32string converted;

    const auto elementCount = inputEnd - inputBegin;
    converted.reserve(static_cast<std::size_t>(elementCount + 1));

    // std::wstring uses UCS-2 on Windows and UCS-4 on unix, so each element can be cast directly
    WCharIt cursor = inputBegin;
    while (cursor != inputEnd)
    {
        converted.push_back(static_cast<char32_t>(*cursor));
        ++cursor;
    }

    return converted;
}
228
234 TGUI_NODISCARD inline std::string convertUtf32toStdStringUtf8(const std::u32string& strUtf32)
235 {
236 std::string outStrUtf8;
237 outStrUtf8.reserve(strUtf32.length() + 1);
238 for (const char32_t codepoint : strUtf32)
239 encodeCharUtf8(codepoint, outStrUtf8);
240
241 return outStrUtf8;
242 }
243
/// @brief Converts a UTF-32 string into a wide string
///
/// @param strUtf32  Codepoints to convert
///
/// @return Wide string with one element per converted codepoint
///
/// When wchar_t is 4 bytes every codepoint is copied. Otherwise only codepoints that fit in a single
/// wide character and lie outside the surrogate range are kept, all other codepoints are dropped.
TGUI_NODISCARD inline std::wstring convertUtf32toWide(const std::u32string& strUtf32)
{
    std::wstring wide;
    wide.reserve(strUtf32.size() + 1);

    TGUI_IF_CONSTEXPR (sizeof(wchar_t) == 4)
    {
        // On Unix, wide characters are UCS-4 and we can just copy the characters
        for (const char32_t ch : strUtf32)
            wide.push_back(static_cast<wchar_t>(ch));
    }
    else
    {
        // On Windows, wide characters are UCS-2. Characters that don't fit within a single wide character are dropped.
        for (const char32_t ch : strUtf32)
        {
            const bool fitsInSingleElement = (ch <= 0xFFFF);
            const bool isSurrogate = (ch >= 0xD800) && (ch <= 0xDFFF);
            if (fitsInSingleElement && !isSurrogate)
                wide.push_back(static_cast<wchar_t>(ch));
        }
    }

    return wide;
}
272
/// @brief Converts a UTF-32 string into a UTF-16 string
///
/// @param strUtf32  Codepoints to encode
///
/// @return String containing the UTF-16 encoding of the valid codepoints
///
/// Codepoints in the surrogate range U+D800..U+DFFF and codepoints above U+10FFFF are skipped.
TGUI_NODISCARD inline std::u16string convertUtf32toUtf16(const std::u32string& strUtf32)
{
    std::u16string utf16;
    utf16.reserve(strUtf32.size() + 1);

    for (const char32_t ch : strUtf32)
    {
        // Invalid character (greater than the maximum Unicode value)
        if (ch > 0x0010FFFF)
            continue;

        // Characters beyond the first 0x10000 codepoints have to be split into two UTF-16 elements
        if (ch > 0xFFFF)
        {
            const char32_t reduced = ch - 0x0010000;
            utf16.push_back(static_cast<char16_t>((reduced >> 10) + 0xD800));
            utf16.push_back(static_cast<char16_t>((reduced & 0x3FFUL) + 0xDC00));
            continue;
        }

        // The codepoint fits inside a single element, copy it unless it lies in the surrogate range
        if ((ch < 0xD800) || (ch > 0xDFFF))
            utf16.push_back(static_cast<char16_t>(ch));
    }

    return utf16;
}
303
305 }
306}
307
309
310#endif // TGUI_UTF_HPP
Namespace that contains all TGUI functions and classes.
Definition AbsoluteOrRelativeValue.hpp:38