TGUI  0.10-dev
Utf.hpp
1
2//
3// TGUI - Texus' Graphical User Interface
4// Copyright (C) 2012-2021 Bruno Van de Velde (vdv_b@tgui.eu)
5//
6// This software is provided 'as-is', without any express or implied warranty.
7// In no event will the authors be held liable for any damages arising from the use of this software.
8//
9// Permission is granted to anyone to use this software for any purpose,
10// including commercial applications, and to alter it and redistribute it freely,
11// subject to the following restrictions:
12//
13// 1. The origin of this software must not be misrepresented;
14// you must not claim that you wrote the original software.
15// If you use this software in a product, an acknowledgment
16// in the product documentation would be appreciated but is not required.
17//
18// 2. Altered source versions must be plainly marked as such,
19// and must not be misrepresented as being the original software.
20//
21// 3. This notice may not be removed or altered from any source distribution.
22//
24
25
26#ifndef TGUI_UTF_HPP
27#define TGUI_UTF_HPP
28
29#include <TGUI/Config.hpp>
30#include <string>
31#include <array>
32
34
35// Disable warning in Visual Studio about being able to use "if constexpr".
36// The code would use "if constexpr" if the compiler would just define __cpp_if_constexpr
37#if defined TGUI_SYSTEM_WINDOWS && defined _MSC_VER
38 #pragma warning(push)
39 #pragma warning(disable:4127)
40#endif
41
42namespace tgui
43{
44 namespace utf
45 {
/// @brief Encodes a single UTF-32 codepoint as UTF-8 and appends the resulting bytes to a string
///
/// @param input       Codepoint to encode
/// @param outStrUtf8  String to which the encoded bytes are appended
///
/// Nothing is appended when the codepoint is a UTF-16 surrogate (U+D800 to U+DFFF)
/// or when it is larger than U+10FFFF.
template <typename CharT> // CharT is either char or char8_t
void encodeCharUtf8(char32_t input, std::basic_string<CharT>& outStrUtf8)
{
    // ASCII characters are stored as a single byte
    if (input < 128)
    {
        outStrUtf8.push_back(static_cast<CharT>(input));
        return;
    }

    // Both halves of the surrogate range and everything above U+10FFFF are not valid
    // codepoints, so they are skipped (the UTF-16 and wide conversions skip the same range).
    if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDFFF)))
        return;

    // Get the number of bytes to write (the input is known to be in [0x80, 0x10FFFF] here)
    std::size_t bytesToWrite;
    if (input < 0x800)
        bytesToWrite = 2;
    else if (input < 0x10000)
        bytesToWrite = 3;
    else
        bytesToWrite = 4;

    // Marker bits of the leading byte, indexed by the total length of the sequence
    static const std::uint8_t firstByteMask[5] = { 0, 0, 0xC0, 0xE0, 0xF0 };

    // Fill the continuation bytes from last to first: each holds 6 payload bits behind a "10" prefix
    std::array<CharT, 4> bytes;
    for (std::size_t index = bytesToWrite - 1; index > 0; --index)
    {
        bytes[index] = static_cast<CharT>((input | 0x80) & 0xBF);
        input >>= 6;
    }

    // The remaining high bits go into the leading byte together with the length marker
    bytes[0] = static_cast<CharT>(input | firstByteMask[bytesToWrite]);

    // Add them to the output
    outStrUtf8.append(bytes.data(), bytesToWrite);
}
87
88
/// @brief Decodes one UTF-8 encoded character and appends it to a UTF-32 string
///
/// @param inputCharIt  Iterator to the first byte of the character to decode
/// @param inputEndIt   Iterator to the end of the input
/// @param outStrUtf32  String to which the decoded codepoint is appended
///
/// @return Iterator behind the bytes that were consumed. This equals inputEndIt when
///         the range was empty or when the last character was incomplete.
///
/// The length of the sequence is derived from the leading byte only; the continuation
/// bytes are accumulated without being validated. Nothing is appended when the range
/// is empty or when the sequence is cut off by the end of the input.
template <typename CharIt> // CharIt is an iterator for a string containing either char or char8_t
CharIt decodeCharUtf8(CharIt inputCharIt, CharIt inputEndIt, std::u32string& outStrUtf32)
{
    // An empty range holds nothing to decode, and the end iterator must never be dereferenced
    if (!(inputCharIt < inputEndIt))
        return inputEndIt;

    // ASCII characters are stored as a single byte
    const std::uint8_t leadByte = static_cast<std::uint8_t>(*inputCharIt);
    if (leadByte < 128)
    {
        outStrUtf32.push_back(static_cast<char32_t>(leadByte));
        return ++inputCharIt;
    }

    // Some useful precomputed data:
    // offsetsMap holds the value to subtract after summing up the raw bytes (indexed by the amount of trailing bytes),
    // trailingMap holds the amount of trailing bytes for every leading byte in the range 0x80 to 0xFF.
    static const std::uint32_t offsetsMap[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
    static const std::uint8_t trailingMap[128] =
    {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
    };

    // Decode the character
    std::uint8_t trailingBytes = trailingMap[leadByte - 128];
    const std::uint32_t offset = offsetsMap[trailingBytes];
    if (inputCharIt + trailingBytes < inputEndIt)
    {
        // Sum up all bytes except the last one, shifting 6 bits to the left after each of them
        char32_t outputChar = 0;
        while (trailingBytes > 0)
        {
            outputChar += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt++));
            outputChar <<= 6;
            --trailingBytes;
        }

        // Add the last byte and remove the marker bits that were summed up along with the payload
        outputChar += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt++));
        outputChar -= offset;
        outStrUtf32.push_back(outputChar);
    }
    else // Incomplete character
        inputCharIt = inputEndIt;

    return inputCharIt;
}
137
138
#if defined(__cpp_lib_char8_t) && (__cpp_lib_char8_t >= 201811L)
/// @brief Converts a UTF-32 string into a UTF-8 string that is stored in an std::u8string
///
/// @param strUtf32  String of UTF-32 codepoints
///
/// @return The codepoints encoded one after the other by encodeCharUtf8
inline std::u8string convertUtf32toUtf8(const std::u32string& strUtf32)
{
    std::u8string encoded;
    encoded.reserve(strUtf32.length() + 1);

    for (auto it = strUtf32.begin(); it != strUtf32.end(); ++it)
    {
        encodeCharUtf8(*it, encoded);
    }

    return encoded;
}
#endif
155
/// @brief Converts a range of UTF-8 encoded bytes into a UTF-32 string
///
/// @param inputBegin  Iterator to the first byte of the input
/// @param inputEnd    Iterator to the end of the input
///
/// @return The codepoints that decodeCharUtf8 produced for the range
template <typename CharIt>
std::u32string convertUtf8toUtf32(CharIt inputBegin, CharIt inputEnd)
{
    std::u32string decoded;
    decoded.reserve((inputEnd - inputBegin) + 1);

    // Every call decodes one character and returns the position at which the next one starts
    for (CharIt cursor = inputBegin; cursor < inputEnd; )
    {
        cursor = decodeCharUtf8(cursor, inputEnd, decoded);
    }

    return decoded;
}
174
175
/// @brief Converts a range of UTF-16 code units into a UTF-32 string
///
/// @param inputBegin  Iterator to the first code unit of the input
/// @param inputEnd    Iterator to the end of the input
///
/// @return String with one codepoint per decoded character
///
/// A high surrogate (0xD800 to 0xDBFF) that is not followed by a low surrogate
/// (0xDC00 to 0xDFFF) is dropped, while the code unit behind it is still processed.
/// Every code unit outside the high surrogate range is copied unchanged, which
/// includes low surrogates that have no preceding high surrogate.
template <typename U16CharIt>
std::u32string convertUtf16toUtf32(U16CharIt inputBegin, U16CharIt inputEnd)
{
    std::u32string outStrUtf32;
    outStrUtf32.reserve((inputEnd - inputBegin) + 1);

    auto it = inputBegin;
    while (it < inputEnd)
    {
        const char16_t first = *it++;

        // Copy the character if it isn't the first half of a surrogate pair
        if ((first < 0xD800) || (first > 0xDBFF))
        {
            outStrUtf32.push_back(static_cast<char32_t>(first));
            continue;
        }

        // We need to read another character
        if (it == inputEnd)
            break;

        // Only consume the next code unit when it completes the pair. Otherwise the high
        // surrogate is unpaired and gets dropped, and the next code unit is handled by the
        // following iteration instead of being thrown away together with it.
        const char16_t second = *it;
        if ((second >= 0xDC00) && (second <= 0xDFFF))
        {
            ++it;
            outStrUtf32.push_back(((static_cast<char32_t>(first) - 0xD800) << 10) + (static_cast<char32_t>(second) - 0xDC00) + 0x0010000);
        }
    }

    return outStrUtf32;
}
212
213
/// @brief Converts a range of wide characters into a UTF-32 string
///
/// @param inputBegin  Iterator to the first wide character of the input
/// @param inputEnd    Iterator to the end of the input
///
/// @return String that contains every wide character of the range, cast to char32_t
///
/// Each element is converted on its own, no surrogate pairs are combined.
template <typename WCharIt>
std::u32string convertWidetoUtf32(WCharIt inputBegin, WCharIt inputEnd)
{
    std::u32string converted;
    converted.reserve((inputEnd - inputBegin) + 1);

    // Wide strings are treated as UCS-2 (Windows) or UCS-4 (unix), so each element maps to one codepoint
    WCharIt cursor = inputBegin;
    while (cursor != inputEnd)
    {
        converted.push_back(static_cast<char32_t>(*cursor));
        ++cursor;
    }

    return converted;
}
232
233
/// @brief Converts a UTF-32 string into a Latin-1 string
///
/// @param strUtf32  String of UTF-32 codepoints
///
/// @return String with one char per codepoint below 256, in the original order
///
/// Codepoints of 256 and above can't be stored in a single byte and are left out.
inline std::string convertUtf32toLatin1(const std::u32string& strUtf32)
{
    std::string latin1;
    latin1.reserve(strUtf32.length() + 1);

    for (auto it = strUtf32.begin(); it != strUtf32.end(); ++it)
    {
        // Skip everything that doesn't fit in one byte
        if (*it >= 256)
            continue;

        latin1.push_back(static_cast<char>(*it));
    }

    return latin1;
}
251
252
258 inline std::string convertUtf32toStdStringUtf8(const std::u32string& strUtf32)
259 {
260 std::string outStrUtf8;
261 outStrUtf8.reserve(strUtf32.length() + 1);
262 for (const char32_t codepoint : strUtf32)
263 encodeCharUtf8(codepoint, outStrUtf8);
264
265 return outStrUtf8;
266 }
267
268
/// @brief Converts a UTF-32 string into a wide string
///
/// @param strUtf32  String of UTF-32 codepoints
///
/// @return Wide string with at most one wide character per codepoint
///
/// When wchar_t is 4 bytes wide, every codepoint is copied. Otherwise only codepoints
/// up to 0xFFFF that lie outside the surrogate range (0xD800 to 0xDFFF) are copied
/// and all other codepoints are dropped.
inline std::wstring convertUtf32toWide(const std::u32string& strUtf32)
{
    std::wstring wide;
    wide.reserve(strUtf32.length() + 1);

#if defined(__cpp_if_constexpr) && (__cpp_if_constexpr >= 201606L)
    if constexpr (sizeof(wchar_t) != 4)
#else
    if (sizeof(wchar_t) != 4)
#endif
    {
        // Wide characters are treated as UCS-2 here (Windows): keep only what fits in a single wide character
        for (const char32_t cp : strUtf32)
        {
            const bool isSurrogate = (cp >= 0xD800) && (cp <= 0xDFFF);
            if (!isSurrogate && (cp <= 0xFFFF))
                wide.push_back(static_cast<wchar_t>(cp));
        }
    }
    else
    {
        // Wide characters are treated as UCS-4 here (unix): every codepoint fits as it is
        for (const char32_t cp : strUtf32)
            wide.push_back(static_cast<wchar_t>(cp));
    }

    return wide;
}
301
302
/// @brief Converts a UTF-32 string into a UTF-16 string
///
/// @param strUtf32  String of UTF-32 codepoints
///
/// @return String of UTF-16 code units
///
/// Codepoints up to 0xFFFF become a single code unit and codepoints from 0x10000 to
/// 0x10FFFF become a surrogate pair. Codepoints inside the surrogate range (0xD800 to
/// 0xDFFF) and codepoints above 0x10FFFF are left out.
inline std::u16string convertUtf32toUtf16(const std::u32string& strUtf32)
{
    std::u16string utf16;
    utf16.reserve(strUtf32.length() + 1);

    for (const char32_t cp : strUtf32)
    {
        // Neither surrogates nor values beyond the maximum Unicode value are valid characters
        const bool isSurrogate = (cp >= 0xD800) && (cp <= 0xDFFF);
        if (isSurrogate || (cp > 0x0010FFFF))
            continue;

        // Everything that fits inside 2 bytes is copied as a single element
        if (cp <= 0xFFFF)
        {
            utf16.push_back(static_cast<char16_t>(cp));
            continue;
        }

        // The remaining characters are split into two UTF-16 elements of 10 payload bits each
        const char32_t reduced = cp - 0x0010000;
        const char16_t highSurrogate = static_cast<char16_t>((reduced >> 10) + 0xD800);
        const char16_t lowSurrogate = static_cast<char16_t>((reduced & 0x3FFUL) + 0xDC00);
        utf16.push_back(highSurrogate);
        utf16.push_back(lowSurrogate);
    }

    return utf16;
}
333
335 }
336}
337
338#if defined TGUI_SYSTEM_WINDOWS && defined _MSC_VER
339 #pragma warning(pop)
340#endif
341
343
344#endif // TGUI_UTF_HPP
Namespace that contains all TGUI functions and classes.
Definition: AbsoluteOrRelativeValue.hpp:36