// TGUI 0.8.9
// Utf.hpp
1
2//
3// TGUI - Texus' Graphical User Interface
4// Copyright (C) 2012-2020 Bruno Van de Velde (vdv_b@tgui.eu)
5//
6// This software is provided 'as-is', without any express or implied warranty.
7// In no event will the authors be held liable for any damages arising from the use of this software.
8//
9// Permission is granted to anyone to use this software for any purpose,
10// including commercial applications, and to alter it and redistribute it freely,
11// subject to the following restrictions:
12//
13// 1. The origin of this software must not be misrepresented;
14// you must not claim that you wrote the original software.
15// If you use this software in a product, an acknowledgment
16// in the product documentation would be appreciated but is not required.
17//
18// 2. Altered source versions must be plainly marked as such,
19// and must not be misrepresented as being the original software.
20//
21// 3. This notice may not be removed or altered from any source distribution.
22//
24
25
26#ifndef TGUI_UTF_HPP
27#define TGUI_UTF_HPP
28
#include <array>
#include <cstddef>
#include <cstdint>
#include <string>
31
33
34namespace tgui
35{
36 namespace utf
37 {
/// @brief Encodes a single UTF-32 code point as UTF-8 and appends the resulting bytes to a string
///
/// @param input       Code point to encode
/// @param outStrUtf8  String to which the encoded bytes are appended
///
/// Nothing is appended when the code point cannot be represented in UTF-8, i.e. when it lies
/// above 0x10FFFF or inside the surrogate range 0xD800-0xDFFF.
template <typename CharT> // CharT is either char or char8_t
void encodeCharUtf8(char32_t input, std::basic_string<CharT>& outStrUtf8)
{
    // ASCII characters are stored unchanged in a single byte
    if (input < 128)
    {
        outStrUtf8.push_back(static_cast<CharT>(input));
        return;
    }

    // Skip code points that have no UTF-8 representation. Both halves of the surrogate
    // range are rejected (high 0xD800-0xDBFF and low 0xDC00-0xDFFF).
    if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDFFF)))
        return;

    // Get the number of bytes to write (input is known to be in [0x80, 0x10FFFF] here)
    std::size_t bytesToWrite;
    if (input < 0x800)
        bytesToWrite = 2;
    else if (input < 0x10000)
        bytesToWrite = 3;
    else
        bytesToWrite = 4;

    // Marker bits of the leading byte, indexed by the total length of the sequence
    static const std::uint8_t firstByteMask[5] = { 0, 0, 0xC0, 0xE0, 0xF0 };

    // Extract the bytes to write, from the last continuation byte back to the leading byte.
    // Each continuation byte carries 6 payload bits below the 10xxxxxx marker.
    std::array<CharT, 4> bytes{};
    if (bytesToWrite == 4) { bytes[3] = static_cast<CharT>((input | 0x80) & 0xBF); input >>= 6; }
    if (bytesToWrite >= 3) { bytes[2] = static_cast<CharT>((input | 0x80) & 0xBF); input >>= 6; }
    bytes[1] = static_cast<CharT>((input | 0x80) & 0xBF);
    input >>= 6;
    bytes[0] = static_cast<CharT>(input | firstByteMask[bytesToWrite]);

    // Add them to the output
    outStrUtf8.append(bytes.begin(), bytes.begin() + bytesToWrite);
}
79
80
/// @brief Decodes one UTF-8 encoded character and appends the resulting code point to a UTF-32 string
///
/// @param inputCharIt  Iterator to the first byte of the character to decode
/// @param inputEndIt   Iterator past the last byte of the input
/// @param outStrUtf32  String to which the decoded code point is appended
///
/// @return Iterator to the byte following the decoded character. When the input ends before the
///         character is complete (or the range is empty), nothing is appended and inputEndIt is returned.
///
/// The sequence length is taken from the first byte only; continuation bytes are not validated.
template <typename CharIt> // CharIt is an iterator for a string containing either char or char8_t
CharIt decodeCharUtf8(CharIt inputCharIt, CharIt inputEndIt, std::u32string& outStrUtf32)
{
    // Nothing to decode: never dereference the end iterator
    if (inputCharIt == inputEndIt)
        return inputEndIt;

    // ASCII characters occupy a single byte and map directly to their code point
    const std::uint8_t leadByte = static_cast<std::uint8_t>(*inputCharIt);
    if (leadByte < 128)
    {
        outStrUtf32.push_back(static_cast<char32_t>(leadByte));
        return ++inputCharIt;
    }

    // Some useful precomputed data:
    // offsetsMap holds, per number of trailing bytes, the accumulated marker bits to subtract at the end,
    // trailingMap holds, per first byte in the range [128, 255], the number of bytes that follow it.
    static const std::uint32_t offsetsMap[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
    static const std::uint8_t trailingMap[128] =
    {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
    };

    std::uint8_t trailingBytes = trailingMap[leadByte - 128];
    const std::uint32_t offset = offsetsMap[trailingBytes];

    // Incomplete character: compare against the remaining distance instead of computing
    // inputCharIt + trailingBytes, which could move the iterator beyond the end of the range.
    if (trailingBytes >= inputEndIt - inputCharIt)
        return inputEndIt;

    // Accumulate the raw bytes, shifting by the 6 payload bits of each continuation byte
    char32_t outputChar = 0;
    while (trailingBytes > 0)
    {
        outputChar += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt++));
        outputChar <<= 6;
        --trailingBytes;
    }

    // Add the last byte and strip all marker bits at once
    outputChar += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt++));
    outputChar -= offset;
    outStrUtf32.push_back(outputChar);

    return inputCharIt;
}
129
130
#if defined(__cpp_lib_char8_t) && (__cpp_lib_char8_t >= 201811L)
/// @brief Converts a UTF-32 string into a UTF-8 encoded std::u8string
///
/// @param strUtf32  String to convert
///
/// @return String containing the bytes that encodeCharUtf8 produced for each code point, in order
inline std::u8string convertUtf32toUtf8(const std::u32string& strUtf32)
{
    std::u8string encoded;
    encoded.reserve(strUtf32.length() + 1);

    for (auto it = strUtf32.begin(); it != strUtf32.end(); ++it)
        encodeCharUtf8(*it, encoded);

    return encoded;
}
#endif
147
/// @brief Converts a range of UTF-8 encoded bytes into a UTF-32 string
///
/// @param inputBegin  Iterator to the first byte of the input
/// @param inputEnd    Iterator past the last byte of the input
///
/// @return String containing the code points that decodeCharUtf8 extracted from the range
template <typename CharIt>
std::u32string convertUtf8toUtf32(CharIt inputBegin, CharIt inputEnd)
{
    std::u32string decoded;
    decoded.reserve((inputEnd - inputBegin) + 1);

    // Each call consumes one character and hands back the position of the next one
    for (CharIt cursor = inputBegin; cursor < inputEnd; )
        cursor = decodeCharUtf8(cursor, inputEnd, decoded);

    return decoded;
}
166
167
/// @brief Converts a range of UTF-16 code units into a UTF-32 string
///
/// @param inputBegin  Iterator to the first code unit of the input
/// @param inputEnd    Iterator past the last code unit of the input
///
/// @return String containing the decoded code points
///
/// A high surrogate (0xD800-0xDBFF) that is not followed by a low surrogate (0xDC00-0xDFFF) is dropped,
/// but the code unit that follows it is kept and decoded normally. Code units outside the high
/// surrogate range (including a low surrogate without a preceding high surrogate) are copied as-is.
template <typename U16CharIt>
std::u32string convertUtf16toUtf32(U16CharIt inputBegin, U16CharIt inputEnd)
{
    std::u32string outStrUtf32;
    outStrUtf32.reserve((inputEnd - inputBegin) + 1);

    auto it = inputBegin;
    while (it < inputEnd)
    {
        const char16_t first = *it++;

        // Copy the character if it doesn't start a surrogate pair
        if ((first < 0xD800) || (first > 0xDBFF))
        {
            outStrUtf32.push_back(static_cast<char32_t>(first));
            continue;
        }

        // We need to read another character, a high surrogate at the very end is dropped
        if (it == inputEnd)
            break;

        // Only consume the next code unit when it really completes the pair. Otherwise the
        // unpaired high surrogate is dropped and the next unit is handled by the next iteration.
        const char16_t second = *it;
        if ((second >= 0xDC00) && (second <= 0xDFFF))
        {
            ++it;
            outStrUtf32.push_back(((static_cast<char32_t>(first) - 0xD800) << 10) + (static_cast<char32_t>(second) - 0xDC00) + 0x0010000);
        }
    }

    return outStrUtf32;
}
204
205
/// @brief Converts a range of wide characters into a UTF-32 string
///
/// @param inputBegin  Iterator to the first wide character of the input
/// @param inputEnd    Iterator past the last wide character of the input
///
/// @return String in which every wide character was cast to a char32_t
template <typename WCharIt>
std::u32string convertWidetoUtf32(WCharIt inputBegin, WCharIt inputEnd)
{
    std::u32string converted;
    converted.reserve((inputEnd - inputBegin) + 1);

    // std::wstring uses UCS-2 on Windows and UCS-4 on unix, so the characters can be cast directly
    auto cursor = inputBegin;
    while (cursor != inputEnd)
    {
        converted.push_back(static_cast<char32_t>(*cursor));
        ++cursor;
    }

    return converted;
}
224
225
/// @brief Converts a UTF-32 string into a Latin-1 encoded std::string
///
/// @param strUtf32  String to convert
///
/// @return String with one byte per code point below 256, code points of 256 and above are left out
inline std::string convertUtf32toLatin1(const std::u32string& strUtf32)
{
    std::string latin1;
    latin1.reserve(strUtf32.length() + 1);

    for (auto it = strUtf32.begin(); it != strUtf32.end(); ++it)
    {
        // Anything that doesn't fit in a single byte is skipped
        if (*it >= 256)
            continue;

        latin1.push_back(static_cast<char>(*it));
    }

    return latin1;
}
243
244
250 inline std::string convertUtf32toStdStringUtf8(const std::u32string& strUtf32)
251 {
252 std::string outStrUtf8;
253 outStrUtf8.reserve(strUtf32.length() + 1);
254 for (const char32_t codepoint : strUtf32)
255 encodeCharUtf8(codepoint, outStrUtf8);
256
257 return outStrUtf8;
258 }
259
260
/// @brief Converts a UTF-32 string into a wide string
///
/// @param strUtf32  String to convert
///
/// @return Wide string. When wchar_t is 4 bytes every code point is copied. Otherwise only code points
///         up to 0xFFFF that lie outside the surrogate range 0xD800-0xDFFF are copied, the rest is dropped.
inline std::wstring convertUtf32toWide(const std::u32string& strUtf32)
{
    std::wstring wide;
    wide.reserve(strUtf32.length() + 1);

    // The size check is a compile-time constant. Without "if constexpr", MSVC warning 4127 is
    // disabled around the plain "if".
#if defined(__cpp_if_constexpr) && (__cpp_if_constexpr >= 201606L)
    if constexpr (sizeof(wchar_t) == 4)
#else
    #if defined TGUI_SYSTEM_WINDOWS && defined _MSC_VER
        #pragma warning(push)
        #pragma warning(disable:4127)
    #endif
    if (sizeof(wchar_t) == 4)
    #if defined TGUI_SYSTEM_WINDOWS && defined _MSC_VER
        #pragma warning(pop)
    #endif
#endif
    {
        // On Unix, wide characters are UCS-4 and we can just copy the characters
        for (std::size_t i = 0; i < strUtf32.length(); ++i)
            wide.push_back(static_cast<wchar_t>(strUtf32[i]));
    }
    else
    {
        // On Windows, wide characters are UCS-2. Characters that don't fit within a single wide character are dropped.
        for (std::size_t i = 0; i < strUtf32.length(); ++i)
        {
            const char32_t current = strUtf32[i];
            const bool isSurrogate = (current >= 0xD800) && (current <= 0xDFFF);
            if (!isSurrogate && (current <= 0xFFFF))
                wide.push_back(static_cast<wchar_t>(current));
        }
    }

    return wide;
}
301
302
/// @brief Converts a UTF-32 string into a UTF-16 string
///
/// @param strUtf32  String to convert
///
/// @return UTF-16 string. Code points above 0xFFFF become a surrogate pair. Code points inside the
///         surrogate range 0xD800-0xDFFF or above 0x10FFFF are dropped.
inline std::u16string convertUtf32toUtf16(const std::u32string& strUtf32)
{
    std::u16string utf16;
    utf16.reserve(strUtf32.length() + 1);

    for (auto it = strUtf32.begin(); it != strUtf32.end(); ++it)
    {
        const char32_t current = *it;

        // Invalid character (greater than the maximum Unicode value)
        if (current > 0x0010FFFF)
            continue;

        if (current > 0xFFFF)
        {
            // The input character needs to be converted to two UTF-16 elements:
            // the upper 10 bits go into the high surrogate, the lower 10 bits into the low surrogate
            const char32_t reduced = current - 0x0010000;
            utf16.push_back(static_cast<char16_t>((reduced >> 10) + 0xD800));
            utf16.push_back(static_cast<char16_t>((reduced & 0x3FF) + 0xDC00));
            continue;
        }

        // The code point fits inside 2 bytes, copy it unless it lies in the range reserved for surrogates
        const bool isSurrogate = (current >= 0xD800) && (current <= 0xDFFF);
        if (!isSurrogate)
            utf16.push_back(static_cast<char16_t>(current));
    }

    return utf16;
}
333
335 }
336}
337
339
340#endif // TGUI_UTF_HPP
// tgui: Namespace that contains all TGUI functions and classes.
// Definition: AbsoluteOrRelativeValue.hpp:37