TGUI  1.0-beta
Loading...
Searching...
No Matches
Utf.hpp
1
2//
3// TGUI - Texus' Graphical User Interface
4// Copyright (C) 2012-2022 Bruno Van de Velde (vdv_b@tgui.eu)
5//
6// This software is provided 'as-is', without any express or implied warranty.
7// In no event will the authors be held liable for any damages arising from the use of this software.
8//
9// Permission is granted to anyone to use this software for any purpose,
10// including commercial applications, and to alter it and redistribute it freely,
11// subject to the following restrictions:
12//
13// 1. The origin of this software must not be misrepresented;
14// you must not claim that you wrote the original software.
15// If you use this software in a product, an acknowledgment
16// in the product documentation would be appreciated but is not required.
17//
18// 2. Altered source versions must be plainly marked as such,
19// and must not be misrepresented as being the original software.
20//
21// 3. This notice may not be removed or altered from any source distribution.
22//
24
25
26#ifndef TGUI_UTF_HPP
27#define TGUI_UTF_HPP
28
29#include <TGUI/Config.hpp>
30#include <string>
31#include <array>
32
34
35// Disable warning in Visual Studio about being able to use "if constexpr".
36// The code would use "if constexpr" if the compiler would just define __cpp_if_constexpr
37#if defined TGUI_SYSTEM_WINDOWS && defined _MSC_VER
38 #pragma warning(push)
39 #pragma warning(disable:4127)
40#endif
41
42namespace tgui
43{
44 namespace utf
45 {
/// @brief Encodes a single UTF-32 code point as UTF-8 and appends the resulting bytes to a string
///
/// @param input       Code point to encode
/// @param outStrUtf8  String to which the encoded bytes are appended
///
/// Code points above U+10FFFF and surrogate code points (U+D800 to U+DFFF) have no well-formed
/// UTF-8 representation. They are silently skipped and the output string is left untouched.
template <typename CharT> // CharT is either char or char8_t
void encodeCharUtf8(char32_t input, std::basic_string<CharT>& outStrUtf8)
{
    // ASCII characters are stored as a single, identical byte
    if (input < 128)
    {
        outStrUtf8.push_back(static_cast<CharT>(input));
        return;
    }

    // Skip values that can't be encoded: anything beyond the Unicode range and the entire
    // surrogate block (both the high half D800-DBFF and the low half DC00-DFFF)
    if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDFFF)))
        return;

    // Determine the length of the sequence and the marker bits of its leading byte
    std::size_t bytestoWrite;
    std::uint8_t firstByteMask;
    if (input < 0x800)
    {
        bytestoWrite = 2;
        firstByteMask = 0xC0;
    }
    else if (input < 0x10000)
    {
        bytestoWrite = 3;
        firstByteMask = 0xE0;
    }
    else
    {
        bytestoWrite = 4;
        firstByteMask = 0xF0;
    }

    // Fill the sequence from its last byte to its first. Every continuation byte carries the
    // lowest 6 remaining bits prefixed with the bits "10"; what is left ends up in the leading byte.
    std::array<CharT, 4> bytes{};
    if (bytestoWrite == 4) { bytes[3] = static_cast<CharT>((input | 0x80) & 0xBF); input >>= 6; }
    if (bytestoWrite >= 3) { bytes[2] = static_cast<CharT>((input | 0x80) & 0xBF); input >>= 6; }
    bytes[1] = static_cast<CharT>((input | 0x80) & 0xBF); input >>= 6;
    bytes[0] = static_cast<CharT>(input | firstByteMask);

    // Only the first bytestoWrite entries of the buffer are meaningful
    outStrUtf8.append(bytes.data(), bytestoWrite);
}
93
94
/// @brief Decodes one UTF-8 sequence and appends the resulting code point to a UTF-32 string
///
/// @param inputCharIt  Iterator to the first byte of the sequence to decode
/// @param inputEndIt   Iterator past the last byte of the input
/// @param outStrUtf32  String to which the decoded code point is appended
///
/// @return Iterator to the byte that follows the decoded sequence. inputEndIt is returned when the
///         range is empty or when the input ends in the middle of a sequence (in both cases nothing
///         is appended to the output string).
///
/// The number of bytes to read is derived from the leading byte alone; the continuation bytes are
/// not validated.
template <typename CharIt> // CharIt is an iterator for a string containing either char or char8_t
CharIt decodeCharUtf8(CharIt inputCharIt, CharIt inputEndIt, std::u32string& outStrUtf32)
{
    // There is nothing to decode in an empty range, and the end iterator must not be dereferenced
    if (inputCharIt == inputEndIt)
        return inputEndIt;

    // ASCII bytes map directly onto the code point with the same value
    if (static_cast<std::uint8_t>(*inputCharIt) < 128)
    {
        outStrUtf32.push_back(static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt)));
        return ++inputCharIt;
    }

    // offsetsMap[n] is the sum of the marker bits of the leading byte and the n continuation bytes
    // after they have been shifted into place. Subtracting it leaves only the payload bits.
    static const std::uint32_t offsetsMap[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };

    // trailingMap[b - 128] is the amount of bytes that follow a leading byte b (for b >= 128)
    static const std::uint8_t trailingMap[128] =
    {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
    };

    // Find out how many bytes belong to this sequence and whether they are all available
    const std::uint8_t trailingBytes = trailingMap[static_cast<std::uint8_t>(*inputCharIt) - 128];
    const std::uint32_t offset = offsetsMap[trailingBytes];
    const auto remainingBytes = std::distance(inputCharIt, inputEndIt) - 1;
    if (remainingBytes >= static_cast<decltype(remainingBytes)>(trailingBytes))
    {
        // Accumulate all bytes, making room for the next 6 payload bits after each one
        char32_t outputChar = 0;
        for (std::uint8_t i = 0; i < trailingBytes; ++i)
        {
            outputChar += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt++));
            outputChar <<= 6;
        }

        // The last byte is added without a shift, after which the marker bits are removed
        outputChar += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt++));
        outputChar -= offset;
        outStrUtf32.push_back(outputChar);
    }
    else // Incomplete character
        inputCharIt = inputEndIt;

    return inputCharIt;
}
143
144
#if defined(__cpp_lib_char8_t) && (__cpp_lib_char8_t >= 201811L)
/// @brief Converts a UTF-32 string to a UTF-8 encoded std::u8string
///
/// @param strUtf32  String to convert
///
/// @return String containing the encoding that encodeCharUtf8 produced for each code point, in order
inline std::u8string convertUtf32toUtf8(const std::u32string& strUtf32)
{
    std::u8string encoded;

    // One byte per code point is only a lower bound, the string grows further when needed
    encoded.reserve(strUtf32.length() + 1);

    for (auto cursor = strUtf32.begin(); cursor != strUtf32.end(); ++cursor)
        encodeCharUtf8(*cursor, encoded);

    return encoded;
}
#endif
161
/// @brief Converts a range of UTF-8 bytes to a UTF-32 string
///
/// @param inputBegin  Iterator to the first byte of the input
/// @param inputEnd    Iterator past the last byte of the input
///
/// @return String containing every code point that decodeCharUtf8 produced while walking the range
template <typename CharIt>
std::u32string convertUtf8toUtf32(CharIt inputBegin, CharIt inputEnd)
{
    std::u32string decoded;

    // There can't be more code points than input bytes
    decoded.reserve(static_cast<std::size_t>((inputEnd - inputBegin) + 1));

    // The decoder reports where the next sequence starts, so it drives the loop
    for (CharIt cursor = inputBegin; cursor < inputEnd; )
        cursor = decodeCharUtf8(cursor, inputEnd, decoded);

    return decoded;
}
180
181
/// @brief Converts a range of UTF-16 code units to a UTF-32 string
///
/// @param inputBegin  Iterator to the first code unit of the input
/// @param inputEnd    Iterator past the last code unit of the input
///
/// @return String containing the decoded code points
///
/// A high surrogate followed by a low surrogate is combined into a single code point.
/// Unpaired surrogates (a high surrogate that isn't followed by a low surrogate, or a low surrogate
/// that isn't preceded by a high one) are skipped without affecting the code units around them.
template <typename U16CharIt>
std::u32string convertUtf16toUtf32(U16CharIt inputBegin, U16CharIt inputEnd)
{
    std::u32string outStrUtf32;
    outStrUtf32.reserve(static_cast<std::size_t>((inputEnd - inputBegin) + 1));

    auto it = inputBegin;
    while (it < inputEnd)
    {
        const char16_t first = *it++;

        // Anything outside the surrogate block is a complete code point on its own
        if ((first < 0xD800) || (first > 0xDFFF))
        {
            outStrUtf32.push_back(static_cast<char32_t>(first));
            continue;
        }

        // A low surrogate can only appear as the second half of a pair, so this one is a stray
        if (first >= 0xDC00)
            continue;

        // A high surrogate at the very end of the input has nothing to pair with
        if (it == inputEnd)
            break;

        // Only consume the next code unit when it really completes the pair. Otherwise the high
        // surrogate is dropped and the next unit is handled by the following iteration.
        const char16_t second = *it;
        if ((second < 0xDC00) || (second > 0xDFFF))
            continue;

        ++it;
        outStrUtf32.push_back(((static_cast<char32_t>(first) - 0xD800) << 10) + (static_cast<char32_t>(second) - 0xDC00) + 0x0010000);
    }

    return outStrUtf32;
}
218
219
/// @brief Converts a range of wide characters to a UTF-32 string
///
/// @param inputBegin  Iterator to the first wide character of the input
/// @param inputEnd    Iterator past the last wide character of the input
///
/// @return String in which every input character was cast to char32_t
///
/// The characters are converted one by one, no attempt is made to combine surrogate pairs.
template <typename WCharIt>
std::u32string convertWidetoUtf32(WCharIt inputBegin, WCharIt inputEnd)
{
    std::u32string widened;
    widened.reserve(static_cast<std::size_t>((inputEnd - inputBegin) + 1));

    // std::wstring uses UCS-2 on Windows and UCS-4 on unix, so each character can be cast directly
    WCharIt cursor = inputBegin;
    while (cursor != inputEnd)
    {
        widened.push_back(static_cast<char32_t>(*cursor));
        ++cursor;
    }

    return widened;
}
238
239
/// @brief Converts a UTF-32 string to a Latin-1 encoded std::string
///
/// @param strUtf32  String to convert
///
/// @return String with one byte for every code point below 256. All other code points are left out.
inline std::string convertUtf32toLatin1(const std::u32string& strUtf32)
{
    std::string latin1;
    latin1.reserve(strUtf32.length() + 1);

    for (auto cursor = strUtf32.begin(); cursor != strUtf32.end(); ++cursor)
    {
        // Characters that don't fit in a single byte can't be represented
        if (*cursor >= 256)
            continue;

        latin1.push_back(static_cast<char>(*cursor));
    }

    return latin1;
}
257
258
264 inline std::string convertUtf32toStdStringUtf8(const std::u32string& strUtf32)
265 {
266 std::string outStrUtf8;
267 outStrUtf8.reserve(strUtf32.length() + 1);
268 for (const char32_t codepoint : strUtf32)
269 encodeCharUtf8(codepoint, outStrUtf8);
270
271 return outStrUtf8;
272 }
273
274
/// @brief Converts a UTF-32 string to a wide string
///
/// @param strUtf32  String to convert
///
/// @return Wide string holding the converted characters
///
/// When wchar_t is 4 bytes wide, every code point is copied as is. Otherwise only code points that
/// fit in a single wide character are kept: surrogate values and everything above U+FFFF is dropped.
inline std::wstring convertUtf32toWide(const std::u32string& strUtf32)
{
    std::wstring wide;
    wide.reserve(strUtf32.length() + 1);

#if defined(__cpp_if_constexpr) && (__cpp_if_constexpr >= 201606L)
    if constexpr (sizeof(wchar_t) != 4)
#else
    if (sizeof(wchar_t) != 4)
#endif
    {
        // On Windows, wide characters are UCS-2. Characters that need more than one wide character are left out.
        for (auto cursor = strUtf32.begin(); cursor != strUtf32.end(); ++cursor)
        {
            const char32_t value = *cursor;
            const bool isSurrogate = (value >= 0xD800) && (value <= 0xDFFF);
            if (isSurrogate || (value > 0xFFFF))
                continue;

            wide.push_back(static_cast<wchar_t>(value));
        }
    }
    else
    {
        // On Unix, wide characters are UCS-4 so every code point fits in one wide character
        for (auto cursor = strUtf32.begin(); cursor != strUtf32.end(); ++cursor)
            wide.push_back(static_cast<wchar_t>(*cursor));
    }

    return wide;
}
307
308
/// @brief Converts a UTF-32 string to a UTF-16 string
///
/// @param strUtf32  String to convert
///
/// @return UTF-16 encoded string
///
/// Code points up to U+FFFF are copied as a single code unit and higher ones are written as a
/// surrogate pair. Surrogate values (U+D800 to U+DFFF) and values above U+10FFFF are left out.
inline std::u16string convertUtf32toUtf16(const std::u32string& strUtf32)
{
    std::u16string utf16;
    utf16.reserve(strUtf32.length() + 1);

    for (auto cursor = strUtf32.begin(); cursor != strUtf32.end(); ++cursor)
    {
        const char32_t value = *cursor;

        // Invalid character (greater than the maximum Unicode value)
        if (value > 0x0010FFFF)
            continue;

        // Characters beyond the 16-bit range have to be split over two UTF-16 elements
        if (value > 0xFFFF)
        {
            const char32_t shifted = value - 0x0010000;
            utf16.push_back(static_cast<char16_t>((shifted >> 10) + 0xD800));
            utf16.push_back(static_cast<char16_t>((shifted & 0x3FFUL) + 0xDC00));
            continue;
        }

        // The value fits in a single element, but it is only copied if it isn't reserved for surrogates
        const bool isSurrogate = (value >= 0xD800) && (value <= 0xDFFF);
        if (!isSurrogate)
            utf16.push_back(static_cast<char16_t>(value));
    }

    return utf16;
}
339
341 }
342}
343
344#if defined TGUI_SYSTEM_WINDOWS && defined _MSC_VER
345 #pragma warning(pop)
346#endif
347
349
350#endif // TGUI_UTF_HPP
Namespace that contains all TGUI functions and classes.
Definition: AbsoluteOrRelativeValue.hpp:36