TGUI  1.3-dev
Loading...
Searching...
No Matches
Utf.hpp
1
2//
3// TGUI - Texus' Graphical User Interface
4// Copyright (C) 2012-2024 Bruno Van de Velde (vdv_b@tgui.eu)
5//
6// This software is provided 'as-is', without any express or implied warranty.
7// In no event will the authors be held liable for any damages arising from the use of this software.
8//
9// Permission is granted to anyone to use this software for any purpose,
10// including commercial applications, and to alter it and redistribute it freely,
11// subject to the following restrictions:
12//
13// 1. The origin of this software must not be misrepresented;
14// you must not claim that you wrote the original software.
15// If you use this software in a product, an acknowledgment
16// in the product documentation would be appreciated but is not required.
17//
18// 2. Altered source versions must be plainly marked as such,
19// and must not be misrepresented as being the original software.
20//
21// 3. This notice may not be removed or altered from any source distribution.
22//
24
25
26#ifndef TGUI_UTF_HPP
27#define TGUI_UTF_HPP
28
29#include <TGUI/Config.hpp>
30
31#if !TGUI_EXPERIMENTAL_USE_STD_MODULE
32 #include <cstdint>
33 #include <string>
34 #include <array>
35#endif
36
38
39TGUI_MODULE_EXPORT namespace tgui
40{
41 namespace utf
42 {
/// @brief Encodes a single UTF-32 code point as UTF-8 and appends the resulting bytes to a string
///
/// @param input       Code point to encode
/// @param outStrUtf8  String to which the encoded bytes are appended
///
/// Nothing is appended when the code point is not a valid Unicode scalar value, i.e. when it is
/// larger than 0x10FFFF or when it lies anywhere in the surrogate range 0xD800-0xDFFF.
template <typename CharT> // CharT is either char or char8_t
void encodeCharUtf8(char32_t input, std::basic_string<CharT>& outStrUtf8)
{
    // ASCII characters are stored as one identical byte
    if (input < 128)
    {
        outStrUtf8.push_back(static_cast<CharT>(input));
        return;
    }

    // Values above 0x10FFFF lie outside the Unicode range, and surrogates (both the high and the low half)
    // only have a meaning inside UTF-16. Neither can appear in well-formed UTF-8, so they are skipped.
    if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDFFF)))
        return;

    // Determine the length of the sequence and the marker bits of its leading byte
    std::size_t bytesToWrite;
    std::uint8_t firstByteMask;
    if (input < 0x800)
    {
        bytesToWrite = 2;
        firstByteMask = 0xC0; // 110xxxxx
    }
    else if (input < 0x10000)
    {
        bytesToWrite = 3;
        firstByteMask = 0xE0; // 1110xxxx
    }
    else
    {
        bytesToWrite = 4;
        firstByteMask = 0xF0; // 11110xxx
    }

    // Fill the continuation bytes from last to first. Each one has the form 10xxxxxx and carries
    // the 6 lowest bits that are still left in the code point.
    std::array<CharT, 4> bytes{};
    if (bytesToWrite == 4) { bytes[3] = static_cast<CharT>((input | 0x80) & 0xBF); input >>= 6; }
    if (bytesToWrite >= 3) { bytes[2] = static_cast<CharT>((input | 0x80) & 0xBF); input >>= 6; }
    bytes[1] = static_cast<CharT>((input | 0x80) & 0xBF); input >>= 6;

    // What remains of the code point fits in the payload bits of the leading byte
    bytes[0] = static_cast<CharT>(input | firstByteMask);

    // Add only the bytes that were actually filled in to the output
    outStrUtf8.append(bytes.begin(), bytes.begin() + static_cast<std::ptrdiff_t>(bytesToWrite));
}
90
91
/// @brief Decodes one UTF-8 encoded character and appends the resulting code point to a UTF-32 string
///
/// @param inputCharIt  Iterator to the first byte of the character to decode. It is dereferenced
///                     unconditionally, so it must not be equal to inputEndIt.
/// @param inputEndIt   Iterator to the end of the input
/// @param outStrUtf32  String to which the decoded code point is appended
///
/// @return Iterator to the byte that follows the decoded character, or inputEndIt when the input ends
///         before the sequence announced by the first byte is complete (nothing is appended in that case)
///
/// The bytes that follow the first byte are not checked to be valid continuation bytes.
template <typename CharIt> // CharIt is an iterator for a string containing either char or char8_t
CharIt decodeCharUtf8(CharIt inputCharIt, CharIt inputEndIt, std::u32string& outStrUtf32)
{
    const auto leadByte = static_cast<std::uint8_t>(*inputCharIt);

    // Single-byte (ASCII) characters map directly onto their code point
    if (leadByte < 128)
    {
        outStrUtf32.push_back(static_cast<char32_t>(leadByte));
        ++inputCharIt;
        return inputCharIt;
    }

    // Value to subtract from the summed-up bytes of a sequence, indexed by the amount of trailing bytes.
    // It removes the marker bits that got added together with the payload bits.
    static const std::uint32_t offsetsMap[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };

    // Amount of bytes that follow a first byte, indexed by (first byte - 128)
    static const std::uint8_t trailingMap[128] =
    {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
    };

    const std::uint8_t continuationCount = trailingMap[leadByte - 128];
    const auto bytesAfterLead = std::distance(inputCharIt, inputEndIt) - 1;

    // Incomplete character: give up on the rest of the input
    if (bytesAfterLead < static_cast<decltype(bytesAfterLead)>(continuationCount))
        return inputEndIt;

    // Sum up all bytes of the sequence, making room for 6 more bits before each following byte
    char32_t codepoint = 0;
    for (std::uint8_t pending = continuationCount; pending > 0; --pending)
    {
        codepoint += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt));
        ++inputCharIt;
        codepoint <<= 6;
    }
    codepoint += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt));
    ++inputCharIt;

    // Strip the marker bits of all bytes at once
    codepoint -= offsetsMap[continuationCount];
    outStrUtf32.push_back(codepoint);

    return inputCharIt;
}
140
141
#if defined(__cpp_lib_char8_t) && (__cpp_lib_char8_t >= 201811L)
/// @brief Converts a UTF-32 string to a UTF-8 encoded std::u8string
///
/// @param strUtf32  String to convert
///
/// @return String holding the output that encodeCharUtf8 produced for each code point, in order
TGUI_NODISCARD inline std::u8string convertUtf32toUtf8(const std::u32string& strUtf32)
{
    std::u8string encoded;

    // Every code point needs at least one byte, so this is a lower bound for the final size
    encoded.reserve(strUtf32.size() + 1);

    for (auto it = strUtf32.begin(); it != strUtf32.end(); ++it)
        encodeCharUtf8(*it, encoded);

    return encoded;
}
#endif
158
165 template <typename CharIt>
166 TGUI_NODISCARD std::u32string convertUtf8toUtf32(CharIt inputBegin, CharIt inputEnd)
167 {
168 std::u32string outStrUtf32;
169 outStrUtf32.reserve(static_cast<std::size_t>((inputEnd - inputBegin) + 1));
170
171 auto it = inputBegin;
172 while (it < inputEnd)
173 it = decodeCharUtf8(it, inputEnd, outStrUtf32);
174
175 return outStrUtf32;
176 }
177
178
185 template <typename U16CharIt>
186 TGUI_NODISCARD std::u32string convertUtf16toUtf32(U16CharIt inputBegin, U16CharIt inputEnd)
187 {
188 std::u32string outStrUtf32;
189 outStrUtf32.reserve(static_cast<std::size_t>((inputEnd - inputBegin) + 1));
190
191 auto it = inputBegin;
192 while (it < inputEnd)
193 {
194 const char16_t first = *it++;
195
196 // Copy the character if it isn't a surrogate pair
197 if ((first < 0xD800) || (first > 0xDBFF))
198 {
199 outStrUtf32.push_back(static_cast<char32_t>(first));
200 continue;
201 }
202
203 // We need to read another character
204 if (it == inputEnd)
205 break;
206
207 const char16_t second = *it++;
208 if ((second >= 0xDC00) && (second <= 0xDFFF))
209 outStrUtf32.push_back(((static_cast<char32_t>(first) - 0xD800) << 10) + (static_cast<char32_t>(second) - 0xDC00) + 0x0010000);
210 }
211
212
213 return outStrUtf32;
214 }
215
216
222 template <typename WCharIt>
223 TGUI_NODISCARD std::u32string convertWidetoUtf32(WCharIt inputBegin, WCharIt inputEnd)
224 {
225 std::u32string outStrUtf32;
226 outStrUtf32.reserve(static_cast<std::size_t>((inputEnd - inputBegin) + 1));
227
228 // std::wstring uses UCS-2 on Windows and UCS-4 on unix, so we can be cast directly
229 for (auto it = inputBegin; it != inputEnd; ++it)
230 outStrUtf32.push_back(static_cast<char32_t>(*it));
231
232
233 return outStrUtf32;
234 }
235
236
242 TGUI_NODISCARD inline std::string convertUtf32toStdStringUtf8(const std::u32string& strUtf32)
243 {
244 std::string outStrUtf8;
245 outStrUtf8.reserve(strUtf32.length() + 1);
246 for (const char32_t codepoint : strUtf32)
247 encodeCharUtf8(codepoint, outStrUtf8);
248
249 return outStrUtf8;
250 }
251
252
258 TGUI_NODISCARD inline std::wstring convertUtf32toWide(const std::u32string& strUtf32)
259 {
260 std::wstring outStr;
261 outStr.reserve(strUtf32.length() + 1);
262
263 TGUI_IF_CONSTEXPR (sizeof(wchar_t) == 4)
264 {
265 // On Unix, wide characters are UCS-4 and we can just copy the characters
266 for (const char32_t codepoint : strUtf32)
267 outStr.push_back(static_cast<wchar_t>(codepoint));
268 }
269 else
270 {
271 // On Windows, wide characters are UCS-2. We just drop the characters that don't fit within a single wide character here.
272 for (const char32_t codepoint : strUtf32)
273 {
274 if ((codepoint < 0xD800) || ((codepoint > 0xDFFF) && (codepoint <= 0xFFFF)))
275 outStr.push_back(static_cast<wchar_t>(codepoint));
276 }
277 }
278
279 return outStr;
280 }
281
282
288 TGUI_NODISCARD inline std::u16string convertUtf32toUtf16(const std::u32string& strUtf32)
289 {
290 std::u16string outStrUtf16;
291 outStrUtf16.reserve(strUtf32.length() + 1);
292
293 for (const char32_t codepoint : strUtf32)
294 {
295 // If the codepoint fitst inside 2 bytes and it would represent a valid character then just copy it
296 if (codepoint <= 0xFFFF)
297 {
298 if ((codepoint < 0xD800) || (codepoint > 0xDFFF))
299 outStrUtf16.push_back(static_cast<char16_t>(codepoint));
300
301 continue;
302 }
303 else if (codepoint > 0x0010FFFF)
304 continue; // Invalid character (greater than the maximum Unicode value)
305
306 // The input character needs be converted to two UTF-16 elements
307 outStrUtf16.push_back(static_cast<char16_t>(((codepoint - 0x0010000) >> 10) + 0xD800));
308 outStrUtf16.push_back(static_cast<char16_t>(((codepoint - 0x0010000) & 0x3FFUL) + 0xDC00));
309 }
310
311 return outStrUtf16;
312 }
313
315 }
316}
317
319
320#endif // TGUI_UTF_HPP
Namespace that contains all TGUI functions and classes.
Definition AbsoluteOrRelativeValue.hpp:39