TGUI  0.9-dev
Utf.hpp
1 //
3 // TGUI - Texus' Graphical User Interface
4 // Copyright (C) 2012-2020 Bruno Van de Velde (vdv_b@tgui.eu)
5 //
6 // This software is provided 'as-is', without any express or implied warranty.
7 // In no event will the authors be held liable for any damages arising from the use of this software.
8 //
9 // Permission is granted to anyone to use this software for any purpose,
10 // including commercial applications, and to alter it and redistribute it freely,
11 // subject to the following restrictions:
12 //
13 // 1. The origin of this software must not be misrepresented;
14 // you must not claim that you wrote the original software.
15 // If you use this software in a product, an acknowledgment
16 // in the product documentation would be appreciated but is not required.
17 //
18 // 2. Altered source versions must be plainly marked as such,
19 // and must not be misrepresented as being the original software.
20 //
21 // 3. This notice may not be removed or altered from any source distribution.
22 //
24 
25 
26 #ifndef TGUI_UTF_HPP
27 #define TGUI_UTF_HPP
28 
29 #include <TGUI/Config.hpp>
30 #include <string>
31 #include <array>
32 
34 
35 // Disable warning in Visual Studio about being able to use "if constexpr".
36 // The code would use "if constexpr" if the compiler would just define __cpp_if_constexpr
37 #if defined TGUI_SYSTEM_WINDOWS && defined _MSC_VER
38  #pragma warning(push)
39  #pragma warning(disable:4127)
40 #endif
41 
42 namespace tgui
43 {
44  namespace utf
45  {
/// @brief Encodes a single UTF-32 codepoint as UTF-8 and appends the resulting bytes to a string
///
/// @param input       Codepoint to encode
/// @param outStrUtf8  String to which the encoded bytes (1 to 4 of them) are appended
///
/// Nothing is appended when the codepoint is not a Unicode scalar value, i.e. when it is larger
/// than U+10FFFF or when it lies anywhere inside the surrogate range U+D800..U+DFFF.
template <typename CharT> // CharT is either char or char8_t
void encodeCharUtf8(char32_t input, std::basic_string<CharT>& outStrUtf8)
{
    // ASCII characters are stored as a single identical byte
    if (input < 0x80)
    {
        outStrUtf8.push_back(static_cast<CharT>(input));
        return;
    }

    // Refuse codepoints that cannot be represented in well-formed UTF-8.
    // Both the high (D800-DBFF) and the low (DC00-DFFF) surrogates have to be rejected.
    if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDFFF)))
        return;

    // Get the number of bytes to write (the upper bound was already validated above)
    std::size_t bytesToWrite;
    if (input < 0x800)
        bytesToWrite = 2;
    else if (input < 0x10000)
        bytesToWrite = 3;
    else
        bytesToWrite = 4;

    // Marker bits of the leading byte, indexed by the total length of the sequence
    static const std::uint8_t firstByteMask[5] = { 0, 0, 0xC0, 0xE0, 0xF0 };

    // Fill the continuation bytes from last to first, each one holds 6 payload bits (10xxxxxx)
    std::array<CharT, 4> bytes{};
    for (std::size_t i = bytesToWrite - 1; i > 0; --i)
    {
        bytes[i] = static_cast<CharT>((input | 0x80) & 0xBF);
        input >>= 6;
    }

    // The remaining bits go into the leading byte together with the length marker
    bytes[0] = static_cast<CharT>(input | firstByteMask[bytesToWrite]);

    // Add them to the output
    outStrUtf8.append(bytes.data(), bytesToWrite);
}
87 
88 
/// @brief Decodes one character from a UTF-8 sequence and appends it to a UTF-32 string
///
/// @param inputCharIt  Iterator to the first byte of the character, must be dereferenceable
/// @param inputEndIt   Iterator behind the last byte of the input
/// @param outStrUtf32  String to which the decoded codepoint is appended
///
/// @return Iterator behind the bytes that were consumed, or inputEndIt when the input stops
///         before the sequence announced by the leading byte is complete (nothing is appended then)
///
/// The continuation bytes are accumulated as they are, they are not checked for well-formedness.
template <typename CharIt> // CharIt is an iterator for a string containing either char or char8_t
CharIt decodeCharUtf8(CharIt inputCharIt, CharIt inputEndIt, std::u32string& outStrUtf32)
{
    const std::uint8_t leadByte = static_cast<std::uint8_t>(*inputCharIt);

    // Single byte (ASCII) characters are copied unchanged
    if (leadByte < 128)
    {
        outStrUtf32.push_back(static_cast<char32_t>(leadByte));
        ++inputCharIt;
        return inputCharIt;
    }

    // Number of bytes that follow the leading byte, derived from the value of the leading byte
    std::uint8_t followingBytes;
    if (leadByte < 0xC0)
        followingBytes = 0;
    else if (leadByte < 0xE0)
        followingBytes = 1;
    else if (leadByte < 0xF0)
        followingBytes = 2;
    else if (leadByte < 0xF8)
        followingBytes = 3;
    else if (leadByte < 0xFC)
        followingBytes = 4;
    else
        followingBytes = 5;

    // Incomplete character: skip everything that is left
    if (!(inputCharIt + followingBytes < inputEndIt))
        return inputEndIt;

    // Accumulated marker bits of all bytes in a sequence, indexed by the amount of following bytes
    static const std::uint32_t markerSums[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
    const std::uint32_t markerSum = markerSums[followingBytes];

    // Sum up the raw bytes, making room for 6 more bits after each byte except the last one
    char32_t codepoint = 0;
    for (std::uint8_t remaining = followingBytes; remaining != 0; --remaining)
    {
        codepoint += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt));
        ++inputCharIt;
        codepoint <<= 6;
    }
    codepoint += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt));
    ++inputCharIt;

    // Remove the marker bits again, which leaves only the payload
    codepoint -= markerSum;
    outStrUtf32.push_back(codepoint);

    return inputCharIt;
}
137 
138 
#if defined(__cpp_lib_char8_t) && (__cpp_lib_char8_t >= 201811L)
/// @brief Converts a UTF-32 string into a UTF-8 string that is stored in an std::u8string
///
/// @param strUtf32  String to convert
///
/// @return UTF-8 encoded string, built by passing each codepoint to encodeCharUtf8
///
/// Only available when the standard library provides char8_t support.
inline std::u8string convertUtf32toUtf8(const std::u32string& strUtf32)
{
    std::u8string encoded;

    // Every codepoint needs at least one byte, so this is a lower bound on the final size
    encoded.reserve(strUtf32.length() + 1);

    for (auto it = strUtf32.begin(); it != strUtf32.end(); ++it)
        encodeCharUtf8(*it, encoded);

    return encoded;
}
#endif
155 
/// @brief Converts a range of UTF-8 bytes into a UTF-32 string
///
/// @param inputBegin  Iterator to the first byte of the UTF-8 input
/// @param inputEnd    Iterator behind the last byte of the UTF-8 input
///
/// @return String with the codepoints produced by calling decodeCharUtf8 until the end is reached
template <typename CharIt>
std::u32string convertUtf8toUtf32(CharIt inputBegin, CharIt inputEnd)
{
    std::u32string decoded;

    // There can't be more codepoints than there are input bytes
    decoded.reserve((inputEnd - inputBegin) + 1);

    // Each call consumes one character and tells us where the next one starts
    for (auto cursor = inputBegin; cursor < inputEnd; )
        cursor = decodeCharUtf8(cursor, inputEnd, decoded);

    return decoded;
}
174 
175 
/// @brief Converts a range of UTF-16 code units into a UTF-32 string
///
/// @param inputBegin  Iterator to the first UTF-16 code unit
/// @param inputEnd    Iterator behind the last UTF-16 code unit
///
/// @return String containing the decoded codepoints
///
/// Code units outside the surrogate range are copied as they are and valid surrogate pairs are
/// combined into a single codepoint. Unpaired surrogates (a high surrogate that isn't followed by
/// a low surrogate, or a low surrogate that isn't preceded by a high one) are dropped without
/// affecting the code units around them.
template <typename U16CharIt>
std::u32string convertUtf16toUtf32(U16CharIt inputBegin, U16CharIt inputEnd)
{
    std::u32string outStrUtf32;
    outStrUtf32.reserve((inputEnd - inputBegin) + 1);

    auto it = inputBegin;
    while (it < inputEnd)
    {
        const char16_t first = *it;
        ++it;

        // Copy the character if it isn't part of a surrogate pair
        if ((first < 0xD800) || (first > 0xDFFF))
        {
            outStrUtf32.push_back(static_cast<char32_t>(first));
            continue;
        }

        // A low surrogate that doesn't follow a high surrogate isn't a character, so skip it
        if (first >= 0xDC00)
            continue;

        // We have a high surrogate, so we need to read another code unit
        if (it == inputEnd)
            break;

        // Only consume the next code unit when it really completes the pair. Otherwise the high
        // surrogate is dropped and the next code unit gets processed normally in the next iteration.
        const char16_t second = *it;
        if ((second < 0xDC00) || (second > 0xDFFF))
            continue;

        ++it;
        outStrUtf32.push_back(((static_cast<char32_t>(first) - 0xD800) << 10) + (static_cast<char32_t>(second) - 0xDC00) + 0x0010000);
    }

    return outStrUtf32;
}
212 
213 
/// @brief Converts a range of wide characters into a UTF-32 string
///
/// @param inputBegin  Iterator to the first wide character
/// @param inputEnd    Iterator behind the last wide character
///
/// @return String in which every element is the corresponding wide character cast to char32_t
///
/// The wide characters are treated as UCS-2 (Windows) or UCS-4 (unix) values, which is why each
/// element is converted on its own and no elements are ever combined.
template <typename WCharIt>
std::u32string convertWidetoUtf32(WCharIt inputBegin, WCharIt inputEnd)
{
    std::u32string converted;
    converted.reserve((inputEnd - inputBegin) + 1);

    auto cursor = inputBegin;
    while (cursor != inputEnd)
    {
        converted.push_back(static_cast<char32_t>(*cursor));
        ++cursor;
    }

    return converted;
}
232 
233 
/// @brief Converts a UTF-32 string into a Latin-1 (ISO-8859-1) encoded std::string
///
/// @param strUtf32  String to convert
///
/// @return String with one byte per codepoint below 256, all other codepoints are left out
inline std::string convertUtf32toLatin1(const std::u32string& strUtf32)
{
    std::string latin1;
    latin1.reserve(strUtf32.length() + 1);

    for (std::size_t i = 0; i < strUtf32.length(); ++i)
    {
        const char32_t current = strUtf32[i];

        // Characters that don't fit in a single byte can't be represented
        if (current >= 256)
            continue;

        latin1.push_back(static_cast<char>(current));
    }

    return latin1;
}
251 
252 
258  inline std::string convertUtf32toStdStringUtf8(const std::u32string& strUtf32)
259  {
260  std::string outStrUtf8;
261  outStrUtf8.reserve(strUtf32.length() + 1);
262  for (const char32_t codepoint : strUtf32)
263  encodeCharUtf8(codepoint, outStrUtf8);
264 
265  return outStrUtf8;
266  }
267 
268 
/// @brief Converts a UTF-32 string into a wide string
///
/// @param strUtf32  String to convert
///
/// @return Wide string holding the characters of the input that fit in a single wchar_t
///
/// When wchar_t is 4 bytes wide every codepoint is copied. Otherwise wide characters are treated
/// as UCS-2: surrogate values and codepoints above U+FFFF are left out of the result.
inline std::wstring convertUtf32toWide(const std::u32string& strUtf32)
{
    std::wstring wide;
    wide.reserve(strUtf32.length() + 1);

#if defined(__cpp_if_constexpr) && (__cpp_if_constexpr >= 201606L)
    if constexpr (sizeof(wchar_t) != 4)
#else
    if (sizeof(wchar_t) != 4)
#endif
    {
        // On Windows, wide characters are UCS-2. Anything that doesn't fit within a single
        // wide character is simply dropped here.
        for (const char32_t current : strUtf32)
        {
            const bool isSurrogate = (current >= 0xD800) && (current <= 0xDFFF);
            if (isSurrogate || (current > 0xFFFF))
                continue;

            wide.push_back(static_cast<wchar_t>(current));
        }
    }
    else
    {
        // On Unix, wide characters are UCS-4 so every codepoint can be copied directly
        for (const char32_t current : strUtf32)
            wide.push_back(static_cast<wchar_t>(current));
    }

    return wide;
}
301 
302 
/// @brief Converts a UTF-32 string into a UTF-16 string
///
/// @param strUtf32  String to convert
///
/// @return UTF-16 encoded string
///
/// Codepoints up to U+FFFF are stored in one element and larger ones in a surrogate pair.
/// Codepoints inside the surrogate range U+D800..U+DFFF or above U+10FFFF are left out.
inline std::u16string convertUtf32toUtf16(const std::u32string& strUtf32)
{
    std::u16string encoded;
    encoded.reserve(strUtf32.length() + 1);

    for (const char32_t current : strUtf32)
    {
        // Invalid character (greater than the maximum Unicode value)
        if (current > 0x0010FFFF)
            continue;

        if (current > 0xFFFF)
        {
            // The character has to be split over two UTF-16 elements: the upper 10 bits of the
            // value above 0x10000 go in the high surrogate, the lower 10 bits in the low surrogate
            const char32_t reduced = current - 0x0010000;
            encoded.push_back(static_cast<char16_t>((reduced >> 10) + 0xD800));
            encoded.push_back(static_cast<char16_t>((reduced & 0x3FFUL) + 0xDC00));
            continue;
        }

        // The value fits in a single element, but it is only a valid character when
        // it doesn't lie in the range that is reserved for surrogates
        const bool isSurrogate = (current >= 0xD800) && (current <= 0xDFFF);
        if (!isSurrogate)
            encoded.push_back(static_cast<char16_t>(current));
    }

    return encoded;
}
333 
335  }
336 }
337 
338 #if defined TGUI_SYSTEM_WINDOWS && defined _MSC_VER
339  #pragma warning(pop)
340 #endif
341 
343 
344 #endif // TGUI_UTF_HPP
Namespace that contains all TGUI functions and classes.
Definition: AbsoluteOrRelativeValue.hpp:35