TGUI  0.8.9
Utf.hpp
1 //
3 // TGUI - Texus' Graphical User Interface
4 // Copyright (C) 2012-2020 Bruno Van de Velde (vdv_b@tgui.eu)
5 //
6 // This software is provided 'as-is', without any express or implied warranty.
7 // In no event will the authors be held liable for any damages arising from the use of this software.
8 //
9 // Permission is granted to anyone to use this software for any purpose,
10 // including commercial applications, and to alter it and redistribute it freely,
11 // subject to the following restrictions:
12 //
13 // 1. The origin of this software must not be misrepresented;
14 // you must not claim that you wrote the original software.
15 // If you use this software in a product, an acknowledgment
16 // in the product documentation would be appreciated but is not required.
17 //
18 // 2. Altered source versions must be plainly marked as such,
19 // and must not be misrepresented as being the original software.
20 //
21 // 3. This notice may not be removed or altered from any source distribution.
22 //
24 
25 
26 #ifndef TGUI_UTF_HPP
27 #define TGUI_UTF_HPP
28 
#include <array>
#include <cstddef>
#include <cstdint>
#include <string>
31 
33 
34 namespace tgui
35 {
36  namespace utf
37  {
43  template <typename CharT> // CharT is either char or char8_t
44  void encodeCharUtf8(char32_t input, std::basic_string<CharT>& outStrUtf8)
45  {
46  if (input < 128)
47  {
48  outStrUtf8.push_back(static_cast<CharT>(input));
49  return;
50  }
51 
52  // Encode the character (if it is valid)
53  if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF)))
54  return;
55 
56  // Get the number of bytes to write
57  std::size_t bytestoWrite;
58  if (input < 0x800)
59  bytestoWrite = 2;
60  else if (input < 0x10000)
61  bytestoWrite = 3;
62  else if (input <= 0x0010FFFF)
63  bytestoWrite = 4;
64  else
65  return;
66 
67  static const std::uint8_t firstByteMask[5] = { 0, 0, 0xC0, 0xE0, 0xF0 };
68 
69  // Extract the bytes to write
70  std::array<CharT, 4> bytes;
71  if (bytestoWrite == 4) { bytes[3] = static_cast<CharT>((input | 0x80) & 0xBF); input >>= 6; }
72  if (bytestoWrite >= 3) { bytes[2] = static_cast<CharT>((input | 0x80) & 0xBF); input >>= 6; }
73  if (bytestoWrite >= 2) { bytes[1] = static_cast<CharT>((input | 0x80) & 0xBF); input >>= 6; }
74  if (bytestoWrite >= 1) { bytes[0] = static_cast<CharT>(input | firstByteMask[bytestoWrite]); }
75 
76  // Add them to the output
77  outStrUtf8.append(bytes.begin(), bytes.begin() + bytestoWrite);
78  }
79 
80 
88  template <typename CharIt> // CharIt is an iterator for a string containing either char or char8_t
89  CharIt decodeCharUtf8(CharIt inputCharIt, CharIt inputEndIt, std::u32string& outStrUtf32)
90  {
91  if (static_cast<std::uint8_t>(*inputCharIt) < 128)
92  {
93  outStrUtf32.push_back(static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt)));
94  return ++inputCharIt;
95  }
96 
97  // Some useful precomputed data
98  static const std::uint32_t offsetsMap[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
99  static const std::uint8_t trailingMap[128] =
100  {
101  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
104  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
105  };
106 
107  // decode the character
108  std::uint8_t trailingBytes = trailingMap[static_cast<std::uint8_t>(*inputCharIt) - 128];
109  const std::uint32_t offset = offsetsMap[trailingBytes];
110  if (inputCharIt + trailingBytes < inputEndIt)
111  {
112  char32_t outputChar = 0;
113  while (trailingBytes > 0)
114  {
115  outputChar += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt++));
116  outputChar <<= 6;
117  --trailingBytes;
118  }
119 
120  outputChar += static_cast<char32_t>(static_cast<std::uint8_t>(*inputCharIt++));
121  outputChar -= offset;
122  outStrUtf32.push_back(outputChar);
123  }
124  else // Incomplete character
125  inputCharIt = inputEndIt;
126 
127  return inputCharIt;
128  }
129 
130 
131 #if defined(__cpp_lib_char8_t) && (__cpp_lib_char8_t >= 201811L)
137  inline std::u8string convertUtf32toUtf8(const std::u32string& strUtf32)
138  {
139  std::u8string outStrUtf8;
140  outStrUtf8.reserve(strUtf32.length() + 1);
141  for (const char32_t& codepoint : strUtf32)
142  encodeCharUtf8(codepoint, outStrUtf8);
143 
144  return outStrUtf8;
145  }
146 #endif
147 
154  template <typename CharIt>
155  std::u32string convertUtf8toUtf32(CharIt inputBegin, CharIt inputEnd)
156  {
157  std::u32string outStrUtf32;
158  outStrUtf32.reserve((inputEnd - inputBegin) + 1);
159 
160  auto it = inputBegin;
161  while (it < inputEnd)
162  it = decodeCharUtf8(it, inputEnd, outStrUtf32);
163 
164  return outStrUtf32;
165  }
166 
167 
174  template <typename U16CharIt>
175  std::u32string convertUtf16toUtf32(U16CharIt inputBegin, U16CharIt inputEnd)
176  {
177  std::u32string outStrUtf32;
178  outStrUtf32.reserve((inputEnd - inputBegin) + 1);
179 
180  auto it = inputBegin;
181  while (it < inputEnd)
182  {
183  const char16_t first = *it++;
184 
185  // Copy the character if it isn't a surrogate pair
186  if ((first < 0xD800) || (first > 0xDBFF))
187  {
188  outStrUtf32.push_back(static_cast<char32_t>(first));
189  continue;
190  }
191 
192  // We need to read another character
193  if (it == inputEnd)
194  break;
195 
196  const char16_t second = *it++;
197  if ((second >= 0xDC00) && (second <= 0xDFFF))
198  outStrUtf32.push_back(((static_cast<char32_t>(first) - 0xD800) << 10) + (static_cast<char32_t>(second) - 0xDC00) + 0x0010000);
199  }
200 
201 
202  return outStrUtf32;
203  }
204 
205 
211  template <typename WCharIt>
212  std::u32string convertWidetoUtf32(WCharIt inputBegin, WCharIt inputEnd)
213  {
214  std::u32string outStrUtf32;
215  outStrUtf32.reserve((inputEnd - inputBegin) + 1);
216 
217  // std::wstring uses UCS-2 on Windows and UCS-4 on unix, so we can be cast directly
218  for (auto it = inputBegin; it != inputEnd; ++it)
219  outStrUtf32.push_back(static_cast<char32_t>(*it));
220 
221 
222  return outStrUtf32;
223  }
224 
225 
231  inline std::string convertUtf32toLatin1(const std::u32string& strUtf32)
232  {
233  std::string outStr;
234  outStr.reserve(strUtf32.length() + 1);
235  for (const char32_t codepoint : strUtf32)
236  {
237  if (codepoint < 256)
238  outStr.push_back(static_cast<char>(codepoint));
239  }
240 
241  return outStr;
242  }
243 
244 
250  inline std::string convertUtf32toStdStringUtf8(const std::u32string& strUtf32)
251  {
252  std::string outStrUtf8;
253  outStrUtf8.reserve(strUtf32.length() + 1);
254  for (const char32_t codepoint : strUtf32)
255  encodeCharUtf8(codepoint, outStrUtf8);
256 
257  return outStrUtf8;
258  }
259 
260 
266  inline std::wstring convertUtf32toWide(const std::u32string& strUtf32)
267  {
268  std::wstring outStr;
269  outStr.reserve(strUtf32.length() + 1);
270 
271 
272 #if defined(__cpp_if_constexpr) && (__cpp_if_constexpr >= 201606L)
273  if constexpr (sizeof(wchar_t) == 4)
274 #else
275  #if defined TGUI_SYSTEM_WINDOWS && defined _MSC_VER
276  #pragma warning(push)
277  #pragma warning(disable:4127)
278  #endif
279  if (sizeof(wchar_t) == 4)
280  #if defined TGUI_SYSTEM_WINDOWS && defined _MSC_VER
281  #pragma warning(pop)
282  #endif
283 #endif
284  {
285  // On Unix, wide characters are UCS-4 and we can just copy the characters
286  for (const char32_t codepoint : strUtf32)
287  outStr.push_back(static_cast<wchar_t>(codepoint));
288  }
289  else
290  {
291  // On Windows, wide characters are UCS-2. We just drop the characters that don't fit within a single wide character here.
292  for (const char32_t codepoint : strUtf32)
293  {
294  if ((codepoint < 0xD800) || ((codepoint > 0xDFFF) && (codepoint <= 0xFFFF)))
295  outStr.push_back(static_cast<wchar_t>(codepoint));
296  }
297  }
298 
299  return outStr;
300  }
301 
302 
308  inline std::u16string convertUtf32toUtf16(const std::u32string& strUtf32)
309  {
310  std::u16string outStrUtf16;
311  outStrUtf16.reserve(strUtf32.length() + 1);
312 
313  for (const char32_t codepoint : strUtf32)
314  {
315  // If the codepoint fitst inside 2 bytes and it would represent a valid character then just copy it
316  if (codepoint <= 0xFFFF)
317  {
318  if ((codepoint < 0xD800) || (codepoint > 0xDFFF))
319  outStrUtf16.push_back(static_cast<char16_t>(codepoint));
320 
321  continue;
322  }
323  else if (codepoint > 0x0010FFFF)
324  continue; // Invalid character (greater than the maximum Unicode value)
325 
326  // The input character needs be converted to two UTF-16 elements
327  outStrUtf16.push_back(static_cast<char16_t>(((codepoint - 0x0010000) >> 10) + 0xD800));
328  outStrUtf16.push_back(static_cast<char16_t>(((codepoint - 0x0010000) & 0x3FFUL) + 0xDC00));
329  }
330 
331  return outStrUtf16;
332  }
333 
335  }
336 }
337 
339 
340 #endif // TGUI_UTF_HPP
Namespace that contains all TGUI functions and classes.
Definition: AbsoluteOrRelativeValue.hpp:37