Unicode.inl
1 //
3 // SFML - Simple and Fast Multimedia Library
4 // Copyright (C) 2007-2009 Laurent Gomila (laurent.gom@gmail.com)
5 //
6 // This software is provided 'as-is', without any express or implied warranty.
7 // In no event will the authors be held liable for any damages arising from the use of this software.
8 //
9 // Permission is granted to anyone to use this software for any purpose,
10 // including commercial applications, and to alter it and redistribute it freely,
11 // subject to the following restrictions:
12 //
13 // 1. The origin of this software must not be misrepresented;
14 // you must not claim that you wrote the original software.
15 // If you use this software in a product, an acknowledgment
16 // in the product documentation would be appreciated but is not required.
17 //
18 // 2. Altered source versions must be plainly marked as such,
19 // and must not be misrepresented as being the original software.
20 //
21 // 3. This notice may not be removed or altered from any source distribution.
22 //
24 
25 
30 template <typename In, typename Out>
31 inline Out Unicode::UTF32ToANSI(In Begin, In End, Out Output, char Replacement, const std::locale& Locale)
32 {
33  #ifdef __MINGW32__
34 
35  // MinGW has a almost no support for unicode stuff
36  // As a consequence, the MinGW version of this function can only use the default locale
37  // and ignores the one passed as parameter
38  while (Begin < End)
39  {
40  char Char = 0;
41  if (wctomb(&Char, static_cast<wchar_t>(*Begin++)) >= 0)
42  *Output++ = Char;
43  else if (Replacement)
44  *Output++ = Replacement;
45  }
46 
47  #else
48 
49  // Get the facet of the locale which deals with character conversion
50  const std::ctype<wchar_t>& Facet = std::use_facet< std::ctype<wchar_t> >(Locale);
51 
52  // Use the facet to convert each character of the input string
53  while (Begin < End)
54  *Output++ = Facet.narrow(static_cast<wchar_t>(*Begin++), Replacement);
55 
56  #endif
57 
58  return Output;
59 }
60 
61 
66 template <typename In, typename Out>
67 inline Out Unicode::ANSIToUTF32(In Begin, In End, Out Output, const std::locale& Locale)
68 {
69  #ifdef __MINGW32__
70 
71  // MinGW has a almost no support for unicode stuff
72  // As a consequence, the MinGW version of this function can only use the default locale
73  // and ignores the one passed as parameter
74  while (Begin < End)
75  {
76  wchar_t Char = 0;
77  mbtowc(&Char, &*Begin, 1);
78  Begin++;
79  *Output++ = static_cast<Uint32>(Char);
80  }
81 
82  #else
83 
84  // Get the facet of the locale which deals with character conversion
85  const std::ctype<wchar_t>& Facet = std::use_facet< std::ctype<wchar_t> >(Locale);
86 
87  // Use the facet to convert each character of the input string
88  while (Begin < End)
89  *Output++ = static_cast<Uint32>(Facet.widen(*Begin++));
90 
91  #endif
92 
93  return Output;
94 }
95 
96 
101 template <typename In, typename Out>
102 inline Out Unicode::UTF8ToUTF16(In Begin, In End, Out Output, Uint16 Replacement)
103 {
104  while (Begin < End)
105  {
106  Uint32 c = 0;
107  int TrailingBytes = UTF8TrailingBytes[static_cast<int>(*Begin)];
108  if (Begin + TrailingBytes < End)
109  {
110  // First decode the UTF-8 character
111  switch (TrailingBytes)
112  {
113  case 5 : c += *Begin++; c <<= 6;
114  case 4 : c += *Begin++; c <<= 6;
115  case 3 : c += *Begin++; c <<= 6;
116  case 2 : c += *Begin++; c <<= 6;
117  case 1 : c += *Begin++; c <<= 6;
118  case 0 : c += *Begin++;
119  }
120  c -= UTF8Offsets[TrailingBytes];
121 
122  // Then encode it in UTF-16
123  if (c < 0xFFFF)
124  {
125  // Character can be converted directly to 16 bits, just need to check it's in the valid range
126  if ((c >= 0xD800) && (c <= 0xDFFF))
127  {
128  // Invalid character (this range is reserved)
129  if (Replacement)
130  *Output++ = Replacement;
131  }
132  else
133  {
134  // Valid character directly convertible to 16 bits
135  *Output++ = static_cast<Uint16>(c);
136  }
137  }
138  else if (c > 0x0010FFFF)
139  {
140  // Invalid character (greater than the maximum unicode value)
141  if (Replacement)
142  *Output++ = Replacement;
143  }
144  else
145  {
146  // Character will be converted to 2 UTF-16 elements
147  c -= 0x0010000;
148  *Output++ = static_cast<Uint16>((c >> 10) + 0xD800);
149  *Output++ = static_cast<Uint16>((c & 0x3FFUL) + 0xDC00);
150  }
151  }
152  }
153 
154  return Output;
155 }
156 
157 
162 template <typename In, typename Out>
163 inline Out Unicode::UTF8ToUTF32(In Begin, In End, Out Output, Uint32 Replacement)
164 {
165  while (Begin < End)
166  {
167  Uint32 c = 0;
168  int TrailingBytes = UTF8TrailingBytes[static_cast<int>(*Begin)];
169  if (Begin + TrailingBytes < End)
170  {
171  // First decode the UTF-8 character
172  switch (TrailingBytes)
173  {
174  case 5 : c += *Begin++; c <<= 6;
175  case 4 : c += *Begin++; c <<= 6;
176  case 3 : c += *Begin++; c <<= 6;
177  case 2 : c += *Begin++; c <<= 6;
178  case 1 : c += *Begin++; c <<= 6;
179  case 0 : c += *Begin++;
180  }
181  c -= UTF8Offsets[TrailingBytes];
182 
183  // Then write it if valid
184  if ((c < 0xD800) || (c > 0xDFFF))
185  {
186  // Valid UTF-32 character
187  *Output++ = c;
188  }
189  else
190  {
191  // Invalid UTF-32 character
192  if (Replacement)
193  *Output++ = Replacement;
194  }
195  }
196  }
197 
198  return Output;
199 }
200 
201 
206 template <typename In, typename Out>
207 inline Out Unicode::UTF16ToUTF8(In Begin, In End, Out Output, Uint8 Replacement)
208 {
209  while (Begin < End)
210  {
211  Uint32 c = *Begin++;
212 
213  // If it's a surrogate pair, first convert to a single UTF-32 character
214  if ((c >= 0xD800) && (c <= 0xDBFF))
215  {
216  if (Begin < End)
217  {
218  // The second element is valid : convert the two elements to a UTF-32 character
219  Uint32 d = *Begin++;
220  if ((d >= 0xDC00) && (d <= 0xDFFF))
221  c = static_cast<Uint32>(((c - 0xD800) << 10) + (d - 0xDC00) + 0x0010000);
222  }
223  else
224  {
225  // Invalid second element
226  if (Replacement)
227  *Output++ = Replacement;
228  }
229  }
230 
231  // Then convert to UTF-8
232  if (c > 0x0010FFFF)
233  {
234  // Invalid character (greater than the maximum unicode value)
235  if (Replacement)
236  *Output++ = Replacement;
237  }
238  else
239  {
240  // Valid character
241 
242  // Get number of bytes to write
243  int BytesToWrite = 1;
244  if (c < 0x80) BytesToWrite = 1;
245  else if (c < 0x800) BytesToWrite = 2;
246  else if (c < 0x10000) BytesToWrite = 3;
247  else if (c <= 0x0010FFFF) BytesToWrite = 4;
248 
249  // Extract bytes to write
250  Uint8 Bytes[4];
251  switch (BytesToWrite)
252  {
253  case 4 : Bytes[3] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
254  case 3 : Bytes[2] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
255  case 2 : Bytes[1] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
256  case 1 : Bytes[0] = static_cast<Uint8> (c | UTF8FirstBytes[BytesToWrite]);
257  }
258 
259  // Add them to the output
260  const Uint8* CurByte = Bytes;
261  switch (BytesToWrite)
262  {
263  case 4 : *Output++ = *CurByte++;
264  case 3 : *Output++ = *CurByte++;
265  case 2 : *Output++ = *CurByte++;
266  case 1 : *Output++ = *CurByte++;
267  }
268  }
269  }
270 
271  return Output;
272 }
273 
274 
279 template <typename In, typename Out>
280 inline Out Unicode::UTF16ToUTF32(In Begin, In End, Out Output, Uint32 Replacement)
281 {
282  while (Begin < End)
283  {
284  Uint16 c = *Begin++;
285  if ((c >= 0xD800) && (c <= 0xDBFF))
286  {
287  // We have a surrogate pair, ie. a character composed of two elements
288  if (Begin < End)
289  {
290  Uint16 d = *Begin++;
291  if ((d >= 0xDC00) && (d <= 0xDFFF))
292  {
293  // The second element is valid : convert the two elements to a UTF-32 character
294  *Output++ = static_cast<Uint32>(((c - 0xD800) << 10) + (d - 0xDC00) + 0x0010000);
295  }
296  else
297  {
298  // Invalid second element
299  if (Replacement)
300  *Output++ = Replacement;
301  }
302  }
303  }
304  else if ((c >= 0xDC00) && (c <= 0xDFFF))
305  {
306  // Invalid character
307  if (Replacement)
308  *Output++ = Replacement;
309  }
310  else
311  {
312  // Valid character directly convertible to UTF-32
313  *Output++ = static_cast<Uint32>(c);
314  }
315  }
316 
317  return Output;
318 }
319 
320 
325 template <typename In, typename Out>
326 inline Out Unicode::UTF32ToUTF8(In Begin, In End, Out Output, Uint8 Replacement)
327 {
328  while (Begin < End)
329  {
330  Uint32 c = *Begin++;
331  if (c > 0x0010FFFF)
332  {
333  // Invalid character (greater than the maximum unicode value)
334  if (Replacement)
335  *Output++ = Replacement;
336  }
337  else
338  {
339  // Valid character
340 
341  // Get number of bytes to write
342  int BytesToWrite = 1;
343  if (c < 0x80) BytesToWrite = 1;
344  else if (c < 0x800) BytesToWrite = 2;
345  else if (c < 0x10000) BytesToWrite = 3;
346  else if (c <= 0x0010FFFF) BytesToWrite = 4;
347 
348  // Extract bytes to write
349  Uint8 Bytes[4];
350  switch (BytesToWrite)
351  {
352  case 4 : Bytes[3] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
353  case 3 : Bytes[2] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
354  case 2 : Bytes[1] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
355  case 1 : Bytes[0] = static_cast<Uint8> (c | UTF8FirstBytes[BytesToWrite]);
356  }
357 
358  // Add them to the output
359  const Uint8* CurByte = Bytes;
360  switch (BytesToWrite)
361  {
362  case 4 : *Output++ = *CurByte++;
363  case 3 : *Output++ = *CurByte++;
364  case 2 : *Output++ = *CurByte++;
365  case 1 : *Output++ = *CurByte++;
366  }
367  }
368  }
369 
370  return Output;
371 }
372 
373 
378 template <typename In, typename Out>
379 inline Out Unicode::UTF32ToUTF16(In Begin, In End, Out Output, Uint16 Replacement)
380 {
381  while (Begin < End)
382  {
383  Uint32 c = *Begin++;
384  if (c < 0xFFFF)
385  {
386  // Character can be converted directly to 16 bits, just need to check it's in the valid range
387  if ((c >= 0xD800) && (c <= 0xDFFF))
388  {
389  // Invalid character (this range is reserved)
390  if (Replacement)
391  *Output++ = Replacement;
392  }
393  else
394  {
395  // Valid character directly convertible to 16 bits
396  *Output++ = static_cast<Uint16>(c);
397  }
398  }
399  else if (c > 0x0010FFFF)
400  {
401  // Invalid character (greater than the maximum unicode value)
402  if (Replacement)
403  *Output++ = Replacement;
404  }
405  else
406  {
407  // Character will be converted to 2 UTF-16 elements
408  c -= 0x0010000;
409  *Output++ = static_cast<Uint16>((c >> 10) + 0xD800);
410  *Output++ = static_cast<Uint16>((c & 0x3FFUL) + 0xDC00);
411  }
412  }
413 
414  return Output;
415 }
416 
417 
421 template <typename In>
422 inline std::size_t Unicode::GetUTF8Length(In Begin, In End)
423 {
424  std::size_t Length = 0;
425  while (Begin < End)
426  {
427  int NbBytes = UTF8TrailingBytes[static_cast<int>(*Begin)];
428  if (Begin + NbBytes < End)
429  ++Length;
430 
431  Begin += NbBytes + 1;
432  }
433 
434  return Length;
435 }
436 
437 
441 template <typename In>
442 inline std::size_t Unicode::GetUTF16Length(In Begin, In End)
443 {
444  std::size_t Length = 0;
445  while (Begin < End)
446  {
447  if ((*Begin >= 0xD800) && (*Begin <= 0xDBFF))
448  {
449  ++Begin;
450  if ((Begin < End) && ((*Begin >= 0xDC00) && (*Begin <= 0xDFFF)))
451  {
452  ++Length;
453  }
454  }
455  else
456  {
457  ++Length;
458  }
459 
460  ++Begin;
461  }
462 
463  return Length;
464 }
465 
466 
470 template <typename In>
471 inline std::size_t Unicode::GetUTF32Length(In Begin, In End)
472 {
473  return End - Begin;
474 }