cprover
unicode.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module:
4 
5 Author: Daniel Kroening, kroening@kroening.com
6 
7 \*******************************************************************/
8 
9 #include "unicode.h"
10 
11 #include <codecvt>
12 #include <cstdint>
13 #include <cstring>
14 #include <iomanip>
15 #include <locale>
16 #include <sstream>
17 
18 #include "invariant.h"
19 
20 #ifdef _WIN32
21 # include <util/pragma_push.def>
22 # ifdef _MSC_VER
23 # pragma warning(disable : 4668)
24 // using #if/#elif on undefined macro
25 # pragma warning(disable : 5039)
26 // pointer or reference to potentially throwing function passed to extern C
27 # endif
28 # include <util/pragma_pop.def>
29 # include <windows.h>
30 #endif
31 
32 static void utf8_append_code(unsigned int c, std::string &);
33 
34 std::string narrow(const wchar_t *s)
35 {
36 #ifdef _WIN32
37 
38  int slength = static_cast<int>(wcslen(s));
39  int rlength =
40  WideCharToMultiByte(CP_UTF8, 0, s, slength, NULL, 0, NULL, NULL);
41  std::string r(rlength, 0);
42  WideCharToMultiByte(CP_UTF8, 0, s, slength, &r[0], rlength, NULL, NULL);
43  return r;
44 
45 #else
46  return narrow(std::wstring(s));
47 #endif
48 }
49 
50 std::wstring widen(const char *s)
51 {
52 #ifdef _WIN32
53 
54  int slength = static_cast<int>(strlen(s));
55  int rlength = MultiByteToWideChar(CP_UTF8, 0, s, slength, NULL, 0);
56  std::wstring r(rlength, 0);
57  MultiByteToWideChar(CP_UTF8, 0, s, slength, &r[0], rlength);
58  return r;
59 
60 #else
61  return widen(std::string(s));
62 #endif
63 }
64 
65 std::string narrow(const std::wstring &s)
66 {
67 #ifdef _WIN32
68 
69  int slength = static_cast<int>(s.size());
70  int rlength =
71  WideCharToMultiByte(CP_UTF8, 0, &s[0], slength, NULL, 0, NULL, NULL);
72  std::string r(rlength, 0);
73  WideCharToMultiByte(CP_UTF8, 0, &s[0], slength, &r[0], rlength, NULL, NULL);
74  return r;
75 
76 #else
77  std::string result;
78 
79  result.reserve(s.size()); // at least that long
80 
81  for(const auto codepoint : s)
82  utf8_append_code(codepoint, result);
83 
84  return result;
85 #endif
86 }
87 
88 std::wstring widen(const std::string &s)
89 {
90 #ifdef _WIN32
91 
92  int slength = static_cast<int>(s.size());
93  int rlength = MultiByteToWideChar(CP_UTF8, 0, &s[0], slength, NULL, 0);
94  std::wstring r(rlength, 0);
95  MultiByteToWideChar(CP_UTF8, 0, &s[0], slength, &r[0], rlength);
96  return r;
97 
98 #else
99  auto utf32 = utf8_to_utf32(std::string(s));
100 
101  std::wstring r;
102  r.reserve(utf32.size());
103  for(auto codepoint : utf32)
104  r += codepoint;
105  return r;
106 #endif
107 }
108 
/// Appends a unicode character to a utf8-encoded string.
/// The number of bytes written (one to four) depends on the magnitude of
/// \p c; the value is not checked for being a valid code point.
/// \param c: the character to append
/// \param result: the string the encoded bytes are appended to
static void utf8_append_code(unsigned int c, std::string &result)
{
  // Builds a continuation byte (bit pattern 10xxxxxx) from the six bits of
  // c that start at bit position `shift`.
  const auto continuation_byte = [c](unsigned int shift) {
    return static_cast<char>(((c >> shift) & 0x3f) | 0x80);
  };

  if(c <= 0x7f)
  {
    // single byte, value stored verbatim
    result.push_back(static_cast<char>(c));
    return;
  }

  if(c <= 0x7ff)
  {
    // two bytes: 110xxxxx 10xxxxxx
    result.push_back(static_cast<char>((c >> 6) | 0xc0));
    result.push_back(continuation_byte(0));
    return;
  }

  if(c <= 0xffff)
  {
    // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    result.push_back(static_cast<char>((c >> 12) | 0xe0));
    result.push_back(continuation_byte(6));
    result.push_back(continuation_byte(0));
    return;
  }

  // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  result.push_back(static_cast<char>((c >> 18) | 0xf0));
  result.push_back(continuation_byte(12));
  result.push_back(continuation_byte(6));
  result.push_back(continuation_byte(0));
}
134 
137 std::string
138 utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s)
139 {
140  std::string result;
141 
142  result.reserve(s.size()); // at least that long
143 
144  for(const auto c : s)
145  utf8_append_code(c, result);
146 
147  return result;
148 }
149 
150 std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
151 {
152  if(argv_wide == nullptr)
153  return std::vector<std::string>();
154 
155  std::vector<std::string> argv_narrow;
156  argv_narrow.reserve(argc);
157 
158  for(int i = 0; i != argc; ++i)
159  argv_narrow.push_back(narrow(argv_wide[i]));
160 
161  return argv_narrow;
162 }
163 
/// Appends a unicode character to a UTF-16 encoded wide string.
/// Values up to and including 0xFFFF are stored as a single UTF-16 unit;
/// larger values are stored as a high/low surrogate pair.
/// \param code: the character to append
/// \param result: the string the UTF-16 unit(s) are appended to
static void utf16_append_code(unsigned int code, std::wstring &result)
{
  // we do not treat 0xD800 to 0xDFFF, although
  // they are not valid unicode symbols

  // Note the inclusive bound: 0xFFFF still fits into one UTF-16 unit. Sending
  // it to the surrogate branch would make `code - 0x10000` wrap around.
  if(code <= 0xFFFF)
  {
    // code is encoded as one UTF16 character
    result += static_cast<wchar_t>(code);
  }
  else // code is encoded as two UTF16 characters
  {
    // if this is valid unicode, we have
    // code<=0x10FFFF
    // but let's not check it programmatically

    // encode the code in UTF16: the upper ten bits of the offset go into the
    // high surrogate, the lower ten bits into the low surrogate
    code = code - 0x10000;
    const uint16_t high_surrogate =
      static_cast<uint16_t>(((code >> 10) & 0x3ff) | 0xD800);
    result += static_cast<wchar_t>(high_surrogate);
    const uint16_t low_surrogate =
      static_cast<uint16_t>((code & 0x3ff) | 0xDC00);
    result += static_cast<wchar_t>(low_surrogate);
  }
}
188 
193 std::wstring utf8_to_utf16_native_endian(const std::string &in)
194 {
195  std::wstring result;
196  result.reserve(in.size());
197 
198  for(auto codepoint : utf8_to_utf32(in))
199  utf16_append_code(codepoint, result);
200 
201  return result;
202 }
203 
/// Convert UTF8-encoded string to UTF-32 with architecture-native endianness.
/// A lead byte that is too large, or whose multi-byte sequence is cut off by
/// the end of the input, is replaced by a space (code 32). Continuation
/// bytes are not checked for the 10xxxxxx bit pattern.
/// \param utf8_str: UTF-8 encoded input string
/// \return one UTF-32 element per decoded (or replaced) character
std::u32string utf8_to_utf32(const std::string &utf8_str)
{
  std::u32string result;
  result.reserve(utf8_str.size());
  // index of the next unread byte of utf8_str
  std::size_t i = 0;
  while(i < utf8_str.size())
  {
    unsigned char c = utf8_str[i++];
    char32_t code = 0;
    // the ifs that follow find out how many UTF8 characters (1-4) store the
    // next unicode character. This is determined by the few most
    // significant bits.
    if(c <= 0x7F)
    {
      // if it's one character, then code is exactly the value
      code = c;
    }
    else if(c <= 0xDF && i < utf8_str.size())
    { // in other cases, we need to read the right number of chars and decode
      // note: if we wanted to make sure that we capture incorrect strings,
      // we should check that whatever follows first character starts with
      // bits 10.
      code = (c & 0x1Fu) << 6;
      c = utf8_str[i++];
      code += c & 0x3Fu;
    }
    else if(c <= 0xEF && i + 1 < utf8_str.size())
    {
      // three-byte sequence: two more bytes must be available
      code = (c & 0xFu) << 12;
      c = utf8_str[i++];
      code += (c & 0x3Fu) << 6;
      c = utf8_str[i++];
      code += c & 0x3Fu;
    }
    else if(c <= 0xF7 && i + 2 < utf8_str.size())
    {
      // four-byte sequence: three more bytes must be available
      code = (c & 0x7u) << 18;
      c = utf8_str[i++];
      code += (c & 0x3Fu) << 12;
      c = utf8_str[i++];
      code += (c & 0x3Fu) << 6;
      c = utf8_str[i++];
      code += c & 0x3Fu;
    }
    else
    {
      // The string is not a valid UTF8 string! Either it has some characters
      // missing from a multi-character unicode symbol, or it has a char with
      // too high value.
      // For now, let's replace the character with a space
      code = 32;
    }

    result.append(1, code);
  }

  return result;
}
265 
/// Escapes non-printable characters, whitespace except for spaces, double
/// quotes and backslashes.
/// \param ch: UTF-16 character in architecture-native endianness encoding
/// \param result: stream to receive string in US-ASCII format, with \\uxxxx
///   escapes for other characters
/// \param loc: locale to check for printable characters
static void utf16_native_endian_to_java_string(
  const wchar_t ch,
  std::ostringstream &result,
  const std::locale &loc)
{
  // \u unicode characters are translated very early by the Java compiler and so
  // \u000a or \u000d would become a newline character in a char constant, which
  // is illegal. Instead use \n or \r.
  if(ch == '\n')
    result << "\\n";
  else if(ch == '\r')
    result << "\\r";
  // \f, \b and \t do not need to be escaped, but this will improve readability
  // of generated tests.
  else if(ch == '\f')
    result << "\\f";
  else if(ch == '\b')
    result << "\\b";
  else if(ch == '\t')
    result << "\\t";
  else if(ch <= 255 && isprint(ch, loc))
  {
    const auto uch = static_cast<unsigned char>(ch);
    // ", and \ need to be escaped, but not ' for java strings
    // e.g. "\"\\" needs escaping but "'" does not.
    if(uch == '"' || uch == '\\')
      result << '\\';
    result << uch;
  }
  else
  {
    // Format ch as a hexadecimal unicode character padded to four digits with
    // zeros.
    result << "\\u" << std::hex << std::setw(4) << std::setfill('0')
           << static_cast<unsigned int>(ch);
  }
}
311 
319  const wchar_t ch,
320  std::ostringstream &result,
321  const std::locale &loc)
322 {
323  if(ch == (wchar_t)'\'')
324  {
325  const auto uch = static_cast<unsigned char>(ch);
326  // ' needs to be escaped for java characters, e.g. '\''
327  result << '\\' << uch;
328  }
329  else
330  {
331  utf16_native_endian_to_java_string(ch, result, loc);
332  }
333 }
334 
337 std::string utf16_native_endian_to_java(const char16_t ch)
338 {
339  std::ostringstream result;
340  const std::locale loc;
341  utf16_native_endian_to_java(ch, result, loc);
342  return result.str();
343 }
344 
352 std::string utf16_native_endian_to_java_string(const std::wstring &in)
353 {
354  std::ostringstream result;
355  const std::locale loc;
356  for(const auto ch : in)
357  utf16_native_endian_to_java_string(ch, result, loc);
358  return result.str();
359 }
360 
361 std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
362 {
363  return utf16_native_endian_to_utf8(std::u16string(1, utf16_char));
364 }
365 
/// Convert a UTF-16 string to UTF-8 using std::wstring_convert with the
/// std::codecvt_utf8_utf16 facet.
/// \param utf16_str: UTF-16 string in architecture-native endianness
/// \return the UTF-8 encoding of \p utf16_str
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str)
{
#ifndef _MSC_VER
  using converter_t =
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>;
  converter_t converter;
  return converter.to_bytes(utf16_str);
#else
  // Workaround for Visual Studio bug, see
  // https://stackoverflow.com/questions/32055357
  using converter_t =
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t>;
  const std::wstring wide_string(utf16_str.begin(), utf16_str.end());
  converter_t converter;
  return converter.to_bytes(wide_string);
#endif
}
379 
380 char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
381 {
382  PRECONDITION(hex.length() == 4);
383  return std::strtol(hex.c_str(), nullptr, 16);
384 }
385 
386 std::string codepoint_hex_to_utf8(const std::string &hex)
387 {
389 }
codepoint_hex_to_utf8
std::string codepoint_hex_to_utf8(const std::string &hex)
Definition: unicode.cpp:386
utf8_to_utf32
std::u32string utf8_to_utf32(const std::string &utf8_str)
Convert UTF8-encoded string to UTF-32 with architecture-native endianness.
Definition: unicode.cpp:207
PRECONDITION
#define PRECONDITION(CONDITION)
Definition: invariant.h:464
utf16_native_endian_to_java_string
static void utf16_native_endian_to_java_string(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Escapes non-printable characters, whitespace except for spaces, double quotes and backslashes.
Definition: unicode.cpp:274
narrow
std::string narrow(const wchar_t *s)
Definition: unicode.cpp:34
utf16_native_endian_to_utf8
std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
Definition: unicode.cpp:361
widen
std::wstring widen(const char *s)
Definition: unicode.cpp:50
utf8_to_utf16_native_endian
std::wstring utf8_to_utf16_native_endian(const std::string &in)
Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
Definition: unicode.cpp:193
codepoint_hex_to_utf16_native_endian
char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
Definition: unicode.cpp:380
invariant.h
unicode.h
utf8_append_code
static void utf8_append_code(unsigned int c, std::string &)
Appends a unicode character to a utf8-encoded string.
Definition: unicode.cpp:111
r
static int8_t r
Definition: irep_hash.h:59
narrow_argv
std::vector< std::string > narrow_argv(int argc, const wchar_t **argv_wide)
Definition: unicode.cpp:150
utf16_native_endian_to_java
static void utf16_native_endian_to_java(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Escapes non-printable characters, whitespace except for spaces, double- and single-quotes and backslashes.
Definition: unicode.cpp:318
size_type
unsignedbv_typet size_type()
Definition: c_types.cpp:58
utf16_append_code
static void utf16_append_code(unsigned int code, std::wstring &result)
Definition: unicode.cpp:164
utf32_native_endian_to_utf8
std::string utf32_native_endian_to_utf8(const std::basic_string< unsigned int > &s)
Definition: unicode.cpp:138