cprover
convert_string_literal.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module: C/C++ Language Conversion
4 
5 Author: Daniel Kroening, kroening@kroening.com
6 
7 \*******************************************************************/
8 
11 
12 #include "convert_string_literal.h"
13 
14 #include <cassert>
15 
16 #include <util/arith_tools.h>
17 #include <util/c_types.h>
18 #include <util/unicode.h>
19 #include <util/string_constant.h>
20 
21 #include "unescape_string.h"
22 
23 std::basic_string<unsigned int> convert_one_string_literal(
24  const std::string &src)
25 {
26  assert(src.size()>=2);
27 
28  if(src[0]=='u' && src[1]=='8')
29  {
30  assert(src[src.size()-1]=='"');
31  assert(src[2]=='"');
32 
33  std::basic_string<unsigned int> value=
34  unescape_wide_string(std::string(src, 3, src.size()-4));
35 
36  // turn into utf-8
37  const std::string utf8_value = utf32_native_endian_to_utf8(value);
38 
39  // pad into wide string
40  value.resize(utf8_value.size());
41  for(std::size_t i=0; i<utf8_value.size(); i++)
42  value[i]=utf8_value[i];
43 
44  return value;
45  }
46  else if(src[0]=='L' || src[0]=='u' || src[0]=='U')
47  {
48  assert(src[src.size()-1]=='"');
49  assert(src[1]=='"');
50 
51  return unescape_wide_string(std::string(src, 2, src.size()-3));
52  }
53  else
54  {
55  assert(src[0]=='"');
56  assert(src[src.size()-1]=='"');
57 
58  std::string char_value=
59  unescape_string(std::string(src, 1, src.size()-2));
60 
61  // pad into wide string
62  std::basic_string<unsigned int> value;
63  value.resize(char_value.size());
64  for(std::size_t i=0; i<char_value.size(); i++)
65  value[i]=char_value[i];
66 
67  return value;
68  }
69 }
70 
71 exprt convert_string_literal(const std::string &src)
72 {
73  // note that 'src' could be a concatenation of string literals,
74  // e.g., something like "asd" "xyz".
75  // GCC allows "asd" L"xyz"!
76 
77  std::basic_string<unsigned int> value;
78 
79  char wide=0;
80 
81  for(std::size_t i=0; i<src.size(); i++)
82  {
83  char ch=src[i];
84 
85  // skip whitespace/newline
86  if(ch!='L' && ch!='u' && ch!='U' && ch!='"')
87  continue;
88 
89  if(ch=='L')
90  wide=ch;
91  if((ch=='u' || ch=='U') && i+1<src.size() && src[i+1]=='"')
92  wide=ch;
93 
94  // find start of sequence
95  std::size_t j=src.find('"', i);
96  if(j==std::string::npos)
97  throw "invalid string constant '" + src + "'";
98 
99  // find end of sequence, considering escaping
100  for(++j; j<src.size() && src[j]!='"'; ++j)
101  if(src[j]=='\\') // skip next character
102  ++j;
103 
104  assert(j<=src.size());
105  if(j==src.size())
106  throw "non-terminated string constant '" + src + "'";
107 
108  std::string tmp_src=std::string(src, i, j-i+1);
109  std::basic_string<unsigned int> tmp_value=
111  value.append(tmp_value);
112  i=j;
113  }
114 
115  if(wide!=0)
116  {
117  // add implicit trailing zero
118  value.push_back(0);
119 
120  // L is wchar_t, u is char16_t, U is char32_t.
121  typet subtype;
122 
123  switch(wide)
124  {
125  case 'L': subtype=wchar_t_type(); break;
126  case 'u': subtype=char16_t_type(); break;
127  case 'U': subtype=char32_t_type(); break;
128  default: assert(false);
129  }
130 
131  exprt result=exprt(ID_array);
132  result.set(ID_C_string_constant, true);
133  result.type()=typet(ID_array);
134  result.type().subtype()=subtype;
135  result.type().set(ID_size, from_integer(value.size(), index_type()));
136 
137  result.operands().resize(value.size());
138  for(std::size_t i=0; i<value.size(); i++)
139  result.operands()[i]=from_integer(value[i], subtype);
140 
141  return result;
142  }
143  else
144  {
145  std::string char_value;
146 
147  char_value.resize(value.size());
148 
149  for(std::size_t i=0; i<value.size(); i++)
150  {
151  // Loss of data here if value[i]>255.
152  // gcc issues a warning in this case.
153  char_value[i]=value[i];
154  }
155 
156  return string_constantt(char_value);
157  }
158 }
typet::subtype
const typet & subtype() const
Definition: type.h:47
arith_tools.h
typet
The type of an expression, extends irept.
Definition: type.h:29
char32_t_type
unsignedbv_typet char32_t_type()
Definition: c_types.cpp:175
convert_string_literal.h
C/C++ Language Conversion.
irept::find
const irept & find(const irep_namet &name) const
Definition: irep.cpp:103
string_constant.h
exprt
Base class for all expressions.
Definition: expr.h:53
char16_t_type
unsignedbv_typet char16_t_type()
Definition: c_types.cpp:165
string_constantt
Definition: string_constant.h:16
index_type
bitvector_typet index_type()
Definition: c_types.cpp:16
exprt::type
typet & type()
Return the type of the expression.
Definition: expr.h:81
convert_one_string_literal
std::basic_string< unsigned int > convert_one_string_literal(const std::string &src)
Definition: convert_string_literal.cpp:23
wchar_t_type
bitvector_typet wchar_t_type()
Definition: c_types.cpp:149
unescape_wide_string
std::basic_string< unsigned int > unescape_wide_string(const std::string &src)
Definition: unescape_string.cpp:156
unescape_string.h
ANSI-C Language Conversion.
irept::set
void set(const irep_namet &name, const irep_idt &value)
Definition: irep.h:442
from_integer
constant_exprt from_integer(const mp_integer &int_value, const typet &type)
Definition: arith_tools.cpp:99
unicode.h
exprt::operands
operandst & operands()
Definition: expr.h:95
unescape_string
std::string unescape_string(const std::string &src)
Definition: unescape_string.cpp:151
c_types.h
convert_string_literal
exprt convert_string_literal(const std::string &src)
Definition: convert_string_literal.cpp:71
utf32_native_endian_to_utf8
std::string utf32_native_endian_to_utf8(const std::basic_string< unsigned int > &s)
Definition: unicode.cpp:138