Kea  2.3.8
strutil.cc
Go to the documentation of this file.
1 // Copyright (C) 2011-2022 Internet Systems Consortium, Inc. ("ISC")
2 //
3 // This Source Code Form is subject to the terms of the Mozilla Public
4 // License, v. 2.0. If a copy of the MPL was not distributed with this
5 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 
7 #include <config.h>
8 
9 #include <util/encode/hex.h>
10 #include <util/strutil.h>
11 
12 #include <boost/algorithm/string/classification.hpp>
13 #include <boost/algorithm/string/constants.hpp>
14 #include <boost/algorithm/string/split.hpp>
15 
16 #include <numeric>
17 #include <iostream>
18 #include <sstream>
19 
20 // Early versions of C++11 regex were buggy, use it if we
21 // can otherwise, we fall back to regcomp/regexec. For more info see:
22 // https://stackoverflow.com/questions/12530406/is-gcc-4-8-or-earlier-buggy-about-regular-expressions
23 #ifdef USE_REGEX
24 #include <regex>
25 #else
26 #include <sys/types.h>
27 #include <regex.h>
28 #endif
29 
30 #include <string.h>
31 
32 using namespace std;
33 
34 namespace isc {
35 namespace util {
36 namespace str {
37 
38 // Normalize slashes
39 
void
normalizeSlash(std::string& name) {
    // Walk the string once and turn every backslash into a forward slash.
    // An empty string simply yields no iterations.
    for (char& ch : name) {
        if (ch == '\\') {
            ch = '/';
        }
    }
}
49 
50 // Trim String
51 
std::string
trim(const std::string& instring) {
    // Characters stripped from both ends of the input.
    static const char* const whitespace = " \t\n";

    // Locate the first character that is kept.  This fails both for an
    // empty string and for a string made only of blanks.
    const size_t begin = instring.find_first_not_of(whitespace);
    if (begin == std::string::npos) {
        return (std::string());
    }

    // At least one non-blank character exists, so this search succeeds.
    const size_t end = instring.find_last_not_of(whitespace);

    return (instring.substr(begin, end - begin + 1));
}
72 
73 // Tokenize string. As noted in the header, this is locally written to avoid
74 // another dependency on a Boost library.
75 
std::vector<std::string>
tokens(const std::string& text, const std::string& delim, bool escape) {
    std::vector<std::string> pieces;    // Completed tokens
    std::string current;                // Token being assembled
    bool building = false;              // True once a token has been started
    bool pending_escape = false;        // True when the previous character
                                        // was an unconsumed escape

    for (const char ch : text) {
        if (delim.find(ch) != std::string::npos) {
            // Delimiter (takes precedence over the escape character).
            if (!building) {
                // Leading or repeated delimiter: skip it.
                continue;
            }
            if (pending_escape) {
                // Escaped delimiter: it becomes part of the token.
                pending_escape = false;
                current.push_back(ch);
                continue;
            }
            // Unescaped delimiter: the token is finished.
            if (!current.empty()) {
                pieces.push_back(current);
            }
            building = false;
            current.clear();
            continue;
        }

        // Any non-delimiter character starts or continues a token.
        building = true;

        if (escape && (ch == '\\')) {
            // Escape character: a doubled escape yields a single one,
            // a lone escape is remembered for the next character.
            if (pending_escape) {
                current.push_back(ch);
            }
            pending_escape = !pending_escape;
            continue;
        }

        if (pending_escape) {
            // The escape preceded an ordinary character, so it was not an
            // escape after all: put it back in front of the character.
            pending_escape = false;
            current.push_back('\\');
        }
        current.push_back(ch);
    }

    // End of input: a dangling escape is kept literally, then the last
    // token (if any) is stored.
    if (pending_escape) {
        current.push_back('\\');
    }
    if (!current.empty()) {
        pieces.push_back(current);
    }

    return (pieces);
}
142 
143 // Local function to pass to accumulate() for summing up string lengths.
144 
namespace {

// Binary operation for std::accumulate(): adds the length of cur_string to
// the running total curlen.
size_t
lengthSum(std::string::size_type curlen, const std::string& cur_string) {
    return (curlen + cur_string.size());
}

}

// Provide printf-style formatting.
//
// Each "%s" in the format string is replaced, left to right, by the next
// element of args.  Placeholders without a matching argument are left
// untouched and surplus arguments are ignored.  Text coming from an argument
// is copied verbatim and never re-scanned, so an argument that itself
// contains "%s" cannot consume a later argument.

std::string
format(const std::string& format, const std::vector<std::string>& args) {

    static const std::string flag = "%s";

    // Reserve an upper bound for the result: the format string plus every
    // argument.  Nothing is subtracted for the replaced placeholders because
    // the number of placeholders is not known yet, and subtracting could
    // wrap the unsigned size around and make reserve() throw.
    std::string result;
    result.reserve(std::accumulate(args.begin(), args.end(), format.size(),
                                   lengthSum));

    // Offset in the format string of the first character not yet copied.
    std::string::size_type copied = 0;

    for (std::vector<std::string>::size_type i = 0; i < args.size(); ++i) {
        const std::string::size_type tokenpos = format.find(flag, copied);
        if (tokenpos == std::string::npos) {
            // No placeholder left: remaining arguments are ignored.
            break;
        }

        // Literal text before the placeholder, then the argument in its place.
        result.append(format, copied, tokenpos - copied);
        result.append(args[i]);
        copied = tokenpos + flag.size();
    }

    // Whatever follows the last replaced placeholder is copied unchanged.
    result.append(format, copied, std::string::npos);

    return (result);
}
184 
185 std::string
186 getToken(std::istringstream& iss) {
187  string token;
188  iss >> token;
189  if (iss.bad() || iss.fail()) {
190  isc_throw(StringTokenError, "could not read token from string");
191  }
192  return (token);
193 }
194 
195 std::vector<uint8_t>
196 quotedStringToBinary(const std::string& quoted_string) {
197  std::vector<uint8_t> binary;
198  // Remove whitespace before and after the quotes.
199  std::string trimmed_string = trim(quoted_string);
200 
201  // We require two quote characters, so the length of the string must be
202  // equal to 2 at minimum, and it must start and end with quotes.
203  if ((trimmed_string.length() > 1) && ((trimmed_string[0] == '\'') &&
204  (trimmed_string[trimmed_string.length()-1] == '\''))) {
205  // Remove quotes and trim the text inside the quotes.
206  trimmed_string = trim(trimmed_string.substr(1, trimmed_string.length() - 2));
207  // Copy string contents into the vector.
208  binary.assign(trimmed_string.begin(), trimmed_string.end());
209  }
210  // Return resulting vector or empty vector.
211  return (binary);
212 }
213 
214 void
215 decodeColonSeparatedHexString(const std::string& hex_string,
216  std::vector<uint8_t>& binary) {
217  decodeSeparatedHexString(hex_string, ":", binary);
218 }
219 
220 void
221 decodeSeparatedHexString(const std::string& hex_string, const std::string& sep,
222  std::vector<uint8_t>& binary) {
223  std::vector<std::string> split_text;
224  boost::split(split_text, hex_string, boost::is_any_of(sep),
225  boost::algorithm::token_compress_off);
226 
227  std::vector<uint8_t> binary_vec;
228  for (size_t i = 0; i < split_text.size(); ++i) {
229 
230  // If there are multiple tokens and the current one is empty, it
231  // means that two consecutive colons were specified. This is not
232  // allowed.
233  if ((split_text.size() > 1) && split_text[i].empty()) {
234  isc_throw(isc::BadValue, "two consecutive separators ('" << sep << "') specified in"
235  " a decoded string '" << hex_string << "'");
236 
237  // Between a colon we expect at most two characters.
238  } else if (split_text[i].size() > 2) {
239  isc_throw(isc::BadValue, "invalid format of the decoded string"
240  << " '" << hex_string << "'");
241 
242  } else if (!split_text[i].empty()) {
243  std::stringstream s;
244  s << "0x";
245 
246  for (unsigned int j = 0; j < split_text[i].length(); ++j) {
247  // Check if we're dealing with hexadecimal digit.
248  if (!isxdigit(split_text[i][j])) {
249  isc_throw(isc::BadValue, "'" << split_text[i][j]
250  << "' is not a valid hexadecimal digit in"
251  << " decoded string '" << hex_string << "'");
252  }
253  s << split_text[i][j];
254  }
255 
256  // The stream should now have one or two hexadecimal digits.
257  // Let's convert it to a number and store in a temporary
258  // vector.
259  unsigned int binary_value;
260  s >> std::hex >> binary_value;
261 
262  binary_vec.push_back(static_cast<uint8_t>(binary_value));
263  }
264 
265  }
266 
267  // All ok, replace the data in the output vector with a result.
268  binary.swap(binary_vec);
269 }
270 
271 
272 void
273 decodeFormattedHexString(const std::string& hex_string,
274  std::vector<uint8_t>& binary) {
275  // If there is at least one colon we assume that the string
276  // comprises octets separated by colons (e.g. MAC address notation).
277  if (hex_string.find(':') != std::string::npos) {
278  decodeSeparatedHexString(hex_string, ":", binary);
279  } else if (hex_string.find(' ') != std::string::npos) {
280  decodeSeparatedHexString(hex_string, " ", binary);
281  } else {
282  std::ostringstream s;
283 
284  // If we have odd number of digits we'll have to prepend '0'.
285  if (hex_string.length() % 2 != 0) {
286  s << "0";
287  }
288 
289  // It is ok to use '0x' prefix in a string.
290  if ((hex_string.length() > 2) && (hex_string.substr(0, 2) == "0x")) {
291  // Exclude '0x' from the decoded string.
292  s << hex_string.substr(2);
293 
294  } else {
295  // No '0x', so decode the whole string.
296  s << hex_string;
297  }
298 
299  try {
300  // Decode the hex string.
301  encode::decodeHex(s.str(), binary);
302 
303  } catch (...) {
304  isc_throw(isc::BadValue, "'" << hex_string << "' is not a valid"
305  " string of hexadecimal digits");
306  }
307  }
308 }
309 
311 public:
313  StringSanitizerImpl(const std::string& char_set, const std::string& char_replacement)
314  : char_set_(char_set), char_replacement_(char_replacement) {
315  if (char_set.size() > StringSanitizer::MAX_DATA_SIZE) {
316  isc_throw(isc::BadValue, "char set size: '" << char_set.size()
317  << "' exceeds max size: '"
318  << StringSanitizer::MAX_DATA_SIZE << "'");
319  }
320 
321  if (char_replacement.size() > StringSanitizer::MAX_DATA_SIZE) {
322  isc_throw(isc::BadValue, "char replacement size: '"
323  << char_replacement.size() << "' exceeds max size: '"
324  << StringSanitizer::MAX_DATA_SIZE << "'");
325  }
326 #ifdef USE_REGEX
327  try {
328  scrub_exp_ = std::regex(char_set, std::regex::extended);
329  } catch (const std::exception& ex) {
330  isc_throw(isc::BadValue, "invalid regex: '"
331  << char_set_ << "', " << ex.what());
332  }
333 #else
334  int ec = regcomp(&scrub_exp_, char_set_.c_str(), REG_EXTENDED);
335  if (ec) {
336  char errbuf[512] = "";
337  static_cast<void>(regerror(ec, &scrub_exp_, errbuf, sizeof(errbuf)));
338  regfree(&scrub_exp_);
339  isc_throw(isc::BadValue, "invalid regex: '" << char_set_ << "', " << errbuf);
340  }
341 #endif
342  }
343 
346 #ifndef USE_REGEX
347  regfree(&scrub_exp_);
348 #endif
349  }
350 
351  std::string scrub(const std::string& original) {
352 #ifdef USE_REGEX
353  std::stringstream result;
354  try {
355  std::regex_replace(std::ostream_iterator<char>(result),
356  original.begin(), original.end(),
357  scrub_exp_, char_replacement_);
358  } catch (const std::exception& ex) {
359  isc_throw(isc::BadValue, "replacing '" << char_set_ << "' with '"
360  << char_replacement_ << "' in '" << original << "' failed: ,"
361  << ex.what());
362  }
363 
364  return (result.str());
365 #else
366  // In order to handle embedded nuls, we have to process in nul-terminated
367  // chunks. We iterate over the original data, doing pattern replacement
368  // on each chunk.
369  const char* orig_data = original.data();
370  const char* dead_end = orig_data + original.size();
371  const char* start_from = orig_data;
372  stringstream result;
373 
374  while (start_from < dead_end) {
375  // Iterate over original string, match by match.
376  regmatch_t matches[2]; // n matches + 1
377  const char* end_at = start_from + strlen(start_from);
378 
379  while (start_from < end_at) {
380  // Look for the next match
381  if (regexec(&scrub_exp_, start_from, 1, matches, 0) == REG_NOMATCH) {
382  // No matches, so add in the remainder
383  result << start_from;
384  start_from = end_at + 1;
385  break;
386  }
387 
388  // Shouldn't happen, but one never knows eh?
389  if (matches[0].rm_so == -1) {
390  isc_throw(isc::Unexpected, "matched but so is -1?");
391  }
392 
393  // Add everything from starting point up to the current match
394  const char* match_at = start_from + matches[0].rm_so;
395  while (start_from < match_at) {
396  result << *start_from;
397  ++start_from;
398  }
399 
400  // Add in the replacement
401  result << char_replacement_;
402 
403  // Move past the match.
404  ++start_from;
405  }
406 
407  // if we have an embedded nul, replace it and continue
408  if (start_from < dead_end) {
409  // Add in the replacement
410  result << char_replacement_;
411  start_from = end_at + 1;
412  }
413  }
414 
415  return (result.str());
416 #endif
417  }
418 
419 private:
421  std::string char_set_;
422 
424  std::string char_replacement_;
425 
426 #ifdef USE_REGEX
427  regex scrub_exp_;
428 #else
429  regex_t scrub_exp_;
430 #endif
431 };
432 
433 // @note The regex engine is implemented using recursion and can cause
434 // stack overflow if the input data is too large. An arbitrary size of
435 // 4096 should be enough for all cases.
436 const uint32_t StringSanitizer::MAX_DATA_SIZE = 4096;
437 
438 StringSanitizer::StringSanitizer(const std::string& char_set,
439  const std::string& char_replacement)
440  : impl_(new StringSanitizerImpl(char_set, char_replacement)) {
441 }
442 
444 }
445 
446 std::string
447 StringSanitizer::scrub(const std::string& original) {
448  return (impl_->scrub(original));
449 }
450 
// Dumps a buffer of bytes as lower case, two-digit hexadecimal numbers
// separated by colons, e.g. "00:0a:ff".
//
// data points to the first byte and length is the number of bytes to dump.
// A length of zero returns an empty string without touching data.
std::string dumpAsHex(const uint8_t* data, size_t length) {
    static const char hex_digits[] = "0123456789abcdef";

    std::string output;
    if (length == 0) {
        return (output);
    }

    // Two digits per byte plus one separator between bytes.  The request is
    // skipped for lengths whose product would not fit.
    if (length <= (output.max_size() / 3)) {
        output.reserve((length * 3) - 1);
    }

    // The index has the same type as length so that it cannot wrap around
    // before reaching it.
    for (size_t i = 0; i < length; ++i) {
        if (i != 0) {
            output.push_back(':');
        }

        output.push_back(hex_digits[data[i] >> 4]);
        output.push_back(hex_digits[data[i] & 0x0f]);
    }

    return (output);
}
464 
465 } // namespace str
466 } // namespace util
467 } // namespace isc
A generic exception that is thrown if a parameter given to a method is considered invalid in that context.
virtual const char * what() const
Returns a C-style character string of the cause of the exception.
A generic exception that is thrown when an unexpected error condition occurs.
StringSanitizerImpl(const std::string &char_set, const std::string &char_replacement)
Constructor.
Definition: strutil.cc:313
std::string scrub(const std::string &original)
Definition: strutil.cc:351
std::string scrub(const std::string &original)
Returns a scrubbed copy of a given string.
Definition: strutil.cc:447
A Set of C++ Utilities for Manipulating Strings.
Definition: strutil.h:31
#define isc_throw(type, stream)
A shortcut macro to insert known values into exception arguments.
void decodeHex(const string &input, vector< uint8_t > &result)
Decode a text encoded in the base16 ('hex') format into the original data.
Definition: base_n.cc:488
void normalizeSlash(std::string &name)
Normalize Backslash.
Definition: strutil.cc:41
std::string format(const std::string &format, const std::vector< std::string > &args)
Apply Formatting.
Definition: strutil.cc:157
std::string dumpAsHex(const uint8_t *data, size_t length)
Dumps a buffer of bytes as a string of hexadecimal digits.
Definition: strutil.cc:451
std::string getToken(std::istringstream &iss)
Returns one token from the given stringstream.
Definition: strutil.cc:186
void decodeSeparatedHexString(const std::string &hex_string, const std::string &sep, std::vector< uint8_t > &binary)
Converts a string of separated hexadecimal digits into a vector.
Definition: strutil.cc:221
void decodeFormattedHexString(const std::string &hex_string, std::vector< uint8_t > &binary)
Converts a formatted string of hexadecimal digits into a vector.
Definition: strutil.cc:273
std::vector< uint8_t > quotedStringToBinary(const std::string &quoted_string)
Converts a string in quotes into vector.
Definition: strutil.cc:196
void decodeColonSeparatedHexString(const std::string &hex_string, std::vector< uint8_t > &binary)
Converts a string of hexadecimal digits with colons into a vector.
Definition: strutil.cc:215
string trim(const string &instring)
Trim Leading and Trailing Spaces.
Definition: strutil.cc:53
vector< string > tokens(const std::string &text, const std::string &delim, bool escape)
Split String into Tokens.
Definition: strutil.cc:77
Defines the logger used by the top-level component of kea-lfc.