Kea 2.5.4
strutil.cc
Go to the documentation of this file.
1// Copyright (C) 2011-2022 Internet Systems Consortium, Inc. ("ISC")
2//
3// This Source Code Form is subject to the terms of the Mozilla Public
4// License, v. 2.0. If a copy of the MPL was not distributed with this
5// file, You can obtain one at http://mozilla.org/MPL/2.0/.
6
7#include <config.h>
8
9#include <util/encode/hex.h>
10#include <util/strutil.h>
11
12#include <boost/algorithm/string/classification.hpp>
13#include <boost/algorithm/string/constants.hpp>
14#include <boost/algorithm/string/split.hpp>
15
16#include <numeric>
17#include <iostream>
18#include <sstream>
19
// Early versions of C++11 <regex> were buggy, so we use std::regex only when
// it is known to work; otherwise we fall back to regcomp/regexec. For more info see:
22// https://stackoverflow.com/questions/12530406/is-gcc-4-8-or-earlier-buggy-about-regular-expressions
23#ifdef USE_REGEX
24#include <regex>
25#else
26#include <sys/types.h>
27#include <regex.h>
28#endif
29
30#include <string.h>
31
32using namespace std;
33
34namespace isc {
35namespace util {
36namespace str {
37
// Normalize slashes
//
// Rewrites the string in place so that every backslash ('\') becomes a
// forward slash ('/'). All other characters, and the length of the string,
// are left untouched. An empty string is a no-op.

void
normalizeSlash(std::string& name) {
    for (char& ch : name) {
        if (ch == '\\') {
            ch = '/';
        }
    }
}
49
// Trim String
//
// Returns a copy of the input with leading and trailing blanks removed.
// "Blank" here means exactly space, horizontal tab and newline; other
// whitespace (e.g. carriage return) is preserved. A string that is empty or
// consists solely of blanks yields an empty string.

std::string
trim(const std::string& instring) {
    static const char* const kBlanks = " \t\n";

    // Locate the first character that survives trimming.
    const std::string::size_type head = instring.find_first_not_of(kBlanks);
    if (head == std::string::npos) {
        // Empty input or nothing but blanks.
        return (std::string());
    }

    // At least one non-blank exists, so this search cannot fail.
    const std::string::size_type tail = instring.find_last_not_of(kBlanks);

    return (instring.substr(head, tail - head + 1));
}
72
// Tokenize string. As noted in the header, this is locally written to avoid
// another dependency on a Boost library.
//
// Splits "text" into tokens separated by any character found in "delim".
// Runs of delimiters are collapsed and empty tokens are never returned.
//
// When "escape" is true a backslash is the escape character:
//  - backslash + delimiter  keeps the delimiter inside the current token,
//  - backslash + backslash  keeps a single backslash,
//  - backslash + other char keeps both the backslash and the character,
//  - a trailing backslash at end of input is kept as is.
// When "escape" is false a backslash is an ordinary character.
// A character that appears in "delim" is always handled as a delimiter, even
// if it is a backslash.

std::vector<std::string>
tokens(const std::string& text, const std::string& delim, bool escape) {
    std::vector<std::string> pieces;
    std::string current;
    bool inside = false;          // a token has been started
    bool pending_escape = false;  // previous character was an unconsumed escape

    for (const char ch : text) {
        if (delim.find(ch) != std::string::npos) {
            // Delimiter.
            if (!inside) {
                // Leading or repeated delimiter: nothing to close.
                continue;
            }
            if (pending_escape) {
                // Escaped delimiter: it belongs to the token.
                pending_escape = false;
                current.push_back(ch);
                continue;
            }
            // Unescaped delimiter closes the token (kept only if non-empty).
            if (!current.empty()) {
                pieces.push_back(current);
            }
            inside = false;
            current.clear();
            continue;
        }

        // Any non-delimiter starts (or continues) a token.
        inside = true;

        if (escape && (ch == '\\')) {
            // Escape character: a second one in a row yields one backslash,
            // otherwise just remember it for the next character.
            if (pending_escape) {
                current.push_back(ch);
            }
            pending_escape = !pending_escape;
            continue;
        }

        if (pending_escape) {
            // The escape preceded an ordinary character, so it was not
            // really an escape: put the backslash back.
            pending_escape = false;
            current.push_back('\\');
        }
        current.push_back(ch);
    }

    // End of input: flush a dangling escape and the last token, if any.
    if (pending_escape) {
        current.push_back('\\');
    }
    if (!current.empty()) {
        pieces.push_back(current);
    }

    return (pieces);
}
142
// Local function to pass to accumulate() for summing up string lengths.

namespace {

// Adds the size of cur_string to the running total curlen.
size_t
lengthSum(std::string::size_type curlen, const std::string& cur_string) {
    return (curlen + cur_string.size());
}

}

// Provide printf-style formatting.
//
// Returns a copy of "format" in which successive "%s" placeholders are
// replaced, left to right, by successive elements of "args".
//  - Placeholders without a matching argument are left in the result.
//  - Surplus arguments are ignored.
//  - Text inserted from an argument is never rescanned, so an argument that
//    itself contains "%s" appears verbatim in the result.

std::string
format(const std::string& format, const std::vector<std::string>& args) {

    static const std::string flag = "%s";

    // Estimate the final size so the string is (usually) allocated once:
    // format length plus all argument lengths, less one placeholder per
    // argument. The subtraction is guarded: when the arguments are shorter
    // than the placeholders the unsigned difference would wrap around and
    // reserve() would throw std::length_error.
    const size_t gross = std::accumulate(args.begin(), args.end(),
                                         format.size(), lengthSum);
    const size_t placeholders = args.size() * flag.size();

    std::string result(format);
    if (gross > placeholders) {
        result.reserve(gross - placeholders);
    }

    // Replace placeholders one argument at a time.
    std::string::size_type tokenpos = 0;
    for (std::vector<std::string>::size_type i = 0; i < args.size(); ++i) {
        tokenpos = result.find(flag, tokenpos);
        if (tokenpos == std::string::npos) {
            // No placeholder left: remaining arguments are unused.
            break;
        }
        result.replace(tokenpos, flag.size(), args[i]);

        // Resume the search after the inserted text so that a "%s" inside
        // an argument is not mistaken for a placeholder.
        tokenpos += args[i].size();
    }

    return (result);
}
184
185std::string
186getToken(std::istringstream& iss) {
187 string token;
188 iss >> token;
189 if (iss.bad() || iss.fail()) {
190 isc_throw(StringTokenError, "could not read token from string");
191 }
192 return (token);
193}
194
195std::vector<uint8_t>
196quotedStringToBinary(const std::string& quoted_string) {
197 std::vector<uint8_t> binary;
198 // Remove whitespace before and after the quotes.
199 std::string trimmed_string = trim(quoted_string);
200
201 // We require two quote characters, so the length of the string must be
202 // equal to 2 at minimum, and it must start and end with quotes.
203 if ((trimmed_string.length() > 1) && ((trimmed_string[0] == '\'') &&
204 (trimmed_string[trimmed_string.length()-1] == '\''))) {
205 // Remove quotes and trim the text inside the quotes.
206 trimmed_string = trim(trimmed_string.substr(1, trimmed_string.length() - 2));
207 // Copy string contents into the vector.
208 binary.assign(trimmed_string.begin(), trimmed_string.end());
209 }
210 // Return resulting vector or empty vector.
211 return (binary);
212}
213
214void
215decodeColonSeparatedHexString(const std::string& hex_string,
216 std::vector<uint8_t>& binary) {
217 decodeSeparatedHexString(hex_string, ":", binary);
218}
219
220void
221decodeSeparatedHexString(const std::string& hex_string, const std::string& sep,
222 std::vector<uint8_t>& binary) {
223 std::vector<std::string> split_text;
224 boost::split(split_text, hex_string, boost::is_any_of(sep),
225 boost::algorithm::token_compress_off);
226
227 std::vector<uint8_t> binary_vec;
228 for (size_t i = 0; i < split_text.size(); ++i) {
229
230 // If there are multiple tokens and the current one is empty, it
231 // means that two consecutive colons were specified. This is not
232 // allowed.
233 if ((split_text.size() > 1) && split_text[i].empty()) {
234 isc_throw(isc::BadValue, "two consecutive separators ('" << sep << "') specified in"
235 " a decoded string '" << hex_string << "'");
236
237 // Between a colon we expect at most two characters.
238 } else if (split_text[i].size() > 2) {
239 isc_throw(isc::BadValue, "invalid format of the decoded string"
240 << " '" << hex_string << "'");
241
242 } else if (!split_text[i].empty()) {
243 std::stringstream s;
244 s << "0x";
245
246 for (unsigned int j = 0; j < split_text[i].length(); ++j) {
247 // Check if we're dealing with hexadecimal digit.
248 if (!isxdigit(split_text[i][j])) {
249 isc_throw(isc::BadValue, "'" << split_text[i][j]
250 << "' is not a valid hexadecimal digit in"
251 << " decoded string '" << hex_string << "'");
252 }
253 s << split_text[i][j];
254 }
255
256 // The stream should now have one or two hexadecimal digits.
257 // Let's convert it to a number and store in a temporary
258 // vector.
259 unsigned int binary_value;
260 s >> std::hex >> binary_value;
261
262 binary_vec.push_back(static_cast<uint8_t>(binary_value));
263 }
264
265 }
266
267 // All ok, replace the data in the output vector with a result.
268 binary.swap(binary_vec);
269}
270
271
272void
273decodeFormattedHexString(const std::string& hex_string,
274 std::vector<uint8_t>& binary) {
275 // If there is at least one colon we assume that the string
276 // comprises octets separated by colons (e.g. MAC address notation).
277 if (hex_string.find(':') != std::string::npos) {
278 decodeSeparatedHexString(hex_string, ":", binary);
279 } else if (hex_string.find(' ') != std::string::npos) {
280 decodeSeparatedHexString(hex_string, " ", binary);
281 } else {
282 std::ostringstream s;
283
284 // If we have odd number of digits we'll have to prepend '0'.
285 if (hex_string.length() % 2 != 0) {
286 s << "0";
287 }
288
289 // It is ok to use '0x' prefix in a string.
290 if ((hex_string.length() > 2) && (hex_string.substr(0, 2) == "0x")) {
291 // Exclude '0x' from the decoded string.
292 s << hex_string.substr(2);
293
294 } else {
295 // No '0x', so decode the whole string.
296 s << hex_string;
297 }
298
299 try {
300 // Decode the hex string.
301 encode::decodeHex(s.str(), binary);
302
303 } catch (...) {
304 isc_throw(isc::BadValue, "'" << hex_string << "' is not a valid"
305 " string of hexadecimal digits");
306 }
307 }
308}
309
311public:
313 StringSanitizerImpl(const std::string& char_set, const std::string& char_replacement)
314 : char_set_(char_set), char_replacement_(char_replacement) {
315 if (char_set.size() > StringSanitizer::MAX_DATA_SIZE) {
316 isc_throw(isc::BadValue, "char set size: '" << char_set.size()
317 << "' exceeds max size: '"
319 }
320
321 if (char_replacement.size() > StringSanitizer::MAX_DATA_SIZE) {
322 isc_throw(isc::BadValue, "char replacement size: '"
323 << char_replacement.size() << "' exceeds max size: '"
325 }
326#ifdef USE_REGEX
327 try {
328 scrub_exp_ = std::regex(char_set, std::regex::extended);
329 } catch (const std::exception& ex) {
330 isc_throw(isc::BadValue, "invalid regex: '"
331 << char_set_ << "', " << ex.what());
332 }
333#else
334 int ec = regcomp(&scrub_exp_, char_set_.c_str(), REG_EXTENDED);
335 if (ec) {
336 char errbuf[512] = "";
337 static_cast<void>(regerror(ec, &scrub_exp_, errbuf, sizeof(errbuf)));
338 regfree(&scrub_exp_);
339 isc_throw(isc::BadValue, "invalid regex: '" << char_set_ << "', " << errbuf);
340 }
341#endif
342 }
343
346#ifndef USE_REGEX
347 regfree(&scrub_exp_);
348#endif
349 }
350
351 std::string scrub(const std::string& original) {
352#ifdef USE_REGEX
353 std::stringstream result;
354 try {
355 std::regex_replace(std::ostream_iterator<char>(result),
356 original.begin(), original.end(),
357 scrub_exp_, char_replacement_);
358 } catch (const std::exception& ex) {
359 isc_throw(isc::BadValue, "replacing '" << char_set_ << "' with '"
360 << char_replacement_ << "' in '" << original << "' failed: ,"
361 << ex.what());
362 }
363
364 return (result.str());
365#else
366 // In order to handle embedded nuls, we have to process in nul-terminated
367 // chunks. We iterate over the original data, doing pattern replacement
368 // on each chunk.
369 const char* orig_data = original.data();
370 const char* dead_end = orig_data + original.size();
371 const char* start_from = orig_data;
372 stringstream result;
373
374 while (start_from < dead_end) {
375 // Iterate over original string, match by match.
376 regmatch_t matches[2]; // n matches + 1
377 const char* end_at = start_from + strlen(start_from);
378
379 while (start_from < end_at) {
380 // Look for the next match
381 if (regexec(&scrub_exp_, start_from, 1, matches, 0) == REG_NOMATCH) {
382 // No matches, so add in the remainder
383 result << start_from;
384 start_from = end_at + 1;
385 break;
386 }
387
388 // Shouldn't happen, but one never knows eh?
389 if (matches[0].rm_so == -1) {
390 isc_throw(isc::Unexpected, "matched but so is -1?");
391 }
392
393 // Add everything from starting point up to the current match
394 const char* match_at = start_from + matches[0].rm_so;
395 while (start_from < match_at) {
396 result << *start_from;
397 ++start_from;
398 }
399
400 // Add in the replacement
401 result << char_replacement_;
402
403 // Move past the match.
404 ++start_from;
405 }
406
407 // if we have an embedded nul, replace it and continue
408 if (start_from < dead_end) {
409 // Add in the replacement
410 result << char_replacement_;
411 start_from = end_at + 1;
412 }
413 }
414
415 return (result.str());
416#endif
417 }
418
419private:
421 std::string char_set_;
422
424 std::string char_replacement_;
425
426#ifdef USE_REGEX
427 regex scrub_exp_;
428#else
429 regex_t scrub_exp_;
430#endif
431};
432
433// @note The regex engine is implemented using recursion and can cause
434// stack overflow if the input data is too large. An arbitrary size of
435// 4096 should be enough for all cases.
436const uint32_t StringSanitizer::MAX_DATA_SIZE = 4096;
437
438StringSanitizer::StringSanitizer(const std::string& char_set,
439 const std::string& char_replacement)
440 : impl_(new StringSanitizerImpl(char_set, char_replacement)) {
441}
442
444}
445
446std::string
447StringSanitizer::scrub(const std::string& original) {
448 return (impl_->scrub(original));
449}
450
// Renders "length" bytes starting at "data" as lower-case, two-digit
// hexadecimal octets separated by colons, e.g. "01:0a:ff". A length of zero
// yields an empty string (and "data" is not dereferenced).
std::string dumpAsHex(const uint8_t* data, size_t length) {
    std::stringstream output;

    // Base and fill character persist on the stream, so set them once.
    output << std::hex << std::setfill('0');

    // The index has the same type as "length" so it cannot wrap around
    // before reaching the end of a very large buffer.
    for (size_t i = 0; i < length; ++i) {
        if (i > 0) {
            output << ":";
        }

        // The width is reset after every insertion and must be set per
        // byte. The cast makes the byte print as a number, not a character.
        output << std::setw(2) << static_cast<unsigned short>(data[i]);
    }

    return (output.str());
}
464
465} // namespace str
466} // namespace util
467} // namespace isc
A generic exception that is thrown if a parameter given to a method is considered invalid in that context.
A generic exception that is thrown when an unexpected error condition occurs.
StringSanitizerImpl(const std::string &char_set, const std::string &char_replacement)
Constructor.
Definition: strutil.cc:313
std::string scrub(const std::string &original)
Definition: strutil.cc:351
std::string scrub(const std::string &original)
Returns a scrubbed copy of a given string.
Definition: strutil.cc:447
static const uint32_t MAX_DATA_SIZE
The maximum size for regex parameters.
Definition: strutil.h:351
StringSanitizer(const std::string &char_set, const std::string &char_replacement)
Constructor.
Definition: strutil.cc:438
A Set of C++ Utilities for Manipulating Strings.
Definition: strutil.h:31
#define isc_throw(type, stream)
A shortcut macro to insert known values into exception arguments.
void decodeHex(const string &input, vector< uint8_t > &result)
Decode a text encoded in the base16 ('hex') format into the original data.
Definition: base_n.cc:488
void normalizeSlash(std::string &name)
Normalize Backslash.
Definition: strutil.cc:41
std::string format(const std::string &format, const std::vector< std::string > &args)
Apply Formatting.
Definition: strutil.cc:157
std::string dumpAsHex(const uint8_t *data, size_t length)
Dumps a buffer of bytes as a string of hexadecimal digits.
Definition: strutil.cc:451
std::string getToken(std::istringstream &iss)
Returns one token from the given stringstream.
Definition: strutil.cc:186
void decodeSeparatedHexString(const std::string &hex_string, const std::string &sep, std::vector< uint8_t > &binary)
Converts a string of separated hexadecimal digits into a vector.
Definition: strutil.cc:221
void decodeFormattedHexString(const std::string &hex_string, std::vector< uint8_t > &binary)
Converts a formatted string of hexadecimal digits into a vector.
Definition: strutil.cc:273
std::vector< uint8_t > quotedStringToBinary(const std::string &quoted_string)
Converts a string in quotes into vector.
Definition: strutil.cc:196
void decodeColonSeparatedHexString(const std::string &hex_string, std::vector< uint8_t > &binary)
Converts a string of hexadecimal digits with colons into a vector.
Definition: strutil.cc:215
string trim(const string &instring)
Trim Leading and Trailing Spaces.
Definition: strutil.cc:53
vector< string > tokens(const std::string &text, const std::string &delim, bool escape)
Split String into Tokens.
Definition: strutil.cc:77
Defines the logger used by the top-level component of kea-lfc.