From 87c3476655c2e7419787076fc4cbccda86e09adc Mon Sep 17 00:00:00 2001 From: Dan Lecocq Date: Thu, 28 Jul 2016 16:05:41 -0700 Subject: [PATCH 1/2] Add a benchmark for punycode -> unpunycode round trip. --- bench.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bench.cpp b/bench.cpp index 211fcb9..42091d8 100644 --- a/bench.cpp +++ b/bench.cpp @@ -44,7 +44,8 @@ int main(int argc, char* argv[]) { size_t runs = 5; Url::Url base_url(base); - std::string full = Url::Url(relative).relative_to(base_url).str(); + Url::Url full_url = Url::Url(relative).relative_to(base_url); + std::string full = full_url.str(); bench("parse", count, runs, [full]() { Url::Url parsed(full); @@ -65,4 +66,8 @@ int main(int argc, char* argv[]) { bench("parse + punycode", count, runs, [full]() { Url::Url(full).punycode(); }); + + bench("punycode + unpunycode", count, runs, [full_url]() mutable { + full_url.punycode().unpunycode(); + }); } From 9f454eb09c6768a4296e7354f015d32ee2848ae9 Mon Sep 17 00:00:00 2001 From: Dan Lecocq Date: Thu, 28 Jul 2016 16:43:18 -0700 Subject: [PATCH 2/2] Avoid additional string allocations when punycoding. --- include/punycode.h | 20 +++++++++- src/punycode.cpp | 73 +++++++++++++++++++++++------------ src/url.cpp | 95 +++++++++++++++++++++++++++------------------- test/test-url.cpp | 12 ++++++ 4 files changed, 134 insertions(+), 66 deletions(-) diff --git a/include/punycode.h b/include/punycode.h index aecd97f..c082e27 100644 --- a/include/punycode.h +++ b/include/punycode.h @@ -54,8 +54,12 @@ namespace Url // The highest codepoint in unicode const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits::max(); - //Utf8::MAX_CODEPOINT; - //std::numeric_limits::max(); + + /** + * Punycode the utf-8-encoded begin->end and append it to str. + */ + std::string& encode(std::string& str, std::string::const_iterator begin, + std::string::const_iterator end); /** * Replace utf-8-encoded str into punycode. @@ -67,6 +71,12 @@ namespace Url */ std::string encode(const std::string& str); + /** + * Append the utf-8-version of the punycoded string between begin and end to str. + */ + std::string& decode(std::string& str, std::string::const_iterator begin, + std::string::const_iterator end); + /** * Replace punycoded str into utf-8-encoded. */ @@ -82,6 +92,12 @@ namespace Url */ bool needsPunycoding(const std::string& str); + /** + * Determine if the characters between these two iterators needs punycoding. + */ + bool needsPunycoding(const std::string::const_iterator& begin, + const std::string::const_iterator& end); + /** * Internal function for calculating bias. */ diff --git a/src/punycode.cpp b/src/punycode.cpp index 7ebf640..f7420a0 100644 --- a/src/punycode.cpp +++ b/src/punycode.cpp @@ -9,6 +9,15 @@ namespace Url { std::string& Punycode::encode(std::string& str) + { + std::string output; + encode(output, str.cbegin(), str.cend()); + return str = output; + } + + std::string& Punycode::encode(std::string& output, + std::string::const_iterator begin, + std::string::const_iterator end) { // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3 // @@ -18,25 +27,26 @@ namespace Url punycode_uint n = INITIAL_N; punycode_uint delta = 0; punycode_uint bias = INITIAL_BIAS; - std::string output; + + // let h = b = the number of basic code points in the input + size_t h = 0; + size_t b = 0; // Accumulate the non-basic codepoints std::vector codepoints; - for (auto it = str.cbegin(); it != str.cend(); ) + while (begin != end) { - Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend()); + Utf8::codepoint_t value = Utf8::readCodepoint(begin, end); if (value < 0x80) { // copy them to the output in order output.append(1, static_cast(value)); + ++h; + ++b; } codepoints.push_back(value); } - // let h = b = the number of basic code points in the input - size_t h = output.size(); - size_t b = h; - // copy a delimiter if b > 0 if (b > 0) { @@ -125,9 +135,8 @@ namespace Url ++delta; ++n; } - - str.assign(output); - return str; + + return output; } std::string Punycode::encode(const std::string& str) @@ -137,7 +146,8 @@ namespace Url return result; } - std::string& Punycode::decode(std::string& str) + std::string& Punycode::decode(std::string& str, std::string::const_iterator begin, + std::string::const_iterator end) { // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2 // @@ -150,15 +160,18 @@ namespace Url punycode_uint bias = INITIAL_BIAS; std::vector codepoints; - size_t index = str.rfind('-'); - if (index == std::string::npos) + std::string::const_iterator index = end; + for (; index != begin; --index) { - index = 0; + if (*index == '-') + { + break; + } } // consume all code points before the last delimiter (if there is one) // and copy them to output, fail on any non-basic code point - for (auto it = str.begin(); it != (str.begin() + index); ++it) + for (auto it = begin; it != index; ++it) { if (static_cast(*it) > 127U) { @@ -169,13 +182,13 @@ namespace Url // if more than zero code points were consumed then consume one more // (which will be the last delimiter) - if (index > 0) + if (index != begin) { - index += 1; + ++index; } // while the input is not exhausted do begin - for (auto it = (str.begin() + index); it != str.end(); ++it) + for (auto it = index; it != end; ++it) { // let oldi = i // let w = 1 @@ -186,7 +199,7 @@ namespace Url for (punycode_uint k = BASE; ; k += BASE, ++it) { // consume a code point, or fail if there was none to consume - if (it == str.end()) + if (it == end) { throw std::invalid_argument("Premature termination"); } @@ -275,16 +288,22 @@ namespace Url ++i; } - std::string output; for (auto it = codepoints.begin(); it != codepoints.end(); ++it) { - Utf8::writeCodepoint(output, *it); + Utf8::writeCodepoint(str, *it); } - str.assign(output); return str; } + std::string& Punycode::decode(std::string& str) + { + std::string output; + decode(output, str.cbegin(), str.cend()); + str.assign(output); + return str; + } + std::string Punycode::decode(const std::string& str) { std::string result(str); @@ -293,10 +312,16 @@ namespace Url } bool Punycode::needsPunycoding(const std::string& str) + { + return needsPunycoding(str.cbegin(), str.cend()); + } + + bool Punycode::needsPunycoding(const std::string::const_iterator& begin, + const std::string::const_iterator& end) { return std::any_of( - str.begin(), - str.end(), + begin, + end, [](char i){ return static_cast(i) & 0x80; }); } diff --git a/src/url.cpp b/src/url.cpp index 347e3e7..7944e1b 100644 --- a/src/url.cpp +++ b/src/url.cpp @@ -709,33 +709,36 @@ namespace Url std::string encoded; - size_t start = 0; - size_t end = host_.find('.'); - while(true) + auto last = host_.cbegin(); + for (auto it = host_.cbegin(); it != host_.cend(); ++it) { - std::string segment = host_.substr(start, end - start); - if (Punycode::needsPunycoding(segment)) + if (*it == '.') { - encoded.append("xn--"); - encoded.append(Punycode::encode(segment)); - } - else - { - encoded.append(segment); - } + if (Punycode::needsPunycoding(last, it)) + { + encoded.append("xn--"); + Punycode::encode(encoded, last, it); + } + else + { + encoded.append(last, it); + } - if (end == std::string::npos) - { - break; - } - else - { encoded.append(1, '.'); - start = end + 1; - end = host_.find('.', start); + last = it + 1; } } + if (Punycode::needsPunycoding(last, host_.cend())) + { + encoded.append("xn--"); + Punycode::encode(encoded, last, host_.cend()); + } + else + { + encoded.append(last, host_.cend()); + } + host_.assign(encoded); return *this; @@ -744,36 +747,48 @@ namespace Url Url& Url::unpunycode() { std::string unencoded; + std::string prefix; - size_t start = 0; - size_t end = host_.find('.'); - while(true) + auto last = host_.cbegin(); + for (auto it = host_.cbegin(); it != host_.cend(); ++it) { - std::string segment = host_.substr(start, end - start); - if (segment.substr(0, 4).compare("xn--") == 0) + if (*it == '.') { - segment = segment.substr(4); - unencoded.append(Punycode::decode(segment)); - } - else - { - unencoded.append(segment); - } + // Starts with 'xn--' + size_t distance = it - last; + if (distance > 4) + { + prefix.assign(last, last + 4); + if (prefix == "xn--") + { + Punycode::decode(unencoded, last + 4, it); + unencoded.append(1, '.'); + last = it + 1; + continue; + } + } - if (end == std::string::npos) - { - break; + unencoded.append(last, it); + unencoded.append(1, '.'); + last = it + 1; } - else + } + + // Last segment + size_t distance = host_.cend() - last; + if (distance > 4) + { + prefix.assign(last, last + 4); + if (prefix == "xn--") { - unencoded.append(1, '.'); - start = end + 1; - end = host_.find('.', start); + Punycode::decode(unencoded, last + 4, host_.cend()); + host_.assign(unencoded); + return *this; } } + unencoded.append(last, host_.cend()); host_.assign(unencoded); - return *this; } diff --git a/test/test-url.cpp b/test/test-url.cpp index 71a2b8a..49c453e 100644 --- a/test/test-url.cpp +++ b/test/test-url.cpp @@ -1108,6 +1108,18 @@ TEST(DefragTest, Defrag) Url::Url("http://foo.com/path#fragment").defrag().str()); } +TEST(PunycodeTest, UnpunycodeShortIdentifierAtEnd) +{ + std::string example("http://www.xn-/"); + EXPECT_EQ(example, Url::Url(example).unpunycode().str()); +} + +TEST(PunycodeTest, UnpunycodeShortIdentifierAtStart) +{ + std::string example("http://xn-.com/"); + EXPECT_EQ(example, Url::Url(example).unpunycode().str()); +} + TEST(PunycodeTest, German) { std::string unencoded("http://www.kündigen.de/");