From 87c3476655c2e7419787076fc4cbccda86e09adc Mon Sep 17 00:00:00 2001
From: Dan Lecocq <dan@moz.com>
Date: Thu, 28 Jul 2016 16:05:41 -0700
Subject: [PATCH 1/2] Add a benchmark for punycode -> unpunycode round trip.

---
 bench.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/bench.cpp b/bench.cpp
index 211fcb9..42091d8 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -44,7 +44,8 @@ int main(int argc, char* argv[]) {
     size_t runs = 5;
 
     Url::Url base_url(base);
-    std::string full = Url::Url(relative).relative_to(base_url).str();
+    Url::Url full_url = Url::Url(relative).relative_to(base_url);
+    std::string full = full_url.str();
 
     bench("parse", count, runs, [full]() {
         Url::Url parsed(full);
@@ -65,4 +66,8 @@ int main(int argc, char* argv[]) {
     bench("parse + punycode", count, runs, [full]() {
         Url::Url(full).punycode();
     });
+
+    bench("punycode + unpunycode", count, runs, [full_url]() mutable {
+        full_url.punycode().unpunycode();
+    });
 }

From 9f454eb09c6768a4296e7354f015d32ee2848ae9 Mon Sep 17 00:00:00 2001
From: Dan Lecocq <dan@moz.com>
Date: Thu, 28 Jul 2016 16:43:18 -0700
Subject: [PATCH 2/2] Avoid additional string allocations when punycoding.

---
 include/punycode.h | 20 +++++++++-
 src/punycode.cpp   | 73 +++++++++++++++++++++++------------
 src/url.cpp        | 95 +++++++++++++++++++++++++++-------------------
 test/test-url.cpp  | 12 ++++++
 4 files changed, 134 insertions(+), 66 deletions(-)

diff --git a/include/punycode.h b/include/punycode.h
index aecd97f..c082e27 100644
--- a/include/punycode.h
+++ b/include/punycode.h
@@ -54,8 +54,12 @@ namespace Url
 
         // The highest codepoint in unicode
         const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits<punycode_uint>::max();
-        //Utf8::MAX_CODEPOINT;
-        //std::numeric_limits<punycode_uint>::max();
+
+        /**
+         * Punycode the utf-8-encoded range [begin, end) and append the result to str.
+         */
+        std::string& encode(std::string& str, std::string::const_iterator begin,
+            std::string::const_iterator end);
 
         /**
          * Replace utf-8-encoded str into punycode.
@@ -67,6 +71,12 @@ namespace Url
          */
         std::string encode(const std::string& str);
 
+        /**
+         * Decode the punycoded range [begin, end) and append its utf-8 encoding to str.
+         */
+        std::string& decode(std::string& str, std::string::const_iterator begin,
+            std::string::const_iterator end);
+
         /**
          * Replace punycoded str into utf-8-encoded.
          */
@@ -82,6 +92,12 @@ namespace Url
          */
         bool needsPunycoding(const std::string& str);
 
+        /**
+         * Determine if the characters between these two iterators need punycoding.
+         */
+        bool needsPunycoding(const std::string::const_iterator& begin,
+                             const std::string::const_iterator& end);
+
         /**
          * Internal function for calculating bias.
          */
diff --git a/src/punycode.cpp b/src/punycode.cpp
index 7ebf640..f7420a0 100644
--- a/src/punycode.cpp
+++ b/src/punycode.cpp
@@ -9,6 +9,15 @@ namespace Url
 {
 
     std::string& Punycode::encode(std::string& str)
+    {
+        // Encode into a scratch buffer: the input range aliases str.
+        std::string encoded;
+        return str = encode(encoded, str.cbegin(), str.cend());
+    }
+
+    std::string& Punycode::encode(std::string& output,
+        std::string::const_iterator begin,
+        std::string::const_iterator end)
     {
         // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
         //
@@ -18,25 +27,26 @@ namespace Url
         punycode_uint n = INITIAL_N;
         punycode_uint delta = 0;
         punycode_uint bias = INITIAL_BIAS;
-        std::string output;
+
+        // let h = b = the number of basic code points in the input
+        size_t h = 0;
+        size_t b = 0;
 
         // Accumulate the non-basic codepoints
         std::vector<punycode_uint> codepoints;
-        for (auto it = str.cbegin(); it != str.cend(); )
+        while (begin != end)
         {
-            Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend());
+            Utf8::codepoint_t value = Utf8::readCodepoint(begin, end);
             if (value < 0x80)
             {
                 // copy them to the output in order
                 output.append(1, static_cast<char>(value));
+                ++h;
+                ++b;
             }
             codepoints.push_back(value);
         }
 
-        // let h = b = the number of basic code points in the input
-        size_t h = output.size();
-        size_t b = h;
-
         // copy a delimiter if b > 0
         if (b > 0)
         {
@@ -125,9 +135,8 @@ namespace Url
             ++delta;
             ++n;
         }
- 
-        str.assign(output);
-        return str;
+
+        return output;
     }
 
     std::string Punycode::encode(const std::string& str)
@@ -137,7 +146,8 @@ namespace Url
         return result;
     }
 
-    std::string& Punycode::decode(std::string& str)
+    std::string& Punycode::decode(std::string& str, std::string::const_iterator begin,
+            std::string::const_iterator end)
     {
         // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
         //
@@ -150,15 +160,18 @@ namespace Url
         punycode_uint bias = INITIAL_BIAS;
         std::vector<punycode_uint> codepoints;
 
-        size_t index = str.rfind('-');
-        if (index == std::string::npos)
+        std::string::const_iterator index = end; // scan backwards for the last '-'
+        while (index != begin)
         {
-            index = 0;
+            if (*(--index) == '-') // step back first: end is never dereferenced
+            {
+                break; // index rests on the last '-' (begin means "no delimiter")
+            }
         }
 
         // consume all code points before the last delimiter (if there is one)
         // and copy them to output, fail on any non-basic code point
-        for (auto it = str.begin(); it != (str.begin() + index); ++it)
+        for (auto it = begin; it != index; ++it)
         {
             if (static_cast<unsigned char>(*it) > 127U)
             {
@@ -169,13 +182,13 @@ namespace Url
 
         // if more than zero code points were consumed then consume one more
         //   (which will be the last delimiter)
-        if (index > 0)
+        if (index != begin)
         {
-            index += 1;
+            ++index;
         }
 
         // while the input is not exhausted do begin
-        for (auto it = (str.begin() + index); it != str.end(); ++it)
+        for (auto it = index; it != end; ++it)
         {
             // let oldi = i
             // let w = 1
@@ -186,7 +199,7 @@ namespace Url
             for (punycode_uint k = BASE; ; k += BASE, ++it)
             {
                 // consume a code point, or fail if there was none to consume
-                if (it == str.end())
+                if (it == end)
                 {
                     throw std::invalid_argument("Premature termination");
                 }
@@ -275,16 +288,22 @@ namespace Url
             ++i;
         }
 
-        std::string output;
         for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
         {
-            Utf8::writeCodepoint(output, *it);
+            Utf8::writeCodepoint(str, *it);
         }
-        str.assign(output);
 
         return str;
     }
 
+    std::string& Punycode::decode(std::string& str)
+    {
+        // Decode into a scratch buffer: the input range aliases str, and str
+        // is only overwritten once decoding has finished without throwing.
+        std::string decoded;
+        return str.assign(decode(decoded, str.cbegin(), str.cend()));
+    }
+
     std::string Punycode::decode(const std::string& str)
     {
         std::string result(str);
@@ -293,10 +312,16 @@ namespace Url
     }
 
     bool Punycode::needsPunycoding(const std::string& str)
+    {
+        return needsPunycoding(str.cbegin(), str.cend());
+    }
+
+    bool Punycode::needsPunycoding(const std::string::const_iterator& begin,
+        const std::string::const_iterator& end)
     {
         return std::any_of(
-            str.begin(),
-            str.end(),
+            begin,
+            end,
             [](char i){ return static_cast<unsigned char>(i) & 0x80; });
     }
 
diff --git a/src/url.cpp b/src/url.cpp
index 347e3e7..7944e1b 100644
--- a/src/url.cpp
+++ b/src/url.cpp
@@ -709,33 +709,36 @@ namespace Url
 
         std::string encoded;
 
-        size_t start = 0;
-        size_t end = host_.find('.');
-        while(true)
+        auto last = host_.cbegin();
+        for (auto it = host_.cbegin(); it != host_.cend(); ++it)
         {
-            std::string segment = host_.substr(start, end - start);
-            if (Punycode::needsPunycoding(segment))
+            if (*it == '.')
             {
-                encoded.append("xn--");
-                encoded.append(Punycode::encode(segment));
-            }
-            else
-            {
-                encoded.append(segment);
-            }
+                if (Punycode::needsPunycoding(last, it))
+                {
+                    encoded.append("xn--");
+                    Punycode::encode(encoded, last, it);
+                }
+                else
+                {
+                    encoded.append(last, it);
+                }
 
-            if (end == std::string::npos)
-            {
-                break;
-            }
-            else
-            {
                 encoded.append(1, '.');
-                start = end + 1;
-                end = host_.find('.', start);
+                last = it + 1;
             }
         }
 
+        if (Punycode::needsPunycoding(last, host_.cend()))
+        {
+            encoded.append("xn--");
+            Punycode::encode(encoded, last, host_.cend());
+        }
+        else
+        {
+            encoded.append(last, host_.cend());
+        }
+
         host_.assign(encoded);
 
         return *this;
@@ -744,36 +747,48 @@ namespace Url
     Url& Url::unpunycode()
     {
         std::string unencoded;
+        std::string prefix; // scratch copy of a segment's first four characters
 
-        size_t start = 0;
-        size_t end = host_.find('.');
-        while(true)
+        auto last = host_.cbegin(); // start of the segment currently being scanned
+        for (auto it = host_.cbegin(); it != host_.cend(); ++it)
         {
-            std::string segment = host_.substr(start, end - start);
-            if (segment.substr(0, 4).compare("xn--") == 0)
+            if (*it == '.')
             {
-                segment = segment.substr(4);
-                unencoded.append(Punycode::decode(segment));
-            }
-            else
-            {
-                unencoded.append(segment);
-            }
+                // A segment is decoded only if it is 'xn--' plus at least one more character
+                size_t distance = it - last;
+                if (distance > 4)
+                {
+                    prefix.assign(last, last + 4);
+                    if (prefix == "xn--")
+                    {
+                        Punycode::decode(unencoded, last + 4, it); // skip the 'xn--' prefix
+                        unencoded.append(1, '.');
+                        last = it + 1;
+                        continue;
+                    }
+                }
 
-            if (end == std::string::npos)
-            {
-                break;
+                unencoded.append(last, it); // plain segment: copy verbatim
+                unencoded.append(1, '.');
+                last = it + 1;
             }
-            else
+        }
+
+        // Last segment: runs to the end of the host, so no '.' is appended after it
+        size_t distance = host_.cend() - last;
+        if (distance > 4)
+        {
+            prefix.assign(last, last + 4);
+            if (prefix == "xn--")
             {
-                unencoded.append(1, '.');
-                start = end + 1;
-                end = host_.find('.', start);
+                Punycode::decode(unencoded, last + 4, host_.cend()); // skip the 'xn--' prefix
+                host_.assign(unencoded);
+                return *this;
             }
         }
 
+        unencoded.append(last, host_.cend()); // plain last segment: copy verbatim
         host_.assign(unencoded);
-
         return *this;
     }
 
diff --git a/test/test-url.cpp b/test/test-url.cpp
index 71a2b8a..49c453e 100644
--- a/test/test-url.cpp
+++ b/test/test-url.cpp
@@ -1108,6 +1108,18 @@ TEST(DefragTest, Defrag)
         Url::Url("http://foo.com/path#fragment").defrag().str());
 }
 
+TEST(PunycodeTest, UnpunycodeShortIdentifierAtEnd)
+{
+    std::string example("http://www.xn-/"); // final label "xn-" is shorter than "xn--"
+    EXPECT_EQ(example, Url::Url(example).unpunycode().str()); // must come back unchanged
+}
+
+TEST(PunycodeTest, UnpunycodeShortIdentifierAtStart)
+{
+    std::string example("http://xn-.com/"); // first label "xn-" is shorter than "xn--"
+    EXPECT_EQ(example, Url::Url(example).unpunycode().str()); // must come back unchanged
+}
+
 TEST(PunycodeTest, German)
 {
     std::string unencoded("http://www.kündigen.de/");