Skip to content

Commit

Permalink
Avoid additional string allocations when punycoding.
Browse files Browse the repository at this point in the history
  • Loading branch information
Dan Lecocq committed Jul 28, 2016
1 parent 0fac174 commit 8f10f02
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 66 deletions.
20 changes: 18 additions & 2 deletions include/punycode.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,12 @@ namespace Url

// The largest value representable as a punycode_uint (not the highest unicode codepoint)
const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits<punycode_uint>::max();
//Utf8::MAX_CODEPOINT;
//std::numeric_limits<punycode_uint>::max();

/**
* Punycode the utf-8-encoded text in the range [begin, end) and append the result to str.
*/
std::string& encode(std::string& str, std::string::const_iterator begin,
std::string::const_iterator end);

/**
* Replace the utf-8-encoded str with its punycoded form.
Expand All @@ -67,6 +71,12 @@ namespace Url
*/
std::string encode(const std::string& str);

/**
* Decode the punycoded text in the range [begin, end) and append the utf-8-encoded result to str.
*/
std::string& decode(std::string& str, std::string::const_iterator begin,
std::string::const_iterator end);

/**
* Replace the punycoded str with its utf-8-encoded form.
*/
Expand All @@ -82,6 +92,12 @@ namespace Url
*/
bool needsPunycoding(const std::string& str);

/**
* Determine if the characters between these two iterators need punycoding.
*/
bool needsPunycoding(const std::string::const_iterator& begin,
const std::string::const_iterator& end);

/**
* Internal function for calculating bias.
*/
Expand Down
74 changes: 50 additions & 24 deletions src/punycode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@ namespace Url
{

/**
 * Punycode str in place and return a reference to it.
 *
 * The encoded text is built in a scratch string by the iterator-based
 * overload, so str is only modified once that call has returned.
 */
std::string& Punycode::encode(std::string& str)
{
    std::string output;
    encode(output, str.cbegin(), str.cend());
    // Hand the finished buffer to str instead of copying it a second time.
    str.swap(output);
    return str;
}

std::string& Punycode::encode(std::string& output,
std::string::const_iterator begin,
std::string::const_iterator end)
{
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
//
Expand All @@ -18,25 +28,26 @@ namespace Url
punycode_uint n = INITIAL_N;
punycode_uint delta = 0;
punycode_uint bias = INITIAL_BIAS;
std::string output;

// let h = b = the number of basic code points in the input
size_t h = 0;
size_t b = 0;

// Accumulate the non-basic codepoints
std::vector<punycode_uint> codepoints;
for (auto it = str.cbegin(); it != str.cend(); )
while (begin != end)
{
Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend());
Utf8::codepoint_t value = Utf8::readCodepoint(begin, end);
if (value < 0x80)
{
// copy them to the output in order
output.append(1, static_cast<char>(value));
++h;
++b;
}
codepoints.push_back(value);
}

// let h = b = the number of basic code points in the input
size_t h = output.size();
size_t b = h;

// copy a delimiter if b > 0
if (b > 0)
{
Expand Down Expand Up @@ -125,9 +136,8 @@ namespace Url
++delta;
++n;
}

str.assign(output);
return str;

return output;
}

std::string Punycode::encode(const std::string& str)
Expand All @@ -137,7 +147,8 @@ namespace Url
return result;
}

std::string& Punycode::decode(std::string& str)
std::string& Punycode::decode(std::string& str, std::string::const_iterator begin,
std::string::const_iterator end)
{
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
//
Expand All @@ -150,15 +161,18 @@ namespace Url
punycode_uint bias = INITIAL_BIAS;
std::vector<punycode_uint> codepoints;

size_t index = str.rfind('-');
if (index == std::string::npos)
std::string::const_iterator index = end;
for (; index != begin; --index)
{
index = 0;
if (*index == '-')
{
break;
}
}

// consume all code points before the last delimiter (if there is one)
// and copy them to output, fail on any non-basic code point
for (auto it = str.begin(); it != (str.begin() + index); ++it)
for (auto it = begin; it != index; ++it)
{
if (static_cast<unsigned char>(*it) > 127U)
{
Expand All @@ -169,13 +183,13 @@ namespace Url

// if more than zero code points were consumed then consume one more
// (which will be the last delimiter)
if (index > 0)
if (index != begin)
{
index += 1;
++index;
}

// while the input is not exhausted do begin
for (auto it = (str.begin() + index); it != str.end(); ++it)
for (auto it = index; it != end; ++it)
{
// let oldi = i
// let w = 1
Expand All @@ -186,7 +200,7 @@ namespace Url
for (punycode_uint k = BASE; ; k += BASE, ++it)
{
// consume a code point, or fail if there was none to consume
if (it == str.end())
if (it == end)
{
throw std::invalid_argument("Premature termination");
}
Expand Down Expand Up @@ -275,16 +289,22 @@ namespace Url
++i;
}

std::string output;
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
{
Utf8::writeCodepoint(output, *it);
Utf8::writeCodepoint(str, *it);
}
str.assign(output);

return str;
}

/**
 * Decode the punycoded str in place and return a reference to it.
 *
 * The utf-8 result is built in a scratch string by the iterator-based
 * overload, so str is only modified once that call has returned (it is
 * left as-is if decoding throws, e.g. std::invalid_argument).
 */
std::string& Punycode::decode(std::string& str)
{
    std::string output;
    decode(output, str.cbegin(), str.cend());
    // Hand the finished buffer to str instead of copying it a second time.
    str.swap(output);
    return str;
}

std::string Punycode::decode(const std::string& str)
{
std::string result(str);
Expand All @@ -293,10 +313,16 @@ namespace Url
}

bool Punycode::needsPunycoding(const std::string& str)
{
return needsPunycoding(str.cbegin(), str.cend());
}

bool Punycode::needsPunycoding(const std::string::const_iterator& begin,
const std::string::const_iterator& end)
{
return std::any_of(
str.begin(),
str.end(),
begin,
end,
[](char i){ return static_cast<unsigned char>(i) & 0x80; });
}

Expand Down
95 changes: 55 additions & 40 deletions src/url.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -663,33 +663,36 @@ namespace Url

std::string encoded;

size_t start = 0;
size_t end = host_.find('.');
while(true)
auto last = host_.cbegin();
for (auto it = host_.cbegin(); it != host_.cend(); ++it)
{
std::string segment = host_.substr(start, end - start);
if (Punycode::needsPunycoding(segment))
if (*it == '.')
{
encoded.append("xn--");
encoded.append(Punycode::encode(segment));
}
else
{
encoded.append(segment);
}
if (Punycode::needsPunycoding(last, it))
{
encoded.append("xn--");
Punycode::encode(encoded, last, it);
}
else
{
encoded.append(last, it);
}

if (end == std::string::npos)
{
break;
}
else
{
encoded.append(1, '.');
start = end + 1;
end = host_.find('.', start);
last = it + 1;
}
}

if (Punycode::needsPunycoding(last, host_.cend()))
{
encoded.append("xn--");
Punycode::encode(encoded, last, host_.cend());
}
else
{
encoded.append(last, host_.cend());
}

host_.assign(encoded);

return *this;
Expand All @@ -698,36 +701,48 @@ namespace Url
Url& Url::unpunycode()
{
std::string unencoded;
std::string prefix;

size_t start = 0;
size_t end = host_.find('.');
while(true)
auto last = host_.cbegin();
for (auto it = host_.cbegin(); it != host_.cend(); ++it)
{
std::string segment = host_.substr(start, end - start);
if (segment.substr(0, 4).compare("xn--") == 0)
if (*it == '.')
{
segment = segment.substr(4);
unencoded.append(Punycode::decode(segment));
}
else
{
unencoded.append(segment);
}
// Starts with 'xn--'
size_t distance = it - last;
if (distance > 4)
{
prefix.assign(last, last + 4);
if (prefix == "xn--")
{
Punycode::decode(unencoded, last + 4, it);
unencoded.append(1, '.');
last = it + 1;
continue;
}
}

if (end == std::string::npos)
{
break;
unencoded.append(last, it);
unencoded.append(1, '.');
last = it + 1;
}
else
}

// Last segment
size_t distance = host_.cend() - last;
if (distance > 4)
{
prefix.assign(last, last + 4);
if (prefix == "xn--")
{
unencoded.append(1, '.');
start = end + 1;
end = host_.find('.', start);
Punycode::decode(unencoded, last + 4, host_.cend());
host_.assign(unencoded);
return *this;
}
}

unencoded.append(last, host_.cend());
host_.assign(unencoded);

return *this;
}

Expand Down
12 changes: 12 additions & 0 deletions test/test-url.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -886,6 +886,18 @@ TEST(DefragTest, Defrag)
Url::Url("http://foo.com/path#fragment").defrag().str());
}

TEST(PunycodeTest, UnpunycodeShortIdentifierAtEnd)
{
    // The final host label ("xn-") is shorter than the "xn--" prefix, so
    // unpunycode() is expected to hand the URL back unchanged.
    const std::string input("http://www.xn-/");
    const std::string actual = Url::Url(input).unpunycode().str();
    EXPECT_EQ(input, actual);
}

TEST(PunycodeTest, UnpunycodeShortIdentifierAtStart)
{
    // The first host label ("xn-") is shorter than the "xn--" prefix, so
    // unpunycode() is expected to hand the URL back unchanged.
    const std::string input("http://xn-.com/");
    const std::string actual = Url::Url(input).unpunycode().str();
    EXPECT_EQ(input, actual);
}

TEST(PunycodeTest, German)
{
std::string unencoded("http://www.kündigen.de/");
Expand Down

0 comments on commit 8f10f02

Please sign in to comment.