-
Notifications
You must be signed in to change notification settings - Fork 208
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Resolves: #148.
- Loading branch information
Showing
8 changed files
with
323 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
local punycode = require "weserv.helpers.punycode" | ||
|
||
describe("punycode", function() | ||
-- Specs for `punycode.encode`: every case passes a UTF-8 string and compares the | ||
-- result with the expected Punycode output (the bare encoding, without "xn--"). | ||
describe("test encode", function() | ||
it("a single basic code point", function() | ||
assert.equal('Bach-', punycode.encode('Bach')) | ||
end) | ||
it("a single non-ASCII character", function() | ||
assert.equal('tda', punycode.encode('ü')) | ||
end) | ||
it("multiple non-ASCII characters", function() | ||
assert.equal('4can8av2009b', punycode.encode('üëäö♥')) | ||
end) | ||
it("mix of ASCII and non-ASCII characters", function() | ||
assert.equal('bcher-kva', punycode.encode('bücher')) | ||
end) | ||
it("long string with both ASCII and non-ASCII characters", function() | ||
assert.equal('Willst du die Blthe des frhen, die Frchte des spteren Jahres-x9e96lkal', | ||
punycode.encode('Willst du die Blüthe des frühen, die Früchte des späteren Jahres')) | ||
end) | ||
-- The language-named cases below follow the sample strings listed at: | ||
-- https://tools.ietf.org/html/rfc3492#section-7.1 | ||
it("Arabic (Egyptian)", function() | ||
assert.equal('egbpdaj6bu4bxfgehfvwxn', punycode.encode('ليهمابتكلموشعربي؟')) | ||
end) | ||
it("Chinese (simplified)", function() | ||
assert.equal('ihqwcrb4cv8a8dqg056pqjye', punycode.encode('他们为什么不说中文')) | ||
end) | ||
it("Chinese (traditional)", function() | ||
assert.equal('ihqwctvzc91f659drss3x8bo0yb', punycode.encode('他們爲什麽不說中文')) | ||
end) | ||
it("Czech", function() | ||
assert.equal('Proprostnemluvesky-uyb24dma41a', punycode.encode('Pročprostěnemluvíčesky')) | ||
end) | ||
it("Hebrew", function() | ||
assert.equal('4dbcagdahymbxekheh6e0a7fei0b', | ||
punycode.encode('למההםפשוטלאמדבריםעברית')) | ||
end) | ||
it("Hindi (Devanagari)", function() | ||
assert.equal('i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd', | ||
punycode.encode('यहलोगहिन्दीक्योंनहींबोलसकतेहैं')) -- luacheck: ignore | ||
end) | ||
it("Japanese (kanji and hiragana)", function() | ||
assert.equal('n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa', | ||
punycode.encode('なぜみんな日本語を話してくれないのか')) | ||
end) | ||
it("Korean (Hangul syllables)", function() | ||
assert.equal('989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c', | ||
punycode.encode('세계의모든사람들이한국어를이해한다면얼마나좋을까')) | ||
end) | ||
it("Russian (Cyrillic)", function() | ||
-- The encoder doesn't support mixed-case annotation (which is entirely optional as per the RFC). | ||
-- So, while the RFC sample string encodes to: | ||
-- `b1abfaaepdrnnbgefbaDotcwatmq2g4l` | ||
-- without mixed-case annotation it has to encode to: | ||
-- `b1abfaaepdrnnbgefbadotcwatmq2g4l` | ||
assert.equal('b1abfaaepdrnnbgefbadotcwatmq2g4l', | ||
punycode.encode('почемужеонинеговорятпорусски')) | ||
end) | ||
it("Spanish", function() | ||
assert.equal('PorqunopuedensimplementehablarenEspaol-fmd56a', | ||
punycode.encode('PorquénopuedensimplementehablarenEspañol')) | ||
end) | ||
it("Vietnamese", function() | ||
assert.equal('TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g', | ||
punycode.encode('TạisaohọkhôngthểchỉnóitiếngViệt')) | ||
end) | ||
-- Further mixed ASCII / non-ASCII samples, including inputs that already contain hyphens. | ||
it("other", function() | ||
assert.equal('3B-ww4c5e180e575a65lsy2b', punycode.encode('3年B組金八先生')) | ||
assert.equal('-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n', | ||
punycode.encode('安室奈美恵-with-SUPER-MONKEYS')) | ||
assert.equal('Hello-Another-Way--fc4qua05auwb3674vfr0b', | ||
punycode.encode('Hello-Another-Way-それぞれの場所')) | ||
assert.equal('2-u9tlzr9756bt3uc0v', punycode.encode('ひとつ屋根の下2')) | ||
assert.equal('MajiKoi5-783gue6qz075azm5e', punycode.encode('MajiでKoiする5秒前')) | ||
assert.equal('de-jg4avhby1noc0d', punycode.encode('パフィーdeルンバ')) | ||
assert.equal('d9juau41awczczp', punycode.encode('そのスピードで')) | ||
end) | ||
end) | ||
|
||
-- Specs for `punycode.domain_encode`: whole domains go in, ASCII-compatible
-- domains (with "xn--" prefixed labels where needed) come out.
describe("test domain encode", function()
    it("Emoji", function()
        local encoded = punycode.domain_encode("💩.la")
        assert.equal("xn--ls8h.la", encoded)
    end)

    it("invalid", function()
        -- Labels that start or end with a hyphen are rejected with nil + message.
        local result, message = punycode.domain_encode("--example--.org")
        assert.falsy(result)
        assert.equal("Invalid domain label", message)
    end)

    it("unchanged", function()
        local encoded = punycode.domain_encode("example.org")
        assert.equal("example.org", encoded)
    end)

    it("other", function()
        -- { input domain, expected encoded domain }
        local cases = {
            { "mañana.com", "xn--maana-pta.com" },
            { "bücher.com", "xn--bcher-kva.com" },
            { "café.com", "xn--caf-dma.com" },
            { "☃-⌘.com", "xn----dqo34k.com" },
            { "퐀☃-⌘.com", "xn----dqo34kn65z.com" },
            { "кто.рф", "xn--j1ail.xn--p1ai" },
        }

        for index = 1, #cases do
            local input, expected = cases[index][1], cases[index][2]
            assert.equal(expected, punycode.domain_encode(input))
        end
    end)
end)
end) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
local bit = require 'bit' | ||
local math = math | ||
local table = table | ||
local string = string | ||
local ipairs = ipairs | ||
local select = select | ||
|
||
-- Parameter values for Punycode: https://tools.ietf.org/html/rfc3492#section-5 | ||
-- `t_min` and `t_max` clamp the per-digit threshold computed in `punycode.encode`; | ||
-- `skew` and `damp` are read only by `punycode.adapt`. | ||
local base = 36 | ||
local t_min = 1 | ||
local t_max = 26 | ||
local skew = 38 | ||
local damp = 700 | ||
local initial_bias = 72 | ||
local initial_n = 128 -- 0x80 | ||
local delimiter = '-' -- 0x2D | ||
|
||
-- Highest positive signed 32-bit integer value; used as the initial upper | ||
-- bound when `punycode.encode` searches for the next code point to handle. | ||
local max_int = 2147483647; -- aka. 0x7FFFFFFF or 2^31-1 | ||
|
||
--- Punycode module | ||
-- @module punycode | ||
local punycode = {} | ||
-- Lets the module table act as a metatable whose lookups resolve to itself. | ||
punycode.__index = punycode | ||
|
||
--- Compute the code point of a single UTF-8 encoded character.
-- From sarn_utf8_code_func: http://lua-users.org/wiki/ValidateUnicodeString
-- @param code The first byte of the character.
-- @param ... The remaining bytes of the character (none for a one-byte character).
-- @return The numeric code point.
function punycode.utf8_code(code, ...)
    -- Value contributed by the marker bits of the first byte once it has been
    -- shifted left by 6 bits per trailing byte; indexed by trailing count + 1.
    local lead_offsets = { 0, 0x3000, 0xE0000, 0x3C00000 }

    local trailing = { ... }
    local trailing_count = select('#', ...)

    local value = code
    for index = 1, trailing_count do
        -- Each trailing byte contributes its low 6 bits.
        value = bit.lshift(value, 6) + bit.band(trailing[index], 63)
    end

    return value - lead_offsets[trailing_count + 1]
end
|
||
--- Split a UTF-8 string into a list of code points
-- (universal character set coded in 4 octets, [0,0x7FFFFFFF]).
-- @param str The UTF-8 string.
-- @return List-table with one numeric code point per matched character.
function punycode.to_ucs4(str)
    local code_points = {}
    local count = 0

    -- One lead byte followed by any continuation bytes:
    -- https://stackoverflow.com/a/22954220/1480019
    for char in str:gmatch('([%z\1-\127\194-\244][\128-\191]*)') do
        count = count + 1
        code_points[count] = punycode.utf8_code(string.byte(char, 1, -1))
    end

    return code_points
end
|
||
--- Convert a digit/integer into the byte value of a basic code point.
-- The digit needs to be in the range `0` to `base - 1`.
-- @param d The numeric value of a basic code point.
-- @return The byte value of the basic code point representing `d`.
function punycode.encode_digit(d)
    if d < 26 then
        -- 0..25 map to ASCII a..z (97..122)
        return d + 97
    end

    -- 26..35 map to ASCII 0..9 (48..57)
    return d + 22
end
|
||
--- Bias adaptation function as per section 3.4 of RFC 3492.
-- https://tools.ietf.org/html/rfc3492#section-3.4
-- @param delta The delta that was just encoded.
-- @param numPoints Divisor used to grow the scaled delta.
-- @param firstTime Truthy to scale `delta` by `damp` instead of by 2.
-- @return The new bias.
function punycode.adapt(delta, numPoints, firstTime)
    local floor = math.floor
    local digit_span = base - t_min
    local threshold = floor(digit_span * t_max / 2)

    -- Scale delta down: heavily on the first adaptation, by half afterwards.
    if firstTime then
        delta = floor(delta / damp)
    else
        delta = floor(delta / 2)
    end

    delta = delta + floor(delta / numPoints)

    -- Accumulate `base` once per division needed to bring delta under the threshold.
    local bias_offset = 0
    while delta > threshold do
        delta = floor(delta / digit_span)
        bias_offset = bias_offset + base
    end

    return bias_offset + floor((digit_span + 1) * delta / (delta + skew))
end
|
||
--- Encoding procedure as per section 6.3 of RFC 3492.
-- https://tools.ietf.org/html/rfc3492#section-6.3
-- @param input The UTF-8 string to encode (it is split into code points
-- with `punycode.to_ucs4`).
-- @return The new encoded string.
function punycode.encode(input)
    local points = punycode.to_ucs4(input)
    local total = #points
    local floor, char = math.floor, string.char

    -- Output pieces, joined once at the end.
    local pieces = {}
    local piece_count = 0
    local function emit(piece)
        piece_count = piece_count + 1
        pieces[piece_count] = piece
    end

    -- Copy the basic (ASCII) code points to the output, in order.
    for index = 1, total do
        local point = points[index]
        if point < 0x80 then
            emit(char(point))
        end
    end

    -- Number of basic code points, and number of code points handled so far.
    local basic_length = piece_count
    local handled = basic_length

    -- The basic string is terminated by a delimiter unless it is empty.
    if basic_length > 0 then
        emit(delimiter)
    end

    -- Encoder state.
    local n, delta, bias = initial_n, 0, initial_bias

    while handled < total do
        -- Every non-basic code point below n is done; pick the smallest
        -- remaining one.
        local next_point = max_int
        for index = 1, total do
            local point = points[index]
            if point >= n and point < next_point then
                next_point = point
            end
        end

        -- Increase `delta` enough to advance the decoder's <n,i> state to <m,0>
        delta = delta + (next_point - n) * (handled + 1)
        n = next_point

        for index = 1, total do
            local point = points[index]

            if point < n then
                delta = delta + 1
            elseif point == n then
                -- Write delta as a generalized variable-length integer.
                local q = delta
                local k = base

                while true do
                    local t
                    if k <= bias then
                        t = t_min
                    elseif k >= bias + t_max then
                        t = t_max
                    else
                        t = k - bias
                    end

                    if q < t then
                        break
                    end

                    local remainder = q - t
                    local span = base - t
                    emit(char(punycode.encode_digit(t + remainder % span)))

                    q = floor(remainder / span)
                    k = k + base
                end

                emit(char(punycode.encode_digit(q)))
                bias = punycode.adapt(delta, handled + 1, handled == basic_length)

                delta = 0
                handled = handled + 1
            end
        end

        delta = delta + 1
        n = n + 1
    end

    return table.concat(pieces, '')
end
|
||
--- Encode an IDN domain.
-- The domain is split on '.' and each non-empty label is handled on its own:
-- labels consisting only of [a-zA-Z0-9-] are kept as they are, every other
-- label is Punycode-encoded and prefixed with "xn--".
-- @param domain The domain name (UTF-8 string).
-- @return The encoded domain, or `nil` plus the message 'Invalid domain label'
-- when a label starts or ends with a hyphen.
function punycode.domain_encode(domain)
    local labels = {}

    for label in domain:gmatch('([^.]+)%.?') do
        -- Domain labels aren't allowed to start or end with a hyphen.
        if label:sub(1, 1) == '-' or label:sub(-1) == '-' then
            return nil, 'Invalid domain label'
        end

        if label:match('^[a-zA-Z0-9-]+$') then
            -- Already ASCII: keep the label in its original state.
            labels[#labels + 1] = label
        else
            -- Always encode. A label must never be left out of the result
            -- (e.g. because its second byte is a hyphen, as in "a-ü"), since
            -- omitting it would yield a different host name.
            labels[#labels + 1] = 'xn--' .. punycode.encode(label)
        end
    end

    return table.concat(labels, '.')
end
|
||
return punycode |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.