Skip to content

Commit

Permalink
Compliant URI parser with RFC 3492
Browse files Browse the repository at this point in the history
Resolves: #148.
  • Loading branch information
kleisauke committed Jul 31, 2018
1 parent 91413b3 commit 471c45d
Show file tree
Hide file tree
Showing 8 changed files with 323 additions and 28 deletions.
101 changes: 101 additions & 0 deletions spec/helpers/punycode_spec.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
local punycode = require "weserv.helpers.punycode"

describe("punycode", function()
describe("test encode", function()
    -- Single-assertion cases, listed as { test name, expected Punycode, UTF-8 input }.
    -- One `it` is registered per entry, in the order given here.
    local samples = {
        { "a single basic code point", 'Bach-', 'Bach' },
        { "a single non-ASCII character", 'tda', 'ü' },
        { "multiple non-ASCII characters", '4can8av2009b', 'üëäö♥' },
        { "mix of ASCII and non-ASCII characters", 'bcher-kva', 'bücher' },
        { "long string with both ASCII and non-ASCII characters",
            'Willst du die Blthe des frhen, die Frchte des spteren Jahres-x9e96lkal',
            'Willst du die Blüthe des frühen, die Früchte des späteren Jahres' },
        -- https://tools.ietf.org/html/rfc3492#section-7.1
        { "Arabic (Egyptian)", 'egbpdaj6bu4bxfgehfvwxn', 'ليهمابتكلموشعربي؟' },
        { "Chinese (simplified)", 'ihqwcrb4cv8a8dqg056pqjye', '他们为什么不说中文' },
        { "Chinese (traditional)", 'ihqwctvzc91f659drss3x8bo0yb', '他們爲什麽不說中文' },
        { "Czech", 'Proprostnemluvesky-uyb24dma41a', 'Pročprostěnemluvíčesky' },
        { "Hebrew",
            '4dbcagdahymbxekheh6e0a7fei0b',
            'למההםפשוטלאמדבריםעברית' },
        { "Hindi (Devanagari)",
            'i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd',
            'यहलोगहिन्दीक्योंनहींबोलसकतेहैं' }, -- luacheck: ignore
        { "Japanese (kanji and hiragana)",
            'n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa',
            'なぜみんな日本語を話してくれないのか' },
        { "Korean (Hangul syllables)",
            '989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c',
            '세계의모든사람들이한국어를이해한다면얼마나좋을까' },
        { "Russian (Cyrillic)",
            -- The encoder does not support mixed-case annotation (which is
            -- entirely optional as per the RFC). So, while the RFC sample
            -- string encodes to:
            -- `b1abfaaepdrnnbgefbaDotcwatmq2g4l`
            -- without mixed-case annotation it has to encode to:
            -- `b1abfaaepdrnnbgefbadotcwatmq2g4l`
            'b1abfaaepdrnnbgefbadotcwatmq2g4l',
            'почемужеонинеговорятпорусски' },
        { "Spanish",
            'PorqunopuedensimplementehablarenEspaol-fmd56a',
            'PorquénopuedensimplementehablarenEspañol' },
        { "Vietnamese",
            'TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g',
            'TạisaohọkhôngthểchỉnóitiếngViệt' },
    }

    for _, sample in ipairs(samples) do
        local name, expected, input = sample[1], sample[2], sample[3]
        it(name, function()
            assert.equal(expected, punycode.encode(input))
        end)
    end

    it("other", function()
        -- { expected Punycode, UTF-8 input }, asserted in order within one test.
        local others = {
            { '3B-ww4c5e180e575a65lsy2b', '3年B組金八先生' },
            { '-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n', '安室奈美恵-with-SUPER-MONKEYS' },
            { 'Hello-Another-Way--fc4qua05auwb3674vfr0b', 'Hello-Another-Way-それぞれの場所' },
            { '2-u9tlzr9756bt3uc0v', 'ひとつ屋根の下2' },
            { 'MajiKoi5-783gue6qz075azm5e', 'MajiでKoiする5秒前' },
            { 'de-jg4avhby1noc0d', 'パフィーdeルンバ' },
            { 'd9juau41awczczp', 'そのスピードで' },
        }
        for _, pair in ipairs(others) do
            assert.equal(pair[1], punycode.encode(pair[2]))
        end
    end)
end)

describe("test domain encode", function()
    it("Emoji", function()
        assert.equal('xn--ls8h.la', punycode.domain_encode('💩.la'))
    end)

    it("invalid", function()
        -- Labels may not start or end with a hyphen; expect `nil` plus a message.
        local result, message = punycode.domain_encode('--example--.org')
        assert.falsy(result)
        assert.equal('Invalid domain label', message)
    end)

    it("unchanged", function()
        -- Pure ASCII domains pass through untouched.
        assert.equal('example.org', punycode.domain_encode('example.org'))
    end)

    it("other", function()
        -- { expected ASCII form, Unicode domain }, asserted in order.
        local domains = {
            { "xn--maana-pta.com", "mañana.com" },
            { "xn--bcher-kva.com", "bücher.com" },
            { "xn--caf-dma.com", "café.com" },
            { "xn----dqo34k.com", "☃-⌘.com" },
            { "xn----dqo34kn65z.com", "퐀☃-⌘.com" },
            { "xn--j1ail.xn--p1ai", "кто.рф" },
        }
        for _, pair in ipairs(domains) do
            assert.equal(pair[1], punycode.domain_encode(pair[2]))
        end
    end)
end)
end)
6 changes: 4 additions & 2 deletions spec/helpers/utils_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,14 @@ describe("utils", function()
{ unpack(utils.parse_uri('http://ory.weserv.nl?a=1&b=2')) })
assert.are.same({ 'http', 'ory.weserv.nl', 443, '/', 'a=1&b=2' },
{ unpack(utils.parse_uri('http://ory.weserv.nl:443/?a=1&b=2')) })
assert.are.same({ 'http', 'ory.weserv.nl', 443, '/sub/path/', '' },
{ unpack(utils.parse_uri('http://ory.weserv.nl:443/sub/path/')) })
assert.are.same({ 'http', 'ory.weserv.nl', 443, '/sub//path/', '' },
{ unpack(utils.parse_uri('http://ory.weserv.nl:443/sub//path/')) })
assert.equal('https://ory.weserv.nl',
utils.clean_uri('//ory.weserv.nl'))
assert.are.same({ nil, "Unable to parse URL" },
{ utils.parse_uri('http:\\ory.weserv.nl') })
assert.are.same({ nil, "Invalid domain label" },
{ utils.parse_uri('--example--.org') })
assert.are.same({
'https', 'ory.weserv.nl', 443,
"/-._~!$'()*%20,;=:#@",
Expand Down
4 changes: 2 additions & 2 deletions src/weserv/api.lua
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ function api:process(tmpfile, args)

if args.loader == nil then
-- Log image invalid or unsupported errors
ngx.log(ngx.ERR, 'Image invalid or unsupported', vips.verror.get())
ngx.log(ngx.ERR, 'Image invalid or unsupported: ', vips.verror.get())

-- No known loader is found, stop further processing
return nil, {
Expand All @@ -147,7 +147,7 @@ function api:process(tmpfile, args)

if not success then
-- Log image not readable errors
ngx.log(ngx.ERR, 'Image not readable', image_err, args.url)
ngx.log(ngx.ERR, 'Image not readable: ', image_err)

return nil, {
status = ngx.HTTP_NOT_FOUND,
Expand Down
2 changes: 1 addition & 1 deletion src/weserv/client.lua
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ function client:request(uri, addl_headers, redirect_nr)

local ok, keepalive_err = httpc:set_keepalive()
if not ok then
ngx.log(ngx.ERR, 'Failed to set keepalive', keepalive_err)
ngx.log(ngx.ERR, 'Failed to set keepalive: ', keepalive_err)
os.remove(res.tmpfile)

return nil, {
Expand Down
191 changes: 191 additions & 0 deletions src/weserv/helpers/punycode.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
local bit = require 'bit'
local math = math
local table = table
local string = string
local ipairs = ipairs
local select = select

-- Parameter values for Punycode: https://tools.ietf.org/html/rfc3492#section-5
-- `base` is the number of digit values (encode_digit maps 0..35 to a-z, 0-9).
-- `t_min`/`t_max` bound the per-digit threshold computed in `encode`.
-- `skew` and `damp` are only used by the bias adaptation in `adapt`.
-- `initial_bias` and `initial_n` seed the encoder state; code points below
-- `initial_n` are copied to the output literally, followed by `delimiter`.
local base = 36
local t_min = 1
local t_max = 26
local skew = 38
local damp = 700
local initial_bias = 72
local initial_n = 128 -- 0x80
local delimiter = '-' -- 0x2D

-- Highest positive signed 32-bit integer value. `encode` uses it as the
-- starting upper bound when searching for the next code point to handle.
local max_int = 2147483647; -- aka. 0x7FFFFFFF or 2^31-1

--- Punycode module
-- Table holding every function exported by this file.
-- @module punycode
local punycode = {}
-- Self-referencing `__index`, as used for metatable-based lookup; nothing
-- visible in this file calls `setmetatable` with this table.
punycode.__index = punycode

--- Decode the bytes of one UTF-8 encoded character into its code point.
-- From sarn_utf8_code_func: http://lua-users.org/wiki/ValidateUnicodeString
-- @param code The leading byte of the sequence.
-- @param ... The continuation bytes that follow it (zero to three).
--   More than three continuation bytes have no matching marker value, so
--   the final subtraction raises an arithmetic-on-nil error.
-- @return The decoded code point.
function punycode.utf8_code(code, ...)
    local continuation_count = select('#', ...)

    -- Value contributed by the leading byte's length-marker bits
    -- (0xC0, 0xE0, 0xF0) once they have been shifted left by 6 bits per
    -- continuation byte; indexed by sequence length.
    local lead_marker = ({ 0, 0x3000, 0xE0000, 0x3C00000 })[continuation_count + 1]

    local value = code
    for idx = 1, continuation_count do
        local cont = select(idx, ...)
        -- Only the low 6 bits of a continuation byte carry payload.
        value = bit.lshift(value, 6) + bit.band(cont, 63)
    end

    return value - lead_marker
end

--- Split a UTF-8 string into a list of code points.
-- Each match of the pattern below (one leading byte plus any continuation
-- bytes that follow it) is decoded with `punycode.utf8_code`.
-- @param str The UTF-8 encoded string.
-- @return List-table of code points, in input order.
function punycode.to_ucs4(str)
    local code_points = {}
    local count = 0

    -- https://stackoverflow.com/a/22954220/1480019
    for char in str:gmatch('([%z\1-\127\194-\244][\128-\191]*)') do
        count = count + 1
        code_points[count] = punycode.utf8_code(char:byte(1, -1))
    end

    return code_points
end

--- Convert a digit value into the byte value of its basic code point.
-- @param d The digit value, expected in the range `0` to `base - 1`.
-- @return The byte value of the basic code point representing `d`:
--   0..25 map to ASCII `a`..`z` (97..122) and 26..35 map to ASCII
--   `0`..`9` (48..57).
function punycode.encode_digit(d)
    if d < 26 then
        -- Letters: offset from ASCII 'a'.
        return d + 97
    end

    -- Digits: 26 maps onto ASCII '0' (48).
    return d + 22
end

--- Bias adaptation function as per section 3.4 of RFC 3492.
-- https://tools.ietf.org/html/rfc3492#section-3.4
-- @param delta The delta that was just encoded.
-- @param numPoints The number of code points handled so far, including
--   the one just encoded.
-- @param firstTime Truthy for the first adaptation of a string, in which
--   case delta is divided by `damp` instead of by 2.
-- @return The new bias.
function punycode.adapt(delta, numPoints, firstTime)
    local scaled
    if firstTime then
        scaled = math.floor(delta / damp)
    else
        scaled = math.floor(delta / 2)
    end

    scaled = scaled + math.floor(scaled / numPoints)

    local span = base - t_min
    local threshold = math.floor(span * t_max / 2)

    -- Each reduction step accounts for one full `base` in the result.
    local offset = 0
    while scaled > threshold do
        scaled = math.floor(scaled / span)
        offset = offset + base
    end

    return offset + math.floor((span + 1) * scaled / (scaled + skew))
end

--- Encoding procedure as per section 6.3 of RFC 3492.
-- https://tools.ietf.org/html/rfc3492#section-6.3
-- @param input The UTF-8 encoded string to encode; it is converted to a
--   list of code points with `punycode.to_ucs4` first.
-- @return The Punycode encoded string (without any "xn--" prefix).
function punycode.encode(input)
    local code_points = punycode.to_ucs4(input)
    local total = #code_points

    -- Output characters are collected here and joined once at the end.
    local output = {}
    local out_len = 0
    local function emit(char)
        out_len = out_len + 1
        output[out_len] = char
    end

    -- Copy the basic (ASCII) code points straight to the output, in order.
    for idx = 1, total do
        local cp = code_points[idx]
        if cp < 0x80 then
            emit(string.char(cp))
        end
    end

    -- Number of basic code points, and number of code points handled so far.
    local basic_count = out_len
    local handled = basic_count

    -- A delimiter separates the basic part from the encoded part, unless
    -- there is no basic part at all.
    if basic_count > 0 then
        emit(delimiter)
    end

    -- Encoder state.
    local n, delta, bias = initial_n, 0, initial_bias

    while handled < total do
        -- All code points below `n` are already handled; pick the smallest
        -- remaining one.
        local next_cp = max_int
        for idx = 1, total do
            local cp = code_points[idx]
            if cp >= n and cp < next_cp then
                next_cp = cp
            end
        end

        -- Advance the decoder's <n,i> state to <next_cp,0>.
        delta = delta + (next_cp - n) * (handled + 1)
        n = next_cp

        for idx = 1, total do
            local cp = code_points[idx]
            if cp < n then
                delta = delta + 1
            elseif cp == n then
                -- Emit delta as a generalized variable-length integer.
                local q = delta
                local k = base

                while true do
                    local t
                    if k <= bias then
                        t = t_min
                    elseif k >= bias + t_max then
                        t = t_max
                    else
                        t = k - bias
                    end

                    if q < t then
                        break
                    end

                    local rest, radix = q - t, base - t
                    emit(string.char(punycode.encode_digit(t + rest % radix)))
                    q = math.floor(rest / radix)

                    k = k + base
                end

                emit(string.char(punycode.encode_digit(q)))
                bias = punycode.adapt(delta, handled + 1, handled == basic_count)

                delta = 0
                handled = handled + 1
            end
        end

        delta = delta + 1
        n = n + 1
    end

    return table.concat(output)
end

--- Encode an IDN domain.
-- The domain is split on dots and handled label by label; empty labels
-- (consecutive or trailing dots) are skipped by the split pattern.
-- Labels consisting only of [a-zA-Z0-9-] are kept as they are. Every other
-- label is Punycode-encoded and prefixed with "xn--", including labels
-- whose second character is a hyphen (e.g. "a-ü"), so no label is ever
-- silently dropped from the result.
-- @param domain The domain name as a UTF-8 encoded string.
-- @return The ASCII form of the domain, or `nil` plus the error message
--   'Invalid domain label' when a label starts or ends with a hyphen.
function punycode.domain_encode(domain)
    local labels = {}
    for label in domain:gmatch('([^.]+)%.?') do
        -- Domain labels aren't allowed to start or end with a hyphen.
        if label:sub(1, 1) == '-' or label:sub(-1) == '-' then
            return nil, 'Invalid domain label'
        end

        if label:match('^[a-zA-Z0-9-]+$') then
            -- Already an ASCII letter-digit-hyphen label.
            labels[#labels + 1] = label
        else
            -- A hyphen in second position is not restricted (only a leading
            -- or trailing hyphen is rejected above), so encode every
            -- remaining label rather than skipping some of them.
            labels[#labels + 1] = 'xn--' .. punycode.encode(label)
        end
    end

    return table.concat(labels, '.')
end

return punycode
21 changes: 11 additions & 10 deletions src/weserv/helpers/utils.lua
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
local punycode = require "weserv.helpers.punycode"
local ffi = require "ffi"
local ngx = ngx
local table = table
Expand Down Expand Up @@ -130,19 +131,12 @@ end
function utils.canonicalise_path(path)
local segments = {}
for segment in path:gmatch("/([^/]*)") do
if segment ~= "" and segment ~= "." then
segments[#segments + 1] = ngx.unescape_uri(segment):gsub(REGEX_DISALLOWED_CHARS, utils.percent_encode)
end
segments[#segments + 1] = ngx.unescape_uri(segment):gsub(REGEX_DISALLOWED_CHARS, utils.percent_encode)
end
local len = #segments
if len == 0 then
return "/"
end
-- If there was a slash on the end, keep it there.
if path:sub(-1, -1) == "/" then
len = len + 1
segments[len] = ""
end
segments[0] = ""
segments = table.concat(segments, "/", 0, len)
return segments
Expand All @@ -165,11 +159,18 @@ end
-- @param uri The URI.
-- @return Parsed URI.
function utils.parse_uri(uri)
local m = ngx.re.match(utils.clean_uri(uri), [[^(?:(http[s]?):)?//([^:/\?]+)(?::(\d+))?([^\?]*)\??(.*)]], 'jo')
local m = ngx.re.match(utils.clean_uri(uri), [[^(http[s]?)://([^:/\?]+)(?::(\d+))?([^\?]*)\??(.*)]], 'jo')

if not m or not m[1] then
if not m then
return nil, "Unable to parse URL"
else
local punycode_idn, err = punycode.domain_encode(m[2])
if not punycode_idn then
return nil, err
end

m[2] = punycode_idn

if m[3] then
m[3] = tonumber(m[3])
else
Expand Down
Loading

0 comments on commit 471c45d

Please sign in to comment.