Compliant URI parser with RFC 3492

Resolves: #148.
weserv · Jul 31, 2018 · 471c45d · 471c45d
1 parent 91413b3
commit 471c45d
Show file tree

Hide file tree

Showing 8 changed files with 323 additions and 28 deletions.
diff --git a/spec/helpers/punycode_spec.lua b/spec/helpers/punycode_spec.lua
@@ -0,0 +1,101 @@
+local punycode = require "weserv.helpers.punycode"
+
+describe("punycode", function()
+    describe("test encode", function()
+        it("a single basic code point", function()
+            assert.equal('Bach-', punycode.encode('Bach'))
+        end)
+        it("a single non-ASCII character", function()
+            assert.equal('tda', punycode.encode('ü'))
+        end)
+        it("multiple non-ASCII characters", function()
+            assert.equal('4can8av2009b', punycode.encode('üëäö♥'))
+        end)
+        it("mix of ASCII and non-ASCII characters", function()
+            assert.equal('bcher-kva', punycode.encode('bücher'))
+        end)
+        it("long string with both ASCII and non-ASCII characters", function()
+            assert.equal('Willst du die Blthe des frhen, die Frchte des spteren Jahres-x9e96lkal',
+                punycode.encode('Willst du die Blüthe des frühen, die Früchte des späteren Jahres'))
+        end)
+        -- https://tools.ietf.org/html/rfc3492#section-7.1
+        it("Arabic (Egyptian)", function()
+            assert.equal('egbpdaj6bu4bxfgehfvwxn', punycode.encode('ليهمابتكلموشعربي؟'))
+        end)
+        it("Chinese (simplified)", function()
+            assert.equal('ihqwcrb4cv8a8dqg056pqjye', punycode.encode('他们为什么不说中文'))
+        end)
+        it("Chinese (traditional)", function()
+            assert.equal('ihqwctvzc91f659drss3x8bo0yb', punycode.encode('他們爲什麽不說中文'))
+        end)
+        it("Czech", function()
+            assert.equal('Proprostnemluvesky-uyb24dma41a', punycode.encode('Pročprostěnemluvíčesky'))
+        end)
+        it("Hebrew", function()
+            assert.equal('4dbcagdahymbxekheh6e0a7fei0b',
+                punycode.encode('למההםפשוטלאמדבריםעברית'))
+        end)
+        it("Hindi (Devanagari)", function()
+            assert.equal('i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd',
+                punycode.encode('यहलोगहिन्दीक्योंनहींबोलसकतेहैं')) -- luacheck: ignore
+        end)
+        it("Japanese (kanji and hiragana)", function()
+            assert.equal('n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa',
+                punycode.encode('なぜみんな日本語を話してくれないのか'))
+        end)
+        it("Korean (Hangul syllables)", function()
+            assert.equal('989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c',
+                punycode.encode('세계의모든사람들이한국어를이해한다면얼마나좋을까'))
+        end)
+        it("Russian (Cyrillic)", function()
+            -- It doesn't support mixed-case annotation (which is entirely optional as per the RFC).
+            -- So, while the RFC sample string encodes to:
+            -- `b1abfaaepdrnnbgefbaDotcwatmq2g4l`
+            -- Without mixed-case annotation it has to encode to:
+            -- `b1abfaaepdrnnbgefbadotcwatmq2g4l`
+            assert.equal('b1abfaaepdrnnbgefbadotcwatmq2g4l',
+                punycode.encode('почемужеонинеговорятпорусски'))
+        end)
+        it("Spanish", function()
+            assert.equal('PorqunopuedensimplementehablarenEspaol-fmd56a',
+                punycode.encode('PorquénopuedensimplementehablarenEspañol'))
+        end)
+        it("Vietnamese", function()
+            assert.equal('TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g',
+                punycode.encode('TạisaohọkhôngthểchỉnóitiếngViệt'))
+        end)
+        it("other", function()
+            assert.equal('3B-ww4c5e180e575a65lsy2b', punycode.encode('3年B組金八先生'))
+            assert.equal('-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n',
+                punycode.encode('安室奈美恵-with-SUPER-MONKEYS'))
+            assert.equal('Hello-Another-Way--fc4qua05auwb3674vfr0b',
+                punycode.encode('Hello-Another-Way-それぞれの場所'))
+            assert.equal('2-u9tlzr9756bt3uc0v', punycode.encode('ひとつ屋根の下2'))
+            assert.equal('MajiKoi5-783gue6qz075azm5e', punycode.encode('MajiでKoiする5秒前'))
+            assert.equal('de-jg4avhby1noc0d', punycode.encode('パフィーdeルンバ'))
+            assert.equal('d9juau41awczczp', punycode.encode('そのスピードで'))
+        end)
+    end)
+
+    describe("test domain encode", function()
+        it("Emoji", function()
+            assert.equal('xn--ls8h.la', punycode.domain_encode('💩.la'))
+        end)
+        it("invalid", function()
+            local idn, err = punycode.domain_encode('--example--.org')
+            assert.falsy(idn)
+            assert.equal('Invalid domain label', err)
+        end)
+        it("unchanged", function()
+            assert.equal('example.org', punycode.domain_encode('example.org'))
+        end)
+        it("other", function()
+            assert.equal("xn--maana-pta.com", punycode.domain_encode("mañana.com"))
+            assert.equal("xn--bcher-kva.com", punycode.domain_encode("bücher.com"))
+            assert.equal("xn--caf-dma.com", punycode.domain_encode("café.com"))
+            assert.equal("xn----dqo34k.com", punycode.domain_encode("☃-⌘.com"))
+            assert.equal("xn----dqo34kn65z.com", punycode.domain_encode("퐀☃-⌘.com"))
+            assert.equal("xn--j1ail.xn--p1ai", punycode.domain_encode("кто.рф"))
+        end)
+    end)
+end)
diff --git a/spec/helpers/utils_spec.lua b/spec/helpers/utils_spec.lua
@@ -82,12 +82,14 @@ describe("utils", function()
             { unpack(utils.parse_uri('http://ory.weserv.nl?a=1&b=2')) })
         assert.are.same({ 'http', 'ory.weserv.nl', 443, '/', 'a=1&b=2' },
             { unpack(utils.parse_uri('http://ory.weserv.nl:443/?a=1&b=2')) })
-        assert.are.same({ 'http', 'ory.weserv.nl', 443, '/sub/path/', '' },
-            { unpack(utils.parse_uri('http://ory.weserv.nl:443/sub/path/')) })
+        assert.are.same({ 'http', 'ory.weserv.nl', 443, '/sub//path/', '' },
+            { unpack(utils.parse_uri('http://ory.weserv.nl:443/sub//path/')) })
         assert.equal('https://ory.weserv.nl',
             utils.clean_uri('//ory.weserv.nl'))
         assert.are.same({ nil, "Unable to parse URL" },
             { utils.parse_uri('http:\\ory.weserv.nl') })
+        assert.are.same({ nil, "Invalid domain label" },
+            { utils.parse_uri('--example--.org') })
         assert.are.same({
             'https', 'ory.weserv.nl', 443,
             "/-._~!$'()*%20,;=:#@",

diff --git a/src/weserv/api.lua b/src/weserv/api.lua
@@ -126,7 +126,7 @@ function api:process(tmpfile, args)
 
     if args.loader == nil then
         -- Log image invalid or unsupported errors
-        ngx.log(ngx.ERR, 'Image invalid or unsupported', vips.verror.get())
+        ngx.log(ngx.ERR, 'Image invalid or unsupported: ', vips.verror.get())
 
         -- No known loader is found, stop further processing
         return nil, {
@@ -147,7 +147,7 @@ function api:process(tmpfile, args)
 
     if not success then
         -- Log image not readable errors
-        ngx.log(ngx.ERR, 'Image not readable', image_err, args.url)
+        ngx.log(ngx.ERR, 'Image not readable: ', image_err)
 
         return nil, {
             status = ngx.HTTP_NOT_FOUND,

diff --git a/src/weserv/client.lua b/src/weserv/client.lua
@@ -237,7 +237,7 @@ function client:request(uri, addl_headers, redirect_nr)
 
     local ok, keepalive_err = httpc:set_keepalive()
     if not ok then
-        ngx.log(ngx.ERR, 'Failed to set keepalive', keepalive_err)
+        ngx.log(ngx.ERR, 'Failed to set keepalive: ', keepalive_err)
         os.remove(res.tmpfile)
 
         return nil, {

diff --git a/src/weserv/helpers/punycode.lua b/src/weserv/helpers/punycode.lua
@@ -0,0 +1,191 @@
+local bit = require 'bit'
+local math = math
+local table = table
+local string = string
+local ipairs = ipairs
+local select = select
+
+-- Parameter values for Punycode: https://tools.ietf.org/html/rfc3492#section-5
+local base = 36
+local t_min = 1
+local t_max = 26
+local skew = 38
+local damp = 700
+local initial_bias = 72
+local initial_n = 128 -- 0x80
+local delimiter = '-' -- 0x2D
+
+-- Highest positive signed 32-bit float value
+local max_int = 2147483647; -- aka. 0x7FFFFFFF or 2^31-1
+
+--- Punycode module
+-- @module punycode
+local punycode = {}
+punycode.__index = punycode
+
+--- Returns the code of a UTF-8 character.
+-- From sarn_utf8_code_func: http://lua-users.org/wiki/ValidateUnicodeString
+function punycode.utf8_code(code, ...)
+    local offset = { 0, 0x3000, 0xE0000, 0x3C00000 }
+
+    local num_bytes = select('#', ...)
+    for i = 1, num_bytes do
+        local b = select(i, ...)
+        code = bit.lshift(code, 6) + bit.band(b, 63)
+    end
+
+    return code - offset[num_bytes + 1]
+end
+
+--- Convert string to universal character set coded
+-- in 4 octets [0,0x7FFFFFFF]
+function punycode.to_ucs4(str)
+    local out = {}
+    -- https://stackoverflow.com/a/22954220/1480019
+    for c in str:gmatch('([%z\1-\127\194-\244][\128-\191]*)') do
+        out[#out + 1] = punycode.utf8_code(string.byte(c, 1, -1))
+    end
+
+    return out
+end
+
+--- This function converts a digit/integer into a basic code point.
+-- whose value (when used for representing integers) is `d`,
+-- which needs to be in the range `0` to `base - 1`.
+-- @param digit The numeric value of a basic code point.
+-- @return The basic code point.
+function punycode.encode_digit(d)
+    -- 0..25 map to ASCII a..z or A..Z
+    -- 26..35 map to ASCII 0..9
+    return d + 22 + 75 * (d < 26 and 1 or 0)
+end
+
+--- Bias adaptation function as per section 3.4 of RFC 3492.
+-- https://tools.ietf.org/html/rfc3492#section-3.4
+function punycode.adapt(delta, numPoints, firstTime)
+    delta = firstTime and math.floor(delta / damp) or math.floor(delta / 2)
+
+    delta = delta + math.floor(delta / numPoints)
+
+    local k = 0
+
+    while delta > math.floor((base - t_min) * t_max / 2) do
+        delta = math.floor(delta / (base - t_min))
+        k = k + 1
+    end
+
+    return base * k + math.floor((base - t_min + 1) * delta / (delta + skew));
+end
+
+--- Encoding procedure as per section 6.3 of RFC 3492.
+-- https://tools.ietf.org/html/rfc3492#section-6.3
+-- @param input list-table of Unicode code points.
+-- @return The new encoded string.
+function punycode.encode(input)
+    input = punycode.to_ucs4(input)
+
+    local codepoints = {}
+
+    -- Cache the length.
+    local input_length = #input
+
+    -- Initialize the state.
+    local n = initial_n
+    local delta = 0
+    local bias = initial_bias
+
+    -- Handle the basic code points.
+    for j = 1, input_length do
+        local c = input[j]
+        if c < 0x80 then
+            codepoints[#codepoints + 1] = string.char(c)
+        end
+    end
+
+    -- The number of basic code points.
+    local basic_length = #codepoints
+
+    -- The number of code points that have been handled
+    local h = basic_length
+
+    -- Finish the basic string with a delimiter unless it's empty.
+    if basic_length > 0 then codepoints[#codepoints + 1] = delimiter end
+
+    -- Main encoding loop
+    while h < input_length do
+        -- All non-basic code points < n have been handled already. Find
+        -- the next larger one:
+        local m = max_int
+        for _, v in ipairs(input) do
+            if v >= n and v < m then
+                m = v
+            end
+        end
+
+        -- Increase `delta` enough to advance the decoder's <n,i> state to <m,0>
+        delta = delta + (m - n) * (h + 1)
+        n = m
+
+        for _, curr_v in ipairs(input) do
+            if curr_v < n then
+                delta = delta + 1
+            end
+
+            if curr_v == n then
+                -- Represent delta as a generalized variable-length integer.
+                local q = delta
+                local k = base
+
+                while true do
+                    local t = (k <= bias) and t_min or
+                            (k >= bias + t_max) and t_max or (k - bias)
+                    if q < t then break end
+
+                    local q_minus_t = q - t
+                    local base_minus_t = base - t
+                    codepoints[#codepoints + 1] = string.char(punycode.encode_digit(t + q_minus_t % base_minus_t))
+
+                    q = math.floor(q_minus_t / base_minus_t)
+
+                    k = k + base
+                end
+
+                codepoints[#codepoints + 1] = string.char(punycode.encode_digit(q))
+                bias = punycode.adapt(delta, h + 1, h == basic_length)
+
+                delta = 0
+                h = h + 1
+            end
+        end
+
+        delta = delta + 1
+        n = n + 1
+    end
+
+    return table.concat(codepoints, '')
+end
+
+--- Encode a IDN domain.
+-- If the domain is already ASCII, it is returned in its original state.
+-- If any encoding was required, the "xn--" prefix is added.
+function punycode.domain_encode(domain)
+    local labels = {}
+    for label in domain:gmatch('([^.]+)%.?') do
+        -- Domain names can only consist of [a-zA-Z0-9-] and aren't allowed
+        -- to start or end with a hyphen
+        local first, last = label:sub(1, 1), label:sub(-1)
+        if first == '-' or last == '-' then
+            return nil, 'Invalid domain label'
+        end
+
+        if label:match('^[a-zA-Z0-9-]+$') then
+            labels[#labels + 1] = label
+        elseif label:sub(1, 1) ~= '-' and label:sub(2, 2) ~= '-' then
+            labels[#labels + 1] = 'xn--' .. punycode.encode(label)
+        end
+    end
+
+    return table.concat(labels, '.')
+end
+
+return punycode
diff --git a/src/weserv/helpers/utils.lua b/src/weserv/helpers/utils.lua
@@ -1,3 +1,4 @@
+local punycode = require "weserv.helpers.punycode"
 local ffi = require "ffi"
 local ngx = ngx
 local table = table
@@ -130,19 +131,12 @@ end
 function utils.canonicalise_path(path)
     local segments = {}
     for segment in path:gmatch("/([^/]*)") do
-        if segment ~= "" and segment ~= "." then
-            segments[#segments + 1] = ngx.unescape_uri(segment):gsub(REGEX_DISALLOWED_CHARS, utils.percent_encode)
-        end
+        segments[#segments + 1] = ngx.unescape_uri(segment):gsub(REGEX_DISALLOWED_CHARS, utils.percent_encode)
     end
     local len = #segments
     if len == 0 then
         return "/"
     end
-    -- If there was a slash on the end, keep it there.
-    if path:sub(-1, -1) == "/" then
-        len = len + 1
-        segments[len] = ""
-    end
     segments[0] = ""
     segments = table.concat(segments, "/", 0, len)
     return segments
@@ -165,11 +159,18 @@ end
 -- @param uri The URI.
 -- @return Parsed URI.
 function utils.parse_uri(uri)
-    local m = ngx.re.match(utils.clean_uri(uri), [[^(?:(http[s]?):)?//([^:/\?]+)(?::(\d+))?([^\?]*)\??(.*)]], 'jo')
+    local m = ngx.re.match(utils.clean_uri(uri), [[^(http[s]?)://([^:/\?]+)(?::(\d+))?([^\?]*)\??(.*)]], 'jo')
 
-    if not m or not m[1] then
+    if not m then
         return nil, "Unable to parse URL"
     else
+        local punycode_idn, err = punycode.domain_encode(m[2])
+        if not punycode_idn then
+            return nil, err
+        end
+
+        m[2] = punycode_idn
+
         if m[3] then
             m[3] = tonumber(m[3])
         else