From 471c45de85acaef20127aecaf98ad214897d55b3 Mon Sep 17 00:00:00 2001 From: Kleis Auke Wolthuizen Date: Tue, 31 Jul 2018 18:44:50 +0200 Subject: [PATCH] Compliant URI parser with RFC 3492 Resolves: #148. --- spec/helpers/punycode_spec.lua | 101 +++++++++++++++++ spec/helpers/utils_spec.lua | 6 +- src/weserv/api.lua | 4 +- src/weserv/client.lua | 2 +- src/weserv/helpers/punycode.lua | 191 ++++++++++++++++++++++++++++++++ src/weserv/helpers/utils.lua | 21 ++-- src/weserv/policy.lua | 12 +- src/weserv/throttler.lua | 14 +-- 8 files changed, 323 insertions(+), 28 deletions(-) create mode 100644 spec/helpers/punycode_spec.lua create mode 100644 src/weserv/helpers/punycode.lua diff --git a/spec/helpers/punycode_spec.lua b/spec/helpers/punycode_spec.lua new file mode 100644 index 00000000..e8c383c1 --- /dev/null +++ b/spec/helpers/punycode_spec.lua @@ -0,0 +1,101 @@ +local punycode = require "weserv.helpers.punycode" + +describe("punycode", function() + describe("test encode", function() + it("a single basic code point", function() + assert.equal('Bach-', punycode.encode('Bach')) + end) + it("a single non-ASCII character", function() + assert.equal('tda', punycode.encode('ü')) + end) + it("multiple non-ASCII characters", function() + assert.equal('4can8av2009b', punycode.encode('üëäö♥')) + end) + it("mix of ASCII and non-ASCII characters", function() + assert.equal('bcher-kva', punycode.encode('bücher')) + end) + it("long string with both ASCII and non-ASCII characters", function() + assert.equal('Willst du die Blthe des frhen, die Frchte des spteren Jahres-x9e96lkal', + punycode.encode('Willst du die Blüthe des frühen, die Früchte des späteren Jahres')) + end) + -- https://tools.ietf.org/html/rfc3492#section-7.1 + it("Arabic (Egyptian)", function() + assert.equal('egbpdaj6bu4bxfgehfvwxn', punycode.encode('ليهمابتكلموشعربي؟')) + end) + it("Chinese (simplified)", function() + assert.equal('ihqwcrb4cv8a8dqg056pqjye', punycode.encode('他们为什么不说中文')) + end) + it("Chinese (traditional)", function() + assert.equal('ihqwctvzc91f659drss3x8bo0yb', punycode.encode('他們爲什麽不說中文')) + end) + it("Czech", function() + assert.equal('Proprostnemluvesky-uyb24dma41a', punycode.encode('Pročprostěnemluvíčesky')) + end) + it("Hebrew", function() + assert.equal('4dbcagdahymbxekheh6e0a7fei0b', + punycode.encode('למההםפשוטלאמדבריםעברית')) + end) + it("Hindi (Devanagari)", function() + assert.equal('i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd', + punycode.encode('यहलोगहिन्दीक्योंनहींबोलसकतेहैं')) -- luacheck: ignore + end) + it("Japanese (kanji and hiragana)", function() + assert.equal('n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa', + punycode.encode('なぜみんな日本語を話してくれないのか')) + end) + it("Korean (Hangul syllables)", function() + assert.equal('989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c', + punycode.encode('세계의모든사람들이한국어를이해한다면얼마나좋을까')) + end) + it("Russian (Cyrillic)", function() + -- It doesn't support mixed-case annotation (which is entirely optional as per the RFC). + -- So, while the RFC sample string encodes to: + -- `b1abfaaepdrnnbgefbaDotcwatmq2g4l` + -- Without mixed-case annotation it has to encode to: + -- `b1abfaaepdrnnbgefbadotcwatmq2g4l` + assert.equal('b1abfaaepdrnnbgefbadotcwatmq2g4l', + punycode.encode('почемужеонинеговорятпорусски')) + end) + it("Spanish", function() + assert.equal('PorqunopuedensimplementehablarenEspaol-fmd56a', + punycode.encode('PorquénopuedensimplementehablarenEspañol')) + end) + it("Vietnamese", function() + assert.equal('TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g', + punycode.encode('TạisaohọkhôngthểchỉnóitiếngViệt')) + end) + it("other", function() + assert.equal('3B-ww4c5e180e575a65lsy2b', punycode.encode('3年B組金八先生')) + assert.equal('-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n', + punycode.encode('安室奈美恵-with-SUPER-MONKEYS')) + assert.equal('Hello-Another-Way--fc4qua05auwb3674vfr0b', + punycode.encode('Hello-Another-Way-それぞれの場所')) + assert.equal('2-u9tlzr9756bt3uc0v', punycode.encode('ひとつ屋根の下2')) + assert.equal('MajiKoi5-783gue6qz075azm5e', punycode.encode('MajiでKoiする5秒前')) + assert.equal('de-jg4avhby1noc0d', punycode.encode('パフィーdeルンバ')) + assert.equal('d9juau41awczczp', punycode.encode('そのスピードで')) + end) + end) + + describe("test domain encode", function() + it("Emoji", function() + assert.equal('xn--ls8h.la', punycode.domain_encode('💩.la')) + end) + it("invalid", function() + local idn, err = punycode.domain_encode('--example--.org') + assert.falsy(idn) + assert.equal('Invalid domain label', err) + end) + it("unchanged", function() + assert.equal('example.org', punycode.domain_encode('example.org')) + end) + it("other", function() + assert.equal("xn--maana-pta.com", punycode.domain_encode("mañana.com")) + assert.equal("xn--bcher-kva.com", punycode.domain_encode("bücher.com")) + assert.equal("xn--caf-dma.com", punycode.domain_encode("café.com")) + assert.equal("xn----dqo34k.com", punycode.domain_encode("☃-⌘.com")) + assert.equal("xn----dqo34kn65z.com", punycode.domain_encode("퐀☃-⌘.com")) + assert.equal("xn--j1ail.xn--p1ai", punycode.domain_encode("кто.рф")) + end) + end) +end) \ No newline at end of file diff --git a/spec/helpers/utils_spec.lua b/spec/helpers/utils_spec.lua index c8f92489..a6198518 100644 --- a/spec/helpers/utils_spec.lua +++ b/spec/helpers/utils_spec.lua @@ -82,12 +82,14 @@ describe("utils", function() { unpack(utils.parse_uri('http://ory.weserv.nl?a=1&b=2')) }) assert.are.same({ 'http', 'ory.weserv.nl', 443, '/', 'a=1&b=2' }, { unpack(utils.parse_uri('http://ory.weserv.nl:443/?a=1&b=2')) }) - assert.are.same({ 'http', 'ory.weserv.nl', 443, '/sub/path/', '' }, - { unpack(utils.parse_uri('http://ory.weserv.nl:443/sub/path/')) }) + assert.are.same({ 'http', 'ory.weserv.nl', 443, '/sub//path/', '' }, + { unpack(utils.parse_uri('http://ory.weserv.nl:443/sub//path/')) }) assert.equal('https://ory.weserv.nl', utils.clean_uri('//ory.weserv.nl')) assert.are.same({ nil, "Unable to parse URL" }, { utils.parse_uri('http:\\ory.weserv.nl') }) + assert.are.same({ nil, "Invalid domain label" }, + { utils.parse_uri('--example--.org') }) assert.are.same({ 'https', 'ory.weserv.nl', 443, "/-._~!$'()*%20,;=:#@", diff --git a/src/weserv/api.lua b/src/weserv/api.lua index aacc5a3c..237cdd9a 100644 --- a/src/weserv/api.lua +++ b/src/weserv/api.lua @@ -126,7 +126,7 @@ function api:process(tmpfile, args) if args.loader == nil then -- Log image invalid or unsupported errors - ngx.log(ngx.ERR, 'Image invalid or unsupported', vips.verror.get()) + ngx.log(ngx.ERR, 'Image invalid or unsupported: ', vips.verror.get()) -- No known loader is found, stop further processing return nil, { @@ -147,7 +147,7 @@ function api:process(tmpfile, args) if not success then -- Log image not readable errors - ngx.log(ngx.ERR, 'Image not readable', image_err, args.url) + ngx.log(ngx.ERR, 'Image not readable: ', image_err) return nil, { status = ngx.HTTP_NOT_FOUND, diff --git a/src/weserv/client.lua b/src/weserv/client.lua index edaa371a..742b83b7 100644 --- a/src/weserv/client.lua +++ b/src/weserv/client.lua @@ -237,7 +237,7 @@ function client:request(uri, addl_headers, redirect_nr) local ok, keepalive_err = httpc:set_keepalive() if not ok then - ngx.log(ngx.ERR, 'Failed to set keepalive', keepalive_err) + ngx.log(ngx.ERR, 'Failed to set keepalive: ', keepalive_err) os.remove(res.tmpfile) return nil, { diff --git a/src/weserv/helpers/punycode.lua b/src/weserv/helpers/punycode.lua new file mode 100644 index 00000000..86c119f7 --- /dev/null +++ b/src/weserv/helpers/punycode.lua @@ -0,0 +1,191 @@ +local bit = require 'bit' +local math = math +local table = table +local string = string +local ipairs = ipairs +local select = select + +-- Parameter values for Punycode: https://tools.ietf.org/html/rfc3492#section-5 +local base = 36 +local t_min = 1 +local t_max = 26 +local skew = 38 +local damp = 700 +local initial_bias = 72 +local initial_n = 128 -- 0x80 +local delimiter = '-' -- 0x2D + +-- Highest positive signed 32-bit float value +local max_int = 2147483647; -- aka. 0x7FFFFFFF or 2^31-1 + +--- Punycode module +-- @module punycode +local punycode = {} +punycode.__index = punycode + +--- Returns the code of a UTF-8 character. +-- From sarn_utf8_code_func: http://lua-users.org/wiki/ValidateUnicodeString +function punycode.utf8_code(code, ...) + local offset = { 0, 0x3000, 0xE0000, 0x3C00000 } + + local num_bytes = select('#', ...) + for i = 1, num_bytes do + local b = select(i, ...) + code = bit.lshift(code, 6) + bit.band(b, 63) + end + + return code - offset[num_bytes + 1] +end + +--- Convert string to universal character set coded +-- in 4 octets [0,0x7FFFFFFF] +function punycode.to_ucs4(str) + local out = {} + -- https://stackoverflow.com/a/22954220/1480019 + for c in str:gmatch('([%z\1-\127\194-\244][\128-\191]*)') do + out[#out + 1] = punycode.utf8_code(string.byte(c, 1, -1)) + end + + return out +end + +--- This function converts a digit/integer into a basic code point. +-- whose value (when used for representing integers) is `d`, +-- which needs to be in the range `0` to `base - 1`. +-- @param digit The numeric value of a basic code point. +-- @return The basic code point. +function punycode.encode_digit(d) + -- 0..25 map to ASCII a..z or A..Z + -- 26..35 map to ASCII 0..9 + return d + 22 + 75 * (d < 26 and 1 or 0) +end + +--- Bias adaptation function as per section 3.4 of RFC 3492. +-- https://tools.ietf.org/html/rfc3492#section-3.4 +function punycode.adapt(delta, numPoints, firstTime) + delta = firstTime and math.floor(delta / damp) or math.floor(delta / 2) + + delta = delta + math.floor(delta / numPoints) + + local k = 0 + + while delta > math.floor((base - t_min) * t_max / 2) do + delta = math.floor(delta / (base - t_min)) + k = k + 1 + end + + return base * k + math.floor((base - t_min + 1) * delta / (delta + skew)); +end + +--- Encoding procedure as per section 6.3 of RFC 3492. +-- https://tools.ietf.org/html/rfc3492#section-6.3 +-- @param input list-table of Unicode code points. +-- @return The new encoded string. +function punycode.encode(input) + input = punycode.to_ucs4(input) + + local codepoints = {} + + -- Cache the length. + local input_length = #input + + -- Initialize the state. + local n = initial_n + local delta = 0 + local bias = initial_bias + + -- Handle the basic code points. + for j = 1, input_length do + local c = input[j] + if c < 0x80 then + codepoints[#codepoints + 1] = string.char(c) + end + end + + -- The number of basic code points. + local basic_length = #codepoints + + -- The number of code points that have been handled + local h = basic_length + + -- Finish the basic string with a delimiter unless it's empty. + if basic_length > 0 then codepoints[#codepoints + 1] = delimiter end + + -- Main encoding loop + while h < input_length do + -- All non-basic code points < n have been handled already. Find + -- the next larger one: + local m = max_int + for _, v in ipairs(input) do + if v >= n and v < m then + m = v + end + end + + -- Increase `delta` enough to advance the decoder's state to + delta = delta + (m - n) * (h + 1) + n = m + + for _, curr_v in ipairs(input) do + if curr_v < n then + delta = delta + 1 + end + + if curr_v == n then + -- Represent delta as a generalized variable-length integer. + local q = delta + local k = base + + while true do + local t = (k <= bias) and t_min or + (k >= bias + t_max) and t_max or (k - bias) + if q < t then break end + + local q_minus_t = q - t + local base_minus_t = base - t + codepoints[#codepoints + 1] = string.char(punycode.encode_digit(t + q_minus_t % base_minus_t)) + + q = math.floor(q_minus_t / base_minus_t) + + k = k + base + end + + codepoints[#codepoints + 1] = string.char(punycode.encode_digit(q)) + bias = punycode.adapt(delta, h + 1, h == basic_length) + + delta = 0 + h = h + 1 + end + end + + delta = delta + 1 + n = n + 1 + end + + return table.concat(codepoints, '') +end + +--- Encode a IDN domain. +-- If the domain is already ASCII, it is returned in its original state. +-- If any encoding was required, the "xn--" prefix is added. +function punycode.domain_encode(domain) + local labels = {} + for label in domain:gmatch('([^.]+)%.?') do + -- Domain names can only consist of [a-zA-Z0-9-] and aren't allowed + -- to start or end with a hyphen + local first, last = label:sub(1, 1), label:sub(-1) + if first == '-' or last == '-' then + return nil, 'Invalid domain label' + end + + if label:match('^[a-zA-Z0-9-]+$') then + labels[#labels + 1] = label + elseif label:sub(1, 1) ~= '-' and label:sub(2, 2) ~= '-' then + labels[#labels + 1] = 'xn--' .. punycode.encode(label) + end + end + + return table.concat(labels, '.') +end + +return punycode \ No newline at end of file diff --git a/src/weserv/helpers/utils.lua b/src/weserv/helpers/utils.lua index a19d3aa5..26c7dae1 100644 --- a/src/weserv/helpers/utils.lua +++ b/src/weserv/helpers/utils.lua @@ -1,3 +1,4 @@ +local punycode = require "weserv.helpers.punycode" local ffi = require "ffi" local ngx = ngx local table = table @@ -130,19 +131,12 @@ end function utils.canonicalise_path(path) local segments = {} for segment in path:gmatch("/([^/]*)") do - if segment ~= "" and segment ~= "." then - segments[#segments + 1] = ngx.unescape_uri(segment):gsub(REGEX_DISALLOWED_CHARS, utils.percent_encode) - end + segments[#segments + 1] = ngx.unescape_uri(segment):gsub(REGEX_DISALLOWED_CHARS, utils.percent_encode) end local len = #segments if len == 0 then return "/" end - -- If there was a slash on the end, keep it there. - if path:sub(-1, -1) == "/" then - len = len + 1 - segments[len] = "" - end segments[0] = "" segments = table.concat(segments, "/", 0, len) return segments @@ -165,11 +159,18 @@ end -- @param uri The URI. -- @return Parsed URI. function utils.parse_uri(uri) - local m = ngx.re.match(utils.clean_uri(uri), [[^(?:(http[s]?):)?//([^:/\?]+)(?::(\d+))?([^\?]*)\??(.*)]], 'jo') + local m = ngx.re.match(utils.clean_uri(uri), [[^(http[s]?)://([^:/\?]+)(?::(\d+))?([^\?]*)\??(.*)]], 'jo') - if not m or not m[1] then + if not m then return nil, "Unable to parse URL" else + local punycode_idn, err = punycode.domain_encode(m[2]) + if not punycode_idn then + return nil, err + end + + m[2] = punycode_idn + if m[3] then m[3] = tonumber(m[3]) else diff --git a/src/weserv/policy.lua b/src/weserv/policy.lua index 309c6a76..b0a42678 100644 --- a/src/weserv/policy.lua +++ b/src/weserv/policy.lua @@ -59,19 +59,19 @@ function policy:ban_at_cloudflare(ip_address) }) if not res then - ngx.log(ngx.ERR, string.format("Failed to ban (for %s) at CloudFlare", ip_address), request_err) + ngx.log(ngx.ERR, string.format("Failed to ban (for %s) at CloudFlare: ", ip_address), request_err) return false end if res.status ~= ngx.HTTP_OK then - ngx.log(ngx.ERR, string.format("Failed to ban (for %s) at CloudFlare", ip_address), res.body) + ngx.log(ngx.ERR, string.format("Failed to ban (for %s) at CloudFlare: ", ip_address), res.body) return false end local response = cjson.decode(res.body) local success = response.result ~= cjson.null and response.result.id ~= cjson.null if not success then - ngx.log(ngx.ERR, string.format("Failed to ban (for %s) at CloudFlare", ip_address), res.body) + ngx.log(ngx.ERR, string.format("Failed to ban (for %s) at CloudFlare: ", ip_address), res.body) return false end @@ -104,19 +104,19 @@ function policy:unban_at_cloudflare(identifier) }) if not res then - ngx.log(ngx.ERR, string.format("Failed to unban (for identifier %s) at CloudFlare", identifier), request_err) + ngx.log(ngx.ERR, string.format("Failed to unban (for identifier %s) at CloudFlare: ", identifier), request_err) return false end if res.status ~= ngx.HTTP_OK then - ngx.log(ngx.ERR, string.format("Failed to unban (for identifier %s) at CloudFlare", identifier), res.body) + ngx.log(ngx.ERR, string.format("Failed to unban (for identifier %s) at CloudFlare: ", identifier), res.body) return false end local response = cjson.decode(res.body) local success = response.result ~= cjson.null and response.result.id ~= cjson.null if not success then - ngx.log(ngx.ERR, string.format("Failed to unban (for identifier %s) at CloudFlare", identifier), res.body) + ngx.log(ngx.ERR, string.format("Failed to unban (for identifier %s) at CloudFlare: ", identifier), res.body) return false end diff --git a/src/weserv/throttler.lua b/src/weserv/throttler.lua index 93411248..ed65fe61 100644 --- a/src/weserv/throttler.lua +++ b/src/weserv/throttler.lua @@ -40,7 +40,7 @@ function throttler:is_exceeded(ip_address) local ok, set_err = self.redis:set(string.format("%s_%s:lockout", self.config.prefix, ip_address), expires, 'ex', ttl) if not ok then - ngx.log(ngx.ERR, string.format("Failed to set lockout key (for %s)", ip_address), set_err) + ngx.log(ngx.ERR, string.format("Failed to set lockout key (for %s): ", ip_address), set_err) end self:reset_attempts(ip_address) @@ -57,7 +57,7 @@ end function throttler:increment(ip_address, decay_minutes) local value, incr_err = self.redis:incr(string.format("%s_%s", self.config.prefix, ip_address)) if not value then - ngx.log(ngx.ERR, string.format("Failed to increment (for %s)", ip_address), incr_err) + ngx.log(ngx.ERR, string.format("Failed to increment (for %s): ", ip_address), incr_err) return -1 end @@ -66,7 +66,7 @@ function throttler:increment(ip_address, decay_minutes) local ttl = decay_minutes * 60 local res, expire_err = self.redis:expire(string.format("%s_%s", self.config.prefix, ip_address), ttl) if not res then - ngx.log(ngx.ERR, string.format("Failed to set expiry (for %s) to %d", ip_address, ttl), expire_err) + ngx.log(ngx.ERR, string.format("Failed to set expiry (for %s) to %d: ", ip_address, ttl), expire_err) end end return value @@ -78,7 +78,7 @@ end function throttler:attempts(ip_address) local attempts, get_err = self.redis:get(string.format("%s_%s", self.config.prefix, ip_address)) if not attempts then - ngx.log(ngx.ERR, string.format("Failed to get number of attempts (for %s)", ip_address), get_err) + ngx.log(ngx.ERR, string.format("Failed to get number of attempts (for %s): ", ip_address), get_err) return -1 end @@ -91,7 +91,7 @@ end function throttler:reset_attempts(ip_address) local res, del_err = self.redis:del(string.format("%s_%s", self.config.prefix, ip_address)) if not res then - ngx.log(ngx.ERR, string.format("Failed to reset number of attempts (for %s)", ip_address), del_err) + ngx.log(ngx.ERR, string.format("Failed to reset number of attempts (for %s): ", ip_address), del_err) return false end @@ -118,7 +118,7 @@ function throttler:clear(ip_address) local success = self:reset_attempts(ip_address) local res, del_err = self.redis:del(string.format("%s_%s:lockout", self.config.prefix, ip_address)) if not res then - ngx.log(ngx.ERR, string.format("Failed to delete lockout key (for %s)", ip_address), del_err) + ngx.log(ngx.ERR, string.format("Failed to delete lockout key (for %s): ", ip_address), del_err) return false end return success @@ -130,7 +130,7 @@ end function throttler:available_in(ip_address) local expires, get_err = self.redis:get(string.format("%s_%s:lockout", self.config.prefix, ip_address)) if not expires then - ngx.log(ngx.ERR, string.format("Failed to get expires time (for %s)", ip_address), get_err) + ngx.log(ngx.ERR, string.format("Failed to get expires time (for %s): ", ip_address), get_err) return -1 end return expires - ngx.time()