Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Robust URL checker #51

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions lib/relaton/render/general/render.rb
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,11 @@ def add_date_accessed(data, template)
(/\{\{\s*date_accessed\s*\}\}/.match?(template) &&
/\{\{\s*uri\s*\}\}/.match?(template) &&
data[:uri_raw] && !data[:date_accessed]) or return
if url_exist?(data[:uri_raw])

if url_is_not_accessible?(data[:uri_raw])
url_warn(data[:uri_raw])
else
data[:date_accessed] = { on: ::Date.today.to_s }
else url_warn(data[:uri_raw])
end
end

Expand Down
127 changes: 112 additions & 15 deletions lib/relaton/render/general/uri.rb
Original file line number Diff line number Diff line change
@@ -1,28 +1,120 @@
require "concurrent"
require "net/http"
require "uri"
require "net_http_timeout_errors"

module Relaton
module Render
class General
def url_exist?(url_string)
return true # temporarily disabling validation of URIs
# Response classes that url_state treats as proof that the URL is
# accessible (it maps them to ACCESSIBLE).
OK_CODES = [
Net::HTTPOK, # 200
Net::HTTPCreated, # 201
Net::HTTPNonAuthoritativeInformation, # 203
Net::HTTPPartialContent, # 206
Net::HTTPMultipleChoices, # 300
Net::HTTPNotModified, # 304
].freeze

# Response classes that url_state treats as proof that the URL is not
# accessible (it maps them to NON_ACCESSIBLE).
NON_ACCESSIBLE_CODES = [
Net::HTTPNotFound, # 404
Net::HTTPGone, # 410
].freeze

# Responses that can legitimately occur but are inconclusive: they neither
# prove nor disprove that the URL is accessible. url_state maps them to
# POSSIBLY_ACCESSIBLE. Comment-only entries below are status codes that
# have no class listed in this array.
UNKNOWN_STATE_CODES = [
Net::HTTPEarlyHints, # 103
Net::HTTPMovedPermanently, # 301
Net::HTTPFound, # 302
Net::HTTPSeeOther, # 303
Net::HTTPUseProxy, # 305
Net::HTTPTemporaryRedirect, # 307
Net::HTTPPermanentRedirect, # 308
Net::HTTPUnauthorized, # 401
Net::HTTPPaymentRequired, # 402
Net::HTTPForbidden, # 403
Net::HTTPMethodNotAllowed, # 405
Net::HTTPNotAcceptable, # 406
Net::HTTPProxyAuthenticationRequired, # 407
Net::HTTPRequestTimeOut, # 408
Net::HTTPConflict, # 409
Net::HTTPPreconditionFailed, # 412
Net::HTTPRequestEntityTooLarge, # 413
Net::HTTPRequestURITooLong, # 414
Net::HTTPUnsupportedMediaType, # 415
Net::HTTPExpectationFailed, # 417
Net::HTTPMisdirectedRequest, # 421
Net::HTTPUnprocessableEntity, # 422
# 425 (no class listed here)
Net::HTTPUpgradeRequired, # 426
Net::HTTPTooManyRequests, # 429
Net::HTTPRequestHeaderFieldsTooLarge, # 431
Net::HTTPUnavailableForLegalReasons, # 451
Net::HTTPInternalServerError, # 500
Net::HTTPNotImplemented, # 501
Net::HTTPBadGateway, # 502
Net::HTTPServiceUnavailable, # 503
Net::HTTPGatewayTimeOut, # 504
Net::HTTPVersionNotSupported, # 505
Net::HTTPVariantAlsoNegotiates, # 506
Net::HTTPNetworkAuthenticationRequired, # 511
# 520-527 (Cloudflare-specific codes; no class listed here)
# 530 (no class listed here)
].freeze

# Result states produced by url_state and interpreted by
# url_is_not_accessible?. Only NON_ACCESSIBLE makes the latter return true.
ACCESSIBLE = :accessible
NON_ACCESSIBLE = :non_accessible
POSSIBLY_ACCESSIBLE = :possibly_accessible
UNEXPECTED_RESPONSE = :unexpected_response

# Tells whether a URL is known for certain to be unreachable.
#
# Delegates to url_state and collapses its result into a boolean:
# true only for NON_ACCESSIBLE; false for ACCESSIBLE, POSSIBLY_ACCESSIBLE
# and UNEXPECTED_RESPONSE, so inconclusive results do not count as
# "not accessible". Any other state raises a RuntimeError, since it
# means url_state returned something this method does not know about.
def url_is_not_accessible?(url)
  outcome = url_state(url)

  # Constant on the left keeps the same comparison direction as `when`.
  return true if NON_ACCESSIBLE == outcome

  tolerated = [ACCESSIBLE, POSSIBLY_ACCESSIBLE, UNEXPECTED_RESPONSE]
  return false if tolerated.include?(outcome)

  raise "Unknown state '#{outcome}' for URL '#{url}'"
end

# Returns one of four states:
# (1) ACCESSIBLE - definitely accessible
# (2) NON_ACCESSIBLE - definitely not accessible
# (3) POSSIBLY_ACCESSIBLE - possibly accessible
# (4) UNEXPECTED_RESPONSE - unexpected response, for all other cases
def url_state(url_string)
url = URI.parse(url_string)
url.host or return true # allow file URLs
res = access_url(url) or return false
res.is_a?(Net::HTTPRedirection) and return url_exist?(res["location"])
res.code[0] != "4"
rescue Errno::ENOENT, SocketError
false # false if can't find the server
url.is_a?(URI::File) and return ACCESSIBLE
url.path or return NON_ACCESSIBLE # does not allow broken URLs

# when could not connect, it could be temporary
res = access_url(url) or return POSSIBLY_ACCESSIBLE

case res
when *NON_ACCESSIBLE_CODES then NON_ACCESSIBLE
when *OK_CODES then ACCESSIBLE
when *UNKNOWN_STATE_CODES then POSSIBLY_ACCESSIBLE
else UNEXPECTED_RESPONSE # TODO: track somewhere an unexpected code
end
rescue URI::InvalidURIError
NON_ACCESSIBLE
end

def access_url(url)
path = url.path or return false
path.empty? and path = "/"
url_head(url, path)
rescue StandardError => e
tries ||= 0

path = url.path.empty? ? "/" : url.path

NetHttpTimeoutErrors.conflate do
url_head(url, path)
end
rescue NetHttpTimeoutError => e
tries += 1
retry if tries < 3

warn e
false
nil
end

def url_head(url, path)
Expand All @@ -42,15 +134,20 @@ def urls_exist_concurrent(urls)
responses = Concurrent::Array.new
thread_pool = Concurrent::FixedThreadPool.new(5)
urls.each do |u|
thread_pool.post { responses << url_exist_async?(u) }
thread_pool.post do
responses << url_exist_async?(u)
rescue StandardError => e
warn "Error in a thread pool: #{e.inspect}. " \
"Backtrace:\n#{e.backtrace.join("\n")}"
end
end
thread_pool.shutdown
thread_pool.wait_for_termination
responses.each_with_object({}) { |n, m| m[n[:url]] = n[:status] }
end

def url_exist_async?(url_string)
{ url: url_string, status: url_exist?(url_string) }
{ url: url_string, status: !url_is_not_accessible?(url_string) }
end
end
end
Expand Down
1 change: 1 addition & 0 deletions relaton-render.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Gem::Specification.new do |spec|
spec.add_dependency "isodoc-i18n", "~> 1.2.1"
spec.add_dependency "liquid", "~> 5"
spec.add_dependency "nokogiri"
spec.add_dependency "net_http_timeout_errors"
spec.add_dependency "relaton-bib", ">= 1.13.0"
spec.add_dependency "twitter_cldr"
spec.add_dependency "tzinfo-data" # we need this for windows only
Expand Down
Loading