Skip to content

Commit

Permalink
Handle and log errors when ingesting bioschemas content
Browse files (browse the repository at this point in the history)
  • Loading branch information
fbacall committed Mar 5, 2024
1 parent 55b5933 commit b1ed5fe
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 60 deletions.
2 changes: 1 addition & 1 deletion app/controllers/bioschemas_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def run_test
if body
begin
ingestor = Ingestors::BioschemasIngestor.new
@output = ingestor.read_content(StringIO.new(body), url: params[:url] || 'https://example.com')
@output = ingestor.read_content(StringIO.new(body), url: params[:url] || Ingestors::BioschemasIngestor::DUMMY_URL).merge(messages: ingestor.messages)
rescue RDF::ReaderError
flash[:error] = 'A parsing error occurred. Please check your document contains valid JSON-LD or HTML.'
format.html { render :test, status: :unprocessable_entity }
Expand Down
20 changes: 15 additions & 5 deletions app/views/bioschemas/_test_results.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,22 @@
<% end %>
</div>
<div class="col-md-8 col-md-pull-4">
<% unless @output[:messages].blank? %>
<h5>Log</h5>
<div class="markdown source-log">
<%= render_markdown(@output[:messages].join("\n\n")) %>
</div>
<% end %>
<h4>Bioschemas summary:</h4>
<table class="table" style="max-width: 20em">
<% @output[:totals].each do |type, total| %>
<tr><td><%= type %></td><td><%= total %></td></tr>
<% end %>
</table>
<% if @output[:totals].values.sum.zero? %>
<span class="muted">Nothing found</span>
<% else %>
<table class="table" style="max-width: 20em">
<% @output[:totals].each do |type, total| %>
<tr><td><%= type %></td><td><%= total %></td></tr>
<% end %>
</table>
<% end %>
</div>
</div>

Expand Down
2 changes: 1 addition & 1 deletion app/views/sources/_test_results.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
<% unless test_results[:messages].blank? %>
<h5>Log</h5>
<div class="markdown source-log">
<%= render_markdown(test_results[:messages].join("\n")) %>
<%= render_markdown(test_results[:messages].join("\n\n")) %>
</div>
<% end %>

Expand Down
116 changes: 70 additions & 46 deletions lib/ingestors/bioschemas_ingestor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

module Ingestors
class BioschemasIngestor < Ingestor
DUMMY_URL = 'https://example.com'

attr_reader :verbose

def self.config
Expand All @@ -17,13 +19,14 @@ def read(source_url)
sitemap_regex = nil
@verbose = false
sources = if source_url.downcase.match?(/sitemap(.*)?.xml\Z/)
@messages << "\nParsing sitemap: #{source_url}\n"
sitemap_message = "Parsing sitemap: #{source_url}\n"
urls = SitemapParser.new(source_url, {
recurse: true,
url_regex: sitemap_regex,
headers: { 'User-Agent' => config[:user_agent] }
}).to_a.uniq.map(&:strip)
@messages << " - #{urls.count} URLs found"
recurse: true,
url_regex: sitemap_regex,
headers: { 'User-Agent' => config[:user_agent] }
}).to_a.uniq.map(&:strip)
sitemap_message << "\n - #{urls.count} URLs found"
@messages << sitemap_message
urls
else
[source_url]
Expand All @@ -35,19 +38,21 @@ def read(source_url)
sources.each do |url|
source = open_url(url)
output = read_content(source, url: url)
provider_events += output[:resources][:events]
provider_materials += output[:resources][:materials]
output[:totals].each do |key, value|
totals[key] += value
if output
provider_events += output[:resources][:events]
provider_materials += output[:resources][:materials]
output[:totals].each do |key, value|
totals[key] += value
end
end
end

if totals.keys.any?
@messages << "\nBioschemas summary:\n"
bioschemas_summary = "Bioschemas summary:\n"
totals.each do |type, count|
@messages << " - #{type}: #{count}"
bioschemas_summary << "\n - #{type}: #{count}"
end

@messages << bioschemas_summary
end

deduplicate(provider_events).each do |event_params|
Expand All @@ -65,46 +70,65 @@ def read_content(content, url: nil)
events: [],
materials: []
},
totals: Hash.new(0)
totals: Hash.new(0)
}

return output unless content

sample = content.read(256)&.strip
return output unless sample
begin
sample = content.read(256)&.strip
return output unless sample

format = sample.start_with?('[') || sample.start_with?('{') ? :jsonld : :rdfa
content.rewind
source = content.read
events = Tess::Rdf::EventExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
courses = Tess::Rdf::CourseExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
course_instances = Tess::Rdf::CourseInstanceExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
learning_resources = Tess::Rdf::LearningResourceExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
output[:totals]['Events'] += events.count
output[:totals]['Courses'] += courses.count
output[:totals]['CourseInstances'] += course_instances.count
output[:totals]['LearningResources'] += learning_resources.count
if verbose
puts "Events: #{events.count}"
puts "Courses: #{courses.count}"
puts "CourseInstances: #{course_instances.count}"
puts "LearningResources: #{learning_resources.count}"
end
format = sample.start_with?('[') || sample.start_with?('{') ? :jsonld : :rdfa
content.rewind
source = content.read
events = Tess::Rdf::EventExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
courses = Tess::Rdf::CourseExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
course_instances = Tess::Rdf::CourseInstanceExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
learning_resources = Tess::Rdf::LearningResourceExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
output[:totals]['Events'] += events.count
output[:totals]['Courses'] += courses.count
output[:totals]['CourseInstances'] += course_instances.count
output[:totals]['LearningResources'] += learning_resources.count
if verbose
puts "Events: #{events.count}"
puts "Courses: #{courses.count}"
puts "CourseInstances: #{course_instances.count}"
puts "LearningResources: #{learning_resources.count}"
end

deduplicate(events + courses + course_instances).each do |event|
output[:resources][:events] << event
end
deduplicate(events + courses + course_instances).each do |event|
output[:resources][:events] << event
end

deduplicate(learning_resources).each do |material|
output[:resources][:materials] << material
deduplicate(learning_resources).each do |material|
output[:resources][:materials] << material
end
rescue StandardError => e
Rails.logger.error("#{e.class}: #{e.message}")
Rails.logger.error(e.backtrace.join("\n")) if e.backtrace&.any?
error = 'An error'
comment = nil
if e.is_a?(RDF::ReaderError)
error = 'A parsing error'
comment = 'Please check your page contains valid JSON-LD or HTML.'
end
message = "#{error} occurred while reading"
if url.present? && url != 'https://example.com'
message << ": #{url} "
else
message << " the source"
end
message << ". #{comment}" if comment
@messages << message
end

output
Expand Down
10 changes: 6 additions & 4 deletions test/controllers/bioschemas_controller_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,9 @@ class BioschemasControllerTest < ActionController::TestCase

post :run_test, params: { snippet: "{ 'oh dear }" }

assert_response :unprocessable_entity
assert flash[:error].include?('parsing error')
assert_response :success
assert_select '.source-log', text:
'A parsing error occurred while reading the source. Please check your page contains valid JSON-LD or HTML.'
ensure
JSON::LD::Reader.define_method(old_method.name, old_method)
end
Expand All @@ -111,8 +112,9 @@ class BioschemasControllerTest < ActionController::TestCase

post :run_test, params: { url: 'https://website.com/material.json' }

assert_response :unprocessable_entity
assert flash[:error].include?('parsing error')
assert_response :success
assert_select '.source-log', text:
'A parsing error occurred while reading: https://website.com/material.json . Please check your page contains valid JSON-LD or HTML.'
ensure
JSON::LD::Reader.define_method(old_method.name, old_method)
end
Expand Down
7 changes: 4 additions & 3 deletions test/unit/ingestors/bioschemas_ingestor_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ class BioschemasIngestorTest < ActiveSupport::TestCase
@ingestor.read('https://training.galaxyproject.org/sitemap.xml')
assert_equal 0, @ingestor.events.count
assert_equal 3, @ingestor.materials.count
assert_includes @ingestor.messages, " - 6 URLs found"
assert_includes @ingestor.messages, " - Events: 0"
assert_includes @ingestor.messages, " - LearningResources: 3"
messages = @ingestor.messages.join("\n")
assert_includes messages, "\n - 6 URLs found"
assert_includes messages, "\n - Events: 0"
assert_includes messages, "\n - LearningResources: 3"

assert_difference('Material.count', 3) do
@ingestor.write(@user, @content_provider)
Expand Down

0 comments on commit b1ed5fe

Please sign in to comment.