diff --git a/lib/scraper.rb b/lib/scraper.rb index 67c8ce2e8..18bb61570 100644 --- a/lib/scraper.rb +++ b/lib/scraper.rb @@ -168,6 +168,49 @@ def scrape(source, user = get_user, index: 0) end end + def scraper_event_check(data_sources) + return unless TeSS::Config&.scraper_event_check&.enabled + + data_sources.each do |_key, sources| + sources.each do |source| + event_check_rejected(source) + event_check_stale(source) + end + end + end + + def event_check_stale(source) + return unless TeSS::Config&.scraper_event_check&.[]('stale_threshold') + + scraper_events = get_scraper_events(source.content_provider.events) + stale_warning = scraper_events.filter(&:stale?).count.to_f / scraper_events.count > TeSS::Config.scraper_event_check['stale_threshold'] + + if stale_warning && TeSS::Config.sentry_enabled? + Sentry.capture_message( + "Warning: #{source.content_provider.title} has too many stale events. Check if the scraper is still working properly.", + level: :warning + ) + end + stale_warning + end + + def event_check_rejected(source) + return unless TeSS::Config&.scraper_event_check&.[]('rejected_threshold') + + rejected_warning = source.resources_rejected.to_f / (source.records_written + source.resources_rejected) > TeSS::Config.scraper_event_check['rejected_threshold'] + if rejected_warning && TeSS::Config.sentry_enabled? + Sentry.capture_message( + "Warning: #{source.content_provider.title} has too many rejected events. Check if the scraper is still working properly.", + level: :warning + ) + end + rejected_warning + end + + def get_scraper_events(event_list) + event_list.filter { |e| e.respond_to?(:last_scraped) && !e.last_scraped.nil? && e.scraper_record } + end + private def validate_source(source) end @@ -235,44 +278,4 @@ def get_user end user end - - def scraper_event_check(data_sources) - return unless TeSS::Config&.scraper_event_check&.enabled && TeSS::Config.sentry_enabled? 
- - data_sources.each do |_key, sources| - sources.each do |source| - event_check_rejected(source) - event_check_stale(source) - end - end - end - - def event_check_stale(source) - return unless TeSS::Config&.scraper_event_check&.stale_threshold - - scraper_events = get_scraper_events(source.content_provider.events) - stale_warning = scraper_events.filter(&:stale?).count / scraper_events.count > TeSS::Config.scraper_event_check.stale_threshold - return unless stale_warning - - Sentry.capture_message( - "Warning: #{source.content_provider.title} has too many stale events. Check if the scraper is still working properly.", - level: :warning - ) - end - - def event_check_rejected(source) - return unless TeSS::Config&.scraper_event_check&.rejected_threshold - - rejected_warning = source.resources_rejected / (source.records_written + source.resources_rejected) > TeSS::Config.scraper_event_check.rejected_threshold - return unless rejected_warning - - Sentry.capture_message( - "Warning: #{source.content_provider.title} has too many rejected events. Check if the scraper is still working properly.", - level: :warning - ) - end - - def get_scraper_events(event_list) - event_list.filter { |e| e.respond_to?(:last_scraped) && !e.last_scraped.nil? 
&& e.scraper_record } - end end diff --git a/test/unit/ingestors/scraper_test.rb b/test/unit/ingestors/scraper_test.rb index 3d66628e2..e3c8600c3 100644 --- a/test/unit/ingestors/scraper_test.rb +++ b/test/unit/ingestors/scraper_test.rb @@ -234,28 +234,29 @@ def run end test 'does not scrape disabled or unapproved sources' do - WebMock.stub_request(:get, /https:\/\/app.com\/\d/).to_return(status: 200, - body: File.open(Rails.root.join('test', 'fixtures', 'files', 'ingestion', 'events.csv'))) + WebMock.stub_request(:get, %r{https://app.com/\d}).to_return(status: 200, + body: File.open(Rails.root.join('test', 'fixtures', 'files', 'ingestion', + 'events.csv'))) scraper = Scraper.new(load_scraper_config('test_ingestion_disabled.yml')) provider = content_providers(:goblet) user = users(:admin) - unapproved_source = provider.sources.create!(url: 'https://app.com/2', method: 'event_csv', user: user, + unapproved_source = provider.sources.create!(url: 'https://app.com/2', method: 'event_csv', user:, enabled: true, approval_status: 'not_approved') - approval_requested_source = provider.sources.create!(url: 'https://app.com/3', method: 'event_csv', user: user, + approval_requested_source = provider.sources.create!(url: 'https://app.com/3', method: 'event_csv', user:, enabled: true, approval_status: 'requested') User.current_user = user # Admin is required to save approved status - enabled_source = provider.sources.create!(url: 'https://app.com/1', method: 'event_csv', user: user, + enabled_source = provider.sources.create!(url: 'https://app.com/1', method: 'event_csv', user:, enabled: true, approval_status: 'approved') - disabled_source = provider.sources.create!(url: 'https://app.com/4', method: 'event_csv', user: user, + disabled_source = provider.sources.create!(url: 'https://app.com/4', method: 'event_csv', user:, enabled: false, approval_status: 'approved') scraper.run logfile = scraper.log_file # From Config - assert logfile_contains(logfile, "Source 
URL[https://app.com/events/sitemap.xml]") - refute logfile_contains(logfile, "Source URL[https://app.com/events/disabled.xml]") + assert logfile_contains(logfile, 'Source URL[https://app.com/events/sitemap.xml]') + refute logfile_contains(logfile, 'Source URL[https://app.com/events/disabled.xml]') # From Database assert logfile_contains(logfile, "Source URL[#{enabled_source.url}]") refute logfile_contains(logfile, "Source URL[#{disabled_source.url}]") @@ -288,6 +289,48 @@ def run assert_includes source.log, '- CourseInstances: 23' end + def scrape_fixture_source # rubocop:disable Metrics/AbcSize + provider = content_providers(:portal_provider) + @source = Source.create!(url: 'https://somewhere.com/stuff', method: 'bioschemas', + enabled: true, approval_status: 'approved', + content_provider: provider, user: users(:admin)) + file = Rails.root.join('test', 'fixtures', 'files', 'ingestion', 'nbis-course-instances.json') + WebMock.stub_request(:get, @source.url).to_return(status: 200, headers: {}, body: file.read) + @scraper = Scraper.new + + refute provider.events.where(url: 'https://uppsala.instructure.com/courses/75565').exists? + assert_difference('provider.events.count', 23) do + @scraper.scrape(@source) + end + assert provider.events.where(url: 'https://uppsala.instructure.com/courses/75565').exists? + @source.reload + end + + test 'event_check_stale' do + scrape_fixture_source + with_settings({ scraper_event_check: { enabled: true, stale_threshold: 0.3 } }) do + assert_not @scraper.event_check_stale(@source) + @source.content_provider.events.each do |event| + event.last_scraped = 10.days.ago + event.timezone = 'Amsterdam' + event.save! + end + assert @scraper.event_check_stale(@source) + end + end + + test 'event_check_rejected' do + scrape_fixture_source + with_settings({ scraper_event_check: { enabled: true, rejected_threshold: 0.3 } }) do + assert_not @scraper.event_check_rejected(@source) + @source.records_written = 10 + @source.resources_rejected = 90 + @source.save! 
+ @source.reload + assert @scraper.event_check_rejected(@source) + end + end + private def check_task_finished(logfile)