test for event_check
mikesndrs committed Oct 21, 2024
1 parent cd3260d commit 6be1846
Showing 2 changed files with 94 additions and 48 deletions.
83 changes: 43 additions & 40 deletions lib/scraper.rb
@@ -168,6 +168,49 @@ def scrape(source, user = get_user, index: 0)
end
end

def scraper_event_check(data_sources)
return unless TeSS::Config&.scraper_event_check&.enabled

data_sources.each do |_key, sources|
sources.each do |source|
event_check_rejected(source)
event_check_stale(source)
end
end
end

def event_check_stale(source)
return unless TeSS::Config&.scraper_event_check&.[]('stale_threshold')

scraper_events = get_scraper_events(source.content_provider.events)
stale_warning = scraper_events.filter(&:stale?).count.to_f / scraper_events.count > TeSS::Config.scraper_event_check['stale_threshold']

if stale_warning && TeSS::Config.sentry_enabled?
Sentry.capture_message(
"Warning: #{source.content_provider.title} has too many stale events. Check if the scraper is still working properly.",
level: :warning
)
end
stale_warning
end

def event_check_rejected(source)
return unless TeSS::Config&.scraper_event_check&.[]('rejected_threshold')

rejected_warning = source.resources_rejected.to_f / (source.records_written + source.resources_rejected) > TeSS::Config.scraper_event_check['rejected_threshold']
if rejected_warning && TeSS::Config.sentry_enabled?
Sentry.capture_message(
"Warning: #{source.content_provider.title} has too many rejected events. Check if the scraper is still working properly.",
level: :warning
)
end
rejected_warning
end

def get_scraper_events(event_list)
event_list.filter { |e| e.respond_to?(:last_scraped) && !e.last_scraped.nil? && e.scraper_record }
end

private

def validate_source(source)
@@ -235,44 +278,4 @@ def get_user
end
user
end

def scraper_event_check(data_sources)
return unless TeSS::Config&.scraper_event_check&.enabled && TeSS::Config.sentry_enabled?

data_sources.each do |_key, sources|
sources.each do |source|
event_check_rejected(source)
event_check_stale(source)
end
end
end

def event_check_stale(source)
return unless TeSS::Config&.scraper_event_check&.stale_threshold

scraper_events = get_scraper_events(source.content_provider.events)
stale_warning = scraper_events.filter(&:stale?).count / scraper_events.count > TeSS::Config.scraper_event_check.stale_threshold
return unless stale_warning

Sentry.capture_message(
"Warning: #{source.content_provider.title} has too many stale events. Check if the scraper is still working properly.",
level: :warning
)
end

def event_check_rejected(source)
return unless TeSS::Config&.scraper_event_check&.rejected_threshold

rejected_warning = source.resources_rejected / (source.records_written + source.resources_rejected) > TeSS::Config.scraper_event_check.rejected_threshold
return unless rejected_warning

Sentry.capture_message(
"Warning: #{source.content_provider.title} has too many rejected events. Check if the scraper is still working properly.",
level: :warning
)
end

def get_scraper_events(event_list)
event_list.filter { |e| e.respond_to?(:last_scraped) && !e.last_scraped.nil? && e.scraper_record }
end
end
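
For context, the relocated checks read three settings from TeSS::Config.scraper_event_check. A minimal sketch of the expected shape, mirroring the with_settings calls in the tests below (the key names come from the code above; the 0.3 thresholds are illustrative values, not project defaults):

{
  scraper_event_check: {
    enabled: true,            # master switch tested in scraper_event_check
    stale_threshold: 0.3,     # warn when more than 30% of a provider's scraped events are stale
    rejected_threshold: 0.3   # warn when more than 30% of a source's records were rejected
  }
}

Note that the warning messages are only forwarded to Sentry when TeSS::Config.sentry_enabled? is also true; event_check_stale and event_check_rejected return their boolean result either way, which is what the new unit tests assert on.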
59 changes: 51 additions & 8 deletions test/unit/ingestors/scraper_test.rb
@@ -234,28 +234,29 @@ def run
end

test 'does not scrape disabled or unapproved sources' do
WebMock.stub_request(:get, /https:\/\/app.com\/\d/).to_return(status: 200,
body: File.open(Rails.root.join('test', 'fixtures', 'files', 'ingestion', 'events.csv')))
WebMock.stub_request(:get, %r{https://app.com/\d}).to_return(status: 200,
body: File.open(Rails.root.join('test', 'fixtures', 'files', 'ingestion',
'events.csv')))

scraper = Scraper.new(load_scraper_config('test_ingestion_disabled.yml'))
provider = content_providers(:goblet)
user = users(:admin)
unapproved_source = provider.sources.create!(url: 'https://app.com/2', method: 'event_csv', user: user,
unapproved_source = provider.sources.create!(url: 'https://app.com/2', method: 'event_csv', user:,
enabled: true, approval_status: 'not_approved')
approval_requested_source = provider.sources.create!(url: 'https://app.com/3', method: 'event_csv', user: user,
approval_requested_source = provider.sources.create!(url: 'https://app.com/3', method: 'event_csv', user:,
enabled: true, approval_status: 'requested')
User.current_user = user # Admin is required to save approved status
enabled_source = provider.sources.create!(url: 'https://app.com/1', method: 'event_csv', user: user,
enabled_source = provider.sources.create!(url: 'https://app.com/1', method: 'event_csv', user:,
enabled: true, approval_status: 'approved')
disabled_source = provider.sources.create!(url: 'https://app.com/4', method: 'event_csv', user: user,
disabled_source = provider.sources.create!(url: 'https://app.com/4', method: 'event_csv', user:,
enabled: false, approval_status: 'approved')

scraper.run

logfile = scraper.log_file
# From Config
assert logfile_contains(logfile, "Source URL[https://app.com/events/sitemap.xml]")
refute logfile_contains(logfile, "Source URL[https://app.com/events/disabled.xml]")
assert logfile_contains(logfile, 'Source URL[https://app.com/events/sitemap.xml]')
refute logfile_contains(logfile, 'Source URL[https://app.com/events/disabled.xml]')
# From Database
assert logfile_contains(logfile, "Source URL[#{enabled_source.url}]")
refute logfile_contains(logfile, "Source URL[#{disabled_source.url}]")
@@ -288,6 +289,48 @@ def run
assert_includes source.log, '- CourseInstances: 23'
end

def beep # rubocop:disable Metrics/AbcSize
provider = content_providers(:portal_provider)
@source = Source.create!(url: 'https://somewhere.com/stuff', method: 'bioschemas',
enabled: true, approval_status: 'approved',
content_provider: provider, user: users(:admin))
file = Rails.root.join('test', 'fixtures', 'files', 'ingestion', 'nbis-course-instances.json')
WebMock.stub_request(:get, @source.url).to_return(status: 200, headers: {}, body: file.read)
@scraper = Scraper.new

refute provider.events.where(url: 'https://uppsala.instructure.com/courses/75565').exists?
assert_difference('provider.events.count', 23) do
@scraper.scrape(@source)
end
assert provider.events.where(url: 'https://uppsala.instructure.com/courses/75565').exists?
@source.reload
end

test 'event_check_stale' do
beep
with_settings({ scraper_event_check: { enabled: true, stale_threshold: 0.3 } }) do
assert_not @scraper.event_check_stale(@source)
@source.content_provider.events.each do |event|
event.last_scraped = 10.days.ago
event.timezone = 'Amsterdam'
event.save!
end
assert @scraper.event_check_stale(@source)
end
end

test 'event_check_rejected' do
beep
with_settings({ scraper_event_check: { enabled: true, rejected_threshold: 0.3 } }) do
assert_not @scraper.event_check_rejected(@source)
@source.records_written = 10
@source.resources_rejected = 90
@source.save!
@source.reload
assert @scraper.event_check_rejected(@source)
end
end

private

def check_task_finished(logfile)
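
As a quick sanity check on the threshold arithmetic the new tests exercise, here is the rejected-ratio calculation with the values used in the event_check_rejected test above (10 records written, 90 rejected, threshold 0.3); this is a worked example, not additional test code:

records_written    = 10
resources_rejected = 90
ratio = resources_rejected.to_f / (records_written + resources_rejected)
# ratio == 0.9, which is greater than the 0.3 rejected_threshold,
# so event_check_rejected returns true and, with Sentry enabled, captures a warning.

event_check_stale applies the same comparison to the fraction of a content provider's scraper-tracked events that report stale?, which the test forces by updating every event's last_scraped and timezone.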
