From 06e5cc397ab21cd7e28744e793dac5d78a9e6129 Mon Sep 17 00:00:00 2001 From: Mike Sanders Date: Mon, 21 Oct 2024 18:08:30 +0200 Subject: [PATCH 1/3] scraper for oscd --- lib/ingestors/oscd_ingestor.rb | 87 +++++++++++++++++++++++ test/unit/ingestors/oscd_ingestor_test.rb | 60 ++++++++++++++++ test/vcr_cassettes/ingestors/oscd.yml | 72 +++++++++++++++++++ 3 files changed, 219 insertions(+) create mode 100644 lib/ingestors/oscd_ingestor.rb create mode 100644 test/unit/ingestors/oscd_ingestor_test.rb create mode 100644 test/vcr_cassettes/ingestors/oscd.yml diff --git a/lib/ingestors/oscd_ingestor.rb b/lib/ingestors/oscd_ingestor.rb new file mode 100644 index 000000000..faab3c2a4 --- /dev/null +++ b/lib/ingestors/oscd_ingestor.rb @@ -0,0 +1,87 @@ +require 'open-uri' +require 'csv' +require 'nokogiri' + +module Ingestors + class OscdIngestor < Ingestor + def self.config + { + key: 'oscd_event', + title: 'OSCD Events API', + category: :events + } + end + + def read(url) + begin + process_oscd(url) + rescue Exception => e + @messages << "#{self.class.name} failed with: #{e.message}" + end + + # finished + nil + end + + private + + def process_oscd(_url) + url = 'https://osc-delft.github.io/events' + + event_page = Nokogiri::HTML5.parse(open_url(url.to_s, raise: true)).css('.article-post').children + first_event = true + event = nil + event_page.each do |div| + if div.name == 'h1' + first_event = false + event = OpenStruct.new + event.title = div.text + event.url = "#{url}##{event.title.downcase.gsub(' ', '_')}" + event.description = '' + event.source = 'OSCD' + event.timezone = 'Amsterdam' + end + + next if first_event || div.name == 'text' + + if div.name == 'p' + if div.text.strip.start_with?('Date & time:') + date_str = div.text.remove('Date & time:').strip + event.start, event.end = oscd_fix_time(date_str) + elsif div.text.strip.start_with?('Location:') + event.venue = div.text.remove('Location:').strip + else + event.description = [event.description, div.text.strip].join(' ') + end + if div&.next_sibling&.next_sibling.nil? || (div&.next_sibling&.next_sibling&.name == 'h1') + event.set_default_times + add_event(event) + end + end + rescue Exception => e + @messages << "Extract event fields failed with: #{e.message}" + end + end + end +end + +def oscd_fix_time(date_str) + date_str.split(',').each do |str| + str.strip.split(' ').each_cons(2) do |el1, el2| + if is_month?(el1) && el2.to_i.positive? + event_start = Time.zone.parse([el1, el2].join(' ')) + event_end = Time.zone.parse([el1, el2].join(' ')) + if event_start < (Time.zone.now - 2.weeks) + event_start = event_start.change(year: event_start.year + 1) + event_end = event_end.change(year: event_start.year + 1) + end + return event_start, event_end + end + end + end +end + +def is_month?(str) + formatted_str = str.strip.capitalize + Date::MONTHNAMES.include?(formatted_str) || Date::ABBR_MONTHNAMES.include?(formatted_str) +end diff --git a/test/unit/ingestors/oscd_ingestor_test.rb b/test/unit/ingestors/oscd_ingestor_test.rb new file mode 100644 index 000000000..8b0aebf4b --- /dev/null +++ b/test/unit/ingestors/oscd_ingestor_test.rb @@ -0,0 +1,60 @@ +require 'test_helper' + +class OscdIngestorTest < ActiveSupport::TestCase + setup do + @user = users(:regular_user) + @content_provider = content_providers(:another_portal_provider) + mock_ingestions + mock_timezone # System time zone should not affect test result + end + + teardown do + reset_timezone + end + + test 'can ingest events from oscd' do + source = @content_provider.sources.build( + url: 'https://osc-delft.github.io/events', + method: 'oscd', + enabled: true + ) + + ingestor = Ingestors::OscdIngestor.new + + # check event doesn't + new_title = 'Opening up a Flow battery by Sanli Faez' + new_url = 'https://osc-delft.github.io/events#opening_up_a_flow_battery_by_sanli_faez' + refute Event.where(title: new_title, url: new_url).any? + + # run task + assert_difference 'Event.count', 4 do + freeze_time(2019) do + VCR.use_cassette('ingestors/oscd') do + ingestor.read(source.url) + ingestor.write(@user, @content_provider) + end + end + end + + assert_equal 4, ingestor.events.count + assert ingestor.materials.empty? + assert_equal 4, ingestor.stats[:events][:added] + assert_equal 0, ingestor.stats[:events][:updated] + assert_equal 0, ingestor.stats[:events][:rejected] + + + # check event does exist + event = Event.where(title: new_title, url: new_url).first + assert event + assert_equal new_title, event.title + assert_equal new_url, event.url + + # check other fields + assert_equal 'OSCD', event.source + assert_equal 'Amsterdam', event.timezone + assert_equal Time.zone.parse('Tue, 21 Jan 2019 09:00:00.000000000 UTC +00:00'), event.start + assert_equal Time.zone.parse('Tue, 21 Jan 2019 17:00:00.000000000 UTC +00:00'), event.end + assert_equal 'Online - Register here', event.venue + assert event.online? + end +end diff --git a/test/vcr_cassettes/ingestors/oscd.yml b/test/vcr_cassettes/ingestors/oscd.yml new file mode 100644 index 000000000..15259ee2e --- /dev/null +++ b/test/vcr_cassettes/ingestors/oscd.yml @@ -0,0 +1,72 @@ +--- +http_interactions: +- request: + method: get + uri: https://osc-delft.github.io/events + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Connection: + - keep-alive + Content-Length: + - '12145' + Server: + - GitHub.com + Content-Type: + - text/html; charset=utf-8 + Permissions-Policy: + - interest-cohort=() + Last-Modified: + - Fri, 09 Jun 2023 11:08:05 GMT + Access-Control-Allow-Origin: + - "*" + Strict-Transport-Security: + - max-age=31556952 + Etag: + - W/"64830815-2f71" + Expires: + - Mon, 21 Oct 2024 14:35:24 GMT + Cache-Control: + - max-age=600 + X-Proxy-Cache: + - MISS + X-Github-Request-Id: + - FB2D:0DB5:44C9AE3:46C3AEA:67166454 + Accept-Ranges: + - bytes + Age: + - '0' + Date: + - Mon, 21 Oct 2024 15:06:48 GMT + Via: + - 1.1 varnish + X-Served-By: + - cache-ams2100084-AMS + X-Cache: + - HIT + X-Cache-Hits: + - '0' + X-Timer: + - S1729523208.413896,VS0,VE97 + Vary: + - Accept-Encoding + X-Fastly-Request-Id: + - e784ffe238a0a9d1f10db12b038ed739fc67bfe9 + body: + encoding: ASCII-8BIT + string: !binary |- + PCFET0NUWVBFIGh0bWw+CjxodG1sIGxhbmc9ImVuIj4KICAgIDxoZWFkPgogICAgICAgIDxtZXRhIGNoYXJzZXQ9InV0Zi04Ij4KICAgICAgICA8bWV0YSBodHRwLWVxdWl2PSJ4LXVhLWNvbXBhdGlibGUiIGNvbnRlbnQ9ImllPWVkZ2UiPgogICAgICAgIDxtZXRhIG5hbWU9InZpZXdwb3J0IiBjb250ZW50PSJ3aWR0aD1kZXZpY2Utd2lkdGgsIGluaXRpYWwtc2NhbGU9MSI+CiAgICAgICAgPHRpdGxlPkNvbW11bml0eSBldmVudHMgfCBPcGVuIFNjaWVuY2UgQ29tbXVuaXR5IERlbGZ0PC90aXRsZT4KICAgICAgICAKICAgICAgICA8bGluayBocmVmPSIvY3NzL2N1c3RvbS5jc3MiIHJlbD0ic3R5bGVzaGVldCIgdHlwZT0idGV4dC9jc3MiPgogICAgICAgIDxsaW5rIHJlbD0ic3R5bGVzaGVldCIgaHJlZj0iaHR0cHM6Ly9jZG4ucmF3Z2l0LmNvbS9qcHN3YWxzaC9hY2FkZW1pY29ucy9tYXN0ZXIvY3NzL2FjYWRlbWljb25zLm1pbi5jc3MiPgogICAgPC9oZWFkPgoKICAgIDxib2R5PgogICAgICAgIDxuYXYgY2xhc3M9Im5hdmJhciBpcy1maXhlZC10b3AiIHJvbGU9Im5hdmlnYXRpb24iIGFyaWEtbGFiZWw9Im1haW4gbmF2aWdhdGlvbiI+CiAgICA8ZGl2IGNsYXNzPSJjb250YWluZXIiPgogICAgICA8ZGl2IGNsYXNzPSJuYXZiYXItYnJhbmQiPgoKICAgICAgICA8YSBjbGFzcz0ibmF2YmFyLWl0ZW0iIGhyZWY9Ii8iPgogICAgICAgICAgICA8aW1nIHNyYz0iL2ltYWdlcy9PU0NETG9nby5wbmciIGFsdD0iT3BlbiBTY2llbmNlIENvbW11bml0eSBEZWxmdCI+CiAgICAgICAgPC9hPgoKICAgICAgICA8YQogICAgICAgICAgcm9sZT0iYnV0dG9uIgogICAgICAgICAgY2xhc3M9Im5hdmJhci1idXJnZXIgYnVyZ2VyIgogICAgICAgICAgYXJpYS1sYWJlbD0ibWVudSIKICAgICAgICAgIGFyaWEtZXhwYW5kZWQ9ImZhbHNlIgogICAgICAgICAgZGF0YS10YXJnZXQ9InRvcC1tZW51IgogICAgICAgID4KICAgICAgICAgIDxzcGFuIGFyaWEtaGlkZGVuPSJ0cnVlIj48L3NwYW4+IDxzcGFuIGFyaWEtaGlkZGVuPSJ0cnVlIj48L3NwYW4+CiAgICAgICAgICA8c3BhbiBhcmlhLWhpZGRlbj0idHJ1ZSI+PC9zcGFuPgogICAgICAgIDwvYT4KICAgICAgPC9kaXY+CgogICAgICA8ZGl2IGlkPSJ0b3AtbWVudSIgY2xhc3M9Im5hdmJhci1tZW51Ij4KICAgICAgICA8ZGl2IGNsYXNzPSJuYXZiYXItZW5kIj4KCiAgICAgICAgICA8ZGl2IGNsYXNzPSJuYXZiYXItaXRlbSBoYXMtZHJvcGRvd24gaXMtaG92ZXJhYmxlIj4KICAgICAgICAgICAgPGEgY2xhc3M9Im5hdmJhci1saW5rIiBocmVmPSIvaW5kZXgiPlRoZSBjb21tdW5pdHk8L2E+CiAgICAgICAgICAgIDxkaXYgY2xhc3M9Im5hdmJhci1kcm9wZG93biI+CiAgICAgICAgICAgICAgPGEgY2xhc3M9Im5hdmJhci1pdGVtIiBocmVmPSIvaW5kZXgjd2hvLWNhbi1qb2luLXRoZS1vcGVuLXNjaWVuY2UtY29tbXVuaXR5LWRlbGZ0Ij4gV2hvIGNhbiBqb2luIDwvYT4KICAgICAgICAgICAgICA8YSBjbGFzcz0ibmF2YmFyLWl0ZW0iIGhyZWY9Ii9pbmRleCN3aGF0LWFyZS10aGUtYmVuZWZpdHMiPiBCZW5lZml0cyA8L2E+CiAgICAgICAgICAgICAgPGEgY2xhc3M9Im5hdmJhci1pdGVtIiBocmVmPSIvaW5kZXgjZ2V0LWludm9sdmVkIj4gR2V0IGludm9sdmVkIDwvYT4KICAgICAgICAgICAgICA8YSBjbGFzcz0ibmF2YmFyLWl0ZW0iIGhyZWY9Ii9pbmRleCNjb2RlLW9mLWNvbmR1Y3QiPiBDb2RlIG9mIENvbmR1Y3QgPC9hPgogICAgICAgICAgICA8L2Rpdj4KICAgICAgICAgIDwvZGl2PgoKICAgICAgICAgIDxkaXYgY2xhc3M9Im5hdmJhci1pdGVtIGhhcy1kcm9wZG93biBpcy1ob3ZlcmFibGUiPgogICAgICAgICAgICA8YSBjbGFzcz0ibmF2YmFyLWxpbmsiIGhyZWY9Ii9pbml0aWF0aXZlcyI+SW5pdGlhdGl2ZXM8L2E+CiAgICAgICAgICAgIDxkaXYgY2xhc3M9Im5hdmJhci1kcm9wZG93biI+CiAgICAgICAgICAgICAgPGEgY2xhc3M9Im5hdmJhci1pdGVtIiBocmVmPSIvaW5pdGlhdGl2ZXMjZGF0YS1jaGFtcGlvbnMiPiBEYXRhIENoYW1waW9ucyA8L2E+CiAgICAgICAgICAgICAgPGEgY2xhc3M9Im5hdmJhci1pdGVtIiBocmVmPSIvaW5pdGlhdGl2ZXMjZGVsZnQtb3Blbi1oYXJkd2FyZSI+IERlbGZ0IE9wZW4gSGFyZHdhcmUgPC9hPgogICAgICAgICAgICAgIDxhIGNsYXNzPSJuYXZiYXItaXRlbSIgaHJlZj0iL2luaXRpYXRpdmVzI2NpdGl6ZW4tc2NpZW5jZSI+IENpdGl6ZW4gU2NpZW5jZSA8L2E+CiAgICAgICAgICAgICAgPGEgY2xhc3M9Im5hdmJhci1pdGVtIiBocmVmPSIvaW5pdGlhdGl2ZXMjdXJiYW5pc20iPiBSYmFuaXNtIDwvYT4KICAgICAgICAgICAgPC9kaXY+CiAgICAgICAgICA8L2Rpdj4KICAgICAgICAgICAgCiAgICAgICAgPGRpdiBjbGFzcz0ibmF2YmFyLWl0ZW0gaGFzLWRyb3Bkb3duIGlzLWhvdmVyYWJsZSI+CiAgICAgICAgICAgIDxhIGNsYXNzPSJuYXZiYXItbGluayIgaHJlZj0iL2Z1bmRpbmciPiBGdW5kaW5nIDwvYT4KICAgICAgICAgICAgPGRpdiBjbGFzcz0ibmF2YmFyLWRyb3Bkb3duIj4KICAgICAgICAgICAgICA8YSBjbGFzcz0ibmF2YmFyLWl0ZW0iIGhyZWY9Ii9mdW5kaW5nI21haW5zdHJlYW1pbmctb3Blbi1zY2llbmNlLWZ1bmQiPiBNYWluc3RyZWFtaW5nIE9wZW4gU2NpZW5jZSBGdW5kIDwvYT4KICAgICAgICAgICAgPC9kaXY+CiAgICAgICAgICA8L2Rpdj4KCiAgICAgICAgICA8YSBjbGFzcz0ibmF2YmFyLWl0ZW0iIGhyZWY9Ii9ldmVudHMiPiBFdmVudHMgPC9hPgoKICAgICAgICAgIDxkaXYgY2xhc3M9Im5hdmJhci1pdGVtIGhhcy1kcm9wZG93biBpcy1ob3ZlcmFibGUiPgogICAgICAgICAgICA8YSBjbGFzcz0ibmF2YmFyLWxpbmsiIGhyZWY9Ii9tZW1iZXJzIj5NZW1iZXJzPC9hPgogICAgICAgICAgICA8ZGl2IGNsYXNzPSJuYXZiYXItZHJvcGRvd24iPgogICAgICAgICAgICAgIDxhIGNsYXNzPSJuYXZiYXItaXRlbSIgaHJlZj0iL21lbWJlcnMjQUJFIj4gQUJFIDwvYT4KICAgICAgICAgICAgICA8YSBjbGFzcz0ibmF2YmFyLWl0ZW0iIGhyZWY9Ii9tZW1iZXJzI0FFIj4gQUUgPC9hPgogICAgICAgICAgICAgIDxhIGNsYXNzPSJuYXZiYXItaXRlbSIgaHJlZj0iL21lbWJlcnMjQVMiPiBBUyA8L2E+CiAgICAgICAgICAgICAgPGEgY2xhc3M9Im5hdmJhci1pdGVtIiBocmVmPSIvbWVtYmVycyNDRUciPiBDRUcgPC9hPgogICAgICAgICAgICAgIDxhIGNsYXNzPSJuYXZiYXItaXRlbSIgaHJlZj0iL21lbWJlcnMjRUVNQ1MiPiBFRU1DUyA8L2E+CiAgICAgICAgICAgICAgPGEgY2xhc3M9Im5hdmJhci1pdGVtIiBocmVmPSIvbWVtYmVycyNJREUiPiBJREUgPC9hPgogICAgICAgICAgICAgIDxhIGNsYXNzPSJuYXZiYXItaXRlbSIgaHJlZj0iL21lbWJlcnMjdGhyZWVtRSI+IDNtRSA8L2E+CiAgICAgICAgICAgICAgPGEgY2xhc3M9Im5hdmJhci1pdGVtIiBocmVmPSIvbWVtYmVycyNUUE0iPiBUUE0gPC9hPgogICAgICAgICAgICAgIDxhIGNsYXNzPSJuYXZiYXItaXRlbSIgaHJlZj0iL21lbWJlcnMjTkYiPiBOb24tRmFjdWx0eSA8L2E+CiAgICAgICAgICAgICAgPGEgY2xhc3M9Im5hdmJhci1pdGVtIiBocmVmPSIvbWVtYmVycyNBbHVtbmkiPiBBbHVtbmkgPC9hPgogICAgICAgICAgICA8L2Rpdj4KICAgICAgICAgIDwvZGl2PgoKICAgICAgICAgIDxhIGNsYXNzPSJuYXZiYXItaXRlbSIgaHJlZj0iL2dhbGxlcnkiPiBQaG90byBnYWxsZXJ5IDwvYT4KICAgICAgICAgIDxhIGNsYXNzPSJuYXZiYXItaXRlbSIgaHJlZj0iL3Bvc3RzIj4gU3RvcmllcyA8L2E+CgogICAgICAgICAgPGEgY2xhc3M9Im5hdmJhci1pdGVtIiBocmVmPSIvam9pbiI+IEpvaW4gdXMgPC9hPgogICAgICAgIDwvZGl2PgogICAgICA8L2Rpdj4KICAgIDwvZGl2PgogIDwvbmF2PgoKICAgICAgICAKCjxzZWN0aW9uIGNsYXNzPSJoZXJvICIgc3R5bGU9ImJhY2tncm91bmQtaW1hZ2U6IGxpbmVhci1ncmFkaWVudCh0byB0b3AsICMwMDAsIHRyYW5zcGFyZW50KSwgdXJsKGh0dHBzOi8vaW1hZ2VzLnVuc3BsYXNoLmNvbS9waG90by0xNDM1NTI3MTczMTI4LTk4M2I4NzIwMWY0ZD9peGxpYj1yYi0xLjIuMSZpeGlkPWV5SmhjSEJmYVdRaU9qRXlNRGQ5JmF1dG89Zm9ybWF0JmZpdD1jcm9wJnc9MTA0NyZxPTgwKTsiPgogIDxkaXYgY2xhc3M9Imhlcm8tYm9keSI+CiAgICA8aDEgY2xhc3M9InRpdGxlIGlzLTEiPkNvbW11bml0eSBldmVudHM8L2gxPgoKICAgIAoKICAgIAogIDwvZGl2PgoKICAKICA8ZGl2IGNsYXNzPSJoZXJvLWZvb3QiPgogICAgPHAgY2xhc3M9ImNyZWRpdCI+PGEgaHJlZj0iIj5FcmljIFJvdGhlcm1lbDwvYT4gKENDIEJZKTwvcD4KICA8L2Rpdj4KICAKPC9zZWN0aW9uPgoKPHNlY3Rpb24gY2xhc3M9InNlY3Rpb24iPgogIDxkaXYgY2xhc3M9ImNvbnRhaW5lciBjb250ZW50ICAiPgogICAgPHNlY3Rpb24+CiAgICA8ZGl2IGNsYXNzPSJhcnRpY2xlLXBvc3QiPgogICAgICAgIAo8cD5UaGlzIGlzIGEgbGlzdCBvZiBldmVudHMgb3JnYW5pc2VkIGJ5IG1lbWJlcnMgb2YgdGhlIE9wZW4gU2NpZW5jZSBDb21tdW5pdHkgRGVsZnQgdGhhdCBjYW4gYmUgb2YgaW50ZXJlc3QgdG8gb3RoZXIgbWVtYmVycy48L3A+Cgo8cD5JZiB5b3Ugd291bGQgbGlrZSB0byBhZGQgYW4gZXZlbnQgaGVyZSwgcGxlYXNlIDxhIGhyZWY9Imh0dHBzOi8vZ2l0aHViLmNvbS9vc2MtZGVsZnQvb3NjLWRlbGZ0LmdpdGh1Yi5pby9pc3N1ZXMvbmV3P2Fzc2lnbmVlcz0mYW1wO2xhYmVscz0mYW1wO3RlbXBsYXRlPWV2ZW50LWxpc3Rpbmctc3VibWlzc2lvbi10ZW1wbGF0ZS5tZCZhbXA7dGl0bGU9RXZlbnQiPnN1Ym1pdCBhbiBpc3N1ZSBvbiBHaXRIdWI8L2E+LjwvcD4KCjxoMSBpZD0iZmFpcmx5LXRvb2xzZXQtd29ya3Nob3AiPkZhaXJseSBUb29sc2V0IFdvcmtzaG9wPC9oMT4KCjxwPjxpIGNsYXNzPSJmYXMgZmEtY2FsZW5kYXItYWx0Ij48L2k+IDxzdHJvbmc+RGF0ZSAmYW1wOyB0aW1lPC9zdHJvbmc+OiBXZWRuZXNkYXksIE1hcmNoIDIyLCAxMzozMC0xNzozMCBDRVQ8L3A+Cgo8cD48aSBjbGFzcz0iZmFzIGZhLWNhbGVuZGFyLWFsdCI+PC9pPiA8c3Ryb25nPkxvY2F0aW9uPC9zdHJvbmc+OiBYIFRVIERlbGZ0IE1la2Vsd2VnIDggQnVpbGRpbmcgMzcgMjYyOCBDRCBEZWxmdDwvcD4KCjxwPkNvbXB1dGF0aW9uYWwgcmVzZWFyY2ggZW52aXJvbm1lbnRzIGZhY2lsaXRhdGUgcmVzZWFyY2ggZGF0YSBwcm9kdWN0aW9uIGJ5IHByb3ZpZGluZyB0aGUgbmVjZXNzYXJ5IHByb2Nlc3NpbmcgYW5kIGFuYWx5c2lzIHRvb2xzLiBUaGV5IGFyZSB3ZWxsIGNvbm5lY3RlZCB0byBzb21lIHJlc2VhcmNoIGluZnJhc3RydWN0dXJlLCBlLmcuIGNvZGUgcmVwb3NpdG9yaWVzLiBCdXQsIHRoZWlyIGludGVyb3BlcmFiaWxpdHkgd2l0aCByZXNlYXJjaCBkYXRhIHJlcG9zaXRvcmllcyBpcyB3ZWFrLCBhbmQgdGhlIHJlc2VhcmNoZXJzIG5lZWQgdG8gbWFudWFsbHkgdXBsb2FkIHRoZWlyIHJlc2VhcmNoIGRhdGEgdG8gdGhlIHJlcG9zaXRvcmllcywgbW9zdGx5IHRocm91Z2ggd2ViIGZvcm1zIGFuZCBpbnRlcmZhY2VzLgpUaGUgZmFpcmx5IHRvb2xzZXQgc2VhbWxlc3NseSBpbnRlZ3JhdGVzIHJlc2VhcmNoIGVudmlyb25tZW50cyBhbmQgZGF0YSByZXBvc2l0b3JpZXMsIGFuZCBhbGxvd3MgbG9jYWwgZGF0YSBhbmQgbWV0YWRhdGEgbWFuYWdlbWVudCwgcXVpY2sgZGF0YSBwdWJsaWNhdGlvbiwgdW5hdHRlbmRlZCBkYXRhIHVwbG9hZGluZywgc21hcnQgZGF0YXNldCBzeW5jaHJvbml6YXRpb24sIGFuZCBxdWljayBkYXRhc2V0IGNsb25pbmcuIFRoZSB0b29sc2V0IGluY2x1ZGVzIGEgUHl0aG9uIGxpYnJhcnkgcHJvdmlkaW5nIGEgc3RhbmRhcmQgQVBJIHRvIG1hbmFnZSBhbmQgcHVibGlzaCBkYXRhc2V0cyBvbiB2YXJpb3VzIGRhdGEgcmVwb3NpdG9yeSBwbGF0Zm9ybXMgKGUuZy4gWmVub2RvLCBGaWdzaGFyZSwgNFRVLlJlc2VhcmNoRGF0YSksIGEgY29tbWFuZCBsaW5lIHRvb2wgdGhhdCBlbmFibGVzIHJlc2VhcmNoIGRhdGEgbWFuYWdlbWVudCB3aXRob3V0IHByb2dyYW1taW5nIHNraWxscywgYW5kIGEgSnVweXRlckxhYiBleHRlbnNpb24gdG8gbWFuYWdlIGRhdGFzZXRzIHRocm91Z2ggYSBncmFwaGljYWwgdXNlciBpbnRlcmZhY2UuClRoZSB0b29sc2V0IGlzIHJlbGV2YW50IGZvciByZXNlYXJjaGVycyBhdCBhbGwgbGV2ZWxzLCBkYXRhIHN0ZXdhcmRzLCBSU0VzLCBkYXRhIG1hbmFnZXJzLCBhbmQgcHJhY3RpY2FsbHkgYW55b25lIHdobyBkZXZlbG9wcyBvciBtYW5hZ2VzIHJlc2VhcmNoIGRhdGEgYW5kIGRhdGEgcmVwb3NpdG9yaWVzLiBUaGUgbWFpbiB0YXJnZXQgZ3JvdXAgb2YgdGhpcyBldmVudCBpcyB0aGUgVFUgRGVsZnQgY29tbXVuaXR5LiBCdXQgdGhlIGludml0YXRpb24gaXMgb3BlbiB0byByZXNlYXJjaCBhbmQgc3VwcG9ydCBzdGFmZiAoc3BlY2lhbGx5IERDQyBzdGFmZikgZnJvbSBhbGwgdW5pdmVyc2l0aWVzIGluIFRoZSBOZXRoZXJsYW5kcy4gRHVyaW5nIHRoZSB3b3Jrc2hvcCwgd2Ugd2lsbCBwcmVzZW50IHRoZSB0b29sc2V0LCB0cmFpbiBwYXJ0aWNpcGFudHMgb24gaG93IHRvIHVzZSBpdCB0byBtYWtlIHJlc2VhcmNoIG91dHB1dHMgRkFJUiwgYW5kIGNvbGxlY3QgZmVlZGJhY2sgZm9yIGltcHJvdmVtZW50IGFuZCBmdXJ0aGVyIGRldmVsb3BtZW50LgpUaGlzIGV2ZW50IGlzIHNwb25zb3JlZCBieSB0aGUgT3BlbiBTY2llbmNlIFByb2dyYW0gYW5kIHRoZSBPcGVuIFNjaWVuY2UgQ29tbXVuaXR5IG9mIFRVIERlbGZ0IHZpYSB0aGUgTWFpbnN0cmVhbWluZyBPcGVuIFNjaWVuY2UgRnVuZC4gQW5kIGNvLW9yZ2FuaXplZCBieSBUVSBEZWxmdCBEaWdpdGFsIENvbXBldGVuY2UgQ2VudHJlICZhbXA7IENlbnRlciBvZiBFeHBlcnRpc2UgaW4gQmlnIEdlb2RhdGEgU2NpZW5jZSwgVW5pdmVyc2l0eSBvZiBUd2VudGUuPC9wPgoKPHA+PHN0cm9uZz5GaW5kIG91dCBtb3JlPC9zdHJvbmc+IGF0IDxhIGhyZWY9Imh0dHBzOi8vd3d3LmV2ZW50YnJpdGUubmwvZS9mYWlybHktdG9vbHNldC13b3Jrc2hvcC10aWNrZXRzLTU0OTQyNTk3Njg4NyIgdGFyZ2V0PSJfYmxhbmsiPnRoZSBldmVudCB3ZWJzaXRlPC9hPjwvcD4KCjxoMSBpZD0ib3Blbi1zY2llbmNlLWNvbW11bml0eS1kZWxmdC1uZXR3b3JraW5nLWV2ZW50Ij5PcGVuIFNjaWVuY2UgQ29tbXVuaXR5IERlbGZ0IE5ldHdvcmtpbmcgRXZlbnQ8L2gxPgoKPHA+PGkgY2xhc3M9ImZhcyBmYS1jYWxlbmRhci1hbHQiPjwvaT4gPHN0cm9uZz5EYXRlICZhbXA7IHRpbWU8L3N0cm9uZz46IFRodXJzZGF5LCBTZXB0ZW1iZXIgMjIsIDE1OjAwLTE4OjAwIENFVDwvcD4KCjxwPjxpIGNsYXNzPSJmYXMgZmEtY2FsZW5kYXItYWx0Ij48L2k+IDxzdHJvbmc+TG9jYXRpb248L3N0cm9uZz46IEF1bGEgQ29uZ3Jlc3MgQ2VudGVyLCBmb3llciAoMXN0IGZsb29yKSAtIFNlbmQgYW4gZW1haWwgdG8gdC55LnlhbmtlbGV2aWNoQHR1ZGVsZnQubmwgdG8gcmVnaXN0ZXI8L3A+Cgo8cD5PdXIgY29tbXVuaXR5IGhhcyBzbyBtYW55IHBhc3Npb25hdGUgYW5kIGluc3BpcmluZyBPcGVuIFNjaWVuY2UgYWR2b2NhdGVzLCBpdOKAmXMgdGltZSB3ZSBtZWV0IGVhY2ggb3RoZXIsIGxlYXJuIGFib3V0IGVhY2ggb3RoZXLigJlzIHdvcmsgYW5kIGhhdmUgZnVuLiBUaGUgcHJvZ3JhbW1lIHJldm9sdmVzIGFyb3VuZCBjb21tdW5pdHkgbWVtYmVycyB3aXRoIHBpdGNoZXMgb2YgbmV3IGlkZWFzIGFzIHdlbGwgYXMgdGhlbWF0aWMgdGFibGVzIHRvIGRpc2N1c3MgZGlmZmVyZW50IHRvcGljcyBvbiBPcGVuIFNjaWVuY2UuIEFuZCBhbGwgdGhpcyB3aGlsZSBlbmpveWluZyBmb29kIGFuZCBkcmlua3MuClRvIHJlY2VpdmUgdGhlIGludml0ZSwgY29udGFjdCBUYW55YSwgdGhlIENvbW11bml0eSBDb29yZGluYXRvciAodC55LnlhbmtlbGV2aWNoQHR1ZGVsZnQubmwpIHNvIHlvdSBjYW4gY29ubmVjdCB3aXRoIG90aGVyIGNvbW11bml0eSBtZW1iZXJzLiBZb3UgY2FuIGFsc28gbGV0IGhlciBrbm93IGlmIHlvdeKAmWQgbGlrZSB0byBoYXZlIGEgc3BhY2UgdG8gc2hhcmUgeW91ciBpbml0aWF0aXZlIHdpdGggdGhlIGNvbW11bml0eS4KVGhlIGV2ZW50IGlzIGRlc2lnbmVkIGZvciBPU0NEZWxmdCBjb21tdW5pdHkgbWVtYmVycy4gTm90IGEgbWVtYmVyIHlldCwgYnV0IHdvdWxkIGxpa2UgdG8gam9pbj8gRmlsbCBvdXQgdGhlIHNpZ24tdXAgc2hlZXQgaGVyZTogaHR0cHM6Ly9vc2MtZGVsZnQuZ2l0aHViLmlvL2pvaW48L3A+Cgo8aDEgaWQ9Im9wZW5pbmctdXAtYS1mbG93LWJhdHRlcnktYnktc2FubGktZmFleiI+T3BlbmluZyB1cCBhIEZsb3cgYmF0dGVyeSBieSBTYW5saSBGYWV6PC9oMT4KCjxwPjxpIGNsYXNzPSJmYXMgZmEtY2FsZW5kYXItYWx0Ij48L2k+IDxzdHJvbmc+RGF0ZSAmYW1wOyB0aW1lPC9zdHJvbmc+OiBGcmlkYXksIEphbnVhcnkgMjEsIDE0OjMwLTE2OjMwIENFVDwvcD4KCjxwPjxpIGNsYXNzPSJmYXMgZmEtY2FsZW5kYXItYWx0Ij48L2k+IDxzdHJvbmc+TG9jYXRpb248L3N0cm9uZz46IE9ubGluZSAtIDxhIGhyZWY9Imh0dHBzOi8vd3d3LmV2ZW50YnJpdGUuY28udWsvZS90aWNrZXRzLXNlbWluYXItb3BlbmluZy11cC1hLWZsb3ctYmF0dGVyeS1ieS1zYW5saS1mYWV6LTIyODMxMDAyMDQxNyI+UmVnaXN0ZXIgaGVyZTwvYT48L3A+Cgo8cD5JbiB0aGUgZnV0dXJlIHdlIHdpbGwgdXNlIG1vcmUgZW5lcmd5LCBiYXR0ZXJpZXMgd2lsbCBwbGF5IGEgY3JpdGljYWwgcm9sZSBpbiBwcm92aWRpbmcgdXMgd2l0aCB0aGlzIGVuZXJneS4gU2FubGkgRmFleiBpcyB3b3JraW5nIG9uIGFuIE9wZW4gU291cmNlIEZsb3cgQmF0dGVyeSB0byBkZW1vY3JhdGlzZSB0aGUgZW5lcmd5IG1hcmtldC4gU2FubGkgRmVheiBpcyBhbiBhc3Npc3RhbnQgcHJvZmVzc29yIGF0IHRoZSBVbml2ZXJzaXR5IFV0cmVjaHQgYW5kIGEgYmlnIHN1cHBvcnRlciBvZiBvcGVuIHNjaWVuY2UgYW5kIG9wZW4tc291cmNlIHdvcmtpbmcuIEhlIHJlY29yZHMgcmVndWxhciBwb2RjYXN0cyBhbmQgaXMgbm93IG9uZSBvZiB0aGUgcGlvbmVlcnMgb2YgbWFraW5nIG9wZW4gZmxvdyBiYXR0ZXJpZXMhPC9wPgoKPHA+V2Ugd2lsbCBhbHNvIGJlIHNob3djYXNpbmcgc29tZSBvZiB0aGUgcHJvamVjdHMgdGhhdCBoYXZlIGJlZW4gYnVpbHQgYnkgdGhlIG1lbWJlcnMgb2YgdGhlIE9wZW4gSGFyZHdhcmUgY29tbXVuaXR5IGF0IERlbGZ0LCB0aGVzZSB3aWxsIGluY2x1ZGUgdGhlIE9wZW4gQ2VudHJpZnVnZSwgdGhlIEZ1bWUgc2Vuc29yLCBSYXNwYmVycnkgUGkgQ29tcHV0ZXIgQ2x1c3RlciBhbmQgdGhlIGF3YXJkIHdpbm5pbmcgUGxhc3RpYyBTY2FubmVyLiBKb2luIHVzIHRvIGtub3cgbW9yZSBhYm91dCB0aGVzZSBwcm9qZWN0cyBvciBvbiBob3cgdG8gd29yayBvbiB5b3VyIG93biE8L3A+Cgo8cD48c3Ryb25nPkZpbmQgb3V0IG1vcmU8L3N0cm9uZz4gYXQgPGEgaHJlZj0iaHR0cHM6Ly93d3cuZXZlbnRicml0ZS5jby51ay9lL3RpY2tldHMtc2VtaW5hci1vcGVuaW5nLXVwLWEtZmxvdy1iYXR0ZXJ5LWJ5LXNhbmxpLWZhZXotMjI4MzEwMDIwNDE3IiB0YXJnZXQ9Il9ibGFuayI+dGhlIGV2ZW50IHdlYnNpdGU8L2E+PC9wPgoKPGgxIGlkPSJtYWNoaW5lLWxlYXJuaW5nLWluLW1hdGVyaWFsLXNjaWVuY2VzIj5NYWNoaW5lIExlYXJuaW5nIGluIE1hdGVyaWFsIFNjaWVuY2VzPC9oMT4KCjxwPjxpIGNsYXNzPSJmYXMgZmEtY2FsZW5kYXItYWx0Ij48L2k+IDxzdHJvbmc+RGF0ZSAmYW1wOyB0aW1lPC9zdHJvbmc+OiBGZWIgMTAsIEZlYiAyMiBhbmQgTWFyY2ggMTA8L3A+Cgo8cD48aSBjbGFzcz0iZmFzIGZhLWNhbGVuZGFyLWFsdCI+PC9pPiA8c3Ryb25nPkxvY2F0aW9uPC9zdHJvbmc+OiBPbmxpbmUgLSA8YSBocmVmPSJodHRwczovL29zYy1kZWxmdC5naXRodWIuaW8vcG9zdHMvMjAyMi8wMS8xMi93b3Jrc2hvcC1zZXJpZXMtb24tbWFjaGluZS1sZWFybmluZy1hcHBsaWNhdGlvbnMtaW4tbWF0ZXJpYWwtc2NpZW5jZS8iPlJlZ2lzdHJhdGlvbiBsaW5rcyBmcm9tIHRoaXMgcGFnZTwvYT48L3A+Cgo8cD5UaGlzIHdvcmtzaG9wIHNlcmllcyBhaW1zIHRvIGludHJvZHVjZSBlYXJseSBjYXJlZXIgcmVzZWFyY2hlcnMgaW4gbWF0ZXJpYWxzIHNjaWVuY2UgdG8gZnVuZGFtZW50YWwgbWFjaGluZSBsZWFybmluZyBjb25jZXB0cywgYXMgd2VsbCBhcyB0b29scyBhbmQgdGVjaG5pcXVlcyBmb3IgYXBwbHlpbmcgbWFjaGluZSBsZWFybmluZyBhcHByb2FjaGVzIHRvIHRoZWlyIHdvcmssIGluY2x1ZGluZyBoYW5kbGluZyBhbmQgc2hhcmluZyBkYXRhIGFzIHdlbGwgYXMgYnVpbGRpbmcgbWFjaGluZSBsZWFybmluZyBhbmQgZGVlcCBsZWFybmluZyBtb2RlbHMuPC9wPgoKPHA+PHN0cm9uZz5GaW5kIG91dCBtb3JlPC9zdHJvbmc+IGF0IDxhIGhyZWY9Imh0dHBzOi8vb3NjLWRlbGZ0LmdpdGh1Yi5pby9wb3N0cy8yMDIyLzAxLzEyL3dvcmtzaG9wLXNlcmllcy1vbi1tYWNoaW5lLWxlYXJuaW5nLWFwcGxpY2F0aW9ucy1pbi1tYXRlcmlhbC1zY2llbmNlLyIgdGFyZ2V0PSJfYmxhbmsiPnRoZSBldmVudCB3ZWJzaXRlPC9hPjwvcD4KCgogICAgPC9kaXY+ICAgIAo8L3NlY3Rpb24+CiAgPC9kaXY+Cjwvc2VjdGlvbj4KCiAgICAgICAgPGZvb3RlciBjbGFzcz0iZm9vdGVyIj4KICAgIDxkaXYgY2xhc3M9ImNvbnRhaW5lciBjb250ZW50Ij4KICAgICAgPGRpdiBjbGFzcz0iY29sdW1ucyI+CiAgICAgICAgPGRpdiBjbGFzcz0iY29sdW1uIj4KICAgICAgICAgIDxoMz5HZXQgaW4gdG91Y2g8L2gzPgogICAgICAgICAgPHA+CiAgICAgICAgICAgIDxpIGNsYXNzPSJmYWIgZmEtdHdpdHRlciI+PC9pPgogICAgICAgICAgICA8YSBocmVmPSJodHRwczovL3R3aXR0ZXIuY29tL09TQ0RlbGZ0Ij4KICAgICAgICAgICAgICBPU0NEZWxmdAogICAgICAgICAgICA8L2E+CiAgICAgICAgICA8L3A+CiAgICAgICAgICA8cD4KICAgICAgICAgICAgPGkgY2xhc3M9ImZhYiBmYS1naXRodWIiPjwvaT4KICAgICAgICAgICAgPGEgaHJlZj0iaHR0cHM6Ly9naXRodWIuY29tL29zYy1kZWxmdC9vc2MtZGVsZnQuZ2l0aHViLmlvIj4KICAgICAgICAgICAgICBvc2MtZGVsZnQKICAgICAgICAgICAgPC9hPgogICAgICAgICAgPC9wPgogICAgICAgIDwvZGl2PgogICAgICAgIDxkaXYgY2xhc3M9ImNvbHVtbiBpcy01Ij4KICAgICAgICAgIDxwPgogICAgICAgICAgICBJZiB5b3Ugd2FudCB0byByZXBvcnQgYSBwcm9ibGVtIG9yIHN1Z2dlc3QgYW4gZW5oYW5jZW1lbnQgd2UnZCBsb3ZlCiAgICAgICAgICAgIGZvciB5b3UgdG8KICAgICAgICAgICAgPGEgaHJlZj0iaHR0cHM6Ly9naXRodWIuY29tL29zYy1kZWxmdC9vc2MtZGVsZnQuZ2l0aHViLmlvL2lzc3Vlcy9uZXciPm9wZW4gYW4gaXNzdWU8L2E+CiAgICAgICAgICAgIG9uIG91ciBHaXRIdWIgcmVwb3NpdG9yeSBiZWNhdXNlIHRoZW4gd2UgY2FuIGdldCByaWdodCBvbiBpdC4KICAgICAgICAgIDwvcD4KICAgICAgICAgIDxwPiBUaGlzIHdlYnNpdGUgaXMgY3JlYXRlZCBiYXNlZCBvbiB0aGUgPGEgaHJlZj0iaHR0cHM6Ly9vcGVubGlmZXNjaS5vcmciPk9wZW4gbGlmZSBTY2llbmNlIChPTFMpIHByb3JhbW1lIHdlYnNpdGU8L2E+LAogICAgICAgICAgICBjcmVhdGVkIGJ5IHRoZSBhd2Vzb21lIE9MUyB0ZWFtLgogICAgICAgICAgPC9wPgogICAgICAgICAgPHA+CiAgICAgICAgICAgIFRoZSB3ZWJzaXRlIGNvbnRlbnQgaXMgbGljZW5zZWQKICAgICAgICAgICAgPGEgaHJlZj0iaHR0cDovL2NyZWF0aXZlY29tbW9ucy5vcmcvbGljZW5zZXMvYnktc2EvNC4wLyI+Q0MgQlkgU0EgNC4wPC9hPi4KICAgICAgICAgIDwvcD4KICAgICAgICA8L2Rpdj4KICAgICAgPC9kaXY+CiAgICA8L2Rpdj4KICA8L2Zvb3Rlcj4KCgogICAgICAgIDxzY3JpcHQgZGVmZXIgc3JjPSJodHRwczovL3VzZS5mb250YXdlc29tZS5jb20vcmVsZWFzZXMvdjUuMy4xL2pzL2FsbC5qcyI+PC9zY3JpcHQ+CiAgICAgICAgPHNjcmlwdCBzcmM9Ii9qcy9zY3JpcHRzLmpzIj48L3NjcmlwdD4KICAgICAgICA8c2NyaXB0IGFzeW5jIHNyYz0iaHR0cHM6Ly9wbGF0Zm9ybS50d2l0dGVyLmNvbS93aWRnZXRzLmpzIiBjaGFyc2V0PSJ1dGYtOCI+PC9zY3JpcHQ+IAogICAgPC9ib2R5Pgo8L2h0bWw+Cg== + recorded_at: Thu, 03 Jan 2019 08:00:00 GMT +recorded_with: VCR 6.2.0 From b2984bc7bf58862db4a092b58640d3c70bf5b149 Mon Sep 17 00:00:00 2001 From: Mike Sanders Date: Tue, 22 Oct 2024 12:23:15 +0200 Subject: [PATCH 2/3] use input url to be able to use it for more --- lib/ingestors/oscd_ingestor.rb | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/ingestors/oscd_ingestor.rb b/lib/ingestors/oscd_ingestor.rb index faab3c2a4..ba9c1b48a 100644 --- a/lib/ingestors/oscd_ingestor.rb +++ b/lib/ingestors/oscd_ingestor.rb @@ -25,8 +25,9 @@ def read(url) private - def process_oscd(_url) - url = 'https://osc-delft.github.io/events' + def process_oscd(url) + # url = 'https://osc-delft.github.io/events' + # url = 'https://osceindhoven.github.io/events' event_page = Nokogiri::HTML5.parse(open_url(url.to_s, raise: true)).css('.article-post').children first_event = true @@ -68,15 +69,15 @@ def process_oscd(_url) def oscd_fix_time(date_str) date_str.split(',').each do |str| str.strip.split(' ').each_cons(2) do |el1, el2| - if is_month?(el1) && el2.to_i.positive? - event_start = Time.zone.parse([el1, el2].join(' ')) - event_end = Time.zone.parse([el1, el2].join(' ')) - if event_start < (Time.zone.now - 2.weeks) - event_start = event_start.change(year: event_start.year + 1) - event_end = event_end.change(year: event_start.year + 1) - end - return event_start, event_end + next unless is_month?(el1) && el2.to_i.positive? + + event_start = Time.zone.parse([el1, el2].join(' ')) + event_end = Time.zone.parse([el1, el2].join(' ')) + if event_start < (Time.zone.now - 2.weeks) + event_start = event_start.change(year: event_start.year + 1) + event_end = event_end.change(year: event_start.year + 1) end + return event_start, event_end end end end From 1ffd96f2032457b17f39c01512147f1082cc52a5 Mon Sep 17 00:00:00 2001 From: Mike Sanders Date: Tue, 22 Oct 2024 13:49:34 +0200 Subject: [PATCH 3/3] move extra funcs to class --- lib/ingestors/oscd_ingestor.rb | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/lib/ingestors/oscd_ingestor.rb b/lib/ingestors/oscd_ingestor.rb index ba9c1b48a..21ec22eed 100644 --- a/lib/ingestors/oscd_ingestor.rb +++ b/lib/ingestors/oscd_ingestor.rb @@ -63,26 +63,26 @@ def process_oscd(url) @messages << "Extract event fields failed with: #{e.message}" end end - end -end -def oscd_fix_time(date_str) - date_str.split(',').each do |str| - str.strip.split(' ').each_cons(2) do |el1, el2| - next unless is_month?(el1) && el2.to_i.positive? + def oscd_fix_time(date_str) + date_str.split(',').each do |str| + str.strip.split(' ').each_cons(2) do |el1, el2| + next unless is_month?(el1) && el2.to_i.positive? - event_start = Time.zone.parse([el1, el2].join(' ')) - event_end = Time.zone.parse([el1, el2].join(' ')) - if event_start < (Time.zone.now - 2.weeks) - event_start = event_start.change(year: event_start.year + 1) - event_end = event_end.change(year: event_start.year + 1) + event_start = Time.zone.parse([el1, el2].join(' ')) + event_end = Time.zone.parse([el1, el2].join(' ')) + if event_start < (Time.zone.now - 2.weeks) + event_start = event_start.change(year: event_start.year + 1) + event_end = event_end.change(year: event_start.year + 1) + end + return event_start, event_end + end end - return event_start, event_end end - end -end -def is_month?(str) - formatted_str = str.strip.capitalize - Date::MONTHNAMES.include?(formatted_str) || Date::ABBR_MONTHNAMES.include?(formatted_str) + def is_month?(str) + formatted_str = str.strip.capitalize + Date::MONTHNAMES.include?(formatted_str) || Date::ABBR_MONTHNAMES.include?(formatted_str) + end + end end