From 06e5cc397ab21cd7e28744e793dac5d78a9e6129 Mon Sep 17 00:00:00 2001 From: Mike Sanders Date: Mon, 21 Oct 2024 18:08:30 +0200 Subject: [PATCH 1/3] scraper for oscd --- lib/ingestors/oscd_ingestor.rb | 87 +++++++++++++++++++++++ test/unit/ingestors/oscd_ingestor_test.rb | 60 ++++++++++++++++ test/vcr_cassettes/ingestors/oscd.yml | 72 +++++++++++++++++++ 3 files changed, 219 insertions(+) create mode 100644 lib/ingestors/oscd_ingestor.rb create mode 100644 test/unit/ingestors/oscd_ingestor_test.rb create mode 100644 test/vcr_cassettes/ingestors/oscd.yml diff --git a/lib/ingestors/oscd_ingestor.rb b/lib/ingestors/oscd_ingestor.rb new file mode 100644 index 000000000..faab3c2a4 --- /dev/null +++ b/lib/ingestors/oscd_ingestor.rb @@ -0,0 +1,87 @@ +require 'open-uri' +require 'csv' +require 'nokogiri' + +module Ingestors + class OscdIngestor < Ingestor + def self.config + { + key: 'oscd_event', + title: 'OSCD Events API', + category: :events + } + end + + def read(url) + begin + process_oscd(url) + rescue Exception => e + @messages << "#{self.class.name} failed with: #{e.message}" + end + + # finished + nil + end + + private + + def process_oscd(_url) + url = 'https://osc-delft.github.io/events' + + event_page = Nokogiri::HTML5.parse(open_url(url.to_s, raise: true)).css('.article-post').children + first_event = true + event = nil + event_page.each do |div| + if div.name == 'h1' + first_event = false + event = OpenStruct.new + event.title = div.text + event.url = "#{url}##{event.title.downcase.gsub(' ', '_')}" + event.description = '' + event.source = 'OSCD' + event.timezone = 'Amsterdam' + end + + next if first_event || div.name == 'text' + + if div.name == 'p' + if div.text.strip.start_with?('Date & time:') + date_str = div.text.remove('Date & time:').strip + event.start, event.end = oscd_fix_time(date_str) + elsif div.text.strip.start_with?('Location:') + event.venue = div.text.remove('Location:').strip + else + event.description = [event.description, div.text.strip].join(' ') + end + if div&.next_sibling&.next_sibling.nil? || (div&.next_sibling&.next_sibling&.name == 'h1') + event.set_default_times + add_event(event) + end + end + rescue Exception => e + @messages << "Extract event fields failed with: #{e.message}" + end + end + end +end + +def oscd_fix_time(date_str) + date_str.split(',').each do |str| + str.strip.split(' ').each_cons(2) do |el1, el2| + if is_month?(el1) && el2.to_i.positive? + event_start = Time.zone.parse([el1, el2].join(' ')) + event_end = Time.zone.parse([el1, el2].join(' ')) + if event_start < (Time.zone.now - 2.weeks) + event_start = event_start.change(year: event_start.year + 1) + event_end = event_end.change(year: event_start.year + 1) + end + return event_start, event_end + end + end + end +end + +def is_month?(str) + formatted_str = str.strip.capitalize + Date::MONTHNAMES.include?(formatted_str) || Date::ABBR_MONTHNAMES.include?(formatted_str) +end diff --git a/test/unit/ingestors/oscd_ingestor_test.rb b/test/unit/ingestors/oscd_ingestor_test.rb new file mode 100644 index 000000000..8b0aebf4b --- /dev/null +++ b/test/unit/ingestors/oscd_ingestor_test.rb @@ -0,0 +1,60 @@ +require 'test_helper' + +class OscdIngestorTest < ActiveSupport::TestCase + setup do + @user = users(:regular_user) + @content_provider = content_providers(:another_portal_provider) + mock_ingestions + mock_timezone # System time zone should not affect test result + end + + teardown do + reset_timezone + end + + test 'can ingest events from oscd' do + source = @content_provider.sources.build( + url: 'https://osc-delft.github.io/events', + method: 'oscd', + enabled: true + ) + + ingestor = Ingestors::OscdIngestor.new + + # check event doesn't + new_title = 'Opening up a Flow battery by Sanli Faez' + new_url = 'https://osc-delft.github.io/events#opening_up_a_flow_battery_by_sanli_faez' + refute Event.where(title: new_title, url: new_url).any? + + # run task + assert_difference 'Event.count', 4 do + freeze_time(2019) do + VCR.use_cassette('ingestors/oscd') do + ingestor.read(source.url) + ingestor.write(@user, @content_provider) + end + end + end + + assert_equal 4, ingestor.events.count + assert ingestor.materials.empty? + assert_equal 4, ingestor.stats[:events][:added] + assert_equal 0, ingestor.stats[:events][:updated] + assert_equal 0, ingestor.stats[:events][:rejected] + + + # check event does exist + event = Event.where(title: new_title, url: new_url).first + assert event + assert_equal new_title, event.title + assert_equal new_url, event.url + + # check other fields + assert_equal 'OSCD', event.source + assert_equal 'Amsterdam', event.timezone + assert_equal Time.zone.parse('Tue, 21 Jan 2019 09:00:00.000000000 UTC +00:00'), event.start + assert_equal Time.zone.parse('Tue, 21 Jan 2019 17:00:00.000000000 UTC +00:00'), event.end + assert_equal 'Online - Register here', event.venue + assert event.online? + end +end diff --git a/test/vcr_cassettes/ingestors/oscd.yml b/test/vcr_cassettes/ingestors/oscd.yml new file mode 100644 index 000000000..15259ee2e --- /dev/null +++ b/test/vcr_cassettes/ingestors/oscd.yml @@ -0,0 +1,72 @@ +--- +http_interactions: +- request: + method: get + uri: https://osc-delft.github.io/events + body: + encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Connection: + - keep-alive + Content-Length: + - '12145' + Server: + - GitHub.com + Content-Type: + - text/html; charset=utf-8 + Permissions-Policy: + - interest-cohort=() + Last-Modified: + - Fri, 09 Jun 2023 11:08:05 GMT + Access-Control-Allow-Origin: + - "*" + Strict-Transport-Security: + - max-age=31556952 + Etag: + - W/"64830815-2f71" + Expires: + - Mon, 21 Oct 2024 14:35:24 GMT + Cache-Control: + - max-age=600 + X-Proxy-Cache: + - MISS + X-Github-Request-Id: + - FB2D:0DB5:44C9AE3:46C3AEA:67166454 + Accept-Ranges: + - bytes + Age: + - '0' + Date: + - Mon, 21 Oct 2024 15:06:48 GMT + Via: + - 1.1 varnish + X-Served-By: + - cache-ams2100084-AMS + X-Cache: + - HIT + X-Cache-Hits: + - '0' + X-Timer: + - S1729523208.413896,VS0,VE97 + Vary: + - Accept-Encoding + X-Fastly-Request-Id: + - e784ffe238a0a9d1f10db12b038ed739fc67bfe9 + body: + encoding: ASCII-8BIT + string: !binary |- + <!DOCTYPE html>
<html lang="en">
    <head>
        <meta charset="utf-8">
        <meta http-equiv="x-ua-compatible" content="ie=edge">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <title>Community events | Open Science Community Delft</title>
        
        <link href="/css/custom.css" rel="stylesheet" type="text/css">
        <link rel="stylesheet" href="https://cdn.rawgit.com/jpswalsh/academicons/master/css/academicons.min.css">
    </head>

    <body>
        <nav class="navbar is-fixed-top" role="navigation" aria-label="main navigation">
    <div class="container">
      <div class="navbar-brand">

        <a class="navbar-item" href="/">
            <img src="/images/OSCDLogo.png" alt="Open Science Community Delft">
        </a>

        <a
          role="button"
          class="navbar-burger burger"
          aria-label="menu"
          aria-expanded="false"
          data-target="top-menu"
        >
          <span aria-hidden="true"></span> <span aria-hidden="true"></span>
          <span aria-hidden="true"></span>
        </a>
      </div>

      <div id="top-menu" class="navbar-menu">
        <div class="navbar-end">

          <div class="navbar-item has-dropdown is-hoverable">
            <a class="navbar-link" href="/index">The community</a>
            <div class="navbar-dropdown">
              <a class="navbar-item" href="/index#who-can-join-the-open-science-community-delft"> Who can join </a>
              <a class="navbar-item" href="/index#what-are-the-benefits"> Benefits </a>
              <a class="navbar-item" href="/index#get-involved"> Get involved </a>
              <a class="navbar-item" href="/index#code-of-conduct"> Code of Conduct </a>
            </div>
          </div>

          <div class="navbar-item has-dropdown is-hoverable">
            <a class="navbar-link" href="/initiatives">Initiatives</a>
            <div class="navbar-dropdown">
              <a class="navbar-item" href="/initiatives#data-champions"> Data Champions </a>
              <a class="navbar-item" href="/initiatives#delft-open-hardware"> Delft Open Hardware </a>
              <a class="navbar-item" href="/initiatives#citizen-science"> Citizen Science </a>
              <a class="navbar-item" href="/initiatives#urbanism"> Rbanism </a>
            </div>
          </div>
            
        <div class="navbar-item has-dropdown is-hoverable">
            <a class="navbar-link" href="/funding"> Funding </a>
            <div class="navbar-dropdown">
              <a class="navbar-item" href="/funding#mainstreaming-open-science-fund"> Mainstreaming Open Science Fund </a>
            </div>
          </div>

          <a class="navbar-item" href="/events"> Events </a>

          <div class="navbar-item has-dropdown is-hoverable">
            <a class="navbar-link" href="/members">Members</a>
            <div class="navbar-dropdown">
              <a class="navbar-item" href="/members#ABE"> ABE </a>
              <a class="navbar-item" href="/members#AE"> AE </a>
              <a class="navbar-item" href="/members#AS"> AS </a>
              <a class="navbar-item" href="/members#CEG"> CEG </a>
              <a class="navbar-item" href="/members#EEMCS"> EEMCS </a>
              <a class="navbar-item" href="/members#IDE"> IDE </a>
              <a class="navbar-item" href="/members#threemE"> 3mE </a>
              <a class="navbar-item" href="/members#TPM"> TPM </a>
              <a class="navbar-item" href="/members#NF"> Non-Faculty </a>
              <a class="navbar-item" href="/members#Alumni"> Alumni </a>
            </div>
          </div>

          <a class="navbar-item" href="/gallery"> Photo gallery </a>
          <a class="navbar-item" href="/posts"> Stories </a>

          <a class="navbar-item" href="/join"> Join us </a>
        </div>
      </div>
    </div>
  </nav>

        

<section class="hero " style="background-image: linear-gradient(to top, #000, transparent), url(https://images.unsplash.com/photo-1435527173128-983b87201f4d?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=1047&q=80);">
  <div class="hero-body">
    <h1 class="title is-1">Community events</h1>

    

    
  </div>

  
  <div class="hero-foot">
    <p class="credit"><a href="">Eric Rothermel</a> (CC BY)</p>
  </div>
  
</section>

<section class="section">
  <div class="container content  ">
    <section>
    <div class="article-post">
        
<p>This is a list of events organised by members of the Open Science Community Delft that can be of interest to other members.</p>

<p>If you would like to add an event here, please <a href="https://github.com/osc-delft/osc-delft.github.io/issues/new?assignees=&amp;labels=&amp;template=event-listing-submission-template.md&amp;title=Event">submit an issue on GitHub</a>.</p>

<h1 id="fairly-toolset-workshop">Fairly Toolset Workshop</h1>

<p><i class="fas fa-calendar-alt"></i> <strong>Date &amp; time</strong>: Wednesday, March 22, 13:30-17:30 CET</p>

<p><i class="fas fa-calendar-alt"></i> <strong>Location</strong>: X TU Delft Mekelweg 8 Building 37 2628 CD Delft</p>

<p>Computational research environments facilitate research data production by providing the necessary processing and analysis tools. They are well connected to some research infrastructure, e.g. code repositories. But, their interoperability with research data repositories is weak, and the researchers need to manually upload their research data to the repositories, mostly through web forms and interfaces.
The fairly toolset seamlessly integrates research environments and data repositories, and allows local data and metadata management, quick data publication, unattended data uploading, smart dataset synchronization, and quick dataset cloning. The toolset includes a Python library providing a standard API to manage and publish datasets on various data repository platforms (e.g. Zenodo, Figshare, 4TU.ResearchData), a command line tool that enables research data management without programming skills, and a JupyterLab extension to manage datasets through a graphical user interface.
The toolset is relevant for researchers at all levels, data stewards, RSEs, data managers, and practically anyone who develops or manages research data and data repositories. The main target group of this event is the TU Delft community. But the invitation is open to research and support staff (specially DCC staff) from all universities in The Netherlands. During the workshop, we will present the toolset, train participants on how to use it to make research outputs FAIR, and collect feedback for improvement and further development.
This event is sponsored by the Open Science Program and the Open Science Community of TU Delft via the Mainstreaming Open Science Fund. And co-organized by TU Delft Digital Competence Centre &amp; Center of Expertise in Big Geodata Science, University of Twente.</p>

<p><strong>Find out more</strong> at <a href="https://www.eventbrite.nl/e/fairly-toolset-workshop-tickets-549425976887" target="_blank">the event website</a></p>

<h1 id="open-science-community-delft-networking-event">Open Science Community Delft Networking Event</h1>

<p><i class="fas fa-calendar-alt"></i> <strong>Date &amp; time</strong>: Thursday, September 22, 15:00-18:00 CET</p>

<p><i class="fas fa-calendar-alt"></i> <strong>Location</strong>: Aula Congress Center, foyer (1st floor) - Send an email to t.y.yankelevich@tudelft.nl to register</p>

<p>Our community has so many passionate and inspiring Open Science advocates, it’s time we meet each other, learn about each other’s work and have fun. The programme revolves around community members with pitches of new ideas as well as thematic tables to discuss different topics on Open Science. And all this while enjoying food and drinks.
To receive the invite, contact Tanya, the Community Coordinator (t.y.yankelevich@tudelft.nl) so you can connect with other community members. You can also let her know if you’d like to have a space to share your initiative with the community.
The event is designed for OSCDelft community members. Not a member yet, but would like to join? Fill out the sign-up sheet here: https://osc-delft.github.io/join</p>

<h1 id="opening-up-a-flow-battery-by-sanli-faez">Opening up a Flow battery by Sanli Faez</h1>

<p><i class="fas fa-calendar-alt"></i> <strong>Date &amp; time</strong>: Friday, January 21, 14:30-16:30 CET</p>

<p><i class="fas fa-calendar-alt"></i> <strong>Location</strong>: Online - <a href="https://www.eventbrite.co.uk/e/tickets-seminar-opening-up-a-flow-battery-by-sanli-faez-228310020417">Register here</a></p>

<p>In the future we will use more energy, batteries will play a critical role in providing us with this energy. Sanli Faez is working on an Open Source Flow Battery to democratise the energy market. Sanli Feaz is an assistant professor at the University Utrecht and a big supporter of open science and open-source working. He records regular podcasts and is now one of the pioneers of making open flow batteries!</p>

<p>We will also be showcasing some of the projects that have been built by the members of the Open Hardware community at Delft, these will include the Open Centrifuge, the Fume sensor, Raspberry Pi Computer Cluster and the award winning Plastic Scanner. Join us to know more about these projects or on how to work on your own!</p>

<p><strong>Find out more</strong> at <a href="https://www.eventbrite.co.uk/e/tickets-seminar-opening-up-a-flow-battery-by-sanli-faez-228310020417" target="_blank">the event website</a></p>

<h1 id="machine-learning-in-material-sciences">Machine Learning in Material Sciences</h1>

<p><i class="fas fa-calendar-alt"></i> <strong>Date &amp; time</strong>: Feb 10, Feb 22 and March 10</p>

<p><i class="fas fa-calendar-alt"></i> <strong>Location</strong>: Online - <a href="https://osc-delft.github.io/posts/2022/01/12/workshop-series-on-machine-learning-applications-in-material-science/">Registration links from this page</a></p>

<p>This workshop series aims to introduce early career researchers in materials science to fundamental machine learning concepts, as well as tools and techniques for applying machine learning approaches to their work, including handling and sharing data as well as building machine learning and deep learning models.</p>

<p><strong>Find out more</strong> at <a href="https://osc-delft.github.io/posts/2022/01/12/workshop-series-on-machine-learning-applications-in-material-science/" target="_blank">the event website</a></p>


    </div>    
</section>
  </div>
</section>

        <footer class="footer">
    <div class="container content">
      <div class="columns">
        <div class="column">
          <h3>Get in touch</h3>
          <p>
            <i class="fab fa-twitter"></i>
            <a href="https://twitter.com/OSCDelft">
              OSCDelft
            </a>
          </p>
          <p>
            <i class="fab fa-github"></i>
            <a href="https://github.com/osc-delft/osc-delft.github.io">
              osc-delft
            </a>
          </p>
        </div>
        <div class="column is-5">
          <p>
            If you want to report a problem or suggest an enhancement we'd love
            for you to
            <a href="https://github.com/osc-delft/osc-delft.github.io/issues/new">open an issue</a>
            on our GitHub repository because then we can get right on it.
          </p>
          <p> This website is created based on the <a href="https://openlifesci.org">Open life Science (OLS) proramme website</a>,
            created by the awesome OLS team.
          </p>
          <p>
            The website content is licensed
            <a href="http://creativecommons.org/licenses/by-sa/4.0/">CC BY SA 4.0</a>.
          </p>
        </div>
      </div>
    </div>
  </footer>


        <script defer src="https://use.fontawesome.com/releases/v5.3.1/js/all.js"></script>
        <script src="/js/scripts.js"></script>
        <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script> 
    </body>
</html>
 + recorded_at: Thu, 03 Jan 2019 08:00:00 GMT +recorded_with: VCR 6.2.0 From b2984bc7bf58862db4a092b58640d3c70bf5b149 Mon Sep 17 00:00:00 2001 From: Mike Sanders Date: Tue, 22 Oct 2024 12:23:15 +0200 Subject: [PATCH 2/3] use input url to be able to use it for more --- lib/ingestors/oscd_ingestor.rb | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/ingestors/oscd_ingestor.rb b/lib/ingestors/oscd_ingestor.rb index faab3c2a4..ba9c1b48a 100644 --- a/lib/ingestors/oscd_ingestor.rb +++ b/lib/ingestors/oscd_ingestor.rb @@ -25,8 +25,9 @@ def read(url) private - def process_oscd(_url) - url = 'https://osc-delft.github.io/events' + def process_oscd(url) + # url = 'https://osc-delft.github.io/events' + # url = 'https://osceindhoven.github.io/events' event_page = Nokogiri::HTML5.parse(open_url(url.to_s, raise: true)).css('.article-post').children first_event = true @@ -68,15 +69,15 @@ def process_oscd(_url) def oscd_fix_time(date_str) date_str.split(',').each do |str| str.strip.split(' ').each_cons(2) do |el1, el2| - if is_month?(el1) && el2.to_i.positive? - event_start = Time.zone.parse([el1, el2].join(' ')) - event_end = Time.zone.parse([el1, el2].join(' ')) - if event_start < (Time.zone.now - 2.weeks) - event_start = event_start.change(year: event_start.year + 1) - event_end = event_end.change(year: event_start.year + 1) - end - return event_start, event_end + next unless is_month?(el1) && el2.to_i.positive? + + event_start = Time.zone.parse([el1, el2].join(' ')) + event_end = Time.zone.parse([el1, el2].join(' ')) + if event_start < (Time.zone.now - 2.weeks) + event_start = event_start.change(year: event_start.year + 1) + event_end = event_end.change(year: event_start.year + 1) end + return event_start, event_end end end end From 1ffd96f2032457b17f39c01512147f1082cc52a5 Mon Sep 17 00:00:00 2001 From: Mike Sanders Date: Tue, 22 Oct 2024 13:49:34 +0200 Subject: [PATCH 3/3] move extra funcs to class --- lib/ingestors/oscd_ingestor.rb | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/lib/ingestors/oscd_ingestor.rb b/lib/ingestors/oscd_ingestor.rb index ba9c1b48a..21ec22eed 100644 --- a/lib/ingestors/oscd_ingestor.rb +++ b/lib/ingestors/oscd_ingestor.rb @@ -63,26 +63,26 @@ def process_oscd(url) @messages << "Extract event fields failed with: #{e.message}" end end - end -end -def oscd_fix_time(date_str) - date_str.split(',').each do |str| - str.strip.split(' ').each_cons(2) do |el1, el2| - next unless is_month?(el1) && el2.to_i.positive? + def oscd_fix_time(date_str) + date_str.split(',').each do |str| + str.strip.split(' ').each_cons(2) do |el1, el2| + next unless is_month?(el1) && el2.to_i.positive? - event_start = Time.zone.parse([el1, el2].join(' ')) - event_end = Time.zone.parse([el1, el2].join(' ')) - if event_start < (Time.zone.now - 2.weeks) - event_start = event_start.change(year: event_start.year + 1) - event_end = event_end.change(year: event_start.year + 1) + event_start = Time.zone.parse([el1, el2].join(' ')) + event_end = Time.zone.parse([el1, el2].join(' ')) + if event_start < (Time.zone.now - 2.weeks) + event_start = event_start.change(year: event_start.year + 1) + event_end = event_end.change(year: event_start.year + 1) + end + return event_start, event_end + end end - return event_start, event_end end - end -end -def is_month?(str) - formatted_str = str.strip.capitalize - Date::MONTHNAMES.include?(formatted_str) || Date::ABBR_MONTHNAMES.include?(formatted_str) + def is_month?(str) + formatted_str = str.strip.capitalize + Date::MONTHNAMES.include?(formatted_str) || Date::ABBR_MONTHNAMES.include?(formatted_str) + end + end end