Merge pull request #2879 from sciencehistory/show_footnotes_ohms_vtt

OHMS new VTT transcripts support footnotes
sciencehistory · Feb 10, 2025 · bd8f05a · bd8f05a
2 parents 62240b8 + c007a03
commit bd8f05a
Show file tree

Hide file tree

Showing 7 changed files with 275 additions and 26 deletions.
diff --git a/app/components/oral_history/vtt_transcript_component.html.erb b/app/components/oral_history/vtt_transcript_component.html.erb
@@ -21,3 +21,18 @@
     </p>
   <% end %>
 </div>
+
+<% if sanitized_footnotes.present? %>
+  <hr />
+
+  <div class="mx-1 my-2"><strong>NOTES</strong></div>
+
+  <div class="footnote-list mx-1 mb-5">
+    <% sanitized_footnotes.each_pair do |reference, footnote_text| %>
+        <%= render ::OralHistory::FootnoteComponent.new(footnote_reference: reference, footnote_text: footnote_text) %>
+    <% end %>
+  </div>
+<% end %>
+
+
+
diff --git a/app/components/oral_history/vtt_transcript_component.rb b/app/components/oral_history/vtt_transcript_component.rb
@@ -2,21 +2,82 @@ module OralHistory
   class VttTranscriptComponent < ApplicationComponent
     delegate :format_ohms_timestamp, to: :helpers
 
+    # Scrubber applied to main transcript paragraph text (see #scrub_text):
+    # permits basic inline formatting tags plus our intermediate <c> tag.
+    TextScrubber = Rails::Html::PermitScrubber.new.tap do |scrubber|
+      # 'c' is WebVTT 'class' object, which we only expect in the form
+      # of c.1, c.12 etc for OHMS annotation references.
+      scrubber.tags = ['i', 'b', 'u', 'c']
+      scrubber.attributes = ['cref'] # for our weird custom c tag; #scrub_text puts the footnote number here
+    end
+
+    # Scrubber applied to footnote text (see #scrub_footnote_text): the same
+    # inline formatting tags, plus <a> so footnotes can contain links.
+    # No attribute list is assigned here.
+    FootnoteTextScrubber = Rails::Html::PermitScrubber.new.tap do |scrubber|
+      scrubber.tags = ['i', 'b', 'u', 'a']
+    end
+
+    attr_reader :vtt_transcript
+
     def initialize(vtt_transcript)
       @vtt_transcript = vtt_transcript
     end
 
+    def sanitized_footnotes
+      @sanitized_footnotes ||= vtt_transcript.footnotes.collect { |ref, text| [ref, scrub_footnote_text(text)] }.to_h
+    end
+
+    # Replace the VTT <c.N> tags used by OHMS for annotation/footnote references
+    # with our footnote <a> tags.
+    #
+    # And html sanitize the rest
+    def scrub_text(raw_html)
+      # Turn <c.1> tags to XML-legal <c ref='1'> tags with the one in a ref attribute
+      str = raw_html.gsub(/<c\.(\d+)/, "<c cref='\\1'")
+
+      str = Loofah.fragment(str).
+        scrub!(TextScrubber).
+        to_s
+
+      # And now we need to turn those <c> tags into our footnote reference links!
+      # Note non-greedy regex match '+?' or '*?' operator so it gets first </c>. They can't be nested!
+      str.gsub!(/<c cref="(\d+)"[^>]*>(.+?)<\/c>/) do |_matched|
+        refNum = $1
+        inner_content = $2.html_safe
+
+        render(OralHistory::FootnoteReferenceComponent.new(
+          footnote_text: sanitized_footnotes[refNum],
+          footnote_is_html: true,
+          number: refNum,
+          link_content: inner_content
+        ))
+      end
+
+      # we have sanitized and replaced with a component, it should be html_safe
+      str.html_safe
+    end
+
+    # Footnote text, unlike our main text, can have links, but
+    # we want to ensure they all have rel=opener and target=_blank set
+    # (Which some built-in scrubbers in Loofah can do)
+    #
+    def scrub_footnote_text(raw_html)
+      str = Loofah.fragment(raw_html).
+        scrub!(FootnoteTextScrubber).
+        scrub!(:targetblank).
+        scrub!(:noopener).
+        to_s.html_safe
+    end
+
     def display_paragraphs
       last_speaker = nil # don't do same speaker twice in a row
       @vtt_transcript.cues.each do |cue|
         start_sec_f = cue.start_sec_f # we only want to do this once per cue
+                                      #
         cue.paragraphs.each do |paragraph|
-          yield start_sec_f, (paragraph.speaker_name if paragraph.speaker_name != last_speaker), paragraph.safe_html
+          paragraph_safe_html = scrub_text(paragraph.raw_html)
+
+          yield start_sec_f, (paragraph.speaker_name if paragraph.speaker_name != last_speaker), paragraph_safe_html
           last_speaker = paragraph.speaker_name
           start_sec_f = nil
         end
       end
     end
-
   end
 end
diff --git a/app/frontend/stylesheets/local/oh_audio.scss b/app/frontend/stylesheets/local/oh_audio.scss
@@ -311,6 +311,9 @@
   // URLs in footnotes should be forced to break if needed to fit on screen
   .footnote-list {
     overflow-wrap: break-word;
+    a {
+      @extend .text-link;
+    }
   }
 }
 
@@ -344,6 +347,12 @@
 
   .ohms-footnote-popover {
     box-shadow: 0 5px 15px rgba(0, 0, 0, 0.5);
+
+    a {
+      // Make it look like a link, not sure why it wasn't looking like any kind
+      // of link otherwise
+      @extend .text-link;
+    }
   }
 }
 

diff --git a/app/models/oral_history_content/ohms_xml/vtt_transcript.rb b/app/models/oral_history_content/ohms_xml/vtt_transcript.rb
@@ -7,7 +7,8 @@ class OhmsXml
     # new-style 2025 OHMS transcript with <vtt_transcript> element in xml.
     # https://www.w3.org/TR/webvtt1/
     #
-    # Also handles some OHMS quirks.
+    # Also handles some OHMS quirks, strips and formats OHMS-style citation
+    # footnotes, etc.  This really is OHMS-specific in the end.
     #
     # Uses the `webvtt` gem for initial parsing, but that gem is basic and
     # not very maintained, so we need some massaging and post-processing
@@ -43,6 +44,33 @@ def cues
         end
       end
 
+      # delivers extracted and indexed footnotes from OHMS WebVTT
+      # using OHMS own custom format standards for such.
+      #
+      # Warning: Text is NOT sanitized!
+      #
+      # @returns a hash where index are OHMS footnote numbers/indicators
+      def footnotes
+        @footnotes ||= begin
+          by_ref = {}
+
+          raw_webvtt_text =~ /ANNOTATIONS BEGIN(.*)ANNOTATIONS END/m
+          Nokogiri::XML.fragment($1 || "").xpath("annotation").each do |node|
+            next unless node['ref'].present?
+
+            by_ref[node['ref']] = node.inner_html
+          end
+
+          by_ref
+        end
+      end
+
+      # scrubbed, ordered, html_safe values for printing footnotes at bottom
+      def safe_footnote_values
+        safe_footnote_values ||= footnote_array
+      end
+
+
       # eg for indexing, actual human-readable indexable plain text after parsed and extracted webVTT
       def transcript_text
         @transcript_text ||= cues.collect { |c| c.paragraphs }.flatten.collect do |p|
@@ -97,7 +125,7 @@ def end_sec_f
 
         # split text inside a cue into paragraphs.
         #
-        # Paragraphs are split on newlines (WebVTT standard) -- also on <br><br> (two in a row br tag),
+        # Paragraphs are split on newlines (WebVTT standard) -- also on <br><br> (two+ in a row br tag),
         # which OHMS at least sometimes does.
         #
         # A change in WebVTT "voice" (speaker) will also result in a paragraph split, which
@@ -107,16 +135,17 @@ def paragraphs
             # This tricky regex splits just before each opening voice tag (positive
             # lookahead, so the tag stays with its text) and at each closing </v>, taking into
             # account that some text might not be in a voice tag, and that a voice tag does not have to be closed when it's the whole cue
-            (text || -"").split(/(?=\<v[ .])|(?<=\<\/v>)/).collect do |voice_span|
+            (text || -"").split(/(?=\<v[ .])|(?:\<\/v>)/).collect do |voice_span|
               # <v some name> or <v.class1.class2 some name>, in some cases ended with </v>
               if voice_span.gsub!(/\A\<v(?:\.[\w.]+)?\ ([^>]+)>/, '')
                 speaker_name = $1
               end
 
               # \R is any kind of linebreak
-              # Things coming from OHMS separate paragraphs by `<br><br>` instead
-              # sometimes annoyingly
-              voice_span.split(/\R|(?:\<br\>\<br\>)/).collect do |paragraph_text|
+              # Things coming from OHMS can separate paragraphs by `<br><br>`, annoyingly:
+              # Split paragraphs on two or more consecutive <br>
+              voice_span.split(/\R|(?:\<br\>){2,}/).collect do |paragraph_text|
+                paragraph_text.gsub!("</v>", "") # remove stray ending tags
                 Paragraph.new(speaker_name: speaker_name, raw_html: paragraph_text)
               end
             end.flatten
@@ -125,20 +154,12 @@ def paragraphs
       end
 
       class Paragraph
-        Scrubber = Rails::Html::PermitScrubber.new.tap do |scrubber|
-          scrubber.tags = ['i', 'b', 'u']
-        end
-
-        attr_reader :speaker_name, :safe_html, :raw_html
+        # named raw_html to make sure we don't forget to scrub!
+        attr_reader :speaker_name, :raw_html
 
         def initialize(speaker_name:, raw_html:)
-          @raw_html = raw_html
+          @raw_html = raw_html.strip
           @speaker_name = speaker_name
-
-          html_fragment = Loofah.fragment(raw_html)
-          html_fragment.scrub!(Scrubber)
-
-          @safe_html = html_fragment.to_s.strip.html_safe
         end
       end
     end

diff --git a/spec/components/oral_history/vtt_transcript_component_spec.rb b/spec/components/oral_history/vtt_transcript_component_spec.rb
@@ -26,4 +26,105 @@
       "3603.0", "3618.0", "3631.0", "3638.0", nil, nil, "3674.0", "3682.0"
     ]
   end
+
+  describe "unsafe html in text" do
+    let(:ohms_webvtt) do
+      # Example includes what OHMS might, but also some extra stuff in WebVTT
+      # standard (but not necessarily everything!), to be a bit forward looking.
+      <<~EOS
+        WEBVTT
+
+        NOTE
+        TRANSCRIPTION BEGIN
+
+        00:00:00.000 --> 00:00:02.000
+        <v.first.loud Esme Johnson>It’s a <i>blue</i> <script>apple</script> tree!
+
+        00:00:02.400 --> 00:00:04.000
+        <v Mary>This content has some <b>bold</b> and <i>italics</i>
+
+        00:00:04.400 --> 00:00:06.000
+        <v Esme>Hee!</v> <i>laughter</i>
+
+        NOTE
+        TRANSCRIPTION END
+
+      EOS
+    end
+
+    it "scrubs output" do
+      parsed = render_inline(vtt_transcript_component)
+
+      paragraphs = parsed.css(".ohms-transcript-container p.ohms-transcript-paragraph.ohms-transcript-line")
+
+      expect(paragraphs.length).to eq 4
+
+      expect(paragraphs[0].inner_html).to include "It’s a <i>blue</i> apple tree!" # no more script tag
+      expect(paragraphs[1].inner_html).to include "This content has some <b>bold</b> and <i>italics</i>"
+
+      expect(paragraphs[3].inner_html).to include "<i>laughter</i>"
+    end
+  end
+
+  describe "with annotations" do
+    let(:ohms_webvtt) do
+      # Example includes what OHMS might, but also some extra stuff in WebVTT
+      # standard (but not necessarily everything!), to be a bit forward looking.
+      <<~EOS
+        WEBVTT
+
+        NOTE
+        TRANSCRIPTION BEGIN
+
+        00:00:00.000 --> 00:00:02.000
+        <v.first.loud Esme Johnson>We have a <c.1>footnote <b>ref</b> <script>no script tag</script></c>
+
+        NOTE
+        TRANSCRIPTION END
+
+        NOTE
+        ANNOTATIONS BEGIN
+        Annotation Set Title: Lorem Ipsum Transcript Annotations
+        Annotation Set Creator: Lorem Ipsum Generator
+        Annotation Set Date: 1985-10-26
+
+        NOTE
+        <annotation ref="1">Lorem ipsum <b>dolor</b> sit <i>amet</i>, consectetur <script>no script tag</script> <a href="https://example.com">internal link</a></annotation>
+
+        NOTE
+        ANNOTATIONS END
+
+      EOS
+    end
+
+    it "replaces WebVTT <c.1> classes with our footnote references, html-safely" do
+      parsed = render_inline vtt_transcript_component
+
+      footnote_link = parsed.at_css("a.footnote")
+
+      expect(footnote_link).to be_present
+      expect(footnote_link.inner_html.strip).to eq "footnote <b>ref</b> no script tag [1]"
+
+      # Nokogiri unescapes for us
+      expect(footnote_link['data-bs-content']).to eq (
+        'Lorem ipsum <b>dolor</b> sit <i>amet</i>, consectetur no script tag <a href="https://example.com" target="_blank" rel="noopener">internal link</a>'
+      )
+      expect(footnote_link['data-bs-html']).to eq "true"
+    end
+
+    it "renders footnotes at the bottom" do
+      parsed = render_inline vtt_transcript_component
+
+      footnotes = parsed.css(".footnote-list .footnote-page-bottom-container")
+
+      expect(footnotes.length).to eq 1
+      expect(footnotes.first.inner_html.strip.gsub(/\s+/, ' ')).to eq(
+        <<~EOS.gsub(/\s+/, ' ').strip
+          <a id="footnote-1" data-role="ohms-navbar-aware-internal-link" href="#footnote-reference-1"> 1.</a>
+          <span id="footnote-text-1">Lorem ipsum <b>dolor</b> sit <i>amet</i>,
+            consectetur no script tag <a href="https://example.com" target="_blank" rel="noopener">internal link</a></span>
+        EOS
+      )
+    end
+  end
 end
diff --git a/spec/indexers/work_indexer_spec.rb b/spec/indexers/work_indexer_spec.rb
@@ -215,7 +215,7 @@
         output_hash = WorkIndexer.new.map_record(work)
 
         expect(output_hash["searchable_fulltext_en"]).to be_present
-        expect(output_hash["searchable_fulltext_en"].first).to start_with("SCHNEIDER:   Today is December 16, 2024. I am Sarah Schneider")
+        expect(output_hash["searchable_fulltext_en"].first).to start_with("SCHNEIDER: Today is December 16, 2024. I am Sarah Schneider")
         # exactly how many entries depends on how many toc entries have synopsis, keywords, etc.
         expect(output_hash["searchable_fulltext_en"].length).to be >= (1 + index_toc_entries_count)
       end