Skip to content

Commit

Permalink
Merge pull request #2879 from sciencehistory/show_footnotes_ohms_vtt
Browse files Browse the repository at this point in the history
OHMS new VTT transcripts support footnotes
  • Loading branch information
jrochkind authored Feb 10, 2025
2 parents 62240b8 + c007a03 commit bd8f05a
Show file tree
Hide file tree
Showing 7 changed files with 275 additions and 26 deletions.
15 changes: 15 additions & 0 deletions app/components/oral_history/vtt_transcript_component.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,18 @@
</p>
<% end %>
</div>

<%# Renders a "NOTES" section with one FootnoteComponent per sanitized footnote; skipped entirely when there are no footnotes. %>
<% if sanitized_footnotes.present? %>
<hr />

<div class="mx-1 my-2"><strong>NOTES</strong></div>

<div class="footnote-list mx-1 mb-5">
<% sanitized_footnotes.each_pair do |reference, footnote_text| %>
<%= render ::OralHistory::FootnoteComponent.new(footnote_reference: reference, footnote_text: footnote_text) %>
<% end %>
</div>
<% end %>



65 changes: 63 additions & 2 deletions app/components/oral_history/vtt_transcript_component.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,82 @@ module OralHistory
class VttTranscriptComponent < ApplicationComponent
delegate :format_ohms_timestamp, to: :helpers

# Allow-list scrubber for the main transcript text.
#
# Besides basic inline formatting it permits 'c', the WebVTT 'class' object,
# which we only expect in the form of c.1, c.12 etc for OHMS annotation
# references. The `cref` attribute belongs to our custom rewritten form of
# that c tag (see #scrub_text).
TextScrubber = Rails::Html::PermitScrubber.new.tap do |permit|
  permit.tags = %w[i b u c]
  permit.attributes = %w[cref]
end

# Allow-list scrubber used by #scrub_footnote_text: basic inline formatting
# plus <a>.
FootnoteTextScrubber = Rails::Html::PermitScrubber.new.tap do |permit|
  permit.tags = %w[i b u a]
end

# The transcript object given to the constructor; this component reads its
# `footnotes` and `cues`.
attr_reader :vtt_transcript

# @param vtt_transcript the transcript object to render
def initialize(vtt_transcript)
@vtt_transcript = vtt_transcript
end

# The transcript's footnotes with every text passed through
# #scrub_footnote_text, keyed by the same reference as the source hash.
# Built once and memoized.
def sanitized_footnotes
  @sanitized_footnotes ||= vtt_transcript.footnotes.each_with_object({}) do |(reference, raw_text), scrubbed|
    scrubbed[reference] = scrub_footnote_text(raw_text)
  end
end

# Sanitize one paragraph of transcript markup and turn OHMS footnote markers
# into footnote reference links.
#
# OHMS marks annotation/footnote references with WebVTT class tags such as
# <c.1>...</c>. Those tags are rewritten, the markup is scrubbed with
# TextScrubber, and each rewritten tag is then replaced by a rendered
# OralHistory::FootnoteReferenceComponent.
#
# @param raw_html [String] unsanitized paragraph markup
# @return an html_safe string
def scrub_text(raw_html)
  # <c.1> is not XML-legal, so move the number into a `cref` attribute
  # (<c cref='1'>) before parsing.
  with_cref_tags = raw_html.gsub(/<c\.(\d+)/, "<c cref='\\1'")

  scrubbed = Loofah.fragment(with_cref_tags).scrub!(TextScrubber).to_s

  # Swap each <c cref="N">...</c> for a footnote reference link. The inner
  # match is non-greedy so it stops at the first </c>. They can't be nested!
  with_footnote_links = scrubbed.gsub(/<c cref="(\d+)"[^>]*>(.+?)<\/c>/) do
    # Read the captures before render, which may run other regex matches.
    reference_number = Regexp.last_match(1)
    link_content = Regexp.last_match(2).html_safe

    render(OralHistory::FootnoteReferenceComponent.new(
      footnote_text: sanitized_footnotes[reference_number],
      footnote_is_html: true,
      number: reference_number,
      link_content: link_content
    ))
  end

  # Everything left was either scrubbed or produced by a component render.
  with_footnote_links.html_safe
end

# Sanitize footnote text. Unlike the main transcript text, footnotes may
# contain links; Loofah's built-in :targetblank and :noopener scrubbers run
# after the allow-list scrubber so those links get target="_blank" and
# rel="noopener".
#
# @param raw_html [String] unsanitized footnote markup
# @return an html_safe string
def scrub_footnote_text(raw_html)
  fragment = Loofah.fragment(raw_html)

  # scrub! modifies the fragment in place, so the scrubbers apply in order.
  [FootnoteTextScrubber, :targetblank, :noopener].each do |scrubber|
    fragment.scrub!(scrubber)
  end

  fragment.to_s.html_safe
end

# Yield each paragraph of each cue, in transcript order, ready for display.
#
# Every paragraph is yielded exactly once, with its text run through
# #scrub_text (the paragraph's own raw_html is not sanitized).
#
# @yieldparam start_sec_f the cue's start time for the first paragraph of a
#   cue, nil for the cue's remaining paragraphs
# @yieldparam speaker_name the paragraph's speaker, or nil when it is the
#   same speaker as the previously yielded paragraph
# @yieldparam paragraph_safe_html the result of #scrub_text for the paragraph
def display_paragraphs
  last_speaker = nil # don't do same speaker twice in a row

  @vtt_transcript.cues.each do |cue|
    # Read once per cue; cleared after the first paragraph so only the first
    # paragraph of a cue carries the timestamp.
    start_sec_f = cue.start_sec_f

    cue.paragraphs.each do |paragraph|
      speaker_name = paragraph.speaker_name
      paragraph_safe_html = scrub_text(paragraph.raw_html)

      yield start_sec_f, (speaker_name unless speaker_name == last_speaker), paragraph_safe_html

      last_speaker = speaker_name
      start_sec_f = nil
    end
  end
end

end
end
9 changes: 9 additions & 0 deletions app/frontend/stylesheets/local/oh_audio.scss
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,9 @@
// URLs in footnotes should be forced to break if needed to fit on screen
.footnote-list {
overflow-wrap: break-word;
// Links inside the footnote list pick up the shared .text-link styling
a {
@extend .text-link;
}
}
}

Expand Down Expand Up @@ -344,6 +347,12 @@

// Drop shadow around the element, plus .text-link styling for links inside it
.ohms-footnote-popover {
box-shadow: 0 5px 15px rgba(0, 0, 0, 0.5);

a {
// Make it look like a link, not sure why it wasn't looking like any kind
// of link otherwise
@extend .text-link;
}
}
}

Expand Down
55 changes: 38 additions & 17 deletions app/models/oral_history_content/ohms_xml/vtt_transcript.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ class OhmsXml
# new-style 2025 OHMS transcript with <vtt_transcript> element in xml.
# https://www.w3.org/TR/webvtt1/
#
# Also handles some OHMS quirks.
# Also handles some OHMS quirks, strips and formats OHMS-style citation
# footnotes, etc. This really is OHMS-specific in the end.
#
# Uses the `webvtt` gem for initial parsing, but that gem is basic and
# not very maintained, so we need some massaging and post-processing
Expand Down Expand Up @@ -43,6 +44,33 @@ def cues
end
end

# Extracts the footnotes OHMS embeds in the WebVTT text, indexed by
# reference, using OHMS's own custom format for them: `annotation` elements
# found between the "ANNOTATIONS BEGIN" and "ANNOTATIONS END" markers.
# Annotations without a `ref` attribute are skipped.
#
# Warning: Text is NOT sanitized!
#
# @return [Hash] annotation `ref` value => that annotation's inner HTML
def footnotes
  @footnotes ||= begin
    annotations_match = /ANNOTATIONS BEGIN(.*)ANNOTATIONS END/m.match(raw_webvtt_text)
    # No annotations section at all just means no footnotes.
    annotations_markup = annotations_match ? annotations_match[1] : ""

    Nokogiri::XML.fragment(annotations_markup).xpath("annotation").each_with_object({}) do |node, by_ref|
      by_ref[node['ref']] = node.inner_html if node['ref'].present?
    end
  end
end

# Memoized footnote values, intended for printing footnotes at the bottom.
#
# NOTE(review): `footnote_array` is not defined in the portion of this file
# visible here -- confirm it exists, and that its values really are scrubbed,
# ordered and html_safe, before relying on this method.
def safe_footnote_values
  # Memoize in an instance variable: a bare `safe_footnote_values ||= ...`
  # only assigns a method-local variable, which is discarded on return.
  @safe_footnote_values ||= footnote_array
end


# eg for indexing, actual human-readable indexable plain text after parsed and extracted webVTT
def transcript_text
@transcript_text ||= cues.collect { |c| c.paragraphs }.flatten.collect do |p|
Expand Down Expand Up @@ -97,7 +125,7 @@ def end_sec_f

# split text inside a cue into paragraphs.
#
# Paragraphs are split on newlines (WebVTT standard) -- also on <br><br> (two in a row br tag),
# Paragraphs are split on newlines (WebVTT standard) -- also on <br><br> (two+ in a row br tag),
# which OHMS at least sometimes does.
#
# A change in WebVTT "voice" (speaker) will also result in a paragraph split, which
Expand All @@ -107,16 +135,17 @@ def paragraphs
# This tricky regex using both positive lookahead and negative lookahead
# will split into voice tags, taking into account that some text might not
# be in a voice tag, and that voice tag does not have to be closed when it's the whole cue
(text || -"").split(/(?=\<v[ .])|(?<=\<\/v>)/).collect do |voice_span|
(text || -"").split(/(?=\<v[ .])|(?:\<\/v>)/).collect do |voice_span|
# <v some name> or <v.class1.class2 some name>, in some cases ended with </v>
if voice_span.gsub!(/\A\<v(?:\.[\w.]+)?\ ([^>]+)>/, '')
speaker_name = $1
end

# \R is any kind of linebreak
# Things coming from OHMS separate paragraphs by `<br><br>` instead
# sometimes annoyingly
voice_span.split(/\R|(?:\<br\>\<br\>)/).collect do |paragraph_text|
# Things coming from OHMS can separate paragraphs by `<br><br>`, annoyingly:
# Split paragraphs on two or more consecutive <br>
voice_span.split(/\R|(?:\<br\>){2,}/).collect do |paragraph_text|
paragraph_text.gsub!("</v>", "") # remove stray ending tags
Paragraph.new(speaker_name: speaker_name, raw_html: paragraph_text)
end
end.flatten
Expand All @@ -125,20 +154,12 @@ def paragraphs
end

# One paragraph of transcript text together with its speaker name (if any).
#
# `raw_html` is the stripped but unsanitized text; `safe_html` is the same
# text scrubbed down to <i>/<b>/<u> and marked html_safe.
#
# NOTE(review): OralHistory::VttTranscriptComponent also scrubs `raw_html`
# itself via scrub_text -- confirm whether `safe_html` and `Scrubber` are
# still needed here.
class Paragraph
  Scrubber = Rails::Html::PermitScrubber.new.tap do |permit|
    permit.tags = %w[i b u]
  end

  # named raw_html to make sure we don't forget to scrub!
  attr_reader :speaker_name, :raw_html, :safe_html

  # @param speaker_name the speaker for this paragraph, may be nil
  # @param raw_html [String] unsanitized paragraph text
  def initialize(speaker_name:, raw_html:)
    @speaker_name = speaker_name
    @raw_html = raw_html.strip
    @safe_html = Loofah.fragment(raw_html).scrub!(Scrubber).to_s.strip.html_safe
  end
end
end
Expand Down
101 changes: 101 additions & 0 deletions spec/components/oral_history/vtt_transcript_component_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,105 @@
"3603.0", "3618.0", "3631.0", "3638.0", nil, nil, "3674.0", "3682.0"
]
end

# Transcript text containing a <script> element alongside <i>/<b> formatting:
# the rendered paragraphs should keep the formatting tags and drop the
# script tag itself.
describe "unsafe html in text" do
let(:ohms_webvtt) do
# Example includes what OHMS might, but also some extra stuff in WebVTT
# standard (but not necessarily everything!), to be a bit forward looking.
<<~EOS
WEBVTT
NOTE
TRANSCRIPTION BEGIN
00:00:00.000 --> 00:00:02.000
<v.first.loud Esme Johnson>It’s a <i>blue</i> <script>apple</script> tree!
00:00:02.400 --> 00:00:04.000
<v Mary>This content has some <b>bold</b> and <i>italics</i>
00:00:04.400 --> 00:00:06.000
<v Esme>Hee!</v> <i>laughter</i>
NOTE
TRANSCRIPTION END
EOS
end

it "scrubs output" do
parsed = render_inline(vtt_transcript_component)

paragraphs = parsed.css(".ohms-transcript-container p.ohms-transcript-paragraph.ohms-transcript-line")

# Three cues but four paragraphs: presumably the last cue is split where its
# voice tag closes, leaving the trailing <i>laughter</i> as its own paragraph.
expect(paragraphs.length).to eq 4

expect(paragraphs[0].inner_html).to include "It’s a <i>blue</i> apple tree!" # no more script tag
expect(paragraphs[1].inner_html).to include "This content has some <b>bold</b> and <i>italics</i>"

expect(paragraphs[3].inner_html).to include "<i>laughter</i>"
end
end

# A transcript with one <c.1> footnote reference in the text and a matching
# <annotation ref="1"> in the ANNOTATIONS section. Both contain a <script>
# element, which should not survive rendering.
describe "with annotations" do
let(:ohms_webvtt) do
# Example includes what OHMS might, but also some extra stuff in WebVTT
# standard (but not necessarily everything!), to be a bit forward looking.
<<~EOS
WEBVTT
NOTE
TRANSCRIPTION BEGIN
00:00:00.000 --> 00:00:02.000
<v.first.loud Esme Johnson>We have a <c.1>footnote <b>ref</b> <script>no script tag</script></c>
NOTE
TRANSCRIPTION END
NOTE
ANNOTATIONS BEGIN
Annotation Set Title: Lorem Ipsum Transcript Annotations
Annotation Set Creator: Lorem Ipsum Generator
Annotation Set Date: 1985-10-26
NOTE
<annotation ref="1">Lorem ipsum <b>dolor</b> sit <i>amet</i>, consectetur <script>no script tag</script> <a href="https://example.com">internal link</a></annotation>
NOTE
ANNOTATIONS END
EOS
end

it "replaces WebVTT <c.1> classes with our footnote references, html-safely" do
parsed = render_inline vtt_transcript_component

footnote_link = parsed.at_css("a.footnote")

expect(footnote_link).to be_present
# The script element is gone but its text content remains
expect(footnote_link.inner_html.strip).to eq "footnote <b>ref</b> no script tag [1]"

# Nokogiri unescapes for us
expect(footnote_link['data-bs-content']).to eq (
'Lorem ipsum <b>dolor</b> sit <i>amet</i>, consectetur no script tag <a href="https://example.com" target="_blank" rel="noopener">internal link</a>'
)
expect(footnote_link['data-bs-html']).to eq "true"
end

it "renders footnotes at the bottom" do
parsed = render_inline vtt_transcript_component

footnotes = parsed.css(".footnote-list .footnote-page-bottom-container")

expect(footnotes.length).to eq 1
# Whitespace runs are collapsed on both sides so the comparison ignores formatting
expect(footnotes.first.inner_html.strip.gsub(/\s+/, ' ')).to eq(
<<~EOS.gsub(/\s+/, ' ').strip
<a id="footnote-1" data-role="ohms-navbar-aware-internal-link" href="#footnote-reference-1"> 1.</a>
<span id="footnote-text-1">Lorem ipsum <b>dolor</b> sit <i>amet</i>,
consectetur no script tag <a href="https://example.com" target="_blank" rel="noopener">internal link</a></span>
EOS
)
end
end
end
2 changes: 1 addition & 1 deletion spec/indexers/work_indexer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@
output_hash = WorkIndexer.new.map_record(work)

expect(output_hash["searchable_fulltext_en"]).to be_present
expect(output_hash["searchable_fulltext_en"].first).to start_with("SCHNEIDER: Today is December 16, 2024. I am Sarah Schneider")
expect(output_hash["searchable_fulltext_en"].first).to start_with("SCHNEIDER: Today is December 16, 2024. I am Sarah Schneider")
# exactly how many entries depends on how many toc entries have synopsis, keywords, etc.
expect(output_hash["searchable_fulltext_en"].length).to be >= (1 + index_toc_entries_count)
end
Expand Down
Loading

0 comments on commit bd8f05a

Please sign in to comment.