Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preserve WebVTT tags in subtitle lines #97

Merged
merged 2 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion subtitles.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,19 +236,49 @@ type StyleAttributes struct {
TTMLWritingMode *string
TTMLZIndex *int
WebVTTAlign string
WebVTTItalics bool
WebVTTLine string
WebVTTLines int
WebVTTPosition string
WebVTTRegionAnchor string
WebVTTScroll string
NhanNguyen700 marked this conversation as resolved.
Show resolved Hide resolved
WebVTTSize string
WebVTTStyles []string
WebVTTTags []WebVTTTag
WebVTTVertical string
WebVTTViewportAnchor string
WebVTTWidth string
}

// WebVTTTag describes a single WebVTT cue span tag wrapping a piece of text,
// for instance <i>, <c.yellow.bg_blue> or <lang en>.
type WebVTTTag struct {
	// Name is the tag name written right after "<" (e.g. "i", "c", "lang").
	// When it is empty, startTag and endTag both return an empty string.
	Name string
	// Annotation is the text written after the name and classes in the start
	// tag, separated from them by a single space (e.g. "en" in <lang en>).
	Annotation string
	// Classes are the class names appended to the tag name, each one prefixed
	// with a dot (e.g. "yellow" and "bg_blue" in <c.yellow.bg_blue>).
	Classes []string
}

// startTag returns the opening form of the tag, "<name.class1.class2 annotation>".
// Classes and annotation are only written when present. It returns an empty
// string when the tag has no name.
func (t WebVTTTag) startTag() string {
	if t.Name == "" {
		return ""
	}

	var b strings.Builder
	b.WriteByte('<')
	b.WriteString(t.Name)

	// Every class is introduced by its own dot, which yields the same text as
	// joining the classes with "." and prefixing the result with ".".
	for _, class := range t.Classes {
		b.WriteByte('.')
		b.WriteString(class)
	}

	if t.Annotation != "" {
		b.WriteByte(' ')
		b.WriteString(t.Annotation)
	}

	b.WriteByte('>')
	return b.String()
}

// endTag returns the closing form of the tag, "</name>". Classes and
// annotation are never part of it. It returns an empty string when the tag has
// no name.
func (t WebVTTTag) endTag() string {
	if t.Name == "" {
		return ""
	}
	return "</" + t.Name + ">"
}

// propagateSSAAttributes is a no-op: its body is empty, so calling it leaves
// sa untouched.
// NOTE(review): presumably kept so SSA has a propagate* method like the other
// formats — confirm against the callers.
func (sa *StyleAttributes) propagateSSAAttributes() {}

func (sa *StyleAttributes) propagateSTLAttributes() {
Expand Down
76 changes: 49 additions & 27 deletions webvtt.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@ package astisub

import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"log"
"regexp"
"sort"
"strconv"
Expand Down Expand Up @@ -33,7 +33,7 @@ var (
bytesWebVTTItalicEndTag = []byte("</i>")
bytesWebVTTItalicStartTag = []byte("<i>")
bytesWebVTTTimeBoundariesSeparator = []byte(webvttTimeBoundariesSeparator)
webVTTRegexpStartTag = regexp.MustCompile(`(<v([\.\w]*)(.+?)>)`)
webVTTRegexpTag = regexp.MustCompile(`(</*\s*([^\.\s]+)(\.[^\s/]*)*\s*([^/]*)\s*/*>)`)
webVTTEscaper = strings.NewReplacer("&", "&amp;", "<", "&lt;")
webVTTUnescaper = strings.NewReplacer("&amp;", "&", "&lt;", "<")
)
Expand Down Expand Up @@ -306,45 +306,65 @@ func parseTextWebVTT(i string) (o Line) {
// Create tokenizer
tr := html.NewTokenizer(strings.NewReader(i))

webVTTTagStack := make([]WebVTTTag, 0, 16)

// Loop
italic := false
for {
// Get next tag
t := tr.Next()

// Process error
if err := tr.Err(); err != nil {
break
}

switch t {
case html.EndTagToken:
// Parse italic
if bytes.Equal(tr.Raw(), bytesWebVTTItalicEndTag) {
italic = false
continue
// Pop the top of stack if we meet end tag
if len(webVTTTagStack) > 0 {
webVTTTagStack = webVTTTagStack[:len(webVTTTagStack)-1]
}
case html.StartTagToken:
// Parse voice name
if matches := webVTTRegexpStartTag.FindStringSubmatch(string(tr.Raw())); len(matches) > 3 {
if s := strings.TrimSpace(matches[3]); s != "" {
o.VoiceName = s
if matches := webVTTRegexpTag.FindStringSubmatch(string(tr.Raw())); len(matches) > 4 {
tagName := matches[2]

var classes []string
if matches[3] != "" {
classes = strings.Split(strings.Trim(matches[3], "."), ".")
}

annotation := ""
if matches[4] != "" {
annotation = strings.TrimSpace(matches[4])
}

if tagName == "v" {
if o.VoiceName == "" {
// Only use the voice name of the first <v> tag that appears in the line
o.VoiceName = annotation
} else {
// TODO: do something with other <v> instead of ignoring
log.Printf("astisub: found another voice name %q in %q. Ignore", annotation, i)
}
continue
}
continue
}

// Parse italic
if bytes.Equal(tr.Raw(), bytesWebVTTItalicStartTag) {
italic = true
continue
// Push the tag to stack
webVTTTagStack = append(webVTTTagStack, WebVTTTag{
Name: tagName,
Classes: classes,
Annotation: annotation,
})
}

case html.TextToken:
if s := strings.TrimSpace(string(tr.Raw())); s != "" {
// Get style attribute
var sa *StyleAttributes
if italic {
if len(webVTTTagStack) > 0 {
tags := make([]WebVTTTag, len(webVTTTagStack))
copy(tags, webVTTTagStack)
sa = &StyleAttributes{
WebVTTItalics: italic,
WebVTTTags: tags,
}
sa.propagateWebVTTAttributes()
}
Expand Down Expand Up @@ -545,19 +565,21 @@ func (li LineItem) webVTTBytes() (c []byte) {
color = cssColor(*li.InlineStyle.TTMLColor)
}

// Get italics
i := li.InlineStyle != nil && li.InlineStyle.WebVTTItalics

// Append
if color != "" {
c = append(c, []byte("<c."+color+">")...)
}
if i {
c = append(c, []byte("<i>")...)
if li.InlineStyle != nil {
for _, tag := range li.InlineStyle.WebVTTTags {
c = append(c, []byte(tag.startTag())...)
}
}
c = append(c, []byte(escapeWebVTT(li.Text))...)
if i {
c = append(c, []byte("</i>")...)
if li.InlineStyle != nil {
noTags := len(li.InlineStyle.WebVTTTags)
for i := noTags - 1; i >= 0; i-- {
c = append(c, []byte(li.InlineStyle.WebVTTTags[i].endTag())...)
}
}
if color != "" {
c = append(c, []byte("</c>")...)
Expand Down
20 changes: 10 additions & 10 deletions webvtt_internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,39 +96,39 @@ func TestCueVoiceSpanRegex(t *testing.T) {
}{
{
give: `<v 中文> this is the content</v>`,
want: ` 中文`,
want: `中文`,
},
{
give: `<v 中文> this is the content`,
want: ` 中文`,
want: `中文`,
},
{
give: `<v.abc 中文> this is the content</v>`,
want: ` 中文`,
want: `中文`,
},
{
give: `<v.jp 言語の> this is the content`,
want: ` 言語の`,
want: `言語の`,
},
{
give: `<v.ko 언어> this is the content`,
want: ` 언어`,
want: `언어`,
},
{
give: `<v foo bar> this is the content`,
want: ` foo bar`,
want: `foo bar`,
},
{
give: `<v هذا عربي> this is the content`,
want: ` هذا عربي`,
want: `هذا عربي`,
},
}

for _, tt := range tests {
t.Run(tt.want, func(t *testing.T) {
results := webVTTRegexpStartTag.FindStringSubmatch(tt.give)
assert.True(t, len(results) == 4)
assert.Equal(t, tt.want, results[3])
results := webVTTRegexpTag.FindStringSubmatch(tt.give)
assert.True(t, len(results) == 5)
assert.Equal(t, tt.want, results[4])
})
}
}
50 changes: 50 additions & 0 deletions webvtt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,53 @@ Sentence with an &amp; in the middle
Sentence with an &lt; in the middle
`, b.String())
}

// TestWebVTTTags reads a WebVTT document whose cues carry inline tags, writes
// it back out, and compares the result with the expected serialization.
func TestWebVTTTags(t *testing.T) {
	// Five cues: nested tags followed by plain text, an annotated tag next to
	// a tag with classes, a voice span wrapping nested tags, a custom tag with
	// classes, and two voice spans on the same line.
	const input = `WEBVTT

00:01:00.000 --> 00:02:00.000
<u><i>Italic with underline text</i></u> some extra

00:02:00.000 --> 00:03:00.000
<lang en>English here</lang> <c.yellow.bg_blue>Yellow text on blue background</c>

00:03:00.000 --> 00:04:00.000
<v Joe><c.red><i>Joe's words are red in italic</i></c>

00:04:00.000 --> 00:05:00.000
<customed_tag.class1.class2>Text here</customed_tag>

00:05:00.000 --> 00:06:00.000
<v Joe>Joe says something</v> <v Bob>Bob says something</v>`

	// The expected output numbers every cue. In the last cue only the first
	// voice tag is kept and no closing </v> is written.
	const expected = `WEBVTT

1
00:01:00.000 --> 00:02:00.000
<u><i>Italic with underline text</i></u> some extra

2
00:02:00.000 --> 00:03:00.000
<lang en>English here</lang> <c.yellow.bg_blue>Yellow text on blue background</c>

3
00:03:00.000 --> 00:04:00.000
<v Joe><c.red><i>Joe's words are red in italic</i></c>

4
00:04:00.000 --> 00:05:00.000
<customed_tag.class1.class2>Text here</customed_tag>

5
00:05:00.000 --> 00:06:00.000
<v Joe>Joe says something Bob says something
`

	subs, err := astisub.ReadFromWebVTT(strings.NewReader(input))
	require.NoError(t, err)
	require.Len(t, subs.Items, 5)

	var out bytes.Buffer
	require.NoError(t, subs.WriteToWebVTT(&out))
	require.Equal(t, expected, out.String())
}