Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix exception when parsing USFM with an empty verse paragraph #201

Merged
merged 5 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 30 additions & 5 deletions src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ namespace SIL.Machine.Corpora
{
public enum ScriptureTextType
{
None,
NonVerse,
Verse,
Note
Expand All @@ -25,7 +26,7 @@ protected ScriptureRefUsfmParserHandlerBase()
}

/// <summary>
/// Gets the text type on top of the <c>_curTextType</c> stack, or a fallback value when the stack is empty.
/// </summary>
// NOTE(review): this listing is a diff rendered without +/- markers, so both expression bodies
// appear below. The first (NonVerse fallback) looks like the pre-change line and the second
// (None fallback) its replacement - confirm against the merged file.
protected ScriptureTextType CurrentTextType =>
_curTextType.Count == 0 ? ScriptureTextType.NonVerse : _curTextType.Peek();
_curTextType.Count == 0 ? ScriptureTextType.None : _curTextType.Peek();

public override void EndUsfm(UsfmParserState state)
{
Expand Down Expand Up @@ -97,23 +98,31 @@ public override void EndPara(UsfmParserState state, string marker)
EndParentElement();
EndNonVerseText(state);
}
else if (CurrentTextType == ScriptureTextType.None)
{
// empty verse paragraph
StartParentElement(marker);
StartNonVerseText(state);
EndParentElement();
EndNonVerseText(state);
}
}

/// <summary>
/// Called at the start of a row; opens a parent element for <paramref name="marker"/>
/// depending on the current text type.
/// </summary>
/// <param name="state">Parser state (not read by this method).</param>
/// <param name="marker">Marker of the row, passed to <c>StartParentElement</c>.</param>
public override void StartRow(UsfmParserState state, string marker)
{
// NOTE(review): two conditions appear because this listing is a diff without +/- markers.
// The NonVerse-only check looks like the pre-change line; the check that also accepts
// ScriptureTextType.None looks like its replacement - confirm against the merged file.
if (CurrentTextType == ScriptureTextType.NonVerse)
if (CurrentTextType == ScriptureTextType.NonVerse || CurrentTextType == ScriptureTextType.None)
StartParentElement(marker);
}

/// <summary>
/// Called at the end of a row; closes the current parent element depending on the
/// current text type.
/// </summary>
/// <param name="state">Parser state (not read by this method).</param>
/// <param name="marker">Marker of the row (not read by this method).</param>
public override void EndRow(UsfmParserState state, string marker)
{
// NOTE(review): two conditions appear because this listing is a diff without +/- markers.
// The NonVerse-only check looks like the pre-change line; the check that also accepts
// ScriptureTextType.None looks like its replacement - confirm against the merged file.
if (CurrentTextType == ScriptureTextType.NonVerse)
if (CurrentTextType == ScriptureTextType.NonVerse || CurrentTextType == ScriptureTextType.None)
EndParentElement();
}

public override void StartCell(UsfmParserState state, string marker, string align, int colspan)
{
if (CurrentTextType == ScriptureTextType.NonVerse)
if (CurrentTextType == ScriptureTextType.NonVerse || CurrentTextType == ScriptureTextType.None)
{
StartParentElement(marker);
StartNonVerseText(state);
Expand Down Expand Up @@ -150,7 +159,23 @@ public override void EndNote(UsfmParserState state, string marker, bool closed)
EndNoteText(state);
}

public override void Ref(UsfmParserState state, string marker, string display, string target) { }
/// <summary>
/// Handles a run of text. Opens a non-verse segment, parented under the current paragraph's
/// marker, when all of the following hold: <c>CurrentTextType</c> is
/// <see cref="ScriptureTextType.None"/>, the state has a paragraph tag whose marker is not
/// <c>"tr"</c>, the state reports verse text, the current verse number is 0, and the text is
/// not blank. Otherwise does nothing.
/// </summary>
/// <param name="state">Parser state; its <c>ParaTag</c> and <c>IsVerseText</c> are read.</param>
/// <param name="text">The text that was encountered.</param>
public override void Text(UsfmParserState state, string text)
{
// if we hit text in a verse paragraph and we aren't in a verse, then start a non-verse segment
UsfmTag paraTag = state.ParaTag;
// Conditions are short-circuited in order, so paraTag.Marker is only read after the null check,
// and whitespace-only text never opens a segment.
if (
CurrentTextType == ScriptureTextType.None
&& paraTag != null
&& paraTag.Marker != "tr"
&& state.IsVerseText
&& _curVerseRef.VerseNum == 0
&& text.Trim().Length > 0
)
{
StartParentElement(paraTag.Marker);
StartNonVerseText(state);
}
}

protected virtual void StartVerseText(UsfmParserState state, IReadOnlyList<ScriptureRef> scriptureRefs) { }

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
\h Matthew
\mt Matthew
\ip An introduction to Matthew\fe + \ft This is an endnote.\fe*
\p Here is another paragraph.
\p and with a \w keyword|a special concept\w* in it.
\p and a \weirdtaglookingthing that is not an actual tag.
\c 1
\s Chapter One
\v 1 Chapter \pn one\+pro WON\+pro*\pn*, verse one.\f + \fr 1:1: \ft This is a footnote.\f*
Expand All @@ -22,6 +25,7 @@
\tr \tc1 Row two, column one. \tc2 Row two, column two.
\s1 Chapter \it Two \it*
\p
\p
\v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one.
\v 2-3 Chapter two, // verse \fm ∆\fm*two.
\esb
Expand Down
92 changes: 52 additions & 40 deletions tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ public void GetRows_NonEmptyText_AllText()

IText text = corpus["MAT"];
TextRow[] rows = text.GetRows().ToArray();
Assert.That(rows, Has.Length.EqualTo(44));
Assert.That(rows, Has.Length.EqualTo(49));

Assert.That(rows[0].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/1:h", corpus.Versification)));
Assert.That(rows[0].Text, Is.EqualTo("Matthew"));
Expand All @@ -87,44 +87,56 @@ public void GetRows_NonEmptyText_AllText()
Assert.That(rows[3].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/3:ip/1:fe", corpus.Versification)));
Assert.That(rows[3].Text, Is.EqualTo("This is an endnote."));

Assert.That(rows[4].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/4:s", corpus.Versification)));
Assert.That(rows[4].Text, Is.EqualTo("Chapter One"));
Assert.That(rows[4].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/4:p", corpus.Versification)));
Assert.That(rows[4].Text, Is.EqualTo("Here is another paragraph."));

Assert.That(rows[6].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1/1:f", corpus.Versification)));
Assert.That(rows[6].Text, Is.EqualTo("1:1: This is a footnote."));
Assert.That(
rows[7].Ref,
Is.EqualTo(ScriptureRef.Parse("MAT 1:0/7:weirdtaglookingthing", corpus.Versification))
);
Assert.That(rows[7].Text, Is.EqualTo("that is not an actual tag."));

Assert.That(rows[8].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/8:s", corpus.Versification)));
Assert.That(rows[8].Text, Is.EqualTo("Chapter One"));

Assert.That(rows[10].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1/1:f", corpus.Versification)));
Assert.That(rows[10].Text, Is.EqualTo("1:1: This is a footnote."));

Assert.That(rows[12].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:2/1:f", corpus.Versification)));
Assert.That(rows[12].Text, Is.EqualTo("1:2: This is a footnote."));

Assert.That(rows[8].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:2/1:f", corpus.Versification)));
Assert.That(rows[8].Text, Is.EqualTo("1:2: This is a footnote."));
Assert.That(rows[19].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/1:tr/1:tc1", corpus.Versification)));
Assert.That(rows[19].Text, Is.EqualTo("Row one, column one."));

Assert.That(rows[15].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/1:tr/1:tc1", corpus.Versification)));
Assert.That(rows[15].Text, Is.EqualTo("Row one, column one."));
Assert.That(rows[20].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/1:tr/2:tc2", corpus.Versification)));
Assert.That(rows[20].Text, Is.EqualTo("Row one, column two."));

Assert.That(rows[16].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/1:tr/2:tc2", corpus.Versification)));
Assert.That(rows[16].Text, Is.EqualTo("Row one, column two."));
Assert.That(rows[21].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/2:tr/1:tc1", corpus.Versification)));
Assert.That(rows[21].Text, Is.EqualTo("Row two, column one."));

Assert.That(rows[17].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/2:tr/1:tc1", corpus.Versification)));
Assert.That(rows[17].Text, Is.EqualTo("Row two, column one."));
Assert.That(rows[22].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/2:tr/2:tc2", corpus.Versification)));
Assert.That(rows[22].Text, Is.EqualTo("Row two, column two."));

Assert.That(rows[18].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/2:tr/2:tc2", corpus.Versification)));
Assert.That(rows[18].Text, Is.EqualTo("Row two, column two."));
Assert.That(rows[23].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/3:s1", corpus.Versification)));
Assert.That(rows[23].Text, Is.EqualTo("Chapter Two"));

Assert.That(rows[19].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/3:s1", corpus.Versification)));
Assert.That(rows[19].Text, Is.EqualTo("Chapter Two"));
Assert.That(rows[24].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/4:p", corpus.Versification)));
Assert.That(rows[24].Text, Is.Empty);

Assert.That(rows[21].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:1/1:f", corpus.Versification)));
Assert.That(rows[21].Text, Is.EqualTo("2:1: This is a footnote."));
Assert.That(rows[26].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:1/1:f", corpus.Versification)));
Assert.That(rows[26].Text, Is.EqualTo("2:1: This is a footnote."));

Assert.That(rows[24].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/1:esb/1:ms", corpus.Versification)));
Assert.That(rows[24].Text, Is.EqualTo("This is a sidebar"));
Assert.That(rows[29].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/1:esb/1:ms", corpus.Versification)));
Assert.That(rows[29].Text, Is.EqualTo("This is a sidebar"));

Assert.That(rows[25].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/1:esb/2:p", corpus.Versification)));
Assert.That(rows[25].Text, Is.EqualTo("Here is some sidebar content."));
Assert.That(rows[30].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/1:esb/2:p", corpus.Versification)));
Assert.That(rows[30].Text, Is.EqualTo("Here is some sidebar content."));

Assert.That(rows[31].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:7a/1:s", corpus.Versification)));
Assert.That(rows[31].Text, Is.EqualTo("Section header"));
Assert.That(rows[36].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:7a/1:s", corpus.Versification)));
Assert.That(rows[36].Text, Is.EqualTo("Section header"));

Assert.That(rows[38].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:12/1:restore", corpus.Versification)));
Assert.That(rows[38].Text, Is.EqualTo("restore information"));
Assert.That(rows[43].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:12/1:restore", corpus.Versification)));
Assert.That(rows[43].Text, Is.EqualTo("restore information"));
}

[Test]
Expand Down Expand Up @@ -242,41 +254,41 @@ public void GetRows_IncludeMarkers_AllText()

IText text = corpus["MAT"];
TextRow[] rows = text.GetRows().ToArray();
Assert.That(rows, Has.Length.EqualTo(40));
Assert.That(rows, Has.Length.EqualTo(45));

Assert.That(rows[2].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/3:ip", corpus.Versification)));
Assert.That(rows[2].Text, Is.EqualTo("An introduction to Matthew\\fe + \\ft This is an endnote.\\fe*"));

Assert.That(rows[4].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1", corpus.Versification)));
Assert.That(rows[8].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:1", corpus.Versification)));
Assert.That(
rows[4].Text,
rows[8].Text,
Is.EqualTo("Chapter \\pn one\\+pro WON\\+pro*\\pn*, verse one.\\f + \\fr 1:1: \\ft This is a footnote.\\f*")
);

Assert.That(rows[5].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:2", corpus.Versification)));
Assert.That(rows[9].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:2", corpus.Versification)));
Assert.That(
rows[5].Text,
rows[9].Text,
Is.EqualTo("\\bd C\\bd*hapter one, \\li2 verse\\f + \\fr 1:2: \\ft This is a footnote.\\f* two.")
);

Assert.That(rows[8].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:5", corpus.Versification)));
Assert.That(rows[12].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:5", corpus.Versification)));
Assert.That(
rows[8].Text,
rows[12].Text,
Is.EqualTo(
"Chapter one, \\li2 verse \\fig Figure 1|src=\"image1.png\" size=\"col\" ref=\"1:5\"\\fig* five."
)
);

Assert.That(rows[16].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/3:s1", corpus.Versification)));
Assert.That(rows[16].Text, Is.EqualTo("Chapter \\it Two \\it*"));
Assert.That(rows[20].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:0/3:s1", corpus.Versification)));
Assert.That(rows[20].Text, Is.EqualTo("Chapter \\it Two \\it*"));

Assert.That(rows[17].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:1", corpus.Versification)));
Assert.That(rows[22].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:1", corpus.Versification)));
Assert.That(
rows[17].Text,
rows[22].Text,
Is.EqualTo("Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one.")
);

Assert.That(rows[21].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/1:esb/2:p", corpus.Versification)));
Assert.That(rows[21].Text, Is.EqualTo("Here is some sidebar // content."));
Assert.That(rows[26].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 2:3/1:esb/2:p", corpus.Versification)));
Assert.That(rows[26].Text, Is.EqualTo("Here is some sidebar // content."));
}
}
2 changes: 1 addition & 1 deletion tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ public void Tokenize()
string usfm = ReadUsfm();
var tokenizer = new UsfmTokenizer();
IReadOnlyList<UsfmToken> tokens = tokenizer.Tokenize(usfm);
Assert.That(tokens, Has.Count.EqualTo(204));
Assert.That(tokens, Has.Count.EqualTo(205));

Assert.That(tokens[0].Type, Is.EqualTo(UsfmTokenType.Book));
Assert.That(tokens[0].Marker, Is.EqualTo("id"));
Expand Down
Loading