Skip to content

Commit

Permalink
This is to fix: sillsdev/serval#424.
Browse files Browse the repository at this point in the history
  • Loading branch information
johnml1135 committed Jul 10, 2024
1 parent 6979680 commit 8f84d55
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 1 deletion.
11 changes: 10 additions & 1 deletion src/SIL.Machine/Corpora/UsfmTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,16 @@ public IReadOnlyList<UsfmToken> Tokenize(string usfm, bool preserveWhitespace =
else if (tokens[i - 1].Type == UsfmTokenType.End)
{
// Insert space token after * of end marker
int colNum = usfm.Length + 1 - Math.Max(usfm.LastIndexOf('\n', index), 0);
int colNum;
if (index >= usfm.Length)
{
colNum = usfm.Length + 1;
}
else
{
colNum = usfm.Length + 1 - Math.Max(usfm.LastIndexOf('\n', index), 0);
}

tokens.Insert(
i,
new UsfmToken(UsfmTokenType.Text, null, " ", null)
Expand Down
13 changes: 13 additions & 0 deletions tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,19 @@ public void Detokenize()
Assert.That(result, Is.EqualTo(usfm));
}

[Test]
public void Tokenize_Ending_ParagraphMarker()
{
    // Regression check: input whose last marker is a bare \p placed directly after
    // an end marker (\x*) must be tokenized without the parser crashing.
    var tokenizer = new UsfmTokenizer();

    IReadOnlyList<UsfmToken> result = tokenizer.Tokenize(
        @"\id MAT - Test
\c 1
\v 1 Descriptive title\x - \xo 18:16 \xt hello world\x*\p
"
    );

    Assert.That(result, Has.Count.EqualTo(13));
}

private static string ReadUsfm()
{
return File.ReadAllText(Path.Combine(CorporaTestHelpers.UsfmTestProjectPath, "41MATTes.SFM"));
Expand Down

0 comments on commit 8f84d55

Please sign in to comment.