diff --git a/src/SIL.Machine/Corpora/UsfmTokenizer.cs b/src/SIL.Machine/Corpora/UsfmTokenizer.cs
index 32ba32fd..c03b836d 100644
--- a/src/SIL.Machine/Corpora/UsfmTokenizer.cs
+++ b/src/SIL.Machine/Corpora/UsfmTokenizer.cs
@@ -385,7 +385,16 @@ public IReadOnlyList<UsfmToken> Tokenize(string usfm, bool preserveWhitespace =
                     else if (tokens[i - 1].Type == UsfmTokenType.End)
                     {
                         // Insert space token after * of end marker
-                        int colNum = usfm.Length + 1 - Math.Max(usfm.LastIndexOf('\n', index), 0);
+                        int colNum;
+                        if (index >= usfm.Length)
+                        {
+                            colNum = usfm.Length + 1;
+                        }
+                        else
+                        {
+                            colNum = usfm.Length + 1 - Math.Max(usfm.LastIndexOf('\n', index), 0);
+                        }
+
                         tokens.Insert(
                             i,
                             new UsfmToken(UsfmTokenType.Text, null, " ", null)
diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
index 6137246a..d77c9035 100644
--- a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
+++ b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
@@ -47,6 +47,19 @@ public void Detokenize()
         Assert.That(result, Is.EqualTo(usfm));
     }
 
+    [Test]
+    public void Tokenize_Ending_ParagraphMarker()
+    {
+        //The ending paragraph marker should not crash the parser.
+        string usfm =
+            @"\id MAT - Test
+\c 1
+\v 1 Descriptive title\x - \xo 18:16 \xt hello world\x*\p
+";
+        IReadOnlyList<UsfmToken> tokens = new UsfmTokenizer().Tokenize(usfm);
+        Assert.That(tokens, Has.Count.EqualTo(13));
+    }
+
     private static string ReadUsfm()
     {
         return File.ReadAllText(Path.Combine(CorporaTestHelpers.UsfmTestProjectPath, "41MATTes.SFM"));