From da949b98667f8581ac93eb5a66f5f50f05638aaf Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 5 Jun 2024 08:12:55 -0400 Subject: [PATCH] Fix USFM parsing/generation issues - fixes https://github.com/sillsdev/serval/issues/399 - fixes https://github.com/sillsdev/serval/issues/398 - Line and column number with USFM errors. --- .ignore | 2 + .../ScriptureRefUsfmParserHandlerBase.cs | 49 +++++-- src/SIL.Machine/Corpora/UsfmParser.cs | 3 + src/SIL.Machine/Corpora/UsfmParserState.cs | 5 + src/SIL.Machine/Corpora/UsfmTextBase.cs | 8 +- src/SIL.Machine/Corpora/UsfmToken.cs | 2 + src/SIL.Machine/Corpora/UsfmTokenizer.cs | 128 ++++++++++++++++-- .../Corpora/TestData/usfm/Tes/41MATTes.SFM | 4 +- .../Corpora/UsfmFileTextTests.cs | 2 +- .../Corpora/UsfmManualTests.cs | 26 +++- .../Corpora/UsfmTokenizerTests.cs | 32 +++-- 11 files changed, 213 insertions(+), 48 deletions(-) create mode 100644 .ignore diff --git a/.ignore b/.ignore new file mode 100644 index 000000000..4fa382dd5 --- /dev/null +++ b/.ignore @@ -0,0 +1,2 @@ +!tests/SIL.Machine.Tests/Corpora/TestData/usfm/source/* +!tests/SIL.Machine.Tests/Corpora/TestData/usfm/target/* diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs index 86f20f481..661d7fdd6 100644 --- a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs +++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs @@ -150,8 +150,10 @@ public override void EndSidebar(UsfmParserState state, string marker, bool close public override void StartNote(UsfmParserState state, string marker, string caller, string category) { - if (CurrentTextType != ScriptureTextType.None) + if (CurrentTextType != ScriptureTextType.None && !_duplicateVerse) { + // if we hit a note in a verse paragraph and we aren't in a verse, then start a non-verse segment + CheckConvertVerseParaToNonVerse(state); NextElement(marker); StartNoteText(state); } @@ -159,26 +161,27 @@ public override void StartNote(UsfmParserState state, string marker, string call public override void EndNote(UsfmParserState state, string marker, bool closed) { - if (CurrentTextType == ScriptureTextType.Note) + if (CurrentTextType == ScriptureTextType.Note && !_duplicateVerse) EndNoteText(state); } public override void Text(UsfmParserState state, string text) { // if we hit text in a verse paragraph and we aren't in a verse, then start a non-verse segment - UsfmTag paraTag = state.ParaTag; - if ( - CurrentTextType == ScriptureTextType.None - && paraTag != null - && paraTag.Marker != "tr" - && state.IsVerseText - && _curVerseRef.VerseNum == 0 - && text.Trim().Length > 0 - ) - { - StartParentElement(paraTag.Marker); - StartNonVerseText(state); - } + if (text.Trim().Length > 0) + CheckConvertVerseParaToNonVerse(state); + } + + public override void StartChar( + UsfmParserState state, + string markerWithoutPlus, + bool unknown, + IReadOnlyList attributes + ) + { + // if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse + // segment + CheckConvertVerseParaToNonVerse(state); } protected virtual void StartVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) { } @@ -273,5 +276,21 @@ private ScriptureRef CreateNonVerseRef() _curElements.Where(e => e.Position > 0).Reverse() ); } + + private void CheckConvertVerseParaToNonVerse(UsfmParserState state) + { + UsfmTag paraTag = state.ParaTag; + if ( + CurrentTextType == ScriptureTextType.None + && paraTag != null + && paraTag.Marker != "tr" + && state.IsVersePara + && _curVerseRef.VerseNum == 0 + ) + { + StartParentElement(paraTag.Marker); + StartNonVerseText(state); + } + } } } diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index ce5047626..c17afb387 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -153,6 +153,9 @@ public bool ProcessToken() // Move to next token State.Index++; + State.LineNumber = State.Token.LineNumber; + State.ColumnNumber = State.Token.ColumnNumber; + // Update verse offset with previous token (since verse offset is from start of current token) if (State.PrevToken != null) State.VerseOffset += State.PrevToken.GetLength(addSpaces: !TokensPreserveWhitespace); diff --git a/src/SIL.Machine/Corpora/UsfmParserState.cs b/src/SIL.Machine/Corpora/UsfmParserState.cs index 7f44d6022..1ad2c85b0 100644 --- a/src/SIL.Machine/Corpora/UsfmParserState.cs +++ b/src/SIL.Machine/Corpora/UsfmParserState.cs @@ -19,6 +19,8 @@ public UsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOn _stack = new List(); VerseRef = new VerseRef(versification); VerseOffset = 0; + LineNumber = 1; + ColumnNumber = 0; Tokens = tokens; } @@ -59,6 +61,9 @@ public UsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOn /// public int VerseOffset { get; internal set; } + public int LineNumber { get; internal set; } + public int ColumnNumber { get; internal set; } + /// /// True if the token processed is part of a special indivisible group /// of tokens (link or chapter/verse alternate/publishable) diff --git a/src/SIL.Machine/Corpora/UsfmTextBase.cs b/src/SIL.Machine/Corpora/UsfmTextBase.cs index d0f6f27db..7b0c016f4 100644 --- a/src/SIL.Machine/Corpora/UsfmTextBase.cs +++ b/src/SIL.Machine/Corpora/UsfmTextBase.cs @@ -53,9 +53,8 @@ protected override IEnumerable GetVersesInDocOrder() sb.Append($"An error occurred while parsing the text '{Id}`"); if (!string.IsNullOrEmpty(Project)) sb.Append($" in project '{Project}'"); - sb.Append( - $". Verse: {parser.State.VerseRef}, offset: {parser.State.VerseOffset}, error: '{ex.Message}'" - ); + sb.Append($". Verse: {parser.State.VerseRef}, line: {parser.State.LineNumber}, "); + sb.Append($"column: {parser.State.ColumnNumber}, error: '{ex.Message}'"); throw new InvalidOperationException(sb.ToString(), ex); } return rowCollector.Rows; @@ -168,6 +167,9 @@ bool closed { base.EndChar(state, marker, attributes, closed); + if (_rowTexts.Count == 0) + return; + if (_text._includeMarkers && attributes != null && state.PrevToken?.Type == UsfmTokenType.Attribute) _rowTexts.Peek().Append(state.PrevToken); diff --git a/src/SIL.Machine/Corpora/UsfmToken.cs b/src/SIL.Machine/Corpora/UsfmToken.cs index 8f0cbc4cc..c0b105b9c 100644 --- a/src/SIL.Machine/Corpora/UsfmToken.cs +++ b/src/SIL.Machine/Corpora/UsfmToken.cs @@ -55,6 +55,8 @@ public UsfmToken(UsfmTokenType type, string marker, string text, string endMarke public string Data { get; } + public int LineNumber { get; internal set; } = -1; + public int ColumnNumber { get; internal set; } = -1; public IReadOnlyList Attributes { get; private set; } public string NestlessMarker diff --git a/src/SIL.Machine/Corpora/UsfmTokenizer.cs b/src/SIL.Machine/Corpora/UsfmTokenizer.cs index 1e9d9bf31..32ba32fd0 100644 --- a/src/SIL.Machine/Corpora/UsfmTokenizer.cs +++ b/src/SIL.Machine/Corpora/UsfmTokenizer.cs @@ -42,12 +42,18 @@ public IReadOnlyList Tokenize(string usfm, bool preserveWhitespace = List tokens = new List(); int index = 0; // Current position + int lineNum = 1; // Current line number + int previousIndex = 0; while (index < usfm.Length) { int nextMarkerIndex = (index < usfm.Length - 1) ? usfm.IndexOf('\\', index + 1) : -1; if (nextMarkerIndex == -1) nextMarkerIndex = usfm.Length; + lineNum += usfm.Substring(previousIndex, index - previousIndex).Count(c => c == '\n'); + int colNum = index - usfm.LastIndexOf('\n', index); + previousIndex = index; + // If text, create text token until end or next \ var ch = usfm[index]; if (ch != '\\') @@ -61,11 +67,21 @@ public IReadOnlyList Tokenize(string usfm, bool preserveWhitespace = preserveWhitespace, tokens, nextMarkerIndex, - ref text + ref text, + lineNum, + colNum ); if (text.Length > 0) - tokens.Add(new UsfmToken(UsfmTokenType.Text, null, text, null)); + { + tokens.Add( + new UsfmToken(UsfmTokenType.Text, null, text, null) + { + LineNumber = lineNum, + ColumnNumber = colNum + } + ); + } if (attributeToken != null) tokens.Add(attributeToken); @@ -161,11 +177,21 @@ ref text null, GetNextWord(usfm, ref index, preserveWhitespace) ) + { + LineNumber = lineNum, + ColumnNumber = colNum + } ); } else { - tokens.Add(new UsfmToken(UsfmTokenType.Character, marker, null, endMarker)); + tokens.Add( + new UsfmToken(UsfmTokenType.Character, marker, null, endMarker) + { + LineNumber = lineNum, + ColumnNumber = colNum + } + ); } break; case UsfmStyleType.Paragraph: @@ -180,6 +206,10 @@ ref text null, GetNextWord(usfm, ref index, preserveWhitespace) ) + { + LineNumber = lineNum, + ColumnNumber = colNum + } ); } else if ((tag.TextProperties & UsfmTextProperties.Book) > 0) @@ -192,11 +222,21 @@ ref text null, GetNextWord(usfm, ref index, preserveWhitespace) ) + { + LineNumber = lineNum, + ColumnNumber = colNum + } ); } else { - tokens.Add(new UsfmToken(UsfmTokenType.Paragraph, marker, null, endMarker)); + tokens.Add( + new UsfmToken(UsfmTokenType.Paragraph, marker, null, endMarker) + { + LineNumber = lineNum, + ColumnNumber = colNum + } + ); } break; @@ -209,16 +249,32 @@ ref text endMarker, GetNextWord(usfm, ref index, preserveWhitespace) ) + { + LineNumber = lineNum, + ColumnNumber = colNum + } ); break; case UsfmStyleType.End: - tokens.Add(new UsfmToken(UsfmTokenType.End, marker, null, null)); + tokens.Add( + new UsfmToken(UsfmTokenType.End, marker, null, null) + { + LineNumber = lineNum, + ColumnNumber = colNum + } + ); break; case UsfmStyleType.Unknown: // End tokens are always end tokens, even if unknown if (marker.EndsWith("*", StringComparison.Ordinal)) { - tokens.Add(new UsfmToken(UsfmTokenType.End, marker, null, null)); + tokens.Add( + new UsfmToken(UsfmTokenType.End, marker, null, null) + { + LineNumber = lineNum, + ColumnNumber = colNum + } + ); } else { @@ -226,11 +282,23 @@ ref text // but are always sidebars and so should be tokenized as paragraphs if (marker == "esb" || marker == "esbe") { - tokens.Add(new UsfmToken(UsfmTokenType.Paragraph, marker, null, endMarker)); + tokens.Add( + new UsfmToken(UsfmTokenType.Paragraph, marker, null, endMarker) + { + LineNumber = lineNum, + ColumnNumber = colNum + } + ); break; } // Create unknown token with a corresponding end note - tokens.Add(new UsfmToken(UsfmTokenType.Unknown, marker, null, marker + "*")); + tokens.Add( + new UsfmToken(UsfmTokenType.Unknown, marker, null, marker + "*") + { + LineNumber = lineNum, + ColumnNumber = colNum + } + ); } break; case UsfmStyleType.Milestone: @@ -247,16 +315,34 @@ ref text // add back space that was removed after marker if (milestoneText.Length > 0 && milestoneText[0] != ' ' && milestoneText[0] != '|') milestoneText = " " + milestoneText; - tokens.Add(new UsfmToken(UsfmTokenType.Text, null, @"\" + marker + milestoneText, null)); + tokens.Add( + new UsfmToken(UsfmTokenType.Text, null, @"\" + marker + milestoneText, null) + { + LineNumber = lineNum, + ColumnNumber = colNum + } + ); index = endOfText; } else if (tag.StyleType == UsfmStyleType.Milestone) { - tokens.Add(new UsfmToken(UsfmTokenType.Milestone, marker, null, endMarker)); + tokens.Add( + new UsfmToken(UsfmTokenType.Milestone, marker, null, endMarker) + { + LineNumber = lineNum, + ColumnNumber = colNum + } + ); } else { - tokens.Add(new UsfmToken(UsfmTokenType.MilestoneEnd, marker, null, null)); + tokens.Add( + new UsfmToken(UsfmTokenType.MilestoneEnd, marker, null, null) + { + LineNumber = lineNum, + ColumnNumber = colNum + } + ); } break; @@ -299,7 +385,15 @@ ref text else if (tokens[i - 1].Type == UsfmTokenType.End) { // Insert space token after * of end marker - tokens.Insert(i, new UsfmToken(UsfmTokenType.Text, null, " ", null)); + int colNum = usfm.Length + 1 - Math.Max(usfm.LastIndexOf('\n', index), 0); + tokens.Insert( + i, + new UsfmToken(UsfmTokenType.Text, null, " ", null) + { + LineNumber = lineNum, + ColumnNumber = colNum + } + ); i++; } } @@ -504,7 +598,9 @@ private UsfmToken HandleAttributes( bool preserveWhitespace, List tokens, int nextMarkerIndex, - ref string text + ref string text, + int lineNumber, + int columnNumber ) { int attributeIndex = text.IndexOf('|'); @@ -547,7 +643,11 @@ ref string text null, null, attributesValue - ); + ) + { + LineNumber = lineNumber, + ColumnNumber = columnNumber + attributeIndex + }; attributeToken.CopyAttributes(matchingToken); } } diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM index 43b266563..3a9e689bd 100644 --- a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM +++ b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM @@ -3,7 +3,7 @@ \h Matthew \mt Matthew \ip An introduction to Matthew\fe + \ft This is an endnote.\fe* -\p Here is another paragraph. +\p \rq MAT 1\rq* Here is another paragraph. \p and with a \w keyword|a special concept\w* in it. \p and a \weirdtaglookingthing that is not an actual tag. \c 1 @@ -38,7 +38,7 @@ \p \v 6 Chapter two, verse \w six|strong="12345" \w*. \p -\v 6 Bad verse. +\v 6 Bad verse. \x - \xo abc\xt 123\x* and more content. \p \v 5 Chapter two, verse five \rq (MAT 3:1)\rq*. \v 7a Chapter two, verse seven A, diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs index c2f51a53a..03322649f 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs @@ -88,7 +88,7 @@ public void GetRows_NonEmptyText_AllText() Assert.That(rows[3].Text, Is.EqualTo("This is an endnote.")); Assert.That(rows[4].Ref, Is.EqualTo(ScriptureRef.Parse("MAT 1:0/4:p", corpus.Versification))); - Assert.That(rows[4].Text, Is.EqualTo("Here is another paragraph.")); + Assert.That(rows[4].Text, Is.EqualTo("MAT 1 Here is another paragraph.")); Assert.That( rows[7].Ref, diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs index 33b6cbb13..f0b636cfb 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs @@ -8,7 +8,7 @@ public class UsfmManualTests { [Test] [Ignore("This is for manual testing only. Remove this tag to run the test.")] - public void ParseParallelCorpus() + public async Task ParseParallelCorpusAsync() { ParatextTextCorpus tCorpus = new(projectDir: CorporaTestHelpers.UsfmTargetProjectPath, includeAllText: true, includeMarkers: true); @@ -25,6 +25,30 @@ public void ParseParallelCorpus() List rows = pCorpus.GetRows().ToList(); Assert.That(rows, Has.Count.GreaterThan(0)); + + // insert the source into the target as pretranslations to make sure that USFM generation works + IReadOnlyList<(IReadOnlyList, string)> pretranslations = rows.Select(r => + ((IReadOnlyList)r.SourceRefs.Select(s => (ScriptureRef)s).ToList(), r.SourceText) + ) + .ToList(); + + ParatextProjectSettings targetSettings = new FileParatextProjectSettingsParser( + CorporaTestHelpers.UsfmTargetProjectPath + ).Parse(); + + foreach ( + string sfmFileName in Directory.EnumerateFiles( + CorporaTestHelpers.UsfmTargetProjectPath, + $"{targetSettings.FileNamePrefix}*{targetSettings.FileNameSuffix}" + ) + ) + { + var updater = new UsfmTextUpdater(pretranslations, stripAllText: true, preferExistingText: false); + string usfm = await File.ReadAllTextAsync(sfmFileName); + UsfmParser.Parse(usfm, updater, targetSettings.Stylesheet, targetSettings.Versification); + string newUsfm = updater.GetUsfm(targetSettings.Stylesheet); + Assert.That(newUsfm, Is.Not.Null); + } } public record PretranslationDto diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs index 5cdb0e2cc..6bd7f254c 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs @@ -11,22 +11,30 @@ public void Tokenize() string usfm = ReadUsfm(); var tokenizer = new UsfmTokenizer(); IReadOnlyList tokens = tokenizer.Tokenize(usfm); - Assert.That(tokens, Has.Count.EqualTo(224)); + Assert.That(tokens, Has.Count.EqualTo(234)); Assert.That(tokens[0].Type, Is.EqualTo(UsfmTokenType.Book)); Assert.That(tokens[0].Marker, Is.EqualTo("id")); Assert.That(tokens[0].Data, Is.EqualTo("MAT")); - - Assert.That(tokens[34].Type, Is.EqualTo(UsfmTokenType.Text)); - Assert.That(tokens[34].Text, Is.EqualTo("Chapter One ")); - - Assert.That(tokens[35].Type, Is.EqualTo(UsfmTokenType.Verse)); - Assert.That(tokens[35].Marker, Is.EqualTo("v")); - Assert.That(tokens[35].Data, Is.EqualTo("1")); - - Assert.That(tokens[44].Type, Is.EqualTo(UsfmTokenType.Note)); - Assert.That(tokens[44].Marker, Is.EqualTo("f")); - Assert.That(tokens[44].Data, Is.EqualTo("+")); + Assert.That(tokens[0].LineNumber, Is.EqualTo(1)); + Assert.That(tokens[0].ColumnNumber, Is.EqualTo(1)); + + Assert.That(tokens[37].Type, Is.EqualTo(UsfmTokenType.Text)); + Assert.That(tokens[37].Text, Is.EqualTo("Chapter One ")); + Assert.That(tokens[37].LineNumber, Is.EqualTo(10)); + Assert.That(tokens[37].ColumnNumber, Is.EqualTo(4)); + + Assert.That(tokens[38].Type, Is.EqualTo(UsfmTokenType.Verse)); + Assert.That(tokens[38].Marker, Is.EqualTo("v")); + Assert.That(tokens[38].Data, Is.EqualTo("1")); + Assert.That(tokens[38].LineNumber, Is.EqualTo(11)); + Assert.That(tokens[38].ColumnNumber, Is.EqualTo(1)); + + Assert.That(tokens[47].Type, Is.EqualTo(UsfmTokenType.Note)); + Assert.That(tokens[47].Marker, Is.EqualTo("f")); + Assert.That(tokens[47].Data, Is.EqualTo("+")); + Assert.That(tokens[47].LineNumber, Is.EqualTo(11)); + Assert.That(tokens[47].ColumnNumber, Is.EqualTo(52)); } [Test]