From 27180f4090ca8d723745a6e02d930d9b0e677de4 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 25 Jul 2024 18:16:39 -0400 Subject: [PATCH] *Add custom exception for parsing *Fix off-by-one error *Handle triplicate, quadruplicate, n-plicate verses *Add test to cover triplicate verse --- .../ScriptureRefUsfmParserHandlerBase.cs | 2 +- src/SIL.Machine/Corpora/UsfmParser.cs | 749 +++++++++--------- .../Corpora/UsfmParsingException.cs | 14 + src/SIL.Machine/Corpora/UsfmTextUpdater.cs | 3 +- .../Corpora/UsfmMemoryTextTests.cs | 16 + 5 files changed, 414 insertions(+), 370 deletions(-) create mode 100644 src/SIL.Machine/Corpora/UsfmParsingException.cs diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs index 7d9e3391d..f7e9d5b73 100644 --- a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs +++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs @@ -53,7 +53,7 @@ public override void Verse( string pubNumber ) { - if (state.VerseRef.Equals(_curVerseRef)) + if (state.VerseRef.Equals(_curVerseRef) && !_duplicateVerse) { EndVerseText(state, CreateVerseRefs()); // ignore duplicate verses diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index c17afb387..40b4a91b7 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -1,4 +1,5 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; using System.Linq; using System.Text.RegularExpressions; using SIL.Scripture; @@ -138,288 +139,382 @@ public void ProcessTokens() /// false if there were no more tokens process public bool ProcessToken() { - // If past end - if (State.Index >= State.Tokens.Count - 1) + try { - CloseAll(); - Handler?.EndUsfm(State); - return false; - } - else if (State.Index < 0) - { - Handler?.StartUsfm(State); - } + // If past end + if (State.Index >= State.Tokens.Count - 1) + { + CloseAll(); + 
Handler?.EndUsfm(State); + return false; + } + else if (State.Index < 0) + { + Handler?.StartUsfm(State); + } - // Move to next token - State.Index++; + // Move to next token + State.Index++; - State.LineNumber = State.Token.LineNumber; - State.ColumnNumber = State.Token.ColumnNumber; + State.LineNumber = State.Token.LineNumber; + State.ColumnNumber = State.Token.ColumnNumber; - // Update verse offset with previous token (since verse offset is from start of current token) - if (State.PrevToken != null) - State.VerseOffset += State.PrevToken.GetLength(addSpaces: !TokensPreserveWhitespace); + // Update verse offset with previous token (since verse offset is from start of current token) + if (State.PrevToken != null) + State.VerseOffset += State.PrevToken.GetLength(addSpaces: !TokensPreserveWhitespace); - // Skip over tokens that are to be skipped, ensuring that - // SpecialToken state is true. - if (State.SpecialTokenCount > 0) - { - State.SpecialTokenCount--; - State.SpecialToken = true; - return true; - } + // Skip over tokens that are to be skipped, ensuring that + // SpecialToken state is true. 
+ if (State.SpecialTokenCount > 0) + { + State.SpecialTokenCount--; + State.SpecialToken = true; + return true; + } - // Reset special token and figure status - State.SpecialToken = false; + // Reset special token and figure status + State.SpecialToken = false; - UsfmToken token = State.Token; + UsfmToken token = State.Token; - // Switch unknown types to either character or paragraph - UsfmTokenType tokenType = token.Type; - if (tokenType == UsfmTokenType.Unknown) - tokenType = DetermineUnknownTokenType(); + // Switch unknown types to either character or paragraph + UsfmTokenType tokenType = token.Type; + if (tokenType == UsfmTokenType.Unknown) + tokenType = DetermineUnknownTokenType(); - if (Handler != null && !string.IsNullOrEmpty(token.Marker)) - Handler.GotMarker(State, token.Marker); + if (Handler != null && !string.IsNullOrEmpty(token.Marker)) + Handler.GotMarker(State, token.Marker); - // Close open elements - switch (tokenType) - { - case UsfmTokenType.Book: - case UsfmTokenType.Chapter: - CloseAll(); - break; - case UsfmTokenType.Paragraph: - // Handle special case of table rows - if (token.Marker == "tr") - { - // Close all but table and sidebar - while ( - State.Stack.Count > 0 - && State.Peek().Type != UsfmElementType.Table - && State.Peek().Type != UsfmElementType.Sidebar - ) + // Close open elements + switch (tokenType) + { + case UsfmTokenType.Book: + case UsfmTokenType.Chapter: + CloseAll(); + break; + case UsfmTokenType.Paragraph: + // Handle special case of table rows + if (token.Marker == "tr") { - CloseElement(); - } + // Close all but table and sidebar + while ( + State.Stack.Count > 0 + && State.Peek().Type != UsfmElementType.Table + && State.Peek().Type != UsfmElementType.Sidebar + ) + { + CloseElement(); + } - break; - } + break; + } - // Handle special case of sidebars - if (token.Marker == "esb") - { - // Close all - CloseAll(); - break; - } + // Handle special case of sidebars + if (token.Marker == "esb") + { + // Close all + CloseAll(); 
+ break; + } - // Close all but sidebar - while (State.Stack.Count > 0 && State.Peek().Type != UsfmElementType.Sidebar) - CloseElement(); - break; - case UsfmTokenType.Character: - // Handle special case of table cell - if (IsCell(token)) - { - // Close until row - while (State.Peek().Type != UsfmElementType.Row) + // Close all but sidebar + while (State.Stack.Count > 0 && State.Peek().Type != UsfmElementType.Sidebar) CloseElement(); break; - } + case UsfmTokenType.Character: + // Handle special case of table cell + if (IsCell(token)) + { + // Close until row + while (State.Peek().Type != UsfmElementType.Row) + CloseElement(); + break; + } - // Handle refs - if (IsRef(token)) - { - // Refs don't close anything - break; - } + // Handle refs + if (IsRef(token)) + { + // Refs don't close anything + break; + } - // If non-nested character style, close all character styles - if (!token.Marker.StartsWith("+")) - CloseCharStyles(); - break; - case UsfmTokenType.Verse: - UsfmTag paraTag = State.ParaTag; - if (paraTag != null && paraTag.TextType != UsfmTextType.VerseText && paraTag.TextType != 0) - CloseAll(); - else + // If non-nested character style, close all character styles + if (!token.Marker.StartsWith("+")) + CloseCharStyles(); + break; + case UsfmTokenType.Verse: + UsfmTag paraTag = State.ParaTag; + if (paraTag != null && paraTag.TextType != UsfmTextType.VerseText && paraTag.TextType != 0) + CloseAll(); + else + CloseNote(); + break; + case UsfmTokenType.Note: CloseNote(); - break; - case UsfmTokenType.Note: - CloseNote(); - break; - case UsfmTokenType.End: - // If end marker for an active note - if (State.Stack.Any(e => e.Type == UsfmElementType.Note && (e.Marker + "*" == token.Marker))) - { - CloseNote(closed: true); break; - } - - // If end marker for a character style on stack, close it - // If no matching end marker, close all character styles on top of stack - UsfmParserElement elem; - bool unmatched = true; - while (State.Stack.Count > 0) - { - elem = 
State.Peek(); - if (elem.Type != UsfmElementType.Char) + case UsfmTokenType.End: + // If end marker for an active note + if (State.Stack.Any(e => e.Type == UsfmElementType.Note && (e.Marker + "*" == token.Marker))) + { + CloseNote(closed: true); break; + } - // Determine if a + prefix is needed to close it (was nested char style) - bool plusPrefix = - State.Stack.Count > 1 && State.Stack[State.Stack.Count - 2].Type == UsfmElementType.Char; + // If end marker for a character style on stack, close it + // If no matching end marker, close all character styles on top of stack + UsfmParserElement elem; + bool unmatched = true; + while (State.Stack.Count > 0) + { + elem = State.Peek(); + if (elem.Type != UsfmElementType.Char) + break; + + // Determine if a + prefix is needed to close it (was nested char style) + bool plusPrefix = + State.Stack.Count > 1 + && State.Stack[State.Stack.Count - 2].Type == UsfmElementType.Char; + + // If is a match + if ((plusPrefix ? "+" : "") + elem.Marker + "*" == token.Marker) + { + CloseElement(closed: true); + + unmatched = false; + break; + } + else + { + CloseElement(); + } + } + + // Unmatched end marker + if (unmatched) + Handler?.Unmatched(State, token.Marker); + break; + } + + VerseRef vref; + // Handle tokens + switch (tokenType) + { + case UsfmTokenType.Book: + State.Push(new UsfmParserElement(UsfmElementType.Book, token.Marker)); + + // Code is always upper case + string code = token.Data.ToUpperInvariant(); + + vref = State.VerseRef; + // Update verse ref. Leave book alone if not empty to prevent parsing errors + // on books with bad id lines. + if (vref.Book == "" && Canon.BookIdToNumber(code) != 0) + vref.Book = code; + vref.ChapterNum = 1; + vref.VerseNum = 0; + State.VerseRef = vref; + State.VerseOffset = 0; - // If is a match - if ((plusPrefix ? "+" : "") + elem.Marker + "*" == token.Marker) + // Book start. 
+ Handler?.StartBook(State, token.Marker, code); + break; + case UsfmTokenType.Chapter: + // Get alternate chapter number + string altChapter = null; + string pubChapter = null; + if ( + State.Index < State.Tokens.Count - 3 + && State.Tokens[State.Index + 1].Marker == "ca" + && State.Tokens[State.Index + 2].Text != null + && State.Tokens[State.Index + 3].Marker == "ca*" + ) { - CloseElement(closed: true); + altChapter = State.Tokens[State.Index + 2].Text.Trim(); + State.SpecialTokenCount += 3; - unmatched = false; - break; + // Skip blank space after if present + if ( + State.Index + State.SpecialTokenCount < State.Tokens.Count - 1 + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text != null + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text.Trim().Length == 0 + ) + { + State.SpecialTokenCount++; + } } - else + + // Get publishable chapter number + if ( + State.Index + State.SpecialTokenCount < State.Tokens.Count - 2 + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "cp" + && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null + ) { - CloseElement(); + pubChapter = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); + State.SpecialTokenCount += 2; } - } - // Unmatched end marker - if (unmatched) - Handler?.Unmatched(State, token.Marker); - break; - } + // Chapter + vref = State.VerseRef; + vref.Chapter = token.Data; + vref.VerseNum = 0; + State.VerseRef = vref; + // Verse offset is not zeroed for chapter 1, as it is part of intro + if (State.VerseRef.ChapterNum != 1) + State.VerseOffset = 0; - VerseRef vref; - // Handle tokens - switch (tokenType) - { - case UsfmTokenType.Book: - State.Push(new UsfmParserElement(UsfmElementType.Book, token.Marker)); - - // Code is always upper case - string code = token.Data.ToUpperInvariant(); - - vref = State.VerseRef; - // Update verse ref. Leave book alone if not empty to prevent parsing errors - // on books with bad id lines. 
- if (vref.Book == "" && Canon.BookIdToNumber(code) != 0) - vref.Book = code; - vref.ChapterNum = 1; - vref.VerseNum = 0; - State.VerseRef = vref; - State.VerseOffset = 0; - - // Book start. - Handler?.StartBook(State, token.Marker, code); - break; - case UsfmTokenType.Chapter: - // Get alternate chapter number - string altChapter = null; - string pubChapter = null; - if ( - State.Index < State.Tokens.Count - 3 - && State.Tokens[State.Index + 1].Marker == "ca" - && State.Tokens[State.Index + 2].Text != null - && State.Tokens[State.Index + 3].Marker == "ca*" - ) - { - altChapter = State.Tokens[State.Index + 2].Text.Trim(); - State.SpecialTokenCount += 3; - - // Skip blank space after if present + Handler?.Chapter(State, token.Data, token.Marker, altChapter, pubChapter); + break; + case UsfmTokenType.Verse: + string pubVerse = null; + string altVerse = null; if ( - State.Index + State.SpecialTokenCount < State.Tokens.Count - 1 - && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text != null - && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text.Trim().Length == 0 + State.Index < State.Tokens.Count - 3 + && State.Tokens[State.Index + 1].Marker == "va" + && State.Tokens[State.Index + 2].Text != null + && State.Tokens[State.Index + 3].Marker == "va*" ) { - State.SpecialTokenCount++; + // Get alternate verse number + altVerse = State.Tokens[State.Index + 2].Text.Trim(); + State.SpecialTokenCount += 3; + } + if ( + State.Index + State.SpecialTokenCount < State.Tokens.Count - 3 + && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "vp" + && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null + && State.Tokens[State.Index + State.SpecialTokenCount + 3].Marker == "vp*" + ) + { + // Get publishable verse number + pubVerse = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); + State.SpecialTokenCount += 3; } - } - - // Get publishable chapter number - if ( - State.Index + State.SpecialTokenCount < 
State.Tokens.Count - 2 - && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "cp" - && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null - ) - { - pubChapter = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); - State.SpecialTokenCount += 2; - } - - // Chapter - vref = State.VerseRef; - vref.Chapter = token.Data; - vref.VerseNum = 0; - State.VerseRef = vref; - // Verse offset is not zeroed for chapter 1, as it is part of intro - if (State.VerseRef.ChapterNum != 1) + + // Verse + vref = State.VerseRef; + vref.Verse = token.Data; + State.VerseRef = vref; State.VerseOffset = 0; - Handler?.Chapter(State, token.Data, token.Marker, altChapter, pubChapter); - break; - case UsfmTokenType.Verse: - string pubVerse = null; - string altVerse = null; - if ( - State.Index < State.Tokens.Count - 3 - && State.Tokens[State.Index + 1].Marker == "va" - && State.Tokens[State.Index + 2].Text != null - && State.Tokens[State.Index + 3].Marker == "va*" - ) - { - // Get alternate verse number - altVerse = State.Tokens[State.Index + 2].Text.Trim(); - State.SpecialTokenCount += 3; - } - if ( - State.Index + State.SpecialTokenCount < State.Tokens.Count - 3 - && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "vp" - && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null - && State.Tokens[State.Index + State.SpecialTokenCount + 3].Marker == "vp*" - ) - { - // Get publishable verse number - pubVerse = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim(); - State.SpecialTokenCount += 3; - } - - // Verse - vref = State.VerseRef; - vref.Verse = token.Data; - State.VerseRef = vref; - State.VerseOffset = 0; - - Handler?.Verse(State, token.Data, token.Marker, altVerse, pubVerse); - break; - case UsfmTokenType.Paragraph: - // Handle special case of table rows - if (token.Marker == "tr") - { - // Start table if not open - if (State.Stack.All(e => e.Type != UsfmElementType.Table)) + 
Handler?.Verse(State, token.Data, token.Marker, altVerse, pubVerse); + break; + case UsfmTokenType.Paragraph: + // Handle special case of table rows + if (token.Marker == "tr") + { + // Start table if not open + if (State.Stack.All(e => e.Type != UsfmElementType.Table)) + { + State.Push(new UsfmParserElement(UsfmElementType.Table, null)); + Handler?.StartTable(State); + } + + State.Push(new UsfmParserElement(UsfmElementType.Row, token.Marker)); + + // Row start + Handler?.StartRow(State, token.Marker); + break; + } + + // Handle special case of sidebars + if (token.Marker == "esb") { - State.Push(new UsfmParserElement(UsfmElementType.Table, null)); - Handler?.StartTable(State); + State.Push(new UsfmParserElement(UsfmElementType.Sidebar, token.Marker)); + + // Look for category + string sidebarCategory = null; + if ( + State.Index < State.Tokens.Count - 3 + && State.Tokens[State.Index + 1].Marker == "cat" + && State.Tokens[State.Index + 2].Text != null + && State.Tokens[State.Index + 3].Marker == "cat*" + ) + { + // Get category + sidebarCategory = State.Tokens[State.Index + 2].Text.Trim(); + State.SpecialTokenCount += 3; + } + + Handler?.StartSidebar(State, token.Marker, sidebarCategory); + break; } - State.Push(new UsfmParserElement(UsfmElementType.Row, token.Marker)); + // Close sidebar if in sidebar + if (token.Marker == "esbe") + { + if (State.Stack.Any(e => e.Type == UsfmElementType.Sidebar)) + { + while (State.Stack.Count > 0) + CloseElement(State.Peek().Type == UsfmElementType.Sidebar); + } + else + { + Handler?.Unmatched(State, token.Marker); + } + break; + } - // Row start - Handler?.StartRow(State, token.Marker); + State.Push(new UsfmParserElement(UsfmElementType.Para, token.Marker)); + + // Paragraph opening + Handler?.StartPara(State, token.Marker, token.Type == UsfmTokenType.Unknown, token.Attributes); break; - } + case UsfmTokenType.Character: + // Handle special case of table cells (treated as special character style) + if (IsCell(token)) + { + 
string align = "start"; + if (token.Marker.Length > 2 && token.Marker[2] == 'c') + align = "center"; + else if (token.Marker.Length > 2 && token.Marker[2] == 'r') + align = "end"; + + UsfmStylesheet.IsCellRange(token.Marker, out string baseMarker, out int colspan); + State.Push(new UsfmParserElement(UsfmElementType.Cell, baseMarker)); + + Handler?.StartCell(State, baseMarker, align, colspan); + break; + } + + if (IsRef(token)) + { + // xrefs are special tokens (they do not stand alone) + State.SpecialToken = true; + + ParseDisplayAndTarget(out string display, out string target); + + State.SpecialTokenCount += 2; - // Handle special case of sidebars - if (token.Marker == "esb") - { - State.Push(new UsfmParserElement(UsfmElementType.Sidebar, token.Marker)); + Handler?.Ref(State, token.Marker, display, target); + break; + } + string actualMarker; + bool invalidMarker = false; + if (token.Marker.StartsWith("+")) + { + // Only strip + if properly nested + UsfmTag charTag = State.CharTag; + actualMarker = charTag != null ? 
token.Marker.TrimStart('+') : token.Marker; + invalidMarker = charTag == null; + } + else + { + actualMarker = token.Marker; + } + + State.Push(new UsfmParserElement(UsfmElementType.Char, actualMarker, token.Attributes)); + Handler?.StartChar( + State, + actualMarker, + token.Type == UsfmTokenType.Unknown || invalidMarker, + token.Attributes + ); + break; + case UsfmTokenType.Note: // Look for category - string sidebarCategory = null; + string noteCategory = null; if ( State.Index < State.Tokens.Count - 3 && State.Tokens[State.Index + 1].Marker == "cat" @@ -428,148 +523,66 @@ public bool ProcessToken() ) { // Get category - sidebarCategory = State.Tokens[State.Index + 2].Text.Trim(); + noteCategory = State.Tokens[State.Index + 2].Text.Trim(); State.SpecialTokenCount += 3; } - Handler?.StartSidebar(State, token.Marker, sidebarCategory); + State.Push(new UsfmParserElement(UsfmElementType.Note, token.Marker)); + + Handler?.StartNote(State, token.Marker, token.Data, noteCategory); break; - } + case UsfmTokenType.Text: + string text = token.Text; - // Close sidebar if in sidebar - if (token.Marker == "esbe") - { - if (State.Stack.Any(e => e.Type == UsfmElementType.Sidebar)) + // If last token before a paragraph, book or chapter, esb, esbe (both are paragraph types), + // or at very end, strip final space + // This is because USFM requires these to be on a new line, therefore adding whitespace + if ( + ( + State.Index == State.Tokens.Count - 1 + || State.Tokens[State.Index + 1].Type == UsfmTokenType.Paragraph + || State.Tokens[State.Index + 1].Type == UsfmTokenType.Book + || State.Tokens[State.Index + 1].Type == UsfmTokenType.Chapter + ) + && text.Length > 0 + && text[text.Length - 1] == ' ' + ) { - while (State.Stack.Count > 0) - CloseElement(State.Peek().Type == UsfmElementType.Sidebar); + text = text.Substring(0, text.Length - 1); } - else + + if (Handler != null) { - Handler?.Unmatched(State, token.Marker); + // Replace ~ with nbsp + text = text.Replace('~', 
'\u00A0'); + + // Replace // with + foreach (string str in OptBreakSplitter.Split(text)) + { + if (str == "//") + Handler.OptBreak(State); + else + Handler.Text(State, str); + } } break; - } - - State.Push(new UsfmParserElement(UsfmElementType.Para, token.Marker)); - - // Paragraph opening - Handler?.StartPara(State, token.Marker, token.Type == UsfmTokenType.Unknown, token.Attributes); - break; - case UsfmTokenType.Character: - // Handle special case of table cells (treated as special character style) - if (IsCell(token)) - { - string align = "start"; - if (token.Marker.Length > 2 && token.Marker[2] == 'c') - align = "center"; - else if (token.Marker.Length > 2 && token.Marker[2] == 'r') - align = "end"; - - UsfmStylesheet.IsCellRange(token.Marker, out string baseMarker, out int colspan); - State.Push(new UsfmParserElement(UsfmElementType.Cell, baseMarker)); - - Handler?.StartCell(State, baseMarker, align, colspan); - break; - } - - if (IsRef(token)) - { - // xrefs are special tokens (they do not stand alone) - State.SpecialToken = true; - - ParseDisplayAndTarget(out string display, out string target); - - State.SpecialTokenCount += 2; - Handler?.Ref(State, token.Marker, display, target); + case UsfmTokenType.Milestone: + case UsfmTokenType.MilestoneEnd: + // currently, parse state doesn't need to be update, so just inform the handler about the milestone. + Handler?.Milestone( + State, + token.Marker, + token.Type == UsfmTokenType.Milestone, + token.Attributes + ); break; - } - - string actualMarker; - bool invalidMarker = false; - if (token.Marker.StartsWith("+")) - { - // Only strip + if properly nested - UsfmTag charTag = State.CharTag; - actualMarker = charTag != null ? 
token.Marker.TrimStart('+') : token.Marker; - invalidMarker = charTag == null; - } - else - { - actualMarker = token.Marker; - } - - State.Push(new UsfmParserElement(UsfmElementType.Char, actualMarker, token.Attributes)); - Handler?.StartChar( - State, - actualMarker, - token.Type == UsfmTokenType.Unknown || invalidMarker, - token.Attributes - ); - break; - case UsfmTokenType.Note: - // Look for category - string noteCategory = null; - if ( - State.Index < State.Tokens.Count - 3 - && State.Tokens[State.Index + 1].Marker == "cat" - && State.Tokens[State.Index + 2].Text != null - && State.Tokens[State.Index + 3].Marker == "cat*" - ) - { - // Get category - noteCategory = State.Tokens[State.Index + 2].Text.Trim(); - State.SpecialTokenCount += 3; - } - - State.Push(new UsfmParserElement(UsfmElementType.Note, token.Marker)); - - Handler?.StartNote(State, token.Marker, token.Data, noteCategory); - break; - case UsfmTokenType.Text: - string text = token.Text; - - // If last token before a paragraph, book or chapter, esb, esbe (both are paragraph types), - // or at very end, strip final space - // This is because USFM requires these to be on a new line, therefore adding whitespace - if ( - ( - State.Index == State.Tokens.Count - 1 - || State.Tokens[State.Index + 1].Type == UsfmTokenType.Paragraph - || State.Tokens[State.Index + 1].Type == UsfmTokenType.Book - || State.Tokens[State.Index + 1].Type == UsfmTokenType.Chapter - ) - && text.Length > 0 - && text[text.Length - 1] == ' ' - ) - { - text = text.Substring(0, text.Length - 1); - } - - if (Handler != null) - { - // Replace ~ with nbsp - text = text.Replace('~', '\u00A0'); - - // Replace // with - foreach (string str in OptBreakSplitter.Split(text)) - { - if (str == "//") - Handler.OptBreak(State); - else - Handler.Text(State, str); - } - } - break; - - case UsfmTokenType.Milestone: - case UsfmTokenType.MilestoneEnd: - // currently, parse state doesn't need to be update, so just inform the handler about the milestone. 
- Handler?.Milestone(State, token.Marker, token.Type == UsfmTokenType.Milestone, token.Attributes); - break; + } + } + catch (Exception e) + { + throw new UsfmParsingException(State, e); } - return true; } diff --git a/src/SIL.Machine/Corpora/UsfmParsingException.cs b/src/SIL.Machine/Corpora/UsfmParsingException.cs new file mode 100644 index 000000000..b3bcbbd59 --- /dev/null +++ b/src/SIL.Machine/Corpora/UsfmParsingException.cs @@ -0,0 +1,14 @@ +using System; +using System.Linq; + +namespace SIL.Machine.Corpora +{ + public class UsfmParsingException : Exception + { + public UsfmParsingException(UsfmParserState state, Exception exception) + : base( + $"Failed to parse at line {state.LineNumber} column {state.ColumnNumber} verse ref {state.VerseRef} with surrounding tokens [{string.Join(",", state.Tokens.Skip(Math.Max(state.Index - 3, 0)).Take(7).Select(t => $"{t.Text} (TokenType={t.Type})"))}]", + exception + ) { } + } +} diff --git a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs b/src/SIL.Machine/Corpora/UsfmTextUpdater.cs index 9265a3178..4ac11cd78 100644 --- a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs +++ b/src/SIL.Machine/Corpora/UsfmTextUpdater.cs @@ -361,7 +361,7 @@ private void SkipTokens(UsfmParserState state) private bool ReplaceWithNewTokens(UsfmParserState state) { bool newText = _replace.Count > 0 && _replace.Peek(); - int tokenEnd = state.Index + state.SpecialTokenCount + 1; + int tokenEnd = state.Index + state.SpecialTokenCount; bool existingText = false; for (int index = _tokenIndex; index <= tokenEnd; index++) { @@ -393,6 +393,7 @@ private void PushTokensAsPrevious() private void PopNewTokens() { + // if (_replace.Any()) _replace.Pop(); } } diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index b046be229..5a472b1f4 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ 
b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -70,6 +70,22 @@ public void GetRows_DuplicateVerseWithTable() Assert.That(rows, Has.Length.EqualTo(5)); } + [Test] + public void GetRows_TriplicateVerse() + { + TextRow[] rows = GetRows( + @"\id MAT - Test +\c 1 +\v 1 First verse +\v 1 First verse +\v 1 First verse +", + includeAllText: true + ); + + Assert.That(rows, Has.Length.EqualTo(1)); + } + [Test] public void GetRows_VersePara_BeginningNonVerseSegment() {