diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
index 7d9e3391..f7e9d5b7 100644
--- a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
+++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
@@ -53,7 +53,7 @@ public override void Verse(
string pubNumber
)
{
- if (state.VerseRef.Equals(_curVerseRef))
+ if (state.VerseRef.Equals(_curVerseRef) && !_duplicateVerse)
{
EndVerseText(state, CreateVerseRefs());
// ignore duplicate verses
diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs
index c17afb38..40b4a91b 100644
--- a/src/SIL.Machine/Corpora/UsfmParser.cs
+++ b/src/SIL.Machine/Corpora/UsfmParser.cs
@@ -1,4 +1,5 @@
-using System.Collections.Generic;
+using System;
+using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using SIL.Scripture;
@@ -138,288 +139,382 @@ public void ProcessTokens()
/// false if there were no more tokens process
public bool ProcessToken()
{
- // If past end
- if (State.Index >= State.Tokens.Count - 1)
+ try
{
- CloseAll();
- Handler?.EndUsfm(State);
- return false;
- }
- else if (State.Index < 0)
- {
- Handler?.StartUsfm(State);
- }
+ // If past end
+ if (State.Index >= State.Tokens.Count - 1)
+ {
+ CloseAll();
+ Handler?.EndUsfm(State);
+ return false;
+ }
+ else if (State.Index < 0)
+ {
+ Handler?.StartUsfm(State);
+ }
- // Move to next token
- State.Index++;
+ // Move to next token
+ State.Index++;
- State.LineNumber = State.Token.LineNumber;
- State.ColumnNumber = State.Token.ColumnNumber;
+ State.LineNumber = State.Token.LineNumber;
+ State.ColumnNumber = State.Token.ColumnNumber;
- // Update verse offset with previous token (since verse offset is from start of current token)
- if (State.PrevToken != null)
- State.VerseOffset += State.PrevToken.GetLength(addSpaces: !TokensPreserveWhitespace);
+ // Update verse offset with previous token (since verse offset is from start of current token)
+ if (State.PrevToken != null)
+ State.VerseOffset += State.PrevToken.GetLength(addSpaces: !TokensPreserveWhitespace);
- // Skip over tokens that are to be skipped, ensuring that
- // SpecialToken state is true.
- if (State.SpecialTokenCount > 0)
- {
- State.SpecialTokenCount--;
- State.SpecialToken = true;
- return true;
- }
+ // Skip over tokens that are to be skipped, ensuring that
+ // SpecialToken state is true.
+ if (State.SpecialTokenCount > 0)
+ {
+ State.SpecialTokenCount--;
+ State.SpecialToken = true;
+ return true;
+ }
- // Reset special token and figure status
- State.SpecialToken = false;
+ // Reset special token and figure status
+ State.SpecialToken = false;
- UsfmToken token = State.Token;
+ UsfmToken token = State.Token;
- // Switch unknown types to either character or paragraph
- UsfmTokenType tokenType = token.Type;
- if (tokenType == UsfmTokenType.Unknown)
- tokenType = DetermineUnknownTokenType();
+ // Switch unknown types to either character or paragraph
+ UsfmTokenType tokenType = token.Type;
+ if (tokenType == UsfmTokenType.Unknown)
+ tokenType = DetermineUnknownTokenType();
- if (Handler != null && !string.IsNullOrEmpty(token.Marker))
- Handler.GotMarker(State, token.Marker);
+ if (Handler != null && !string.IsNullOrEmpty(token.Marker))
+ Handler.GotMarker(State, token.Marker);
- // Close open elements
- switch (tokenType)
- {
- case UsfmTokenType.Book:
- case UsfmTokenType.Chapter:
- CloseAll();
- break;
- case UsfmTokenType.Paragraph:
- // Handle special case of table rows
- if (token.Marker == "tr")
- {
- // Close all but table and sidebar
- while (
- State.Stack.Count > 0
- && State.Peek().Type != UsfmElementType.Table
- && State.Peek().Type != UsfmElementType.Sidebar
- )
+ // Close open elements
+ switch (tokenType)
+ {
+ case UsfmTokenType.Book:
+ case UsfmTokenType.Chapter:
+ CloseAll();
+ break;
+ case UsfmTokenType.Paragraph:
+ // Handle special case of table rows
+ if (token.Marker == "tr")
{
- CloseElement();
- }
+ // Close all but table and sidebar
+ while (
+ State.Stack.Count > 0
+ && State.Peek().Type != UsfmElementType.Table
+ && State.Peek().Type != UsfmElementType.Sidebar
+ )
+ {
+ CloseElement();
+ }
- break;
- }
+ break;
+ }
- // Handle special case of sidebars
- if (token.Marker == "esb")
- {
- // Close all
- CloseAll();
- break;
- }
+ // Handle special case of sidebars
+ if (token.Marker == "esb")
+ {
+ // Close all
+ CloseAll();
+ break;
+ }
- // Close all but sidebar
- while (State.Stack.Count > 0 && State.Peek().Type != UsfmElementType.Sidebar)
- CloseElement();
- break;
- case UsfmTokenType.Character:
- // Handle special case of table cell
- if (IsCell(token))
- {
- // Close until row
- while (State.Peek().Type != UsfmElementType.Row)
+ // Close all but sidebar
+ while (State.Stack.Count > 0 && State.Peek().Type != UsfmElementType.Sidebar)
CloseElement();
break;
- }
+ case UsfmTokenType.Character:
+ // Handle special case of table cell
+ if (IsCell(token))
+ {
+ // Close until row
+ while (State.Peek().Type != UsfmElementType.Row)
+ CloseElement();
+ break;
+ }
- // Handle refs
- if (IsRef(token))
- {
- // Refs don't close anything
- break;
- }
+ // Handle refs
+ if (IsRef(token))
+ {
+ // Refs don't close anything
+ break;
+ }
- // If non-nested character style, close all character styles
- if (!token.Marker.StartsWith("+"))
- CloseCharStyles();
- break;
- case UsfmTokenType.Verse:
- UsfmTag paraTag = State.ParaTag;
- if (paraTag != null && paraTag.TextType != UsfmTextType.VerseText && paraTag.TextType != 0)
- CloseAll();
- else
+ // If non-nested character style, close all character styles
+ if (!token.Marker.StartsWith("+"))
+ CloseCharStyles();
+ break;
+ case UsfmTokenType.Verse:
+ UsfmTag paraTag = State.ParaTag;
+ if (paraTag != null && paraTag.TextType != UsfmTextType.VerseText && paraTag.TextType != 0)
+ CloseAll();
+ else
+ CloseNote();
+ break;
+ case UsfmTokenType.Note:
CloseNote();
- break;
- case UsfmTokenType.Note:
- CloseNote();
- break;
- case UsfmTokenType.End:
- // If end marker for an active note
- if (State.Stack.Any(e => e.Type == UsfmElementType.Note && (e.Marker + "*" == token.Marker)))
- {
- CloseNote(closed: true);
break;
- }
-
- // If end marker for a character style on stack, close it
- // If no matching end marker, close all character styles on top of stack
- UsfmParserElement elem;
- bool unmatched = true;
- while (State.Stack.Count > 0)
- {
- elem = State.Peek();
- if (elem.Type != UsfmElementType.Char)
+ case UsfmTokenType.End:
+ // If end marker for an active note
+ if (State.Stack.Any(e => e.Type == UsfmElementType.Note && (e.Marker + "*" == token.Marker)))
+ {
+ CloseNote(closed: true);
break;
+ }
- // Determine if a + prefix is needed to close it (was nested char style)
- bool plusPrefix =
- State.Stack.Count > 1 && State.Stack[State.Stack.Count - 2].Type == UsfmElementType.Char;
+ // If end marker for a character style on stack, close it
+ // If no matching end marker, close all character styles on top of stack
+ UsfmParserElement elem;
+ bool unmatched = true;
+ while (State.Stack.Count > 0)
+ {
+ elem = State.Peek();
+ if (elem.Type != UsfmElementType.Char)
+ break;
+
+ // Determine if a + prefix is needed to close it (was nested char style)
+ bool plusPrefix =
+ State.Stack.Count > 1
+ && State.Stack[State.Stack.Count - 2].Type == UsfmElementType.Char;
+
+ // If is a match
+ if ((plusPrefix ? "+" : "") + elem.Marker + "*" == token.Marker)
+ {
+ CloseElement(closed: true);
+
+ unmatched = false;
+ break;
+ }
+ else
+ {
+ CloseElement();
+ }
+ }
+
+ // Unmatched end marker
+ if (unmatched)
+ Handler?.Unmatched(State, token.Marker);
+ break;
+ }
+
+ VerseRef vref;
+ // Handle tokens
+ switch (tokenType)
+ {
+ case UsfmTokenType.Book:
+ State.Push(new UsfmParserElement(UsfmElementType.Book, token.Marker));
+
+ // Code is always upper case
+ string code = token.Data.ToUpperInvariant();
+
+ vref = State.VerseRef;
+ // Update verse ref. Leave book alone if not empty to prevent parsing errors
+ // on books with bad id lines.
+ if (vref.Book == "" && Canon.BookIdToNumber(code) != 0)
+ vref.Book = code;
+ vref.ChapterNum = 1;
+ vref.VerseNum = 0;
+ State.VerseRef = vref;
+ State.VerseOffset = 0;
- // If is a match
- if ((plusPrefix ? "+" : "") + elem.Marker + "*" == token.Marker)
+ // Book start.
+ Handler?.StartBook(State, token.Marker, code);
+ break;
+ case UsfmTokenType.Chapter:
+ // Get alternate chapter number
+ string altChapter = null;
+ string pubChapter = null;
+ if (
+ State.Index < State.Tokens.Count - 3
+ && State.Tokens[State.Index + 1].Marker == "ca"
+ && State.Tokens[State.Index + 2].Text != null
+ && State.Tokens[State.Index + 3].Marker == "ca*"
+ )
{
- CloseElement(closed: true);
+ altChapter = State.Tokens[State.Index + 2].Text.Trim();
+ State.SpecialTokenCount += 3;
- unmatched = false;
- break;
+ // Skip blank space after if present
+ if (
+ State.Index + State.SpecialTokenCount < State.Tokens.Count - 1
+ && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text != null
+ && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text.Trim().Length == 0
+ )
+ {
+ State.SpecialTokenCount++;
+ }
}
- else
+
+ // Get publishable chapter number
+ if (
+ State.Index + State.SpecialTokenCount < State.Tokens.Count - 2
+ && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "cp"
+ && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null
+ )
{
- CloseElement();
+ pubChapter = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim();
+ State.SpecialTokenCount += 2;
}
- }
- // Unmatched end marker
- if (unmatched)
- Handler?.Unmatched(State, token.Marker);
- break;
- }
+ // Chapter
+ vref = State.VerseRef;
+ vref.Chapter = token.Data;
+ vref.VerseNum = 0;
+ State.VerseRef = vref;
+ // Verse offset is not zeroed for chapter 1, as it is part of intro
+ if (State.VerseRef.ChapterNum != 1)
+ State.VerseOffset = 0;
- VerseRef vref;
- // Handle tokens
- switch (tokenType)
- {
- case UsfmTokenType.Book:
- State.Push(new UsfmParserElement(UsfmElementType.Book, token.Marker));
-
- // Code is always upper case
- string code = token.Data.ToUpperInvariant();
-
- vref = State.VerseRef;
- // Update verse ref. Leave book alone if not empty to prevent parsing errors
- // on books with bad id lines.
- if (vref.Book == "" && Canon.BookIdToNumber(code) != 0)
- vref.Book = code;
- vref.ChapterNum = 1;
- vref.VerseNum = 0;
- State.VerseRef = vref;
- State.VerseOffset = 0;
-
- // Book start.
- Handler?.StartBook(State, token.Marker, code);
- break;
- case UsfmTokenType.Chapter:
- // Get alternate chapter number
- string altChapter = null;
- string pubChapter = null;
- if (
- State.Index < State.Tokens.Count - 3
- && State.Tokens[State.Index + 1].Marker == "ca"
- && State.Tokens[State.Index + 2].Text != null
- && State.Tokens[State.Index + 3].Marker == "ca*"
- )
- {
- altChapter = State.Tokens[State.Index + 2].Text.Trim();
- State.SpecialTokenCount += 3;
-
- // Skip blank space after if present
+ Handler?.Chapter(State, token.Data, token.Marker, altChapter, pubChapter);
+ break;
+ case UsfmTokenType.Verse:
+ string pubVerse = null;
+ string altVerse = null;
if (
- State.Index + State.SpecialTokenCount < State.Tokens.Count - 1
- && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text != null
- && State.Tokens[State.Index + State.SpecialTokenCount + 1].Text.Trim().Length == 0
+ State.Index < State.Tokens.Count - 3
+ && State.Tokens[State.Index + 1].Marker == "va"
+ && State.Tokens[State.Index + 2].Text != null
+ && State.Tokens[State.Index + 3].Marker == "va*"
)
{
- State.SpecialTokenCount++;
+ // Get alternate verse number
+ altVerse = State.Tokens[State.Index + 2].Text.Trim();
+ State.SpecialTokenCount += 3;
+ }
+ if (
+ State.Index + State.SpecialTokenCount < State.Tokens.Count - 3
+ && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "vp"
+ && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null
+ && State.Tokens[State.Index + State.SpecialTokenCount + 3].Marker == "vp*"
+ )
+ {
+ // Get publishable verse number
+ pubVerse = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim();
+ State.SpecialTokenCount += 3;
}
- }
-
- // Get publishable chapter number
- if (
- State.Index + State.SpecialTokenCount < State.Tokens.Count - 2
- && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "cp"
- && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null
- )
- {
- pubChapter = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim();
- State.SpecialTokenCount += 2;
- }
-
- // Chapter
- vref = State.VerseRef;
- vref.Chapter = token.Data;
- vref.VerseNum = 0;
- State.VerseRef = vref;
- // Verse offset is not zeroed for chapter 1, as it is part of intro
- if (State.VerseRef.ChapterNum != 1)
+
+ // Verse
+ vref = State.VerseRef;
+ vref.Verse = token.Data;
+ State.VerseRef = vref;
State.VerseOffset = 0;
- Handler?.Chapter(State, token.Data, token.Marker, altChapter, pubChapter);
- break;
- case UsfmTokenType.Verse:
- string pubVerse = null;
- string altVerse = null;
- if (
- State.Index < State.Tokens.Count - 3
- && State.Tokens[State.Index + 1].Marker == "va"
- && State.Tokens[State.Index + 2].Text != null
- && State.Tokens[State.Index + 3].Marker == "va*"
- )
- {
- // Get alternate verse number
- altVerse = State.Tokens[State.Index + 2].Text.Trim();
- State.SpecialTokenCount += 3;
- }
- if (
- State.Index + State.SpecialTokenCount < State.Tokens.Count - 3
- && State.Tokens[State.Index + State.SpecialTokenCount + 1].Marker == "vp"
- && State.Tokens[State.Index + State.SpecialTokenCount + 2].Text != null
- && State.Tokens[State.Index + State.SpecialTokenCount + 3].Marker == "vp*"
- )
- {
- // Get publishable verse number
- pubVerse = State.Tokens[State.Index + State.SpecialTokenCount + 2].Text.Trim();
- State.SpecialTokenCount += 3;
- }
-
- // Verse
- vref = State.VerseRef;
- vref.Verse = token.Data;
- State.VerseRef = vref;
- State.VerseOffset = 0;
-
- Handler?.Verse(State, token.Data, token.Marker, altVerse, pubVerse);
- break;
- case UsfmTokenType.Paragraph:
- // Handle special case of table rows
- if (token.Marker == "tr")
- {
- // Start table if not open
- if (State.Stack.All(e => e.Type != UsfmElementType.Table))
+ Handler?.Verse(State, token.Data, token.Marker, altVerse, pubVerse);
+ break;
+ case UsfmTokenType.Paragraph:
+ // Handle special case of table rows
+ if (token.Marker == "tr")
+ {
+ // Start table if not open
+ if (State.Stack.All(e => e.Type != UsfmElementType.Table))
+ {
+ State.Push(new UsfmParserElement(UsfmElementType.Table, null));
+ Handler?.StartTable(State);
+ }
+
+ State.Push(new UsfmParserElement(UsfmElementType.Row, token.Marker));
+
+ // Row start
+ Handler?.StartRow(State, token.Marker);
+ break;
+ }
+
+ // Handle special case of sidebars
+ if (token.Marker == "esb")
{
- State.Push(new UsfmParserElement(UsfmElementType.Table, null));
- Handler?.StartTable(State);
+ State.Push(new UsfmParserElement(UsfmElementType.Sidebar, token.Marker));
+
+ // Look for category
+ string sidebarCategory = null;
+ if (
+ State.Index < State.Tokens.Count - 3
+ && State.Tokens[State.Index + 1].Marker == "cat"
+ && State.Tokens[State.Index + 2].Text != null
+ && State.Tokens[State.Index + 3].Marker == "cat*"
+ )
+ {
+ // Get category
+ sidebarCategory = State.Tokens[State.Index + 2].Text.Trim();
+ State.SpecialTokenCount += 3;
+ }
+
+ Handler?.StartSidebar(State, token.Marker, sidebarCategory);
+ break;
}
- State.Push(new UsfmParserElement(UsfmElementType.Row, token.Marker));
+ // Close sidebar if in sidebar
+ if (token.Marker == "esbe")
+ {
+ if (State.Stack.Any(e => e.Type == UsfmElementType.Sidebar))
+ {
+ while (State.Stack.Count > 0)
+ CloseElement(State.Peek().Type == UsfmElementType.Sidebar);
+ }
+ else
+ {
+ Handler?.Unmatched(State, token.Marker);
+ }
+ break;
+ }
- // Row start
- Handler?.StartRow(State, token.Marker);
+ State.Push(new UsfmParserElement(UsfmElementType.Para, token.Marker));
+
+ // Paragraph opening
+ Handler?.StartPara(State, token.Marker, token.Type == UsfmTokenType.Unknown, token.Attributes);
break;
- }
+ case UsfmTokenType.Character:
+ // Handle special case of table cells (treated as special character style)
+ if (IsCell(token))
+ {
+ string align = "start";
+ if (token.Marker.Length > 2 && token.Marker[2] == 'c')
+ align = "center";
+ else if (token.Marker.Length > 2 && token.Marker[2] == 'r')
+ align = "end";
+
+ UsfmStylesheet.IsCellRange(token.Marker, out string baseMarker, out int colspan);
+ State.Push(new UsfmParserElement(UsfmElementType.Cell, baseMarker));
+
+ Handler?.StartCell(State, baseMarker, align, colspan);
+ break;
+ }
+
+ if (IsRef(token))
+ {
+ // xrefs are special tokens (they do not stand alone)
+ State.SpecialToken = true;
+
+ ParseDisplayAndTarget(out string display, out string target);
+
+ State.SpecialTokenCount += 2;
- // Handle special case of sidebars
- if (token.Marker == "esb")
- {
- State.Push(new UsfmParserElement(UsfmElementType.Sidebar, token.Marker));
+ Handler?.Ref(State, token.Marker, display, target);
+ break;
+ }
+ string actualMarker;
+ bool invalidMarker = false;
+ if (token.Marker.StartsWith("+"))
+ {
+ // Only strip + if properly nested
+ UsfmTag charTag = State.CharTag;
+ actualMarker = charTag != null ? token.Marker.TrimStart('+') : token.Marker;
+ invalidMarker = charTag == null;
+ }
+ else
+ {
+ actualMarker = token.Marker;
+ }
+
+ State.Push(new UsfmParserElement(UsfmElementType.Char, actualMarker, token.Attributes));
+ Handler?.StartChar(
+ State,
+ actualMarker,
+ token.Type == UsfmTokenType.Unknown || invalidMarker,
+ token.Attributes
+ );
+ break;
+ case UsfmTokenType.Note:
// Look for category
- string sidebarCategory = null;
+ string noteCategory = null;
if (
State.Index < State.Tokens.Count - 3
&& State.Tokens[State.Index + 1].Marker == "cat"
@@ -428,148 +523,66 @@ public bool ProcessToken()
)
{
// Get category
- sidebarCategory = State.Tokens[State.Index + 2].Text.Trim();
+ noteCategory = State.Tokens[State.Index + 2].Text.Trim();
State.SpecialTokenCount += 3;
}
- Handler?.StartSidebar(State, token.Marker, sidebarCategory);
+ State.Push(new UsfmParserElement(UsfmElementType.Note, token.Marker));
+
+ Handler?.StartNote(State, token.Marker, token.Data, noteCategory);
break;
- }
+ case UsfmTokenType.Text:
+ string text = token.Text;
- // Close sidebar if in sidebar
- if (token.Marker == "esbe")
- {
- if (State.Stack.Any(e => e.Type == UsfmElementType.Sidebar))
+ // If last token before a paragraph, book or chapter, esb, esbe (both are paragraph types),
+ // or at very end, strip final space
+ // This is because USFM requires these to be on a new line, therefore adding whitespace
+ if (
+ (
+ State.Index == State.Tokens.Count - 1
+ || State.Tokens[State.Index + 1].Type == UsfmTokenType.Paragraph
+ || State.Tokens[State.Index + 1].Type == UsfmTokenType.Book
+ || State.Tokens[State.Index + 1].Type == UsfmTokenType.Chapter
+ )
+ && text.Length > 0
+ && text[text.Length - 1] == ' '
+ )
{
- while (State.Stack.Count > 0)
- CloseElement(State.Peek().Type == UsfmElementType.Sidebar);
+ text = text.Substring(0, text.Length - 1);
}
- else
+
+ if (Handler != null)
{
- Handler?.Unmatched(State, token.Marker);
+ // Replace ~ with nbsp
+ text = text.Replace('~', '\u00A0');
+
+ // Report each "//" as an optional break and pass the text between them through as text
+ foreach (string str in OptBreakSplitter.Split(text))
+ {
+ if (str == "//")
+ Handler.OptBreak(State);
+ else
+ Handler.Text(State, str);
+ }
}
break;
- }
-
- State.Push(new UsfmParserElement(UsfmElementType.Para, token.Marker));
-
- // Paragraph opening
- Handler?.StartPara(State, token.Marker, token.Type == UsfmTokenType.Unknown, token.Attributes);
- break;
- case UsfmTokenType.Character:
- // Handle special case of table cells (treated as special character style)
- if (IsCell(token))
- {
- string align = "start";
- if (token.Marker.Length > 2 && token.Marker[2] == 'c')
- align = "center";
- else if (token.Marker.Length > 2 && token.Marker[2] == 'r')
- align = "end";
-
- UsfmStylesheet.IsCellRange(token.Marker, out string baseMarker, out int colspan);
- State.Push(new UsfmParserElement(UsfmElementType.Cell, baseMarker));
-
- Handler?.StartCell(State, baseMarker, align, colspan);
- break;
- }
-
- if (IsRef(token))
- {
- // xrefs are special tokens (they do not stand alone)
- State.SpecialToken = true;
-
- ParseDisplayAndTarget(out string display, out string target);
-
- State.SpecialTokenCount += 2;
- Handler?.Ref(State, token.Marker, display, target);
+ case UsfmTokenType.Milestone:
+ case UsfmTokenType.MilestoneEnd:
+ // Currently, the parse state doesn't need to be updated, so just inform the handler about the milestone.
+ Handler?.Milestone(
+ State,
+ token.Marker,
+ token.Type == UsfmTokenType.Milestone,
+ token.Attributes
+ );
break;
- }
-
- string actualMarker;
- bool invalidMarker = false;
- if (token.Marker.StartsWith("+"))
- {
- // Only strip + if properly nested
- UsfmTag charTag = State.CharTag;
- actualMarker = charTag != null ? token.Marker.TrimStart('+') : token.Marker;
- invalidMarker = charTag == null;
- }
- else
- {
- actualMarker = token.Marker;
- }
-
- State.Push(new UsfmParserElement(UsfmElementType.Char, actualMarker, token.Attributes));
- Handler?.StartChar(
- State,
- actualMarker,
- token.Type == UsfmTokenType.Unknown || invalidMarker,
- token.Attributes
- );
- break;
- case UsfmTokenType.Note:
- // Look for category
- string noteCategory = null;
- if (
- State.Index < State.Tokens.Count - 3
- && State.Tokens[State.Index + 1].Marker == "cat"
- && State.Tokens[State.Index + 2].Text != null
- && State.Tokens[State.Index + 3].Marker == "cat*"
- )
- {
- // Get category
- noteCategory = State.Tokens[State.Index + 2].Text.Trim();
- State.SpecialTokenCount += 3;
- }
-
- State.Push(new UsfmParserElement(UsfmElementType.Note, token.Marker));
-
- Handler?.StartNote(State, token.Marker, token.Data, noteCategory);
- break;
- case UsfmTokenType.Text:
- string text = token.Text;
-
- // If last token before a paragraph, book or chapter, esb, esbe (both are paragraph types),
- // or at very end, strip final space
- // This is because USFM requires these to be on a new line, therefore adding whitespace
- if (
- (
- State.Index == State.Tokens.Count - 1
- || State.Tokens[State.Index + 1].Type == UsfmTokenType.Paragraph
- || State.Tokens[State.Index + 1].Type == UsfmTokenType.Book
- || State.Tokens[State.Index + 1].Type == UsfmTokenType.Chapter
- )
- && text.Length > 0
- && text[text.Length - 1] == ' '
- )
- {
- text = text.Substring(0, text.Length - 1);
- }
-
- if (Handler != null)
- {
- // Replace ~ with nbsp
- text = text.Replace('~', '\u00A0');
-
- // Replace // with
- foreach (string str in OptBreakSplitter.Split(text))
- {
- if (str == "//")
- Handler.OptBreak(State);
- else
- Handler.Text(State, str);
- }
- }
- break;
-
- case UsfmTokenType.Milestone:
- case UsfmTokenType.MilestoneEnd:
- // currently, parse state doesn't need to be update, so just inform the handler about the milestone.
- Handler?.Milestone(State, token.Marker, token.Type == UsfmTokenType.Milestone, token.Attributes);
- break;
+ }
+ }
+ catch (Exception e)
+ {
+ throw new UsfmParsingException(State, e);
}
-
return true;
}
diff --git a/src/SIL.Machine/Corpora/UsfmParsingException.cs b/src/SIL.Machine/Corpora/UsfmParsingException.cs
new file mode 100644
index 00000000..b3bcbbd5
--- /dev/null
+++ b/src/SIL.Machine/Corpora/UsfmParsingException.cs
@@ -0,0 +1,42 @@
+using System;
+using System.Linq;
+
+namespace SIL.Machine.Corpora
+{
+    /// <summary>
+    /// Wraps an exception raised while parsing USFM and adds the parser position (line, column, verse
+    /// reference) and a window of the tokens around the current token index to the message.
+    /// </summary>
+    public class UsfmParsingException : Exception
+    {
+        // Number of tokens reported on each side of the current token.
+        private const int ContextTokenCount = 3;
+
+        /// <summary>
+        /// Builds the message from <paramref name="state"/> and keeps <paramref name="exception"/> as the
+        /// inner exception.
+        /// </summary>
+        /// <param name="state">Parser state at the point of failure.</param>
+        /// <param name="exception">The exception that interrupted parsing.</param>
+        public UsfmParsingException(UsfmParserState state, Exception exception)
+            : base(BuildMessage(state), exception) { }
+
+        // Formats the parser position plus up to seven tokens starting three before the current one.
+        private static string BuildMessage(UsfmParserState state)
+        {
+            // state.Index is negative before the first token and the list may hold fewer than seven
+            // tokens, so clamp the start and let Skip/Take stop at the end of the list; a GetRange count
+            // past the end would throw here and hide the exception being reported.
+            int start = Math.Max(state.Index - ContextTokenCount, 0);
+            string tokens = string.Join(
+                ",",
+                state
+                    .Tokens.Skip(start)
+                    .Take(2 * ContextTokenCount + 1)
+                    .Select(t => $"{t.Text} (TokenType={t.Type})")
+            );
+            return $"Failed to parse at line {state.LineNumber} column {state.ColumnNumber} verse ref {state.VerseRef}"
+                + $" with surrounding tokens [{tokens}]";
+        }
+    }
+}
diff --git a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs b/src/SIL.Machine/Corpora/UsfmTextUpdater.cs
index 9265a317..4ac11cd7 100644
--- a/src/SIL.Machine/Corpora/UsfmTextUpdater.cs
+++ b/src/SIL.Machine/Corpora/UsfmTextUpdater.cs
@@ -361,7 +361,7 @@ private void SkipTokens(UsfmParserState state)
private bool ReplaceWithNewTokens(UsfmParserState state)
{
bool newText = _replace.Count > 0 && _replace.Peek();
- int tokenEnd = state.Index + state.SpecialTokenCount + 1;
+ int tokenEnd = state.Index + state.SpecialTokenCount;
bool existingText = false;
for (int index = _tokenIndex; index <= tokenEnd; index++)
{
@@ -393,6 +393,7 @@ private void PushTokensAsPrevious()
private void PopNewTokens()
{
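+ // NOTE(review): a guard "if (_replace.Any())" was left disabled here; confirm _replace can never be empty when this runs.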
_replace.Pop();
}
}
diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs
index b046be22..5a472b1f 100644
--- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs
+++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs
@@ -70,6 +70,22 @@ public void GetRows_DuplicateVerseWithTable()
Assert.That(rows, Has.Length.EqualTo(5));
}
+ [Test]
+ public void GetRows_TriplicateVerse()
+ {
+ TextRow[] rows = GetRows(
+ @"\id MAT - Test
+\c 1
+\v 1 First verse
+\v 1 First verse
+\v 1 First verse
+",
+ includeAllText: true
+ );
+
+ Assert.That(rows, Has.Length.EqualTo(1));
+ }
+
[Test]
public void GetRows_VersePara_BeginningNonVerseSegment()
{