Skip to content

Commit

Permalink
Fix USFM parsing/generation issues
Browse files Browse the repository at this point in the history
- fixes sillsdev/serval#399
- fixes sillsdev/serval#398
- Report line and column numbers with USFM errors.
  • Loading branch information
johnml1135 committed Jun 5, 2024
1 parent bf2b46d commit da949b9
Show file tree
Hide file tree
Showing 11 changed files with 213 additions and 48 deletions.
2 changes: 2 additions & 0 deletions .ignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
!tests/SIL.Machine.Tests/Corpora/TestData/usfm/source/*
!tests/SIL.Machine.Tests/Corpora/TestData/usfm/target/*
49 changes: 34 additions & 15 deletions src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -150,35 +150,38 @@ public override void EndSidebar(UsfmParserState state, string marker, bool close

/// <summary>
/// Handles the start of a note. When a text segment is open and the current verse is not flagged as a
/// duplicate, advances to the next element for <paramref name="marker"/> and starts note text.
/// </summary>
/// <param name="state">The parser state, passed through to the helper methods.</param>
/// <param name="marker">The note marker, used as the name of the next element.</param>
/// <param name="caller">The note caller (not used by this method).</param>
/// <param name="category">The note category (not used by this method).</param>
public override void StartNote(UsfmParserState state, string marker, string caller, string category)
{
if (CurrentTextType != ScriptureTextType.None)
if (CurrentTextType != ScriptureTextType.None && !_duplicateVerse)
{
// if we hit a note in a verse paragraph and we aren't in a verse, then start a non-verse segment
// NOTE(review): CheckConvertVerseParaToNonVerse only acts when CurrentTextType is None, whereas this
// branch requires it not to be None, so the call looks like a no-op here - confirm whether it should
// run before the check instead.
CheckConvertVerseParaToNonVerse(state);
NextElement(marker);
StartNoteText(state);
}
}

/// <summary>
/// Handles the end of a note. Ends the note text only when note text is currently open and the current
/// verse is not flagged as a duplicate.
/// </summary>
/// <param name="state">The parser state, passed through to <c>EndNoteText</c>.</param>
/// <param name="marker">The note marker (not used by this method).</param>
/// <param name="closed">Whether the note was explicitly closed (not used by this method).</param>
public override void EndNote(UsfmParserState state, string marker, bool closed)
{
if (CurrentTextType == ScriptureTextType.Note)
if (CurrentTextType == ScriptureTextType.Note && !_duplicateVerse)
EndNoteText(state);
}

/// <summary>
/// Handles a text token. Non-blank text that appears while no text segment is open, and before any verse
/// of the chapter, opens a non-verse segment for the enclosing paragraph.
/// </summary>
/// <param name="state">The parser state; supplies the current paragraph tag.</param>
/// <param name="text">The text content; whitespace-only text is ignored.</param>
public override void Text(UsfmParserState state, string text)
{
// if we hit text in a verse paragraph and we aren't in a verse, then start a non-verse segment
// NOTE(review): this inline check and the CheckConvertVerseParaToNonVerse call further down test nearly
// the same condition (this one reads state.IsVerseText, the helper reads state.IsVersePara); presumably
// only one of the two is meant to remain - confirm.
UsfmTag paraTag = state.ParaTag;
if (
CurrentTextType == ScriptureTextType.None
&& paraTag != null
&& paraTag.Marker != "tr"
&& state.IsVerseText
&& _curVerseRef.VerseNum == 0
&& text.Trim().Length > 0
)
{
StartParentElement(paraTag.Marker);
StartNonVerseText(state);
}
// Whitespace-only text never triggers the conversion.
if (text.Trim().Length > 0)
CheckConvertVerseParaToNonVerse(state);
}

/// <summary>
/// Handles the start of a character-style marker by delegating to
/// <c>CheckConvertVerseParaToNonVerse</c>, which opens a non-verse segment when its conditions hold.
/// </summary>
/// <param name="state">The parser state, passed through to the helper.</param>
/// <param name="markerWithoutPlus">The character marker (not used by this method).</param>
/// <param name="unknown">Whether the marker is unknown (not used by this method).</param>
/// <param name="attributes">The marker attributes (not used by this method).</param>
public override void StartChar(
UsfmParserState state,
string markerWithoutPlus,
bool unknown,
IReadOnlyList<UsfmAttribute> attributes
)
{
// if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse
// segment
CheckConvertVerseParaToNonVerse(state);
}

/// <summary>
/// Overridable member with an empty base implementation; presumably invoked when verse text begins for
/// the given references (call site not visible here - confirm).
/// </summary>
protected virtual void StartVerseText(UsfmParserState state, IReadOnlyList<ScriptureRef> scriptureRefs) { }
Expand Down Expand Up @@ -273,5 +276,21 @@ private ScriptureRef CreateNonVerseRef()
_curElements.Where(e => e.Position > 0).Reverse()
);
}

/// <summary>
/// Opens a non-verse text segment for the current paragraph when content is encountered in a verse
/// paragraph before any verse has started. Does nothing when a text segment is already open, when there
/// is no current paragraph, when the paragraph is a table row ("tr"), when the paragraph is not a verse
/// paragraph, or when the current verse number is non-zero.
/// </summary>
/// <param name="state">The parser state supplying the current paragraph tag.</param>
private void CheckConvertVerseParaToNonVerse(UsfmParserState state)
{
    UsfmTag currentPara = state.ParaTag;

    // A segment is already open, so there is nothing to convert.
    if (CurrentTextType != ScriptureTextType.None)
        return;

    // No enclosing paragraph, or a table row, which is never converted.
    if (currentPara == null || currentPara.Marker == "tr")
        return;

    // Only verse paragraphs that precede the first verse qualify.
    if (!state.IsVersePara || _curVerseRef.VerseNum != 0)
        return;

    StartParentElement(currentPara.Marker);
    StartNonVerseText(state);
}
}
}
3 changes: 3 additions & 0 deletions src/SIL.Machine/Corpora/UsfmParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ public bool ProcessToken()
// Move to next token
State.Index++;

State.LineNumber = State.Token.LineNumber;
State.ColumnNumber = State.Token.ColumnNumber;

// Update verse offset with previous token (since verse offset is from start of current token)
if (State.PrevToken != null)
State.VerseOffset += State.PrevToken.GetLength(addSpaces: !TokensPreserveWhitespace);
Expand Down
5 changes: 5 additions & 0 deletions src/SIL.Machine/Corpora/UsfmParserState.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ public UsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOn
_stack = new List<UsfmParserElement>();
VerseRef = new VerseRef(versification);
VerseOffset = 0;
LineNumber = 1;
ColumnNumber = 0;
Tokens = tokens;
}

Expand Down Expand Up @@ -59,6 +61,9 @@ public UsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOn
/// </summary>
public int VerseOffset { get; internal set; }

/// <summary>
/// Line number of the token currently being processed. Initialized to 1 by the constructor and updated
/// by the parser from the current token as it advances.
/// </summary>
public int LineNumber { get; internal set; }
/// <summary>
/// Column number of the token currently being processed. Initialized to 0 by the constructor and updated
/// by the parser from the current token as it advances.
/// </summary>
public int ColumnNumber { get; internal set; }

/// <summary>
/// True if the token processed is part of a special indivisible group
/// of tokens (link or chapter/verse alternate/publishable)
Expand Down
8 changes: 5 additions & 3 deletions src/SIL.Machine/Corpora/UsfmTextBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,8 @@ protected override IEnumerable<TextRow> GetVersesInDocOrder()
sb.Append($"An error occurred while parsing the text '{Id}`");
if (!string.IsNullOrEmpty(Project))
sb.Append($" in project '{Project}'");
sb.Append(
$". Verse: {parser.State.VerseRef}, offset: {parser.State.VerseOffset}, error: '{ex.Message}'"
);
sb.Append($". Verse: {parser.State.VerseRef}, line: {parser.State.LineNumber}, ");
sb.Append($"column: {parser.State.ColumnNumber}, error: '{ex.Message}'");
throw new InvalidOperationException(sb.ToString(), ex);
}
return rowCollector.Rows;
Expand Down Expand Up @@ -168,6 +167,9 @@ bool closed
{
base.EndChar(state, marker, attributes, closed);

if (_rowTexts.Count == 0)
return;

if (_text._includeMarkers && attributes != null && state.PrevToken?.Type == UsfmTokenType.Attribute)
_rowTexts.Peek().Append(state.PrevToken);

Expand Down
2 changes: 2 additions & 0 deletions src/SIL.Machine/Corpora/UsfmToken.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ public UsfmToken(UsfmTokenType type, string marker, string text, string endMarke

public string Data { get; }

/// <summary>
/// Line number assigned to this token by the tokenizer; -1 when no position has been assigned.
/// </summary>
public int LineNumber { get; internal set; } = -1;
/// <summary>
/// Column number assigned to this token by the tokenizer; -1 when no position has been assigned.
/// </summary>
public int ColumnNumber { get; internal set; } = -1;
public IReadOnlyList<UsfmAttribute> Attributes { get; private set; }

public string NestlessMarker
Expand Down
128 changes: 114 additions & 14 deletions src/SIL.Machine/Corpora/UsfmTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,18 @@ public IReadOnlyList<UsfmToken> Tokenize(string usfm, bool preserveWhitespace =
List<UsfmToken> tokens = new List<UsfmToken>();

int index = 0; // Current position
int lineNum = 1; // Current line number
int previousIndex = 0;
while (index < usfm.Length)
{
int nextMarkerIndex = (index < usfm.Length - 1) ? usfm.IndexOf('\\', index + 1) : -1;
if (nextMarkerIndex == -1)
nextMarkerIndex = usfm.Length;

lineNum += usfm.Substring(previousIndex, index - previousIndex).Count(c => c == '\n');
int colNum = index - usfm.LastIndexOf('\n', index);
previousIndex = index;

// If text, create text token until end or next \
var ch = usfm[index];
if (ch != '\\')
Expand All @@ -61,11 +67,21 @@ public IReadOnlyList<UsfmToken> Tokenize(string usfm, bool preserveWhitespace =
preserveWhitespace,
tokens,
nextMarkerIndex,
ref text
ref text,
lineNum,
colNum
);

if (text.Length > 0)
tokens.Add(new UsfmToken(UsfmTokenType.Text, null, text, null));
{
tokens.Add(
new UsfmToken(UsfmTokenType.Text, null, text, null)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
}

if (attributeToken != null)
tokens.Add(attributeToken);
Expand Down Expand Up @@ -161,11 +177,21 @@ ref text
null,
GetNextWord(usfm, ref index, preserveWhitespace)
)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
}
else
{
tokens.Add(new UsfmToken(UsfmTokenType.Character, marker, null, endMarker));
tokens.Add(
new UsfmToken(UsfmTokenType.Character, marker, null, endMarker)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
}
break;
case UsfmStyleType.Paragraph:
Expand All @@ -180,6 +206,10 @@ ref text
null,
GetNextWord(usfm, ref index, preserveWhitespace)
)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
}
else if ((tag.TextProperties & UsfmTextProperties.Book) > 0)
Expand All @@ -192,11 +222,21 @@ ref text
null,
GetNextWord(usfm, ref index, preserveWhitespace)
)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
}
else
{
tokens.Add(new UsfmToken(UsfmTokenType.Paragraph, marker, null, endMarker));
tokens.Add(
new UsfmToken(UsfmTokenType.Paragraph, marker, null, endMarker)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
}

break;
Expand All @@ -209,28 +249,56 @@ ref text
endMarker,
GetNextWord(usfm, ref index, preserveWhitespace)
)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
break;
case UsfmStyleType.End:
tokens.Add(new UsfmToken(UsfmTokenType.End, marker, null, null));
tokens.Add(
new UsfmToken(UsfmTokenType.End, marker, null, null)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
break;
case UsfmStyleType.Unknown:
// End tokens are always end tokens, even if unknown
if (marker.EndsWith("*", StringComparison.Ordinal))
{
tokens.Add(new UsfmToken(UsfmTokenType.End, marker, null, null));
tokens.Add(
new UsfmToken(UsfmTokenType.End, marker, null, null)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
}
else
{
// Handle special case of esb and esbe which might not be in basic stylesheet
// but are always sidebars and so should be tokenized as paragraphs
if (marker == "esb" || marker == "esbe")
{
tokens.Add(new UsfmToken(UsfmTokenType.Paragraph, marker, null, endMarker));
tokens.Add(
new UsfmToken(UsfmTokenType.Paragraph, marker, null, endMarker)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
break;
}
// Create unknown token with a corresponding end note
tokens.Add(new UsfmToken(UsfmTokenType.Unknown, marker, null, marker + "*"));
tokens.Add(
new UsfmToken(UsfmTokenType.Unknown, marker, null, marker + "*")
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
}
break;
case UsfmStyleType.Milestone:
Expand All @@ -247,16 +315,34 @@ ref text
// add back space that was removed after marker
if (milestoneText.Length > 0 && milestoneText[0] != ' ' && milestoneText[0] != '|')
milestoneText = " " + milestoneText;
tokens.Add(new UsfmToken(UsfmTokenType.Text, null, @"\" + marker + milestoneText, null));
tokens.Add(
new UsfmToken(UsfmTokenType.Text, null, @"\" + marker + milestoneText, null)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
index = endOfText;
}
else if (tag.StyleType == UsfmStyleType.Milestone)
{
tokens.Add(new UsfmToken(UsfmTokenType.Milestone, marker, null, endMarker));
tokens.Add(
new UsfmToken(UsfmTokenType.Milestone, marker, null, endMarker)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
}
else
{
tokens.Add(new UsfmToken(UsfmTokenType.MilestoneEnd, marker, null, null));
tokens.Add(
new UsfmToken(UsfmTokenType.MilestoneEnd, marker, null, null)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
}

break;
Expand Down Expand Up @@ -299,7 +385,15 @@ ref text
else if (tokens[i - 1].Type == UsfmTokenType.End)
{
// Insert space token after * of end marker
tokens.Insert(i, new UsfmToken(UsfmTokenType.Text, null, " ", null));
int colNum = usfm.Length + 1 - Math.Max(usfm.LastIndexOf('\n', index), 0);
tokens.Insert(
i,
new UsfmToken(UsfmTokenType.Text, null, " ", null)
{
LineNumber = lineNum,
ColumnNumber = colNum
}
);
i++;
}
}
Expand Down Expand Up @@ -504,7 +598,9 @@ private UsfmToken HandleAttributes(
bool preserveWhitespace,
List<UsfmToken> tokens,
int nextMarkerIndex,
ref string text
ref string text,
int lineNumber,
int columnNumber
)
{
int attributeIndex = text.IndexOf('|');
Expand Down Expand Up @@ -547,7 +643,11 @@ ref string text
null,
null,
attributesValue
);
)
{
LineNumber = lineNumber,
ColumnNumber = columnNumber + attributeIndex
};
attributeToken.CopyAttributes(matchingToken);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
\h Matthew
\mt Matthew
\ip An introduction to Matthew\fe + \ft This is an endnote.\fe*
\p Here is another paragraph.
\p \rq MAT 1\rq* Here is another paragraph.
\p and with a \w keyword|a special concept\w* in it.
\p and a \weirdtaglookingthing that is not an actual tag.
\c 1
Expand Down Expand Up @@ -38,7 +38,7 @@
\p
\v 6 Chapter two, verse \w six|strong="12345" \w*.
\p
\v 6 Bad verse.
\v 6 Bad verse. \x - \xo abc\xt 123\x* and more content.
\p
\v 5 Chapter two, verse five \rq (MAT 3:1)\rq*.
\v 7a Chapter two, verse seven A,
Expand Down
Loading

0 comments on commit da949b9

Please sign in to comment.