Skip to content

Commit

Permalink
Ignore a note that occurs right after \id marker
Browse files Browse the repository at this point in the history
  • Loading branch information
ddaspit committed May 23, 2024
1 parent 8d84ddd commit 9743519
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 12 deletions.
10 changes: 7 additions & 3 deletions src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -150,13 +150,17 @@ public override void EndSidebar(UsfmParserState state, string marker, bool close

/// <summary>
/// Handles the start of a note. The note is only processed while a scripture text type is
/// active (<c>CurrentTextType</c> is not <c>ScriptureTextType.None</c>); otherwise it is
/// ignored. This is intended to skip a note that occurs right after the \id marker.
/// </summary>
/// <param name="state">The current parser state, passed through to <c>StartNoteText</c>.</param>
/// <param name="marker">The note marker, used to advance to the next element.</param>
/// <param name="caller">The note caller (not used by this handler).</param>
/// <param name="category">The note category (not used by this handler).</param>
public override void StartNote(UsfmParserState state, string marker, string caller, string category)
{
    // Advance the element and open the note text exactly once, and only when there is an
    // active text type. Calling these unconditionally as well would process the note twice
    // and defeat the guard.
    if (CurrentTextType != ScriptureTextType.None)
    {
        NextElement(marker);
        StartNoteText(state);
    }
}

/// <summary>
/// Handles the end of a note. The note text is only ended when the current text type is
/// <c>ScriptureTextType.Note</c>, so a note that was ignored at its start is not ended here.
/// </summary>
/// <param name="state">The current parser state, passed through to <c>EndNoteText</c>.</param>
/// <param name="marker">The note marker (not used by this handler).</param>
/// <param name="closed">Whether the note was explicitly closed (not used by this handler).</param>
public override void EndNote(UsfmParserState state, string marker, bool closed)
{
    // End the note text exactly once, and only if a note is currently open; an unconditional
    // call here would end text for a note that was never started.
    if (CurrentTextType == ScriptureTextType.Note)
        EndNoteText(state);
}

public override void Text(UsfmParserState state, string text)
Expand Down
16 changes: 16 additions & 0 deletions src/SIL.Machine/Corpora/UsfmTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ public string Detokenize(IEnumerable<UsfmToken> tokens, bool tokensHaveWhitespac
{
UsfmToken prevToken = null;
var usfm = new StringBuilder();
bool inBook = false;
foreach (UsfmToken token in tokens)
{
string tokenUsfm = "";
Expand All @@ -335,6 +336,7 @@ public string Detokenize(IEnumerable<UsfmToken> tokens, bool tokensHaveWhitespac
usfm.Append("\r\n");
}
tokenUsfm = token.ToUsfm();
inBook = token.Type == UsfmTokenType.Book;
break;
case UsfmTokenType.Verse:
// Add newline if after anything other than [ or (
Expand All @@ -359,6 +361,7 @@ public string Detokenize(IEnumerable<UsfmToken> tokens, bool tokensHaveWhitespac
RtlReferenceOrder == RtlReferenceOrder.BookVerseChapter ? "\u200e" : "\u200f";
tokenUsfm = RtlVerseRegex.Replace(tokenUsfm, $"$1{directionMarker}$2");
}
inBook = false;
break;
case UsfmTokenType.Text:
// Ensure spaces are preserved
Expand All @@ -383,7 +386,20 @@ public string Detokenize(IEnumerable<UsfmToken> tokens, bool tokensHaveWhitespac
}
break;
default:
if (inBook)
{
if (
usfm[usfm.Length - 1] == ' '
&& ((prevToken != null && prevToken.ToUsfm().Trim() != "") || !tokensHaveWhitespace)
)
{
usfm.Length--;
}
if (!tokensHaveWhitespace)
usfm.Append("\r\n");
}
tokenUsfm = token.ToUsfm();
inBook = false;
break;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
\id MAT - Test
\f + \fr 1.0 \ft \f*
\h Matthew
\mt Matthew
\ip An introduction to Matthew\fe + \ft This is an endnote.\fe*
Expand Down
18 changes: 9 additions & 9 deletions tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,22 @@ public void Tokenize()
string usfm = ReadUsfm();
var tokenizer = new UsfmTokenizer();
IReadOnlyList<UsfmToken> tokens = tokenizer.Tokenize(usfm);
Assert.That(tokens, Has.Count.EqualTo(218));
Assert.That(tokens, Has.Count.EqualTo(224));

Assert.That(tokens[0].Type, Is.EqualTo(UsfmTokenType.Book));
Assert.That(tokens[0].Marker, Is.EqualTo("id"));
Assert.That(tokens[0].Data, Is.EqualTo("MAT"));

Assert.That(tokens[28].Type, Is.EqualTo(UsfmTokenType.Text));
Assert.That(tokens[28].Text, Is.EqualTo("Chapter One "));
Assert.That(tokens[34].Type, Is.EqualTo(UsfmTokenType.Text));
Assert.That(tokens[34].Text, Is.EqualTo("Chapter One "));

Assert.That(tokens[29].Type, Is.EqualTo(UsfmTokenType.Verse));
Assert.That(tokens[29].Marker, Is.EqualTo("v"));
Assert.That(tokens[29].Data, Is.EqualTo("1"));
Assert.That(tokens[35].Type, Is.EqualTo(UsfmTokenType.Verse));
Assert.That(tokens[35].Marker, Is.EqualTo("v"));
Assert.That(tokens[35].Data, Is.EqualTo("1"));

Assert.That(tokens[38].Type, Is.EqualTo(UsfmTokenType.Note));
Assert.That(tokens[38].Marker, Is.EqualTo("f"));
Assert.That(tokens[38].Data, Is.EqualTo("+"));
Assert.That(tokens[44].Type, Is.EqualTo(UsfmTokenType.Note));
Assert.That(tokens[44].Marker, Is.EqualTo("f"));
Assert.That(tokens[44].Data, Is.EqualTo("+"));
}

[Test]
Expand Down

0 comments on commit 9743519

Please sign in to comment.