Skip to content

Commit

Permalink
Add support for detokenizing USFM
Browse files Browse the repository at this point in the history
  • Loading branch information
ddaspit committed Jan 3, 2024
1 parent 05c2ee7 commit b0fb126
Show file tree
Hide file tree
Showing 8 changed files with 271 additions and 82 deletions.
185 changes: 117 additions & 68 deletions src/SIL.Machine/Corpora/UsfmParser.cs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/SIL.Machine/Corpora/UsfmTextBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ protected override IEnumerable<TextRow> GetVersesInDocOrder()
{
string usfm = ReadUsfm();
var rowCollector = new TextRowCollector(this);
UsfmParser.Parse(_stylesheet, usfm, rowCollector, Versification, preserveWhitespace: _includeMarkers);
UsfmParser.Parse(usfm, rowCollector, _stylesheet, Versification, preserveWhitespace: _includeMarkers);
return rowCollector.Rows;
}

Expand Down
132 changes: 126 additions & 6 deletions src/SIL.Machine/Corpora/UsfmTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,41 @@
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace SIL.Machine.Corpora
{
/// <summary>
/// Selects which Unicode direction mark <c>UsfmTokenizer.Detokenize</c> writes between a verse number
/// and the punctuation or symbol that follows it.
/// </summary>
public enum RtlReferenceOrder
{
    /// <summary>Verse tokens are written unchanged; no direction mark is inserted.</summary>
    NotSet = 0,

    /// <summary>A right-to-left mark (U+200F) is inserted.</summary>
    BookChapterVerse = 1,

    /// <summary>A left-to-right mark (U+200E) is inserted.</summary>
    BookVerseChapter = 2
}

public class UsfmTokenizer
{
private const char ZeroWidthSpace = '\u200B';

private readonly UsfmStylesheet _stylesheet;
// Matches a number with an optional trailing word character (group 1), then a single punctuation or
// symbol character (group 2), provided a digit follows. Any U+200E/U+200F direction marks before,
// between and after the two groups are part of the match but are not captured, so a replacement built
// from $1 and $2 discards them.
private static readonly Regex RtlVerseRegex = new Regex(
@"[\u200E\u200F]*(\d+\w?)[\u200E\u200F]*([\p{P}\p{S}])[\u200E\u200F]*(?=\d)",
RegexOptions.Compiled
);

public UsfmTokenizer(UsfmStylesheet stylesheet)
/// <summary>
/// Creates a tokenizer whose stylesheet is constructed from a stylesheet file name.
/// </summary>
/// <param name="stylesheetFileName">The file name passed to the <see cref="UsfmStylesheet"/> constructor.</param>
/// <param name="rtlReferenceOrder">The reference order forwarded to the stylesheet-based constructor.</param>
public UsfmTokenizer(
string stylesheetFileName = "usfm.sty",
RtlReferenceOrder rtlReferenceOrder = RtlReferenceOrder.NotSet
)
: this(new UsfmStylesheet(stylesheetFileName), rtlReferenceOrder) { }

public UsfmTokenizer(UsfmStylesheet stylesheet, RtlReferenceOrder rtlReferenceOrder = RtlReferenceOrder.NotSet)
{
_stylesheet = stylesheet;
Stylesheet = stylesheet ?? new UsfmStylesheet("usfm.sty");
RtlReferenceOrder = rtlReferenceOrder;
}

/// <summary>Gets the stylesheet this tokenizer uses to look up marker tags.</summary>
public UsfmStylesheet Stylesheet { get; }

/// <summary>
/// Gets the reference order that decides whether, and which, direction mark is written into verse
/// tokens when detokenizing.
/// </summary>
public RtlReferenceOrder RtlReferenceOrder { get; }

public IReadOnlyList<UsfmToken> Tokenize(string usfm, bool preserveWhitespace = false)
{
List<UsfmToken> tokens = new List<UsfmToken>();
Expand Down Expand Up @@ -112,7 +133,7 @@ ref text
}

// Lookup marker
UsfmTag tag = _stylesheet.GetTag(marker.TrimStart('+'));
UsfmTag tag = Stylesheet.GetTag(marker.TrimStart('+'));

// If starts with a plus and is not a character style or an end style, it is an unknown tag
if (
Expand All @@ -121,7 +142,7 @@ ref text
&& tag.StyleType != UsfmStyleType.End
)
{
tag = _stylesheet.GetTag(marker);
tag = Stylesheet.GetTag(marker);
}

string endMarker = tag.StyleType != UsfmStyleType.Milestone ? marker + "*" : tag.EndMarker;
Expand Down Expand Up @@ -276,6 +297,105 @@ ref text
return tokens;
}

/// <summary>
/// Converts a sequence of USFM tokens back into a USFM string.
/// </summary>
/// <param name="tokens">The tokens to serialize, in document order.</param>
/// <param name="tokensHaveWhitespace">
/// <c>false</c> if line breaks must be generated: a CR/LF is then written before every book, chapter,
/// paragraph and verse token (a verse directly after <c>[</c> or <c>(</c> stays on the same line).
/// <c>true</c> if the tokens already carry their own whitespace, in which case no line breaks are added
/// between tokens and only redundant spaces are removed.
/// </param>
/// <returns>
/// The USFM text. It never starts with a space or CR/LF and, when non-empty, always ends with a line
/// break that is not preceded by a space.
/// </returns>
public string Detokenize(IEnumerable<UsfmToken> tokens, bool tokensHaveWhitespace = false)
{
    UsfmToken prevToken = null;
    var usfm = new StringBuilder();
    foreach (UsfmToken token in tokens)
    {
        string tokenUsfm;
        switch (token.Type)
        {
            case UsfmTokenType.Book:
            case UsfmTokenType.Chapter:
            case UsfmTokenType.Paragraph:
                // These markers always start a new line.
                if (usfm.Length > 0)
                    EndCurrentLine(usfm, prevToken, tokensHaveWhitespace);

                // The marker itself must be written; otherwise it is silently dropped from the output.
                tokenUsfm = token.ToUsfm();
                break;

            case UsfmTokenType.Verse:
                // Add newline if after anything other than [ or (
                if (usfm.Length > 0 && usfm[usfm.Length - 1] != '[' && usfm[usfm.Length - 1] != '(')
                    EndCurrentLine(usfm, prevToken, tokensHaveWhitespace);

                tokenUsfm = tokensHaveWhitespace ? token.ToUsfm().Trim() : token.ToUsfm();

                if (RtlReferenceOrder != RtlReferenceOrder.NotSet)
                {
                    // Replace whatever direction marks surround the separator with the single mark
                    // required by the configured reference order.
                    string directionMarker =
                        RtlReferenceOrder == RtlReferenceOrder.BookVerseChapter ? "\u200e" : "\u200f";
                    tokenUsfm = RtlVerseRegex.Replace(tokenUsfm, $"$1{directionMarker}$2");
                }
                break;

            case UsfmTokenType.Text:
                // Ensure spaces are preserved
                tokenUsfm = token.ToUsfm();
                if (tokensHaveWhitespace && usfm.Length > 0 && usfm[usfm.Length - 1] == ' ')
                {
                    if (
                        (
                            tokenUsfm.Length > 0
                            && tokenUsfm[0] == ' '
                            && prevToken != null
                            && prevToken.ToUsfm().Trim() != ""
                        ) || tokenUsfm.StartsWith("\r\n")
                    )
                    {
                        // The text brings its own leading whitespace, so drop the space already written.
                        usfm.Length--;
                    }
                    else
                    {
                        // Keep the space already written and avoid doubling it.
                        tokenUsfm = tokenUsfm.TrimStart(' ');
                    }
                }
                break;

            default:
                // Every other token type is written verbatim.
                tokenUsfm = token.ToUsfm();
                break;
        }

        usfm.Append(tokenUsfm);
        prevToken = token;
    }

    // Make sure begins without space or CR/LF
    if (usfm.Length > 0 && usfm[0] == ' ')
        usfm.Remove(0, 1);
    // Only remove a complete CR/LF pair: a lone '\r' must not throw or take the next character with it.
    if (usfm.Length > 1 && usfm[0] == '\r' && usfm[1] == '\n')
        usfm.Remove(0, 2);

    // Make sure ends without space and with a CR/LF
    if (usfm.Length > 0 && usfm[usfm.Length - 1] == ' ')
        usfm.Length--;
    if (usfm.Length > 0 && usfm[usfm.Length - 1] != '\n')
        usfm.Append("\r\n");
    if (
        usfm.Length > 3
        && usfm[usfm.Length - 3] == ' '
        && usfm[usfm.Length - 2] == '\r'
        && usfm[usfm.Length - 1] == '\n'
    )
    {
        usfm.Remove(usfm.Length - 3, 1);
    }
    return usfm.ToString();
}

/// <summary>
/// Finishes the current output line before a marker that starts a new one: strips one trailing space
/// and, when the tokens carry no whitespace of their own, appends a CR/LF.
/// </summary>
/// <remarks>The caller must ensure <paramref name="usfm"/> is not empty.</remarks>
private static void EndCurrentLine(StringBuilder usfm, UsfmToken prevToken, bool tokensHaveWhitespace)
{
    // Only ever remove an actual space. When tokens have their own whitespace, the space is kept if the
    // previous token consisted of nothing but whitespace.
    if (
        usfm[usfm.Length - 1] == ' '
        && ((prevToken != null && prevToken.ToUsfm().Trim() != "") || !tokensHaveWhitespace)
    )
    {
        usfm.Length--;
    }
    if (!tokensHaveWhitespace)
        usfm.Append("\r\n");
}

/// <summary>
/// Gets the next word in the usfm and advances the index past it
/// </summary>
Expand Down Expand Up @@ -361,7 +481,7 @@ ref string text
if (matchingToken == null)
return null;

UsfmTag matchingTag = _stylesheet.GetTag(matchingToken.NestlessMarker);
UsfmTag matchingTag = Stylesheet.GetTag(matchingToken.NestlessMarker);
if (
matchingTag.StyleType != UsfmStyleType.Character
&& matchingTag.StyleType != UsfmStyleType.Milestone
Expand Down
4 changes: 1 addition & 3 deletions tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using System;
using System.IO;
using System.IO.Compression;
using System.IO.Compression;

namespace SIL.Machine.Corpora
{
Expand Down
1 change: 1 addition & 0 deletions tests/SIL.Machine.Tests/Corpora/TestData/.gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.SFM eol=crlf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
\li2 verse four,
\v 5 Chapter one,
\li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five.
\c 2
\c 2
\s1 Chapter Two
\p
\v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one.
Expand All @@ -25,7 +25,7 @@
\v 6 Chapter two, verse \w six|strong="12345" \w*.
\v 6 Bad verse.
\v 5 Chapter two, verse five \rq (MAT 3:1)\rq*.
\v 7a Chapter two, verse seven A,
\v 7a Chapter two, verse seven A,
\s Section header
\p
\v 7b verse seven B.
Expand Down
3 changes: 1 addition & 2 deletions tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using System.Linq;
using System.Text;
using System.Text;
using NUnit.Framework;
using SIL.Scripture;

Expand Down
22 changes: 22 additions & 0 deletions tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
using NUnit.Framework;

namespace SIL.Machine.Corpora
{
    /// <summary>
    /// Tests for <see cref="UsfmTokenizer"/>.
    /// </summary>
    [TestFixture]
    public class UsfmTokenizerTests
    {
        /// <summary>
        /// Tokenizing the test project's Matthew file with the default tokenizer yields 136 tokens.
        /// </summary>
        [Test]
        public void Tokenize()
        {
            string text = ReadUsfm();

            IReadOnlyList<UsfmToken> result = new UsfmTokenizer().Tokenize(text);

            Assert.That(result.Count, Is.EqualTo(136));
        }

        /// <summary>
        /// Loads the USFM source of the Matthew test file from the USFM test project directory.
        /// </summary>
        private static string ReadUsfm()
        {
            string path = Path.Combine(CorporaTestHelpers.UsfmTestProjectPath, "41MATTes.SFM");
            return File.ReadAllText(path);
        }
    }
}

0 comments on commit b0fb126

Please sign in to comment.