Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for detokenizing USFM #154

Merged
merged 1 commit into from
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 117 additions & 68 deletions src/SIL.Machine/Corpora/UsfmParser.cs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/SIL.Machine/Corpora/UsfmTextBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ protected override IEnumerable<TextRow> GetVersesInDocOrder()
{
string usfm = ReadUsfm();
var rowCollector = new TextRowCollector(this);
UsfmParser.Parse(_stylesheet, usfm, rowCollector, Versification, preserveWhitespace: _includeMarkers);
UsfmParser.Parse(usfm, rowCollector, _stylesheet, Versification, preserveWhitespace: _includeMarkers);
return rowCollector.Rows;
}

Expand Down
8 changes: 5 additions & 3 deletions src/SIL.Machine/Corpora/UsfmToken.cs
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ public int GetLength(bool includeNewlines = false, bool addSpaces = true)

if (!string.IsNullOrEmpty(Data))
{
if (Marker.Length > 0)
if (!addSpaces && (Marker.Length == 0 || Marker[Marker.Length - 1] != '*'))
totalLength++;
totalLength += Data.Length;
if (addSpaces)
Expand Down Expand Up @@ -225,7 +225,7 @@ public string ToUsfm(bool includeNewlines = false, bool addSpaces = true)

if (!string.IsNullOrEmpty(Data))
{
if (Marker.Length > 0)
if (!addSpaces && (Marker.Length == 0 || Marker[Marker.Length - 1] != '*'))
sb.Append(' ');
sb.Append(Data);
if (addSpaces)
Expand All @@ -236,11 +236,13 @@ public string ToUsfm(bool includeNewlines = false, bool addSpaces = true)
{
string attributes = ToAttributeString();
if (attributes != "")
{
sb.Append(attributes);
}
else
{
// remove space that was put after marker - not needed when there are no attributes.
sb.Length -= 1;
sb.Length--;
}
sb.Append(@"\*");
}
Expand Down
136 changes: 130 additions & 6 deletions src/SIL.Machine/Corpora/UsfmTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,41 @@
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace SIL.Machine.Corpora
{
/// <summary>
/// Selects which Unicode direction mark <c>UsfmTokenizer.Detokenize</c> writes into verse markers.
/// </summary>
public enum RtlReferenceOrder
{
// Detokenize leaves verse markers untouched when this value is used.
NotSet,
// Detokenize inserts U+200F (RIGHT-TO-LEFT MARK) after a verse number that is followed by
// punctuation/symbol and another digit.
BookChapterVerse,
// Detokenize inserts U+200E (LEFT-TO-RIGHT MARK) in the same position.
BookVerseChapter
}

public class UsfmTokenizer
{
private const char ZeroWidthSpace = '\u200B';

private readonly UsfmStylesheet _stylesheet;
private static readonly Regex RtlVerseRegex = new Regex(
@"[\u200E\u200F]*(\d+\w?)[\u200E\u200F]*([\p{P}\p{S}])[\u200E\u200F]*(?=\d)",
RegexOptions.Compiled
);

public UsfmTokenizer(UsfmStylesheet stylesheet)
/// <summary>
/// Initializes a tokenizer by constructing a <c>UsfmStylesheet</c> from <paramref name="stylesheetFileName"/>
/// and delegating to the constructor that takes a stylesheet instance.
/// </summary>
/// <param name="stylesheetFileName">Name passed to the <c>UsfmStylesheet</c> constructor; defaults to "usfm.sty".</param>
/// <param name="rtlReferenceOrder">Forwarded unchanged to the stylesheet-based constructor.</param>
public UsfmTokenizer(
string stylesheetFileName = "usfm.sty",
RtlReferenceOrder rtlReferenceOrder = RtlReferenceOrder.NotSet
)
: this(new UsfmStylesheet(stylesheetFileName), rtlReferenceOrder) { }

public UsfmTokenizer(UsfmStylesheet stylesheet, RtlReferenceOrder rtlReferenceOrder = RtlReferenceOrder.NotSet)
{
_stylesheet = stylesheet;
Stylesheet = stylesheet ?? new UsfmStylesheet("usfm.sty");
RtlReferenceOrder = rtlReferenceOrder;
}

public UsfmStylesheet Stylesheet { get; }
public RtlReferenceOrder RtlReferenceOrder { get; }

public IReadOnlyList<UsfmToken> Tokenize(string usfm, bool preserveWhitespace = false)
{
List<UsfmToken> tokens = new List<UsfmToken>();
Expand Down Expand Up @@ -112,7 +133,7 @@ ref text
}

// Lookup marker
UsfmTag tag = _stylesheet.GetTag(marker.TrimStart('+'));
UsfmTag tag = Stylesheet.GetTag(marker.TrimStart('+'));

// If starts with a plus and is not a character style or an end style, it is an unknown tag
if (
Expand All @@ -121,7 +142,7 @@ ref text
&& tag.StyleType != UsfmStyleType.End
)
{
tag = _stylesheet.GetTag(marker);
tag = Stylesheet.GetTag(marker);
}

string endMarker = tag.StyleType != UsfmStyleType.Milestone ? marker + "*" : tag.EndMarker;
Expand Down Expand Up @@ -276,6 +297,109 @@ ref text
return tokens;
}

/// <summary>
/// Serializes a sequence of USFM tokens back into a single USFM string.
/// </summary>
/// <param name="tokens">The tokens to write, in document order.</param>
/// <param name="tokensHaveWhitespace">
/// <c>true</c> if the tokens already contain the spaces and line breaks that should be emitted;
/// <c>false</c> to have this method put every book, chapter and paragraph marker, and every verse marker
/// that does not directly follow '[' or '(', on a new line terminated with CR/LF.
/// </param>
/// <returns>
/// The USFM text. A non-empty result never starts with a space or CR/LF and always ends with a line break
/// that is not preceded by a space.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="tokens"/> is <c>null</c>.</exception>
public string Detokenize(IEnumerable<UsfmToken> tokens, bool tokensHaveWhitespace = false)
{
    if (tokens == null)
        throw new System.ArgumentNullException(nameof(tokens));

    UsfmToken prevToken = null;
    var usfm = new StringBuilder();
    foreach (UsfmToken token in tokens)
    {
        string tokenUsfm;
        switch (token.Type)
        {
            case UsfmTokenType.Book:
            case UsfmTokenType.Chapter:
            case UsfmTokenType.Paragraph:
                // These markers always begin a new line (except at the very start of the output).
                if (usfm.Length > 0)
                    TrimSpaceAndBreakLine(usfm, prevToken, tokensHaveWhitespace);
                tokenUsfm = token.ToUsfm();
                break;

            case UsfmTokenType.Verse:
                // Add newline if after anything other than [ or (
                if (usfm.Length > 0 && usfm[usfm.Length - 1] != '[' && usfm[usfm.Length - 1] != '(')
                    TrimSpaceAndBreakLine(usfm, prevToken, tokensHaveWhitespace);

                tokenUsfm = tokensHaveWhitespace ? token.ToUsfm().Trim() : token.ToUsfm();

                if (RtlReferenceOrder != RtlReferenceOrder.NotSet)
                {
                    // Replace any direction marks around a punctuation/symbol character that sits between
                    // two numbers with a single mark placed right after the first number.
                    string directionMarker =
                        RtlReferenceOrder == RtlReferenceOrder.BookVerseChapter ? "\u200e" : "\u200f";
                    tokenUsfm = RtlVerseRegex.Replace(tokenUsfm, $"$1{directionMarker}$2");
                }
                break;

            case UsfmTokenType.Text:
                // Ensure spaces are preserved
                tokenUsfm = token.ToUsfm();
                if (tokensHaveWhitespace && usfm.Length > 0 && usfm[usfm.Length - 1] == ' ')
                {
                    bool prevHasContent = prevToken != null && prevToken.ToUsfm().Trim() != "";
                    bool startsWithSpace = tokenUsfm.Length > 0 && tokenUsfm[0] == ' ';
                    // Ordinal comparison: this is a literal character check, not a linguistic one.
                    bool startsWithLineBreak = tokenUsfm.StartsWith("\r\n", System.StringComparison.Ordinal);
                    if ((startsWithSpace && prevHasContent) || startsWithLineBreak)
                    {
                        // The text supplies its own leading whitespace, so drop the space already written.
                        usfm.Length--;
                    }
                    else
                    {
                        // Keep the space already written and avoid doubling it.
                        tokenUsfm = tokenUsfm.TrimStart(' ');
                    }
                }
                break;

            default:
                tokenUsfm = token.ToUsfm();
                break;
        }

        usfm.Append(tokenUsfm);
        prevToken = token;
    }

    // Make sure begins without space or CR/LF
    if (usfm.Length > 0 && usfm[0] == ' ')
        usfm.Remove(0, 1);
    if (usfm.Length > 0 && usfm[0] == '\r')
    {
        // Only remove the LF as well when it is actually there; a lone CR must not take the next
        // character with it (or run past the end of the buffer).
        usfm.Remove(0, usfm.Length > 1 && usfm[1] == '\n' ? 2 : 1);
    }

    // Make sure ends without space and with a CR/LF
    if (usfm.Length > 0 && usfm[usfm.Length - 1] == ' ')
        usfm.Length--;
    if (usfm.Length > 0 && usfm[usfm.Length - 1] != '\n')
        usfm.Append("\r\n");
    if (
        usfm.Length > 3
        && usfm[usfm.Length - 3] == ' '
        && usfm[usfm.Length - 2] == '\r'
        && usfm[usfm.Length - 1] == '\n'
    )
    {
        usfm.Remove(usfm.Length - 3, 1);
    }
    return usfm.ToString();
}

/// <summary>
/// Strips a single trailing space from <paramref name="usfm"/> where appropriate and, when the tokens do
/// not carry their own whitespace, terminates the current line with CR/LF.
/// </summary>
/// <remarks>The caller must ensure that <paramref name="usfm"/> is not empty.</remarks>
private static void TrimSpaceAndBreakLine(StringBuilder usfm, UsfmToken prevToken, bool tokensHaveWhitespace)
{
    // Only ever remove a space. Removing the last character unconditionally would corrupt output whose
    // previous token does not end in a space (for example a marker ending in '*').
    bool endsWithSpace = usfm[usfm.Length - 1] == ' ';
    bool prevHasContent = prevToken != null && prevToken.ToUsfm().Trim() != "";
    if (endsWithSpace && (prevHasContent || !tokensHaveWhitespace))
        usfm.Length--;

    if (!tokensHaveWhitespace)
        usfm.Append("\r\n");
}

/// <summary>
/// Gets the next word in the usfm and advances the index past it
/// </summary>
Expand Down Expand Up @@ -361,7 +485,7 @@ ref string text
if (matchingToken == null)
return null;

UsfmTag matchingTag = _stylesheet.GetTag(matchingToken.NestlessMarker);
UsfmTag matchingTag = Stylesheet.GetTag(matchingToken.NestlessMarker);
if (
matchingTag.StyleType != UsfmStyleType.Character
&& matchingTag.StyleType != UsfmStyleType.Milestone
Expand Down
4 changes: 1 addition & 3 deletions tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using System;
using System.IO;
using System.IO.Compression;
using System.IO.Compression;

namespace SIL.Machine.Corpora
{
Expand Down
1 change: 1 addition & 0 deletions tests/SIL.Machine.Tests/Corpora/TestData/.gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.SFM eol=crlf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
\li2 verse four,
\v 5 Chapter one,
\li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five.
\c 2
\c 2
\s1 Chapter Two
\p
\v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one.
Expand All @@ -25,7 +25,7 @@
\v 6 Chapter two, verse \w six|strong="12345" \w*.
\v 6 Bad verse.
\v 5 Chapter two, verse five \rq (MAT 3:1)\rq*.
\v 7a Chapter two, verse seven A,
\v 7a Chapter two, verse seven A,
\s Section header
\p
\v 7b verse seven B.
Expand Down
3 changes: 1 addition & 2 deletions tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using System.Linq;
using System.Text;
using System.Text;
using NUnit.Framework;
using SIL.Scripture;

Expand Down
47 changes: 47 additions & 0 deletions tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
using NUnit.Framework;

namespace SIL.Machine.Corpora
{
    /// <summary>
    /// Exercises <see cref="UsfmTokenizer"/> against the 41MATTes.SFM file of the USFM test project.
    /// </summary>
    [TestFixture]
    public class UsfmTokenizerTests
    {
        /// <summary>
        /// Tokenizing the test file yields the expected number of tokens and the expected
        /// book, text, verse and note tokens at known positions.
        /// </summary>
        [Test]
        public void Tokenize()
        {
            string source = ReadUsfm();
            IReadOnlyList<UsfmToken> tokens = new UsfmTokenizer().Tokenize(source);

            Assert.That(tokens, Has.Count.EqualTo(136));

            AssertMarkerToken(tokens[0], UsfmTokenType.Book, "id", "MAT");

            UsfmToken textToken = tokens[10];
            Assert.That(textToken.Type, Is.EqualTo(UsfmTokenType.Text));
            Assert.That(textToken.Text, Is.EqualTo("Chapter One "));

            AssertMarkerToken(tokens[11], UsfmTokenType.Verse, "v", "1");
            AssertMarkerToken(tokens[20], UsfmTokenType.Note, "f", "+");
        }

        /// <summary>
        /// Tokenizing the test file and detokenizing the result reproduces the file content exactly.
        /// </summary>
        [Test]
        public void Detokenize()
        {
            string source = ReadUsfm();
            UsfmTokenizer tokenizer = new UsfmTokenizer();

            string roundTripped = tokenizer.Detokenize(tokenizer.Tokenize(source));

            Assert.That(roundTripped, Is.EqualTo(source));
        }

        // Checks the type, marker and data of a single marker token, in that order.
        private static void AssertMarkerToken(UsfmToken token, UsfmTokenType type, string marker, string data)
        {
            Assert.That(token.Type, Is.EqualTo(type));
            Assert.That(token.Marker, Is.EqualTo(marker));
            Assert.That(token.Data, Is.EqualTo(data));
        }

        // Reads the whole USFM test file into a string.
        private static string ReadUsfm() =>
            File.ReadAllText(Path.Combine(CorporaTestHelpers.UsfmTestProjectPath, "41MATTes.SFM"));
    }
}
Loading