diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index 0e632bf96..32a463fe0 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -16,21 +16,35 @@ namespace SIL.Machine.Corpora public class UsfmParser { public static void Parse( - UsfmStylesheet stylesheet, string usfm, IUsfmParserHandler handler, + string stylesheetFileName = "usfm.sty", ScrVers versification = null, bool preserveWhitespace = false ) { - var parser = new UsfmParser(stylesheet, usfm, handler, versification, preserveWhitespace); + Parse(usfm, handler, new UsfmStylesheet(stylesheetFileName), versification, preserveWhitespace); + } + + public static void Parse( + string usfm, + IUsfmParserHandler handler, + UsfmStylesheet stylesheet = null, + ScrVers versification = null, + bool preserveWhitespace = false + ) + { + var parser = new UsfmParser( + usfm, + handler, + stylesheet ?? new UsfmStylesheet("usfm.sty"), + versification, + preserveWhitespace + ); parser.ProcessTokens(); } private static readonly Regex OptBreakSplitter = new Regex("(//)", RegexOptions.Compiled); - private readonly bool _tokensPreserveWhitespace; - - private readonly IUsfmParserHandler _handler; /// /// Number of tokens to skip over because have been processed in advance @@ -39,33 +53,64 @@ public static void Parse( private int _skip = 0; public UsfmParser( - UsfmStylesheet stylesheet, IReadOnlyList tokens, IUsfmParserHandler handler = null, + string stylesheetFileName = "usfm.sty", ScrVers versification = null, bool tokensPreserveWhitespace = false ) - { - State = new UsfmParserState(stylesheet, versification ?? ScrVers.English, tokens); - _handler = handler; - _tokensPreserveWhitespace = tokensPreserveWhitespace; - } + : this(tokens, handler, new UsfmStylesheet(stylesheetFileName), versification, tokensPreserveWhitespace) { } + + public UsfmParser( + IReadOnlyList tokens, + IUsfmParserHandler handler = null, + UsfmStylesheet stylesheet = null, + ScrVers versification = null, + bool tokensPreserveWhitespace = false + ) + : this( + new UsfmParserState( + stylesheet ?? new UsfmStylesheet("usfm.sty"), + versification ?? ScrVers.English, + tokens + ), + handler, + tokensPreserveWhitespace + ) { } public UsfmParser( - UsfmStylesheet stylesheet, string usfm, IUsfmParserHandler handler = null, + string stylesheetFileName = "usfm.sty", ScrVers versification = null, - bool preserveWhitespace = false + bool tokensPreserveWhitespace = false + ) + : this(usfm, handler, new UsfmStylesheet(stylesheetFileName), versification, tokensPreserveWhitespace) { } + + public UsfmParser( + string usfm, + IUsfmParserHandler handler = null, + UsfmStylesheet stylesheet = null, + ScrVers versification = null, + bool tokensPreserveWhitespace = false ) : this( - stylesheet, - GetTokens(stylesheet, usfm, preserveWhitespace), + new UsfmParserState( + stylesheet ?? new UsfmStylesheet("usfm.sty"), + versification ?? ScrVers.English, + GetTokens(stylesheet, usfm, tokensPreserveWhitespace) + ), handler, - versification, - preserveWhitespace + tokensPreserveWhitespace ) { } + private UsfmParser(UsfmParserState state, IUsfmParserHandler handler, bool tokensPreserveWhitespace) + { + State = state; + Handler = handler; + TokensPreserveWhitespace = tokensPreserveWhitespace; + } + private static IReadOnlyList GetTokens( UsfmStylesheet stylesheet, string usfm, @@ -76,6 +121,10 @@ bool preserveWhitespace return tokenizer.Tokenize(usfm, preserveWhitespace); } + public IUsfmParserHandler Handler { get; } + + public bool TokensPreserveWhitespace { get; } + /// /// Gets the current parser state. Note: Will change with each token parsed /// @@ -98,12 +147,12 @@ public bool ProcessToken() // If past end if (State.Index >= State.Tokens.Count - 1) { - _handler?.EndUsfm(State); + Handler?.EndUsfm(State); return false; } else if (State.Index < 0) { - _handler?.StartUsfm(State); + Handler?.StartUsfm(State); } // Move to next token @@ -111,7 +160,7 @@ public bool ProcessToken() // Update verse offset with previous token (since verse offset is from start of current token) if (State.PrevToken != null) - State.VerseOffset += State.PrevToken.GetLength(addSpaces: !_tokensPreserveWhitespace); + State.VerseOffset += State.PrevToken.GetLength(addSpaces: !TokensPreserveWhitespace); // Skip over tokens that are to be skipped, ensuring that // SpecialToken state is true. @@ -132,8 +181,8 @@ public bool ProcessToken() if (tokenType == UsfmTokenType.Unknown) tokenType = DetermineUnknownTokenType(); - if (_handler != null && !string.IsNullOrEmpty(token.Marker)) - _handler.GotMarker(State, token.Marker); + if (Handler != null && !string.IsNullOrEmpty(token.Marker)) + Handler.GotMarker(State, token.Marker); // Close open elements switch (tokenType) @@ -237,8 +286,8 @@ public bool ProcessToken() // Unmatched end marker if (unmatched) - if (_handler != null) - _handler.Unmatched(State, token.Marker); + if (Handler != null) + Handler.Unmatched(State, token.Marker); break; } @@ -263,8 +312,8 @@ public bool ProcessToken() State.VerseOffset = 0; // Book start. - if (_handler != null) - _handler.StartBook(State, token.Marker, code); + if (Handler != null) + Handler.StartBook(State, token.Marker, code); break; case UsfmTokenType.Chapter: // Get alternate chapter number @@ -309,8 +358,8 @@ public bool ProcessToken() if (State.VerseRef.ChapterNum != 1) State.VerseOffset = 0; - if (_handler != null) - _handler.Chapter(State, token.Data, token.Marker, altChapter, pubChapter); + if (Handler != null) + Handler.Chapter(State, token.Data, token.Marker, altChapter, pubChapter); break; case UsfmTokenType.Verse: string pubVerse = null; @@ -344,8 +393,8 @@ public bool ProcessToken() State.VerseRef = vref; State.VerseOffset = 0; - if (_handler != null) - _handler.Verse(State, token.Data, token.Marker, altVerse, pubVerse); + if (Handler != null) + Handler.Verse(State, token.Data, token.Marker, altVerse, pubVerse); break; case UsfmTokenType.Paragraph: // Handle special case of table rows @@ -355,15 +404,15 @@ public bool ProcessToken() if (State.Stack.All(e => e.Type != UsfmElementType.Table)) { State.Push(new UsfmParserElement(UsfmElementType.Table, null)); - if (_handler != null) - _handler.StartTable(State); + if (Handler != null) + Handler.StartTable(State); } State.Push(new UsfmParserElement(UsfmElementType.Row, token.Marker)); // Row start - if (_handler != null) - _handler.StartRow(State, token.Marker); + if (Handler != null) + Handler.StartRow(State, token.Marker); break; } @@ -386,8 +435,8 @@ public bool ProcessToken() _skip += 3; } - if (_handler != null) - _handler.StartSidebar(State, token.Marker, sidebarCategory); + if (Handler != null) + Handler.StartSidebar(State, token.Marker, sidebarCategory); break; } @@ -399,9 +448,9 @@ public bool ProcessToken() while (State.Stack.Count > 0) CloseElement(State.Peek().Type == UsfmElementType.Sidebar); } - else if (_handler != null) + else if (Handler != null) { - _handler.Unmatched(State, token.Marker); + Handler.Unmatched(State, token.Marker); } break; } @@ -409,8 +458,8 @@ public bool ProcessToken() State.Push(new UsfmParserElement(UsfmElementType.Para, token.Marker)); // Paragraph opening - if (_handler != null) - _handler.StartPara(State, token.Marker, token.Type == UsfmTokenType.Unknown, token.Attributes); + if (Handler != null) + Handler.StartPara(State, token.Marker, token.Type == UsfmTokenType.Unknown, token.Attributes); break; case UsfmTokenType.Character: // Handle special case of table cells (treated as special character style) @@ -425,8 +474,8 @@ public bool ProcessToken() UsfmStylesheet.IsCellRange(token.Marker, out string baseMarker, out int colspan); State.Push(new UsfmParserElement(UsfmElementType.Cell, baseMarker)); - if (_handler != null) - _handler.StartCell(State, baseMarker, align, colspan); + if (Handler != null) + Handler.StartCell(State, baseMarker, align, colspan); break; } @@ -439,8 +488,8 @@ public bool ProcessToken() _skip += 2; - if (_handler != null) - _handler.Ref(State, token.Marker, display, target); + if (Handler != null) + Handler.Ref(State, token.Marker, display, target); break; } @@ -457,9 +506,9 @@ public bool ProcessToken() actualMarker = token.Marker; State.Push(new UsfmParserElement(UsfmElementType.Char, actualMarker, token.Attributes)); - if (_handler != null) + if (Handler != null) { - _handler.StartChar( + Handler.StartChar( State, actualMarker, token.Type == UsfmTokenType.Unknown || invalidMarker, @@ -484,8 +533,8 @@ public bool ProcessToken() State.Push(new UsfmParserElement(UsfmElementType.Note, token.Marker)); - if (_handler != null) - _handler.StartNote(State, token.Marker, token.Data, noteCategory); + if (Handler != null) + Handler.StartNote(State, token.Marker, token.Data, noteCategory); break; case UsfmTokenType.Text: string text = token.Text; @@ -507,7 +556,7 @@ public bool ProcessToken() text = text.Substring(0, text.Length - 1); } - if (_handler != null) + if (Handler != null) { // Replace ~ with nbsp text = text.Replace('~', '\u00A0'); @@ -516,9 +565,9 @@ public bool ProcessToken() foreach (string str in OptBreakSplitter.Split(text)) { if (str == "//") - _handler.OptBreak(State); + Handler.OptBreak(State); else - _handler.Text(State, str); + Handler.Text(State, str); } } break; @@ -526,7 +575,7 @@ public bool ProcessToken() case UsfmTokenType.Milestone: case UsfmTokenType.MilestoneEnd: // currently, parse state doesn't need to be update, so just inform the handler about the milestone. - _handler?.Milestone(State, token.Marker, token.Type == UsfmTokenType.Milestone, token.Attributes); + Handler?.Milestone(State, token.Marker, token.Type == UsfmTokenType.Milestone, token.Attributes); break; } @@ -589,36 +638,36 @@ private void CloseElement(bool closed = false) switch (element.Type) { case UsfmElementType.Book: - if (_handler != null) - _handler.EndBook(State, element.Marker); + if (Handler != null) + Handler.EndBook(State, element.Marker); break; case UsfmElementType.Para: - if (_handler != null) - _handler.EndPara(State, element.Marker); + if (Handler != null) + Handler.EndPara(State, element.Marker); break; case UsfmElementType.Char: - if (_handler != null) - _handler.EndChar(State, element.Marker, element.Attributes, closed); + if (Handler != null) + Handler.EndChar(State, element.Marker, element.Attributes, closed); break; case UsfmElementType.Note: - if (_handler != null) - _handler.EndNote(State, element.Marker, closed); + if (Handler != null) + Handler.EndNote(State, element.Marker, closed); break; case UsfmElementType.Table: - if (_handler != null) - _handler.EndTable(State); + if (Handler != null) + Handler.EndTable(State); break; case UsfmElementType.Row: - if (_handler != null) - _handler.EndRow(State, element.Marker); + if (Handler != null) + Handler.EndRow(State, element.Marker); break; case UsfmElementType.Cell: - if (_handler != null) - _handler.EndCell(State, element.Marker); + if (Handler != null) + Handler.EndCell(State, element.Marker); break; case UsfmElementType.Sidebar: - if (_handler != null) - _handler.EndSidebar(State, element.Marker, closed); + if (Handler != null) + Handler.EndSidebar(State, element.Marker, closed); break; } } diff --git a/src/SIL.Machine/Corpora/UsfmTextBase.cs b/src/SIL.Machine/Corpora/UsfmTextBase.cs index af75f639f..3180a3eb2 100644 --- a/src/SIL.Machine/Corpora/UsfmTextBase.cs +++ b/src/SIL.Machine/Corpora/UsfmTextBase.cs @@ -30,7 +30,7 @@ protected override IEnumerable GetVersesInDocOrder() { string usfm = ReadUsfm(); var rowCollector = new TextRowCollector(this); - UsfmParser.Parse(_stylesheet, usfm, rowCollector, Versification, preserveWhitespace: _includeMarkers); + UsfmParser.Parse(usfm, rowCollector, _stylesheet, Versification, preserveWhitespace: _includeMarkers); return rowCollector.Rows; } diff --git a/src/SIL.Machine/Corpora/UsfmToken.cs b/src/SIL.Machine/Corpora/UsfmToken.cs index 765b3fb50..46ec1bd9c 100644 --- a/src/SIL.Machine/Corpora/UsfmToken.cs +++ b/src/SIL.Machine/Corpora/UsfmToken.cs @@ -172,7 +172,7 @@ public int GetLength(bool includeNewlines = false, bool addSpaces = true) if (!string.IsNullOrEmpty(Data)) { - if (Marker.Length > 0) + if (!addSpaces && (Marker.Length == 0 || Marker[Marker.Length - 1] != '*')) totalLength++; totalLength += Data.Length; if (addSpaces) @@ -225,7 +225,7 @@ public string ToUsfm(bool includeNewlines = false, bool addSpaces = true) if (!string.IsNullOrEmpty(Data)) { - if (Marker.Length > 0) + if (!addSpaces && (Marker.Length == 0 || Marker[Marker.Length - 1] != '*')) sb.Append(' '); sb.Append(Data); if (addSpaces) @@ -236,11 +236,13 @@ public string ToUsfm(bool includeNewlines = false, bool addSpaces = true) { string attributes = ToAttributeString(); if (attributes != "") + { sb.Append(attributes); + } else { // remove space that was put after marker - not needed when there are no attributes. - sb.Length -= 1; + sb.Length--; } sb.Append(@"\*"); } diff --git a/src/SIL.Machine/Corpora/UsfmTokenizer.cs b/src/SIL.Machine/Corpora/UsfmTokenizer.cs index e44dd91e3..564c4b4e9 100644 --- a/src/SIL.Machine/Corpora/UsfmTokenizer.cs +++ b/src/SIL.Machine/Corpora/UsfmTokenizer.cs @@ -2,20 +2,41 @@ using System.Collections.Generic; using System.Linq; using System.Text; +using System.Text.RegularExpressions; namespace SIL.Machine.Corpora { + public enum RtlReferenceOrder + { + NotSet, + BookChapterVerse, + BookVerseChapter + } + public class UsfmTokenizer { private const char ZeroWidthSpace = '\u200B'; - private readonly UsfmStylesheet _stylesheet; + private static readonly Regex RtlVerseRegex = new Regex( + @"[\u200E\u200F]*(\d+\w?)[\u200E\u200F]*([\p{P}\p{S}])[\u200E\u200F]*(?=\d)", + RegexOptions.Compiled + ); - public UsfmTokenizer(UsfmStylesheet stylesheet) + public UsfmTokenizer( + string stylesheetFileName = "usfm.sty", + RtlReferenceOrder rtlReferenceOrder = RtlReferenceOrder.NotSet + ) + : this(new UsfmStylesheet(stylesheetFileName), rtlReferenceOrder) { } + + public UsfmTokenizer(UsfmStylesheet stylesheet, RtlReferenceOrder rtlReferenceOrder = RtlReferenceOrder.NotSet) { - _stylesheet = stylesheet; + Stylesheet = stylesheet ?? new UsfmStylesheet("usfm.sty"); + RtlReferenceOrder = rtlReferenceOrder; } + public UsfmStylesheet Stylesheet { get; } + public RtlReferenceOrder RtlReferenceOrder { get; } + public IReadOnlyList Tokenize(string usfm, bool preserveWhitespace = false) { List tokens = new List(); @@ -112,7 +133,7 @@ ref text } // Lookup marker - UsfmTag tag = _stylesheet.GetTag(marker.TrimStart('+')); + UsfmTag tag = Stylesheet.GetTag(marker.TrimStart('+')); // If starts with a plus and is not a character style or an end style, it is an unknown tag if ( @@ -121,7 +142,7 @@ ref text && tag.StyleType != UsfmStyleType.End ) { - tag = _stylesheet.GetTag(marker); + tag = Stylesheet.GetTag(marker); } string endMarker = tag.StyleType != UsfmStyleType.Milestone ? marker + "*" : tag.EndMarker; @@ -276,6 +297,109 @@ ref text return tokens; } + public string Detokenize(IEnumerable tokens, bool tokensHaveWhitespace = false) + { + UsfmToken prevToken = null; + var usfm = new StringBuilder(); + foreach (UsfmToken token in tokens) + { + string tokenUsfm = ""; + switch (token.Type) + { + case UsfmTokenType.Book: + case UsfmTokenType.Chapter: + case UsfmTokenType.Paragraph: + // Strip space from end of string before CR/LF + if (usfm.Length > 0) + { + if ( + usfm[usfm.Length - 1] == ' ' && (prevToken != null && prevToken.ToUsfm().Trim() != "") + || !tokensHaveWhitespace + ) + { + usfm.Length--; + } + if (!tokensHaveWhitespace) + usfm.Append("\r\n"); + } + tokenUsfm = token.ToUsfm(); + break; + case UsfmTokenType.Verse: + // Add newline if after anything other than [ or ( + if (usfm.Length > 0 && usfm[usfm.Length - 1] != '[' && usfm[usfm.Length - 1] != '(') + { + if ( + usfm[usfm.Length - 1] == ' ' && (prevToken != null && prevToken.ToUsfm().Trim() != "") + || !tokensHaveWhitespace + ) + { + usfm.Length--; + } + if (!tokensHaveWhitespace) + usfm.Append("\r\n"); + } + + tokenUsfm = tokensHaveWhitespace ? token.ToUsfm().Trim() : token.ToUsfm(); + + if (RtlReferenceOrder != RtlReferenceOrder.NotSet) + { + string directionMarker = + RtlReferenceOrder == RtlReferenceOrder.BookVerseChapter ? "\u200e" : "\u200f"; + tokenUsfm = RtlVerseRegex.Replace(tokenUsfm, $"$1{directionMarker}$2"); + } + break; + case UsfmTokenType.Text: + // Ensure spaces are preserved + tokenUsfm = token.ToUsfm(); + if (tokensHaveWhitespace && usfm.Length > 0 && usfm[usfm.Length - 1] == ' ') + { + if ( + ( + tokenUsfm.Length > 0 + && tokenUsfm[0] == ' ' + && prevToken != null + && prevToken.ToUsfm().Trim() != "" + ) || tokenUsfm.StartsWith("\r\n") + ) + { + usfm.Length--; + } + else + { + tokenUsfm = tokenUsfm.TrimStart(' '); + } + } + break; + default: + tokenUsfm = token.ToUsfm(); + break; + } + + usfm.Append(tokenUsfm); + prevToken = token; + } + + // Make sure begins without space or CR/LF + if (usfm.Length > 0 && usfm[0] == ' ') + usfm.Remove(0, 1); + if (usfm.Length > 0 && usfm[0] == '\r') + usfm.Remove(0, 2); + + // Make sure ends without space and with a CR/LF + if (usfm.Length > 0 && usfm[usfm.Length - 1] == ' ') + usfm.Length--; + if (usfm.Length > 0 && usfm[usfm.Length - 1] != '\n') + usfm.Append("\r\n"); + if ( + usfm.Length > 3 + && usfm[usfm.Length - 3] == ' ' + && usfm[usfm.Length - 2] == '\r' + && usfm[usfm.Length - 1] == '\n' + ) + usfm.Remove(usfm.Length - 3, 1); + return usfm.ToString(); + } + /// /// Gets the next word in the usfm and advances the index past it /// @@ -361,7 +485,7 @@ ref string text if (matchingToken == null) return null; - UsfmTag matchingTag = _stylesheet.GetTag(matchingToken.NestlessMarker); + UsfmTag matchingTag = Stylesheet.GetTag(matchingToken.NestlessMarker); if ( matchingTag.StyleType != UsfmStyleType.Character && matchingTag.StyleType != UsfmStyleType.Milestone diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs b/tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs index 9c19308e7..26deaf022 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs @@ -1,6 +1,4 @@ -using System; -using System.IO; -using System.IO.Compression; +using System.IO.Compression; namespace SIL.Machine.Corpora { diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/.gitattributes b/tests/SIL.Machine.Tests/Corpora/TestData/.gitattributes new file mode 100644 index 000000000..e29803af9 --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/TestData/.gitattributes @@ -0,0 +1 @@ +*.SFM eol=crlf diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM index 83a1f6792..8cb81d7a4 100644 --- a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM +++ b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM @@ -14,7 +14,7 @@ \li2 verse four, \v 5 Chapter one, \li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five. -\c 2 +\c 2 \s1 Chapter Two \p \v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one. @@ -25,7 +25,7 @@ \v 6 Chapter two, verse \w six|strong="12345" \w*. \v 6 Bad verse. \v 5 Chapter two, verse five \rq (MAT 3:1)\rq*. -\v 7a Chapter two, verse seven A, +\v 7a Chapter two, verse seven A, \s Section header \p \v 7b verse seven B. diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs index 26fe4eba7..c7a162b51 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs @@ -1,5 +1,4 @@ -using System.Linq; -using System.Text; +using System.Text; using NUnit.Framework; using SIL.Scripture; diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs new file mode 100644 index 000000000..ff6aaf30a --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs @@ -0,0 +1,47 @@ +using NUnit.Framework; + +namespace SIL.Machine.Corpora +{ + [TestFixture] + public class UsfmTokenizerTests + { + [Test] + public void Tokenize() + { + string usfm = ReadUsfm(); + var tokenizer = new UsfmTokenizer(); + IReadOnlyList tokens = tokenizer.Tokenize(usfm); + Assert.That(tokens, Has.Count.EqualTo(136)); + + Assert.That(tokens[0].Type, Is.EqualTo(UsfmTokenType.Book)); + Assert.That(tokens[0].Marker, Is.EqualTo("id")); + Assert.That(tokens[0].Data, Is.EqualTo("MAT")); + + Assert.That(tokens[10].Type, Is.EqualTo(UsfmTokenType.Text)); + Assert.That(tokens[10].Text, Is.EqualTo("Chapter One ")); + + Assert.That(tokens[11].Type, Is.EqualTo(UsfmTokenType.Verse)); + Assert.That(tokens[11].Marker, Is.EqualTo("v")); + Assert.That(tokens[11].Data, Is.EqualTo("1")); + + Assert.That(tokens[20].Type, Is.EqualTo(UsfmTokenType.Note)); + Assert.That(tokens[20].Marker, Is.EqualTo("f")); + Assert.That(tokens[20].Data, Is.EqualTo("+")); + } + + [Test] + public void Detokenize() + { + string usfm = ReadUsfm(); + var tokenizer = new UsfmTokenizer(); + IReadOnlyList tokens = tokenizer.Tokenize(usfm); + string result = tokenizer.Detokenize(tokens); + Assert.That(result, Is.EqualTo(usfm)); + } + + private static string ReadUsfm() + { + return File.ReadAllText(Path.Combine(CorporaTestHelpers.UsfmTestProjectPath, "41MATTes.SFM")); + } + } +}