From d1cc368690d0b5569a7b23ea72abd4d5e8300686 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Mon, 16 Dec 2024 12:36:30 -0500 Subject: [PATCH] the ideas down - more refining needed --- src/SIL.Machine/Corpora/IUsfmParserHandler.cs | 10 ++ .../Corpora/ParatextProjectTextUpdaterBase.cs | 12 ++- .../ScriptureRefUsfmParserHandlerBase.cs | 19 +++- .../Corpora/UpdateUsfmParserHandler.cs | 94 ++++++++++++------- src/SIL.Machine/Corpora/UsfmParser.cs | 10 ++ .../Corpora/UsfmParserHandlerBase.cs | 4 + src/SIL.Machine/Corpora/UsfmParserState.cs | 7 +- src/SIL.Machine/Corpora/UsfmStylesheet.cs | 66 +++++++++++++ .../Corpora/TestData/usfm/Tes/41MATTes.SFM | 9 +- .../Corpora/UpdateUsfmParserHandlerTests.cs | 76 ++++++++++++--- .../Corpora/UsfmManualTests.cs | 8 +- .../Corpora/UsfmTokenizerTests.cs | 2 +- 12 files changed, 254 insertions(+), 63 deletions(-) diff --git a/src/SIL.Machine/Corpora/IUsfmParserHandler.cs b/src/SIL.Machine/Corpora/IUsfmParserHandler.cs index ea27a8a57..74a341f27 100644 --- a/src/SIL.Machine/Corpora/IUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/IUsfmParserHandler.cs @@ -78,6 +78,16 @@ IReadOnlyList attributes /// void EndNote(UsfmParserState state, string marker, bool closed); + /// + /// Start of a note text + /// + void StartNoteText(UsfmParserState state, string marker); + + /// + /// End of a note text + /// + void EndNoteText(UsfmParserState state, string marker); + /// /// Start of a table /// diff --git a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs index ea86eb60d..7441f94d8 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs @@ -23,7 +23,9 @@ public string UpdateUsfm( string bookId, IReadOnlyList<(IReadOnlyList, string)> rows, string fullName = null, - UpdateUsfmBehavior behavior = UpdateUsfmBehavior.PreferExisting + UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting, + UpdateUsfmIntraVerseMarkerBehavior noteBehavior = UpdateUsfmIntraVerseMarkerBehavior.Preserve, + UpdateUsfmIntraVerseMarkerBehavior formattingBehavior = UpdateUsfmIntraVerseMarkerBehavior.Strip ) { string fileName = _settings.GetBookFileName(bookId); @@ -36,7 +38,13 @@ public string UpdateUsfm( usfm = reader.ReadToEnd(); } - var handler = new UpdateUsfmParserHandler(rows, fullName is null ? null : $"- {fullName}", behavior); + var handler = new UpdateUsfmParserHandler( + rows, + fullName is null ? null : $"- {fullName}", + textBehavior, + noteBehavior, + formattingBehavior + ); try { UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification); diff --git a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs index 363209ed9..0d5dc8d71 100644 --- a/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs +++ b/src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs @@ -18,6 +18,7 @@ public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase private readonly Stack _curElements; private readonly Stack _curTextType; private bool _duplicateVerse = false; + private bool _inNoteText = false; protected ScriptureRefUsfmParserHandlerBase() { @@ -158,13 +159,12 @@ public override void StartNote(UsfmParserState state, string marker, string call // if we hit a note in a verse paragraph and we aren't in a verse, then start a non-verse segment CheckConvertVerseParaToNonVerse(state); NextElement(marker); - StartNoteText(state); } } public override void EndNote(UsfmParserState state, string marker, bool closed) { - if (CurrentTextType == ScriptureTextType.Note && !_duplicateVerse) + if (_inNoteText && !_duplicateVerse) EndNoteText(state); } @@ -192,6 +192,17 @@ IReadOnlyList attributes CheckConvertVerseParaToNonVerse(state); } + public override void EndChar( + UsfmParserState state, + string marker, + IReadOnlyList attributes, + bool closed + ) + { + if (_inNoteText && !_duplicateVerse && UsfmStylesheet.IsNoteOrCrossReferencePart(marker)) + EndNoteText(state); + } + protected virtual void StartVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) { } protected virtual void EndVerseText(UsfmParserState state, IReadOnlyList scriptureRefs) { } @@ -231,16 +242,18 @@ private void EndNonVerseText(UsfmParserState state) _curTextType.Pop(); } - private void StartNoteText(UsfmParserState state) + public void StartNoteText(UsfmParserState state) { _curTextType.Push(ScriptureTextType.Note); StartNoteText(state, CreateNonVerseRef()); + _inNoteText = true; } private void EndNoteText(UsfmParserState state) { EndNoteText(state, CreateNonVerseRef()); _curTextType.Pop(); + _inNoteText = false; } private void UpdateVerseRef(VerseRef verseRef, string marker) diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 03c9bb12b..f86a57106 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -1,16 +1,23 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Text.RegularExpressions; namespace SIL.Machine.Corpora { - public enum UpdateUsfmBehavior + public enum UpdateUsfmTextBehavior { PreferExisting, PreferNew, StripExisting } + public enum UpdateUsfmIntraVerseMarkerBehavior + { + Preserve, + Strip, + } + /*** * This is a USFM parser handler that can be used to replace the existing text in a USFM file with the specified * text. @@ -21,15 +28,20 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase private readonly List _tokens; private readonly List _newTokens; private readonly string _idText; - private readonly UpdateUsfmBehavior _behavior; + private readonly UpdateUsfmTextBehavior _textBehavior; + private readonly UpdateUsfmIntraVerseMarkerBehavior _noteBehavior; + private readonly UpdateUsfmIntraVerseMarkerBehavior _formattingBehavior; private readonly Stack _replace; + private readonly Regex _nonAlpha = new Regex("[^a-zA-Z0-9]"); private int _rowIndex; private int _tokenIndex; public UpdateUsfmParserHandler( IReadOnlyList<(IReadOnlyList, string)> rows = null, string idText = null, - UpdateUsfmBehavior behavior = UpdateUsfmBehavior.PreferExisting + UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting, + UpdateUsfmIntraVerseMarkerBehavior noteBehavior = UpdateUsfmIntraVerseMarkerBehavior.Preserve, + UpdateUsfmIntraVerseMarkerBehavior formattingBehavior = UpdateUsfmIntraVerseMarkerBehavior.Strip ) { _rows = rows ?? Array.Empty<(IReadOnlyList, string)>(); @@ -37,7 +49,9 @@ public UpdateUsfmParserHandler( _newTokens = new List(); _idText = idText; _replace = new Stack(); - _behavior = behavior; + _textBehavior = textBehavior; + _noteBehavior = noteBehavior; + _formattingBehavior = formattingBehavior; } public IReadOnlyList Tokens => _tokens; @@ -176,8 +190,10 @@ bool closed ) { // strip out char-style markers in verses that are being replaced - if (closed && ReplaceWithNewTokens(state)) + if (ReplaceWithNewTokens(state, closed: closed, endCharacter: true)) SkipTokens(state); + else + CollectTokens(state); base.EndChar(state, marker, attributes, closed); } @@ -196,8 +212,10 @@ public override void StartNote(UsfmParserState state, string marker, string call public override void EndNote(UsfmParserState state, string marker, bool closed) { // strip out notes in verses that are being replaced - if (closed && ReplaceWithNewTokens(state)) + if (ReplaceWithNewTokens(state, closed: closed, endNote: true)) SkipTokens(state); + else + CollectTokens(state); base.EndNote(state, marker, closed); } @@ -271,25 +289,7 @@ protected override void EndNonVerseText(UsfmParserState state, ScriptureRef scri protected override void StartNoteText(UsfmParserState state, ScriptureRef scriptureRef) { IReadOnlyList rowTexts = AdvanceRows(new[] { scriptureRef }); - var newTokens = new List(); - if (rowTexts.Count > 0) - { - newTokens.Add(state.Token); - newTokens.Add(new UsfmToken(UsfmTokenType.Character, "ft", null, "ft*")); - for (int i = 0; i < rowTexts.Count; i++) - { - string text = rowTexts[i]; - if (i < rowTexts.Count - 1) - text += " "; - newTokens.Add(new UsfmToken(text)); - } - newTokens.Add(new UsfmToken(UsfmTokenType.End, state.Token.EndMarker, null, null)); - PushNewTokens(newTokens); - } - else - { - PushTokensAsPrevious(); - } + PushNewTokens(rowTexts.Select(t => new UsfmToken(t + " "))); } protected override void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef) @@ -362,9 +362,16 @@ private void SkipTokens(UsfmParserState state) _tokenIndex = state.Index + 1 + state.SpecialTokenCount; } - private bool ReplaceWithNewTokens(UsfmParserState state) + private bool ReplaceWithNewTokens( + UsfmParserState state, + bool closed = true, + bool endCharacter = false, + bool endNote = false + ) { + bool stripExistingText = _textBehavior == UpdateUsfmTextBehavior.StripExisting; bool newText = _replace.Count > 0 && _replace.Peek(); + int tokenEnd = state.Index + state.SpecialTokenCount; bool existingText = false; for (int index = _tokenIndex; index <= tokenEnd; index++) @@ -376,15 +383,37 @@ private bool ReplaceWithNewTokens(UsfmParserState state) } } bool useNewTokens = - _behavior == UpdateUsfmBehavior.StripExisting + stripExistingText || (newText && !existingText) - || (newText && _behavior == UpdateUsfmBehavior.PreferNew); + || (newText && _textBehavior == UpdateUsfmTextBehavior.PreferNew && !state.IsReferenceText); - if (useNewTokens) + if (useNewTokens && _newTokens.Count > 0) _tokens.AddRange(_newTokens); _newTokens.Clear(); - return useNewTokens; + + bool skipTokens = useNewTokens && closed; + + bool withinNewText = _replace.Any(r => r); + + if (withinNewText) + { + string bareMarker = _nonAlpha.Replace(state.Token.Marker ?? "", ""); + if (state.Token.Type == UsfmTokenType.Character || endCharacter) + { + var behavior = UsfmStylesheet.IsNoteOrCrossReferencePart(bareMarker) + ? _noteBehavior + : _formattingBehavior; + skipTokens = stripExistingText || behavior == UpdateUsfmIntraVerseMarkerBehavior.Strip; + } + + if (state.NoteTag != null || endNote) + { + skipTokens = stripExistingText || _noteBehavior == UpdateUsfmIntraVerseMarkerBehavior.Strip; + } + } + + return skipTokens; } private void PushNewTokens(IEnumerable tokens) @@ -393,11 +422,6 @@ private void PushNewTokens(IEnumerable tokens) _newTokens.AddRange(tokens); } - private void PushTokensAsPrevious() - { - _replace.Push(_replace.Peek()); - } - private void PopNewTokens() { _replace.Pop(); diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index 8028b2fa3..9d366c7c2 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -486,6 +486,11 @@ public bool ProcessToken() Handler?.Ref(State, token.Marker, display, target); break; } + if (IsNoteTextStart(token) && State.NoteTag != null) + { + Handler?.StartNoteText(State, token.Marker); + break; + } string actualMarker; bool invalidMarker = false; @@ -672,5 +677,10 @@ private bool IsRef(UsfmToken token) && (State.Tokens[State.Index + 2].Marker == token.EndMarker) && (token.Marker == "ref"); } + + private bool IsNoteTextStart(UsfmToken token) + { + return token.Marker == "ft"; + } } } diff --git a/src/SIL.Machine/Corpora/UsfmParserHandlerBase.cs b/src/SIL.Machine/Corpora/UsfmParserHandlerBase.cs index 760750945..85e939922 100644 --- a/src/SIL.Machine/Corpora/UsfmParserHandlerBase.cs +++ b/src/SIL.Machine/Corpora/UsfmParserHandlerBase.cs @@ -57,6 +57,10 @@ public virtual void StartNote(UsfmParserState state, string marker, string calle public virtual void EndNote(UsfmParserState state, string marker, bool closed) { } + public virtual void StartNoteText(UsfmParserState state, string marker) { } + + public virtual void EndNoteText(UsfmParserState state, string marker) { } + public virtual void StartTable(UsfmParserState state) { } public virtual void EndTable(UsfmParserState state) { } diff --git a/src/SIL.Machine/Corpora/UsfmParserState.cs b/src/SIL.Machine/Corpora/UsfmParserState.cs index 1ad2c85b0..e8cd08bc3 100644 --- a/src/SIL.Machine/Corpora/UsfmParserState.cs +++ b/src/SIL.Machine/Corpora/UsfmParserState.cs @@ -76,10 +76,9 @@ public UsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOn /// public int SpecialTokenCount { get; internal set; } - /// - /// True if the token processed is a figure. - /// - public bool IsFigure => CharTag?.Marker == "fig"; + public bool IsReferenceText => + !(CharTag is null) + && (UsfmStylesheet.IsReference(CharTag.Marker) || UsfmStylesheet.IsFigure(CharTag.Marker)); /// /// Current paragraph tag or null for none. diff --git a/src/SIL.Machine/Corpora/UsfmStylesheet.cs b/src/SIL.Machine/Corpora/UsfmStylesheet.cs index e63f8c96b..25ed975e4 100644 --- a/src/SIL.Machine/Corpora/UsfmStylesheet.cs +++ b/src/SIL.Machine/Corpora/UsfmStylesheet.cs @@ -11,6 +11,57 @@ namespace SIL.Machine.Corpora public class UsfmStylesheet { private static readonly Regex CellRangeRegex = new Regex(@"^(t[ch][cr]?[1-5])-([2-5])$", RegexOptions.Compiled); + private static readonly HashSet ReferenceTags = new HashSet + { + "fl", + "fr", + "fv", + "r", + "rq", + "va", + "vp", + "xo", + "xop", + "xot", + "xnt", + "xdc", + "xt", + "zpa-xb", + "zpa-xc", + "zpa-xv" + }; + + private static readonly HashSet NoteTextTags = new HashSet { "ft", }; + + private static readonly HashSet NoteAndCrossReferencePartTags = new HashSet + { + "f", + "fe", + "fr", + "fq", + "fqa", + "fk", + "fw", + "fp", + "fv", + "ft", + "fdc", + "fm", + "x", + "xo", + "xk", + "xq", + "xt", + "xta", + "xop", + "xot", + "xnt", + "xdc", + "rq", + "zpa-xb", + "zpa-xc", + "zpa-xv" + }; private static readonly Dictionary JustificationMappings = new Dictionary< string, @@ -111,6 +162,21 @@ public static bool IsCellRange(string tag, out string baseMarker, out int colSpa return false; } + public static bool IsReference(string tag) + { + return !(tag is null) && ReferenceTags.Contains(tag); + } + + public static bool IsNoteOrCrossReferencePart(string tag) + { + return !(tag is null) && NoteAndCrossReferencePartTags.Contains(tag); + } + + public static bool IsFigure(string tag) + { + return tag == "fig"; + } + private static IEnumerable GetEmbeddedStylesheet(string fileName) { using ( diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM index 672b93daa..4a8067b8e 100644 --- a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM +++ b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM @@ -8,13 +8,13 @@ \p and a \weirdtaglookingthing that is not an actual tag. \c 1 \s Chapter One -\v 1 Chapter \pn one\+pro WON\+pro*\pn*, verse one.\f + \fr 1:1: \ft This is a footnote.\f* +\v 1 Chapter \pn one\+pro WON\+pro*\pn*, verse \f + \fr 1:1: \ft This is a footnote for v1.\f*one. \li1 \v 2 \bd C\bd*hapter one, -\li2 verse\f + \fr 1:2: \ft This is a footnote.\f* two. +\li2 verse\f + \fr 1:2: \ft This is a footnote for v2.\f* two. \v 3 Chapter one \w*, \li2 verse three. -\v 4 Chapter one,  +\v 4 Chapter one, \li2 verse four, \v 5 Chapter one, \li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five. @@ -22,6 +22,7 @@ \v 7 \v 8 \c 2 +\r (Mark 1:2-3; Luke 4:5-6) \tr \tc1 Row one, column one. \tc2 Row one, column two. \tr \tc1 Row two, column one. \tc2 Row two, column two. \s1 Chapter \it Two \it* @@ -38,7 +39,7 @@ \p \v 6 Chapter two, verse \w six|strong="12345" \w*. \p -\v 6 Bad verse. \x - \xo abc\xt 123\x* and more content. +\v 6 Bad verse. \x - \xo 2:3-4 \xt Cool Book 3:24 \xta The annotation \x* and more content. \p \v 5 Chapter two, verse five \rq (MAT 3:1)\rq*. \v 7a Chapter two, verse seven A, diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index d27873d3b..48085a812 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -15,7 +15,26 @@ public void GetUsfm_Verse_CharStyle() string target = UpdateUsfm(rows); Assert.That(target, Contains.Substring("\\id MAT - Test\r\n")); - Assert.That(target, Contains.Substring("\\v 1 First verse of the first chapter.\r\n")); + Assert.That( + target, + Contains.Substring( + "\\v 1 First verse of the first chapter. \\f + \\fr 1:1: \\ft This is a footnote for v1.\\f*\r\n\\li1\r\n\\v 2" + ) + ); + } + + [Test] + public void GetUsfm_StripNotesWithUpdatedVerseText() + { + var rows = new List<(IReadOnlyList, string)> + { + (ScrRef("MAT 1:1"), "First verse of the first chapter.") + }; + + string target = UpdateUsfm(rows, noteBehavior: UpdateUsfmIntraVerseMarkerBehavior.Strip); + Assert.That(target, Contains.Substring("\\id MAT - Test\r\n")); + Assert.That(target, Contains.Substring("\\ip An introduction to Matthew\\fe + \\ft This is an endnote.\\fe*")); + Assert.That(target, Contains.Substring("\\v 1 First verse of the first chapter.\r\n\\li1\r\n\\v 2")); } [Test] @@ -28,7 +47,7 @@ public void GetUsfm_IdText() [Test] public void GetUsfm_StripAllText() { - string target = UpdateUsfm(behavior: UpdateUsfmBehavior.StripExisting); + string target = UpdateUsfm(textBehavior: UpdateUsfmTextBehavior.StripExisting); Assert.That(target, Contains.Substring("\\id MAT\r\n")); Assert.That(target, Contains.Substring("\\v 1\r\n")); Assert.That(target, Contains.Substring("\\s\r\n")); @@ -43,7 +62,7 @@ public void GetUsfm_PreferExisting() (ScrRef("MAT 1:6"), "Text 6"), (ScrRef("MAT 1:7"), "Text 7"), }; - string target = UpdateUsfm(rows, behavior: UpdateUsfmBehavior.PreferExisting); + string target = UpdateUsfm(rows, textBehavior: UpdateUsfmTextBehavior.PreferExisting); Assert.That(target, Contains.Substring("\\id MAT - Test\r\n")); Assert.That(target, Contains.Substring("\\v 6 Verse 6 content.\r\n")); Assert.That(target, Contains.Substring("\\v 7 Text 7\r\n")); @@ -57,37 +76,68 @@ public void GetUsfm_PreferRows() (ScrRef("MAT 1:6"), "Text 6"), (ScrRef("MAT 1:7"), "Text 7"), }; - string target = UpdateUsfm(rows, behavior: UpdateUsfmBehavior.PreferNew); + string target = UpdateUsfm(rows, textBehavior: UpdateUsfmTextBehavior.PreferNew); Assert.That(target, Contains.Substring("\\id MAT - Test\r\n")); Assert.That(target, Contains.Substring("\\v 6 Text 6\r\n")); Assert.That(target, Contains.Substring("\\v 7 Text 7\r\n")); } [Test] - public void GetUsfm_Verse_SkipNote() + public void GetUsfm_Verse_StripNote() { var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 2:1"), "First verse of the second chapter.") }; - string target = UpdateUsfm(rows); + string target = UpdateUsfm(rows, noteBehavior: UpdateUsfmIntraVerseMarkerBehavior.Strip); Assert.That(target, Contains.Substring("\\v 1 First verse of the second chapter.\r\n")); } [Test] - public void GetUsfm_Verse_ReplaceNote() + public void GetUsfm_Verse_ReplaceNoteKeepReference() { var rows = new List<(IReadOnlyList, string)> { (ScrRef("MAT 2:1"), "First verse of the second chapter."), - (ScrRef("MAT 2:1/1:f"), "This is a new footnote.") + (ScrRef("MAT 2:1/1:f/1:ft"), "This is a new footnote.") }; string target = UpdateUsfm(rows); Assert.That( target, - Contains.Substring("\\v 1 First verse of the second chapter. \\f + \\ft This is a new footnote.\\f*\r\n") + Contains.Substring( + "\\v 1 First verse of the second chapter. \\f + \\fr 2:1: \\ft This is a new footnote.\\f*\r\n" + ) + ); + } + + [Test] + public void GetUsfm_Verse_PreserveFiguresAndReferences() + { + var rows = new List<(IReadOnlyList, string)> + { + // fig + (ScrRef("MAT 1:5"), "Fifth verse of the first chapter."), + (ScrRef("MAT 1:5/1:fig"), "figure text not updated"), + // rq + (ScrRef("MAT 2:5/1:rq"), "quote reference not updated"), + // r + (ScrRef("MAT 2/1:r"), "parallel reference not updated"), + // xo + (ScrRef("MAT 2:6/3:xo"), "Cross reference not update"), + // xt + (ScrRef("MAT 2:6/4:xt"), "cross reference - target reference not updated"), + // xta + (ScrRef("MAT 2:6/5:xta"), "cross reference annotation updated"), + }; + + string target = UpdateUsfm(rows); + Assert.That( + target, + Contains.Substring( + "\\v 1 First verse of the second chapter. \\f + \\fr 2:1: \\ft This is a new footnote.\\f*\r\n" + ) ); } @@ -438,18 +488,20 @@ private static string UpdateUsfm( IReadOnlyList<(IReadOnlyList, string)>? rows = null, string? source = null, string? idText = null, - UpdateUsfmBehavior behavior = UpdateUsfmBehavior.PreferNew + UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferNew, + UpdateUsfmIntraVerseMarkerBehavior noteBehavior = UpdateUsfmIntraVerseMarkerBehavior.Preserve, + UpdateUsfmIntraVerseMarkerBehavior formattingBehavior = UpdateUsfmIntraVerseMarkerBehavior.Strip ) { if (source is null) { var updater = new FileParatextProjectTextUpdater(CorporaTestHelpers.UsfmTestProjectPath); - return updater.UpdateUsfm("MAT", rows, idText, behavior); + return updater.UpdateUsfm("MAT", rows, idText, textBehavior, noteBehavior, formattingBehavior); } else { source = source.Trim().ReplaceLineEndings("\r\n") + "\r\n"; - var updater = new UpdateUsfmParserHandler(rows, idText, behavior); + var updater = new UpdateUsfmParserHandler(rows, idText, textBehavior, noteBehavior, formattingBehavior); UsfmParser.Parse(source, updater); return updater.GetUsfm(); } diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs index b2dbdcf32..63fe388a2 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs @@ -49,7 +49,11 @@ string sfmFileName in Directory string bookId; if (!targetSettings.IsBookFileName(sfmFileName, out bookId)) continue; - string newUsfm = updater.UpdateUsfm(bookId, pretranslations, behavior: UpdateUsfmBehavior.StripExisting); + string newUsfm = updater.UpdateUsfm( + bookId, + pretranslations, + textBehavior: UpdateUsfmTextBehavior.StripExisting + ); Assert.That(newUsfm, Is.Not.Null); } } @@ -150,7 +154,7 @@ async Task GetUsfmAsync(string projectPath) string newUsfm = updater.UpdateUsfm( bookId, pretranslations, - behavior: UpdateUsfmBehavior.StripExisting + textBehavior: UpdateUsfmTextBehavior.StripExisting ); Assert.That(newUsfm, Is.Not.Null); } diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs index d77c9035b..c03db1587 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs @@ -11,7 +11,7 @@ public void Tokenize() string usfm = ReadUsfm(); var tokenizer = new UsfmTokenizer(); IReadOnlyList tokens = tokenizer.Tokenize(usfm); - Assert.That(tokens, Has.Count.EqualTo(236)); + Assert.That(tokens, Has.Count.EqualTo(240)); Assert.That(tokens[0].Type, Is.EqualTo(UsfmTokenType.Book)); Assert.That(tokens[0].Marker, Is.EqualTo("id"));