From d8be7032113f016f072eb7f1246ef00c9cacc9a0 Mon Sep 17 00:00:00 2001
From: Damien Daspit
Date: Fri, 30 Aug 2024 16:28:33 -0500
Subject: [PATCH] Test versification mismatch

---
 .../Corpora/UsfmManualTests.cs                | 86 +++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs
index 88773c85..a562d683 100644
--- a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs
+++ b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs
@@ -1,4 +1,5 @@
 using System.IO.Compression;
+using System.Text;
 using System.Text.Json;
 using NUnit.Framework;
 
@@ -169,4 +170,89 @@ async Task GetUsfmAsync(string projectPath)
             await GetUsfmAsync(ParatextProjectPath);
         }
     }
+
+    /// <summary>
+    /// Checks that rows aligned from the source and target projects can be written back into the "SUS"
+    /// text of the source project, and that the updated USFM contains the expected text for verse 65.
+    /// </summary>
+    [Test]
+    public void Test()
+    {
+        var sourceCorpus = new ParatextTextCorpus(CorporaTestHelpers.UsfmSourceProjectPath);
+        var targetCorpus = new ParatextTextCorpus(CorporaTestHelpers.UsfmTargetProjectPath);
+
+        // The aligned refs come back as strings; parse them with the target corpus versification
+        // and sort the rows by their first ref before handing them to the updater.
+        var rows = AlignPretranslateCorpus(sourceCorpus.FilterTexts(["SUS"]), targetCorpus.FilterTexts(["SUS"]))
+            .Select(p =>
+                (
+                    Refs: (IReadOnlyList<ScriptureRef>)
+                        p.Refs.Select(r => ScriptureRef.Parse(r, targetCorpus.Versification)).ToArray(),
+                    p.Translation
+                )
+            )
+            .OrderBy(p => p.Refs[0]);
+
+        var updater = new FileParatextProjectTextUpdater(CorporaTestHelpers.UsfmSourceProjectPath);
+        string newUsfm = updater.UpdateUsfm("SUS", rows.ToArray(), stripAllText: true, preferExistingText: true);
+        Assert.That(
+            newUsfm,
+            Contains.Substring(
+                "\\v 65 et rex Astyages adpositus est ad patres suos et suscepit Cyrus Perses regnum eius"
+            )
+        );
+    }
+
+    /// <summary>
+    /// Aligns the source corpus against the target corpus and yields one entry per group of rows, using
+    /// the source text as the translation. Rows that continue a target range are merged into one entry.
+    /// </summary>
+    /// <param name="srcCorpus">The corpus whose text is emitted as the translation.</param>
+    /// <param name="trgCorpus">The corpus whose refs identify each emitted entry.</param>
+    /// <returns>The target refs, as strings, paired with the concatenated source text.</returns>
+    private static IEnumerable<(IReadOnlyList<string> Refs, string Translation)> AlignPretranslateCorpus(
+        ITextCorpus srcCorpus,
+        ITextCorpus trgCorpus
+    )
+    {
+        int rowCount = 0;
+        StringBuilder srcSegBuffer = new();
+        List<ScriptureRef> refs = [];
+        foreach (ParallelTextRow row in srcCorpus.AlignRows(trgCorpus, allSourceRows: true))
+        {
+            if (!row.IsTargetRangeStart && row.IsTargetInRange)
+            {
+                // Continuation of the current target range: accumulate into the pending entry.
+                refs.AddRange(row.TargetRefs.Cast<ScriptureRef>());
+                if (row.SourceText.Length > 0)
+                {
+                    if (srcSegBuffer.Length > 0)
+                        srcSegBuffer.Append(' ');
+                    srcSegBuffer.Append(row.SourceText);
+                }
+                rowCount++;
+            }
+            else
+            {
+                // A new group begins: flush the pending entry before starting the next one.
+                if (rowCount > 0)
+                {
+                    yield return (refs.Select(r => r.ToString()).ToArray(), srcSegBuffer.ToString());
+                    srcSegBuffer.Clear();
+                    refs.Clear();
+                    rowCount = 0;
+                }
+
+                refs.AddRange(row.TargetRefs.Cast<ScriptureRef>());
+                srcSegBuffer.Append(row.SourceText);
+                rowCount++;
+            }
+        }
+
+        // Emit whatever is still pending after the last row.
+        if (rowCount > 0)
+        {
+            yield return (refs.Select(r => r.ToString()).ToArray(), srcSegBuffer.ToString());
+        }
+    }
 }