From 282c473d98fd1931ddd8e355371888c956ffc3d1 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 5 Nov 2024 17:44:28 -0500 Subject: [PATCH] Fix test; add corpora extensions test --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 2 +- .../Corpora/NParallelTextCorpus.cs | 16 +- .../Corpora/CorporaExtensionsTests.cs | 150 +++++++++++++++++- .../Corpora/NParallelTextCorpusTests.cs | 43 +++++ 4 files changed, 204 insertions(+), 7 deletions(-) diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index b2247a972..0c6c4228d 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -564,7 +564,7 @@ public MergedCorpus(NParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule public override IEnumerable GetRows(IEnumerable textIds) { - foreach (NParallelTextRow nRow in _corpus.GetRows()) + foreach (NParallelTextRow nRow in _corpus.GetRows(textIds)) { if (nRow.N == 0 || nRow.IsEmpty) continue; diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index 22a2e91ac..ad5fc73ad 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -354,7 +354,10 @@ private IEnumerable CreateRows( } else { - refs.Add(CorrectVersification(refRefs, i)); + if (Corpora[i].IsScripture()) + refs.Add(CorrectVersification(refRefs, i)); + else + refs.Add(new object[] { }); flags.Add(forceInRange != null && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None); } } @@ -381,7 +384,7 @@ private IEnumerable CreateMinRefRows( .ToList(); List alreadyYielded = new List(); - + TextRow[] textRows; foreach (int i in minRefIndexes) { TextRow textRow = currentRows[i]; @@ -394,7 +397,7 @@ private IEnumerable CreateMinRefRows( alreadyYielded.Add(i); foreach (TextRow sameRefRow in sameRefRows) { - var textRows = new TextRow[N]; + textRows = new TextRow[N]; textRows[i] = textRow; textRows[j] = sameRefRow; foreach ( @@ -407,13 +410,16 @@ NParallelTextRow row in CreateRows(rangeInfo, textRows, forceInRange: forceInRan } } } + textRows = new TextRow[N]; + var forceCurrentInRange = new bool[N]; foreach (int i in minRefIndexes.Where(i => AllRowsList[i]).Except(alreadyYielded)) { TextRow textRow = currentRows[i]; - var textRows = new TextRow[N]; textRows[i] = textRow; - var forceCurrentInRange = new bool[N]; forceCurrentInRange[i] = forceCurrentInRange[i]; + } + if (textRows.Any(tr => tr != null)) + { foreach (NParallelTextRow row in CreateRows(rangeInfo, textRows, forceCurrentInRange)) { yield return row; diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs index 29b645b9a..d813aff44 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs @@ -1,4 +1,5 @@ -using NUnit.Framework; +using System.Text.Json; +using NUnit.Framework; using SIL.Scripture; namespace SIL.Machine.Corpora; @@ -64,4 +65,151 @@ public void ExtractScripture() Assert.That(origRef, Is.EqualTo(new VerseRef("MAT 2:12", ScrVers.Original))); Assert.That(corpusRef, Is.EqualTo(new VerseRef("MAT 2:12", corpus.Versification))); } + + [Test] + public void MergedCorpus_SelectFirst() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 1, "source 1 segment 1 ."), TextRow("text1", 3) }) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var mergedCorpus = nParallelCorpus.SelectFirst(); + var rows = mergedCorpus.ToArray(); + Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[1].Text, Is.EqualTo("source 2 segment 2 .")); + Assert.That(rows[2].Text, Is.EqualTo("source 3 segment 3 .")); + } + + [Test] + public void MergedCorpus_SelectRandom_Seed123456() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 1 segment 1 ."), + TextRow("text1", 2, "source 1 segment 2 ."), + TextRow("text1", 3, "source 1 segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3, "source 2 segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var mergedCorpus = nParallelCorpus.SelectRandom(123456); + var rows = mergedCorpus.ToArray(); + Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.Multiple(() => + { + Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[1].Text, Is.EqualTo("source 1 segment 2 .")); + Assert.That(rows[2].Text, Is.EqualTo("source 1 segment 3 .")); + }); + } + + [Test] + public void MergedCorpus_SelectRandom_Seed4501() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 1 segment 1 ."), + TextRow("text1", 2, "source 1 segment 2 ."), + TextRow("text1", 3, "source 1 segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3, "source 2 segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var mergedCorpus = nParallelCorpus.SelectRandom(4501); + var rows = mergedCorpus.ToArray(); + Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.Multiple(() => + { + Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[1].Text, Is.EqualTo("source 2 segment 2 .")); + Assert.That(rows[2].Text, Is.EqualTo("source 3 segment 3 .")); + }); + } + + private static TextRow TextRow( + string textId, + object rowRef, + string text = "", + TextRowFlags flags = TextRowFlags.SentenceStart + ) + { + return new TextRow(textId, rowRef) + { + Segment = text.Length == 0 ? Array.Empty() : text.Split(), + Flags = flags + }; + } } diff --git a/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs index cb1b4f4ba..ee3a9150f 100644 --- a/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs @@ -163,6 +163,49 @@ public void GetRows_ThreeCorpora_MissingRows_SomeAllRows() Assert.That(rows[1].GetIsSentenceStart(1), Is.False); } + [Test] + public void GetRows_ThreeCorpora_MissingRows_AllAllRows_MissingMiddle() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[1].NRefs.All(r => r.Count == 0 || (int)r[0] == 2)); + Assert.That(rows[1].NSegments.All(r => r.Count == 0 || r.SequenceEqual("source segment 2 .".Split()))); + Assert.That(rows[1].GetIsSentenceStart(1), Is.True); + } + [Test] public void GetRows_ThreeCorpora_MissingRows_MissingLastRows() {