diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 02669cb4..2a5c82ae 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -359,7 +359,7 @@ public void RunAsync_OnlyParseSelectedBooks_PretranslateOnBadBook() } [Test] - public async Task ParallelCorpusLogic() + public async Task ParallelCorpusAsync() { using TestEnvironment env = new(); var corpora = new List() @@ -508,6 +508,148 @@ public async Task ParallelCorpusLogic() }); } + [Test] + public async Task ParallelCorpusAsync_UseKeyTerms() + { + using TestEnvironment env = new(); + var corpora = new List() + { + new ParallelCorpus() + { + Id = "1", + SourceCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "LEV", + new() { } + } + }, + PretranslateChapters = new() + { + { + "1CH", + new() { } + } + } + }, + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + }, + PretranslateChapters = new() { } + }, + }, + TargetCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-target1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + } + }, + new() + { + Id = "_2", + Language = "en", + Files = new List { env.ParatextFile("pt-target2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + }, + { + "LEV", + new() { } + } + } + } + } + } + }; + await env.RunBuildJobAsync(corpora, useKeyTerms: true); + string source = await env.GetSourceExtractAsync(); + string target = await env.GetTargetExtractAsync(); + Assert.Multiple(() => + { + StringAssert.StartsWith( + @"Source one, chapter fourteen, verse fifty-five. Segment b. +Source one, chapter fourteen, verse fifty-six. +Source two, chapter one, verse one. +Source two, chapter one, verse two. +Source two, chapter one, verse three. +Source one, chapter one, verse four. +Source two, chapter one, verse five. Source two, chapter one, verse six. +Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. +Source two, chapter one, verse one. +", + source + ); + StringAssert.StartsWith( + @"Target two, chapter fourteen, verse fifty-five. +Target two, chapter fourteen, verse fifty-six. +Target one, chapter one, verse one. +Target one, chapter one, verse two. +Target one, chapter one, verse three. + +Target one, chapter one, verse five and six. +Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. + +", + target + ); + StringAssert.Contains("Abraham", source); + StringAssert.Contains("Abraham", target); + StringAssert.DoesNotContain("Zedekiah", source); + StringAssert.DoesNotContain("Zedekiah", target); + }); + JsonArray? pretranslations = await env.GetPretranslationsAsync(); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); + Assert.That( + pretranslations[2]!["translation"]!.ToString(), + Is.EqualTo("Source one, chapter twelve, verse one.") + ); + } + private class TestEnvironment : DisposableBase { private static readonly string TestDataPath = Path.Combine( diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + + diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + + diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 71d49a50..231d9083 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -36,14 +36,16 @@ public IEnumerable CreateTextCorpora(IReadOnlyList file return corpora; } - public IEnumerable CreateTermCorpora(IReadOnlyList files) + public IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> corpora + ) { - foreach (CorpusFile file in files) + foreach ((CorpusFile file, Dictionary>? chapters) in corpora) { switch (file.Format) { case FileFormat.Paratext: - yield return new ParatextBackupTermsCorpus(file.Location, ["PN"]); + yield return new ParatextBackupTermsCorpus(file.Location, ["PN"], chapters: chapters); break; } } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs index babe8c9b..0c9a82ab 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs @@ -3,5 +3,7 @@ public interface ICorpusService { IEnumerable CreateTextCorpora(IReadOnlyList files); - IEnumerable CreateTermCorpora(IReadOnlyList files); + IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> files + ); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index e75a2d59..a98f57c1 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -82,15 +82,21 @@ public void Preprocess( if (useKeyTerms) { - ITextCorpus? sourceTermCorpus = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList()) - .FirstOrDefault(); - ITextCorpus? targetTermCorpus = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList()) - .FirstOrDefault(); - if (sourceTermCorpus is not null && targetTermCorpus is not null) + ITextCorpus[]? sourceTermCorpora = _corpusService + .CreateTermCorpora( + corpus.SourceCorpora.SelectMany(sc => sc.Files.Select(f => (f, sc.TrainOnChapters))).ToArray() + ) + .ToArray(); + ITextCorpus[]? targetTermCorpora = _corpusService + .CreateTermCorpora( + corpus.TargetCorpora.SelectMany(tc => tc.Files.Select(f => (f, tc.TrainOnChapters))).ToArray() + ) + .ToArray(); + if (sourceTermCorpora is not null && targetTermCorpora is not null) { - IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus); + IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpora + .ChooseRandom(Seed) + .AlignRows(targetTermCorpora.ChooseFirst()); foreach (ParallelTextRow row in parallelKeyTermsCorpus) { train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1));