diff --git a/src/Machine/src/Serval.Machine.Shared/Services/CorpusService.cs b/src/Machine/src/Serval.Machine.Shared/Services/CorpusService.cs index 17d562ad..322eb45d 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/CorpusService.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/CorpusService.cs @@ -36,14 +36,16 @@ public IEnumerable CreateTextCorpora(IReadOnlyList file return corpora; } - public IEnumerable CreateTermCorpora(IReadOnlyList files) + public IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> corpora + ) { - foreach (CorpusFile file in files) + foreach ((CorpusFile file, Dictionary>? chapters) in corpora) { switch (file.Format) { case FileFormat.Paratext: - yield return new ParatextBackupTermsCorpus(file.Location, ["PN"]); + yield return new ParatextBackupTermsCorpus(file.Location, ["PN"], chapters: chapters); break; } } diff --git a/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs b/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs index bbcc9de3..60fe8b37 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs @@ -3,5 +3,7 @@ public interface ICorpusService { IEnumerable CreateTextCorpora(IReadOnlyList files); - IEnumerable CreateTermCorpora(IReadOnlyList files); + IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> corpora + ); } diff --git a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs index d9e433ce..e9458545 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs @@ -221,15 +221,19 @@ row.Ref is not ScriptureRef sr if ((bool?)buildOptionsObject?["use_key_terms"] ?? true) { - ITextCorpus? sourceTermCorpus = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList()) + ITextCorpus? sourceTermCorpora = _corpusService + .CreateTermCorpora( + corpus.SourceCorpora.SelectMany(sc => sc.Files.Select(f => (f, sc.TrainOnChapters))).ToList() + ) .FirstOrDefault(); - ITextCorpus? targetTermCorpus = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList()) + ITextCorpus? targetTermCorpora = _corpusService + .CreateTermCorpora( + corpus.TargetCorpora.SelectMany(tc => tc.Files.Select(f => (f, tc.TrainOnChapters))).ToList() + ) .FirstOrDefault(); - if (sourceTermCorpus is not null && targetTermCorpus is not null) + if (sourceTermCorpora is not null && targetTermCorpora is not null) { - IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus); + IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpora.AlignRows(targetTermCorpora); foreach (ParallelTextRow row in parallelKeyTermsCorpus) { await sourceTrainWriter.WriteAsync($"{row.SourceText}\n"); diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 539b9c4c..f231d437 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -338,7 +338,7 @@ public void RunAsync_OnlyParseSelectedBooks_PretranslateOnBadBook() } [Test] - public async Task ParallelCorpusLogic() + public async Task ParallelCorpusAsync() { using TestEnvironment env = new(); var corpora = new List() @@ -478,6 +478,149 @@ await env.GetTargetExtractAsync(), ); } + [Test] + public async Task ParallelCorpusAsync_UseKeyTerms() + { + using TestEnvironment env = new(); + var corpora = new List() + { + new ParallelCorpus() + { + Id = "1", + SourceCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "LEV", + new() { } + } + }, + PretranslateChapters = new() + { + { + "1CH", + new() { } + } + } + }, + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + }, + }, + }, + TargetCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-target1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + } + }, + new() + { + Id = "_2", + Language = "en", + Files = new List { env.ParatextFile("pt-target2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + }, + { + "LEV", + new() { } + } + } + } + } + } + }; + await env.RunBuildJobAsync(corpora, useKeyTerms: true); + string source = await env.GetSourceExtractAsync(); + string target = await env.GetTargetExtractAsync(); + Assert.Multiple(() => + { + StringAssert.StartsWith( + @"Source one, chapter fourteen, verse fifty-five. Segment b. +Source one, chapter fourteen, verse fifty-six. +Source one, chapter one, verse one. +Source two, chapter one, verse two. +Source two, chapter one, verse three. +Source two, chapter one, verse four. +Source two, chapter one, verse five. Source two, chapter one, verse six. +Source two, chapter one, verse seven. Source two, chapter one, verse eight. +Source two, chapter one, verse nine. Source two, chapter one, verse ten. +Source two, chapter one, verse one. +", + source + ); + StringAssert.StartsWith( + @"Target two, chapter fourteen, verse fifty-five. +Target two, chapter fourteen, verse fifty-six. +Target one, chapter one, verse one. +Target one, chapter one, verse two. +Target one, chapter one, verse three. + +Target one, chapter one, verse five and six. +Target one, chapter one, verse seven and eight. +Target one, chapter one, verse nine and ten. + +", + target + ); + StringAssert.Contains("Abraham", source); + StringAssert.Contains("Abraham", target); + StringAssert.DoesNotContain("Zedekiah", source); + StringAssert.DoesNotContain("Zedekiah", target); + }); + JsonArray? pretranslations = await env.GetPretranslationsAsync(); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations!.Count, Is.EqualTo(37), pretranslations.ToJsonString()); + Assert.That( + pretranslations[2]!["translation"]!.ToString(), + Is.EqualTo("Source one, chapter twelve, verse one.") + ); + } + private class TestEnvironment : DisposableBase { private static readonly string TestDataPath = Path.Combine( diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + + diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + +