From 4f37c2365d6406a308b477a117aaf19425c010f7 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 26 Nov 2024 15:42:45 -0500 Subject: [PATCH 01/12] Use chapter-filtering for terms --- .../Services/PreprocessBuildJobTests.cs | 144 +++++++++++++++++- .../data/pt-source1/TermRenderings.xml | 7 + .../data/pt-target1/TermRenderings.xml | 7 + .../Services/CorpusService.cs | 8 +- .../Services/ICorpusService.cs | 4 +- .../ParallelCorpusPreprocessingService.cs | 22 ++- 6 files changed, 179 insertions(+), 13 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 02669cb4..2a5c82ae 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -359,7 +359,7 @@ public void RunAsync_OnlyParseSelectedBooks_PretranslateOnBadBook() } [Test] - public async Task ParallelCorpusLogic() + public async Task ParallelCorpusAsync() { using TestEnvironment env = new(); var corpora = new List() @@ -508,6 +508,148 @@ public async Task ParallelCorpusLogic() }); } + [Test] + public async Task ParallelCorpusAsync_UseKeyTerms() + { + using TestEnvironment env = new(); + var corpora = new List() + { + new ParallelCorpus() + { + Id = "1", + SourceCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "LEV", + new() { } + } + }, + PretranslateChapters = new() + { + { + "1CH", + new() { } + } + } + }, + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + }, + PretranslateChapters = new() { } + }, + }, + TargetCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-target1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + } + }, + new() + { + Id = "_2", + Language = "en", + Files = new List { env.ParatextFile("pt-target2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + }, + { + "LEV", + new() { } + } + } + } + } + } + }; + await env.RunBuildJobAsync(corpora, useKeyTerms: true); + string source = await env.GetSourceExtractAsync(); + string target = await env.GetTargetExtractAsync(); + Assert.Multiple(() => + { + StringAssert.StartsWith( + @"Source one, chapter fourteen, verse fifty-five. Segment b. +Source one, chapter fourteen, verse fifty-six. +Source two, chapter one, verse one. +Source two, chapter one, verse two. +Source two, chapter one, verse three. +Source one, chapter one, verse four. +Source two, chapter one, verse five. Source two, chapter one, verse six. +Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. +Source two, chapter one, verse one. +", + source + ); + StringAssert.StartsWith( + @"Target two, chapter fourteen, verse fifty-five. +Target two, chapter fourteen, verse fifty-six. +Target one, chapter one, verse one. +Target one, chapter one, verse two. +Target one, chapter one, verse three. + +Target one, chapter one, verse five and six. +Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. + +", + target + ); + StringAssert.Contains("Abraham", source); + StringAssert.Contains("Abraham", target); + StringAssert.DoesNotContain("Zedekiah", source); + StringAssert.DoesNotContain("Zedekiah", target); + }); + JsonArray? pretranslations = await env.GetPretranslationsAsync(); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); + Assert.That( + pretranslations[2]!["translation"]!.ToString(), + Is.EqualTo("Source one, chapter twelve, verse one.") + ); + } + private class TestEnvironment : DisposableBase { private static readonly string TestDataPath = Path.Combine( diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + + diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + + diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 71d49a50..231d9083 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -36,14 +36,16 @@ public IEnumerable CreateTextCorpora(IReadOnlyList file return corpora; } - public IEnumerable CreateTermCorpora(IReadOnlyList files) + public IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> corpora + ) { - foreach (CorpusFile file in files) + foreach ((CorpusFile file, Dictionary>? chapters) in corpora) { switch (file.Format) { case FileFormat.Paratext: - yield return new ParatextBackupTermsCorpus(file.Location, ["PN"]); + yield return new ParatextBackupTermsCorpus(file.Location, ["PN"], chapters: chapters); break; } } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs index babe8c9b..0c9a82ab 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs @@ -3,5 +3,7 @@ public interface ICorpusService { IEnumerable CreateTextCorpora(IReadOnlyList files); - IEnumerable CreateTermCorpora(IReadOnlyList files); + IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> files + ); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index e75a2d59..a98f57c1 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -82,15 +82,21 @@ public void Preprocess( if (useKeyTerms) { - ITextCorpus? sourceTermCorpus = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList()) - .FirstOrDefault(); - ITextCorpus? targetTermCorpus = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList()) - .FirstOrDefault(); - if (sourceTermCorpus is not null && targetTermCorpus is not null) + ITextCorpus[]? sourceTermCorpora = _corpusService + .CreateTermCorpora( + corpus.SourceCorpora.SelectMany(sc => sc.Files.Select(f => (f, sc.TrainOnChapters))).ToArray() + ) + .ToArray(); + ITextCorpus[]? targetTermCorpora = _corpusService + .CreateTermCorpora( + corpus.TargetCorpora.SelectMany(tc => tc.Files.Select(f => (f, tc.TrainOnChapters))).ToArray() + ) + .ToArray(); + if (sourceTermCorpora is not null && targetTermCorpora is not null) { - IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus); + IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpora + .ChooseRandom(Seed) + .AlignRows(targetTermCorpora.ChooseFirst()); foreach (ParallelTextRow row in parallelKeyTermsCorpus) { train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1)); From e5ead88a02d16b4792e22ab5525a42280402f19a Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 26 Nov 2024 15:42:45 -0500 Subject: [PATCH 02/12] Use chapter-filtering for terms --- .../Services/PreprocessBuildJobTests.cs | 144 +++++++++++++++++- .../data/pt-source1/TermRenderings.xml | 7 + .../data/pt-target1/TermRenderings.xml | 7 + .../Services/CorpusService.cs | 8 +- .../Services/ICorpusService.cs | 4 +- .../ParallelCorpusPreprocessingService.cs | 22 ++- 6 files changed, 179 insertions(+), 13 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 02669cb4..2a5c82ae 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -359,7 +359,7 @@ public void RunAsync_OnlyParseSelectedBooks_PretranslateOnBadBook() } [Test] - public async Task ParallelCorpusLogic() + public async Task ParallelCorpusAsync() { using TestEnvironment env = new(); var corpora = new List() @@ -508,6 +508,148 @@ public async Task ParallelCorpusLogic() }); } + [Test] + public async Task ParallelCorpusAsync_UseKeyTerms() + { + using TestEnvironment env = new(); + var corpora = new List() + { + new ParallelCorpus() + { + Id = "1", + SourceCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "LEV", + new() { } + } + }, + PretranslateChapters = new() + { + { + "1CH", + new() { } + } + } + }, + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + }, + PretranslateChapters = new() { } + }, + }, + TargetCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-target1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + } + }, + new() + { + Id = "_2", + Language = "en", + Files = new List { env.ParatextFile("pt-target2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + }, + { + "LEV", + new() { } + } + } + } + } + } + }; + await env.RunBuildJobAsync(corpora, useKeyTerms: true); + string source = await env.GetSourceExtractAsync(); + string target = await env.GetTargetExtractAsync(); + Assert.Multiple(() => + { + StringAssert.StartsWith( + @"Source one, chapter fourteen, verse fifty-five. Segment b. +Source one, chapter fourteen, verse fifty-six. +Source two, chapter one, verse one. +Source two, chapter one, verse two. +Source two, chapter one, verse three. +Source one, chapter one, verse four. +Source two, chapter one, verse five. Source two, chapter one, verse six. +Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. +Source two, chapter one, verse one. +", + source + ); + StringAssert.StartsWith( + @"Target two, chapter fourteen, verse fifty-five. +Target two, chapter fourteen, verse fifty-six. +Target one, chapter one, verse one. +Target one, chapter one, verse two. +Target one, chapter one, verse three. + +Target one, chapter one, verse five and six. +Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. + +", + target + ); + StringAssert.Contains("Abraham", source); + StringAssert.Contains("Abraham", target); + StringAssert.DoesNotContain("Zedekiah", source); + StringAssert.DoesNotContain("Zedekiah", target); + }); + JsonArray? pretranslations = await env.GetPretranslationsAsync(); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); + Assert.That( + pretranslations[2]!["translation"]!.ToString(), + Is.EqualTo("Source one, chapter twelve, verse one.") + ); + } + private class TestEnvironment : DisposableBase { private static readonly string TestDataPath = Path.Combine( diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + + diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + + diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 71d49a50..231d9083 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -36,14 +36,16 @@ public IEnumerable CreateTextCorpora(IReadOnlyList file return corpora; } - public IEnumerable CreateTermCorpora(IReadOnlyList files) + public IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> corpora + ) { - foreach (CorpusFile file in files) + foreach ((CorpusFile file, Dictionary>? chapters) in corpora) { switch (file.Format) { case FileFormat.Paratext: - yield return new ParatextBackupTermsCorpus(file.Location, ["PN"]); + yield return new ParatextBackupTermsCorpus(file.Location, ["PN"], chapters: chapters); break; } } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs index babe8c9b..0c9a82ab 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs @@ -3,5 +3,7 @@ public interface ICorpusService { IEnumerable CreateTextCorpora(IReadOnlyList files); - IEnumerable CreateTermCorpora(IReadOnlyList files); + IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> files + ); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index e75a2d59..a98f57c1 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -82,15 +82,21 @@ public void Preprocess( if (useKeyTerms) { - ITextCorpus? sourceTermCorpus = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList()) - .FirstOrDefault(); - ITextCorpus? targetTermCorpus = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList()) - .FirstOrDefault(); - if (sourceTermCorpus is not null && targetTermCorpus is not null) + ITextCorpus[]? sourceTermCorpora = _corpusService + .CreateTermCorpora( + corpus.SourceCorpora.SelectMany(sc => sc.Files.Select(f => (f, sc.TrainOnChapters))).ToArray() + ) + .ToArray(); + ITextCorpus[]? targetTermCorpora = _corpusService + .CreateTermCorpora( + corpus.TargetCorpora.SelectMany(tc => tc.Files.Select(f => (f, tc.TrainOnChapters))).ToArray() + ) + .ToArray(); + if (sourceTermCorpora is not null && targetTermCorpora is not null) { - IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus); + IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpora + .ChooseRandom(Seed) + .AlignRows(targetTermCorpora.ChooseFirst()); foreach (ParallelTextRow row in parallelKeyTermsCorpus) { train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1)); From 8f25c6b389a3b7bc433cfc2f8414b1d579cc8ae9 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 26 Nov 2024 17:45:03 -0500 Subject: [PATCH 03/12] Add support for key term filtering by textId --- .../Services/PreprocessBuildJobTests.cs | 98 +++++++++++++++++++ .../Services/CorpusService.cs | 4 +- .../Services/ICorpusService.cs | 2 +- .../ParallelCorpusPreprocessingService.cs | 21 ++-- 4 files changed, 116 insertions(+), 9 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 2a5c82ae..81c4cec2 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -633,6 +633,104 @@ public async Task ParallelCorpusAsync_UseKeyTerms() Target one, chapter one, verse five and six. Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. +", + target + ); + StringAssert.Contains("Abraham", source); + StringAssert.Contains("Abraham", target); + StringAssert.DoesNotContain("Zedekiah", source); + StringAssert.DoesNotContain("Zedekiah", target); + }); + JsonArray? pretranslations = await env.GetPretranslationsAsync(); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); + Assert.That( + pretranslations[2]!["translation"]!.ToString(), + Is.EqualTo("Source one, chapter twelve, verse one.") + ); + } + + [Test] + public async Task ParallelCorpusAsync_UseKeyTerms_TextIds() + { + using TestEnvironment env = new(); + var corpora = new List() + { + new ParallelCorpus() + { + Id = "1", + SourceCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source1") }, + TrainOnTextIds = ["MAT", "LEV"], + PretranslateTextIds = ["1CH"] + }, + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source2") }, + TrainOnTextIds = ["MAT", "MRK"], + PretranslateTextIds = [] + }, + }, + TargetCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-target1") }, + TrainOnTextIds = ["MAT", "MRK"] + }, + new() + { + Id = "_2", + Language = "en", + Files = new List { env.ParatextFile("pt-target2") }, + TrainOnTextIds = ["MAT", "MRK", "LEV"] + } + } + } + }; + await env.RunBuildJobAsync(corpora, useKeyTerms: true); + string source = await env.GetSourceExtractAsync(); + string target = await env.GetTargetExtractAsync(); + Assert.Multiple(() => + { + StringAssert.StartsWith( + @"Source one, chapter fourteen, verse fifty-five. Segment b. +Source one, chapter fourteen, verse fifty-six. +Source two, chapter one, verse one. +Source two, chapter one, verse two. +Source two, chapter one, verse three. +Source one, chapter one, verse four. +Source two, chapter one, verse five. Source two, chapter one, verse six. +Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. +Source one, chapter two, verse one. +Source one, chapter two, verse two. + +Source two, chapter one, verse one. +", + source + ); + StringAssert.StartsWith( + @"Target two, chapter fourteen, verse fifty-five. +Target two, chapter fourteen, verse fifty-six. +Target one, chapter one, verse one. +Target one, chapter one, verse two. +Target one, chapter one, verse three. + +Target one, chapter one, verse five and six. +Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. +Target one, chapter two, verse one. + +Target one, chapter two, verse three. + ", target ); diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 231d9083..793e5046 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -37,10 +37,10 @@ public IEnumerable CreateTextCorpora(IReadOnlyList file } public IEnumerable CreateTermCorpora( - IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> corpora + IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora ) { - foreach ((CorpusFile file, Dictionary>? chapters) in corpora) + foreach ((CorpusFile file, Dictionary> chapters) in corpora) { switch (file.Format) { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs index 0c9a82ab..3f19fccc 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs @@ -4,6 +4,6 @@ public interface ICorpusService { IEnumerable CreateTextCorpora(IReadOnlyList files); IEnumerable CreateTermCorpora( - IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> files + IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora ); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index a98f57c1..917a25a3 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -83,14 +83,10 @@ public void Preprocess( if (useKeyTerms) { ITextCorpus[]? sourceTermCorpora = _corpusService - .CreateTermCorpora( - corpus.SourceCorpora.SelectMany(sc => sc.Files.Select(f => (f, sc.TrainOnChapters))).ToArray() - ) + .CreateTermCorpora(corpus.SourceCorpora.SelectMany(GetChapters).ToArray()) .ToArray(); ITextCorpus[]? targetTermCorpora = _corpusService - .CreateTermCorpora( - corpus.TargetCorpora.SelectMany(tc => tc.Files.Select(f => (f, tc.TrainOnChapters))).ToArray() - ) + .CreateTermCorpora(corpus.TargetCorpora.SelectMany(GetChapters).ToArray()) .ToArray(); if (sourceTermCorpora is not null && targetTermCorpora is not null) { @@ -117,6 +113,19 @@ public void Preprocess( } } + private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChapters( + MonolingualCorpus mc + ) + { + Dictionary>? chapters = mc.TrainOnChapters; + if (chapters is null && mc.TrainOnTextIds is not null) + { + chapters = mc.TrainOnTextIds.Select(tid => (tid, new HashSet { })).ToDictionary(); + } + chapters ??= []; + return mc.Files.Select(f => (f, chapters)); + } + private static ITextCorpus FilterPretranslateCorpora(MonolingualCorpus corpus, ITextCorpus textCorpus) { textCorpus = textCorpus.Transform(CleanSegment); From e02ad7cb1c5e9616dddabc96c44116a6edcc591f Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 26 Nov 2024 17:51:01 -0500 Subject: [PATCH 04/12] Change function name --- .../Services/ParallelCorpusPreprocessingService.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index 917a25a3..b4685797 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -83,10 +83,10 @@ public void Preprocess( if (useKeyTerms) { ITextCorpus[]? sourceTermCorpora = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(GetChapters).ToArray()) + .CreateTermCorpora(corpus.SourceCorpora.SelectMany(GetChaptersPerFile).ToArray()) .ToArray(); ITextCorpus[]? targetTermCorpora = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(GetChapters).ToArray()) + .CreateTermCorpora(corpus.TargetCorpora.SelectMany(GetChaptersPerFile).ToArray()) .ToArray(); if (sourceTermCorpora is not null && targetTermCorpora is not null) { @@ -113,7 +113,7 @@ public void Preprocess( } } - private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChapters( + private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChaptersPerFile( MonolingualCorpus mc ) { From 069dc402d482bb9bde26bea8053b426aa144b137 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 26 Nov 2024 19:17:54 -0500 Subject: [PATCH 05/12] Use latest machine version --- .../src/Serval.Machine.Shared/Serval.Machine.Shared.csproj | 6 +++--- src/Serval/src/Serval.Shared/Serval.Shared.csproj | 2 +- .../src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj index f9eea0c5..4206b29e 100644 --- a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj +++ b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj @@ -36,9 +36,9 @@ - - - + + + diff --git a/src/Serval/src/Serval.Shared/Serval.Shared.csproj b/src/Serval/src/Serval.Shared/Serval.Shared.csproj index 75ccbd9b..0e504535 100644 --- a/src/Serval/src/Serval.Shared/Serval.Shared.csproj +++ b/src/Serval/src/Serval.Shared/Serval.Shared.csproj @@ -19,7 +19,7 @@ - + diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj index f9476b69..a64c5d85 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj @@ -17,7 +17,7 @@ - + From 788beb9ef74df396e90a978faa7447fdd3dc6ec2 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 27 Nov 2024 10:39:57 -0500 Subject: [PATCH 06/12] When no filter is specified, only include kbts that are associated with the texts of the corpus --- .../Services/PreprocessBuildJobTests.cs | 16 ++++++++-------- .../Services/CorpusService.cs | 2 +- .../ParallelCorpusPreprocessingService.cs | 17 +++++++++++++---- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 81c4cec2..13785191 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -118,10 +118,10 @@ public async Task RunAsync_EnableKeyTerms() (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); Assert.Multiple(() => { - Assert.That(src1Count, Is.EqualTo(0)); + Assert.That(src1Count, Is.EqualTo(14)); Assert.That(src2Count, Is.EqualTo(0)); - Assert.That(trgCount, Is.EqualTo(0)); - Assert.That(termCount, Is.EqualTo(5726)); + Assert.That(trgCount, Is.EqualTo(1)); + Assert.That(termCount, Is.EqualTo(166)); }); } @@ -136,9 +136,9 @@ public async Task RunAsync_DisableKeyTerms() (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); Assert.Multiple(() => { - Assert.That(src1Count, Is.EqualTo(0)); + Assert.That(src1Count, Is.EqualTo(14)); Assert.That(src2Count, Is.EqualTo(0)); - Assert.That(trgCount, Is.EqualTo(0)); + Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); } @@ -853,8 +853,8 @@ public TestEnvironment() Id = "src_1", Language = "es", Files = [ParatextFile("pt-source1")], - TrainOnTextIds = [], - PretranslateTextIds = [] + TrainOnTextIds = null, + PretranslateTextIds = null } }, TargetCorpora = new List() @@ -864,7 +864,7 @@ public TestEnvironment() Id = "trg_1", Language = "en", Files = [ParatextFile("pt-target1")], - TrainOnTextIds = [] + TrainOnTextIds = null } } }; diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 793e5046..dcabcd2d 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -40,7 +40,7 @@ public IEnumerable CreateTermCorpora( IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora ) { - foreach ((CorpusFile file, Dictionary> chapters) in corpora) + foreach ((CorpusFile file, Dictionary>? chapters) in corpora) { switch (file.Format) { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index b4685797..166a0374 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -83,10 +83,18 @@ public void Preprocess( if (useKeyTerms) { ITextCorpus[]? sourceTermCorpora = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(GetChaptersPerFile).ToArray()) + .CreateTermCorpora( + sourceCorpora + .SelectMany(corpus => GetChaptersPerFile(corpus.Corpus, corpus.TextCorpus)) + .ToArray() + ) .ToArray(); ITextCorpus[]? targetTermCorpora = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(GetChaptersPerFile).ToArray()) + .CreateTermCorpora( + targetCorpora + .SelectMany(corpus => GetChaptersPerFile(corpus.Corpus, corpus.TextCorpus)) + .ToArray() + ) .ToArray(); if (sourceTermCorpora is not null && targetTermCorpora is not null) { @@ -114,7 +122,8 @@ public void Preprocess( } private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChaptersPerFile( - MonolingualCorpus mc + MonolingualCorpus mc, + ITextCorpus tc ) { Dictionary>? chapters = mc.TrainOnChapters; @@ -122,7 +131,7 @@ MonolingualCorpus mc { chapters = mc.TrainOnTextIds.Select(tid => (tid, new HashSet { })).ToDictionary(); } - chapters ??= []; + chapters ??= tc.Texts.Select(t => (t.Id, new HashSet() { })).ToDictionary(); return mc.Files.Select(f => (f, chapters)); } From 50387ece0e0a8dfbf4d6ec70d045a19e30c5122e Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 26 Nov 2024 15:42:45 -0500 Subject: [PATCH 07/12] Use chapter-filtering for terms --- .../Services/PreprocessBuildJobTests.cs | 144 +++++++++++++++++- .../data/pt-source1/TermRenderings.xml | 7 + .../data/pt-target1/TermRenderings.xml | 7 + .../Services/CorpusService.cs | 8 +- .../Services/ICorpusService.cs | 4 +- .../ParallelCorpusPreprocessingService.cs | 22 ++- 6 files changed, 179 insertions(+), 13 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 02669cb4..2a5c82ae 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -359,7 +359,7 @@ public void RunAsync_OnlyParseSelectedBooks_PretranslateOnBadBook() } [Test] - public async Task ParallelCorpusLogic() + public async Task ParallelCorpusAsync() { using TestEnvironment env = new(); var corpora = new List() @@ -508,6 +508,148 @@ public async Task ParallelCorpusLogic() }); } + [Test] + public async Task ParallelCorpusAsync_UseKeyTerms() + { + using TestEnvironment env = new(); + var corpora = new List() + { + new ParallelCorpus() + { + Id = "1", + SourceCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "LEV", + new() { } + } + }, + PretranslateChapters = new() + { + { + "1CH", + new() { } + } + } + }, + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + }, + PretranslateChapters = new() { } + }, + }, + TargetCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-target1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + } + }, + new() + { + Id = "_2", + Language = "en", + Files = new List { env.ParatextFile("pt-target2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + }, + { + "LEV", + new() { } + } + } + } + } + } + }; + await env.RunBuildJobAsync(corpora, useKeyTerms: true); + string source = await env.GetSourceExtractAsync(); + string target = await env.GetTargetExtractAsync(); + Assert.Multiple(() => + { + StringAssert.StartsWith( + @"Source one, chapter fourteen, verse fifty-five. Segment b. +Source one, chapter fourteen, verse fifty-six. +Source two, chapter one, verse one. +Source two, chapter one, verse two. +Source two, chapter one, verse three. +Source one, chapter one, verse four. +Source two, chapter one, verse five. Source two, chapter one, verse six. +Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. +Source two, chapter one, verse one. +", + source + ); + StringAssert.StartsWith( + @"Target two, chapter fourteen, verse fifty-five. +Target two, chapter fourteen, verse fifty-six. +Target one, chapter one, verse one. +Target one, chapter one, verse two. +Target one, chapter one, verse three. + +Target one, chapter one, verse five and six. +Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. + +", + target + ); + StringAssert.Contains("Abraham", source); + StringAssert.Contains("Abraham", target); + StringAssert.DoesNotContain("Zedekiah", source); + StringAssert.DoesNotContain("Zedekiah", target); + }); + JsonArray? pretranslations = await env.GetPretranslationsAsync(); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); + Assert.That( + pretranslations[2]!["translation"]!.ToString(), + Is.EqualTo("Source one, chapter twelve, verse one.") + ); + } + private class TestEnvironment : DisposableBase { private static readonly string TestDataPath = Path.Combine( diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + + diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + + diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 71d49a50..231d9083 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -36,14 +36,16 @@ public IEnumerable CreateTextCorpora(IReadOnlyList file return corpora; } - public IEnumerable CreateTermCorpora(IReadOnlyList files) + public IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> corpora + ) { - foreach (CorpusFile file in files) + foreach ((CorpusFile file, Dictionary>? chapters) in corpora) { switch (file.Format) { case FileFormat.Paratext: - yield return new ParatextBackupTermsCorpus(file.Location, ["PN"]); + yield return new ParatextBackupTermsCorpus(file.Location, ["PN"], chapters: chapters); break; } } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs index babe8c9b..0c9a82ab 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs @@ -3,5 +3,7 @@ public interface ICorpusService { IEnumerable CreateTextCorpora(IReadOnlyList files); - IEnumerable CreateTermCorpora(IReadOnlyList files); + IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> files + ); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index 25d6b55c..0e0a68d7 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -82,15 +82,21 @@ public async Task Preprocess( if (useKeyTerms) { - ITextCorpus? sourceTermCorpus = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList()) - .FirstOrDefault(); - ITextCorpus? targetTermCorpus = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList()) - .FirstOrDefault(); - if (sourceTermCorpus is not null && targetTermCorpus is not null) + ITextCorpus[]? sourceTermCorpora = _corpusService + .CreateTermCorpora( + corpus.SourceCorpora.SelectMany(sc => sc.Files.Select(f => (f, sc.TrainOnChapters))).ToArray() + ) + .ToArray(); + ITextCorpus[]? targetTermCorpora = _corpusService + .CreateTermCorpora( + corpus.TargetCorpora.SelectMany(tc => tc.Files.Select(f => (f, tc.TrainOnChapters))).ToArray() + ) + .ToArray(); + if (sourceTermCorpora is not null && targetTermCorpora is not null) { - IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus); + IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpora + .ChooseRandom(Seed) + .AlignRows(targetTermCorpora.ChooseFirst()); foreach (ParallelTextRow row in parallelKeyTermsCorpus) { await train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1)); From 2d734885ae47ec41431da148f1589af80174fe4d Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 26 Nov 2024 17:45:03 -0500 Subject: [PATCH 08/12] Add support for key term filtering by textId --- .../Services/PreprocessBuildJobTests.cs | 98 +++++++++++++++++++ .../Services/CorpusService.cs | 4 +- .../Services/ICorpusService.cs | 2 +- .../ParallelCorpusPreprocessingService.cs | 21 ++-- 4 files changed, 116 insertions(+), 9 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 2a5c82ae..81c4cec2 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -633,6 +633,104 @@ public async Task ParallelCorpusAsync_UseKeyTerms() Target one, chapter one, verse five and six. Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. +", + target + ); + StringAssert.Contains("Abraham", source); + StringAssert.Contains("Abraham", target); + StringAssert.DoesNotContain("Zedekiah", source); + StringAssert.DoesNotContain("Zedekiah", target); + }); + JsonArray? pretranslations = await env.GetPretranslationsAsync(); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); + Assert.That( + pretranslations[2]!["translation"]!.ToString(), + Is.EqualTo("Source one, chapter twelve, verse one.") + ); + } + + [Test] + public async Task ParallelCorpusAsync_UseKeyTerms_TextIds() + { + using TestEnvironment env = new(); + var corpora = new List() + { + new ParallelCorpus() + { + Id = "1", + SourceCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source1") }, + TrainOnTextIds = ["MAT", "LEV"], + PretranslateTextIds = ["1CH"] + }, + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source2") }, + TrainOnTextIds = ["MAT", "MRK"], + PretranslateTextIds = [] + }, + }, + TargetCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-target1") }, + TrainOnTextIds = ["MAT", "MRK"] + }, + new() + { + Id = "_2", + Language = "en", + Files = new List { env.ParatextFile("pt-target2") }, + TrainOnTextIds = ["MAT", "MRK", "LEV"] + } + } + } + }; + await env.RunBuildJobAsync(corpora, useKeyTerms: true); + string source = await env.GetSourceExtractAsync(); + string target = await env.GetTargetExtractAsync(); + Assert.Multiple(() => + { + StringAssert.StartsWith( + @"Source one, chapter fourteen, verse fifty-five. Segment b. +Source one, chapter fourteen, verse fifty-six. +Source two, chapter one, verse one. +Source two, chapter one, verse two. +Source two, chapter one, verse three. +Source one, chapter one, verse four. +Source two, chapter one, verse five. Source two, chapter one, verse six. +Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. +Source one, chapter two, verse one. +Source one, chapter two, verse two. + +Source two, chapter one, verse one. +", + source + ); + StringAssert.StartsWith( + @"Target two, chapter fourteen, verse fifty-five. +Target two, chapter fourteen, verse fifty-six. +Target one, chapter one, verse one. +Target one, chapter one, verse two. +Target one, chapter one, verse three. + +Target one, chapter one, verse five and six. +Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. +Target one, chapter two, verse one. + +Target one, chapter two, verse three. + ", target ); diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 231d9083..793e5046 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -37,10 +37,10 @@ public IEnumerable CreateTextCorpora(IReadOnlyList file } public IEnumerable CreateTermCorpora( - IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> corpora + IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora ) { - foreach ((CorpusFile file, Dictionary>? chapters) in corpora) + foreach ((CorpusFile file, Dictionary> chapters) in corpora) { switch (file.Format) { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs index 0c9a82ab..3f19fccc 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs @@ -4,6 +4,6 @@ public interface ICorpusService { IEnumerable CreateTextCorpora(IReadOnlyList files); IEnumerable CreateTermCorpora( - IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> files + IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora ); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index 0e0a68d7..9469719a 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -83,14 +83,10 @@ public async Task Preprocess( if (useKeyTerms) { ITextCorpus[]? sourceTermCorpora = _corpusService - .CreateTermCorpora( - corpus.SourceCorpora.SelectMany(sc => sc.Files.Select(f => (f, sc.TrainOnChapters))).ToArray() - ) + .CreateTermCorpora(corpus.SourceCorpora.SelectMany(GetChapters).ToArray()) .ToArray(); ITextCorpus[]? targetTermCorpora = _corpusService - .CreateTermCorpora( - corpus.TargetCorpora.SelectMany(tc => tc.Files.Select(f => (f, tc.TrainOnChapters))).ToArray() - ) + .CreateTermCorpora(corpus.TargetCorpora.SelectMany(GetChapters).ToArray()) .ToArray(); if (sourceTermCorpora is not null && targetTermCorpora is not null) { @@ -117,6 +113,19 @@ public async Task Preprocess( } } + private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChapters( + MonolingualCorpus mc + ) + { + Dictionary>? chapters = mc.TrainOnChapters; + if (chapters is null && mc.TrainOnTextIds is not null) + { + chapters = mc.TrainOnTextIds.Select(tid => (tid, new HashSet { })).ToDictionary(); + } + chapters ??= []; + return mc.Files.Select(f => (f, chapters)); + } + private static ITextCorpus FilterPretranslateCorpora(MonolingualCorpus corpus, ITextCorpus textCorpus) { textCorpus = textCorpus.Transform(CleanSegment); From 69d6d527d01c6f29868acefd1954af10e3f10a98 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 26 Nov 2024 17:51:01 -0500 Subject: [PATCH 09/12] Change function name --- .../Services/ParallelCorpusPreprocessingService.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index 9469719a..e1dace1e 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -83,10 +83,10 @@ public async Task Preprocess( if (useKeyTerms) { ITextCorpus[]? sourceTermCorpora = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(GetChapters).ToArray()) + .CreateTermCorpora(corpus.SourceCorpora.SelectMany(GetChaptersPerFile).ToArray()) .ToArray(); ITextCorpus[]? targetTermCorpora = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(GetChapters).ToArray()) + .CreateTermCorpora(corpus.TargetCorpora.SelectMany(GetChaptersPerFile).ToArray()) .ToArray(); if (sourceTermCorpora is not null && targetTermCorpora is not null) { @@ -113,7 +113,7 @@ public async Task Preprocess( } } - private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChapters( + private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChaptersPerFile( MonolingualCorpus mc ) { From 551e6cdc52d6177c2bcb124f4ce82ce5246498bb Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 27 Nov 2024 10:39:57 -0500 Subject: [PATCH 10/12] When no filter is specified, only include kbts that are associated with the texts of the corpus --- .../Services/PreprocessBuildJobTests.cs | 16 ++++++++-------- .../Services/CorpusService.cs | 2 +- .../ParallelCorpusPreprocessingService.cs | 17 +++++++++++++---- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 81c4cec2..13785191 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -118,10 +118,10 @@ public async Task RunAsync_EnableKeyTerms() (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); Assert.Multiple(() => { - Assert.That(src1Count, Is.EqualTo(0)); + Assert.That(src1Count, Is.EqualTo(14)); Assert.That(src2Count, Is.EqualTo(0)); - Assert.That(trgCount, Is.EqualTo(0)); - Assert.That(termCount, Is.EqualTo(5726)); + Assert.That(trgCount, Is.EqualTo(1)); + Assert.That(termCount, Is.EqualTo(166)); }); } @@ -136,9 +136,9 @@ public async Task RunAsync_DisableKeyTerms() (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); Assert.Multiple(() => { - Assert.That(src1Count, Is.EqualTo(0)); + Assert.That(src1Count, Is.EqualTo(14)); Assert.That(src2Count, Is.EqualTo(0)); - Assert.That(trgCount, Is.EqualTo(0)); + Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); } @@ -853,8 +853,8 @@ public TestEnvironment() Id = "src_1", Language = "es", Files = [ParatextFile("pt-source1")], - TrainOnTextIds = [], - PretranslateTextIds = [] + TrainOnTextIds = null, + PretranslateTextIds = null } }, TargetCorpora = new List() @@ -864,7 +864,7 @@ public TestEnvironment() Id = "trg_1", Language = "en", Files = [ParatextFile("pt-target1")], - TrainOnTextIds = [] + TrainOnTextIds = null } } }; diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 793e5046..dcabcd2d 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -40,7 +40,7 @@ public IEnumerable CreateTermCorpora( IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora ) { - foreach ((CorpusFile file, Dictionary> chapters) in corpora) + foreach ((CorpusFile file, Dictionary>? chapters) in corpora) { switch (file.Format) { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index e1dace1e..71769985 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -83,10 +83,18 @@ public async Task Preprocess( if (useKeyTerms) { ITextCorpus[]? sourceTermCorpora = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(GetChaptersPerFile).ToArray()) + .CreateTermCorpora( + sourceCorpora + .SelectMany(corpus => GetChaptersPerFile(corpus.Corpus, corpus.TextCorpus)) + .ToArray() + ) .ToArray(); ITextCorpus[]? targetTermCorpora = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(GetChaptersPerFile).ToArray()) + .CreateTermCorpora( + targetCorpora + .SelectMany(corpus => GetChaptersPerFile(corpus.Corpus, corpus.TextCorpus)) + .ToArray() + ) .ToArray(); if (sourceTermCorpora is not null && targetTermCorpora is not null) { @@ -114,7 +122,8 @@ public async Task Preprocess( } private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChaptersPerFile( - MonolingualCorpus mc + MonolingualCorpus mc, + ITextCorpus tc ) { Dictionary>? chapters = mc.TrainOnChapters; @@ -122,7 +131,7 @@ MonolingualCorpus mc { chapters = mc.TrainOnTextIds.Select(tid => (tid, new HashSet { })).ToDictionary(); } - chapters ??= []; + chapters ??= tc.Texts.Select(t => (t.Id, new HashSet() { })).ToDictionary(); return mc.Files.Select(f => (f, chapters)); } From dd7f62038c1cf7955532facd2481809b0370bfa6 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 27 Nov 2024 13:21:53 -0500 Subject: [PATCH 11/12] Update to new machine version --- .../src/Serval.Machine.Shared/Serval.Machine.Shared.csproj | 6 +++--- src/Serval/src/Serval.Shared/Serval.Shared.csproj | 2 +- .../src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj index 4206b29e..f9756293 100644 --- a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj +++ b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj @@ -36,9 +36,9 @@ - - - + + + diff --git a/src/Serval/src/Serval.Shared/Serval.Shared.csproj b/src/Serval/src/Serval.Shared/Serval.Shared.csproj index 0e504535..f2607b7b 100644 --- a/src/Serval/src/Serval.Shared/Serval.Shared.csproj +++ b/src/Serval/src/Serval.Shared/Serval.Shared.csproj @@ -19,7 +19,7 @@ - + diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj index a64c5d85..ced38ebc 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj @@ -17,7 +17,7 @@ - + From bff30842ff71c7fb1d4701b9316d5f3a43c10ebe Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 27 Nov 2024 13:32:50 -0500 Subject: [PATCH 12/12] Remove accidentally added ? --- .../src/SIL.ServiceToolkit/Services/CorpusService.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index dcabcd2d..793e5046 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -40,7 +40,7 @@ public IEnumerable CreateTermCorpora( IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora ) { - foreach ((CorpusFile file, Dictionary>? chapters) in corpora) + foreach ((CorpusFile file, Dictionary> chapters) in corpora) { switch (file.Format) {