diff --git a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj
index 4206b29e..f9756293 100644
--- a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj
+++ b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj
@@ -36,9 +36,9 @@
-
-
-
+
+
+
diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs
index 02669cb4..13785191 100644
--- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs
+++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs
@@ -118,10 +118,10 @@ public async Task RunAsync_EnableKeyTerms()
(int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync();
Assert.Multiple(() =>
{
- Assert.That(src1Count, Is.EqualTo(0));
+ Assert.That(src1Count, Is.EqualTo(14));
Assert.That(src2Count, Is.EqualTo(0));
- Assert.That(trgCount, Is.EqualTo(0));
- Assert.That(termCount, Is.EqualTo(5726));
+ Assert.That(trgCount, Is.EqualTo(1));
+ Assert.That(termCount, Is.EqualTo(166));
});
}
@@ -136,9 +136,9 @@ public async Task RunAsync_DisableKeyTerms()
(int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync();
Assert.Multiple(() =>
{
- Assert.That(src1Count, Is.EqualTo(0));
+ Assert.That(src1Count, Is.EqualTo(14));
Assert.That(src2Count, Is.EqualTo(0));
- Assert.That(trgCount, Is.EqualTo(0));
+ Assert.That(trgCount, Is.EqualTo(1));
Assert.That(termCount, Is.EqualTo(0));
});
}
@@ -359,7 +359,7 @@ public void RunAsync_OnlyParseSelectedBooks_PretranslateOnBadBook()
}
[Test]
- public async Task ParallelCorpusLogic()
+ public async Task ParallelCorpusAsync()
{
using TestEnvironment env = new();
var corpora = new List()
@@ -508,6 +508,246 @@ public async Task ParallelCorpusLogic()
});
}
+ [Test]
+ public async Task ParallelCorpusAsync_UseKeyTerms() // Runs a build with key terms enabled over parallel corpora filtered by TrainOnChapters, then checks the extracts and the pretranslations.
+ {
+ using TestEnvironment env = new();
+ var corpora = new List()
+ {
+ new ParallelCorpus()
+ {
+ Id = "1",
+ SourceCorpora = new List()
+ {
+ new()
+ {
+ Id = "_1",
+ Language = "en",
+ Files = new List { env.ParatextFile("pt-source1") },
+ TrainOnChapters = new()
+ {
+ {
+ "MAT",
+ new() { 1 } // chapter 1 only
+ },
+ {
+ "LEV",
+ new() { } // empty set - presumably selects the whole book; confirm against the chapter-filter semantics
+ }
+ },
+ PretranslateChapters = new()
+ {
+ {
+ "1CH",
+ new() { }
+ }
+ }
+ },
+ new()
+ {
+ Id = "_1", // NOTE(review): same Id as the first source corpus - confirm this is intentional
+ Language = "en",
+ Files = new List { env.ParatextFile("pt-source2") },
+ TrainOnChapters = new()
+ {
+ {
+ "MAT",
+ new() { 1 }
+ },
+ {
+ "MRK",
+ new() { }
+ }
+ },
+ PretranslateChapters = new() { } // nothing is requested for pretranslation from the second source
+ },
+ },
+ TargetCorpora = new List()
+ {
+ new()
+ {
+ Id = "_1",
+ Language = "en",
+ Files = new List { env.ParatextFile("pt-target1") },
+ TrainOnChapters = new()
+ {
+ {
+ "MAT",
+ new() { 1 }
+ },
+ {
+ "MRK",
+ new() { }
+ }
+ }
+ },
+ new()
+ {
+ Id = "_2",
+ Language = "en",
+ Files = new List { env.ParatextFile("pt-target2") },
+ TrainOnChapters = new()
+ {
+ {
+ "MAT",
+ new() { 1 }
+ },
+ {
+ "MRK",
+ new() { }
+ },
+ {
+ "LEV",
+ new() { }
+ }
+ }
+ }
+ }
+ }
+ };
+ await env.RunBuildJobAsync(corpora, useKeyTerms: true); // key terms are switched on for this run
+ string source = await env.GetSourceExtractAsync();
+ string target = await env.GetTargetExtractAsync();
+ Assert.Multiple(() =>
+ {
+ StringAssert.StartsWith( // only the leading lines of the source extract are pinned here
+ @"Source one, chapter fourteen, verse fifty-five. Segment b.
+Source one, chapter fourteen, verse fifty-six.
+Source two, chapter one, verse one.
+Source two, chapter one, verse two.
+Source two, chapter one, verse three.
+Source one, chapter one, verse four.
+Source two, chapter one, verse five. Source two, chapter one, verse six.
+Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten.
+Source two, chapter one, verse one.
+",
+ source
+ );
+ StringAssert.StartsWith( // likewise, only the leading lines of the target extract are pinned
+ @"Target two, chapter fourteen, verse fifty-five.
+Target two, chapter fourteen, verse fifty-six.
+Target one, chapter one, verse one.
+Target one, chapter one, verse two.
+Target one, chapter one, verse three.
+
+Target one, chapter one, verse five and six.
+Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten.
+
+",
+ target
+ );
+ StringAssert.Contains("Abraham", source); // presumably a key term inside the trained chapters - confirm against the test data
+ StringAssert.Contains("Abraham", target);
+ StringAssert.DoesNotContain("Zedekiah", source); // "Zedekiah" is the rendering this change adds to TermRenderings.xml; it must not reach either extract
+ StringAssert.DoesNotContain("Zedekiah", target);
+ });
+ JsonArray? pretranslations = await env.GetPretranslationsAsync();
+ Assert.That(pretranslations, Is.Not.Null);
+ Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); // the JSON dump is passed as the failure message
+ Assert.That(
+ pretranslations[2]!["translation"]!.ToString(), // spot-check the third pretranslation entry
+ Is.EqualTo("Source one, chapter twelve, verse one.")
+ );
+ }
+
+ [Test]
+ public async Task ParallelCorpusAsync_UseKeyTerms_TextIds() // Same scenario as the chapter-based key-terms test, but the corpora are filtered with TrainOnTextIds / PretranslateTextIds.
+ {
+ using TestEnvironment env = new();
+ var corpora = new List()
+ {
+ new ParallelCorpus()
+ {
+ Id = "1",
+ SourceCorpora = new List()
+ {
+ new()
+ {
+ Id = "_1",
+ Language = "en",
+ Files = new List { env.ParatextFile("pt-source1") },
+ TrainOnTextIds = ["MAT", "LEV"],
+ PretranslateTextIds = ["1CH"]
+ },
+ new()
+ {
+ Id = "_1", // NOTE(review): same Id as the first source corpus - confirm this is intentional
+ Language = "en",
+ Files = new List { env.ParatextFile("pt-source2") },
+ TrainOnTextIds = ["MAT", "MRK"],
+ PretranslateTextIds = [] // nothing is requested for pretranslation from the second source
+ },
+ },
+ TargetCorpora = new List()
+ {
+ new()
+ {
+ Id = "_1",
+ Language = "en",
+ Files = new List { env.ParatextFile("pt-target1") },
+ TrainOnTextIds = ["MAT", "MRK"]
+ },
+ new()
+ {
+ Id = "_2",
+ Language = "en",
+ Files = new List { env.ParatextFile("pt-target2") },
+ TrainOnTextIds = ["MAT", "MRK", "LEV"]
+ }
+ }
+ }
+ };
+ await env.RunBuildJobAsync(corpora, useKeyTerms: true); // key terms are switched on for this run
+ string source = await env.GetSourceExtractAsync();
+ string target = await env.GetTargetExtractAsync();
+ Assert.Multiple(() =>
+ {
+ StringAssert.StartsWith( // only the leading lines are pinned; they include chapter-two rows, unlike the chapter-filtered test
+ @"Source one, chapter fourteen, verse fifty-five. Segment b.
+Source one, chapter fourteen, verse fifty-six.
+Source two, chapter one, verse one.
+Source two, chapter one, verse two.
+Source two, chapter one, verse three.
+Source one, chapter one, verse four.
+Source two, chapter one, verse five. Source two, chapter one, verse six.
+Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten.
+Source one, chapter two, verse one.
+Source one, chapter two, verse two.
+
+Source two, chapter one, verse one.
+",
+ source
+ );
+ StringAssert.StartsWith( // likewise, only the leading lines of the target extract are pinned
+ @"Target two, chapter fourteen, verse fifty-five.
+Target two, chapter fourteen, verse fifty-six.
+Target one, chapter one, verse one.
+Target one, chapter one, verse two.
+Target one, chapter one, verse three.
+
+Target one, chapter one, verse five and six.
+Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten.
+Target one, chapter two, verse one.
+
+Target one, chapter two, verse three.
+
+",
+ target
+ );
+ StringAssert.Contains("Abraham", source); // presumably a key term inside the trained books - confirm against the test data
+ StringAssert.Contains("Abraham", target);
+ StringAssert.DoesNotContain("Zedekiah", source); // "Zedekiah" is the rendering this change adds to TermRenderings.xml; it must not reach either extract
+ StringAssert.DoesNotContain("Zedekiah", target);
+ });
+ JsonArray? pretranslations = await env.GetPretranslationsAsync();
+ Assert.That(pretranslations, Is.Not.Null);
+ Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); // the JSON dump is passed as the failure message
+ Assert.That(
+ pretranslations[2]!["translation"]!.ToString(), // spot-check the third pretranslation entry
+ Is.EqualTo("Source one, chapter twelve, verse one.")
+ );
+ }
+
private class TestEnvironment : DisposableBase
{
private static readonly string TestDataPath = Path.Combine(
@@ -613,8 +853,8 @@ public TestEnvironment()
Id = "src_1",
Language = "es",
Files = [ParatextFile("pt-source1")],
- TrainOnTextIds = [],
- PretranslateTextIds = []
+ TrainOnTextIds = null,
+ PretranslateTextIds = null
}
},
TargetCorpora = new List()
@@ -624,7 +864,7 @@ public TestEnvironment()
Id = "trg_1",
Language = "en",
Files = [ParatextFile("pt-target1")],
- TrainOnTextIds = []
+ TrainOnTextIds = null
}
}
};
diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml
index 03e45020..b5c2bb97 100644
--- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml
+++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml
@@ -6,4 +6,11 @@
+
+ Zedekiah
+
+
+
+
+
diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml
index 03e45020..b5c2bb97 100644
--- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml
+++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml
@@ -6,4 +6,11 @@
+
+ Zedekiah
+
+
+
+
+
diff --git a/src/Serval/src/Serval.Shared/Serval.Shared.csproj b/src/Serval/src/Serval.Shared/Serval.Shared.csproj
index 0e504535..f2607b7b 100644
--- a/src/Serval/src/Serval.Shared/Serval.Shared.csproj
+++ b/src/Serval/src/Serval.Shared/Serval.Shared.csproj
@@ -19,7 +19,7 @@
-
+
diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj
index a64c5d85..ced38ebc 100644
--- a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj
+++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj
@@ -17,7 +17,7 @@
-
+
diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs
index 71d49a50..793e5046 100644
--- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs
+++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs
@@ -36,14 +36,16 @@ public IEnumerable CreateTextCorpora(IReadOnlyList file
return corpora;
}
- public IEnumerable CreateTermCorpora(IReadOnlyList files)
+ public IEnumerable CreateTermCorpora(
+ IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora
+ )
{
- foreach (CorpusFile file in files)
+ foreach ((CorpusFile file, Dictionary> chapters) in corpora)
{
switch (file.Format)
{
case FileFormat.Paratext:
- yield return new ParatextBackupTermsCorpus(file.Location, ["PN"]);
+ yield return new ParatextBackupTermsCorpus(file.Location, ["PN"], chapters: chapters);
break;
}
}
diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs
index babe8c9b..3f19fccc 100644
--- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs
+++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs
@@ -3,5 +3,7 @@
public interface ICorpusService
{
IEnumerable CreateTextCorpora(IReadOnlyList files);
- IEnumerable CreateTermCorpora(IReadOnlyList files);
+ IEnumerable CreateTermCorpora( // Creates term corpora from corpus files, each paired with the chapter selection to apply to that file.
+ IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora
+ );
}
diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs
index 25d6b55c..71769985 100644
--- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs
+++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs
@@ -82,15 +82,25 @@ public async Task Preprocess(
if (useKeyTerms)
{
- ITextCorpus? sourceTermCorpus = _corpusService
- .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList())
- .FirstOrDefault();
- ITextCorpus? targetTermCorpus = _corpusService
- .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList())
- .FirstOrDefault();
- if (sourceTermCorpus is not null && targetTermCorpus is not null)
+ ITextCorpus[] sourceTermCorpora = _corpusService // term corpora for the source side, built from each file paired with its chapter selection
+ .CreateTermCorpora(
+ sourceCorpora
+ .SelectMany(sc => GetChaptersPerFile(sc.Corpus, sc.TextCorpus))
+ .ToArray()
+ )
+ .ToArray();
+ ITextCorpus[] targetTermCorpora = _corpusService // same construction for the target side
+ .CreateTermCorpora(
+ targetCorpora
+ .SelectMany(tc => GetChaptersPerFile(tc.Corpus, tc.TextCorpus))
+ .ToArray()
+ )
+ .ToArray();
+ if (sourceTermCorpora.Length > 0 && targetTermCorpora.Length > 0) // ToArray() never returns null, so test for emptiness: key terms are aligned only when both sides produced a term corpus
{
- IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus);
+ IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpora
+ .ChooseRandom(Seed)
+ .AlignRows(targetTermCorpora.ChooseFirst());
foreach (ParallelTextRow row in parallelKeyTermsCorpus)
{
await train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1));
@@ -111,6 +121,20 @@ public async Task Preprocess(
}
}
+ private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChaptersPerFile( // Pairs every file of mc with the chapter selection used when building its term corpus.
+ MonolingualCorpus mc,
+ ITextCorpus tc
+ )
+ {
+ Dictionary>? chapters = mc.TrainOnChapters; // an explicit chapter selection wins when present
+ if (chapters is null && mc.TrainOnTextIds is not null)
+ {
+ chapters = mc.TrainOnTextIds.Select(tid => (tid, new HashSet { })).ToDictionary(); // otherwise: one empty chapter set per selected text id
+ }
+ chapters ??= tc.Texts.Select(t => (t.Id, new HashSet() { })).ToDictionary(); // no selection at all: one empty chapter set per text in tc
+ return mc.Files.Select(f => (f, chapters)); // every returned tuple shares the same dictionary instance
+ }
+
private static ITextCorpus FilterPretranslateCorpora(MonolingualCorpus corpus, ITextCorpus textCorpus)
{
textCorpus = textCorpus.Transform(CleanSegment);