From 60304f8353b70d22e8e8f1639349c37550c3d905 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 27 Nov 2024 08:58:58 -0600 Subject: [PATCH 1/6] Update machine to 3.5.1 and small bug (#546) Preprocess should be async Make is so that write async can be called multiple times on S3Writer. Never have the S3 buffer grow above max size Update machine to 3.5.1 --- .../TranslationEngineServiceV1.cs | 5 +- .../Serval.Machine.Shared.csproj | 6 +- .../Services/PreprocessBuildJob.cs | 14 +-- .../Services/S3WriteStream.cs | 88 +++++++++++-------- .../src/Serval.Shared/Serval.Shared.csproj | 2 +- .../SIL.ServiceToolkit.csproj | 2 +- .../IParallelCorpusPreprocessingService.cs | 8 +- .../ParallelCorpusPreprocessingService.cs | 12 +-- .../ParallelCorpusProcessingServiceTests.cs | 6 +- 9 files changed, 84 insertions(+), 59 deletions(-) diff --git a/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs b/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs index fb7abc66..720a0126 100644 --- a/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs +++ b/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs @@ -82,9 +82,9 @@ await client.BuildStartedAsync( try { List pretranslationsRequests = []; - _parallelCorpusPreprocessingService.Preprocess( + await _parallelCorpusPreprocessingService.Preprocess( request.Corpora.Select(Map).ToList(), - row => { }, + row => Task.CompletedTask, (row, corpus) => { pretranslationsRequests.Add( @@ -97,6 +97,7 @@ await client.BuildStartedAsync( Translation = row.SourceSegment } ); + return Task.CompletedTask; }, false ); diff --git a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj index f9eea0c5..4206b29e 100644 --- a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj +++ b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj @@ -36,9 +36,9 @@ - - - + + + diff --git a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs index 46baa68d..831a6ad0 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs @@ -93,11 +93,11 @@ CancellationToken cancellationToken JsonObject? buildOptionsObject = null; if (buildOptions is not null) buildOptionsObject = JsonSerializer.Deserialize(buildOptions); + await using StreamWriter sourceTrainWriter = new(await _sharedFileService.OpenWriteAsync($"builds/{buildId}/train.src.txt", cancellationToken)); await using StreamWriter targetTrainWriter = new(await _sharedFileService.OpenWriteAsync($"builds/{buildId}/train.trg.txt", cancellationToken)); - await using Stream pretranslateStream = await _sharedFileService.OpenWriteAsync( $"builds/{buildId}/pretranslate.src.json", cancellationToken @@ -107,19 +107,19 @@ CancellationToken cancellationToken int trainCount = 0; int pretranslateCount = 0; pretranslateWriter.WriteStartArray(); - _parallelCorpusPreprocessingService.Preprocess( + await _parallelCorpusPreprocessingService.Preprocess( corpora, - row => + async row => { if (row.SourceSegment.Length > 0 || row.TargetSegment.Length > 0) { - sourceTrainWriter.Write($"{row.SourceSegment}\n"); - targetTrainWriter.Write($"{row.TargetSegment}\n"); + await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n"); + await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n"); } if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) trainCount++; }, - (row, corpus) => + async (row, corpus) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length == 0) { @@ -134,6 +134,8 @@ CancellationToken cancellationToken pretranslateWriter.WriteEndObject(); pretranslateCount++; } + if (pretranslateWriter.BytesPending > 1024 * 1024) + await pretranslateWriter.FlushAsync(); }, (bool?)buildOptionsObject?["use_key_terms"] ?? true ); diff --git a/src/Machine/src/Serval.Machine.Shared/Services/S3WriteStream.cs b/src/Machine/src/Serval.Machine.Shared/Services/S3WriteStream.cs index 4b623d6d..e1ba3494 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/S3WriteStream.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/S3WriteStream.cs @@ -15,6 +15,9 @@ ILoggerFactory loggerFactory private readonly List _uploadResponses = new List(); private readonly ILogger _logger = loggerFactory.CreateLogger(); + private readonly Stream _stream = new MemoryStream(); + private int _bytesWritten = 0; + public const int MaxPartSize = 5 * 1024 * 1024; public override bool CanRead => false; @@ -23,7 +26,7 @@ ILoggerFactory loggerFactory public override bool CanWrite => true; - public override long Length => 0; + public override long Length => _stream.Length; public override long Position { @@ -48,47 +51,60 @@ public override async ValueTask WriteAsync( CancellationToken cancellationToken = default ) { - try - { - using Stream stream = buffer.AsStream(); + // S3 buckets can only be written to in chunks of MaxPartSize + // therefore, break it into chunks, resetting the stream each time - int bytesWritten = 0; + while (buffer.Length + _stream.Position > MaxPartSize) + { + int toWrite = MaxPartSize - (int)_stream.Position; + await _stream.WriteAsync(buffer[..toWrite], cancellationToken); + await UploadPartAsync(cancellationToken); + buffer = buffer[toWrite..]; + } + // save the remaining buffer for future calls + await _stream.WriteAsync(buffer, cancellationToken); + } - while (stream.Length > bytesWritten) - { - int partNumber = _uploadResponses.Count + 1; - UploadPartRequest request = - new() - { - BucketName = _bucketName, - Key = _key, - UploadId = _uploadId, - PartNumber = partNumber, - InputStream = stream, - PartSize = MaxPartSize - }; - request.StreamTransferProgress += new EventHandler( - (_, e) => - { - _logger.LogDebug( - "Transferred {e.TransferredBytes}/{e.TotalBytes}", - e.TransferredBytes, - e.TotalBytes - ); - } - ); - UploadPartResponse response = await _client.UploadPartAsync(request, cancellationToken); - if (response.HttpStatusCode != HttpStatusCode.OK) + private async Task UploadPartAsync(CancellationToken cancellationToken = default) + { + if (_stream.Length == 0) + return; + try + { + _stream.Position = 0; + int partNumber = _uploadResponses.Count + 1; + UploadPartRequest request = + new() { - throw new HttpRequestException( - $"Tried to upload part {partNumber} of upload {_uploadId} to {_bucketName}/{_key} but received response code {response.HttpStatusCode}" + BucketName = _bucketName, + Key = _key, + UploadId = _uploadId, + PartNumber = partNumber, + InputStream = _stream, + PartSize = MaxPartSize + }; + request.StreamTransferProgress += new EventHandler( + (_, e) => + { + _logger.LogDebug( + "Transferred {e.TransferredBytes}/{e.TotalBytes}", + e.TransferredBytes, + e.TotalBytes ); } + ); + UploadPartResponse response = await _client.UploadPartAsync(request, cancellationToken); + if (response.HttpStatusCode != HttpStatusCode.OK) + { + throw new HttpRequestException( + $"Tried to upload part {partNumber} of upload {_uploadId} to {_bucketName}/{_key} but received response code {response.HttpStatusCode}" + ); + } - _uploadResponses.Add(response); + _uploadResponses.Add(response); - bytesWritten += MaxPartSize; - } + _bytesWritten += MaxPartSize; + _stream.SetLength(0); } catch (Exception e) { @@ -104,6 +120,7 @@ public override async Task WriteAsync(byte[] buffer, int offset, int count, Canc protected override void Dispose(bool disposing) { + UploadPartAsync().WaitAndUnwrapException(); try { if (disposing) @@ -164,6 +181,7 @@ protected override void Dispose(bool disposing) public override async ValueTask DisposeAsync() { + await UploadPartAsync(); try { if (_uploadResponses.Count == 0) diff --git a/src/Serval/src/Serval.Shared/Serval.Shared.csproj b/src/Serval/src/Serval.Shared/Serval.Shared.csproj index 75ccbd9b..0e504535 100644 --- a/src/Serval/src/Serval.Shared/Serval.Shared.csproj +++ b/src/Serval/src/Serval.Shared/Serval.Shared.csproj @@ -19,7 +19,7 @@ - + diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj index f9476b69..a64c5d85 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj @@ -17,7 +17,7 @@ - + diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs index 1556de6d..1be70d5e 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs @@ -1,11 +1,13 @@ +using Nito.AsyncEx; + namespace SIL.ServiceToolkit.Utils; public interface IParallelCorpusPreprocessingService { - void Preprocess( + Task Preprocess( IReadOnlyList corpora, - Action train, - Action pretranslate, + Func train, + Func pretranslate, bool useKeyTerms = false ); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index e75a2d59..25d6b55c 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -25,10 +25,10 @@ internal int Seed } } - public void Preprocess( + public async Task Preprocess( IReadOnlyList corpora, - Action train, - Action pretranslate, + Func train, + Func pretranslate, bool useKeyTerms = false ) { @@ -77,7 +77,7 @@ public void Preprocess( foreach (Row row in CollapseRanges(trainingRows)) { - train(row); + await train(row); } if (useKeyTerms) @@ -93,7 +93,7 @@ public void Preprocess( IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus); foreach (ParallelTextRow row in parallelKeyTermsCorpus) { - train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1)); + await train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1)); } } } @@ -106,7 +106,7 @@ public void Preprocess( foreach (Row row in CollapseRanges(pretranslateCorpus.ToArray())) { - pretranslate(row, corpus); + await pretranslate(row, corpus); } } } diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs index 543332e2..033467f4 100644 --- a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs @@ -13,7 +13,7 @@ public class ParallelCorpusPreprocessingServiceTests ); [Test] - public void TestParallelCorpusPreprocessor() + public async Task TestParallelCorpusPreprocessor() { ParallelCorpusPreprocessingService processor = new(new CorpusService()); List corpora = @@ -73,17 +73,19 @@ public void TestParallelCorpusPreprocessor() ]; int trainCount = 0; int pretranslateCount = 0; - processor.Preprocess( + await processor.Preprocess( corpora, row => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) trainCount++; + return Task.CompletedTask; }, (row, _) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length == 0) pretranslateCount++; + return Task.CompletedTask; }, false ); From 0635292df8a715ce141d033f76bda85e7bc4ce6d Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 27 Nov 2024 10:09:47 -0500 Subject: [PATCH 2/6] update QA to 1.8.1 --- deploy/qa-ext-values.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deploy/qa-ext-values.yaml b/deploy/qa-ext-values.yaml index 7106e030..5d7d1ecf 100644 --- a/deploy/qa-ext-values.yaml +++ b/deploy/qa-ext-values.yaml @@ -1,6 +1,6 @@ externalHost: qa.serval-api.org environment: Production -deploymentVersion: '1.7.QA7' +deploymentVersion: '1.8.QA1' alertEmail: ext-qa-serval-alerts@languagetechnology.org emailsToAlert: john_lambert@sil.org enableTls: true @@ -8,8 +8,8 @@ namespace: serval auth0Domain: dev-sillsdev.auth0.com lokiTenent: serval-tenant lokiUrl: http://loki-distributed-gateway.loki.svc.cluster.local -servalImage: ghcr.io/sillsdev/serval:1.7.7 -ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.7.2 +servalImage: ghcr.io/sillsdev/serval:1.8.1 +ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.8.1 ClearMLQueue: production MongoConnectionPrefix: qa_ SharedFileLocation: s3://silnlp/ext-qa/ From 50387ece0e0a8dfbf4d6ec70d045a19e30c5122e Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 26 Nov 2024 15:42:45 -0500 Subject: [PATCH 3/6] Use chapter-filtering for terms --- .../Services/PreprocessBuildJobTests.cs | 144 +++++++++++++++++- .../data/pt-source1/TermRenderings.xml | 7 + .../data/pt-target1/TermRenderings.xml | 7 + .../Services/CorpusService.cs | 8 +- .../Services/ICorpusService.cs | 4 +- .../ParallelCorpusPreprocessingService.cs | 22 ++- 6 files changed, 179 insertions(+), 13 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 02669cb4..2a5c82ae 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -359,7 +359,7 @@ public void RunAsync_OnlyParseSelectedBooks_PretranslateOnBadBook() } [Test] - public async Task ParallelCorpusLogic() + public async Task ParallelCorpusAsync() { using TestEnvironment env = new(); var corpora = new List() @@ -508,6 +508,148 @@ public async Task ParallelCorpusLogic() }); } + [Test] + public async Task ParallelCorpusAsync_UseKeyTerms() + { + using TestEnvironment env = new(); + var corpora = new List() + { + new ParallelCorpus() + { + Id = "1", + SourceCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "LEV", + new() { } + } + }, + PretranslateChapters = new() + { + { + "1CH", + new() { } + } + } + }, + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + }, + PretranslateChapters = new() { } + }, + }, + TargetCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-target1") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + } + } + }, + new() + { + Id = "_2", + Language = "en", + Files = new List { env.ParatextFile("pt-target2") }, + TrainOnChapters = new() + { + { + "MAT", + new() { 1 } + }, + { + "MRK", + new() { } + }, + { + "LEV", + new() { } + } + } + } + } + } + }; + await env.RunBuildJobAsync(corpora, useKeyTerms: true); + string source = await env.GetSourceExtractAsync(); + string target = await env.GetTargetExtractAsync(); + Assert.Multiple(() => + { + StringAssert.StartsWith( + @"Source one, chapter fourteen, verse fifty-five. Segment b. +Source one, chapter fourteen, verse fifty-six. +Source two, chapter one, verse one. +Source two, chapter one, verse two. +Source two, chapter one, verse three. +Source one, chapter one, verse four. +Source two, chapter one, verse five. Source two, chapter one, verse six. +Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. +Source two, chapter one, verse one. +", + source + ); + StringAssert.StartsWith( + @"Target two, chapter fourteen, verse fifty-five. +Target two, chapter fourteen, verse fifty-six. +Target one, chapter one, verse one. +Target one, chapter one, verse two. +Target one, chapter one, verse three. + +Target one, chapter one, verse five and six. +Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. + +", + target + ); + StringAssert.Contains("Abraham", source); + StringAssert.Contains("Abraham", target); + StringAssert.DoesNotContain("Zedekiah", source); + StringAssert.DoesNotContain("Zedekiah", target); + }); + JsonArray? pretranslations = await env.GetPretranslationsAsync(); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); + Assert.That( + pretranslations[2]!["translation"]!.ToString(), + Is.EqualTo("Source one, chapter twelve, verse one.") + ); + } + private class TestEnvironment : DisposableBase { private static readonly string TestDataPath = Path.Combine( diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-source1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + + diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml index 03e45020..b5c2bb97 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/data/pt-target1/TermRenderings.xml @@ -6,4 +6,11 @@ + + Zedekiah + + + + + diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 71d49a50..231d9083 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -36,14 +36,16 @@ public IEnumerable CreateTextCorpora(IReadOnlyList file return corpora; } - public IEnumerable CreateTermCorpora(IReadOnlyList files) + public IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> corpora + ) { - foreach (CorpusFile file in files) + foreach ((CorpusFile file, Dictionary>? chapters) in corpora) { switch (file.Format) { case FileFormat.Paratext: - yield return new ParatextBackupTermsCorpus(file.Location, ["PN"]); + yield return new ParatextBackupTermsCorpus(file.Location, ["PN"], chapters: chapters); break; } } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs index babe8c9b..0c9a82ab 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs @@ -3,5 +3,7 @@ public interface ICorpusService { IEnumerable CreateTextCorpora(IReadOnlyList files); - IEnumerable CreateTermCorpora(IReadOnlyList files); + IEnumerable CreateTermCorpora( + IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> files + ); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index 25d6b55c..0e0a68d7 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -82,15 +82,21 @@ public async Task Preprocess( if (useKeyTerms) { - ITextCorpus? sourceTermCorpus = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList()) - .FirstOrDefault(); - ITextCorpus? targetTermCorpus = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList()) - .FirstOrDefault(); - if (sourceTermCorpus is not null && targetTermCorpus is not null) + ITextCorpus[]? sourceTermCorpora = _corpusService + .CreateTermCorpora( + corpus.SourceCorpora.SelectMany(sc => sc.Files.Select(f => (f, sc.TrainOnChapters))).ToArray() + ) + .ToArray(); + ITextCorpus[]? targetTermCorpora = _corpusService + .CreateTermCorpora( + corpus.TargetCorpora.SelectMany(tc => tc.Files.Select(f => (f, tc.TrainOnChapters))).ToArray() + ) + .ToArray(); + if (sourceTermCorpora is not null && targetTermCorpora is not null) { - IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus); + IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpora + .ChooseRandom(Seed) + .AlignRows(targetTermCorpora.ChooseFirst()); foreach (ParallelTextRow row in parallelKeyTermsCorpus) { await train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1)); From 2d734885ae47ec41431da148f1589af80174fe4d Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 26 Nov 2024 17:45:03 -0500 Subject: [PATCH 4/6] Add support for key term filtering by textId --- .../Services/PreprocessBuildJobTests.cs | 98 +++++++++++++++++++ .../Services/CorpusService.cs | 4 +- .../Services/ICorpusService.cs | 2 +- .../ParallelCorpusPreprocessingService.cs | 21 ++-- 4 files changed, 116 insertions(+), 9 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 2a5c82ae..81c4cec2 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -633,6 +633,104 @@ public async Task ParallelCorpusAsync_UseKeyTerms() Target one, chapter one, verse five and six. Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. +", + target + ); + StringAssert.Contains("Abraham", source); + StringAssert.Contains("Abraham", target); + StringAssert.DoesNotContain("Zedekiah", source); + StringAssert.DoesNotContain("Zedekiah", target); + }); + JsonArray? pretranslations = await env.GetPretranslationsAsync(); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations!.Count, Is.EqualTo(7), pretranslations.ToJsonString()); + Assert.That( + pretranslations[2]!["translation"]!.ToString(), + Is.EqualTo("Source one, chapter twelve, verse one.") + ); + } + + [Test] + public async Task ParallelCorpusAsync_UseKeyTerms_TextIds() + { + using TestEnvironment env = new(); + var corpora = new List() + { + new ParallelCorpus() + { + Id = "1", + SourceCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source1") }, + TrainOnTextIds = ["MAT", "LEV"], + PretranslateTextIds = ["1CH"] + }, + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-source2") }, + TrainOnTextIds = ["MAT", "MRK"], + PretranslateTextIds = [] + }, + }, + TargetCorpora = new List() + { + new() + { + Id = "_1", + Language = "en", + Files = new List { env.ParatextFile("pt-target1") }, + TrainOnTextIds = ["MAT", "MRK"] + }, + new() + { + Id = "_2", + Language = "en", + Files = new List { env.ParatextFile("pt-target2") }, + TrainOnTextIds = ["MAT", "MRK", "LEV"] + } + } + } + }; + await env.RunBuildJobAsync(corpora, useKeyTerms: true); + string source = await env.GetSourceExtractAsync(); + string target = await env.GetTargetExtractAsync(); + Assert.Multiple(() => + { + StringAssert.StartsWith( + @"Source one, chapter fourteen, verse fifty-five. Segment b. +Source one, chapter fourteen, verse fifty-six. +Source two, chapter one, verse one. +Source two, chapter one, verse two. +Source two, chapter one, verse three. +Source one, chapter one, verse four. +Source two, chapter one, verse five. Source two, chapter one, verse six. +Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. +Source one, chapter two, verse one. +Source one, chapter two, verse two. + +Source two, chapter one, verse one. +", + source + ); + StringAssert.StartsWith( + @"Target two, chapter fourteen, verse fifty-five. +Target two, chapter fourteen, verse fifty-six. +Target one, chapter one, verse one. +Target one, chapter one, verse two. +Target one, chapter one, verse three. + +Target one, chapter one, verse five and six. +Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. +Target one, chapter two, verse one. + +Target one, chapter two, verse three. + ", target ); diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 231d9083..793e5046 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -37,10 +37,10 @@ public IEnumerable CreateTextCorpora(IReadOnlyList file } public IEnumerable CreateTermCorpora( - IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> corpora + IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora ) { - foreach ((CorpusFile file, Dictionary>? chapters) in corpora) + foreach ((CorpusFile file, Dictionary> chapters) in corpora) { switch (file.Format) { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs index 0c9a82ab..3f19fccc 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs @@ -4,6 +4,6 @@ public interface ICorpusService { IEnumerable CreateTextCorpora(IReadOnlyList files); IEnumerable CreateTermCorpora( - IReadOnlyList<(CorpusFile File, Dictionary>? Chapters)> files + IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora ); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index 0e0a68d7..9469719a 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -83,14 +83,10 @@ public async Task Preprocess( if (useKeyTerms) { ITextCorpus[]? sourceTermCorpora = _corpusService - .CreateTermCorpora( - corpus.SourceCorpora.SelectMany(sc => sc.Files.Select(f => (f, sc.TrainOnChapters))).ToArray() - ) + .CreateTermCorpora(corpus.SourceCorpora.SelectMany(GetChapters).ToArray()) .ToArray(); ITextCorpus[]? targetTermCorpora = _corpusService - .CreateTermCorpora( - corpus.TargetCorpora.SelectMany(tc => tc.Files.Select(f => (f, tc.TrainOnChapters))).ToArray() - ) + .CreateTermCorpora(corpus.TargetCorpora.SelectMany(GetChapters).ToArray()) .ToArray(); if (sourceTermCorpora is not null && targetTermCorpora is not null) { @@ -117,6 +113,19 @@ public async Task Preprocess( } } + private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChapters( + MonolingualCorpus mc + ) + { + Dictionary>? chapters = mc.TrainOnChapters; + if (chapters is null && mc.TrainOnTextIds is not null) + { + chapters = mc.TrainOnTextIds.Select(tid => (tid, new HashSet { })).ToDictionary(); + } + chapters ??= []; + return mc.Files.Select(f => (f, chapters)); + } + private static ITextCorpus FilterPretranslateCorpora(MonolingualCorpus corpus, ITextCorpus textCorpus) { textCorpus = textCorpus.Transform(CleanSegment); From 69d6d527d01c6f29868acefd1954af10e3f10a98 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 26 Nov 2024 17:51:01 -0500 Subject: [PATCH 5/6] Change function name --- .../Services/ParallelCorpusPreprocessingService.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index 9469719a..e1dace1e 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -83,10 +83,10 @@ public async Task Preprocess( if (useKeyTerms) { ITextCorpus[]? sourceTermCorpora = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(GetChapters).ToArray()) + .CreateTermCorpora(corpus.SourceCorpora.SelectMany(GetChaptersPerFile).ToArray()) .ToArray(); ITextCorpus[]? targetTermCorpora = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(GetChapters).ToArray()) + .CreateTermCorpora(corpus.TargetCorpora.SelectMany(GetChaptersPerFile).ToArray()) .ToArray(); if (sourceTermCorpora is not null && targetTermCorpora is not null) { @@ -113,7 +113,7 @@ public async Task Preprocess( } } - private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChapters( + private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChaptersPerFile( MonolingualCorpus mc ) { From 551e6cdc52d6177c2bcb124f4ce82ce5246498bb Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 27 Nov 2024 10:39:57 -0500 Subject: [PATCH 6/6] When no filter is specified, only include kbts that are associated with the texts of the corpus --- .../Services/PreprocessBuildJobTests.cs | 16 ++++++++-------- .../Services/CorpusService.cs | 2 +- .../ParallelCorpusPreprocessingService.cs | 17 +++++++++++++---- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 81c4cec2..13785191 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -118,10 +118,10 @@ public async Task RunAsync_EnableKeyTerms() (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); Assert.Multiple(() => { - Assert.That(src1Count, Is.EqualTo(0)); + Assert.That(src1Count, Is.EqualTo(14)); Assert.That(src2Count, Is.EqualTo(0)); - Assert.That(trgCount, Is.EqualTo(0)); - Assert.That(termCount, Is.EqualTo(5726)); + Assert.That(trgCount, Is.EqualTo(1)); + Assert.That(termCount, Is.EqualTo(166)); }); } @@ -136,9 +136,9 @@ public async Task RunAsync_DisableKeyTerms() (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); Assert.Multiple(() => { - Assert.That(src1Count, Is.EqualTo(0)); + Assert.That(src1Count, Is.EqualTo(14)); Assert.That(src2Count, Is.EqualTo(0)); - Assert.That(trgCount, Is.EqualTo(0)); + Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); } @@ -853,8 +853,8 @@ public TestEnvironment() Id = "src_1", Language = "es", Files = [ParatextFile("pt-source1")], - TrainOnTextIds = [], - PretranslateTextIds = [] + TrainOnTextIds = null, + PretranslateTextIds = null } }, TargetCorpora = new List() @@ -864,7 +864,7 @@ public TestEnvironment() Id = "trg_1", Language = "en", Files = [ParatextFile("pt-target1")], - TrainOnTextIds = [] + TrainOnTextIds = null } } }; diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 793e5046..dcabcd2d 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -40,7 +40,7 @@ public IEnumerable CreateTermCorpora( IReadOnlyList<(CorpusFile File, Dictionary> Chapters)> corpora ) { - foreach ((CorpusFile file, Dictionary> chapters) in corpora) + foreach ((CorpusFile file, Dictionary>? chapters) in corpora) { switch (file.Format) { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index e1dace1e..71769985 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -83,10 +83,18 @@ public async Task Preprocess( if (useKeyTerms) { ITextCorpus[]? sourceTermCorpora = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(GetChaptersPerFile).ToArray()) + .CreateTermCorpora( + sourceCorpora + .SelectMany(corpus => GetChaptersPerFile(corpus.Corpus, corpus.TextCorpus)) + .ToArray() + ) .ToArray(); ITextCorpus[]? targetTermCorpora = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(GetChaptersPerFile).ToArray()) + .CreateTermCorpora( + targetCorpora + .SelectMany(corpus => GetChaptersPerFile(corpus.Corpus, corpus.TextCorpus)) + .ToArray() + ) .ToArray(); if (sourceTermCorpora is not null && targetTermCorpora is not null) { @@ -114,7 +122,8 @@ public async Task Preprocess( } private static IEnumerable<(CorpusFile File, Dictionary> Chapters)> GetChaptersPerFile( - MonolingualCorpus mc + MonolingualCorpus mc, + ITextCorpus tc ) { Dictionary>? chapters = mc.TrainOnChapters; @@ -122,7 +131,7 @@ MonolingualCorpus mc { chapters = mc.TrainOnTextIds.Select(tid => (tid, new HashSet { })).ToDictionary(); } - chapters ??= []; + chapters ??= tc.Texts.Select(t => (t.Id, new HashSet() { })).ToDictionary(); return mc.Files.Select(f => (f, chapters)); }