From aa7f31f53940ec54c8f5bd820c1bb38e7a79dd9c Mon Sep 17 00:00:00 2001 From: "Eli C. Lowry" <83078660+Enkidu93@users.noreply.github.com> Date: Tue, 3 Dec 2024 12:49:11 -0500 Subject: [PATCH] Deduplicate kbts (#551) * Deduplicate kbts * Switch to tuple --- .../Services/PreprocessBuildJobTests.cs | 2 +- .../Services/ParallelCorpusPreprocessingService.cs | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 13785191..470817cc 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -121,7 +121,7 @@ public async Task RunAsync_EnableKeyTerms() Assert.That(src1Count, Is.EqualTo(14)); Assert.That(src2Count, Is.EqualTo(0)); Assert.That(trgCount, Is.EqualTo(1)); - Assert.That(termCount, Is.EqualTo(166)); + Assert.That(termCount, Is.EqualTo(144)); }); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index 9d49e45a..a5fb70ac 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -101,7 +101,11 @@ public async Task PreprocessAsync( IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpora .ChooseRandom(Seed) .AlignRows(targetTermCorpora.ChooseFirst()); - foreach (ParallelTextRow row in parallelKeyTermsCorpus) + foreach ( + ParallelTextRow row in parallelKeyTermsCorpus.DistinctBy(row => + (row.SourceText, row.TargetText) + ) + ) { await train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1)); }