From 7b9c4a8e87b6d8dce9c9b13ca160c41e3b6021c8 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 2 Dec 2024 14:06:14 -0500 Subject: [PATCH] Deduplicate kbts --- .../Services/PreprocessBuildJobTests.cs | 2 +- .../Services/ParallelCorpusPreprocessingService.cs | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 13785191..470817cc 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -121,7 +121,7 @@ public async Task RunAsync_EnableKeyTerms() Assert.That(src1Count, Is.EqualTo(14)); Assert.That(src2Count, Is.EqualTo(0)); Assert.That(trgCount, Is.EqualTo(1)); - Assert.That(termCount, Is.EqualTo(166)); + Assert.That(termCount, Is.EqualTo(144)); }); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs index 71769985..7ef4d67c 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -101,7 +101,9 @@ public async Task Preprocess( IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpora .ChooseRandom(Seed) .AlignRows(targetTermCorpora.ChooseFirst()); - foreach (ParallelTextRow row in parallelKeyTermsCorpus) + foreach ( + ParallelTextRow row in parallelKeyTermsCorpus.DistinctBy(row => row.SourceText + row.TargetText) + ) { await train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1)); }