diff --git a/.github/workflows/ci-e2e.yml b/.github/workflows/ci-e2e.yml
index 472e33d0..fc2a72df 100644
--- a/.github/workflows/ci-e2e.yml
+++ b/.github/workflows/ci-e2e.yml
@@ -10,7 +10,7 @@ jobs:
build:
name: Build
runs-on: ubuntu-latest
- timeout-minutes: 45
+ timeout-minutes: 60
env:
SERVAL_CLIENT_ID: ${{ secrets.SERVAL_CLIENT_ID }}
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 4c5aadb3..cbe0a073 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -28,6 +28,7 @@
"ptcc",
"Rebinder",
"stylesheet",
+ "timespan",
"upserted",
"USFM"
],
diff --git a/README.md b/README.md
index 4e2b2880..326d20ab 100644
--- a/README.md
+++ b/README.md
@@ -60,15 +60,13 @@ There are 3 different environments that Serval is deployed to:
- Run `kubectl config use-context dallas-rke`
- First, startup the storage (using internal qa for example)
- `helm install serval-pvc deploy/serval-pvc -n nlp -f deploy/qa-int-values.yaml`
-- Then, startup the database (give it 60 seconds)
-- `helm install mongo deploy/mongo -n nlp -f deploy/qa-int-values.yaml`
- Now you can turn on Serval
- `helm install serval deploy/serval -n nlp -f deploy/qa-int-values.yaml`
### To update the cluster
- To upgrade Serval:
- For QA internal Run:
- - `kubectl config use-context dallas-rke`
+ - `kubectl config use-context dallas-stage`
- `helm upgrade serval deploy/serval -n nlp -f deploy/qa-int-values.yaml`
- For QA external Run:
- `kubectl config use-context dallas-rke`
diff --git a/Serval.sln b/Serval.sln
index edd3f075..12c0aaaf 100644
--- a/Serval.sln
+++ b/Serval.sln
@@ -86,6 +86,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{C3A14577-A65
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit", "src\ServiceToolkit\src\SIL.ServiceToolkit\SIL.ServiceToolkit.csproj", "{0E40F959-C641-40A2-9750-B17A4F9F9E55}"
EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit.Tests", "src\ServiceToolkit\test\SIL.ServiceToolkit.Tests\SIL.ServiceToolkit.Tests.csproj", "{C50ED15A-876D-42BF-980A-388E8C49C78D}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -180,6 +184,10 @@ Global
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Debug|Any CPU.Build.0 = Debug|Any CPU
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.ActiveCfg = Release|Any CPU
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.Build.0 = Release|Any CPU
+ {C50ED15A-876D-42BF-980A-388E8C49C78D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {C50ED15A-876D-42BF-980A-388E8C49C78D}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {C50ED15A-876D-42BF-980A-388E8C49C78D}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {C50ED15A-876D-42BF-980A-388E8C49C78D}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -215,6 +223,8 @@ Global
{10657805-48F1-4205-B8F5-79447F6EF620} = {25CDB05B-4E24-4A6E-933E-1E0BEC97D74D}
{C3A14577-A654-4604-818C-4E683DD45A51} = {EA69B41C-49EF-4017-A687-44B9DF37FF98}
{0E40F959-C641-40A2-9750-B17A4F9F9E55} = {C3A14577-A654-4604-818C-4E683DD45A51}
+ {1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126} = {EA69B41C-49EF-4017-A687-44B9DF37FF98}
+ {C50ED15A-876D-42BF-980A-388E8C49C78D} = {1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {9F18C25E-E140-43C3-B177-D562E1628370}
diff --git a/deploy/mongo/Chart.yaml b/deploy/mongo/Chart.yaml
deleted file mode 100644
index e7a63115..00000000
--- a/deploy/mongo/Chart.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-name: mongo-repl
-description: A mongo deployment to support serval
-version: 0.0.1
-apiVersion: v1
-keywords:
- - mongo
-sources:
-home:
diff --git a/deploy/mongo/templates/mongo-deployment.yaml b/deploy/mongo/templates/mongo-deployment.yaml
deleted file mode 100644
index 8ae37d93..00000000
--- a/deploy/mongo/templates/mongo-deployment.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
- labels:
- app: mongo
- name: mongo
-spec:
- replicas: 1
- selector:
- matchLabels:
- app: mongo
- strategy:
- type: Recreate
- template:
- metadata:
- labels:
- app: mongo
- spec:
- terminationGracePeriodSeconds: 30
- containers:
- - command: ["/bin/sh", "-c"]
- args: ['mongod --replSet myRS --bind_ip 0.0.0.0 & sleep 15s; mongosh --host localhost:27017 --eval '' config = { "_id" : "myRS", "members" : [{"_id" : 0,"host" : "mongo:27017"}] }; rs.initiate(config, { force: true }); '' ; sleep infinity']
- image: mongo:6.0
- imagePullPolicy: "Always"
- name: mongo
- ports:
- - containerPort: 27017
- resources:
- limits:
- memory: "2000Mi"
- cpu: "1000m"
- requests:
- memory: "2000Mi"
- cpu: "1000m"
- volumeMounts:
- - mountPath: /data/db
- name: mongo-data
- hostname: mongo
- restartPolicy: Always
- volumes:
- - name: mongo-data
- persistentVolumeClaim:
- claimName: serval-mongo-claim
-status: {}
diff --git a/deploy/mongo/templates/mongo-service.yaml b/deploy/mongo/templates/mongo-service.yaml
deleted file mode 100644
index f787c84e..00000000
--- a/deploy/mongo/templates/mongo-service.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
- labels:
- app: mongo
- name: mongo
-spec:
- ports:
- - name: "27017"
- port: 27017
- targetPort: 27017
- selector:
- app: mongo
-status:
- loadBalancer: {}
diff --git a/deploy/qa-ext-values.yaml b/deploy/qa-ext-values.yaml
index 96074da6..7106e030 100644
--- a/deploy/qa-ext-values.yaml
+++ b/deploy/qa-ext-values.yaml
@@ -1,6 +1,6 @@
externalHost: qa.serval-api.org
environment: Production
-deploymentVersion: '1.6.QA3'
+deploymentVersion: '1.7.QA7'
alertEmail: ext-qa-serval-alerts@languagetechnology.org
emailsToAlert: john_lambert@sil.org
enableTls: true
@@ -8,8 +8,8 @@ namespace: serval
auth0Domain: dev-sillsdev.auth0.com
lokiTenent: serval-tenant
lokiUrl: http://loki-distributed-gateway.loki.svc.cluster.local
-servalImage: ghcr.io/sillsdev/serval:1.6.3
-ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.6.3
+servalImage: ghcr.io/sillsdev/serval:1.7.7
+ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.7.2
ClearMLQueue: production
MongoConnectionPrefix: qa_
SharedFileLocation: s3://silnlp/ext-qa/
diff --git a/deploy/qa-int-values.yaml b/deploy/qa-int-values.yaml
index 21aaec25..e047f4a7 100644
--- a/deploy/qa-int-values.yaml
+++ b/deploy/qa-int-values.yaml
@@ -8,11 +8,11 @@ namespace: nlp
auth0Domain: sil-appbuilder.auth0.com
lokiTenent: nlp-tenant
lokiUrl: http://loki-distributed-gateway.loki.svc.cluster.local
-servalImage: ghcr.io/sillsdev/serval:1.6.1
+servalImage: ghcr.io/sillsdev/serval:1.7.0
ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.6.3
ClearMLQueue: lambert_24gb
MongoConnectionPrefix: qa_int_
SharedFileLocation: s3://silnlp/int-qa/
-servalClaimSize: 1Gi
-machineClaimSize: 2Gi
+servalClaimSize: 5Gi
+machineClaimSize: 20Gi
enableEcho: true
\ No newline at end of file
diff --git a/deploy/serval-pvc/templates/persistent-volume-claims.yaml b/deploy/serval-pvc/templates/persistent-volume-claims.yaml
index 5acc3718..c4f1a8d5 100644
--- a/deploy/serval-pvc/templates/persistent-volume-claims.yaml
+++ b/deploy/serval-pvc/templates/persistent-volume-claims.yaml
@@ -35,17 +35,4 @@ spec:
- ReadWriteMany
resources:
requests:
- storage: 50M
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
- name: serval-mongo-claim
- namespace: {{ .Values.namespace}}
-spec:
- storageClassName: "longhorn"
- accessModes:
- - ReadWriteMany
- resources:
- requests:
- storage: 10Gi
\ No newline at end of file
+ storage: 55M
\ No newline at end of file
diff --git a/deploy/serval/templates/fluentd-flows.yaml b/deploy/serval/templates/fluentd-flows.yaml
index 84db700e..2d9729bc 100644
--- a/deploy/serval/templates/fluentd-flows.yaml
+++ b/deploy/serval/templates/fluentd-flows.yaml
@@ -26,21 +26,3 @@ spec:
- echo
hosts: []
labels: {}
----
-apiVersion: logging.banzaicloud.io/v1beta1
-kind: Flow
-metadata:
- name: mongo-flow
- namespace: {{ .Values.namespace }}
-spec:
- globalOutputRefs: []
- localOutputRefs:
- - {{ .Values.namespace }}-loki-output
- match:
- - select:
- container_names:
- - mongo
- hosts: []
- labels: {}
-status:
- active: true
diff --git a/docker-compose.yml b/docker-compose.yml
index 8592c6e7..6e568f99 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -182,4 +182,4 @@ services:
'/bin/sh',
'-c',
'mongod --quiet --replSet myRS --bind_ip 0.0.0.0 & sleep 2s; mongosh --host localhost:27017 --eval '' config = { "_id" : "myRS", "members" : [{"_id" : 0,"host" : "mongo:27017"}] }; rs.initiate(config, { force: true }); '' ; sleep infinity'
- ]
+ ]
\ No newline at end of file
diff --git a/samples/ApiExample/ApiExample.csproj b/samples/ApiExample/ApiExample.csproj
new file mode 100644
index 00000000..9a87fdcc
--- /dev/null
+++ b/samples/ApiExample/ApiExample.csproj
@@ -0,0 +1,28 @@
+
+
+
+ Exe
+ net8.0
+ enable
+ enable
+ 4d0606c3-0fc7-4d76-b43b-236485004e81
+
+
+
+
+ PreserveNewest
+
+
+ PreserveNewest
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/samples/ApiExample/ApiExample.sln b/samples/ApiExample/ApiExample.sln
new file mode 100644
index 00000000..dbdd4696
--- /dev/null
+++ b/samples/ApiExample/ApiExample.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 17
+VisualStudioVersion = 17.11.35327.3
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ApiExample", "ApiExample.csproj", "{F80F8853-776B-4C3A-B789-B8FD5820150A}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Any CPU = Debug|Any CPU
+ Release|Any CPU = Release|Any CPU
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {F80F8853-776B-4C3A-B789-B8FD5820150A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {F80F8853-776B-4C3A-B789-B8FD5820150A}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {F80F8853-776B-4C3A-B789-B8FD5820150A}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {F80F8853-776B-4C3A-B789-B8FD5820150A}.Release|Any CPU.Build.0 = Release|Any CPU
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {72D18D80-E951-41EE-8A1F-97B2B72615AD}
+ EndGlobalSection
+EndGlobal
diff --git a/samples/ApiExample/Program.cs b/samples/ApiExample/Program.cs
new file mode 100644
index 00000000..00dd0830
--- /dev/null
+++ b/samples/ApiExample/Program.cs
@@ -0,0 +1,318 @@
+using System.IO.Compression;
+using ApiExample;
+using IdentityModel.Client;
+using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.DependencyInjection;
+using Newtonsoft.Json.Linq;
+using Serval.Client;
+
+// Setup the services; GetRequiredService fails fast with a clear error if a client is not registered
+ServiceProvider services = SetupServices();
+IDataFilesClient dataFilesClient = services.GetRequiredService<IDataFilesClient>();
+ICorporaClient corporaClient = services.GetRequiredService<ICorporaClient>();
+ITranslationEnginesClient translationEnginesClient = services.GetRequiredService<ITranslationEnginesClient>();
+
+// Trap Ctrl+C: signal cancellation but keep the process alive so the created entities can be cleaned up
+var cancellationTokenSource = new CancellationTokenSource();
+Console.CancelKeyPress += (_, eventArgs) =>
+{
+    Console.WriteLine("Cancelling...");
+    cancellationTokenSource.Cancel();
+    eventArgs.Cancel = true;
+};
+
+// Create then tear down a pre-translation (NMT) engine
+await CreatePreTranslationEngineAsync(cancellationTokenSource.Token);
+
+// Exit
+return;
+
+// Builds the dependency injection container: loads configuration, sets up OAuth client-credentials
+// token management, and registers the Serval API clients on a shared, named HTTP client.
+static ServiceProvider SetupServices()
+{
+    const string HttpClientName = "serval-api";
+    const string TokenClientName = "serval-api-token";
+
+    // appsettings.json must exist; user secrets are layered on top of it
+    IConfiguration configuration = new ConfigurationBuilder()
+        .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true)
+        .AddUserSecrets<Program>()
+        .Build();
+    ServalOptions servalOptions =
+        configuration.GetSection("Serval").Get<ServalOptions>()
+        ?? throw new InvalidOperationException("The 'Serval' configuration section is missing.");
+
+    var services = new ServiceCollection();
+    services.AddDistributedMemoryCache();
+    services
+        .AddClientCredentialsTokenManagement()
+        .AddClient(
+            TokenClientName,
+            client =>
+            {
+                client.TokenEndpoint = servalOptions.TokenUrl;
+                client.ClientId = servalOptions.ClientId;
+                client.ClientSecret = servalOptions.ClientSecret;
+                client.Parameters = new Parameters { { "audience", servalOptions.Audience } };
+            }
+        );
+    services.AddClientCredentialsHttpClient(
+        HttpClientName,
+        TokenClientName,
+        configureClient: client => client.BaseAddress = new Uri(servalOptions.ApiServer)
+    );
+    services.AddHttpClient(HttpClientName).SetHandlerLifetime(TimeSpan.FromMinutes(5));
+
+    // Register each Serval client against its interface, built on the named HTTP client
+    services.AddSingleton<ITranslationEnginesClient>(sp =>
+    {
+        HttpClient httpClient = sp.GetRequiredService<IHttpClientFactory>().CreateClient(HttpClientName);
+        return new TranslationEnginesClient(httpClient);
+    });
+    services.AddSingleton<IDataFilesClient>(sp =>
+    {
+        HttpClient httpClient = sp.GetRequiredService<IHttpClientFactory>().CreateClient(HttpClientName);
+        return new DataFilesClient(httpClient);
+    });
+    services.AddSingleton<ICorporaClient>(sp =>
+    {
+        HttpClient httpClient = sp.GetRequiredService<IHttpClientFactory>().CreateClient(HttpClientName);
+        return new CorporaClient(httpClient);
+    });
+    return services.BuildServiceProvider();
+}
+
+// Uploads the sample data, builds an NMT engine, prints the LAO pre-translation, then cleans up what it created.
+async Task CreatePreTranslationEngineAsync(CancellationToken cancellationToken)
+{
+    string? sourceDataFileId = null;
+    string? targetDataFileId = null;
+    string? sourceCorpusId = null;
+    string? targetCorpusId = null;
+    string? parallelCorpusId = null;
+    string? translationEngineId = null;
+
+    try
+    {
+        // 1a. Create the source data file
+        Console.WriteLine("Create a source data file");
+        const string SourceDirectory = "TEA";
+        const string SourceFileName = $"{SourceDirectory}.zip";
+        await using (var sourceFileStream = new MemoryStream())
+        {
+            ZipFile.CreateFromDirectory(Path.Combine("data", SourceDirectory), sourceFileStream);
+            sourceFileStream.Seek(0, SeekOrigin.Begin);
+            DataFile sourceDataFile = await dataFilesClient.CreateAsync(
+                new FileParameter(sourceFileStream, SourceFileName),
+                FileFormat.Paratext,
+                SourceFileName,
+                cancellationToken
+            );
+            sourceDataFileId = sourceDataFile.Id;
+        }
+
+        // 1b. Create the target data file
+        Console.WriteLine("Create a target data file");
+        const string TargetDirectory = "TMA";
+        const string TargetFileName = $"{TargetDirectory}.zip";
+        await using (var targetFileStream = new MemoryStream())
+        {
+            ZipFile.CreateFromDirectory(Path.Combine("data", TargetDirectory), targetFileStream);
+            targetFileStream.Seek(0, SeekOrigin.Begin);
+            DataFile targetDataFile = await dataFilesClient.CreateAsync(
+                new FileParameter(targetFileStream, TargetFileName),
+                FileFormat.Paratext,
+                TargetFileName,
+                cancellationToken
+            );
+            targetDataFileId = targetDataFile.Id;
+        }
+
+        // 2a. Create the source corpus
+        // NOTE: The text id for the source and target corpora must match
+        Console.WriteLine("Create the source corpus");
+        const string SourceLanguageCode = "en";
+        var corpusConfig = new CorpusConfig
+        {
+            Name = "English Source Corpus",
+            Files = [new CorpusFileConfig { FileId = sourceDataFileId, TextId = "TestData" }],
+            Language = SourceLanguageCode,
+        };
+        Corpus translationCorpus = await corporaClient.CreateAsync(corpusConfig, cancellationToken);
+        sourceCorpusId = translationCorpus.Id;
+
+        // 2b. Create the target corpus
+        Console.WriteLine("Create the target corpus");
+        const string TargetLanguageCode = "mi";
+        corpusConfig = new CorpusConfig
+        {
+            Name = "Maori Target Corpus",
+            Files = [new CorpusFileConfig { FileId = targetDataFileId, TextId = "TestData" }],
+            Language = TargetLanguageCode,
+        };
+        translationCorpus = await corporaClient.CreateAsync(corpusConfig, cancellationToken);
+        targetCorpusId = translationCorpus.Id;
+
+        // 3. Create the translation engine
+        Console.WriteLine("Create the translation engine");
+        var engineConfig = new TranslationEngineConfig
+        {
+            Name = "Test Engine",
+            SourceLanguage = SourceLanguageCode,
+            TargetLanguage = TargetLanguageCode,
+            Type = "nmt",
+        };
+        TranslationEngine translationEngine = await translationEnginesClient.CreateAsync(
+            engineConfig,
+            cancellationToken
+        );
+        translationEngineId = translationEngine.Id;
+
+        // 4. Create the parallel corpus
+        TranslationParallelCorpus parallelCorpus = await translationEnginesClient.AddParallelCorpusAsync(
+            translationEngineId,
+            new TranslationParallelCorpusConfig
+            {
+                Name = "Test Parallel Corpus",
+                SourceCorpusIds = [sourceCorpusId],
+                TargetCorpusIds = [targetCorpusId],
+            },
+            cancellationToken
+        );
+        parallelCorpusId = parallelCorpus.Id;
+
+        // 5. Start a build
+        Console.WriteLine("Start a build");
+
+        // NOTE: This build is restricted to 20 steps for speed of build
+        // The generated translation will be very, very inaccurate.
+        var options = new JObject { ["max_steps"] = 20 };
+
+        // We will train on one book, and translate two books
+        var translationBuildConfig = new TranslationBuildConfig
+        {
+            Name = "Test Build",
+            Options = options,
+            Pretranslate =
+            [
+                new PretranslateCorpusConfig
+                {
+                    ParallelCorpusId = parallelCorpusId,
+                    SourceFilters =
+                    [
+                        new ParallelCorpusFilterConfig { CorpusId = sourceCorpusId, ScriptureRange = "LAO;MAN" },
+                    ],
+                },
+            ],
+            TrainOn =
+            [
+                new TrainingCorpusConfig
+                {
+                    ParallelCorpusId = parallelCorpusId,
+                    SourceFilters =
+                    [
+                        new ParallelCorpusFilterConfig { CorpusId = sourceCorpusId, ScriptureRange = "PS2" },
+                    ],
+                    TargetFilters =
+                    [
+                        new ParallelCorpusFilterConfig { CorpusId = targetCorpusId, ScriptureRange = "PS2" },
+                    ],
+                },
+            ],
+        };
+        TranslationBuild translationBuild = await translationEnginesClient.StartBuildAsync(
+            translationEngineId,
+            translationBuildConfig,
+            cancellationToken
+        );
+
+        // Poll every 20 seconds until the build finishes, giving up after 30 minutes
+        (int _, int cursorTop) = Console.GetCursorPosition();
+        DateTime timeOut = DateTime.UtcNow.AddMinutes(30);
+        while (translationBuild.DateFinished is null)
+        {
+            if (DateTime.UtcNow >= timeOut)
+            {
+                Console.WriteLine("Timed out waiting for the build to finish");
+                return;
+            }
+
+            Console.SetCursorPosition(0, cursorTop);
+            Console.WriteLine(
+                $"{translationBuild.State}: {(translationBuild.PercentCompleted ?? 0) * 100}% completed... "
+            );
+
+            await Task.Delay(TimeSpan.FromSeconds(20), cancellationToken);
+            translationBuild = await translationEnginesClient.GetBuildAsync(
+                translationEngineId,
+                translationBuild.Id,
+                minRevision: null,
+                cancellationToken
+            );
+        }
+
+        // Display the pre-translation USFM
+        string usfm = await translationEnginesClient.GetPretranslatedUsfmAsync(
+            translationEngineId,
+            parallelCorpusId,
+            textId: "LAO",
+            PretranslationUsfmTextOrigin.OnlyPretranslated,
+            PretranslationUsfmTemplate.Source,
+            cancellationToken
+        );
+        Console.WriteLine(usfm);
+
+        Console.WriteLine("Done!");
+    }
+    catch (OperationCanceledException)
+    {
+        // The process was cancelled via Ctrl+C; fall through to the clean-up below
+    }
+    finally
+    {
+        // Clean up created entities
+        if (!string.IsNullOrWhiteSpace(sourceDataFileId))
+        {
+            Console.WriteLine("Delete the Source Data File");
+            await dataFilesClient.DeleteAsync(sourceDataFileId, CancellationToken.None);
+        }
+
+        if (!string.IsNullOrWhiteSpace(targetDataFileId))
+        {
+            Console.WriteLine("Delete the Target Data File");
+            await dataFilesClient.DeleteAsync(targetDataFileId, CancellationToken.None);
+        }
+
+        if (!string.IsNullOrWhiteSpace(sourceCorpusId))
+        {
+            Console.WriteLine("Delete the Source Corpus");
+            await corporaClient.DeleteAsync(sourceCorpusId, CancellationToken.None);
+        }
+
+        if (!string.IsNullOrWhiteSpace(targetCorpusId))
+        {
+            Console.WriteLine("Delete the Target Corpus");
+            await corporaClient.DeleteAsync(targetCorpusId, CancellationToken.None);
+        }
+
+        if (!string.IsNullOrWhiteSpace(translationEngineId))
+        {
+            if (!string.IsNullOrWhiteSpace(parallelCorpusId))
+            {
+                Console.WriteLine("Delete the Parallel Corpus");
+                await translationEnginesClient.DeleteParallelCorpusAsync(
+                    translationEngineId,
+                    parallelCorpusId,
+                    CancellationToken.None
+                );
+            }
+
+            Console.WriteLine("Cancel the current build");
+            await translationEnginesClient.CancelBuildAsync(translationEngineId, CancellationToken.None);
+
+            Console.WriteLine("Delete the Translation Engine");
+            await translationEnginesClient.DeleteAsync(translationEngineId, CancellationToken.None);
+        }
+    }
+}
diff --git a/samples/ApiExample/README.md b/samples/ApiExample/README.md
new file mode 100644
index 00000000..9e45acac
--- /dev/null
+++ b/samples/ApiExample/README.md
@@ -0,0 +1,24 @@
+# Serval API Example
+
+This example application will generate a pre-translation USFM draft using the Serval API, and display it in the terminal window.
+
+## Pre-Requisites
+
+ * .NET SDK 8.0
+ * You must have a Serval Client ID and Client Secret before running this example.
+
+## Setup
+
+Before running, you must configure your Serval Client ID and Client Secret via `dotnet user-secrets`:
+```
+dotnet user-secrets set "Serval:ClientId" "your_client_id_here"
+dotnet user-secrets set "Serval:ClientSecret" "your_client_secret_here"
+```
+
+## Run
+
+To run this example after configuring your user secrets, execute the following command from a terminal window:
+
+```
+dotnet run
+```
diff --git a/samples/ApiExample/ServalOptions.cs b/samples/ApiExample/ServalOptions.cs
new file mode 100644
index 00000000..3148fc18
--- /dev/null
+++ b/samples/ApiExample/ServalOptions.cs
@@ -0,0 +1,32 @@
+namespace ApiExample;
+
+/// <summary>
+/// The Serval API options, read from the "Serval" section of appsettings.json and dotnet user-secrets.
+/// </summary>
+public record ServalOptions
+{
+ /// <summary>
+ /// Gets the base URL of the Serval API server to use.
+ /// </summary>
+ public string ApiServer { get; init; } = string.Empty;
+
+ /// <summary>
+ /// Gets the audience requested for the JWT.
+ /// </summary>
+ public string Audience { get; init; } = string.Empty;
+
+ /// <summary>
+ /// Gets the client identifier used to request the JWT.
+ /// </summary>
+ public string ClientId { get; init; } = string.Empty;
+
+ /// <summary>
+ /// Gets the client secret used to request the JWT.
+ /// </summary>
+ public string ClientSecret { get; init; } = string.Empty;
+
+ /// <summary>
+ /// Gets the token endpoint URL used to generate the JWT.
+ /// </summary>
+ public string TokenUrl { get; init; } = string.Empty;
+}
diff --git a/samples/ApiExample/appsettings.json b/samples/ApiExample/appsettings.json
new file mode 100644
index 00000000..9bbb173d
--- /dev/null
+++ b/samples/ApiExample/appsettings.json
@@ -0,0 +1,7 @@
+{
+ "Serval": {
+ "ApiServer": "https://qa.serval-api.org",
+ "Audience": "https://serval-api.org/",
+ "TokenUrl": "https://dev-sillsdev.auth0.com/oauth/token"
+ }
+}
diff --git a/samples/ApiExample/data/TEA/84MANTEA.SFM b/samples/ApiExample/data/TEA/84MANTEA.SFM
new file mode 100644
index 00000000..e3a34715
--- /dev/null
+++ b/samples/ApiExample/data/TEA/84MANTEA.SFM
@@ -0,0 +1,66 @@
+\id MAN - Test English Apocrypha
+\h Prayer of Manasseh
+\toc1 Prayer of Manasseh
+\toc2 Prayer of Manasseh
+\toc3 Prayer of Manasseh
+\mt1 Prayer of Manasseh\f + \fr 1.0 \ft Latin adds \fq King of Judah when he was held captive in Babylon\f*
+\imt Introduction
+\ip This prayer for forgiveness purports to be from King Manasseh during his imprisonment (see \xt 2 Chronicles 33:19\xt*), and appears to have been originally written in Greek. It is found in the eighth chapter in the Book of Odes (chapter 12 in Rahlfs’ edition), and is present in the Eastern Orthodox canon.
+\c 1
+\q1
+\v 1 Lord Almighty,\f + \fr 1.1 \fq Almighty \ft Codex Alexandrinus adds \fq in heaven\f*
+\q2 the God of our fathers:\x - \xo 1.1 \xt 2 Chr 33:12\x*
+\q1 of Abraham, and Isaac, and Jacob,\x - \xo 1.1 \xt Ex 3:15, 16; Acts 3:13\x*
+\q2 and of their righteous seed;
+\q1
+\v 2 Who made heaven and the earth, and\f + \fr 1.2 \fq and \ft Greek \fq with\f* all the universe\f + \fr 1.2 \fq universe \ft Or \fqa adornment\fqa*. Greek \fq cosmos\fq*\f* within;
+\q1
+\v 3 Who bound the sea by the word of your command,\x - \xo 1.3 \xt Job 33:8-11; Ps 74:12\x*
+\q2 who closed the abyss and sealed it by your terrible and glorious name.
+\q1
+\v 4 Who all things shudder and tremble before, because of your power;
+\q1
+\v 5 For your majesty and glory is unbearable,
+\q1 and the anger of your threat towards sinners is unendurable;
+\q1
+\v 6 Both immeasurable and unsearchable is the mercy of your promise;\x - \xo 1.6 \xt Rom 11:33\x*
+\q1
+\v 7 For you are the Lord Most High,
+\q2 tender-hearted, longsuffering, abounding in mercy,\x - \xo 1.7 \xt Ex 34:6; Ps 86:15; Joel 2:13\x*
+\q3 and you repent at the time of man’s trouble.\f + \fr 1.7 \ft Latin adds \fq Lord, according to your great goodness, you have promised repentance and forgiveness to those that have sinned against you, and in your infinite mercy have appointed repentance for sinners, so that they may be saved.\f*
+\q1
+\v 8 Therefore you, Lord, the God of the righteous,
+\q2 has not made repentance for the righteous,\x - \xo 1.8 \xt Lk 5:32\x*
+\q1 for Abraham, and Isaac, and Jacob did not sin against you,
+\q2 but you made repentance for me, a sinner.
+\q1
+\v 9 Therefore my sins number more than the sand of the sea,
+\q2 \f + \fr 1.9 \ft Codex Alexandrinus adds \fq For\f*my transgressions are multiplied, Lord, \add they\add*\f + \fr 1.9 \ft Latin reads \fq my transgressions\f* are multiplied,\f + \fr 1.9 \fq Lord, they are multiplied, \ft Codex Alexandrinus omits.\f*\x - \xo 1.9 \xt Is 59:12 \x*
+\q1 and I am not worthy to look upon and see the height of heaven,
+\q2 because of the multitude of my iniquities.\f + \fr 1.9 \ft Latin adds \fq Lord I now suffer justly, I deserve the trouble I receive, I am caught in a trap.\f*\x - \xo 1.9 \xt Ezra 9:6\x*
+\q1
+\v 10 I am bowed down by many iron chains,\x - \xo 1.10 \xt 2 Chr 33:11\x*
+\q2 I am rejected because of my sins,\f + \fr 1.10 \fq I am rejected because of my sins, \ft Latin reads \fq so that I cannot lift up my head,\f*
+\q3 and I can find\f + \fr 1.10 \fq can find \ft Greek \fqa have\f* no rest;
+\q1 Therefore I have kindled your anger,
+\q2 I have done evil before you,\f + \fr 1.10 \ft Latin adds \fq I did not your will\f*
+\q3 setting up abominations and abominable things.\f + \fr 1.10 \fq abominable things. \ft Greek \fqa objects of anger\fqa*. This word is often translated abominations (see \xt 2 Kings 23:13\xt*)\f*\x - \xo 1.10 \xt 2 Ki 21:2-9; 2 Chr 33:2-9\x*
+\q1
+\v 11 And now I bend the knee of my heart, to pray to you for your kindness,\x - \xo 1.11 \xt Sir 17:25\x*
+\q1
+\v 12 I have sinned, Lord, I have sinned,
+\q2 and I acknowledge my transgressions.\f + \fr 1.12 \ft Ps 51:3\f*
+\q1
+\v 13 I ask you in prayer,
+\q2 forgive me, Lord, forgive me,
+\q1 do not destroy me for my transgressions,
+\q2 neither stay angry with me forever, storing up evil for me,
+\q3 and do not\f + \fr 1.13 \fq and do not \ft Greek \fqa neither\f* condemn me to the depths of the earth.\x - \xo 1.13 \xt Ps 63:9; Ps 88:6\x*
+\q1 For you are, Lord,\f + \fr 1.13 \fq Lord \ft Latin reads \fq God\f* the God of those who repent;
+\q2
+\v 14 And to me you will show your goodness.
+\q1 For \add though I am\add* unworthy, \add you will\add* save me according to your abounding mercy.
+\q2
+\v 15 And I will praise you for all of the days of my life.
+\q1 For all of the host of heaven sing your praise,\x - \xo 1.15 \xt Ps 103:21; S3Y 39\x*
+\q2 and yours is the glory forever.\f + \fr 1.15 \fq forever \ft Latin reads \fq forever and ever\f* Amen.\x - \xo 1.15 \xt Rom 11:36; 16:7\x*
diff --git a/samples/ApiExample/data/TEA/85PS2TEA.SFM b/samples/ApiExample/data/TEA/85PS2TEA.SFM
new file mode 100644
index 00000000..fed19599
--- /dev/null
+++ b/samples/ApiExample/data/TEA/85PS2TEA.SFM
@@ -0,0 +1,32 @@
+\id PS2 - Test English Apocrypha
+\h Psalm 151
+\toc1 Psalm 151
+\toc2 Psalm 151
+\toc3 Psalm 151
+\mt1 Psalm 151
+\imt Introduction
+\ip Psalm 151 is included in some Septuagint manuscripts, and is present in the Dead Sea Scrolls (4QPs\sup a\sup*) in both Hebrew (151A) and Syriac (151B). The following is a translation of the version found in the Septuagint.
+\c 1
+\cp 151
+\d This psalm is written by David in his own hand (although it is outside the number), after he had fought one-on-one with Goliath.\f + \fr 1.1 \fq Goliath \ft Greek \fq Goliad\f*
+\q1
+\v 1 Smallest among my brothers, and the youngest in my father’s house;
+\q2 I shepherded my father’s sheep.\x - \xo 1.1 \xt 1 Sam 16:11\x*
+\q1
+\v 2 My hands made a harp;
+\q2 my fingers fashioned a lyre.\x - \xo 1.2 \xt 1 Sam 16:23\x*
+\q1
+\v 3 And who will report to my Lord?
+\q2 The Lord himself, he hears.\f + \fr 1.3 \fq hears \ft Codex Sinaiticus: \fqa hears everything.\fqa*; Codex Alexandrinus: \fqa who will hear me. \f*
+\q1
+\v 4 He sent his messenger\f + \fr 1.4 \fq messenger \ft Or \fqa angel\f* \add to me\add*, took me from my father’s sheep,
+\q2 and anointed me with olive oil.\x - \xo 1.4 \xt 1 Sam 16:13\x*
+\q1
+\v 5 My brothers were handsome and great \add indeed\add*,
+\q2 but with them the Lord was not pleased.\x - \xo 1.5 \xt 1 Sam 16:10\x*
+\q1
+\v 6 I came out to meet the foreigner,
+\q2 and he cursed me by his idols.\x - \xo 1.6 \xt 1 Sam 17:43\x*
+\q1
+\v 7 But I drew his own sword, beheaded him,\x - \xo 1.7 \xt 1 Sam 17:51\x*
+\q2 and took away disgrace from Israel’s sons.
diff --git a/samples/ApiExample/data/TEA/BookNames.xml b/samples/ApiExample/data/TEA/BookNames.xml
new file mode 100644
index 00000000..833a316b
--- /dev/null
+++ b/samples/ApiExample/data/TEA/BookNames.xml
@@ -0,0 +1,126 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/samples/ApiExample/data/TEA/C3LAOTEA.SFM b/samples/ApiExample/data/TEA/C3LAOTEA.SFM
new file mode 100644
index 00000000..f5209310
--- /dev/null
+++ b/samples/ApiExample/data/TEA/C3LAOTEA.SFM
@@ -0,0 +1,37 @@
+\id LAO - Test English Apocrypha
+\h Laodiceans
+\toc1 Laodiceans
+\toc2 Laodiceans
+\toc3 Laodiceans
+\mt1 Epistle to the Laodiceans
+\imt Introduction
+\ip The following is a translation of J.B. Lightfoot’s reverse translation of the surviving Latin translation of the Epistle to the Laodiceans into Koine Greek. This translation, published in his commentary on Colossians and Philemon (new edition, 1879), is based on the premise that the original epistle is a composition of quotations from the Pauline Epistles, compiled by an unknown author, purporting to be a letter from Paul to the church at Laodicea.
+\c 1
+\po
+\v 1 Paul, an apostle—not from men nor through man, but through Jesus Christ,\x - \xo 1.1 \xt Gal 1:1\x* to the brothers who are in Laodicea.\x - \xo 1.1 \xt Col 4:16\x*
+\v 2 Grace to you and peace from God the\f + \fr 1.2 \fq the \ft Some manuscripts \fq our\f* Father and the Lord Jesus Christ.\x - \xo 1.2 \xt Gal 1:3; Phil 1:2 \x*
+\p
+\v 3 I give thanks to Christ in all my prayers,\x - \xo 1.3 \xt Phil 1:3\x* that you are continuing in him and persevering in his works, eagerly awaiting the promise \add of salvation\add*\x - \xo 1.3 \xt Gal 5:5\x* in the day of judgment.\x - \xo 1.3 \xt 2 Pet 2:9; 3:7; cf. Phil 2:16\x*
+\p
+\v 4 Neither do the vain discussions of certain men\x - \xo 1.4 \xt 1 Tim 1:6\x* deceive you, with their aim to turn you away\x - \xo 1.4 \xt 2 Tim 4:4\x* from the truth of the gospel\x - \xo 1.4 \xt Col 1:5; Gal 2:5, 14\x* which is preached by me.\x - \xo 1.4 \xt Gal 1:11 (cf. Gal 1:8)\x*
+\v 5 So\f + \fr 1.5 \fq So \ft Greek: \fqa And \f* now God will work in those who are \add imitators\add*\x - \xo 1.5 \xt 1 Thes 2:14\x* of me\f + \fr 1.5 \fq imitators of me \ft Greek \fqa of mine\f* to advance the truth of the gospel,\x - \xo 1.5 \xt Phil 1:12\x* […]\f + \fr 1.5 \fq […] \ft A section appears to be missing, according to J.B. Lightfoot. \f* worshipping and practicing generosity—works of salvation [and]\f + \fr 1.5 \fq [and] \ft It is doubtful that this word was in the original Greek.\f* of eternal life.
+\v 6 And now my imprisonment\f + \fr 1.6 \fq imprisonment \ft Greek \fqa chains\f* is widely known, which I suffer in Christ, in which I rejoice and am glad.\x - \xo 1.6 \xt Matt 5:12; cf. Phil 1:18\x*
+\v 7 And this is for my eternal salvation, which will occur through your prayers, and the help of the Holy Spirit,\x - \xo 1.7 \xt Phil 1:19\x* whether by life or by death.\x - \xo 1.7 \xt Phil 1:20\x*
+\v 8 For to me, to live is Christ, and to die is joy.\x - \xo 1.8 \xt Phil 1:21\x*
+\v 9 And so he will work in you according to his mercy, that you may have the same love, and be in full accord.\x - \xo 1.9 \xt Phil 2:2\x*
+\v 10 Therefore beloved, as you have obeyed in my presence,\x - \xo 1.10 \xt Phil 2:12\x* so work, remembering\x - \xo 1.10 \xt 2 Thes 2:5 (Vulgate)\x* the fear of God,\f + \fr 1.10 \fq God \ft J.B. Lightfoot’s Greek text has \fqa Lord\fqa*, but this is not present in any Latin manuscripts.\f* and it will be to you eternal life,\f + \fr 1.10 \fq life, \ft The Latin and Greek text end the sentence here.\f*
+\v 11 for it is God who works in you.\x - \xo 1.11 \xt Phil 2:13\x*
+\v 12 And do without grumbling,\x - \xo 1.12 \xt Phil 2:14\x* whatever you do.\x - \xo 1.12 \xt Col 3:17\x*
+\p
+\v 13 And finally, beloved, rejoice in Christ.\x - \xo 1.13 \xt Phil 3:1\x* Look out for those \add who are\add* greedy for dishonest gain.\x - \xo 1.13 \xt 1 Tim 3:8; Tit 1:7\x*
+\v 14 Let all your requests be made known to God,\x - \xo 1.14 \xt Phil 4:6\x* and be steadfast\x - \xo 1.14 \xt 1 Cor 15:58\x* in the mind of Christ.\x - \xo 1.14 \xt 1 Cor 2:16\x*
+\v 15 Whatever is sound, and true, and honourable, and just,\f + \fr 1.15 \ft Some manuscripts add \fq and pure\f* and lovely,\x - \xo 1.15 \xt Phil 4:8\x* practice these things.\x - \xo 1.15 \xt Phil 4:9\x*
+\v 16 And what you have heard and received, hold in your heart, and peace will be with you.
+\p
+\v 17 [Greet the brothers.\x - \xo 1.17 \xt 1 Thes 5:26\x*]\f + \fr 1.17 \ft Most manuscripts omit verse 17.\f*
+\p
+\v 18 The saints greet you.\f + \fr 1.18 \ft One manuscript omits this verse.\f*\x - \xo 1.18 \xt Phil 4:22\x*
+\p
+\v 19 The grace of the Lord Jesus Christ\f + \fr 1.19 \ft Some manuscripts omit \fq Christ\f* be with your spirit.\x - \xo 1.19 \xt Phil 4:23\x*
+\p
+\v 20 And have this \add letter\add* read to the Colossians, and that of the Colossians to you.\f + \fr 1.20 \ft One manuscript adds \fq Amen.\fq*, another manuscript omits this verse.\f*\x - \xo 1.20 \xt Col 4:16\x*
diff --git a/samples/ApiExample/data/TEA/CommentTags.xml b/samples/ApiExample/data/TEA/CommentTags.xml
new file mode 100644
index 00000000..624f1523
--- /dev/null
+++ b/samples/ApiExample/data/TEA/CommentTags.xml
@@ -0,0 +1,5 @@
+
+
+
+ 1
+
\ No newline at end of file
diff --git a/samples/ApiExample/data/TEA/ProjectProgress.xml b/samples/ApiExample/data/TEA/ProjectProgress.xml
new file mode 100644
index 00000000..bd16524a
--- /dev/null
+++ b/samples/ApiExample/data/TEA/ProjectProgress.xml
@@ -0,0 +1,20 @@
+
+
+
+ None
+
+ 000000000000000000000000000000000000000000000000000000000000000000000000000000000011000000000000000000000000000000000000001
+
+
+ 000001111111110010000000000000010000000000000000000000000000000000111001111111001010100000000000000000000000000000000000000
+
+
+ 110110000000001100000000000000000000000111010000000001111010001111000000000000110101000000000000000000000000000111111111111
+
+
+ 001000000000000000111100001000000000101000100110000110000001110000000110000000000000000000000000000000000000000000000000000
+
+
+ 000000000000000001000011110111101111010000001001111000000100000000000000000000000000010000000000000000011100000000000000000
+
+
\ No newline at end of file
diff --git a/samples/ApiExample/data/TEA/ProjectUpdates.xml b/samples/ApiExample/data/TEA/ProjectUpdates.xml
new file mode 100644
index 00000000..0bbf0e6e
--- /dev/null
+++ b/samples/ApiExample/data/TEA/ProjectUpdates.xml
@@ -0,0 +1,7 @@
+
+
+ 1FE40EDA-1D82-4ED8-95D1-5F44B8EC25CD
+ 207EF1E9-D931-41A0-920D-96BAEF744746
+ 5C974ECE-A444-4E5A-B980-125E3CDEE7E2
+ B946EEE7-B890-47FA-BBEF-8D0E6F729F82
+
\ No newline at end of file
diff --git a/samples/ApiExample/data/TEA/Settings.xml b/samples/ApiExample/data/TEA/Settings.xml
new file mode 100644
index 00000000..43bbbf3d
--- /dev/null
+++ b/samples/ApiExample/data/TEA/Settings.xml
@@ -0,0 +1,32 @@
+
+ usfm.sty
+ 4
+ English
+ 8.0.100.76
+ Test English Apocrypha
+ 65001
+ T
+
+ NFC
+ TEA
+ a7e9f1c362e728a143bb5eef7f6c79bcab2478fa
+ Charis SIL
+ 12
+
+
+ en:::
+ 41MAT
+
+ TEA.SFM
+ Major::BiblicalTerms.xml
+ F
+ F
+ F
+ Public
+ Standard::
+
+ 3
+ 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+ 000000000000000000000000000000000000000000000000000000000000000000000000000000000011000000000000000000000000000000000000001
+
+
\ No newline at end of file
diff --git a/samples/ApiExample/data/TEA/en.ldml b/samples/ApiExample/data/TEA/en.ldml
new file mode 100644
index 00000000..87c6fb5a
--- /dev/null
+++ b/samples/ApiExample/data/TEA/en.ldml
@@ -0,0 +1,26 @@
+[A-Za-z][!'-),-.\:;?\[\]\u00B4\u200C\u200D\u2014\u2018\u2019\u201C\u201D]['\-\u00B4\u2014][][][a b c d e f g h i j k l m n o p q r s t u v w x y z {aa} {bb} {cc} {dd} {ee} {ff} {gg} {hh} {ii} {jj} {kk} {ll} {mm} {nn} {oo} {pp} {qq} {rr} {ss} {tt} {uu} {vv} {ww} {xx} {yy} {zz}][][]“”‘’left-to-rightstandard
\ No newline at end of file
diff --git a/samples/ApiExample/data/TEA/unique.id b/samples/ApiExample/data/TEA/unique.id
new file mode 100644
index 00000000..66104d45
--- /dev/null
+++ b/samples/ApiExample/data/TEA/unique.id
@@ -0,0 +1 @@
+ed450f1c-1d1f-4ef1-87ac-a6b1d3b4735b
\ No newline at end of file
diff --git a/samples/ApiExample/data/TMA/84MANTMA.SFM b/samples/ApiExample/data/TMA/84MANTMA.SFM
new file mode 100644
index 00000000..ce7aa080
--- /dev/null
+++ b/samples/ApiExample/data/TMA/84MANTMA.SFM
@@ -0,0 +1,48 @@
+\id MAN - Test Maori Apocrypha
+\h
+\mt1
+\imt
+\ip
+\c 1
+\q1 \v 1
+\q2
+\q1
+\q2
+\q1 \v 2
+\q1 \v 3
+\q2
+\q1 \v 4
+\q1 \v 5
+\q1
+\q1 \v 6
+\q1 \v 7
+\q2
+\q3
+\q1 \v 8
+\q2
+\q1
+\q2
+\q1 \v 9
+\q2
+\q1
+\q2
+\q1 \v 10
+\q2
+\q3
+\q1
+\q2
+\q3
+\q1 \v 11
+\q1 \v 12
+\q2
+\q1 \v 13
+\q2
+\q1
+\q2
+\q3
+\q1
+\q2 \v 14
+\q1
+\q2 \v 15
+\q1
+\q2
diff --git a/samples/ApiExample/data/TMA/85PS2TMA.SFM b/samples/ApiExample/data/TMA/85PS2TMA.SFM
new file mode 100644
index 00000000..1a1922d6
--- /dev/null
+++ b/samples/ApiExample/data/TMA/85PS2TMA.SFM
@@ -0,0 +1,32 @@
+\id PS2 - Test Māori Apocrypha
+\h NGA WAIATA 151
+\toc1 Ko Nga Waiata 151
+\toc2 Nga Waiata 151
+\toc3 Waiata 151
+\mt1 NGA WAIATA 151
+\imt Te Tīmatanga Kōrero
+\ip
+\c 1
+\cp 151
+\d Na Rawiri i tuhituhi tenei waiata ki tona ringa ake (ahakoa kei waho i te tatau), i muri i tana whawhai kotahi ki a Golia.
+\q1
+\v 1 He i iti ahau waenga i oku tuākana, me te pōtiki i te whare o āku papa;
+\q2 I tiaki ahau i nga hipi a toku papa.
+\q1
+\v 2 I hanga e oku ringa te hapa;
+\q2 i hanga e oku maihao he kutā.
+\q1
+\v 3 A ma wai e korero ki toku Ariki?
+\q2 Ko te Ariki tonu, e rongo ana ia.
+\q1
+\v 4 I tono mai ia i tana karere ki ahau, ka tango mai i ahau i roto i nga hipi a toku papa,
+\q2 a pania ana ahau e ia ki te hinu.
+\q1
+\v 5 He ataahua, he nunui rawa oku teina;
+\q2 otiia kihai te Ariki i ahuareka ki a ratou.
+\q1
+\v 6 I haere mai ahau kia whakatau i te tangata iwi ke,
+\q2 a kanga iho ahau e ia ki ana whakapakoko.
+\q1
+\v 7 Na unuhia ana e ahau tana hoari, tapahia ana tona matenga e ahau,
+\q2 a ka tangohia e ahau te tawai o nga tama a Iharaira.
diff --git a/samples/ApiExample/data/TMA/BookNames.xml b/samples/ApiExample/data/TMA/BookNames.xml
new file mode 100644
index 00000000..833a316b
--- /dev/null
+++ b/samples/ApiExample/data/TMA/BookNames.xml
@@ -0,0 +1,126 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/samples/ApiExample/data/TMA/C3LAOTMA.SFM b/samples/ApiExample/data/TMA/C3LAOTMA.SFM
new file mode 100644
index 00000000..9459c187
--- /dev/null
+++ b/samples/ApiExample/data/TMA/C3LAOTMA.SFM
@@ -0,0 +1,14 @@
+\id LAO - Test Maori Apocrypha
+\h
+\mt1
+\imt
+\ip
+\c 1
+\po \v 1 \v 2
+\p \v 3
+\p \v 4 \v 5 \v 6 \v 7 \v 8 \v 9 \v 10 \v 11 \v 12
+\p \v 13 \v 14 \v 15 \v 16
+\p \v 17
+\p \v 18
+\p \v 19
+\p \v 20
diff --git a/samples/ApiExample/data/TMA/CommentTags.xml b/samples/ApiExample/data/TMA/CommentTags.xml
new file mode 100644
index 00000000..624f1523
--- /dev/null
+++ b/samples/ApiExample/data/TMA/CommentTags.xml
@@ -0,0 +1,5 @@
+
+
+
+ 1
+
\ No newline at end of file
diff --git a/samples/ApiExample/data/TMA/ProjectProgress.xml b/samples/ApiExample/data/TMA/ProjectProgress.xml
new file mode 100644
index 00000000..bd16524a
--- /dev/null
+++ b/samples/ApiExample/data/TMA/ProjectProgress.xml
@@ -0,0 +1,20 @@
+
+
+
+ None
+
+ 000000000000000000000000000000000000000000000000000000000000000000000000000000000011000000000000000000000000000000000000001
+
+
+ 000001111111110010000000000000010000000000000000000000000000000000111001111111001010100000000000000000000000000000000000000
+
+
+ 110110000000001100000000000000000000000111010000000001111010001111000000000000110101000000000000000000000000000111111111111
+
+
+ 001000000000000000111100001000000000101000100110000110000001110000000110000000000000000000000000000000000000000000000000000
+
+
+ 000000000000000001000011110111101111010000001001111000000100000000000000000000000000010000000000000000011100000000000000000
+
+
\ No newline at end of file
diff --git a/samples/ApiExample/data/TMA/Settings.xml b/samples/ApiExample/data/TMA/Settings.xml
new file mode 100644
index 00000000..a970e88e
--- /dev/null
+++ b/samples/ApiExample/data/TMA/Settings.xml
@@ -0,0 +1,31 @@
+
+ usfm.sty
+ Maori
+ 8.0.100.76
+ Test Maori Apocrypha
+ 65001
+ T
+
+ NFC
+ TMA
+ e1b3f0c799c4378a1757dd1b382c1dd515af37db
+ Charis SIL
+ 12
+
+
+ mi:::
+ 41MAT
+
+ TMA.SFM
+ Major::BiblicalTerms.xml
+ F
+ F
+ F
+ Public
+ Daughter:TEA:a7e9f1c362e728a143bb5eef7f6c79bcab2478fa
+
+ 3
+ 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+ 000000000000000000000000000000000000000000000000000000000000000000000000000000000011000000000000000000000000000000000000001
+
+
\ No newline at end of file
diff --git a/samples/ApiExample/data/TMA/mi.ldml b/samples/ApiExample/data/TMA/mi.ldml
new file mode 100644
index 00000000..aa095e0e
--- /dev/null
+++ b/samples/ApiExample/data/TMA/mi.ldml
@@ -0,0 +1,15 @@
+[AEHIKM-PRTUWaehikm-prtuw\u0100\u0101\u0112\u0113\u012A\u012B\u014C\u014D\u016A\u016B{ng}{wh}][!(-*,-.\:;?\u00B6\u200C\u200D\u2010\u2014][*\-][][a e h i k m n {ng} o p r t u w {wh}][a e h i k m n {ng} o p r t u w {wh}][][]“”“left-to-rightstandard
\ No newline at end of file
diff --git a/samples/ApiExample/data/TMA/unique.id b/samples/ApiExample/data/TMA/unique.id
new file mode 100644
index 00000000..d3b98c55
--- /dev/null
+++ b/samples/ApiExample/data/TMA/unique.id
@@ -0,0 +1 @@
+f2ca92e1-0778-4424-9096-a1e64feb6123
\ No newline at end of file
diff --git a/samples/ServalApp/poetry.lock b/samples/ServalApp/poetry.lock
index a0d60480..13a1ea86 100644
--- a/samples/ServalApp/poetry.lock
+++ b/samples/ServalApp/poetry.lock
@@ -706,8 +706,8 @@ files = [
[package.dependencies]
numpy = [
{version = ">=1.20.3", markers = "python_version < \"3.10\""},
- {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.2", markers = "python_version >= \"3.11\""},
+ {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@@ -1367,22 +1367,22 @@ files = [
[[package]]
name = "tornado"
-version = "6.4"
+version = "6.4.2"
description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed."
optional = false
-python-versions = ">= 3.8"
+python-versions = ">=3.8"
files = [
- {file = "tornado-6.4-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:02ccefc7d8211e5a7f9e8bc3f9e5b0ad6262ba2fbb683a6443ecc804e5224ce0"},
- {file = "tornado-6.4-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:27787de946a9cffd63ce5814c33f734c627a87072ec7eed71f7fc4417bb16263"},
- {file = "tornado-6.4-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7894c581ecdcf91666a0912f18ce5e757213999e183ebfc2c3fdbf4d5bd764e"},
- {file = "tornado-6.4-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e43bc2e5370a6a8e413e1e1cd0c91bedc5bd62a74a532371042a18ef19e10579"},
- {file = "tornado-6.4-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0251554cdd50b4b44362f73ad5ba7126fc5b2c2895cc62b14a1c2d7ea32f212"},
- {file = "tornado-6.4-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:fd03192e287fbd0899dd8f81c6fb9cbbc69194d2074b38f384cb6fa72b80e9c2"},
- {file = "tornado-6.4-cp38-abi3-musllinux_1_1_i686.whl", hash = "sha256:88b84956273fbd73420e6d4b8d5ccbe913c65d31351b4c004ae362eba06e1f78"},
- {file = "tornado-6.4-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:71ddfc23a0e03ef2df1c1397d859868d158c8276a0603b96cf86892bff58149f"},
- {file = "tornado-6.4-cp38-abi3-win32.whl", hash = "sha256:6f8a6c77900f5ae93d8b4ae1196472d0ccc2775cc1dfdc9e7727889145c45052"},
- {file = "tornado-6.4-cp38-abi3-win_amd64.whl", hash = "sha256:10aeaa8006333433da48dec9fe417877f8bcc21f48dda8d661ae79da357b2a63"},
- {file = "tornado-6.4.tar.gz", hash = "sha256:72291fa6e6bc84e626589f1c29d90a5a6d593ef5ae68052ee2ef000dfd273dee"},
+ {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e828cce1123e9e44ae2a50a9de3055497ab1d0aeb440c5ac23064d9e44880da1"},
+ {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:072ce12ada169c5b00b7d92a99ba089447ccc993ea2143c9ede887e0937aa803"},
+ {file = "tornado-6.4.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a017d239bd1bb0919f72af256a970624241f070496635784d9bf0db640d3fec"},
+ {file = "tornado-6.4.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c36e62ce8f63409301537222faffcef7dfc5284f27eec227389f2ad11b09d946"},
+ {file = "tornado-6.4.2-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca9eb02196e789c9cb5c3c7c0f04fb447dc2adffd95265b2c7223a8a615ccbf"},
+ {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:304463bd0772442ff4d0f5149c6f1c2135a1fae045adf070821c6cdc76980634"},
+ {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:c82c46813ba483a385ab2a99caeaedf92585a1f90defb5693351fa7e4ea0bf73"},
+ {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:932d195ca9015956fa502c6b56af9eb06106140d844a335590c1ec7f5277d10c"},
+ {file = "tornado-6.4.2-cp38-abi3-win32.whl", hash = "sha256:2876cef82e6c5978fde1e0d5b1f919d756968d5b4282418f3146b79b58556482"},
+ {file = "tornado-6.4.2-cp38-abi3-win_amd64.whl", hash = "sha256:908b71bf3ff37d81073356a5fadcc660eb10c1476ee6e2725588626ce7e5ca38"},
+ {file = "tornado-6.4.2.tar.gz", hash = "sha256:92bad5b4746e9879fd7bf1eb21dce4e3fc5128d71601f80005afa39237ad620b"},
]
[[package]]
@@ -1523,4 +1523,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<3.9.7 || >3.9.7,<4.0"
-content-hash = "1a59c67f2dcec9f413c7918e000e267400866f2e15a5f09767f0c506f0bd9352"
\ No newline at end of file
+content-hash = "8c024ad81f66beff9f4cccfdf65629b8d9d87bf49ce3d5774a4d8ad35663be5d"
diff --git a/samples/ServalApp/pyproject.toml b/samples/ServalApp/pyproject.toml
index ba86a555..85ea229e 100644
--- a/samples/ServalApp/pyproject.toml
+++ b/samples/ServalApp/pyproject.toml
@@ -11,6 +11,7 @@ streamlit = "^1.31.1"
requests = "^2.31.0"
SQLAlchemy = "^2.0.22"
pyarrow = "^14.0.1"
+tornado = "^6.4.2"
[tool.poetry.group.dev.dependencies]
black = "^23.10.1"
diff --git a/scripts/clearml_stats.py b/scripts/clearml_stats.py
index c20c33dc..cb60196f 100644
--- a/scripts/clearml_stats.py
+++ b/scripts/clearml_stats.py
@@ -2,7 +2,7 @@
import json
import os
import pickle
-from datetime import datetime, timezone
+from datetime import datetime
import numpy as np
import pandas as pd
@@ -47,6 +47,13 @@ class clearml_stats:
def __init__(self):
self._client: APIClient = APIClient()
self._tasks: dict[str, dict] = self._read_tasks()
+ self._project_id_to_task_id: dict[str, list[str]] = {}
+ for task_id in self._tasks.keys():
+ project_id = self._tasks[task_id]["project"]
+ if project_id in self._project_id_to_task_id:
+ self._project_id_to_task_id[project_id].append(task_id)
+ else:
+ self._project_id_to_task_id[project_id] = [task_id]
self._projects: dict[str, dict] = self._read_projects()
self._languages: pd.DataFrame = pd.read_excel(
language_database_filename, index_col=0
@@ -306,24 +313,14 @@ def add_lang(lang):
else:
langs_by_occurrence[lang] = 1
- num_of_tasks_found = 0
- num_of_tasks_not_found = 0
for project_id in self._projects:
self._projects[project_id]["src_lang"] = "unknown"
self._projects[project_id]["trg_lang"] = "unknown"
self._projects[project_id]["lang_candidates"] = []
project = self._projects[project_id]
- if len(project["tasks"]) > 0:
- task_not_found = True
- for task_id in project["tasks"]:
- if task_id in self._tasks.keys():
- task_not_found = False
- break
- if task_not_found:
- num_of_tasks_not_found += 1
- continue
- num_of_tasks_found += 1
+ if project_id in self._project_id_to_task_id:
+ project["tasks"] = self._project_id_to_task_id[project_id]
task = self._tasks[project["tasks"][0]]
args = task["script_args"]
if "src_lang" in args and "trg_lang" in args:
@@ -491,3 +488,6 @@ def violin_task_delay_time_per_week(
axes.set_ylim(0, 8)
axes.set_ylabel("hours")
axes.grid(True)
+
+
+# %%
diff --git a/src/Echo/src/EchoTranslationEngine/Program.cs b/src/Echo/src/EchoTranslationEngine/Program.cs
index 6c6f3768..352c536a 100644
--- a/src/Echo/src/EchoTranslationEngine/Program.cs
+++ b/src/Echo/src/EchoTranslationEngine/Program.cs
@@ -10,6 +10,8 @@
builder.Services.AddHostedService();
builder.Services.AddSingleton();
+builder.Services.AddParallelCorpusPreprocessor();
+
builder.Services.AddHealthChecks().AddCheck("Live", () => HealthCheckResult.Healthy());
builder.Services.Configure(builder.Configuration.GetSection("Bugsnag"));
@@ -17,9 +19,6 @@
WebApplication app = builder.Build();
-// Configure the HTTP request pipeline.
-app.UseHttpsRedirection();
-
app.MapGrpcService();
app.MapGrpcService();
diff --git a/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs b/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs
index 67779bc0..fb7abc66 100644
--- a/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs
+++ b/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs
@@ -1,10 +1,16 @@
namespace EchoTranslationEngine;
-public class TranslationEngineServiceV1(BackgroundTaskQueue taskQueue) : TranslationEngineApi.TranslationEngineApiBase
+public class TranslationEngineServiceV1(
+ BackgroundTaskQueue taskQueue,
+ IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
+) : TranslationEngineApi.TranslationEngineApiBase
{
private static readonly Empty Empty = new();
private readonly BackgroundTaskQueue _taskQueue = taskQueue;
+ private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService =
+ parallelCorpusPreprocessingService;
+
public override Task Create(CreateRequest request, ServerCallContext context)
{
if (request.SourceLanguage != request.TargetLanguage)
@@ -75,151 +81,34 @@ await client.BuildStartedAsync(
try
{
+ List pretranslationsRequests = [];
+ _parallelCorpusPreprocessingService.Preprocess(
+ request.Corpora.Select(Map).ToList(),
+ row => { },
+ (row, corpus) =>
+ {
+ pretranslationsRequests.Add(
+ new InsertPretranslationsRequest
+ {
+ EngineId = request.EngineId,
+ CorpusId = corpus.Id,
+ TextId = row.TextId,
+ Refs = { row.Refs.Select(r => r.ToString()) },
+ Translation = row.SourceSegment
+ }
+ );
+ },
+ false
+ );
using (
AsyncClientStreamingCall call =
client.InsertPretranslations(cancellationToken: cancellationToken)
)
{
- foreach (ParallelCorpus corpus in request.Corpora)
+ foreach (InsertPretranslationsRequest request in pretranslationsRequests)
{
- var sourceFiles = corpus
- .SourceCorpora.SelectMany(sc =>
- sc.Files.Where(f =>
- (sc.PretranslateTextIds is null || sc.PretranslateTextIds.Contains(f.TextId))
- && f.Format == FileFormat.Text
- )
- )
- .ToDictionary(f => f.TextId, f => f.Location);
- var targetFiles = corpus
- .TargetCorpora.SelectMany(tc =>
- tc.Files.Where(f =>
- (tc.PretranslateTextIds is null || tc.PretranslateTextIds.Contains(f.TextId))
- && f.Format == FileFormat.Text
- )
- )
- .ToDictionary(f => f.TextId, f => f.Location);
-
- foreach (KeyValuePair sourceFile in sourceFiles)
- {
- string[] sourceLines = await File.ReadAllLinesAsync(
- sourceFile.Value,
- cancellationToken
- );
-
- if (targetFiles.TryGetValue(sourceFile.Key, out string? targetPath))
- {
- string[] targetLines = await File.ReadAllLinesAsync(targetPath, cancellationToken);
- bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/');
- if (!isTabSeparated)
- {
- int lineNum = 1;
- foreach (
- (string sourceLine, string targetLine) in sourceLines
- .Select(l => l.Trim())
- .Zip(targetLines.Select(l => l.Trim()))
- )
- {
- if (sourceLine.Length > 0 && targetLine.Length == 0)
- {
- await call.RequestStream.WriteAsync(
- new InsertPretranslationsRequest
- {
- EngineId = request.EngineId,
- CorpusId = corpus.Id,
- TextId = sourceFile.Key,
- Refs = { $"{sourceFile.Key}:{lineNum}" },
- Translation = sourceLine
- },
- cancellationToken
- );
- }
- lineNum++;
- }
- }
- else
- {
- var sourceLinesDict = sourceLines.ToDictionary(
- l => l.Split('\t')[0].Trim(),
- l => l.Split('\t')[1].Trim()
- );
- var targetLinesDict = targetLines.ToDictionary(
- l => l.Split('\t')[0].Trim(),
- l => l.Contains('\t') ? l.Split('\t')[1].Trim() : string.Empty
- );
- foreach (KeyValuePair targetLineKVPair in targetLinesDict)
- {
- string? sourceLine = null;
- sourceLinesDict.TryGetValue(targetLineKVPair.Key, out sourceLine);
- sourceLine ??= string.Empty;
- string? targetLine = targetLineKVPair.Value;
- if (sourceLine.Length > 0 && targetLine.Length == 0)
- {
- await call.RequestStream.WriteAsync(
- new InsertPretranslationsRequest
- {
- EngineId = request.EngineId,
- CorpusId = corpus.Id,
- TextId = sourceFile.Key,
- Refs = { $"{sourceFile.Key}:{targetLineKVPair.Key}" },
- Translation = sourceLine
- },
- cancellationToken
- );
- }
- }
- }
- }
- else
- {
- bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/');
- if (!isTabSeparated)
- {
- int lineNum = 1;
- foreach (string sourceLine in sourceLines.Select(l => l.Trim()))
- {
- if (sourceLine.Length > 0)
- {
- await call.RequestStream.WriteAsync(
- new InsertPretranslationsRequest
- {
- EngineId = request.EngineId,
- CorpusId = corpus.Id,
- TextId = sourceFile.Key,
- Refs = { $"{sourceFile.Key}:{lineNum}" },
- Translation = sourceLine
- },
- cancellationToken
- );
- }
- lineNum++;
- }
- }
- else
- {
- foreach (string sourceLine in sourceLines.Select(l => l.Trim()))
- {
- if (sourceLine.Length > 0)
- {
- await call.RequestStream.WriteAsync(
- new InsertPretranslationsRequest
- {
- EngineId = request.EngineId,
- CorpusId = corpus.Id,
- TextId = sourceFile.Key,
- Refs = { $"{sourceFile.Key}:{sourceLine.Split('\t')[0]}" },
- Translation = sourceLine.Contains('\t')
- ? sourceLine.Split('\t')[1].Trim()
- : string.Empty
- },
- cancellationToken
- );
- }
- }
- }
- }
- }
+ await call.RequestStream.WriteAsync(request, cancellationToken);
}
-
await call.RequestStream.CompleteAsync();
await call;
}
@@ -317,4 +206,78 @@ ServerCallContext context
new GetLanguageInfoResponse { InternalCode = request.Language + "_echo", IsNative = true, }
);
}
+
+ private static SIL.ServiceToolkit.Models.ParallelCorpus Map(ParallelCorpus source)
+ {
+ return new SIL.ServiceToolkit.Models.ParallelCorpus
+ {
+ Id = source.Id,
+ SourceCorpora = source.SourceCorpora.Select(Map).ToList(),
+ TargetCorpora = source.TargetCorpora.Select(Map).ToList()
+ };
+ }
+
+ private static SIL.ServiceToolkit.Models.MonolingualCorpus Map(MonolingualCorpus source)
+ {
+ var trainOnChapters = source.TrainOnChapters.ToDictionary(
+ kvp => kvp.Key,
+ kvp => kvp.Value.Chapters.ToHashSet()
+ );
+ var trainOnTextIds = source.TrainOnTextIds.ToHashSet();
+ FilterChoice trainingFilter = GetFilterChoice(trainOnChapters, trainOnTextIds, source.TrainOnAll);
+
+ var pretranslateChapters = source.PretranslateChapters.ToDictionary(
+ kvp => kvp.Key,
+ kvp => kvp.Value.Chapters.ToHashSet()
+ );
+ var pretranslateTextIds = source.PretranslateTextIds.ToHashSet();
+ FilterChoice pretranslateFilter = GetFilterChoice(
+ pretranslateChapters,
+ pretranslateTextIds,
+ source.PretranslateAll
+ );
+
+ return new SIL.ServiceToolkit.Models.MonolingualCorpus
+ {
+ Id = source.Id,
+ Language = source.Language,
+ Files = source.Files.Select(Map).ToList(),
+ TrainOnChapters = trainingFilter == FilterChoice.Chapters ? trainOnChapters : null,
+ TrainOnTextIds = trainingFilter == FilterChoice.TextIds ? trainOnTextIds : null,
+ PretranslateChapters = pretranslateFilter == FilterChoice.Chapters ? pretranslateChapters : null,
+ PretranslateTextIds = pretranslateFilter == FilterChoice.TextIds ? pretranslateTextIds : null
+ };
+ }
+
+ private static SIL.ServiceToolkit.Models.CorpusFile Map(CorpusFile source)
+ {
+ return new SIL.ServiceToolkit.Models.CorpusFile
+ {
+ Location = source.Location,
+ Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format,
+ TextId = source.TextId
+ };
+ }
+
+ private enum FilterChoice
+ {
+ Chapters,
+ TextIds,
+ None
+ }
+
+ private static FilterChoice GetFilterChoice(
+ IReadOnlyDictionary> chapters,
+ HashSet textIds,
+ bool noFilter
+ )
+ {
+ // Picks exactly one filter: None when noFilter is set or both inputs are null; otherwise TextIds or Chapters.
+ // TextIds is chosen whenever chapters is null or empty, so it is returned even when textIds itself is empty.
+ if (noFilter || (chapters is null && textIds is null))
+ return FilterChoice.None;
+ if (chapters is null || chapters.Count == 0)
+ return FilterChoice.TextIds;
+ return FilterChoice.Chapters;
+ }
}
diff --git a/src/Echo/src/EchoTranslationEngine/Usings.cs b/src/Echo/src/EchoTranslationEngine/Usings.cs
index b7f3ba2d..0404305b 100644
--- a/src/Echo/src/EchoTranslationEngine/Usings.cs
+++ b/src/Echo/src/EchoTranslationEngine/Usings.cs
@@ -5,3 +5,4 @@
global using Grpc.Core;
global using Microsoft.Extensions.Diagnostics.HealthChecks;
global using Serval.Translation.V1;
+global using SIL.ServiceToolkit.Utils;
diff --git a/src/Machine/src/Serval.Machine.EngineServer/Program.cs b/src/Machine/src/Serval.Machine.EngineServer/Program.cs
index e36db6c2..b03f6575 100644
--- a/src/Machine/src/Serval.Machine.EngineServer/Program.cs
+++ b/src/Machine/src/Serval.Machine.EngineServer/Program.cs
@@ -35,8 +35,6 @@
var app = builder.Build();
-app.UseHttpsRedirection();
-
app.MapServalTranslationEngineService();
app.MapHangfireDashboard();
diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilder.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilder.cs
index f8dfbcd5..ce0180b5 100644
--- a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilder.cs
+++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilder.cs
@@ -3,5 +3,5 @@
public interface IMachineBuilder
{
IServiceCollection Services { get; }
- IConfiguration? Configuration { get; }
+ IConfiguration Configuration { get; }
}
diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs
index 5a577cb5..67b8ef3d 100644
--- a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs
+++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs
@@ -1,63 +1,28 @@
-using Serval.Translation.V1;
+using Polly.Extensions.Http;
+using Serval.Translation.V1;
namespace Microsoft.Extensions.DependencyInjection;
public static class IMachineBuilderExtensions
{
- public static IMachineBuilder AddServiceOptions(
- this IMachineBuilder builder,
- Action configureOptions
- )
- {
- builder.Services.Configure(configureOptions);
- return builder;
- }
-
public static IMachineBuilder AddServiceOptions(this IMachineBuilder builder, IConfiguration config)
{
builder.Services.Configure(config);
return builder;
}
- public static IMachineBuilder AddSmtTransferEngineOptions(
- this IMachineBuilder builder,
- Action configureOptions
- )
- {
- builder.Services.Configure(configureOptions);
- return builder;
- }
-
public static IMachineBuilder AddSmtTransferEngineOptions(this IMachineBuilder builder, IConfiguration config)
{
builder.Services.Configure(config);
return builder;
}
- public static IMachineBuilder AddClearMLOptions(
- this IMachineBuilder builder,
- Action configureOptions
- )
- {
- builder.Services.Configure(configureOptions);
- return builder;
- }
-
public static IMachineBuilder AddClearMLOptions(this IMachineBuilder builder, IConfiguration config)
{
builder.Services.Configure(config);
return builder;
}
- public static IMachineBuilder AddDistributedReaderWriterLockOptions(
- this IMachineBuilder build,
- Action configureOptions
- )
- {
- build.Services.Configure(configureOptions);
- return build;
- }
-
public static IMachineBuilder AddDistributedReaderWriterLockOptions(
this IMachineBuilder build,
IConfiguration config
@@ -67,67 +32,33 @@ IConfiguration config
return build;
}
- public static IMachineBuilder AddMessageOutboxOptions(
- this IMachineBuilder builder,
- Action configureOptions
- )
- {
- builder.Services.Configure(configureOptions);
- return builder;
- }
-
public static IMachineBuilder AddMessageOutboxOptions(this IMachineBuilder builder, IConfiguration config)
{
builder.Services.Configure(config);
return builder;
}
- public static IMachineBuilder AddSharedFileOptions(
- this IMachineBuilder builder,
- Action configureOptions
- )
- {
- builder.Services.Configure(configureOptions);
- return builder;
- }
-
public static IMachineBuilder AddSharedFileOptions(this IMachineBuilder builder, IConfiguration config)
{
builder.Services.Configure(config);
return builder;
}
- public static IMachineBuilder AddBuildJobOptions(
- this IMachineBuilder builder,
- Action configureOptions
- )
- {
- builder.Services.Configure(configureOptions);
- return builder;
- }
-
public static IMachineBuilder AddBuildJobOptions(this IMachineBuilder builder, IConfiguration config)
{
builder.Services.Configure(config);
return builder;
}
- public static IMachineBuilder AddThotSmtModel(this IMachineBuilder builder)
+ public static IMachineBuilder AddServiceToolkitServices(this IMachineBuilder builder)
{
- if (builder.Configuration is null)
- return builder.AddThotSmtModel(o => { });
- else
- return builder.AddThotSmtModel(builder.Configuration.GetSection(ThotSmtModelOptions.Key));
+ builder.Services.AddParallelCorpusPreprocessor();
+ return builder;
}
- public static IMachineBuilder AddThotSmtModel(
- this IMachineBuilder builder,
- Action configureOptions
- )
+ public static IMachineBuilder AddThotSmtModel(this IMachineBuilder builder)
{
- builder.Services.Configure(configureOptions);
- builder.Services.AddSingleton();
- return builder;
+ return builder.AddThotSmtModel(builder.Configuration.GetSection(ThotSmtModelOptions.Key));
}
public static IMachineBuilder AddThotSmtModel(this IMachineBuilder builder, IConfiguration config)
@@ -151,17 +82,38 @@ public static IMachineBuilder AddUnigramTruecaser(this IMachineBuilder builder)
public static IMachineBuilder AddClearMLService(this IMachineBuilder builder, string? connectionString = null)
{
- connectionString ??= builder.Configuration?.GetConnectionString("ClearML");
+ connectionString ??= builder.Configuration.GetConnectionString("ClearML");
if (connectionString is null)
throw new InvalidOperationException("ClearML connection string is required");
+ var policy = Policy
+ .Handle()
+ .OrTransientHttpStatusCode()
+ .OrResult(msg => msg.StatusCode == HttpStatusCode.TooManyRequests)
+ .WaitAndRetryAsync(
+ 7,
+ retryAttempt => TimeSpan.FromSeconds(2 * retryAttempt), // total 56, less than the 1 minute limit
+ onRetryAsync: (outcome, timespan, retryAttempt, context) =>
+ {
+ if (retryAttempt < 3)
+ return Task.CompletedTask;
+ // Log the retry attempt
+ var serviceProvider = builder.Services.BuildServiceProvider();
+ var logger = serviceProvider.GetService>();
+ logger?.LogInformation(
+ "Retry {RetryAttempt} encountered an error. Waiting {Timespan} before next retry. Error: {ErrorMessage}",
+ retryAttempt,
+ timespan,
+ outcome.Exception?.Message
+ );
+ return Task.CompletedTask;
+ }
+ );
+
builder
.Services.AddHttpClient("ClearML")
.ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString!))
- // Add retry policy; fail after approx. 2 + 4 + 8 = 14 seconds
- .AddTransientHttpErrorPolicy(b =>
- b.WaitAndRetryAsync(3, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)))
- );
+ .AddPolicyHandler(policy);
builder.Services.AddSingleton();
@@ -199,7 +151,7 @@ public static IMachineBuilder AddMongoHangfireJobClient(
string? connectionString = null
)
{
- connectionString ??= builder.Configuration?.GetConnectionString("Hangfire");
+ connectionString ??= builder.Configuration.GetConnectionString("Hangfire");
if (connectionString is null)
throw new InvalidOperationException("Hangfire connection string is required");
@@ -220,7 +172,7 @@ public static IMachineBuilder AddHangfireJobServer(
)
{
engineTypes ??=
- builder.Configuration?.GetSection("TranslationEngines").Get()
+ builder.Configuration.GetSection("TranslationEngines").Get()
?? [TranslationEngineType.SmtTransfer, TranslationEngineType.Nmt];
var queues = new List();
foreach (TranslationEngineType engineType in engineTypes.Distinct())
@@ -261,7 +213,7 @@ public static IMachineBuilder AddMemoryDataAccess(this IMachineBuilder builder)
public static IMachineBuilder AddMongoDataAccess(this IMachineBuilder builder, string? connectionString = null)
{
- connectionString ??= builder.Configuration?.GetConnectionString("Mongo");
+ connectionString ??= builder.Configuration.GetConnectionString("Mongo");
if (connectionString is null)
throw new InvalidOperationException("Mongo connection string is required");
builder.Services.AddMongoDataAccess(
@@ -316,7 +268,7 @@ public static IMachineBuilder AddServalPlatformService(
string? connectionString = null
)
{
- connectionString ??= builder.Configuration?.GetConnectionString("Serval");
+ connectionString ??= builder.Configuration.GetConnectionString("Serval");
if (connectionString is null)
throw new InvalidOperationException("Serval connection string is required");
@@ -383,7 +335,7 @@ public static IMachineBuilder AddServalTranslationEngineService(
builder.AddServalPlatformService(connectionString);
engineTypes ??=
- builder.Configuration?.GetSection("TranslationEngines").Get()
+ builder.Configuration.GetSection("TranslationEngines").Get()
?? [TranslationEngineType.SmtTransfer, TranslationEngineType.Nmt];
foreach (TranslationEngineType engineType in engineTypes.Distinct())
{
@@ -422,7 +374,7 @@ public static IMachineBuilder AddBuildJobService(this IMachineBuilder builder, s
if (smtTransferEngineDir is null)
{
var smtTransferEngineOptions = new SmtTransferEngineOptions();
- builder.Configuration?.GetSection(SmtTransferEngineOptions.Key).Bind(smtTransferEngineOptions);
+ builder.Configuration.GetSection(SmtTransferEngineOptions.Key).Bind(smtTransferEngineOptions);
smtTransferEngineDir = smtTransferEngineOptions.EnginesDir;
}
string? driveLetter = Path.GetPathRoot(smtTransferEngineDir)?[..1];
diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs
index 9ae176d8..8fcaced4 100644
--- a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs
+++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs
@@ -2,7 +2,7 @@
public static class IServiceCollectionExtensions
{
- public static IMachineBuilder AddMachine(this IServiceCollection services, IConfiguration? configuration = null)
+ public static IMachineBuilder AddMachine(this IServiceCollection services, IConfiguration configuration)
{
if (!Sldr.IsInitialized)
Sldr.Initialize();
@@ -15,35 +15,20 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf
services.AddTransient();
services.AddScoped();
- services.AddSingleton();
services.AddStartupTask(
(sp, cancellationToken) =>
sp.GetRequiredService().InitAsync(cancellationToken)
);
+ services.AddParallelCorpusPreprocessor();
var builder = new MachineBuilder(services, configuration);
- if (configuration is null)
- {
- builder.AddServiceOptions(o => { });
- builder.AddSharedFileOptions(o => { });
- builder.AddSmtTransferEngineOptions(o => { });
- builder.AddClearMLOptions(o => { });
- builder.AddDistributedReaderWriterLockOptions(o => { });
- builder.AddBuildJobOptions(o => { });
- builder.AddMessageOutboxOptions(o => { });
- }
- else
- {
- builder.AddServiceOptions(configuration.GetSection(ServiceOptions.Key));
- builder.AddSharedFileOptions(configuration.GetSection(SharedFileOptions.Key));
- builder.AddSmtTransferEngineOptions(configuration.GetSection(SmtTransferEngineOptions.Key));
- builder.AddClearMLOptions(configuration.GetSection(ClearMLOptions.Key));
- builder.AddDistributedReaderWriterLockOptions(
- configuration.GetSection(DistributedReaderWriterLockOptions.Key)
- );
- builder.AddBuildJobOptions(configuration.GetSection(BuildJobOptions.Key));
- builder.AddMessageOutboxOptions(configuration.GetSection(MessageOutboxOptions.Key));
- }
+ builder.AddServiceOptions(configuration.GetSection(ServiceOptions.Key));
+ builder.AddSharedFileOptions(configuration.GetSection(SharedFileOptions.Key));
+ builder.AddSmtTransferEngineOptions(configuration.GetSection(SmtTransferEngineOptions.Key));
+ builder.AddClearMLOptions(configuration.GetSection(ClearMLOptions.Key));
+ builder.AddDistributedReaderWriterLockOptions(configuration.GetSection(DistributedReaderWriterLockOptions.Key));
+ builder.AddBuildJobOptions(configuration.GetSection(BuildJobOptions.Key));
+ builder.AddMessageOutboxOptions(configuration.GetSection(MessageOutboxOptions.Key));
return builder;
}
diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/MachineBuilder.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/MachineBuilder.cs
index 58ddf5c1..5fece454 100644
--- a/src/Machine/src/Serval.Machine.Shared/Configuration/MachineBuilder.cs
+++ b/src/Machine/src/Serval.Machine.Shared/Configuration/MachineBuilder.cs
@@ -1,7 +1,7 @@
namespace Microsoft.Extensions.DependencyInjection;
-internal class MachineBuilder(IServiceCollection services, IConfiguration? configuration) : IMachineBuilder
+internal class MachineBuilder(IServiceCollection services, IConfiguration configuration) : IMachineBuilder
{
public IServiceCollection Services { get; } = services;
- public IConfiguration? Configuration { get; } = configuration;
+ public IConfiguration Configuration { get; } = configuration;
}
diff --git a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj
index 97d7fb64..f9eea0c5 100644
--- a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj
+++ b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj
@@ -36,9 +36,9 @@
-
-
-
+
+
+
diff --git a/src/Machine/src/Serval.Machine.Shared/Services/ClearMLService.cs b/src/Machine/src/Serval.Machine.Shared/Services/ClearMLService.cs
index 2b2b6718..66e1b350 100644
--- a/src/Machine/src/Serval.Machine.Shared/Services/ClearMLService.cs
+++ b/src/Machine/src/Serval.Machine.Shared/Services/ClearMLService.cs
@@ -161,7 +161,7 @@ public async Task> GetTasksForQueueAsync(
var body = new JsonObject { ["queue"] = queueId };
JsonObject? result = await CallAsync("queues", "get_by_id", body, cancellationToken);
var tasks = (JsonArray?)result?["data"]?["queue"]?["entries"];
- IEnumerable taskIds = tasks?.Select(t => (string)t?["id"]!) ?? new List();
+ IEnumerable taskIds = tasks?.Select(t => (string)t?["task"]!) ?? new List();
return await GetTasksByIdAsync(taskIds, cancellationToken);
}
diff --git a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs
index 3c46a34e..2e79d09a 100644
--- a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs
+++ b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs
@@ -7,8 +7,8 @@ public class NmtPreprocessBuildJob(
ILogger logger,
IBuildJobService buildJobService,
ISharedFileService sharedFileService,
- ICorpusService corpusService,
- ILanguageTagService languageTagService
+ ILanguageTagService languageTagService,
+ IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
)
: PreprocessBuildJob(
platformService,
@@ -17,7 +17,7 @@ ILanguageTagService languageTagService
logger,
buildJobService,
sharedFileService,
- corpusService
+ parallelCorpusPreprocessingService
)
{
private readonly ILanguageTagService _languageTagService = languageTagService;
diff --git a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs
index d9e433ce..46baa68d 100644
--- a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs
+++ b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs
@@ -1,49 +1,35 @@
namespace Serval.Machine.Shared.Services;
-public class PreprocessBuildJob : HangfireBuildJob>
+public class PreprocessBuildJob(
+ IPlatformService platformService,
+ IRepository engines,
+ IDataAccessContext dataAccessContext,
+ ILogger logger,
+ IBuildJobService buildJobService,
+ ISharedFileService sharedFileService,
+ IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
+)
+ : HangfireBuildJob>(
+ platformService,
+ engines,
+ dataAccessContext,
+ buildJobService,
+ logger
+ )
{
private static readonly JsonWriterOptions PretranslateWriterOptions = new() { Indented = true };
internal BuildJobRunnerType TrainJobRunnerType { get; init; } = BuildJobRunnerType.ClearML;
- private readonly ISharedFileService _sharedFileService;
- private readonly ICorpusService _corpusService;
- private int _seed = 1234;
- private Random _random;
-
- public PreprocessBuildJob(
- IPlatformService platformService,
- IRepository engines,
- IDataAccessContext dataAccessContext,
- ILogger logger,
- IBuildJobService buildJobService,
- ISharedFileService sharedFileService,
- ICorpusService corpusService
- )
- : base(platformService, engines, dataAccessContext, buildJobService, logger)
- {
- _sharedFileService = sharedFileService;
- _corpusService = corpusService;
- _random = new Random(_seed);
- }
+ private readonly ISharedFileService _sharedFileService = sharedFileService;
- internal int Seed
- {
- get => _seed;
- set
- {
- if (_seed != value)
- {
- _seed = value;
- _random = new Random(_seed);
- }
- }
- }
+ private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService =
+ parallelCorpusPreprocessingService;
protected override async Task DoWorkAsync(
string engineId,
string buildId,
- IReadOnlyList data,
+ IReadOnlyList data,
string? buildOptions,
CancellationToken cancellationToken
)
@@ -121,127 +107,21 @@ CancellationToken cancellationToken
int trainCount = 0;
int pretranslateCount = 0;
pretranslateWriter.WriteStartArray();
- foreach (ParallelCorpus corpus in corpora)
- {
- (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] sourceCorpora = corpus
- .SourceCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc)))
- .ToArray();
- ITextCorpus[] sourceTrainingCorpora = sourceCorpora
- .Select(sc =>
- {
- ITextCorpus textCorpus = sc.TextCorpus;
- if (sc.Corpus.TrainOnTextIds is not null)
- textCorpus = textCorpus.FilterTexts(sc.Corpus.TrainOnTextIds);
- return textCorpus.Where(row =>
- row.Ref is not ScriptureRef sr
- || sc.Corpus.TrainOnChapters is null
- || IsInChapters(sr, sc.Corpus.TrainOnChapters)
- );
- })
- .ToArray();
- ITextCorpus[] sourcePretranslateCorpora = sourceCorpora
- .Select(sc =>
- {
- ITextCorpus textCorpus = sc.TextCorpus;
- if (sc.Corpus.PretranslateTextIds is not null)
- textCorpus = textCorpus.FilterTexts(sc.Corpus.PretranslateTextIds);
- return textCorpus.Where(row =>
- row.Ref is not ScriptureRef sr
- || sc.Corpus.PretranslateChapters is null
- || (
- IsInChapters(sr, sc.Corpus.PretranslateChapters)
- && !IsInChapters(sr, sc.Corpus.TrainOnChapters ?? new())
- )
- );
- })
- .ToArray();
-
- (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] targetCorpora = corpus
- .TargetCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc)))
- .ToArray();
- ITextCorpus[] targetTrainingCorpora = targetCorpora
- .Select(tc =>
- {
- ITextCorpus textCorpus = tc.TextCorpus;
- if (tc.Corpus.TrainOnTextIds is not null)
- textCorpus = textCorpus.FilterTexts(tc.Corpus.TrainOnTextIds);
- return textCorpus.Where(row =>
- row.Ref is not ScriptureRef sr
- || tc.Corpus.TrainOnChapters is null
- || IsInChapters(sr, tc.Corpus.TrainOnChapters)
- );
- })
- .ToArray();
-
- if (sourceCorpora.Length == 0)
- continue;
-
- int skipCount = 0;
- foreach (Row?[] rows in AlignTrainCorpus(sourceTrainingCorpora, targetTrainingCorpora))
+ _parallelCorpusPreprocessingService.Preprocess(
+ corpora,
+ row =>
{
- if (skipCount > 0)
+ if (row.SourceSegment.Length > 0 || row.TargetSegment.Length > 0)
{
- skipCount--;
- continue;
- }
-
- Row[] trainRows = rows.Where(r => r is not null).Cast().ToArray();
- if (trainRows.Length > 0)
- {
- Row row = trainRows[0];
- if (rows.Length > 1)
- {
- Row[] nonEmptyRows = trainRows.Where(r => r.SourceSegment.Length > 0).ToArray();
- Row[] targetNonEmptyRows = nonEmptyRows.Where(r => r.TargetSegment.Length > 0).ToArray();
- if (targetNonEmptyRows.Length > 0)
- nonEmptyRows = targetNonEmptyRows;
- if (nonEmptyRows.Length > 0)
- {
- nonEmptyRows = nonEmptyRows
- .GroupBy(r => r.SourceSegment)
- .Select(group => group.First())
- .ToArray();
- {
- nonEmptyRows = nonEmptyRows
- .GroupBy(r => r.SourceSegment)
- .Select(group => group.First())
- .ToArray();
- row = nonEmptyRows[_random.Next(nonEmptyRows.Length)];
- }
- }
- }
-
- await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n");
- await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n");
- skipCount = row.RowCount - 1;
- if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
- trainCount++;
+ sourceTrainWriter.Write($"{row.SourceSegment}\n");
+ targetTrainWriter.Write($"{row.TargetSegment}\n");
}
- }
-
- if ((bool?)buildOptionsObject?["use_key_terms"] ?? true)
+ if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
+ trainCount++;
+ },
+ (row, corpus) =>
{
- ITextCorpus? sourceTermCorpus = _corpusService
- .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList())
- .FirstOrDefault();
- ITextCorpus? targetTermCorpus = _corpusService
- .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList())
- .FirstOrDefault();
- if (sourceTermCorpus is not null && targetTermCorpus is not null)
- {
- IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus);
- foreach (ParallelTextRow row in parallelKeyTermsCorpus)
- {
- await sourceTrainWriter.WriteAsync($"{row.SourceText}\n");
- await targetTrainWriter.WriteAsync($"{row.TargetText}\n");
- trainCount++;
- }
- }
- }
-
- foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpora, targetCorpora[0].TextCorpus))
- {
- if (row.SourceSegment.Length > 0)
+ if (row.SourceSegment.Length > 0 && row.TargetSegment.Length == 0)
{
pretranslateWriter.WriteStartObject();
pretranslateWriter.WriteString("corpusId", corpus.Id);
@@ -254,21 +134,15 @@ row.Ref is not ScriptureRef sr
pretranslateWriter.WriteEndObject();
pretranslateCount++;
}
- }
- }
+ },
+ (bool?)buildOptionsObject?["use_key_terms"] ?? true
+ );
pretranslateWriter.WriteEndArray();
return (trainCount, pretranslateCount);
}
- private static bool IsInChapters(ScriptureRef sr, Dictionary> selection)
- {
- return selection.TryGetValue(sr.Book, out HashSet? chapters)
- && chapters != null
- && (chapters.Count == 0 || chapters.Contains(sr.ChapterNum));
- }
-
protected override async Task CleanupAsync(
string engineId,
string buildId,
@@ -289,189 +163,9 @@ JobCompletionStatus completionStatus
}
}
- private static IEnumerable AlignTrainCorpus(
- IReadOnlyList srcCorpora,
- IReadOnlyList trgCorpora
- )
- {
- srcCorpora = srcCorpora.Select(sc => sc.Transform(CleanSegment)).ToArray();
- trgCorpora = trgCorpora.Select(tc => tc.Transform(CleanSegment)).ToArray();
-
- if (trgCorpora.All(tc => tc.IsScripture()))
- {
- return srcCorpora
- .SelectMany(sc => trgCorpora.Select(tc => AlignScripture(sc, tc)))
- .ZipMany(rows => rows.ToArray())
- // filter out every list that only contains completely empty rows
- .Where(rows => rows.Any(r => r is null || r.SourceSegment.Length > 0 || r.TargetSegment.Length > 0));
- }
-
- IEnumerable sourceOnlyRows = srcCorpora
- .SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allSourceRows: true)))
- .ZipMany(rows =>
- rows.Where(r => r.TargetSegment.Count == 0)
- .Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1))
- .ToArray()
- );
-
- IEnumerable targetRows = srcCorpora
- .SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allTargetRows: true)))
- .ZipMany(rows =>
- rows.Where(r => r.TargetSegment.Count > 0)
- .Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1))
- .ToArray()
- );
-
- return sourceOnlyRows
- .Concat(targetRows)
- // filter out every list that only contains completely empty rows
- .Where(rows => rows.Any(r => r.SourceSegment.Length > 0 || r.TargetSegment.Length > 0));
- }
-
- private static IEnumerable AlignScripture(ITextCorpus srcCorpus, ITextCorpus trgCorpus)
- {
- int rowCount = 0;
- StringBuilder srcSegBuffer = new();
- StringBuilder trgSegBuffer = new();
- HashSet vrefs = [];
- foreach (
- (VerseRef vref, string srcSegment, string trgSegment) in srcCorpus
- .ExtractScripture()
- .Select(r => (r.CorpusVerseRef, r.Text))
- .Zip(
- trgCorpus.ExtractScripture().Select(r => r.Text),
- (s, t) => (VerseRef: s.CorpusVerseRef, SourceSegment: s.Text, TargetSegment: t)
- )
- )
- {
- if (srcSegment == "" && trgSegment == "")
- {
- vrefs.UnionWith(vref.AllVerses());
- rowCount++;
- }
- else if (srcSegment == "")
- {
- vrefs.UnionWith(vref.AllVerses());
- if (trgSegment.Length > 0)
- {
- if (trgSegBuffer.Length > 0)
- trgSegBuffer.Append(' ');
- trgSegBuffer.Append(trgSegment);
- }
- rowCount++;
- }
- else if (trgSegment == "")
- {
- vrefs.UnionWith(vref.AllVerses());
- if (srcSegment.Length > 0)
- {
- if (srcSegBuffer.Length > 0)
- srcSegBuffer.Append(' ');
- srcSegBuffer.Append(srcSegment);
- }
- rowCount++;
- }
- else
- {
- if (rowCount > 0)
- {
- yield return new(
- vrefs.First().Book,
- vrefs.Order().Select(v => new ScriptureRef(v)).Cast