diff --git a/.github/workflows/ci-e2e.yml b/.github/workflows/ci-e2e.yml index 472e33d0..fc2a72df 100644 --- a/.github/workflows/ci-e2e.yml +++ b/.github/workflows/ci-e2e.yml @@ -10,7 +10,7 @@ jobs: build: name: Build runs-on: ubuntu-latest - timeout-minutes: 45 + timeout-minutes: 60 env: SERVAL_CLIENT_ID: ${{ secrets.SERVAL_CLIENT_ID }} diff --git a/.vscode/settings.json b/.vscode/settings.json index 4c5aadb3..cbe0a073 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -28,6 +28,7 @@ "ptcc", "Rebinder", "stylesheet", + "timespan", "upserted", "USFM" ], diff --git a/README.md b/README.md index 4e2b2880..326d20ab 100644 --- a/README.md +++ b/README.md @@ -60,15 +60,13 @@ There are 3 different environments that Serval is deployed to: - Run `kubectl config use-context dallas-rke` - First, startup the storage (using internal qa for example) - `helm install serval-pvc deploy/serval-pvc -n nlp -f deploy/qa-int-values.yaml` -- Then, startup the database (give it 60 seconds) -- `helm install mongo deploy/mongo -n nlp -f deploy/qa-int-values.yaml` - Now you can turn on Serval - `helm install serval deploy/serval -n nlp -f deploy/qa-int-values.yaml` ### To update the cluster - To upgrade Serval: - For QA internal Run: - - `kubectl config use-context dallas-rke` + - `kubectl config use-context dallas-stage` - `helm upgrade serval deploy/serval -n nlp -f deploy/qa-int-values.yaml` - For QA external Run: - `kubectl config use-context dallas-rke` diff --git a/Serval.sln b/Serval.sln index edd3f075..12c0aaaf 100644 --- a/Serval.sln +++ b/Serval.sln @@ -86,6 +86,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{C3A14577-A65 EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit", "src\ServiceToolkit\src\SIL.ServiceToolkit\SIL.ServiceToolkit.csproj", "{0E40F959-C641-40A2-9750-B17A4F9F9E55}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126}" 
+EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit.Tests", "src\ServiceToolkit\test\SIL.ServiceToolkit.Tests\SIL.ServiceToolkit.Tests.csproj", "{C50ED15A-876D-42BF-980A-388E8C49C78D}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -180,6 +184,10 @@ Global {0E40F959-C641-40A2-9750-B17A4F9F9E55}.Debug|Any CPU.Build.0 = Debug|Any CPU {0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.ActiveCfg = Release|Any CPU {0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.Build.0 = Release|Any CPU + {C50ED15A-876D-42BF-980A-388E8C49C78D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {C50ED15A-876D-42BF-980A-388E8C49C78D}.Debug|Any CPU.Build.0 = Debug|Any CPU + {C50ED15A-876D-42BF-980A-388E8C49C78D}.Release|Any CPU.ActiveCfg = Release|Any CPU + {C50ED15A-876D-42BF-980A-388E8C49C78D}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -215,6 +223,8 @@ Global {10657805-48F1-4205-B8F5-79447F6EF620} = {25CDB05B-4E24-4A6E-933E-1E0BEC97D74D} {C3A14577-A654-4604-818C-4E683DD45A51} = {EA69B41C-49EF-4017-A687-44B9DF37FF98} {0E40F959-C641-40A2-9750-B17A4F9F9E55} = {C3A14577-A654-4604-818C-4E683DD45A51} + {1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126} = {EA69B41C-49EF-4017-A687-44B9DF37FF98} + {C50ED15A-876D-42BF-980A-388E8C49C78D} = {1DB5E6D1-17A8-4FF2-B90A-C5DFBEF63126} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {9F18C25E-E140-43C3-B177-D562E1628370} diff --git a/deploy/mongo/Chart.yaml b/deploy/mongo/Chart.yaml deleted file mode 100644 index e7a63115..00000000 --- a/deploy/mongo/Chart.yaml +++ /dev/null @@ -1,8 +0,0 @@ -name: mongo-repl -description: A mongo deployment to support serval -version: 0.0.1 -apiVersion: v1 -keywords: - - mongo -sources: -home: diff --git a/deploy/mongo/templates/mongo-deployment.yaml b/deploy/mongo/templates/mongo-deployment.yaml deleted file 
mode 100644 index 8ae37d93..00000000 --- a/deploy/mongo/templates/mongo-deployment.yaml +++ /dev/null @@ -1,44 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - app: mongo - name: mongo -spec: - replicas: 1 - selector: - matchLabels: - app: mongo - strategy: - type: Recreate - template: - metadata: - labels: - app: mongo - spec: - terminationGracePeriodSeconds: 30 - containers: - - command: ["/bin/sh", "-c"] - args: ['mongod --replSet myRS --bind_ip 0.0.0.0 & sleep 15s; mongosh --host localhost:27017 --eval '' config = { "_id" : "myRS", "members" : [{"_id" : 0,"host" : "mongo:27017"}] }; rs.initiate(config, { force: true }); '' ; sleep infinity'] - image: mongo:6.0 - imagePullPolicy: "Always" - name: mongo - ports: - - containerPort: 27017 - resources: - limits: - memory: "2000Mi" - cpu: "1000m" - requests: - memory: "2000Mi" - cpu: "1000m" - volumeMounts: - - mountPath: /data/db - name: mongo-data - hostname: mongo - restartPolicy: Always - volumes: - - name: mongo-data - persistentVolumeClaim: - claimName: serval-mongo-claim -status: {} diff --git a/deploy/mongo/templates/mongo-service.yaml b/deploy/mongo/templates/mongo-service.yaml deleted file mode 100644 index f787c84e..00000000 --- a/deploy/mongo/templates/mongo-service.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - labels: - app: mongo - name: mongo -spec: - ports: - - name: "27017" - port: 27017 - targetPort: 27017 - selector: - app: mongo -status: - loadBalancer: {} diff --git a/deploy/qa-ext-values.yaml b/deploy/qa-ext-values.yaml index 96074da6..7106e030 100644 --- a/deploy/qa-ext-values.yaml +++ b/deploy/qa-ext-values.yaml @@ -1,6 +1,6 @@ externalHost: qa.serval-api.org environment: Production -deploymentVersion: '1.6.QA3' +deploymentVersion: '1.7.QA7' alertEmail: ext-qa-serval-alerts@languagetechnology.org emailsToAlert: john_lambert@sil.org enableTls: true @@ -8,8 +8,8 @@ namespace: serval auth0Domain: dev-sillsdev.auth0.com lokiTenent: 
serval-tenant lokiUrl: http://loki-distributed-gateway.loki.svc.cluster.local -servalImage: ghcr.io/sillsdev/serval:1.6.3 -ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.6.3 +servalImage: ghcr.io/sillsdev/serval:1.7.7 +ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.7.2 ClearMLQueue: production MongoConnectionPrefix: qa_ SharedFileLocation: s3://silnlp/ext-qa/ diff --git a/deploy/qa-int-values.yaml b/deploy/qa-int-values.yaml index 21aaec25..e047f4a7 100644 --- a/deploy/qa-int-values.yaml +++ b/deploy/qa-int-values.yaml @@ -8,11 +8,11 @@ namespace: nlp auth0Domain: sil-appbuilder.auth0.com lokiTenent: nlp-tenant lokiUrl: http://loki-distributed-gateway.loki.svc.cluster.local -servalImage: ghcr.io/sillsdev/serval:1.6.1 +servalImage: ghcr.io/sillsdev/serval:1.7.0 ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.6.3 ClearMLQueue: lambert_24gb MongoConnectionPrefix: qa_int_ SharedFileLocation: s3://silnlp/int-qa/ -servalClaimSize: 1Gi -machineClaimSize: 2Gi +servalClaimSize: 5Gi +machineClaimSize: 20Gi enableEcho: true \ No newline at end of file diff --git a/deploy/serval-pvc/templates/persistent-volume-claims.yaml b/deploy/serval-pvc/templates/persistent-volume-claims.yaml index 5acc3718..c4f1a8d5 100644 --- a/deploy/serval-pvc/templates/persistent-volume-claims.yaml +++ b/deploy/serval-pvc/templates/persistent-volume-claims.yaml @@ -35,17 +35,4 @@ spec: - ReadWriteMany resources: requests: - storage: 50M ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: serval-mongo-claim - namespace: {{ .Values.namespace}} -spec: - storageClassName: "longhorn" - accessModes: - - ReadWriteMany - resources: - requests: - storage: 10Gi \ No newline at end of file + storage: 55M \ No newline at end of file diff --git a/deploy/serval/templates/fluentd-flows.yaml b/deploy/serval/templates/fluentd-flows.yaml index 84db700e..2d9729bc 100644 --- a/deploy/serval/templates/fluentd-flows.yaml +++ b/deploy/serval/templates/fluentd-flows.yaml @@ -26,21 +26,3 @@ spec: - 
echo hosts: [] labels: {} ---- -apiVersion: logging.banzaicloud.io/v1beta1 -kind: Flow -metadata: - name: mongo-flow - namespace: {{ .Values.namespace }} -spec: - globalOutputRefs: [] - localOutputRefs: - - {{ .Values.namespace }}-loki-output - match: - - select: - container_names: - - mongo - hosts: [] - labels: {} -status: - active: true diff --git a/docker-compose.yml b/docker-compose.yml index 8592c6e7..6e568f99 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -182,4 +182,4 @@ services: '/bin/sh', '-c', 'mongod --quiet --replSet myRS --bind_ip 0.0.0.0 & sleep 2s; mongosh --host localhost:27017 --eval '' config = { "_id" : "myRS", "members" : [{"_id" : 0,"host" : "mongo:27017"}] }; rs.initiate(config, { force: true }); '' ; sleep infinity' - ] + ] \ No newline at end of file diff --git a/samples/ApiExample/ApiExample.csproj b/samples/ApiExample/ApiExample.csproj new file mode 100644 index 00000000..9a87fdcc --- /dev/null +++ b/samples/ApiExample/ApiExample.csproj @@ -0,0 +1,28 @@ + + + + Exe + net8.0 + enable + enable + 4d0606c3-0fc7-4d76-b43b-236485004e81 + + + + + PreserveNewest + + + PreserveNewest + + + + + + + + + + + + diff --git a/samples/ApiExample/ApiExample.sln b/samples/ApiExample/ApiExample.sln new file mode 100644 index 00000000..dbdd4696 --- /dev/null +++ b/samples/ApiExample/ApiExample.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.11.35327.3 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ApiExample", "ApiExample.csproj", "{F80F8853-776B-4C3A-B789-B8FD5820150A}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {F80F8853-776B-4C3A-B789-B8FD5820150A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + 
{F80F8853-776B-4C3A-B789-B8FD5820150A}.Debug|Any CPU.Build.0 = Debug|Any CPU + {F80F8853-776B-4C3A-B789-B8FD5820150A}.Release|Any CPU.ActiveCfg = Release|Any CPU + {F80F8853-776B-4C3A-B789-B8FD5820150A}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {72D18D80-E951-41EE-8A1F-97B2B72615AD} + EndGlobalSection +EndGlobal diff --git a/samples/ApiExample/Program.cs b/samples/ApiExample/Program.cs new file mode 100644 index 00000000..00dd0830 --- /dev/null +++ b/samples/ApiExample/Program.cs @@ -0,0 +1,318 @@ +using System.IO.Compression; +using ApiExample; +using IdentityModel.Client; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Newtonsoft.Json.Linq; +using Serval.Client; + +// Setup and get the services +ServiceProvider services = SetupServices(); +IDataFilesClient dataFilesClient = services.GetService()!; +ICorporaClient corporaClient = services.GetService()!; +ITranslationEnginesClient translationEnginesClient = services.GetService()!; + +// Trap Ctrl+C cancellation +var cancellationTokenSource = new CancellationTokenSource(); +Console.CancelKeyPress += (_, eventArgs) => +{ + Console.WriteLine("Cancelling..."); + cancellationTokenSource.Cancel(); + eventArgs.Cancel = true; +}; + +// Create then tear down a pre-translation (NMT) engine +await CreatePreTranslationEngineAsync(cancellationTokenSource.Token); + +// Exit +return; + +static ServiceProvider SetupServices() +{ + const string HttpClientName = "serval-api"; + const string TokenClientName = "serval-api-token"; + + var configurationBuilder = new ConfigurationBuilder(); + IConfiguration configuration = configurationBuilder + .AddJsonFile("appsettings.json", false, true) + .AddUserSecrets() + .Build(); + ServalOptions servalOptions = configuration.GetSection("Serval").Get()!; + + var 
services = new ServiceCollection(); + services.AddDistributedMemoryCache(); + services + .AddClientCredentialsTokenManagement() + .AddClient( + TokenClientName, + client => + { + client.TokenEndpoint = servalOptions.TokenUrl; + client.ClientId = servalOptions.ClientId; + client.ClientSecret = servalOptions.ClientSecret; + client.Parameters = new Parameters { { "audience", servalOptions.Audience } }; + } + ); + services.AddClientCredentialsHttpClient( + HttpClientName, + TokenClientName, + configureClient: client => client.BaseAddress = new Uri(servalOptions.ApiServer) + ); + services.AddHttpClient(HttpClientName).SetHandlerLifetime(TimeSpan.FromMinutes(5)); + services.AddSingleton(sp => + { + // Instantiate the translation engines client with the named HTTP client + IHttpClientFactory? factory = sp.GetService(); + HttpClient httpClient = factory!.CreateClient(HttpClientName); + return new TranslationEnginesClient(httpClient); + }); + services.AddSingleton(sp => + { + // Instantiate the data files client with the named HTTP client + IHttpClientFactory? factory = sp.GetService(); + HttpClient httpClient = factory!.CreateClient(HttpClientName); + return new DataFilesClient(httpClient); + }); + services.AddSingleton(sp => + { + // Instantiate the corpora client with the named HTTP client + IHttpClientFactory? factory = sp.GetService(); + HttpClient httpClient = factory!.CreateClient(HttpClientName); + return new CorporaClient(httpClient); + }); + return services.BuildServiceProvider(); +} + +async Task CreatePreTranslationEngineAsync(CancellationToken cancellationToken) +{ + string? sourceDataFileId = null; + string? targetDataFileId = null; + string? sourceCorpusId = null; + string? targetCorpusId = null; + string? parallelCorpusId = null; + string? translationEngineId = null; + + try + { + // 1a. 
Create the source data file + Console.WriteLine("Create a source data file"); + const string SourceDirectory = "TEA"; + const string SourceFileName = $"{SourceDirectory}.zip"; + await using (var sourceFileStream = new MemoryStream()) + { + ZipFile.CreateFromDirectory(Path.Combine("data", SourceDirectory), sourceFileStream); + sourceFileStream.Seek(0, SeekOrigin.Begin); + DataFile sourceDataFile = await dataFilesClient.CreateAsync( + new FileParameter(sourceFileStream, SourceFileName), + FileFormat.Paratext, + SourceFileName, + cancellationToken + ); + sourceDataFileId = sourceDataFile.Id; + } + + // 1b. Create the target data file + Console.WriteLine("Create a target data file"); + const string TargetDirectory = "TMA"; + const string TargetFileName = $"{TargetDirectory}.zip"; + await using (var targetFileStream = new MemoryStream()) + { + ZipFile.CreateFromDirectory(Path.Combine("data", TargetDirectory), targetFileStream); + targetFileStream.Seek(0, SeekOrigin.Begin); + DataFile targetDataFile = await dataFilesClient.CreateAsync( + new FileParameter(targetFileStream, TargetFileName), + FileFormat.Paratext, + TargetFileName, + cancellationToken + ); + targetDataFileId = targetDataFile.Id; + } + + // 2a. Create the source corpus + // NOTE: The text id for the source and target corpora must match + Console.WriteLine("Create the source corpus"); + const string SourceLanguageCode = "en"; + var corpusConfig = new CorpusConfig + { + Name = "English Source Corpus", + Files = [new CorpusFileConfig { FileId = sourceDataFileId, TextId = "TestData" }], + Language = SourceLanguageCode, + }; + Corpus translationCorpus = await corporaClient.CreateAsync(corpusConfig, cancellationToken); + sourceCorpusId = translationCorpus.Id; + + // 2b. 
Create the target corpus + Console.WriteLine("Create the target corpus"); + const string TargetLanguageCode = "mi"; + corpusConfig = new CorpusConfig + { + Name = "Maori Target Corpus", + Files = [new CorpusFileConfig { FileId = targetDataFileId, TextId = "TestData" }], + Language = TargetLanguageCode, + }; + translationCorpus = await corporaClient.CreateAsync(corpusConfig, cancellationToken); + targetCorpusId = translationCorpus.Id; + + // 3. Create the translation engine + Console.WriteLine("Create the translation engine"); + var engineConfig = new TranslationEngineConfig + { + Name = "Test Engine", + SourceLanguage = SourceLanguageCode, + TargetLanguage = TargetLanguageCode, + Type = "nmt", + }; + TranslationEngine translationEngine = await translationEnginesClient.CreateAsync( + engineConfig, + cancellationToken + ); + translationEngineId = translationEngine.Id; + + // 4. Create the parallel corpus + TranslationParallelCorpus parallelCorpus = await translationEnginesClient.AddParallelCorpusAsync( + translationEngineId, + new TranslationParallelCorpusConfig + { + Name = "Test Parallel Corpus", + SourceCorpusIds = [sourceCorpusId], + TargetCorpusIds = [targetCorpusId], + }, + cancellationToken + ); + parallelCorpusId = parallelCorpus.Id; + + // 5. Start a build + Console.WriteLine("Start a build"); + + // NOTE: This build is restricted to 20 steps for speed of build + // The generated translation will be very, very inaccurate. 
+ JObject options = []; + options.Add("max_steps", 20); + + // We will train on one book, and translate two books + var translationBuildConfig = new TranslationBuildConfig + { + Name = "Test Build", + Options = options, + Pretranslate = + [ + new PretranslateCorpusConfig + { + ParallelCorpusId = parallelCorpusId, + SourceFilters = + [ + new ParallelCorpusFilterConfig { CorpusId = sourceCorpusId, ScriptureRange = "LAO;MAN" }, + ], + }, + ], + TrainOn = + [ + new TrainingCorpusConfig + { + ParallelCorpusId = parallelCorpusId, + SourceFilters = + [ + new ParallelCorpusFilterConfig { CorpusId = sourceCorpusId, ScriptureRange = "PS2" }, + ], + TargetFilters = + [ + new ParallelCorpusFilterConfig { CorpusId = targetCorpusId, ScriptureRange = "PS2" }, + ], + }, + ], + }; + TranslationBuild translationBuild = await translationEnginesClient.StartBuildAsync( + translationEngineId, + translationBuildConfig, + cancellationToken + ); + + // Wait until the build is finished + (int _, int cursorTop) = Console.GetCursorPosition(); + DateTime timeOut = DateTime.Now.AddMinutes(30); + while (DateTime.Now < timeOut) + { + translationBuild = await translationEnginesClient.GetBuildAsync( + translationEngineId, + translationBuild.Id, + minRevision: null, + cancellationToken + ); + if (translationBuild.DateFinished is not null) + { + break; + } + + Console.SetCursorPosition(0, cursorTop); + Console.WriteLine( + $"{translationBuild.State}: {(translationBuild.PercentCompleted ?? 0) * 100}% completed... 
" + ); + + // Wait 20 seconds + cancellationToken.WaitHandle.WaitOne(millisecondsTimeout: 20000); + } + + // Display the pre-translation USFM + string usfm = await translationEnginesClient.GetPretranslatedUsfmAsync( + translationEngineId, + parallelCorpusId, + textId: "LAO", + PretranslationUsfmTextOrigin.OnlyPretranslated, + PretranslationUsfmTemplate.Source, + cancellationToken + ); + Console.WriteLine(usfm); + + Console.WriteLine("Done!"); + } + catch (TaskCanceledException) + { + // The process was cancelled via Ctrl+C + } + finally + { + // Clean up created entities + if (!string.IsNullOrWhiteSpace(sourceDataFileId)) + { + Console.WriteLine("Delete the Source Data File"); + await dataFilesClient.DeleteAsync(sourceDataFileId, CancellationToken.None); + } + + if (!string.IsNullOrWhiteSpace(targetDataFileId)) + { + Console.WriteLine("Delete the Target Data File"); + await dataFilesClient.DeleteAsync(targetDataFileId, CancellationToken.None); + } + + if (!string.IsNullOrWhiteSpace(sourceCorpusId)) + { + Console.WriteLine("Delete the Source Corpus"); + await corporaClient.DeleteAsync(sourceCorpusId, CancellationToken.None); + } + + if (!string.IsNullOrWhiteSpace(targetCorpusId)) + { + Console.WriteLine("Delete the Target Corpus"); + await corporaClient.DeleteAsync(targetCorpusId, CancellationToken.None); + } + + if (!string.IsNullOrWhiteSpace(translationEngineId)) + { + if (!string.IsNullOrWhiteSpace(parallelCorpusId)) + { + Console.WriteLine("Delete the Parallel Corpus"); + await translationEnginesClient.DeleteParallelCorpusAsync( + translationEngineId, + parallelCorpusId, + CancellationToken.None + ); + } + + Console.WriteLine("Cancel the current build"); + await translationEnginesClient.CancelBuildAsync(translationEngineId, CancellationToken.None); + + Console.WriteLine("Delete the Translation Engine"); + await translationEnginesClient.DeleteAsync(translationEngineId, CancellationToken.None); + } + } +} diff --git a/samples/ApiExample/README.md 
b/samples/ApiExample/README.md new file mode 100644 index 00000000..9e45acac --- /dev/null +++ b/samples/ApiExample/README.md @@ -0,0 +1,24 @@ +# Serval API Example + +This example application will generate a pre-translation USFM draft using the Serval API, and display it in the terminal window. + +## Pre-Requisites + + * .NET SDK 8.0 + * You must have a Serval Client ID and Client Secret before running this example. + +## Setup + +Before running, you must configure your Serval Client Id and Client Secret via `dotnet user-secrets`: +``` +dotnet user-secrets set "Serval:ClientId" "your_client_id_here" +dotnet user-secrets set "Serval:ClientSecret" "your_client_secret_here" +``` + +## Run + +To run this example after configuring your user secrets, execute the following command from a terminal window: + +``` +dotnet run +``` diff --git a/samples/ApiExample/ServalOptions.cs b/samples/ApiExample/ServalOptions.cs new file mode 100644 index 00000000..3148fc18 --- /dev/null +++ b/samples/ApiExample/ServalOptions.cs @@ -0,0 +1,32 @@ +namespace ApiExample; + +/// +/// The Serval API options configured via dotnet user-secrets. +/// +public record ServalOptions +{ + /// + /// Gets the Serval API Server to use. + /// + public string ApiServer { get; init; } = string.Empty; + + /// + /// Gets the JWT audience. + /// + public string Audience { get; init; } = string.Empty; + + /// + /// Gets the JWT client identifier. + /// + public string ClientId { get; init; } = string.Empty; + + /// + /// Gets the JWT client secret. + /// + public string ClientSecret { get; init; } = string.Empty; + + /// + /// Gets or sets the endpoint to generate the JWT. 
+ /// + public string TokenUrl { get; init; } = string.Empty; +} diff --git a/samples/ApiExample/appsettings.json b/samples/ApiExample/appsettings.json new file mode 100644 index 00000000..9bbb173d --- /dev/null +++ b/samples/ApiExample/appsettings.json @@ -0,0 +1,7 @@ +{ + "Serval": { + "ApiServer": "https://qa.serval-api.org", + "Audience": "https://serval-api.org/", + "TokenUrl": "https://dev-sillsdev.auth0.com/oauth/token" + } +} diff --git a/samples/ApiExample/data/TEA/84MANTEA.SFM b/samples/ApiExample/data/TEA/84MANTEA.SFM new file mode 100644 index 00000000..e3a34715 --- /dev/null +++ b/samples/ApiExample/data/TEA/84MANTEA.SFM @@ -0,0 +1,66 @@ +\id MAN - Test English Apocrypha +\h Prayer of Manasseh +\toc1 Prayer of Manasseh +\toc2 Prayer of Manasseh +\toc3 Prayer of Manasseh +\mt1 Prayer of Manasseh\f + \fr 1.0 \ft Latin adds \fq King of Judah when he was held captive in Babylon\f* +\imt Introduction +\ip This prayer for forgiveness purports to be from King Manasseh during his imprisonment (see \xt 2 Chronicles 33:19\xt*), and appears to be originally written in Greek. It is found in the eighth chapter in the Book of Odes (chapter 12 in Rahlf’s edition), and is present in the Eastern Orthodox canon. +\c 1 +\q1 +\v 1 Lord Almighty,\f + \fr 1.1 \fq Almighty \ft Codex Alexandrinus adds \fq in heaven\f* +\q2 the God of our fathers:\x - \xo 1.1 \xt 2 Chr 33:12\x* +\q1 of Abraham, and Isaac, and Jacob,\x - \xo 1.1 \xt Ex 3:15, 16; Acts 3:13\x* +\q2 and of their righteous seed; +\q1 +\v 2 Who made heaven and the earth, and\f + \fr 1.2 \fq and \ft Greek \fq with\f* all the universe\f + \fr 1.2 \fq universe \ft Or \fqa adornment\fqa*. Greek \fq cosmos\fq*\f* within; +\q1 +\v 3 Who bound the sea by the word of your command,\x - \xo 1.3 \xt Job 33:8-11; Ps 74:12\x* +\q2 who closed the abyss and sealed it by your terrible and glorious name. 
+\q1 +\v 4 Who all things shudder and tremble before, because of your power; +\q1 +\v 5 For your majesty and glory is unbearable, +\q1 and the anger of your threat towards sinners is unendurable; +\q1 +\v 6 Both immeasurable and unsearchable is the mercy of your promise;\x - \xo 1.6 \xt Rom 11:33\x* +\q1 +\v 7 For you are the Lord Most High, +\q2 tender-hearted, longsuffering, abounding in mercy,\x - \xo 1.7 \xt Ex 34:6; Ps 86:15; Joel 2:13\x* +\q3 and you repent at the time of man’s trouble.\f + \fr 1.7 \ft Latin adds \fq Lord, according to your great goodness, you have promised repentance and forgiveness to those that have sinned against you, and in your infinite mercy have appointed repentance for sinners, so that they may be saved.\f* +\q1 +\v 8 Therefore you, Lord, the God of the righteous, +\q2 has not made repentance for the righteous,\x - \xo 1.8 \xt Lk 5:32\x* +\q1 for Abraham, and Isaac, and Jacob did not sin against you, +\q2 but you made repentance for me, a sinner. +\q1 +\v 9 Therefore my sins number more than the sand of the sea, +\q2 \f + \fr 1.9 \ft Codex Alexandrinus adds \fq For\f*my transgressions are multiplied, Lord, \add they\add*\f + \fr 1.9 \ft Latin reads \fq my transgressions\f* are multiplied,\f + \fr 1.9 \fq Lord, they are multiplied, \ft Codex Alexandrinus omits.\f*\x - \xo 1.9 \xt Is 59:12 \x* +\q1 and I am not worthy to look upon and see the height of heaven, +\q2 because of the multitude of my iniquities.\f + \fr 1.9 \ft Latin adds \fq Lord I now suffer justly, I deserve the trouble I receive, I am caught in a trap.\f*\x - \xo 1.9 \xt Ezra 9:6\x* +\q1 +\v 10 I am bowed down by many iron chains,\x - \xo 1.10 \xt 2 Chr 33:11\x* +\q2 I am rejected because of my sins,\f + \fr 1.10 \fq I am rejected because of my sins, \ft Latin reads \fq so that I cannot lift up my head,\f* +\q3 and I can find\f + \fr 1.10 \fq can find \ft Greek \fqa have\f* no rest; +\q1 Therefore I have kindled your anger, +\q2 I have done evil before you,\f + \fr 1.10 
\ft Latin adds \fq I did not your will\f* +\q3 setting up abominations and abominable things.\f + \fr 1.10 \fq abominable things. \ft Greek \fqa objects of anger\fqa*. This word is often translated abominations (see \xt 2 Kings 23:13\xt*)\f*\x - \xo 1.10 \xt 2 Ki 21:2-9; 2 Chr 33:2-9\x* +\q1 +\v 11 And now I bend the knee of my heart, to pray to you for your kindness,\x - \xo 1.11 \xt Sir 17:25\x* +\q1 +\v 12 I have sinned, Lord, I have sinned, +\q2 and I acknowledge my transgressions.\f + \fr 1.12 \ft Ps 51:3\f* +\q1 +\v 13 I ask you in prayer, +\q2 forgive me, Lord, forgive me, +\q1 do not destroy me for my transgressions, +\q2 neither stay angry with me forever, storing up evil for me, +\q3 and do not\f + \fr 1.13 \fq and do not \ft Greek \fqa neither\f* condemn me to the depths of the earth.\x - \xo 1.13 \xt Ps 63:9; Ps 88:6\x* +\q1 For you are, Lord,\f + \fr 1.13 \fq Lord \ft Latin reads \fq God\f* the God of those who repent; +\q2 +\v 14 And to me you will show your goodness. +\q1 For \add though I am\add* unworthy, \add you will\add* save me according to your abounding mercy. +\q2 +\v 15 And I will praise you for all of the days of my life. +\q1 For all of the host of heaven sing your praise,\x - \xo 1.15 \xt Ps 103:21; S3Y 39\x* +\q2 and yours is the glory forever.\f + \fr 1.15 \fq forever \ft Latin reads \fq forever and ever\f* Amen.\x - \xo 1.15 \xt Rom 11:36; 16:7\x* diff --git a/samples/ApiExample/data/TEA/85PS2TEA.SFM b/samples/ApiExample/data/TEA/85PS2TEA.SFM new file mode 100644 index 00000000..fed19599 --- /dev/null +++ b/samples/ApiExample/data/TEA/85PS2TEA.SFM @@ -0,0 +1,32 @@ +\id PS2 - Test English Apocrypha +\h Psalm 151 +\toc1 Psalm 151 +\toc2 Psalm 151 +\toc3 Psalm 151 +\mt1 Psalm 151 +\imt Introduction +\ip Psalm 151 is included in some Septuagint manuscripts, and is present in the Dead Sea Scrolls (4QPs\sup a\sup*) in both Hebrew (151A) and Syraic (151B). The following is a translation of the version found in the Septuagint. 
+\c 1 +\cp 151 +\d This psalm is written by David in his own hand (although it is outside the number), after he had fought one-on-one with Goliath.\f + \fr 1.1 \fq Goliath \ft Greek \fq Goliad\f* +\q1 +\v 1 Smallest among my brothers, and the youngest in my father’s house; +\q2 I shepherded my father’s sheep.\x - \xo 1.1 \xt 1 Sam 16:11\x* +\q1 +\v 2 My hands made a harp; +\q2 my fingers fashioned a lyre.\x - \xo 1.2 \xt 1 Sam 16:23\x* +\q1 +\v 3 And who will report to my Lord? +\q2 The Lord himself, he hears.\f + \fr 1.3 \fq hears \ft Codex Sinaiticus: \fqa hears everything.\fqa*; Codex Alexandrinus: \fqa who will hear me. \f* +\q1 +\v 4 He sent his messenger\f + \fr 1.4 \fq messenger \ft Or \fqa angel\f* \add to me\add*, took me from my father’s sheep, +\q2 and anointed me with olive oil.\x - \xo 1.4 \xt 1 Sam 16:13\x* +\q1 +\v 5 My brothers were handsome and great \add indeed\add*, +\q2 but with them the Lord was not pleased.\x - \xo 1.5 \xt 1 Sam 16:10\x* +\q1 +\v 6 I came out to meet the foreigner, +\q2 and he cursed me by his idols.\x - \xo 1.6 \xt 1 Sam 17:43\x* +\q1 +\v 7 But I drew his own sword, beheaded him,\x - \xo 1.7 \xt 1 Sam 17:51\x* +\q2 and took away disgrace from Israel’s sons. 
diff --git a/samples/ApiExample/data/TEA/BookNames.xml b/samples/ApiExample/data/TEA/BookNames.xml new file mode 100644 index 00000000..833a316b --- /dev/null +++ b/samples/ApiExample/data/TEA/BookNames.xml @@ -0,0 +1,126 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/samples/ApiExample/data/TEA/C3LAOTEA.SFM b/samples/ApiExample/data/TEA/C3LAOTEA.SFM new file mode 100644 index 00000000..f5209310 --- /dev/null +++ b/samples/ApiExample/data/TEA/C3LAOTEA.SFM @@ -0,0 +1,37 @@ +\id LAO - Test English Apocrypha +\h Laodiceans +\toc1 Laodiceans +\toc2 Laodiceans +\toc3 Laodiceans +\mt1 Epistle to the Laodiceans +\imt Introduction +\ip The following is a translation of the J.B. Lightfoot’s reverse translation of the surviving Latin translation of the Epistle to the Laodiceans into Koine Greek. This translation, published in his commentary on Colossians and Philemon (new edition, 1879) is based on the premise that the original epistle is a composition of quotations from the Pauline Epistles, compiled by an unknown author, purporting to be a letter from Paul to the church at Laodicea. +\c 1 +\po +\v 1 Paul, an apostle—not from men nor through man, but through Jesus Christ,\x - \xo 1.1 \xt Gal 1:1\x* to the brothers who are in Laodicea.\x - \xo 1.1 \xt Col 4:16\x* +\v 2 Grace to you and peace from God the\f + \fr 1.2 \fq the \ft Some manuscripts \fq our\f* Father and the Lord Jesus Christ.\x - \xo 1.2 \xt Gal 1:3; Phil 1:2 \x* +\p +\v 3 I give thanks to Christ in all my prayers,\x - \xo 1.3 \xt Phil 1:3\x* that you are continuing in him and persevering in his works, eagerly awaiting the promise \add of salvation\add*\x - \xo 1.3 \xt Gal 5:5\x* in the day of judgment.\x - \xo 1.3 \xt 2 Pet 2:9; 3:7; cf. 
Phil 2:16\x* +\p +\v 4 Neither do the vain discussions of certain men\x - \xo 1.4 \xt 1 Tim 1:6\x* deceive you, with their aim to turn you away\x - \xo 1.4 \xt 2 Tim 4:4\x* from the truth of the gospel\x - \xo 1.4 \xt Col 1:5; Gal 2:5, 14\x* which is preached by me.\x - \xo 1.4 \xt Gal 1:11 (cf. Gal 1:8)\x* +\v 5 So\f + \fr 1.5 \fq So \ft Greek: \fqa And \f* now God will work in those who are \add imitators\add*\x - \xo 1.5 \xt 1 Thes 2:14\x* of me\f + \fr 1.5 \fq imitators of me \ft Greek \fqa of mine\f* to advance the truth of the gospel,\x - \xo 1.5 \xt Phil 1:12\x* […]\f + \fr 1.5 \fq […] \ft A section appears to be missing, according to J.B. Lightfoot. \f* worshipping and practicing generosity—works of salvation [and]\f + \fr 1.5 \fq [and] \ft It is doubtful that this word was in the original Greek.\f* of eternal life. +\v 6 And now my imprisonment\f + \fr 1.6 \fq imprisonment \ft Greek \fqa chains\f* is widely known, which I suffer in Christ, in which I rejoice and am glad.\x - \xo 1.6 \xt Matt 5:12 cf. Phil 1:18\x* +\v 7 And this is for my eternal salvation, which will occur through your prayers, and the help of the Holy Spirit,\x - \xo 1.7 \xt Phil 1:19\x* whether by life or by death.\x - \xo 1.7 \xt Phil 1:20\x* +\v 8 For to me, to live is Christ, and to die is joy.\x - \xo 1.8 \xt Phil 1:21\x* +\v 9 And so he will work in you according to his mercy, that you may have the same love, and be in full accord.\x - \xo 1.9 \xt Phil 2:2\x* +\v 10 Therefore beloved, as you have obeyed in my presence,\x - \xo 1.10 \xt Phil 2:12\x* so work, remembering\x - \xo 1.10 \xt 2 Thes 2:5 (Vulgate)\x* the fear of God,\f + \fr 1.10 \fq God \ft J.B. 
Lightfoot’s Greek text has \fqa Lord\fqa*, but this is not present in any Latin manuscripts.\f* and it will be to you eternal life,\f + \fr 1.10 \fq life, \ft The Latin and Greek text end the sentence here.\f* +\v 11 for it is God who works in you.\x - \xo 1.11 \xt Phil 2:13\x* +\v 12 And do without grumbling,\x - \xo 1.12 \xt Phil 2:14\x* whatever you do.\x - \xo 1.12 \xt Col 3:17\x* +\p +\v 13 And finally, beloved, rejoice in Christ.\x - \xo 1.13 \xt Phil 3:1\x* Look out for those \add who are\add* greedy for dishonest gain.\x - \xo 1.13 \xt 1 Tim 3:8; Tit 1:7\x* +\v 14 Let all your requests be made known to God,\x - \xo 1.14 \xt Phil 4:6\x* and be steadfast\x - \xo 1.14 \xt 1 Cor 15:58\x* in the mind of Christ.\x - \xo 1.14 \xt 1 Cor 2:16\x* +\v 15 Whatever is sound, and true, and honourable, and just,\f + \fr 1.15 \ft Some manuscripts add \fq and pure\f* and lovely,\x - \xo 1.15 \xt Phil 4:8\x* practice these things.\x - \xo 1.15 \xt Phil 4:9\x* +\v 16 And what you have heard and received, hold in your heart, and peace will be with you. 
+\p +\v 17 [Greet the brothers.\x - \xo 1.17 \xt 1 Thes 5:26\x*]\f + \fr 1.17 \ft Most manuscripts omit verse 17.\f* +\p +\v 18 The saints greet you.\f + \fr 1.18 \ft One manuscript omits this verse.\f*\x - \xo 1.18 \xt Phil 4:22\x* +\p +\v 19 The grace of the Lord Jesus Christ\f + \fr 1.19 \ft Some manuscripts omit \fq Christ\f* be with your spirit.\x - \xo 1.19 \xt Phil 4:28\x* +\p +\v 20 And have this \add letter\add* read to the Colossians, and that of the Colossians to you.\f + \fr 1.20 \ft One manuscript adds \fq Amen.\fq*, another manuscript omits this verse.\f*\x - \xo 1.20 \xt Col 4:16\x* diff --git a/samples/ApiExample/data/TEA/CommentTags.xml b/samples/ApiExample/data/TEA/CommentTags.xml new file mode 100644 index 00000000..624f1523 --- /dev/null +++ b/samples/ApiExample/data/TEA/CommentTags.xml @@ -0,0 +1,5 @@ + + + + 1 + \ No newline at end of file diff --git a/samples/ApiExample/data/TEA/ProjectProgress.xml b/samples/ApiExample/data/TEA/ProjectProgress.xml new file mode 100644 index 00000000..bd16524a --- /dev/null +++ b/samples/ApiExample/data/TEA/ProjectProgress.xml @@ -0,0 +1,20 @@ + + + + None + + 000000000000000000000000000000000000000000000000000000000000000000000000000000000011000000000000000000000000000000000000001 + + + 000001111111110010000000000000010000000000000000000000000000000000111001111111001010100000000000000000000000000000000000000 + + + 110110000000001100000000000000000000000111010000000001111010001111000000000000110101000000000000000000000000000111111111111 + + + 001000000000000000111100001000000000101000100110000110000001110000000110000000000000000000000000000000000000000000000000000 + + + 000000000000000001000011110111101111010000001001111000000100000000000000000000000000010000000000000000011100000000000000000 + + \ No newline at end of file diff --git a/samples/ApiExample/data/TEA/ProjectUpdates.xml b/samples/ApiExample/data/TEA/ProjectUpdates.xml new file mode 100644 index 00000000..0bbf0e6e --- /dev/null +++ 
b/samples/ApiExample/data/TEA/ProjectUpdates.xml @@ -0,0 +1,7 @@ + + + 1FE40EDA-1D82-4ED8-95D1-5F44B8EC25CD + 207EF1E9-D931-41A0-920D-96BAEF744746 + 5C974ECE-A444-4E5A-B980-125E3CDEE7E2 + B946EEE7-B890-47FA-BBEF-8D0E6F729F82 + \ No newline at end of file diff --git a/samples/ApiExample/data/TEA/Settings.xml b/samples/ApiExample/data/TEA/Settings.xml new file mode 100644 index 00000000..43bbbf3d --- /dev/null +++ b/samples/ApiExample/data/TEA/Settings.xml @@ -0,0 +1,32 @@ + + usfm.sty + 4 + English + 8.0.100.76 + Test English Apocrypha + 65001 + T + + NFC + TEA + a7e9f1c362e728a143bb5eef7f6c79bcab2478fa + Charis SIL + 12 + + + en::: + 41MAT + + TEA.SFM + Major::BiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000011000000000000000000000000000000000000001 + + \ No newline at end of file diff --git a/samples/ApiExample/data/TEA/en.ldml b/samples/ApiExample/data/TEA/en.ldml new file mode 100644 index 00000000..87c6fb5a --- /dev/null +++ b/samples/ApiExample/data/TEA/en.ldml @@ -0,0 +1,26 @@ +[A-Za-z][!'-),-.\:;?\[\]\u00B4\u200C\u200D\u2014\u2018\u2019\u201C\u201D]['\-\u00B4\u2014][][][a b c d e f g h i j k l m n o p q r s t u v w x y z {aa} {bb} {cc} {dd} {ee} {ff} {gg} {hh} {ii} {jj} {kk} {ll} {mm} {nn} {oo} {pp} {qq} {rr} {ss} {tt} {uu} {vv} {ww} {xx} {yy} {zz}][][]left-to-rightstandard \ No newline at end of file diff --git a/samples/ApiExample/data/TEA/unique.id b/samples/ApiExample/data/TEA/unique.id new file mode 100644 index 00000000..66104d45 --- /dev/null +++ b/samples/ApiExample/data/TEA/unique.id @@ -0,0 +1 @@ +ed450f1c-1d1f-4ef1-87ac-a6b1d3b4735b \ No newline at end of file diff --git a/samples/ApiExample/data/TMA/84MANTMA.SFM b/samples/ApiExample/data/TMA/84MANTMA.SFM new file mode 100644 index 00000000..ce7aa080 --- /dev/null +++ 
b/samples/ApiExample/data/TMA/84MANTMA.SFM @@ -0,0 +1,48 @@ +\id MAN - Test Maori Apocrypha +\h +\mt1 +\imt +\ip +\c 1 +\q1 \v 1 +\q2 +\q1 +\q2 +\q1 \v 2 +\q1 \v 3 +\q2 +\q1 \v 4 +\q1 \v 5 +\q1 +\q1 \v 6 +\q1 \v 7 +\q2 +\q3 +\q1 \v 8 +\q2 +\q1 +\q2 +\q1 \v 9 +\q2 +\q1 +\q2 +\q1 \v 10 +\q2 +\q3 +\q1 +\q2 +\q3 +\q1 \v 11 +\q1 \v 12 +\q2 +\q1 \v 13 +\q2 +\q1 +\q2 +\q3 +\q1 +\q2 \v 14 +\q1 +\q2 \v 15 +\q1 +\q2 diff --git a/samples/ApiExample/data/TMA/85PS2TMA.SFM b/samples/ApiExample/data/TMA/85PS2TMA.SFM new file mode 100644 index 00000000..1a1922d6 --- /dev/null +++ b/samples/ApiExample/data/TMA/85PS2TMA.SFM @@ -0,0 +1,32 @@ +\id PS2 - Test Māori Apocrypha +\h NGA WAIATA 151 +\toc1 Ko Nga Waiata 151 +\toc2 Nga Waiata 151 +\toc3 Waiata 151 +\mt1 NGA WAIATA 151 +\imt Te Tīmatanga Kōrero +\ip +\c 1 +\cp 151 +\d Na Rawiri i tuhituhi tenei waiata ki tona ringa ake (ahakoa kei waho i te tatau), i muri i tana whawhai kotahi ki a Golia. +\q1 +\v 1 He i iti ahau waenga i oku tuākana, me te pōtiki i te whare o āku papa; +\q2 I tiaki ahau i nga hipi a toku papa. +\q1 +\v 2 I hanga e oku ringa te hapa; +\q2 i hanga e oku maihao he kutā. +\q1 +\v 3 A ma wai e korero ki toku Ariki? +\q2 Ko te Ariki tonu, e rongo ana ia. +\q1 +\v 4 I tono mai ia i tana karere ki ahau, ka tango mai i ahau i roto i nga hipi a toku papa, +\q2 a pania ana ahau e ia ki te hinu. +\q1 +\v 5 He ataahua, he nunui rawa oku teina; +\q2 otiia kihai te Ariki i ahuareka ki a ratou. +\q1 +\v 6 I haere mai ahau kia whakatau i te tangata iwi ke, +\q2 a kanga iho ahau e ia ki ana whakapakoko. +\q1 +\v 7 Na unuhia ana e ahau tana hoari, tapahia ana tona matenga e ahau, +\q2 a ka tangohia e ahau te tawai o nga tama a Iharaira. 
diff --git a/samples/ApiExample/data/TMA/BookNames.xml b/samples/ApiExample/data/TMA/BookNames.xml new file mode 100644 index 00000000..833a316b --- /dev/null +++ b/samples/ApiExample/data/TMA/BookNames.xml @@ -0,0 +1,126 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/samples/ApiExample/data/TMA/C3LAOTMA.SFM b/samples/ApiExample/data/TMA/C3LAOTMA.SFM new file mode 100644 index 00000000..9459c187 --- /dev/null +++ b/samples/ApiExample/data/TMA/C3LAOTMA.SFM @@ -0,0 +1,14 @@ +\id LAO - Test Maori Apocrypha +\h +\mt1 +\imt +\ip +\c 1 +\po \v 1 \v 2 +\p \v 3 +\p \v 4 \v 5 \v 6 \v 7 \v 8 \v 9 \v 10 \v 11 \v 12 +\p \v 13 \v 14 \v 15 \v 16 +\p \v 17 +\p \v 18 +\p \v 19 +\p \v 20 diff --git a/samples/ApiExample/data/TMA/CommentTags.xml b/samples/ApiExample/data/TMA/CommentTags.xml new file mode 100644 index 00000000..624f1523 --- /dev/null +++ b/samples/ApiExample/data/TMA/CommentTags.xml @@ -0,0 +1,5 @@ + + + + 1 + \ No newline at end of file diff --git a/samples/ApiExample/data/TMA/ProjectProgress.xml b/samples/ApiExample/data/TMA/ProjectProgress.xml new file mode 100644 index 00000000..bd16524a --- /dev/null +++ b/samples/ApiExample/data/TMA/ProjectProgress.xml @@ -0,0 +1,20 @@ + + + + None + + 000000000000000000000000000000000000000000000000000000000000000000000000000000000011000000000000000000000000000000000000001 + + + 000001111111110010000000000000010000000000000000000000000000000000111001111111001010100000000000000000000000000000000000000 + + + 110110000000001100000000000000000000000111010000000001111010001111000000000000110101000000000000000000000000000111111111111 + + + 001000000000000000111100001000000000101000100110000110000001110000000110000000000000000000000000000000000000000000000000000 + + + 
000000000000000001000011110111101111010000001001111000000100000000000000000000000000010000000000000000011100000000000000000 + + \ No newline at end of file diff --git a/samples/ApiExample/data/TMA/Settings.xml b/samples/ApiExample/data/TMA/Settings.xml new file mode 100644 index 00000000..a970e88e --- /dev/null +++ b/samples/ApiExample/data/TMA/Settings.xml @@ -0,0 +1,31 @@ + + usfm.sty + Maori + 8.0.100.76 + Test Maori Apocrypha + 65001 + T + + NFC + TMA + e1b3f0c799c4378a1757dd1b382c1dd515af37db + Charis SIL + 12 + + + mi::: + 41MAT + + TMA.SFM + Major::BiblicalTerms.xml + F + F + F + Public + Daughter:TEA:a7e9f1c362e728a143bb5eef7f6c79bcab2478fa + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000011000000000000000000000000000000000000001 + + \ No newline at end of file diff --git a/samples/ApiExample/data/TMA/mi.ldml b/samples/ApiExample/data/TMA/mi.ldml new file mode 100644 index 00000000..aa095e0e --- /dev/null +++ b/samples/ApiExample/data/TMA/mi.ldml @@ -0,0 +1,15 @@ +[AEHIKM-PRTUWaehikm-prtuw\u0100\u0101\u0112\u0113\u012A\u012B\u014C\u014D\u016A\u016B{ng}{wh}][!(-*,-.\:;?\u00B6\u200C\u200D\u2010\u2014][*\-][][a e h i k m n {ng} o p r t u w {wh}][a e h i k m n {ng} o p r t u w {wh}][][]left-to-rightstandard \ No newline at end of file diff --git a/samples/ApiExample/data/TMA/unique.id b/samples/ApiExample/data/TMA/unique.id new file mode 100644 index 00000000..d3b98c55 --- /dev/null +++ b/samples/ApiExample/data/TMA/unique.id @@ -0,0 +1 @@ +f2ca92e1-0778-4424-9096-a1e64feb6123 \ No newline at end of file diff --git a/samples/ServalApp/poetry.lock b/samples/ServalApp/poetry.lock index a0d60480..13a1ea86 100644 --- a/samples/ServalApp/poetry.lock +++ b/samples/ServalApp/poetry.lock @@ -706,8 +706,8 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.20.3", markers = "python_version < 
\"3.10\""}, - {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -1367,22 +1367,22 @@ files = [ [[package]] name = "tornado" -version = "6.4" +version = "6.4.2" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." optional = false -python-versions = ">= 3.8" +python-versions = ">=3.8" files = [ - {file = "tornado-6.4-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:02ccefc7d8211e5a7f9e8bc3f9e5b0ad6262ba2fbb683a6443ecc804e5224ce0"}, - {file = "tornado-6.4-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:27787de946a9cffd63ce5814c33f734c627a87072ec7eed71f7fc4417bb16263"}, - {file = "tornado-6.4-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7894c581ecdcf91666a0912f18ce5e757213999e183ebfc2c3fdbf4d5bd764e"}, - {file = "tornado-6.4-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e43bc2e5370a6a8e413e1e1cd0c91bedc5bd62a74a532371042a18ef19e10579"}, - {file = "tornado-6.4-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0251554cdd50b4b44362f73ad5ba7126fc5b2c2895cc62b14a1c2d7ea32f212"}, - {file = "tornado-6.4-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:fd03192e287fbd0899dd8f81c6fb9cbbc69194d2074b38f384cb6fa72b80e9c2"}, - {file = "tornado-6.4-cp38-abi3-musllinux_1_1_i686.whl", hash = "sha256:88b84956273fbd73420e6d4b8d5ccbe913c65d31351b4c004ae362eba06e1f78"}, - {file = "tornado-6.4-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:71ddfc23a0e03ef2df1c1397d859868d158c8276a0603b96cf86892bff58149f"}, - {file = "tornado-6.4-cp38-abi3-win32.whl", hash = 
"sha256:6f8a6c77900f5ae93d8b4ae1196472d0ccc2775cc1dfdc9e7727889145c45052"}, - {file = "tornado-6.4-cp38-abi3-win_amd64.whl", hash = "sha256:10aeaa8006333433da48dec9fe417877f8bcc21f48dda8d661ae79da357b2a63"}, - {file = "tornado-6.4.tar.gz", hash = "sha256:72291fa6e6bc84e626589f1c29d90a5a6d593ef5ae68052ee2ef000dfd273dee"}, + {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e828cce1123e9e44ae2a50a9de3055497ab1d0aeb440c5ac23064d9e44880da1"}, + {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:072ce12ada169c5b00b7d92a99ba089447ccc993ea2143c9ede887e0937aa803"}, + {file = "tornado-6.4.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a017d239bd1bb0919f72af256a970624241f070496635784d9bf0db640d3fec"}, + {file = "tornado-6.4.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c36e62ce8f63409301537222faffcef7dfc5284f27eec227389f2ad11b09d946"}, + {file = "tornado-6.4.2-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca9eb02196e789c9cb5c3c7c0f04fb447dc2adffd95265b2c7223a8a615ccbf"}, + {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:304463bd0772442ff4d0f5149c6f1c2135a1fae045adf070821c6cdc76980634"}, + {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:c82c46813ba483a385ab2a99caeaedf92585a1f90defb5693351fa7e4ea0bf73"}, + {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:932d195ca9015956fa502c6b56af9eb06106140d844a335590c1ec7f5277d10c"}, + {file = "tornado-6.4.2-cp38-abi3-win32.whl", hash = "sha256:2876cef82e6c5978fde1e0d5b1f919d756968d5b4282418f3146b79b58556482"}, + {file = "tornado-6.4.2-cp38-abi3-win_amd64.whl", hash = "sha256:908b71bf3ff37d81073356a5fadcc660eb10c1476ee6e2725588626ce7e5ca38"}, + {file = "tornado-6.4.2.tar.gz", hash = 
"sha256:92bad5b4746e9879fd7bf1eb21dce4e3fc5128d71601f80005afa39237ad620b"}, ] [[package]] @@ -1523,4 +1523,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.9.7 || >3.9.7,<4.0" -content-hash = "1a59c67f2dcec9f413c7918e000e267400866f2e15a5f09767f0c506f0bd9352" \ No newline at end of file +content-hash = "8c024ad81f66beff9f4cccfdf65629b8d9d87bf49ce3d5774a4d8ad35663be5d" diff --git a/samples/ServalApp/pyproject.toml b/samples/ServalApp/pyproject.toml index ba86a555..85ea229e 100644 --- a/samples/ServalApp/pyproject.toml +++ b/samples/ServalApp/pyproject.toml @@ -11,6 +11,7 @@ streamlit = "^1.31.1" requests = "^2.31.0" SQLAlchemy = "^2.0.22" pyarrow = "^14.0.1" +tornado = "^6.4.2" [tool.poetry.group.dev.dependencies] black = "^23.10.1" diff --git a/scripts/clearml_stats.py b/scripts/clearml_stats.py index c20c33dc..cb60196f 100644 --- a/scripts/clearml_stats.py +++ b/scripts/clearml_stats.py @@ -2,7 +2,7 @@ import json import os import pickle -from datetime import datetime, timezone +from datetime import datetime import numpy as np import pandas as pd @@ -47,6 +47,13 @@ class clearml_stats: def __init__(self): self._client: APIClient = APIClient() self._tasks: dict[str, dict] = self._read_tasks() + self._project_id_to_task_id: dict[str, list[str]] = {} + for task_id in self._tasks.keys(): + project_id = self._tasks[task_id]["project"] + if project_id in self._project_id_to_task_id: + self._project_id_to_task_id[project_id].append(task_id) + else: + self._project_id_to_task_id[project_id] = [task_id] self._projects: dict[str, dict] = self._read_projects() self._languages: pd.DataFrame = pd.read_excel( language_database_filename, index_col=0 @@ -306,24 +313,14 @@ def add_lang(lang): else: langs_by_occurrence[lang] = 1 - num_of_tasks_found = 0 - num_of_tasks_not_found = 0 for project_id in self._projects: self._projects[project_id]["src_lang"] = "unknown" 
self._projects[project_id]["trg_lang"] = "unknown" self._projects[project_id]["lang_candidates"] = [] project = self._projects[project_id] - if len(project["tasks"]) > 0: - task_not_found = True - for task_id in project["tasks"]: - if task_id in self._tasks.keys(): - task_not_found = False - break - if task_not_found: - num_of_tasks_not_found += 1 - continue - num_of_tasks_found += 1 + if project_id in self._project_id_to_task_id: + project["tasks"] = self._project_id_to_task_id[project_id] task = self._tasks[project["tasks"][0]] args = task["script_args"] if "src_lang" in args and "trg_lang" in args: @@ -491,3 +488,6 @@ def violin_task_delay_time_per_week( axes.set_ylim(0, 8) axes.set_ylabel("hours") axes.grid(True) + + +# %% diff --git a/src/Echo/src/EchoTranslationEngine/Program.cs b/src/Echo/src/EchoTranslationEngine/Program.cs index 6c6f3768..352c536a 100644 --- a/src/Echo/src/EchoTranslationEngine/Program.cs +++ b/src/Echo/src/EchoTranslationEngine/Program.cs @@ -10,6 +10,8 @@ builder.Services.AddHostedService(); builder.Services.AddSingleton(); +builder.Services.AddParallelCorpusPreprocessor(); + builder.Services.AddHealthChecks().AddCheck("Live", () => HealthCheckResult.Healthy()); builder.Services.Configure(builder.Configuration.GetSection("Bugsnag")); @@ -17,9 +19,6 @@ WebApplication app = builder.Build(); -// Configure the HTTP request pipeline. 
-app.UseHttpsRedirection(); - app.MapGrpcService(); app.MapGrpcService(); diff --git a/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs b/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs index 67779bc0..fb7abc66 100644 --- a/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs +++ b/src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs @@ -1,10 +1,16 @@ namespace EchoTranslationEngine; -public class TranslationEngineServiceV1(BackgroundTaskQueue taskQueue) : TranslationEngineApi.TranslationEngineApiBase +public class TranslationEngineServiceV1( + BackgroundTaskQueue taskQueue, + IParallelCorpusPreprocessingService parallelCorpusPreprocessingService +) : TranslationEngineApi.TranslationEngineApiBase { private static readonly Empty Empty = new(); private readonly BackgroundTaskQueue _taskQueue = taskQueue; + private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService = + parallelCorpusPreprocessingService; + public override Task Create(CreateRequest request, ServerCallContext context) { if (request.SourceLanguage != request.TargetLanguage) @@ -75,151 +81,34 @@ await client.BuildStartedAsync( try { + List pretranslationsRequests = []; + _parallelCorpusPreprocessingService.Preprocess( + request.Corpora.Select(Map).ToList(), + row => { }, + (row, corpus) => + { + pretranslationsRequests.Add( + new InsertPretranslationsRequest + { + EngineId = request.EngineId, + CorpusId = corpus.Id, + TextId = row.TextId, + Refs = { row.Refs.Select(r => r.ToString()) }, + Translation = row.SourceSegment + } + ); + }, + false + ); using ( AsyncClientStreamingCall call = client.InsertPretranslations(cancellationToken: cancellationToken) ) { - foreach (ParallelCorpus corpus in request.Corpora) + foreach (InsertPretranslationsRequest request in pretranslationsRequests) { - var sourceFiles = corpus - .SourceCorpora.SelectMany(sc => - sc.Files.Where(f => - (sc.PretranslateTextIds is null || 
sc.PretranslateTextIds.Contains(f.TextId)) - && f.Format == FileFormat.Text - ) - ) - .ToDictionary(f => f.TextId, f => f.Location); - var targetFiles = corpus - .TargetCorpora.SelectMany(tc => - tc.Files.Where(f => - (tc.PretranslateTextIds is null || tc.PretranslateTextIds.Contains(f.TextId)) - && f.Format == FileFormat.Text - ) - ) - .ToDictionary(f => f.TextId, f => f.Location); - - foreach (KeyValuePair sourceFile in sourceFiles) - { - string[] sourceLines = await File.ReadAllLinesAsync( - sourceFile.Value, - cancellationToken - ); - - if (targetFiles.TryGetValue(sourceFile.Key, out string? targetPath)) - { - string[] targetLines = await File.ReadAllLinesAsync(targetPath, cancellationToken); - bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/'); - if (!isTabSeparated) - { - int lineNum = 1; - foreach ( - (string sourceLine, string targetLine) in sourceLines - .Select(l => l.Trim()) - .Zip(targetLines.Select(l => l.Trim())) - ) - { - if (sourceLine.Length > 0 && targetLine.Length == 0) - { - await call.RequestStream.WriteAsync( - new InsertPretranslationsRequest - { - EngineId = request.EngineId, - CorpusId = corpus.Id, - TextId = sourceFile.Key, - Refs = { $"{sourceFile.Key}:{lineNum}" }, - Translation = sourceLine - }, - cancellationToken - ); - } - lineNum++; - } - } - else - { - var sourceLinesDict = sourceLines.ToDictionary( - l => l.Split('\t')[0].Trim(), - l => l.Split('\t')[1].Trim() - ); - var targetLinesDict = targetLines.ToDictionary( - l => l.Split('\t')[0].Trim(), - l => l.Contains('\t') ? l.Split('\t')[1].Trim() : string.Empty - ); - foreach (KeyValuePair targetLineKVPair in targetLinesDict) - { - string? sourceLine = null; - sourceLinesDict.TryGetValue(targetLineKVPair.Key, out sourceLine); - sourceLine ??= string.Empty; - string? 
targetLine = targetLineKVPair.Value; - if (sourceLine.Length > 0 && targetLine.Length == 0) - { - await call.RequestStream.WriteAsync( - new InsertPretranslationsRequest - { - EngineId = request.EngineId, - CorpusId = corpus.Id, - TextId = sourceFile.Key, - Refs = { $"{sourceFile.Key}:{targetLineKVPair.Key}" }, - Translation = sourceLine - }, - cancellationToken - ); - } - } - } - } - else - { - bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/'); - if (!isTabSeparated) - { - int lineNum = 1; - foreach (string sourceLine in sourceLines.Select(l => l.Trim())) - { - if (sourceLine.Length > 0) - { - await call.RequestStream.WriteAsync( - new InsertPretranslationsRequest - { - EngineId = request.EngineId, - CorpusId = corpus.Id, - TextId = sourceFile.Key, - Refs = { $"{sourceFile.Key}:{lineNum}" }, - Translation = sourceLine - }, - cancellationToken - ); - } - lineNum++; - } - } - else - { - foreach (string sourceLine in sourceLines.Select(l => l.Trim())) - { - if (sourceLine.Length > 0) - { - await call.RequestStream.WriteAsync( - new InsertPretranslationsRequest - { - EngineId = request.EngineId, - CorpusId = corpus.Id, - TextId = sourceFile.Key, - Refs = { $"{sourceFile.Key}:{sourceLine.Split('\t')[0]}" }, - Translation = sourceLine.Contains('\t') - ? 
sourceLine.Split('\t')[1].Trim() - : string.Empty - }, - cancellationToken - ); - } - } - } - } - } + await call.RequestStream.WriteAsync(request, cancellationToken); } - await call.RequestStream.CompleteAsync(); await call; } @@ -317,4 +206,78 @@ ServerCallContext context new GetLanguageInfoResponse { InternalCode = request.Language + "_echo", IsNative = true, } ); } + + private static SIL.ServiceToolkit.Models.ParallelCorpus Map(ParallelCorpus source) + { + return new SIL.ServiceToolkit.Models.ParallelCorpus + { + Id = source.Id, + SourceCorpora = source.SourceCorpora.Select(Map).ToList(), + TargetCorpora = source.TargetCorpora.Select(Map).ToList() + }; + } + + private static SIL.ServiceToolkit.Models.MonolingualCorpus Map(MonolingualCorpus source) + { + var trainOnChapters = source.TrainOnChapters.ToDictionary( + kvp => kvp.Key, + kvp => kvp.Value.Chapters.ToHashSet() + ); + var trainOnTextIds = source.TrainOnTextIds.ToHashSet(); + FilterChoice trainingFilter = GetFilterChoice(trainOnChapters, trainOnTextIds, source.TrainOnAll); + + var pretranslateChapters = source.PretranslateChapters.ToDictionary( + kvp => kvp.Key, + kvp => kvp.Value.Chapters.ToHashSet() + ); + var pretranslateTextIds = source.PretranslateTextIds.ToHashSet(); + FilterChoice pretranslateFilter = GetFilterChoice( + pretranslateChapters, + pretranslateTextIds, + source.PretranslateAll + ); + + return new SIL.ServiceToolkit.Models.MonolingualCorpus + { + Id = source.Id, + Language = source.Language, + Files = source.Files.Select(Map).ToList(), + TrainOnChapters = trainingFilter == FilterChoice.Chapters ? trainOnChapters : null, + TrainOnTextIds = trainingFilter == FilterChoice.TextIds ? trainOnTextIds : null, + PretranslateChapters = pretranslateFilter == FilterChoice.Chapters ? pretranslateChapters : null, + PretranslateTextIds = pretranslateFilter == FilterChoice.TextIds ? 
pretranslateTextIds : null + }; + } + + private static SIL.ServiceToolkit.Models.CorpusFile Map(CorpusFile source) + { + return new SIL.ServiceToolkit.Models.CorpusFile + { + Location = source.Location, + Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format, + TextId = source.TextId + }; + } + + private enum FilterChoice + { + Chapters, + TextIds, + None + } + + private static FilterChoice GetFilterChoice( + IReadOnlyDictionary> chapters, + HashSet textIds, + bool noFilter + ) + { + // Only either textIds or Scripture Range will be used at a time + // TextIds may be an empty array, so prefer that if both are empty (which applies to both scripture and text) + if (noFilter || (chapters is null && textIds is null)) + return FilterChoice.None; + if (chapters is null || chapters.Count == 0) + return FilterChoice.TextIds; + return FilterChoice.Chapters; + } } diff --git a/src/Echo/src/EchoTranslationEngine/Usings.cs b/src/Echo/src/EchoTranslationEngine/Usings.cs index b7f3ba2d..0404305b 100644 --- a/src/Echo/src/EchoTranslationEngine/Usings.cs +++ b/src/Echo/src/EchoTranslationEngine/Usings.cs @@ -5,3 +5,4 @@ global using Grpc.Core; global using Microsoft.Extensions.Diagnostics.HealthChecks; global using Serval.Translation.V1; +global using SIL.ServiceToolkit.Utils; diff --git a/src/Machine/src/Serval.Machine.EngineServer/Program.cs b/src/Machine/src/Serval.Machine.EngineServer/Program.cs index e36db6c2..b03f6575 100644 --- a/src/Machine/src/Serval.Machine.EngineServer/Program.cs +++ b/src/Machine/src/Serval.Machine.EngineServer/Program.cs @@ -35,8 +35,6 @@ var app = builder.Build(); -app.UseHttpsRedirection(); - app.MapServalTranslationEngineService(); app.MapHangfireDashboard(); diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilder.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilder.cs index f8dfbcd5..ce0180b5 100644 --- a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilder.cs +++ 
b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilder.cs @@ -3,5 +3,5 @@ public interface IMachineBuilder { IServiceCollection Services { get; } - IConfiguration? Configuration { get; } + IConfiguration Configuration { get; } } diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs index 5a577cb5..67b8ef3d 100644 --- a/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs +++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IMachineBuilderExtensions.cs @@ -1,63 +1,28 @@ -using Serval.Translation.V1; +using Polly.Extensions.Http; +using Serval.Translation.V1; namespace Microsoft.Extensions.DependencyInjection; public static class IMachineBuilderExtensions { - public static IMachineBuilder AddServiceOptions( - this IMachineBuilder builder, - Action configureOptions - ) - { - builder.Services.Configure(configureOptions); - return builder; - } - public static IMachineBuilder AddServiceOptions(this IMachineBuilder builder, IConfiguration config) { builder.Services.Configure(config); return builder; } - public static IMachineBuilder AddSmtTransferEngineOptions( - this IMachineBuilder builder, - Action configureOptions - ) - { - builder.Services.Configure(configureOptions); - return builder; - } - public static IMachineBuilder AddSmtTransferEngineOptions(this IMachineBuilder builder, IConfiguration config) { builder.Services.Configure(config); return builder; } - public static IMachineBuilder AddClearMLOptions( - this IMachineBuilder builder, - Action configureOptions - ) - { - builder.Services.Configure(configureOptions); - return builder; - } - public static IMachineBuilder AddClearMLOptions(this IMachineBuilder builder, IConfiguration config) { builder.Services.Configure(config); return builder; } - public static IMachineBuilder AddDistributedReaderWriterLockOptions( - this IMachineBuilder build, - 
Action configureOptions - ) - { - build.Services.Configure(configureOptions); - return build; - } - public static IMachineBuilder AddDistributedReaderWriterLockOptions( this IMachineBuilder build, IConfiguration config @@ -67,67 +32,33 @@ IConfiguration config return build; } - public static IMachineBuilder AddMessageOutboxOptions( - this IMachineBuilder builder, - Action configureOptions - ) - { - builder.Services.Configure(configureOptions); - return builder; - } - public static IMachineBuilder AddMessageOutboxOptions(this IMachineBuilder builder, IConfiguration config) { builder.Services.Configure(config); return builder; } - public static IMachineBuilder AddSharedFileOptions( - this IMachineBuilder builder, - Action configureOptions - ) - { - builder.Services.Configure(configureOptions); - return builder; - } - public static IMachineBuilder AddSharedFileOptions(this IMachineBuilder builder, IConfiguration config) { builder.Services.Configure(config); return builder; } - public static IMachineBuilder AddBuildJobOptions( - this IMachineBuilder builder, - Action configureOptions - ) - { - builder.Services.Configure(configureOptions); - return builder; - } - public static IMachineBuilder AddBuildJobOptions(this IMachineBuilder builder, IConfiguration config) { builder.Services.Configure(config); return builder; } - public static IMachineBuilder AddThotSmtModel(this IMachineBuilder builder) + public static IMachineBuilder AddServiceToolkitServices(this IMachineBuilder builder) { - if (builder.Configuration is null) - return builder.AddThotSmtModel(o => { }); - else - return builder.AddThotSmtModel(builder.Configuration.GetSection(ThotSmtModelOptions.Key)); + builder.Services.AddParallelCorpusPreprocessor(); + return builder; } - public static IMachineBuilder AddThotSmtModel( - this IMachineBuilder builder, - Action configureOptions - ) + public static IMachineBuilder AddThotSmtModel(this IMachineBuilder builder) { - builder.Services.Configure(configureOptions); - 
builder.Services.AddSingleton(); - return builder; + return builder.AddThotSmtModel(builder.Configuration.GetSection(ThotSmtModelOptions.Key)); } public static IMachineBuilder AddThotSmtModel(this IMachineBuilder builder, IConfiguration config) @@ -151,17 +82,38 @@ public static IMachineBuilder AddUnigramTruecaser(this IMachineBuilder builder) public static IMachineBuilder AddClearMLService(this IMachineBuilder builder, string? connectionString = null) { - connectionString ??= builder.Configuration?.GetConnectionString("ClearML"); + connectionString ??= builder.Configuration.GetConnectionString("ClearML"); if (connectionString is null) throw new InvalidOperationException("ClearML connection string is required"); + var policy = Policy + .Handle() + .OrTransientHttpStatusCode() + .OrResult(msg => msg.StatusCode == HttpStatusCode.TooManyRequests) + .WaitAndRetryAsync( + 7, + retryAttempt => TimeSpan.FromSeconds(2 * retryAttempt), // total 56, less than the 1 minute limit + onRetryAsync: (outcome, timespan, retryAttempt, context) => + { + if (retryAttempt < 3) + return Task.CompletedTask; + // Log the retry attempt + var serviceProvider = builder.Services.BuildServiceProvider(); + var logger = serviceProvider.GetService>(); + logger?.LogInformation( + "Retry {RetryAttempt} encountered an error. Waiting {Timespan} before next retry. Error: {ErrorMessage}", + retryAttempt, + timespan, + outcome.Exception?.Message + ); + return Task.CompletedTask; + } + ); + builder .Services.AddHttpClient("ClearML") .ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString!)) - // Add retry policy; fail after approx. 2 + 4 + 8 = 14 seconds - .AddTransientHttpErrorPolicy(b => - b.WaitAndRetryAsync(3, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt))) - ); + .AddPolicyHandler(policy); builder.Services.AddSingleton(); @@ -199,7 +151,7 @@ public static IMachineBuilder AddMongoHangfireJobClient( string? 
connectionString = null ) { - connectionString ??= builder.Configuration?.GetConnectionString("Hangfire"); + connectionString ??= builder.Configuration.GetConnectionString("Hangfire"); if (connectionString is null) throw new InvalidOperationException("Hangfire connection string is required"); @@ -220,7 +172,7 @@ public static IMachineBuilder AddHangfireJobServer( ) { engineTypes ??= - builder.Configuration?.GetSection("TranslationEngines").Get() + builder.Configuration.GetSection("TranslationEngines").Get() ?? [TranslationEngineType.SmtTransfer, TranslationEngineType.Nmt]; var queues = new List(); foreach (TranslationEngineType engineType in engineTypes.Distinct()) @@ -261,7 +213,7 @@ public static IMachineBuilder AddMemoryDataAccess(this IMachineBuilder builder) public static IMachineBuilder AddMongoDataAccess(this IMachineBuilder builder, string? connectionString = null) { - connectionString ??= builder.Configuration?.GetConnectionString("Mongo"); + connectionString ??= builder.Configuration.GetConnectionString("Mongo"); if (connectionString is null) throw new InvalidOperationException("Mongo connection string is required"); builder.Services.AddMongoDataAccess( @@ -316,7 +268,7 @@ public static IMachineBuilder AddServalPlatformService( string? connectionString = null ) { - connectionString ??= builder.Configuration?.GetConnectionString("Serval"); + connectionString ??= builder.Configuration.GetConnectionString("Serval"); if (connectionString is null) throw new InvalidOperationException("Serval connection string is required"); @@ -383,7 +335,7 @@ public static IMachineBuilder AddServalTranslationEngineService( builder.AddServalPlatformService(connectionString); engineTypes ??= - builder.Configuration?.GetSection("TranslationEngines").Get() + builder.Configuration.GetSection("TranslationEngines").Get() ?? 
[TranslationEngineType.SmtTransfer, TranslationEngineType.Nmt]; foreach (TranslationEngineType engineType in engineTypes.Distinct()) { @@ -422,7 +374,7 @@ public static IMachineBuilder AddBuildJobService(this IMachineBuilder builder, s if (smtTransferEngineDir is null) { var smtTransferEngineOptions = new SmtTransferEngineOptions(); - builder.Configuration?.GetSection(SmtTransferEngineOptions.Key).Bind(smtTransferEngineOptions); + builder.Configuration.GetSection(SmtTransferEngineOptions.Key).Bind(smtTransferEngineOptions); smtTransferEngineDir = smtTransferEngineOptions.EnginesDir; } string? driveLetter = Path.GetPathRoot(smtTransferEngineDir)?[..1]; diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs index 9ae176d8..8fcaced4 100644 --- a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs +++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs @@ -2,7 +2,7 @@ public static class IServiceCollectionExtensions { - public static IMachineBuilder AddMachine(this IServiceCollection services, IConfiguration? 
configuration = null) + public static IMachineBuilder AddMachine(this IServiceCollection services, IConfiguration configuration) { if (!Sldr.IsInitialized) Sldr.Initialize(); @@ -15,35 +15,20 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf services.AddTransient(); services.AddScoped(); - services.AddSingleton(); services.AddStartupTask( (sp, cancellationToken) => sp.GetRequiredService().InitAsync(cancellationToken) ); + services.AddParallelCorpusPreprocessor(); var builder = new MachineBuilder(services, configuration); - if (configuration is null) - { - builder.AddServiceOptions(o => { }); - builder.AddSharedFileOptions(o => { }); - builder.AddSmtTransferEngineOptions(o => { }); - builder.AddClearMLOptions(o => { }); - builder.AddDistributedReaderWriterLockOptions(o => { }); - builder.AddBuildJobOptions(o => { }); - builder.AddMessageOutboxOptions(o => { }); - } - else - { - builder.AddServiceOptions(configuration.GetSection(ServiceOptions.Key)); - builder.AddSharedFileOptions(configuration.GetSection(SharedFileOptions.Key)); - builder.AddSmtTransferEngineOptions(configuration.GetSection(SmtTransferEngineOptions.Key)); - builder.AddClearMLOptions(configuration.GetSection(ClearMLOptions.Key)); - builder.AddDistributedReaderWriterLockOptions( - configuration.GetSection(DistributedReaderWriterLockOptions.Key) - ); - builder.AddBuildJobOptions(configuration.GetSection(BuildJobOptions.Key)); - builder.AddMessageOutboxOptions(configuration.GetSection(MessageOutboxOptions.Key)); - } + builder.AddServiceOptions(configuration.GetSection(ServiceOptions.Key)); + builder.AddSharedFileOptions(configuration.GetSection(SharedFileOptions.Key)); + builder.AddSmtTransferEngineOptions(configuration.GetSection(SmtTransferEngineOptions.Key)); + builder.AddClearMLOptions(configuration.GetSection(ClearMLOptions.Key)); + builder.AddDistributedReaderWriterLockOptions(configuration.GetSection(DistributedReaderWriterLockOptions.Key)); + 
builder.AddBuildJobOptions(configuration.GetSection(BuildJobOptions.Key)); + builder.AddMessageOutboxOptions(configuration.GetSection(MessageOutboxOptions.Key)); return builder; } diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/MachineBuilder.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/MachineBuilder.cs index 58ddf5c1..5fece454 100644 --- a/src/Machine/src/Serval.Machine.Shared/Configuration/MachineBuilder.cs +++ b/src/Machine/src/Serval.Machine.Shared/Configuration/MachineBuilder.cs @@ -1,7 +1,7 @@ namespace Microsoft.Extensions.DependencyInjection; -internal class MachineBuilder(IServiceCollection services, IConfiguration? configuration) : IMachineBuilder +internal class MachineBuilder(IServiceCollection services, IConfiguration configuration) : IMachineBuilder { public IServiceCollection Services { get; } = services; - public IConfiguration? Configuration { get; } = configuration; + public IConfiguration Configuration { get; } = configuration; } diff --git a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj index 97d7fb64..f9eea0c5 100644 --- a/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj +++ b/src/Machine/src/Serval.Machine.Shared/Serval.Machine.Shared.csproj @@ -36,9 +36,9 @@ - - - + + + diff --git a/src/Machine/src/Serval.Machine.Shared/Services/ClearMLService.cs b/src/Machine/src/Serval.Machine.Shared/Services/ClearMLService.cs index 2b2b6718..66e1b350 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/ClearMLService.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/ClearMLService.cs @@ -161,7 +161,7 @@ public async Task> GetTasksForQueueAsync( var body = new JsonObject { ["queue"] = queueId }; JsonObject? result = await CallAsync("queues", "get_by_id", body, cancellationToken); var tasks = (JsonArray?)result?["data"]?["queue"]?["entries"]; - IEnumerable taskIds = tasks?.Select(t => (string)t?["id"]!) ?? 
new List(); + IEnumerable taskIds = tasks?.Select(t => (string)t?["task"]!) ?? new List(); return await GetTasksByIdAsync(taskIds, cancellationToken); } diff --git a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs index 3c46a34e..2e79d09a 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs @@ -7,8 +7,8 @@ public class NmtPreprocessBuildJob( ILogger logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - ICorpusService corpusService, - ILanguageTagService languageTagService + ILanguageTagService languageTagService, + IParallelCorpusPreprocessingService parallelCorpusPreprocessingService ) : PreprocessBuildJob( platformService, @@ -17,7 +17,7 @@ ILanguageTagService languageTagService logger, buildJobService, sharedFileService, - corpusService + parallelCorpusPreprocessingService ) { private readonly ILanguageTagService _languageTagService = languageTagService; diff --git a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs index d9e433ce..46baa68d 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs @@ -1,49 +1,35 @@ namespace Serval.Machine.Shared.Services; -public class PreprocessBuildJob : HangfireBuildJob> +public class PreprocessBuildJob( + IPlatformService platformService, + IRepository engines, + IDataAccessContext dataAccessContext, + ILogger logger, + IBuildJobService buildJobService, + ISharedFileService sharedFileService, + IParallelCorpusPreprocessingService parallelCorpusPreprocessingService +) + : HangfireBuildJob>( + platformService, + engines, + dataAccessContext, + buildJobService, + logger + ) { private static readonly 
JsonWriterOptions PretranslateWriterOptions = new() { Indented = true }; internal BuildJobRunnerType TrainJobRunnerType { get; init; } = BuildJobRunnerType.ClearML; - private readonly ISharedFileService _sharedFileService; - private readonly ICorpusService _corpusService; - private int _seed = 1234; - private Random _random; - - public PreprocessBuildJob( - IPlatformService platformService, - IRepository engines, - IDataAccessContext dataAccessContext, - ILogger logger, - IBuildJobService buildJobService, - ISharedFileService sharedFileService, - ICorpusService corpusService - ) - : base(platformService, engines, dataAccessContext, buildJobService, logger) - { - _sharedFileService = sharedFileService; - _corpusService = corpusService; - _random = new Random(_seed); - } + private readonly ISharedFileService _sharedFileService = sharedFileService; - internal int Seed - { - get => _seed; - set - { - if (_seed != value) - { - _seed = value; - _random = new Random(_seed); - } - } - } + private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService = + parallelCorpusPreprocessingService; protected override async Task DoWorkAsync( string engineId, string buildId, - IReadOnlyList data, + IReadOnlyList data, string? 
buildOptions, CancellationToken cancellationToken ) @@ -121,127 +107,21 @@ CancellationToken cancellationToken int trainCount = 0; int pretranslateCount = 0; pretranslateWriter.WriteStartArray(); - foreach (ParallelCorpus corpus in corpora) - { - (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] sourceCorpora = corpus - .SourceCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) - .ToArray(); - ITextCorpus[] sourceTrainingCorpora = sourceCorpora - .Select(sc => - { - ITextCorpus textCorpus = sc.TextCorpus; - if (sc.Corpus.TrainOnTextIds is not null) - textCorpus = textCorpus.FilterTexts(sc.Corpus.TrainOnTextIds); - return textCorpus.Where(row => - row.Ref is not ScriptureRef sr - || sc.Corpus.TrainOnChapters is null - || IsInChapters(sr, sc.Corpus.TrainOnChapters) - ); - }) - .ToArray(); - ITextCorpus[] sourcePretranslateCorpora = sourceCorpora - .Select(sc => - { - ITextCorpus textCorpus = sc.TextCorpus; - if (sc.Corpus.PretranslateTextIds is not null) - textCorpus = textCorpus.FilterTexts(sc.Corpus.PretranslateTextIds); - return textCorpus.Where(row => - row.Ref is not ScriptureRef sr - || sc.Corpus.PretranslateChapters is null - || ( - IsInChapters(sr, sc.Corpus.PretranslateChapters) - && !IsInChapters(sr, sc.Corpus.TrainOnChapters ?? 
new()) - ) - ); - }) - .ToArray(); - - (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] targetCorpora = corpus - .TargetCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) - .ToArray(); - ITextCorpus[] targetTrainingCorpora = targetCorpora - .Select(tc => - { - ITextCorpus textCorpus = tc.TextCorpus; - if (tc.Corpus.TrainOnTextIds is not null) - textCorpus = textCorpus.FilterTexts(tc.Corpus.TrainOnTextIds); - return textCorpus.Where(row => - row.Ref is not ScriptureRef sr - || tc.Corpus.TrainOnChapters is null - || IsInChapters(sr, tc.Corpus.TrainOnChapters) - ); - }) - .ToArray(); - - if (sourceCorpora.Length == 0) - continue; - - int skipCount = 0; - foreach (Row?[] rows in AlignTrainCorpus(sourceTrainingCorpora, targetTrainingCorpora)) + _parallelCorpusPreprocessingService.Preprocess( + corpora, + row => { - if (skipCount > 0) + if (row.SourceSegment.Length > 0 || row.TargetSegment.Length > 0) { - skipCount--; - continue; - } - - Row[] trainRows = rows.Where(r => r is not null).Cast().ToArray(); - if (trainRows.Length > 0) - { - Row row = trainRows[0]; - if (rows.Length > 1) - { - Row[] nonEmptyRows = trainRows.Where(r => r.SourceSegment.Length > 0).ToArray(); - Row[] targetNonEmptyRows = nonEmptyRows.Where(r => r.TargetSegment.Length > 0).ToArray(); - if (targetNonEmptyRows.Length > 0) - nonEmptyRows = targetNonEmptyRows; - if (nonEmptyRows.Length > 0) - { - nonEmptyRows = nonEmptyRows - .GroupBy(r => r.SourceSegment) - .Select(group => group.First()) - .ToArray(); - { - nonEmptyRows = nonEmptyRows - .GroupBy(r => r.SourceSegment) - .Select(group => group.First()) - .ToArray(); - row = nonEmptyRows[_random.Next(nonEmptyRows.Length)]; - } - } - } - - await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n"); - await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n"); - skipCount = row.RowCount - 1; - if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) - trainCount++; + 
sourceTrainWriter.Write($"{row.SourceSegment}\n"); + targetTrainWriter.Write($"{row.TargetSegment}\n"); } - } - - if ((bool?)buildOptionsObject?["use_key_terms"] ?? true) + if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) + trainCount++; + }, + (row, corpus) => { - ITextCorpus? sourceTermCorpus = _corpusService - .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList()) - .FirstOrDefault(); - ITextCorpus? targetTermCorpus = _corpusService - .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList()) - .FirstOrDefault(); - if (sourceTermCorpus is not null && targetTermCorpus is not null) - { - IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus); - foreach (ParallelTextRow row in parallelKeyTermsCorpus) - { - await sourceTrainWriter.WriteAsync($"{row.SourceText}\n"); - await targetTrainWriter.WriteAsync($"{row.TargetText}\n"); - trainCount++; - } - } - } - - foreach (Row row in AlignPretranslateCorpus(sourcePretranslateCorpora, targetCorpora[0].TextCorpus)) - { - if (row.SourceSegment.Length > 0) + if (row.SourceSegment.Length > 0 && row.TargetSegment.Length == 0) { pretranslateWriter.WriteStartObject(); pretranslateWriter.WriteString("corpusId", corpus.Id); @@ -254,21 +134,15 @@ row.Ref is not ScriptureRef sr pretranslateWriter.WriteEndObject(); pretranslateCount++; } - } - } + }, + (bool?)buildOptionsObject?["use_key_terms"] ?? true + ); pretranslateWriter.WriteEndArray(); return (trainCount, pretranslateCount); } - private static bool IsInChapters(ScriptureRef sr, Dictionary> selection) - { - return selection.TryGetValue(sr.Book, out HashSet? 
chapters) - && chapters != null - && (chapters.Count == 0 || chapters.Contains(sr.ChapterNum)); - } - protected override async Task CleanupAsync( string engineId, string buildId, @@ -289,189 +163,9 @@ JobCompletionStatus completionStatus } } - private static IEnumerable AlignTrainCorpus( - IReadOnlyList srcCorpora, - IReadOnlyList trgCorpora - ) - { - srcCorpora = srcCorpora.Select(sc => sc.Transform(CleanSegment)).ToArray(); - trgCorpora = trgCorpora.Select(tc => tc.Transform(CleanSegment)).ToArray(); - - if (trgCorpora.All(tc => tc.IsScripture())) - { - return srcCorpora - .SelectMany(sc => trgCorpora.Select(tc => AlignScripture(sc, tc))) - .ZipMany(rows => rows.ToArray()) - // filter out every list that only contains completely empty rows - .Where(rows => rows.Any(r => r is null || r.SourceSegment.Length > 0 || r.TargetSegment.Length > 0)); - } - - IEnumerable sourceOnlyRows = srcCorpora - .SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allSourceRows: true))) - .ZipMany(rows => - rows.Where(r => r.TargetSegment.Count == 0) - .Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1)) - .ToArray() - ); - - IEnumerable targetRows = srcCorpora - .SelectMany(sc => trgCorpora.Select(tc => sc.AlignRows(tc, allTargetRows: true))) - .ZipMany(rows => - rows.Where(r => r.TargetSegment.Count > 0) - .Select(r => new Row(r.TextId, r.Refs, r.SourceText, r.TargetText, 1)) - .ToArray() - ); - - return sourceOnlyRows - .Concat(targetRows) - // filter out every list that only contains completely empty rows - .Where(rows => rows.Any(r => r.SourceSegment.Length > 0 || r.TargetSegment.Length > 0)); - } - - private static IEnumerable AlignScripture(ITextCorpus srcCorpus, ITextCorpus trgCorpus) - { - int rowCount = 0; - StringBuilder srcSegBuffer = new(); - StringBuilder trgSegBuffer = new(); - HashSet vrefs = []; - foreach ( - (VerseRef vref, string srcSegment, string trgSegment) in srcCorpus - .ExtractScripture() - .Select(r => (r.CorpusVerseRef, r.Text)) - 
.Zip( - trgCorpus.ExtractScripture().Select(r => r.Text), - (s, t) => (VerseRef: s.CorpusVerseRef, SourceSegment: s.Text, TargetSegment: t) - ) - ) - { - if (srcSegment == "" && trgSegment == "") - { - vrefs.UnionWith(vref.AllVerses()); - rowCount++; - } - else if (srcSegment == "") - { - vrefs.UnionWith(vref.AllVerses()); - if (trgSegment.Length > 0) - { - if (trgSegBuffer.Length > 0) - trgSegBuffer.Append(' '); - trgSegBuffer.Append(trgSegment); - } - rowCount++; - } - else if (trgSegment == "") - { - vrefs.UnionWith(vref.AllVerses()); - if (srcSegment.Length > 0) - { - if (srcSegBuffer.Length > 0) - srcSegBuffer.Append(' '); - srcSegBuffer.Append(srcSegment); - } - rowCount++; - } - else - { - if (rowCount > 0) - { - yield return new( - vrefs.First().Book, - vrefs.Order().Select(v => new ScriptureRef(v)).Cast().ToArray(), - srcSegBuffer.ToString(), - trgSegBuffer.ToString(), - rowCount - ); - for (int i = 0; i < rowCount - 1; i++) - yield return null; - srcSegBuffer.Clear(); - trgSegBuffer.Clear(); - vrefs.Clear(); - rowCount = 0; - } - vrefs.UnionWith(vref.AllVerses()); - srcSegBuffer.Append(srcSegment); - trgSegBuffer.Append(trgSegment); - rowCount++; - } - } - - if (rowCount > 0) - { - yield return new( - vrefs.First().Book, - vrefs.Order().Select(v => new ScriptureRef(v)).Cast().ToArray(), - srcSegBuffer.ToString(), - trgSegBuffer.ToString(), - rowCount - ); - for (int i = 0; i < rowCount - 1; i++) - yield return null; - } - } - - private static IEnumerable AlignPretranslateCorpus(ITextCorpus[] srcCorpora, ITextCorpus trgCorpus) - { - int rowCount = 0; - StringBuilder srcSegBuffer = new(); - StringBuilder trgSegBuffer = new(); - List refs = []; - string textId = ""; - foreach (ParallelTextRow row in srcCorpora.SelectMany(sc => sc.AlignRows(trgCorpus, allSourceRows: true))) - { - if (!row.IsTargetRangeStart && row.IsTargetInRange) - { - refs.AddRange(row.TargetRefs); - if (row.SourceText.Length > 0) - { - if (srcSegBuffer.Length > 0) - srcSegBuffer.Append(' 
'); - srcSegBuffer.Append(row.SourceText); - } - rowCount++; - } - else - { - if (rowCount > 0) - { - yield return new(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); - textId = ""; - srcSegBuffer.Clear(); - trgSegBuffer.Clear(); - refs.Clear(); - rowCount = 0; - } - - textId = row.TextId; - refs.AddRange(row.TargetRefs); - srcSegBuffer.Append(row.SourceText); - trgSegBuffer.Append(row.TargetText); - rowCount++; - } - } - - if (rowCount > 0) - yield return new(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); - } - - private record Row( - string TextId, - IReadOnlyList Refs, - string SourceSegment, - string TargetSegment, - int RowCount - ); - protected virtual bool ResolveLanguageCodeForBaseModel(string languageCode, out string resolvedCode) { resolvedCode = languageCode; return true; } - - private static TextRow CleanSegment(TextRow row) - { - if (row.Text == "...") - row.Segment = []; - return row; - } } diff --git a/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs b/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs index bced613b..336d98ae 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs @@ -91,7 +91,7 @@ await engineService.TrainSegmentPairAsync( public override async Task StartBuild(StartBuildRequest request, ServerCallContext context) { ITranslationEngineService engineService = GetEngineService(request.EngineType); - Models.ParallelCorpus[] corpora = request.Corpora.Select(Map).ToArray(); + SIL.ServiceToolkit.Models.ParallelCorpus[] corpora = request.Corpora.Select(Map).ToArray(); try { await engineService.StartBuildAsync( @@ -269,9 +269,9 @@ private static Translation.V1.Phrase Map(SIL.Machine.Translation.Phrase source) }; } - private static Models.ParallelCorpus Map(Translation.V1.ParallelCorpus source) + private static 
SIL.ServiceToolkit.Models.ParallelCorpus Map(Translation.V1.ParallelCorpus source) { - return new Models.ParallelCorpus + return new SIL.ServiceToolkit.Models.ParallelCorpus { Id = source.Id, SourceCorpora = source.SourceCorpora.Select(Map).ToList(), @@ -279,23 +279,27 @@ private static Models.ParallelCorpus Map(Translation.V1.ParallelCorpus source) }; } - private static Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus source) + private static SIL.ServiceToolkit.Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus source) { var trainOnChapters = source.TrainOnChapters.ToDictionary( kvp => kvp.Key, kvp => kvp.Value.Chapters.ToHashSet() ); var trainOnTextIds = source.TrainOnTextIds.ToHashSet(); - FilterChoice trainingFilter = GetFilterChoice(trainOnChapters, trainOnTextIds); + FilterChoice trainingFilter = GetFilterChoice(trainOnChapters, trainOnTextIds, source.TrainOnAll); var pretranslateChapters = source.PretranslateChapters.ToDictionary( kvp => kvp.Key, kvp => kvp.Value.Chapters.ToHashSet() ); var pretranslateTextIds = source.PretranslateTextIds.ToHashSet(); - FilterChoice pretranslateFilter = GetFilterChoice(pretranslateChapters, pretranslateTextIds); + FilterChoice pretranslateFilter = GetFilterChoice( + pretranslateChapters, + pretranslateTextIds, + source.PretranslateAll + ); - return new Models.MonolingualCorpus + return new SIL.ServiceToolkit.Models.MonolingualCorpus { Id = source.Id, Language = source.Language, @@ -307,12 +311,12 @@ private static Models.MonolingualCorpus Map(Translation.V1.MonolingualCorpus sou }; } - private static Models.CorpusFile Map(Translation.V1.CorpusFile source) + private static SIL.ServiceToolkit.Models.CorpusFile Map(Translation.V1.CorpusFile source) { - return new Models.CorpusFile + return new SIL.ServiceToolkit.Models.CorpusFile { Location = source.Location, - Format = (Models.FileFormat)source.Format, + Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format, TextId = source.TextId }; } @@ -326,12 
+330,13 @@ private enum FilterChoice private static FilterChoice GetFilterChoice( IReadOnlyDictionary> chapters, - HashSet textIds + HashSet textIds, + bool noFilter ) { // Only either textIds or Scripture Range will be used at a time // TextIds may be an empty array, so prefer that if both are empty (which applies to both scripture and text) - if (chapters is null && textIds is null) + if (noFilter || (chapters is null && textIds is null)) return FilterChoice.None; if (chapters is null || chapters.Count == 0) return FilterChoice.TextIds; diff --git a/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs index b9393e9b..7e1627a6 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs @@ -7,9 +7,9 @@ public class SmtTransferPreprocessBuildJob( ILogger logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - ICorpusService corpusService, IDistributedReaderWriterLockFactory lockFactory, - IRepository trainSegmentPairs + IRepository trainSegmentPairs, + IParallelCorpusPreprocessingService parallelCorpusPreprocessingService ) : PreprocessBuildJob( platformService, @@ -18,7 +18,7 @@ IRepository trainSegmentPairs logger, buildJobService, sharedFileService, - corpusService + parallelCorpusPreprocessingService ) { private readonly IDistributedReaderWriterLockFactory _lockFactory = lockFactory; diff --git a/src/Machine/src/Serval.Machine.Shared/Usings.cs b/src/Machine/src/Serval.Machine.Shared/Usings.cs index ea49e89d..bb148b80 100644 --- a/src/Machine/src/Serval.Machine.Shared/Usings.cs +++ b/src/Machine/src/Serval.Machine.Shared/Usings.cs @@ -54,7 +54,7 @@ global using SIL.Machine.Translation; global using SIL.Machine.Translation.Thot; global using SIL.Machine.Utils; -global using SIL.Scripture; +global using 
SIL.ServiceToolkit.Models; global using SIL.ServiceToolkit.Services; global using SIL.ServiceToolkit.Utils; global using SIL.WritingSystems; diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs index 67145c01..f05a8cb3 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs @@ -301,8 +301,8 @@ public override object ActivateJob(Type jobType) Substitute.For>(), _env.BuildJobService, _env.SharedFileService, - Substitute.For(), - new LanguageTagService() + new LanguageTagService(), + new ParallelCorpusPreprocessingService(new CorpusService()) ); } if (jobType == typeof(PostprocessBuildJob)) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 539b9c4c..02669cb4 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -65,7 +65,7 @@ public async Task RunAsync_TrainAndPretranslateAll() await env.RunBuildJobAsync(corpus1); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(4)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(2)); } [Test] @@ -87,7 +87,24 @@ public async Task RunAsync_PretranslateTextIds() await env.RunBuildJobAsync(corpus1); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(4)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(2)); + } + + [Test] + public async Task RunAsync_PretranslateTextIdsOverlapWithTrainOnTextIds() + { + using TestEnvironment env = new(); + ParallelCorpus corpus1 = TestEnvironment.TextFileCorpus( + pretranslateTextIds: ["textId1"], + trainOnTextIds: ["textId1"] + ); + + await 
env.RunBuildJobAsync(corpus1); + Assert.Multiple(async () => + { + Assert.That((await env.GetTrainCountAsync()).Source1Count, Is.EqualTo(4)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(2)); + }); } [Test] @@ -143,7 +160,11 @@ public async Task RunAsync_PretranslateChapters() await env.RunBuildJobAsync(corpus1); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(4)); + Assert.That( + await env.GetPretranslateCountAsync(), + Is.EqualTo(4), + JsonSerializer.Serialize(await env.GetPretranslationsAsync()) + ); } [Test] @@ -184,12 +205,12 @@ public async Task RunAsync_MixedSource_Paratext() (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); Assert.Multiple(() => { - Assert.That(src1Count, Is.EqualTo(5)); - Assert.That(src2Count, Is.EqualTo(12)); + Assert.That(src1Count, Is.EqualTo(7)); + Assert.That(src2Count, Is.EqualTo(13)); Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(56)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(15)); } [Test] @@ -203,12 +224,12 @@ public async Task RunAsync_MixedSource_Text() (int src1Count, int src2Count, int trgCount, int termCount) = await env.GetTrainCountAsync(); Assert.Multiple(() => { - Assert.That(src1Count, Is.EqualTo(3)); - Assert.That(src2Count, Is.EqualTo(2)); + Assert.That(src1Count, Is.EqualTo(1)); + Assert.That(src2Count, Is.EqualTo(4)); Assert.That(trgCount, Is.EqualTo(1)); Assert.That(termCount, Is.EqualTo(0)); }); - Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(9)); + Assert.That(await env.GetPretranslateCountAsync(), Is.EqualTo(3)); } [Test] @@ -267,7 +288,7 @@ public async Task RunAsync_RemoveFreestandingEllipses() ); JsonArray? 
pretranslations = await env.GetPretranslationsAsync(); Assert.That(pretranslations, Is.Not.Null); - Assert.That(pretranslations.Count, Is.EqualTo(0)); + Assert.That(pretranslations!.Count, Is.EqualTo(1)); } [Test] @@ -388,6 +409,13 @@ public async Task ParallelCorpusLogic() new() { } } }, + PretranslateChapters = new() + { + { + "1CH", + new() { } + } + } }, }, TargetCorpora = new List() @@ -434,26 +462,29 @@ public async Task ParallelCorpusLogic() } }; await env.RunBuildJobAsync(corpora, useKeyTerms: false); + JsonArray? pretranslations = await env.GetPretranslationsAsync(); Assert.Multiple(async () => { + string src = await env.GetSourceExtractAsync(); Assert.That( - await env.GetSourceExtractAsync(), + src, Is.EqualTo( @"Source one, chapter fourteen, verse fifty-five. Segment b. Source one, chapter fourteen, verse fifty-six. -Source one, chapter one, verse one. +Source two, chapter one, verse one. Source two, chapter one, verse two. Source two, chapter one, verse three. -Source two, chapter one, verse four. +Source one, chapter one, verse four. Source two, chapter one, verse five. Source two, chapter one, verse six. -Source two, chapter one, verse seven. Source two, chapter one, verse eight. -Source two, chapter one, verse nine. Source two, chapter one, verse ten. +Source one, chapter one, verse seven, eight, and nine. Source one, chapter one, verse ten. Source two, chapter one, verse one. " - ) + ), + src ); + string trg = await env.GetTargetExtractAsync(); Assert.That( - await env.GetTargetExtractAsync(), + trg, Is.EqualTo( @"Target two, chapter fourteen, verse fifty-five. Target two, chapter fourteen, verse fifty-six. @@ -462,20 +493,19 @@ await env.GetTargetExtractAsync(), Target one, chapter one, verse three. Target one, chapter one, verse five and six. -Target one, chapter one, verse seven and eight. -Target one, chapter one, verse nine and ten. +Target one, chapter one, verse seven and eight. Target one, chapter one, verse nine and ten. 
" - ) + ), + trg + ); + Assert.That(pretranslations, Is.Not.Null); + Assert.That(pretranslations!.Count, Is.EqualTo(7)); + Assert.That( + pretranslations[2]!["translation"]!.ToString(), + Is.EqualTo("Source one, chapter twelve, verse one.") ); }); - JsonArray? pretranslations = await env.GetPretranslationsAsync(); - Assert.That(pretranslations, Is.Not.Null); - Assert.That(pretranslations!.Count, Is.EqualTo(37), pretranslations.ToJsonString()); - Assert.That( - pretranslations[2]!["translation"]!.ToString(), - Is.EqualTo("Source one, chapter twelve, verse one.") - ); } private class TestEnvironment : DisposableBase @@ -781,12 +811,9 @@ public PreprocessBuildJob GetBuildJob(TranslationEngineType engineType) Substitute.For>(), BuildJobService, SharedFileService, - CorpusService, - new LanguageTagService() - ) - { - Seed = 1234 - }; + new LanguageTagService(), + new ParallelCorpusPreprocessingService(CorpusService) + ); } case TranslationEngineType.SmtTransfer: { @@ -797,13 +824,10 @@ public PreprocessBuildJob GetBuildJob(TranslationEngineType engineType) Substitute.For>(), BuildJobService, SharedFileService, - CorpusService, LockFactory, - TrainSegmentPairs - ) - { - Seed = 1234 - }; + TrainSegmentPairs, + new ParallelCorpusPreprocessingService(CorpusService) + ); } default: throw new InvalidOperationException("Unknown engine type."); @@ -1010,7 +1034,8 @@ public async Task GetTargetExtractAsync() public async Task GetPretranslateCountAsync() { - return (await GetPretranslationsAsync())?.Count ?? 0; + var pretranslations = await GetPretranslationsAsync(); + return pretranslations?.Count ?? 
0; } private void ZipParatextProject(string name) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs index 6b888794..17c89ed4 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs @@ -687,9 +687,9 @@ public override object ActivateJob(Type jobType) Substitute.For>(), _env.BuildJobService, _env.SharedFileService, - Substitute.For(), _env._lockFactory, - _env.TrainSegmentPairs + _env.TrainSegmentPairs, + new ParallelCorpusPreprocessingService(new CorpusService()) ) { TrainJobRunnerType = _env._trainJobRunnerType diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Usings.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Usings.cs index f58cb973..3ccb5537 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Usings.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Usings.cs @@ -28,4 +28,6 @@ global using SIL.Machine.Utils; global using SIL.ObjectModel; global using SIL.Scripture; +global using SIL.ServiceToolkit.Models; +global using SIL.ServiceToolkit.Services; global using SIL.WritingSystems; diff --git a/src/Serval/src/Serval.Assessment/Configuration/IServalBuilderExtensions.cs b/src/Serval/src/Serval.Assessment/Configuration/IServalBuilderExtensions.cs index d770433d..ee82803b 100644 --- a/src/Serval/src/Serval.Assessment/Configuration/IServalBuilderExtensions.cs +++ b/src/Serval/src/Serval.Assessment/Configuration/IServalBuilderExtensions.cs @@ -5,27 +5,17 @@ namespace Microsoft.Extensions.DependencyInjection; public static class IServalBuilderExtensions { - public static IServalBuilder AddAssessment(this IServalBuilder builder, Action? 
configure = null) + public static IServalBuilder AddAssessment(this IServalBuilder builder) { - if (builder.Configuration is null) - { - builder.AddApiOptions(o => { }); - builder.AddDataFileOptions(o => { }); - } - else - { - builder.AddApiOptions(builder.Configuration.GetSection(ApiOptions.Key)); - builder.AddDataFileOptions(builder.Configuration.GetSection(DataFileOptions.Key)); - } + builder.AddApiOptions(builder.Configuration.GetSection(ApiOptions.Key)); + builder.AddDataFileOptions(builder.Configuration.GetSection(DataFileOptions.Key)); builder.Services.AddScoped(); builder.Services.AddScoped(); builder.Services.AddScoped(); var assessmentOptions = new AssessmentOptions(); - builder.Configuration?.GetSection(AssessmentOptions.Key).Bind(assessmentOptions); - if (configure is not null) - configure(assessmentOptions); + builder.Configuration.GetSection(AssessmentOptions.Key).Bind(assessmentOptions); foreach (EngineInfo engine in assessmentOptions.Engines) { diff --git a/src/Serval/src/Serval.Client/Client.g.cs b/src/Serval/src/Serval.Client/Client.g.cs index c2d3dd6e..ee4ce398 100644 --- a/src/Serval/src/Serval.Client/Client.g.cs +++ b/src/Serval/src/Serval.Client/Client.g.cs @@ -4218,7 +4218,7 @@ public partial interface ITranslationEnginesClient /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Add a corpus to a translation engine + /// Add a corpus to a translation engine (obsolete - use parallel corpora instead) /// /// /// ## Parameters @@ -4242,20 +4242,22 @@ public partial interface ITranslationEnginesClient /// The corpus configuration (see remarks) /// The added corpus /// A server side error occurred. 
+ [System.Obsolete] System.Threading.Tasks.Task AddCorpusAsync(string id, TranslationCorpusConfig corpusConfig, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)); /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Get all corpora for a translation engine + /// Get all corpora for a translation engine (obsolete - use parallel corpora instead) /// /// The translation engine id /// The corpora /// A server side error occurred. + [System.Obsolete] System.Threading.Tasks.Task> GetAllCorporaAsync(string id, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)); /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Update a corpus with a new set of files + /// Update a corpus with a new set of files (obsolete - use parallel corpora instead) /// /// /// See posting a new corpus for details of use. Will completely replace corpus' file associations. @@ -4266,16 +4268,18 @@ public partial interface ITranslationEnginesClient /// The corpus configuration /// The corpus was updated successfully /// A server side error occurred. + [System.Obsolete] System.Threading.Tasks.Task UpdateCorpusAsync(string id, string corpusId, TranslationCorpusUpdateConfig corpusConfig, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)); /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Get the configuration of a corpus for a translation engine + /// Get the configuration of a corpus for a translation engine (obsolete - use parallel corpora instead) /// /// The translation engine id /// The corpus id /// The corpus configuration /// A server side error occurred. 
+ [System.Obsolete] System.Threading.Tasks.Task GetCorpusAsync(string id, string corpusId, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)); /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. @@ -4355,7 +4359,7 @@ public partial interface ITranslationEnginesClient /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Get all pretranslations in a corpus of a translation engine + /// Get all pretranslations in a corpus or parallel corpus of a translation engine /// /// /// Pretranslations are arranged in a list of dictionaries with the following fields per pretranslation: @@ -4369,7 +4373,7 @@ public partial interface ITranslationEnginesClient ///
Only pretranslations for the most recent successful build of the engine are returned. ///
/// The translation engine id - /// The corpus id + /// The corpus id or parallel corpus id /// The text id (optional) /// The pretranslations /// A server side error occurred. @@ -4377,7 +4381,7 @@ public partial interface ITranslationEnginesClient /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Get all pretranslations for the specified text in a corpus of a translation engine + /// Get all pretranslations for the specified text in a corpus or parallel corpus of a translation engine /// /// /// Pretranslations are arranged in a list of dictionaries with the following fields per pretranslation: @@ -4390,7 +4394,7 @@ public partial interface ITranslationEnginesClient ///
Only pretranslations for the most recent successful build of the engine are returned. ///
/// The translation engine id - /// The corpus id + /// The corpus id or parallel corpus id /// The text id /// The pretranslations /// A server side error occurred. @@ -4416,7 +4420,7 @@ public partial interface ITranslationEnginesClient ///
Both scripture and non-scripture text in the USFM is parsed and grouped according to [this wiki](https://github.com/sillsdev/serval/wiki/USFM-Parsing-and-Translation). ///
/// The translation engine id - /// The corpus id + /// The corpus id or parallel corpus id /// The text id /// The source[s] of the data to populate the USFM file with. /// The book in USFM format @@ -4437,10 +4441,21 @@ public partial interface ITranslationEnginesClient /// Starts a build job for a translation engine. /// /// - /// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora). If no "trainOn" field is provided, all corpora will be used. - ///
Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. - ///
Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) - ///
All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. + /// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either (legacy) corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora). + ///
Specifying a corpus: + ///
* A (legacy) corpus is selected by specifying CorpusId and a parallel corpus is selected by specifying ParallelCorpusId. + ///
* A parallel corpus can be further filtered by specifying particular CorpusIds in SourceFilters or TargetFilters. + ///
+ ///
Filtering by textId or chapter: + ///
* Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. + ///
* Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) + ///
* All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. + ///
+ ///
Filter - train on all or none + ///
* If trainOn or pretranslate is not provided, all corpora will be used for training or pretranslation respectively. + ///
* If a corpus is selected for training or pretranslation and neither scriptureRange nor textIds are defined, all of the selected corpus will be used. + ///
* If a corpus is selected for training or pretranslation and an empty scriptureRange or textIds is defined, none of the selected corpus will be used. + ///
* If a corpus is selected for training or pretranslation but no further filters are provided, all selected corpora will be used for training or pretranslation respectively. ///
///
Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation, ///
the following text will be pretranslated: @@ -5531,7 +5546,7 @@ public string BaseUrl /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Add a corpus to a translation engine + /// Add a corpus to a translation engine (obsolete - use parallel corpora instead) /// /// /// ## Parameters @@ -5555,6 +5570,7 @@ public string BaseUrl /// The corpus configuration (see remarks) /// The added corpus /// A server side error occurred. + [System.Obsolete] public virtual async System.Threading.Tasks.Task AddCorpusAsync(string id, TranslationCorpusConfig corpusConfig, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) { if (id == null) @@ -5667,11 +5683,12 @@ public string BaseUrl /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Get all corpora for a translation engine + /// Get all corpora for a translation engine (obsolete - use parallel corpora instead) /// /// The translation engine id /// The corpora /// A server side error occurred. + [System.Obsolete] public virtual async System.Threading.Tasks.Task> GetAllCorporaAsync(string id, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) { if (id == null) @@ -5771,7 +5788,7 @@ public string BaseUrl /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Update a corpus with a new set of files + /// Update a corpus with a new set of files (obsolete - use parallel corpora instead) /// /// /// See posting a new corpus for details of use. Will completely replace corpus' file associations. @@ -5782,6 +5799,7 @@ public string BaseUrl /// The corpus configuration /// The corpus was updated successfully /// A server side error occurred. 
+ [System.Obsolete] public virtual async System.Threading.Tasks.Task UpdateCorpusAsync(string id, string corpusId, TranslationCorpusUpdateConfig corpusConfig, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) { if (id == null) @@ -5898,12 +5916,13 @@ public string BaseUrl /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Get the configuration of a corpus for a translation engine + /// Get the configuration of a corpus for a translation engine (obsolete - use parallel corpora instead) /// /// The translation engine id /// The corpus id /// The corpus configuration /// A server side error occurred. + [System.Obsolete] public virtual async System.Threading.Tasks.Task GetCorpusAsync(string id, string corpusId, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) { if (id == null) @@ -6688,7 +6707,7 @@ public string BaseUrl /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Get all pretranslations in a corpus of a translation engine + /// Get all pretranslations in a corpus or parallel corpus of a translation engine /// /// /// Pretranslations are arranged in a list of dictionaries with the following fields per pretranslation: @@ -6702,7 +6721,7 @@ public string BaseUrl ///
Only pretranslations for the most recent successful build of the engine are returned. ///
/// The translation engine id - /// The corpus id + /// The corpus id or parallel corpus id /// The text id (optional) /// The pretranslations /// A server side error occurred. @@ -6822,7 +6841,7 @@ public string BaseUrl /// A cancellation token that can be used by other objects or threads to receive notice of cancellation. /// - /// Get all pretranslations for the specified text in a corpus of a translation engine + /// Get all pretranslations for the specified text in a corpus or parallel corpus of a translation engine /// /// /// Pretranslations are arranged in a list of dictionaries with the following fields per pretranslation: @@ -6835,7 +6854,7 @@ public string BaseUrl ///
Only pretranslations for the most recent successful build of the engine are returned. ///
/// The translation engine id - /// The corpus id + /// The corpus id or parallel corpus id /// The text id /// The pretranslations /// A server side error occurred. @@ -6971,7 +6990,7 @@ public string BaseUrl ///
Both scripture and non-scripture text in the USFM is parsed and grouped according to [this wiki](https://github.com/sillsdev/serval/wiki/USFM-Parsing-and-Translation). ///
/// The translation engine id - /// The corpus id + /// The corpus id or parallel corpus id /// The text id /// The source[s] of the data to populate the USFM file with. /// The book in USFM format @@ -7217,10 +7236,21 @@ public string BaseUrl /// Starts a build job for a translation engine. /// /// - /// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora). If no "trainOn" field is provided, all corpora will be used. - ///
Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. - ///
Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) - ///
All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. + /// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either (legacy) corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora). + ///
Specifying a corpus: + ///
* A (legacy) corpus is selected by specifying CorpusId and a parallel corpus is selected by specifying ParallelCorpusId. + ///
* A parallel corpus can be further filtered by specifying particular CorpusIds in SourceFilters or TargetFilters. + ///
+ ///
Filtering by textId or chapter: + ///
* Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. + ///
* Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) + ///
* All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. + ///
+ ///
Filter - train on all or none + ///
* If trainOn or pretranslate is not provided, all corpora will be used for training or pretranslation respectively. + ///
* If a corpus is selected for training or pretranslation and neither scriptureRange nor textIds are defined, all of the selected corpus will be used. + ///
* If a corpus is selected for training or pretranslation and an empty scriptureRange or textIds is defined, none of the selected corpus will be used. + ///
* If a corpus is selected for training or pretranslation but no further filters are provided, all selected corpora will be used for training or pretranslation respectively. ///
///
Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation, ///
the following text will be pretranslated: @@ -9816,18 +9846,24 @@ public partial class TranslationBuild [Newtonsoft.Json.JsonProperty("options", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] public object? Options { get; set; } = default!; + [Newtonsoft.Json.JsonProperty("deploymentVersion", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] + public string? DeploymentVersion { get; set; } = default!; + } [System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.1.0.0 (NJsonSchema v11.0.2.0 (Newtonsoft.Json v13.0.0.0))")] public partial class TrainingCorpus { [Newtonsoft.Json.JsonProperty("corpus", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] + [System.Obsolete] public ResourceLink? Corpus { get; set; } = default!; [Newtonsoft.Json.JsonProperty("textIds", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] + [System.Obsolete] public System.Collections.Generic.IList? TextIds { get; set; } = default!; [Newtonsoft.Json.JsonProperty("scriptureRange", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] + [System.Obsolete] public string? ScriptureRange { get; set; } = default!; [Newtonsoft.Json.JsonProperty("parallelCorpus", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] @@ -9860,12 +9896,15 @@ public partial class ParallelCorpusFilter public partial class PretranslateCorpus { [Newtonsoft.Json.JsonProperty("corpus", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] + [System.Obsolete] public ResourceLink? 
Corpus { get; set; } = default!; [Newtonsoft.Json.JsonProperty("textIds", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] + [System.Obsolete] public System.Collections.Generic.IList? TextIds { get; set; } = default!; [Newtonsoft.Json.JsonProperty("scriptureRange", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] + [System.Obsolete] public string? ScriptureRange { get; set; } = default!; [Newtonsoft.Json.JsonProperty("parallelCorpus", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] @@ -9897,12 +9936,15 @@ public partial class TranslationBuildConfig public partial class TrainingCorpusConfig { [Newtonsoft.Json.JsonProperty("corpusId", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] + [System.Obsolete] public string? CorpusId { get; set; } = default!; [Newtonsoft.Json.JsonProperty("textIds", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] + [System.Obsolete] public System.Collections.Generic.IList? TextIds { get; set; } = default!; [Newtonsoft.Json.JsonProperty("scriptureRange", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] + [System.Obsolete] public string? ScriptureRange { get; set; } = default!; [Newtonsoft.Json.JsonProperty("parallelCorpusId", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] @@ -9935,12 +9977,15 @@ public partial class ParallelCorpusFilterConfig public partial class PretranslateCorpusConfig { [Newtonsoft.Json.JsonProperty("corpusId", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] + [System.Obsolete] public string? 
CorpusId { get; set; } = default!; [Newtonsoft.Json.JsonProperty("textIds", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] + [System.Obsolete] public System.Collections.Generic.IList? TextIds { get; set; } = default!; [Newtonsoft.Json.JsonProperty("scriptureRange", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] + [System.Obsolete] public string? ScriptureRange { get; set; } = default!; [Newtonsoft.Json.JsonProperty("parallelCorpusId", Required = Newtonsoft.Json.Required.Default, NullValueHandling = Newtonsoft.Json.NullValueHandling.Ignore)] diff --git a/src/Serval/src/Serval.Client/Serval.Client.csproj b/src/Serval/src/Serval.Client/Serval.Client.csproj index 0a72b611..13feff18 100644 --- a/src/Serval/src/Serval.Client/Serval.Client.csproj +++ b/src/Serval/src/Serval.Client/Serval.Client.csproj @@ -2,7 +2,7 @@ netstandard2.1 - 1.5.0 + 1.8.0 Client classes for Serval. 
Serval.Client Serval diff --git a/src/Serval/src/Serval.DataFiles/Configuration/IServalBuilderExtensions.cs b/src/Serval/src/Serval.DataFiles/Configuration/IServalBuilderExtensions.cs index 91756a6c..11af65e1 100644 --- a/src/Serval/src/Serval.DataFiles/Configuration/IServalBuilderExtensions.cs +++ b/src/Serval/src/Serval.DataFiles/Configuration/IServalBuilderExtensions.cs @@ -4,10 +4,7 @@ public static class IServalBuilderExtensions { public static IServalBuilder AddDataFiles(this IServalBuilder builder) { - if (builder.Configuration is null) - builder.AddDataFileOptions(o => { }); - else - builder.AddDataFileOptions(builder.Configuration.GetSection(DataFileOptions.Key)); + builder.AddDataFileOptions(builder.Configuration.GetSection(DataFileOptions.Key)); builder.Services.AddScoped(); builder.Services.AddHostedService(); diff --git a/src/Serval/src/Serval.Grpc/Protos/serval/translation/v1/engine.proto b/src/Serval/src/Serval.Grpc/Protos/serval/translation/v1/engine.proto index 98918f0c..609a3fc0 100644 --- a/src/Serval/src/Serval.Grpc/Protos/serval/translation/v1/engine.proto +++ b/src/Serval/src/Serval.Grpc/Protos/serval/translation/v1/engine.proto @@ -161,6 +161,8 @@ message ParallelCorpus { message MonolingualCorpus { string id = 1; string language = 2; + bool train_on_all = 3; + bool pretranslate_all = 4; map train_on_chapters = 5; map pretranslate_chapters = 6; repeated string train_on_text_ids = 7; diff --git a/src/Serval/src/Serval.Shared/Configuration/IServalBuilder.cs b/src/Serval/src/Serval.Shared/Configuration/IServalBuilder.cs index 116fc6d4..f37283e3 100644 --- a/src/Serval/src/Serval.Shared/Configuration/IServalBuilder.cs +++ b/src/Serval/src/Serval.Shared/Configuration/IServalBuilder.cs @@ -3,5 +3,5 @@ public interface IServalBuilder { IServiceCollection Services { get; } - IConfiguration? 
Configuration { get; } + IConfiguration Configuration { get; } } diff --git a/src/Serval/src/Serval.Shared/Configuration/IServalBuilderExtensions.cs b/src/Serval/src/Serval.Shared/Configuration/IServalBuilderExtensions.cs index 2f226ab4..4a611f25 100644 --- a/src/Serval/src/Serval.Shared/Configuration/IServalBuilderExtensions.cs +++ b/src/Serval/src/Serval.Shared/Configuration/IServalBuilderExtensions.cs @@ -2,27 +2,12 @@ public static class IServalBuilderExtensions { - public static IServalBuilder AddDataFileOptions( - this IServalBuilder builder, - Action configureOptions - ) - { - builder.Services.Configure(configureOptions); - return builder; - } - public static IServalBuilder AddDataFileOptions(this IServalBuilder builder, IConfiguration config) { builder.Services.Configure(config); return builder; } - public static IServalBuilder AddApiOptions(this IServalBuilder builder, Action configureOptions) - { - builder.Services.Configure(configureOptions); - return builder; - } - public static IServalBuilder AddApiOptions(this IServalBuilder builder, IConfiguration config) { builder.Services.Configure(config); @@ -43,7 +28,7 @@ public static IServalBuilder AddMongoDataAccess( Action configure ) { - string? mongoConnectionString = builder.Configuration?.GetConnectionString("Mongo"); + string? 
mongoConnectionString = builder.Configuration.GetConnectionString("Mongo"); if (mongoConnectionString is null) throw new InvalidOperationException("Mongo connection string not configured"); builder.Services.AddMongoDataAccess(mongoConnectionString, "Serval", configure); diff --git a/src/Serval/src/Serval.Shared/Configuration/IServiceCollectionExtensions.cs b/src/Serval/src/Serval.Shared/Configuration/IServiceCollectionExtensions.cs index 2671ac40..3a7ce339 100644 --- a/src/Serval/src/Serval.Shared/Configuration/IServiceCollectionExtensions.cs +++ b/src/Serval/src/Serval.Shared/Configuration/IServiceCollectionExtensions.cs @@ -2,7 +2,7 @@ public static class IServiceCollectionExtensions { - public static IServalBuilder AddServal(this IServiceCollection services, IConfiguration? configuration = null) + public static IServalBuilder AddServal(this IServiceCollection services, IConfiguration configuration) { services.AddTransient(); services.AddTransient(); diff --git a/src/Serval/src/Serval.Shared/Configuration/ServalBuilder.cs b/src/Serval/src/Serval.Shared/Configuration/ServalBuilder.cs index b4fe3747..48c5123d 100644 --- a/src/Serval/src/Serval.Shared/Configuration/ServalBuilder.cs +++ b/src/Serval/src/Serval.Shared/Configuration/ServalBuilder.cs @@ -1,7 +1,7 @@ namespace Microsoft.Extensions.DependencyInjection; -internal class ServalBuilder(IServiceCollection services, IConfiguration? configuration) : IServalBuilder +internal class ServalBuilder(IServiceCollection services, IConfiguration configuration) : IServalBuilder { public IServiceCollection Services { get; } = services; - public IConfiguration? 
Configuration { get; } = configuration; + public IConfiguration Configuration { get; } = configuration; } diff --git a/src/Serval/src/Serval.Shared/Serval.Shared.csproj b/src/Serval/src/Serval.Shared/Serval.Shared.csproj index 6ea07ec4..75ccbd9b 100644 --- a/src/Serval/src/Serval.Shared/Serval.Shared.csproj +++ b/src/Serval/src/Serval.Shared/Serval.Shared.csproj @@ -19,7 +19,7 @@ - + diff --git a/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs b/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs index 0236d65d..2788ed49 100644 --- a/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs +++ b/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs @@ -5,21 +5,10 @@ namespace Microsoft.Extensions.DependencyInjection; public static class IServalBuilderExtensions { - public static IServalBuilder AddTranslation( - this IServalBuilder builder, - Action? configure = null - ) + public static IServalBuilder AddTranslation(this IServalBuilder builder) { - if (builder.Configuration is null) - { - builder.AddApiOptions(o => { }); - builder.AddDataFileOptions(o => { }); - } - else - { - builder.AddApiOptions(builder.Configuration.GetSection(ApiOptions.Key)); - builder.AddDataFileOptions(builder.Configuration.GetSection(DataFileOptions.Key)); - } + builder.AddApiOptions(builder.Configuration.GetSection(ApiOptions.Key)); + builder.AddDataFileOptions(builder.Configuration.GetSection(DataFileOptions.Key)); builder.Services.AddScoped(); builder.Services.AddScoped(); @@ -29,9 +18,7 @@ public static IServalBuilder AddTranslation( builder.Services.AddSingleton(); var translationOptions = new TranslationOptions(); - builder.Configuration?.GetSection(TranslationOptions.Key).Bind(translationOptions); - if (configure is not null) - configure(translationOptions); + builder.Configuration.GetSection(TranslationOptions.Key).Bind(translationOptions); foreach (EngineInfo engine in 
translationOptions.Engines) { diff --git a/src/Serval/src/Serval.Translation/Contracts/PretranslateCorpusConfigDto.cs b/src/Serval/src/Serval.Translation/Contracts/PretranslateCorpusConfigDto.cs index a88ebe3b..58756e3a 100644 --- a/src/Serval/src/Serval.Translation/Contracts/PretranslateCorpusConfigDto.cs +++ b/src/Serval/src/Serval.Translation/Contracts/PretranslateCorpusConfigDto.cs @@ -2,10 +2,13 @@ public record PretranslateCorpusConfigDto { + [Obsolete] public string? CorpusId { get; init; } + [Obsolete] public IReadOnlyList? TextIds { get; init; } + [Obsolete] public string? ScriptureRange { get; init; } public string? ParallelCorpusId { get; init; } diff --git a/src/Serval/src/Serval.Translation/Contracts/PretranslateCorpusDto.cs b/src/Serval/src/Serval.Translation/Contracts/PretranslateCorpusDto.cs index 9aa6f939..14fde716 100644 --- a/src/Serval/src/Serval.Translation/Contracts/PretranslateCorpusDto.cs +++ b/src/Serval/src/Serval.Translation/Contracts/PretranslateCorpusDto.cs @@ -2,10 +2,13 @@ public record PretranslateCorpusDto { + [Obsolete] public ResourceLinkDto? Corpus { get; init; } + [Obsolete] public IReadOnlyList? TextIds { get; init; } + [Obsolete] public string? ScriptureRange { get; init; } public ResourceLinkDto? ParallelCorpus { get; init; } diff --git a/src/Serval/src/Serval.Translation/Contracts/TrainingCorpusConfigDto.cs b/src/Serval/src/Serval.Translation/Contracts/TrainingCorpusConfigDto.cs index c8161a5f..a70bf5ab 100644 --- a/src/Serval/src/Serval.Translation/Contracts/TrainingCorpusConfigDto.cs +++ b/src/Serval/src/Serval.Translation/Contracts/TrainingCorpusConfigDto.cs @@ -2,8 +2,13 @@ namespace Serval.Translation.Contracts; public record TrainingCorpusConfigDto { + [Obsolete] public string? CorpusId { get; init; } + + [Obsolete] public IReadOnlyList? TextIds { get; init; } + + [Obsolete] public string? ScriptureRange { get; init; } public string? 
ParallelCorpusId { get; init; } diff --git a/src/Serval/src/Serval.Translation/Contracts/TrainingCorpusDto.cs b/src/Serval/src/Serval.Translation/Contracts/TrainingCorpusDto.cs index f734f43b..f958a07b 100644 --- a/src/Serval/src/Serval.Translation/Contracts/TrainingCorpusDto.cs +++ b/src/Serval/src/Serval.Translation/Contracts/TrainingCorpusDto.cs @@ -2,8 +2,13 @@ namespace Serval.Translation.Contracts; public record TrainingCorpusDto { + [Obsolete] public ResourceLinkDto? Corpus { get; init; } + + [Obsolete] public IReadOnlyList? TextIds { get; init; } + + [Obsolete] public string? ScriptureRange { get; init; } public ResourceLinkDto? ParallelCorpus { get; init; } diff --git a/src/Serval/src/Serval.Translation/Contracts/TranslationBuildDto.cs b/src/Serval/src/Serval.Translation/Contracts/TranslationBuildDto.cs index 741ff4ba..eb009161 100644 --- a/src/Serval/src/Serval.Translation/Contracts/TranslationBuildDto.cs +++ b/src/Serval/src/Serval.Translation/Contracts/TranslationBuildDto.cs @@ -27,4 +27,5 @@ public record TranslationBuildDto /// } /// public object? Options { get; init; } + public string? 
DeploymentVersion { get; init; } } diff --git a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs index 0affde0f..b0401647 100644 --- a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs +++ b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs @@ -1,5 +1,7 @@ namespace Serval.Translation.Controllers; +#pragma warning disable CS0612 // Type or member is obsolete + [ApiVersion(1.0)] [Route("api/v{version:apiVersion}/translation/engines")] [OpenApiTag("Translation Engines")] @@ -9,6 +11,7 @@ public class TranslationEnginesController( IBuildService buildService, IPretranslationService pretranslationService, IOptionsMonitor apiOptions, + IConfiguration configuration, IUrlService urlService, ILogger logger ) : ServalControllerBase(authService) @@ -22,6 +25,7 @@ ILogger logger private readonly IOptionsMonitor _apiOptions = apiOptions; private readonly IUrlService _urlService = urlService; private readonly ILogger _logger = logger; + private readonly IConfiguration _configuration = configuration; /// /// Get all translation engines @@ -313,7 +317,7 @@ await _engineService.TrainSegmentPairAsync( } /// - /// Add a corpus to a translation engine + /// Add a corpus to a translation engine (obsolete - use parallel corpora instead) /// /// /// ## Parameters @@ -344,6 +348,7 @@ await _engineService.TrainSegmentPairAsync( /// The authenticated client cannot perform the operation or does not own the translation engine. /// The engine does not exist. /// A necessary service is currently unavailable. Check `/health` for more details. + [Obsolete("This endpoint is obsolete. 
Use parallel corpora instead.")] [Authorize(Scopes.UpdateTranslationEngines)] [HttpPost("{id}/corpora")] [ProducesResponseType(StatusCodes.Status201Created)] @@ -369,7 +374,7 @@ CancellationToken cancellationToken } /// - /// Update a corpus with a new set of files + /// Update a corpus with a new set of files (obsolete - use parallel corpora instead) /// /// /// See posting a new corpus for details of use. Will completely replace corpus' file associations. @@ -386,6 +391,7 @@ CancellationToken cancellationToken /// The authenticated client cannot perform the operation or does not own the translation engine. /// The engine or corpus does not exist. /// A necessary service is currently unavailable. Check `/health` for more details. + [Obsolete("This endpoint is obsolete. Use parallel corpora instead.")] [Authorize(Scopes.UpdateTranslationEngines)] [HttpPatch("{id}/corpora/{corpusId}")] [ProducesResponseType(StatusCodes.Status200OK)] @@ -418,7 +424,7 @@ corpusConfig.TargetFiles is null } /// - /// Get all corpora for a translation engine + /// Get all corpora for a translation engine (obsolete - use parallel corpora instead) /// /// The translation engine id /// @@ -427,6 +433,7 @@ corpusConfig.TargetFiles is null /// The authenticated client cannot perform the operation or does not own the translation engine /// The engine does not exist /// A necessary service is currently unavailable. Check `/health` for more details. + [Obsolete("This endpoint is obsolete. 
Use parallel corpora instead.")] [Authorize(Scopes.ReadTranslationEngines)] [HttpGet("{id}/corpora")] [ProducesResponseType(StatusCodes.Status200OK)] @@ -445,7 +452,7 @@ CancellationToken cancellationToken } /// - /// Get the configuration of a corpus for a translation engine + /// Get the configuration of a corpus for a translation engine (obsolete - use parallel corpora instead) /// /// The translation engine id /// The corpus id @@ -455,6 +462,7 @@ CancellationToken cancellationToken /// The authenticated client cannot perform the operation or does not own the translation engine. /// The engine or corpus does not exist. /// A necessary service is currently unavailable. Check `/health` for more details. + [Obsolete("This endpoint is obsolete. Use parallel corpora instead.")] [Authorize(Scopes.ReadTranslationEngines)] [HttpGet("{id}/corpora/{corpusId}", Name = Endpoints.GetTranslationCorpus)] [ProducesResponseType(StatusCodes.Status200OK)] @@ -698,7 +706,7 @@ CancellationToken cancellationToken } /// - /// Get all pretranslations in a corpus of a translation engine + /// Get all pretranslations in a corpus or parallel corpus of a translation engine /// /// /// Pretranslations are arranged in a list of dictionaries with the following fields per pretranslation: @@ -712,7 +720,7 @@ CancellationToken cancellationToken /// Only pretranslations for the most recent successful build of the engine are returned. 
/// /// The translation engine id - /// The corpus id + /// The corpus id or parallel corpus id /// The text id (optional) /// /// The pretranslations @@ -738,7 +746,7 @@ CancellationToken cancellationToken { Engine engine = await _engineService.GetAsync(id, cancellationToken); await AuthorizeAsync(engine); - if (!engine.Corpora.Any(c => c.Id == corpusId)) + if (!engine.Corpora.Any(c => c.Id == corpusId) && !engine.ParallelCorpora.Any(c => c.Id == corpusId)) return NotFound(); if (engine.ModelRevision == 0) return Conflict(); @@ -761,7 +769,7 @@ CancellationToken cancellationToken } /// - /// Get all pretranslations for the specified text in a corpus of a translation engine + /// Get all pretranslations for the specified text in a corpus or parallel corpus of a translation engine /// /// /// Pretranslations are arranged in a list of dictionaries with the following fields per pretranslation: @@ -774,7 +782,7 @@ CancellationToken cancellationToken /// Only pretranslations for the most recent successful build of the engine are returned. /// /// The translation engine id - /// The corpus id + /// The corpus id or parallel corpus id /// The text id /// /// The pretranslations @@ -800,7 +808,7 @@ CancellationToken cancellationToken { Engine engine = await _engineService.GetAsync(id, cancellationToken); await AuthorizeAsync(engine); - if (!engine.Corpora.Any(c => c.Id == corpusId)) + if (!engine.Corpora.Any(c => c.Id == corpusId) && !engine.ParallelCorpora.Any(c => c.Id == corpusId)) return NotFound(); if (engine.ModelRevision == 0) return Conflict(); @@ -841,7 +849,7 @@ CancellationToken cancellationToken /// Both scripture and non-scripture text in the USFM is parsed and grouped according to [this wiki](https://github.com/sillsdev/serval/wiki/USFM-Parsing-and-Translation). /// /// The translation engine id - /// The corpus id + /// The corpus id or parallel corpus id /// The text id /// The source[s] of the data to populate the USFM file with. 
/// @@ -875,7 +883,7 @@ CancellationToken cancellationToken { Engine engine = await _engineService.GetAsync(id, cancellationToken); await AuthorizeAsync(engine); - if (!engine.Corpora.Any(c => c.Id == corpusId)) + if (!engine.Corpora.Any(c => c.Id == corpusId) && !engine.ParallelCorpora.Any(c => c.Id == corpusId)) return NotFound(); if (engine.ModelRevision == 0) return Conflict(); @@ -990,10 +998,21 @@ CancellationToken cancellationToken /// Starts a build job for a translation engine. /// /// - /// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora). If no "trainOn" field is provided, all corpora will be used. - /// Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. - /// Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) - /// All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. + /// Specify the corpora and textIds/scriptureRanges within those corpora to train on. Only one type of corpus may be used: either (legacy) corpora (see /translation/engines/{id}/corpora) or parallel corpora (see /translation/engines/{id}/parallel-corpora). + /// Specifying a corpus: + /// * A (legacy) corpus is selected by specifying CorpusId and a parallel corpus is selected by specifying ParallelCorpusId. + /// * A parallel corpus can be further filtered by specifying particular CorpusIds in SourceFilters or TargetFilters. 
+ /// + /// Filtering by textID or chapter: + /// * Paratext projects can be filtered by [book](https://github.com/sillsdev/libpalaso/blob/master/SIL.Scripture/Canon.cs) using the textId for training. + /// * Filters can also be supplied via scriptureRange parameter as ranges of biblical text. See [here](https://github.com/sillsdev/serval/wiki/Filtering-Paratext-Project-Data-with-a-Scripture-Range) + /// * All Paratext project filtering follows original versification. See [here](https://github.com/sillsdev/serval/wiki/Versification-in-Serval) for more information. + /// + /// Filter - train on all or none + /// * If trainOn or pretranslate is not provided, all corpora will be used for training or pretranslation respectively + /// * If a corpus is selected for training or pretranslation and neither scriptureRange nor textIds are defined, all of the selected corpus will be used. + /// * If a corpus is selected for training or pretranslation and an empty scriptureRange or textIds is defined, none of the selected corpus will be used. + /// * If a corpus is selected for training or pretranslation but no further filters are provided, all selected corpora will be used for training or pretranslation respectively. /// /// Specify the corpora and textIds/scriptureRanges within those corpora to pretranslate. When a corpus is selected for pretranslation, /// the following text will be pretranslated: @@ -1035,9 +1054,12 @@ public async Task> StartBuildAsync( CancellationToken cancellationToken ) { + string deploymentVersion = _configuration.GetValue("deploymentVersion") ?? 
"Unknown"; + Engine engine = await _engineService.GetAsync(id, cancellationToken); await AuthorizeAsync(engine); - Build build = Map(engine, buildConfig); + Build build = Map(engine, buildConfig, deploymentVersion); + await _engineService.StartBuildAsync(build, cancellationToken); TranslationBuildDto dto = Map(build); @@ -1301,7 +1323,7 @@ private Engine Map(TranslationEngineConfigDto source) }; } - private static Build Map(Engine engine, TranslationBuildConfigDto source) + private static Build Map(Engine engine, TranslationBuildConfigDto source, string deploymentVersion) { return new Build { @@ -1310,7 +1332,8 @@ private static Build Map(Engine engine, TranslationBuildConfigDto source) Pretranslate = Map(engine, source.Pretranslate), TrainOn = Map(engine, source.TrainOn), Options = Map(source.Options), - IsInitialized = false + IsInitialized = false, + DeploymentVersion = deploymentVersion }; } @@ -1363,6 +1386,24 @@ private static Build Map(Engine engine, TranslationBuildConfigDto source) $"The parallel corpus {pcc.ParallelCorpusId} is not valid: This parallel corpus does not exist for engine {engine.Id}." ); } + if ( + pcc.SourceFilters != null + && pcc.SourceFilters.Count > 0 + && ( + pcc.SourceFilters.Select(sf => sf.CorpusId).Distinct().Count() > 1 + || pcc.SourceFilters[0].CorpusId + != engine + .ParallelCorpora.Where(pc => pc.Id == pcc.ParallelCorpusId) + .First() + .SourceCorpora[0] + .Id + ) + ) + { + throw new InvalidOperationException( + $"Only the first source corpus in a parallel corpus may be filtered for pretranslation." 
+ ); + } pretranslateCorpora.Add( new PretranslateCorpus { @@ -1507,7 +1548,8 @@ private TranslationBuildDto Map(Build source) QueueDepth = source.QueueDepth, State = source.State, DateFinished = source.DateFinished, - Options = source.Options + Options = source.Options, + DeploymentVersion = source.DeploymentVersion }; } @@ -1726,3 +1768,5 @@ private static ModelDownloadUrlDto Map(ModelDownloadUrl source) }; } } + +#pragma warning restore CS0612 // Type or member is obsolete diff --git a/src/Serval/src/Serval.Translation/Models/Build.cs b/src/Serval/src/Serval.Translation/Models/Build.cs index ddfaee32..9afdcbb8 100644 --- a/src/Serval/src/Serval.Translation/Models/Build.cs +++ b/src/Serval/src/Serval.Translation/Models/Build.cs @@ -17,4 +17,5 @@ public record Build : IInitializableEntity public IReadOnlyDictionary? Options { get; init; } public bool? IsInitialized { get; set; } public DateTime? DateCreated { get; set; } + public string? DeploymentVersion { get; init; } } diff --git a/src/Serval/src/Serval.Translation/Services/EngineService.cs b/src/Serval/src/Serval.Translation/Services/EngineService.cs index 0601b5e8..b18257e4 100644 --- a/src/Serval/src/Serval.Translation/Services/EngineService.cs +++ b/src/Serval/src/Serval.Translation/Services/EngineService.cs @@ -1,4 +1,4 @@ -using MassTransit.Mediator; +using MassTransit.Mediator; using Serval.Translation.V1; namespace Serval.Translation.Services; @@ -237,8 +237,19 @@ public async Task StartBuildAsync(Build build, CancellationToken cancellationTok StartBuildRequest request; if (engine.ParallelCorpora.Any()) { - var trainOn = build.TrainOn?.ToDictionary(c => c.ParallelCorpusRef!); - var pretranslate = build.Pretranslate?.ToDictionary(c => c.ParallelCorpusRef!); + Dictionary? trainOn = build.TrainOn?.ToDictionary(c => c.ParallelCorpusRef!); + Dictionary? pretranslate = build.Pretranslate?.ToDictionary(c => + c.ParallelCorpusRef! 
+ ); + IReadOnlyList parallelCorpora = engine + .ParallelCorpora.Where(pc => + trainOn == null + || trainOn.ContainsKey(pc.Id) + || pretranslate == null + || pretranslate.ContainsKey(pc.Id) + ) + .ToList(); + request = new StartBuildRequest { EngineType = engine.Type, @@ -246,16 +257,32 @@ public async Task StartBuildAsync(Build build, CancellationToken cancellationTok BuildId = build.Id, Corpora = { - engine.ParallelCorpora.Select(c => - Map(c, trainOn?.GetValueOrDefault(c.Id), pretranslate?.GetValueOrDefault(c.Id)) + parallelCorpora.Select(c => + Map( + c, + trainOn?.GetValueOrDefault(c.Id), + pretranslate?.GetValueOrDefault(c.Id), + trainOn is null, + pretranslate is null + ) ) } }; } else { - var pretranslate = build.Pretranslate?.ToDictionary(c => c.CorpusRef!); - var trainOn = build.TrainOn?.ToDictionary(c => c.CorpusRef!); + Dictionary? trainOn = build.TrainOn?.ToDictionary(c => c.CorpusRef!); + Dictionary? pretranslate = build.Pretranslate?.ToDictionary(c => + c.CorpusRef! + ); + IReadOnlyList corpora = engine + .Corpora.Where(c => + trainOn == null + || trainOn.ContainsKey(c.Id) + || pretranslate == null + || pretranslate.ContainsKey(c.Id) + ) + .ToList(); request = new StartBuildRequest { @@ -264,8 +291,14 @@ public async Task StartBuildAsync(Build build, CancellationToken cancellationTok BuildId = build.Id, Corpora = { - engine.Corpora.Select(c => - Map(c, trainOn?.GetValueOrDefault(c.Id), pretranslate?.GetValueOrDefault(c.Id)) + corpora.Select(c => + Map( + c, + trainOn?.GetValueOrDefault(c.Id), + pretranslate?.GetValueOrDefault(c.Id), + trainOn is null, + pretranslate is null + ) ) } }; @@ -607,7 +640,13 @@ private Models.WordGraphArc Map(V1.WordGraphArc source) }; } - private V1.ParallelCorpus Map(Corpus source, TrainingCorpus? trainingCorpus, PretranslateCorpus? pretranslateCorpus) + private V1.ParallelCorpus Map( + Corpus source, + TrainingCorpus? trainingCorpus, + PretranslateCorpus? 
pretranslateCorpus, + bool trainOnAllCorpora, + bool pretranslateOnAllCorpora + ) { IEnumerable sourceFiles = source.SourceFiles.Select(Map); IEnumerable targetFiles = source.TargetFiles.Select(Map); @@ -616,7 +655,15 @@ private V1.ParallelCorpus Map(Corpus source, TrainingCorpus? trainingCorpus, Pre V1.MonolingualCorpus targetCorpus = new() { Language = source.TargetLanguage, Files = { source.TargetFiles.Select(Map) } }; - if (trainingCorpus != null) + if ( + trainOnAllCorpora + || (trainingCorpus is not null && trainingCorpus.TextIds is null && trainingCorpus.ScriptureRange is null) + ) + { + sourceCorpus.TrainOnAll = true; + targetCorpus.TrainOnAll = true; + } + else if (trainingCorpus is not null) { if (trainingCorpus.TextIds is not null && trainingCorpus.ScriptureRange is not null) { @@ -651,7 +698,19 @@ private V1.ParallelCorpus Map(Corpus source, TrainingCorpus? trainingCorpus, Pre targetCorpus.TrainOnChapters.Add(chapters); } } - if (pretranslateCorpus != null) + if ( + pretranslateOnAllCorpora + || ( + pretranslateCorpus is not null + && pretranslateCorpus.TextIds is null + && pretranslateCorpus.ScriptureRange is null + ) + ) + { + sourceCorpus.PretranslateAll = true; + targetCorpus.PretranslateAll = true; + } + else if (pretranslateCorpus is not null) { if (pretranslateCorpus.TextIds is not null && pretranslateCorpus.ScriptureRange is not null) { @@ -683,18 +742,20 @@ private V1.ParallelCorpus Map(Corpus source, TrainingCorpus? trainingCorpus, Pre ); } } - return new V1.ParallelCorpus - { - Id = source.Id, - SourceCorpora = { sourceCorpus }, - TargetCorpora = { targetCorpus } - }; + V1.ParallelCorpus corpus = new() { Id = source.Id }; + if (sourceCorpus.Files.Count > 0) + corpus.SourceCorpora.Add(sourceCorpus); + if (targetCorpus.Files.Count > 0) + corpus.TargetCorpora.Add(targetCorpus); + return corpus; } private V1.ParallelCorpus Map( Models.ParallelCorpus source, TrainingCorpus? trainingCorpus, - PretranslateCorpus? 
pretranslateCorpus + PretranslateCorpus? pretranslateCorpus, + bool trainOnAllCorpora, + bool pretranslateOnAllCorpora ) { string? referenceFileLocation = @@ -702,6 +763,15 @@ private V1.ParallelCorpus Map( ? Map(source.TargetCorpora[0].Files[0]).Location : null; + bool trainOnAllSources = + trainOnAllCorpora || (trainingCorpus is not null && trainingCorpus.SourceFilters is null); + bool pretranslateAllSources = + pretranslateOnAllCorpora || (pretranslateCorpus is not null && pretranslateCorpus.SourceFilters is null); + + bool trainOnAllTargets = + trainOnAllCorpora || (trainingCorpus is not null && trainingCorpus.TargetFilters is null); + bool pretranslateAllTargets = pretranslateOnAllCorpora || pretranslateCorpus is not null; // there is no pretranslate Target filter. + return new V1.ParallelCorpus { Id = source.Id, @@ -712,7 +782,9 @@ private V1.ParallelCorpus Map( sc, trainingCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(), pretranslateCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(), - referenceFileLocation + referenceFileLocation, + trainOnAllSources, + pretranslateAllSources ) ) }, @@ -723,7 +795,9 @@ private V1.ParallelCorpus Map( tc, trainingCorpus?.TargetFilters?.Where(sf => sf.CorpusRef == tc.Id).FirstOrDefault(), null, - referenceFileLocation + referenceFileLocation, + trainOnAllTargets, + pretranslateAllTargets ) ) } @@ -731,10 +805,12 @@ private V1.ParallelCorpus Map( } private V1.MonolingualCorpus Map( - Models.MonolingualCorpus source, + Models.MonolingualCorpus inputCorpus, ParallelCorpusFilter? trainingFilter, ParallelCorpusFilter? pretranslateFilter, - string? referenceFileLocation + string? referenceFileLocation, + bool trainOnAll, + bool pretranslateOnAll ) { Dictionary? 
trainOnChapters = null; @@ -763,7 +839,7 @@ pretranslateFilter is not null && referenceFileLocation is not null ) { - GetChapters(referenceFileLocation, pretranslateFilter.ScriptureRange) + pretranslateChapters = GetChapters(referenceFileLocation, pretranslateFilter.ScriptureRange) .Select( (kvp) => { @@ -775,23 +851,48 @@ pretranslateFilter is not null .ToDictionary(); } - var corpus = new V1.MonolingualCorpus + var returnCorpus = new V1.MonolingualCorpus { - Id = source.Id, - Language = source.Language, - Files = { source.Files.Select(Map) } + Id = inputCorpus.Id, + Language = inputCorpus.Language, + Files = { inputCorpus.Files.Select(Map) } }; - if (trainOnChapters is not null) - corpus.TrainOnChapters.Add(trainOnChapters); - if (trainingFilter?.TextIds is not null) - corpus.TrainOnTextIds.Add(trainingFilter.TextIds); - if (pretranslateChapters is not null) - corpus.PretranslateChapters.Add(pretranslateChapters); - if (pretranslateFilter?.TextIds is not null) - corpus.PretranslateTextIds.Add(pretranslateFilter.TextIds); + if ( + trainOnAll + || (trainingFilter is not null && trainingFilter.TextIds is null && trainingFilter.ScriptureRange is null) + ) + { + returnCorpus.TrainOnAll = true; + } + else + { + if (trainOnChapters is not null) + returnCorpus.TrainOnChapters.Add(trainOnChapters); + if (trainingFilter?.TextIds is not null) + returnCorpus.TrainOnTextIds.Add(trainingFilter.TextIds); + } + + if ( + pretranslateOnAll + || ( + pretranslateFilter is not null + && pretranslateFilter.TextIds is null + && pretranslateFilter.ScriptureRange is null + ) + ) + { + returnCorpus.PretranslateAll = true; + } + else + { + if (pretranslateChapters is not null) + returnCorpus.PretranslateChapters.Add(pretranslateChapters); + if (pretranslateFilter?.TextIds is not null) + returnCorpus.PretranslateTextIds.Add(pretranslateFilter.TextIds); + } - return corpus; + return returnCorpus; } private V1.CorpusFile Map(Models.CorpusFile source) diff --git 
a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs index 48e89b91..516e634e 100644 --- a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs +++ b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs @@ -41,11 +41,24 @@ public async Task GetUsfmAsync( { Engine? engine = await _engines.GetAsync(engineId, cancellationToken); Corpus? corpus = engine?.Corpora.SingleOrDefault(c => c.Id == corpusId); - if (corpus is null) - throw new EntityNotFoundException($"Could not find the Corpus '{corpusId}' in Engine '{engineId}'."); + ParallelCorpus? parallelCorpus = engine?.ParallelCorpora.SingleOrDefault(c => c.Id == corpusId); - CorpusFile sourceFile = corpus.SourceFiles[0]; - CorpusFile targetFile = corpus.TargetFiles[0]; + CorpusFile sourceFile; + CorpusFile targetFile; + if (corpus is not null) + { + sourceFile = corpus.SourceFiles[0]; + targetFile = corpus.TargetFiles[0]; + } + else if (parallelCorpus is not null) + { + sourceFile = parallelCorpus.SourceCorpora[0].Files[0]; + targetFile = parallelCorpus.TargetCorpora[0].Files[0]; + } + else + { + throw new EntityNotFoundException($"Could not find the Corpus '{corpusId}' in Engine '{engineId}'."); + } if (sourceFile.Format is not FileFormat.Paratext || targetFile.Format is not FileFormat.Paratext) throw new InvalidOperationException("USFM format is not valid for non-Scripture corpora."); @@ -87,8 +100,7 @@ await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) textId, pretranslations.ToList(), fullName: targetSettings.FullName, - stripAllText: false, - preferExistingText: true + behavior: UpdateUsfmBehavior.PreferExisting ) ?? 
""; break; case PretranslationUsfmTextOrigin.PreferPretranslated: @@ -97,8 +109,7 @@ await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) textId, pretranslations.ToList(), fullName: targetSettings.FullName, - stripAllText: false, - preferExistingText: false + behavior: UpdateUsfmBehavior.PreferNew ) ?? ""; break; case PretranslationUsfmTextOrigin.OnlyExisting: @@ -107,8 +118,7 @@ await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) textId, [], // don't put any pretranslations, we only want the existing text. fullName: targetSettings.FullName, - stripAllText: false, - preferExistingText: false + behavior: UpdateUsfmBehavior.PreferNew ) ?? ""; break; case PretranslationUsfmTextOrigin.OnlyPretranslated: @@ -117,8 +127,7 @@ await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) textId, pretranslations.ToList(), fullName: targetSettings.FullName, - stripAllText: true, - preferExistingText: false + behavior: UpdateUsfmBehavior.StripExisting ) ?? ""; break; } @@ -142,16 +151,14 @@ await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) textId, pretranslations.ToList(), fullName: targetSettings.FullName, - stripAllText: true, - preferExistingText: true + behavior: UpdateUsfmBehavior.StripExisting ) ?? ""; case PretranslationUsfmTextOrigin.OnlyExisting: return updater.UpdateUsfm( textId, [], // don't pass the pretranslations, we only want the existing text. fullName: targetSettings.FullName, - stripAllText: true, - preferExistingText: true + behavior: UpdateUsfmBehavior.StripExisting ) ?? 
""; } } diff --git a/src/Serval/src/Serval.Webhooks/Configuration/IServalBuilderExtensions.cs b/src/Serval/src/Serval.Webhooks/Configuration/IServalBuilderExtensions.cs index 129804e3..383e5baf 100644 --- a/src/Serval/src/Serval.Webhooks/Configuration/IServalBuilderExtensions.cs +++ b/src/Serval/src/Serval.Webhooks/Configuration/IServalBuilderExtensions.cs @@ -4,11 +4,7 @@ public static class IServalBuilderExtensions { public static IServalBuilder AddWebhooks(this IServalBuilder builder) { - builder - .Services.AddHttpClient() - .AddTransientHttpErrorPolicy(b => - b.WaitAndRetryAsync(3, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt))) - ); + builder.Services.AddHttpClient(); builder.Services.AddScoped(); return builder; } diff --git a/src/Serval/src/Serval.Webhooks/Serval.Webhooks.csproj b/src/Serval/src/Serval.Webhooks/Serval.Webhooks.csproj index 44f1ef4d..4f9fa6d8 100644 --- a/src/Serval/src/Serval.Webhooks/Serval.Webhooks.csproj +++ b/src/Serval/src/Serval.Webhooks/Serval.Webhooks.csproj @@ -14,7 +14,6 @@ - diff --git a/src/Serval/src/Serval.Webhooks/Services/WebhookJob.cs b/src/Serval/src/Serval.Webhooks/Services/WebhookJob.cs index faee17d4..384ba6be 100644 --- a/src/Serval/src/Serval.Webhooks/Services/WebhookJob.cs +++ b/src/Serval/src/Serval.Webhooks/Services/WebhookJob.cs @@ -6,6 +6,32 @@ public class WebhookJob(IRepository hooks, HttpClient httpClient, IOpti private readonly HttpClient _httpClient = httpClient; private readonly JsonOptions _jsonOptions = jsonOptions.Value; + [AutomaticRetry( + Attempts = 20, + DelaysInSeconds = new[] + { + 1, + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 512, + 1024, + 2048, + 2048, + 2048, + 2048, + 2048, + 2048, + 2048, + 2048 + }, + LogEvents = true + )] public async Task RunAsync( WebhookEvent webhookEvent, string owner, diff --git a/src/Serval/src/Serval.Webhooks/Usings.cs b/src/Serval/src/Serval.Webhooks/Usings.cs index f68d9a61..39f9b6a5 100644 --- a/src/Serval/src/Serval.Webhooks/Usings.cs +++ 
b/src/Serval/src/Serval.Webhooks/Usings.cs @@ -11,7 +11,6 @@ global using Microsoft.AspNetCore.Mvc; global using Microsoft.AspNetCore.Routing; global using Microsoft.Extensions.Options; -global using Polly; global using Serval.Shared.Contracts; global using Serval.Shared.Controllers; global using Serval.Shared.Models; diff --git a/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs b/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs index 6d0b2df2..d66b3557 100644 --- a/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs +++ b/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs @@ -5,6 +5,8 @@ namespace Serval.ApiServer; +#pragma warning disable CS0612 // Type or member is obsolete + [TestFixture] [Category("Integration")] public class TranslationEngineTests @@ -28,7 +30,15 @@ public class TranslationEngineTests new() { Name = "TestCorpus", - SourceCorpusIds = [SOURCE_CORPUS_ID], + SourceCorpusIds = [SOURCE_CORPUS_ID_1], + TargetCorpusIds = [TARGET_CORPUS_ID], + }; + + private static readonly TranslationParallelCorpusConfig TestMixedParallelCorpusConfig = + new() + { + Name = "TestCorpus", + SourceCorpusIds = [SOURCE_CORPUS_ID_1, SOURCE_CORPUS_ID_2], TargetCorpusIds = [TARGET_CORPUS_ID], }; private static readonly TranslationCorpusConfig TestCorpusConfigNonEcho = @@ -70,8 +80,9 @@ public class TranslationEngineTests private const string FILE3_FILENAME = "file_c"; private const string FILE4_ID = "f00000000000000000000004"; private const string FILE4_FILENAME = "file_d"; - private const string SOURCE_CORPUS_ID = "cc0000000000000000000001"; - private const string TARGET_CORPUS_ID = "cc0000000000000000000002"; + private const string SOURCE_CORPUS_ID_1 = "cc0000000000000000000001"; + private const string SOURCE_CORPUS_ID_2 = "cc0000000000000000000002"; + private const string TARGET_CORPUS_ID = "cc0000000000000000000003"; private const string DOES_NOT_EXIST_ENGINE_ID = 
"e00000000000000000000004"; private const string DOES_NOT_EXIST_CORPUS_ID = "c00000000000000000000001"; @@ -170,7 +181,14 @@ public async Task SetUp() var srcCorpus = new DataFiles.Models.Corpus { - Id = SOURCE_CORPUS_ID, + Id = SOURCE_CORPUS_ID_1, + Language = "en", + Owner = "client1", + Files = [new() { File = srcFile, TextId = "all" }] + }; + var srcCorpus2 = new DataFiles.Models.Corpus + { + Id = SOURCE_CORPUS_ID_2, Language = "en", Owner = "client1", Files = [new() { File = srcFile, TextId = "all" }] @@ -182,7 +200,7 @@ public async Task SetUp() Owner = "client1", Files = [new() { File = trgFile, TextId = "all" }] }; - await _env.Corpora.InsertAllAsync([srcCorpus, trgCorpus]); + await _env.Corpora.InsertAllAsync([srcCorpus, srcCorpus2, trgCorpus]); } [Test] @@ -813,7 +831,7 @@ public async Task AddParallelCorpusToEngineByIdAsync() ); Assert.Multiple(() => { - Assert.That(result.SourceCorpora.First().Id, Is.EqualTo(SOURCE_CORPUS_ID)); + Assert.That(result.SourceCorpora.First().Id, Is.EqualTo(SOURCE_CORPUS_ID_1)); Assert.That(result.TargetCorpora.First().Id, Is.EqualTo(TARGET_CORPUS_ID)); }); Engine? 
engine = await _env.Engines.GetAsync(ECHO_ENGINE1_ID); @@ -861,7 +879,7 @@ public async Task UpdateParallelCorpusByIdForEngineByIdAsync() ); var updateConfig = new TranslationParallelCorpusUpdateConfig { - SourceCorpusIds = [SOURCE_CORPUS_ID], + SourceCorpusIds = [SOURCE_CORPUS_ID_1], TargetCorpusIds = [TARGET_CORPUS_ID] }; await client.UpdateParallelCorpusAsync(ECHO_ENGINE1_ID, result.Id, updateConfig); @@ -883,7 +901,7 @@ public void UpdateParallelCorpusByIdForEngineById_NoSuchCorpus() { var updateConfig = new TranslationParallelCorpusUpdateConfig { - SourceCorpusIds = [SOURCE_CORPUS_ID], + SourceCorpusIds = [SOURCE_CORPUS_ID_1], TargetCorpusIds = [TARGET_CORPUS_ID] }; await client.UpdateParallelCorpusAsync(ECHO_ENGINE1_ID, DOES_NOT_EXIST_CORPUS_ID, updateConfig); @@ -900,10 +918,10 @@ public void UpdateParallelCorpusByIdForEngineById_NoSuchEngine() { var updateConfig = new TranslationParallelCorpusUpdateConfig { - SourceCorpusIds = [SOURCE_CORPUS_ID], + SourceCorpusIds = [SOURCE_CORPUS_ID_1], TargetCorpusIds = [TARGET_CORPUS_ID] }; - await client.UpdateParallelCorpusAsync(DOES_NOT_EXIST_ENGINE_ID, SOURCE_CORPUS_ID, updateConfig); + await client.UpdateParallelCorpusAsync(DOES_NOT_EXIST_ENGINE_ID, SOURCE_CORPUS_ID_1, updateConfig); }); Assert.That(ex?.StatusCode, Is.EqualTo(404)); } @@ -917,7 +935,7 @@ public void UpdateParallelCorpusByIdForEngineById_NotAuthorized() { var updateConfig = new TranslationParallelCorpusUpdateConfig { - SourceCorpusIds = [SOURCE_CORPUS_ID], + SourceCorpusIds = [SOURCE_CORPUS_ID_1], TargetCorpusIds = [TARGET_CORPUS_ID] }; await client.UpdateParallelCorpusAsync(ECHO_ENGINE1_ID, DOES_NOT_EXIST_CORPUS_ID, updateConfig); @@ -1010,7 +1028,7 @@ public void GetParallelCorpusByIdForEngineById_NoSuchEngine() { TranslationParallelCorpus result_afterAdd = await client.GetParallelCorpusAsync( DOES_NOT_EXIST_ENGINE_ID, - SOURCE_CORPUS_ID + SOURCE_CORPUS_ID_1 ); }); Assert.That(ex?.StatusCode, Is.EqualTo(404)); @@ -1085,7 +1103,7 @@ public void 
DeleteParallelCorpusByIdForEngineById_NoSuchEngine() ServalApiException? ex = Assert.ThrowsAsync(async () => { - await client.DeleteParallelCorpusAsync(DOES_NOT_EXIST_ENGINE_ID, SOURCE_CORPUS_ID); + await client.DeleteParallelCorpusAsync(DOES_NOT_EXIST_ENGINE_ID, SOURCE_CORPUS_ID_1); }); Assert.That(ex?.StatusCode, Is.EqualTo(404)); } @@ -1097,7 +1115,7 @@ public void DeleteParallelCorpusByIdForEngineById_NotAuthorized() ServalApiException? ex = Assert.ThrowsAsync(async () => { - await client.DeleteParallelCorpusAsync(ECHO_ENGINE1_ID, SOURCE_CORPUS_ID); + await client.DeleteParallelCorpusAsync(ECHO_ENGINE1_ID, SOURCE_CORPUS_ID_1); }); Assert.That(ex?.StatusCode, Is.EqualTo(403)); } @@ -1391,6 +1409,9 @@ public async Task StartBuildForEngineByIdAsync(IEnumerable scope, int ex build = await client.GetCurrentBuildAsync(engineId); Assert.That(build, Is.Not.Null); + + Assert.That(build.DeploymentVersion, Is.Not.Null); + break; case 400: case 403: @@ -1578,13 +1599,13 @@ public async Task StartBuild_ParallelCorpus() new() { ParallelCorpusId = addedCorpus.Id, - SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID, TextIds = ["all"] }] + SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1, TextIds = ["all"] }] }; TrainingCorpusConfig tcc = new() { ParallelCorpusId = addedCorpus.Id, - SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID, TextIds = ["all"] }], + SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1, TextIds = ["all"] }], TargetFilters = [new() { CorpusId = TARGET_CORPUS_ID, TextIds = ["all"] }] }; ; @@ -1625,13 +1646,13 @@ public async Task StartBuildAsync_ParallelCorpus() new() { ParallelCorpusId = addedCorpus.Id, - SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID, TextIds = ["all"] }] + SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1, TextIds = ["all"] }] }; TrainingCorpusConfig tcc = new() { ParallelCorpusId = addedCorpus.Id, - SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID, TextIds = ["all"] }], + SourceFilters = [new() { CorpusId = 
SOURCE_CORPUS_ID_1, TextIds = ["all"] }], TargetFilters = [new() { CorpusId = TARGET_CORPUS_ID, TextIds = ["all"] }] }; ; @@ -1660,6 +1681,105 @@ public async Task StartBuildAsync_ParallelCorpus() Assert.That(build, Is.Not.Null); } + [Test] + public async Task StartBuildAsync_Corpus_NoFilter() + { + TranslationEnginesClient client = _env.CreateTranslationEnginesClient(); + TranslationCorpus addedCorpus = await client.AddCorpusAsync(NMT_ENGINE1_ID, TestCorpusConfig); + PretranslateCorpusConfig ptcc = + new() { CorpusId = addedCorpus.Id, SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1 }] }; + TrainingCorpusConfig tcc = + new() + { + CorpusId = addedCorpus.Id, + SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1 }], + TargetFilters = [new() { CorpusId = TARGET_CORPUS_ID }] + }; + ; + TranslationBuildConfig tbc = new TranslationBuildConfig + { + Pretranslate = [ptcc], + TrainOn = [tcc], + Options = """ + {"max_steps":10, + "use_key_terms":false, + "some_double":10.5, + "some_nested": {"more_nested": {"other_double":10.5}}, + "some_string":"string"} + """ + }; + TranslationBuild resultAfterStart; + Assert.ThrowsAsync(async () => + { + resultAfterStart = await client.GetCurrentBuildAsync(NMT_ENGINE1_ID); + }); + + TranslationBuild build = await client.StartBuildAsync(NMT_ENGINE1_ID, tbc); + Assert.That(build, Is.Not.Null); + Assert.That(build.TrainOn, Is.Not.Null); + Assert.That(build.TrainOn.Count, Is.EqualTo(1)); + Assert.That(build.TrainOn[0].TextIds, Is.Null); + Assert.That(build.TrainOn[0].ScriptureRange, Is.Null); + Assert.That(build.Pretranslate, Is.Not.Null); + Assert.That(build.Pretranslate.Count, Is.EqualTo(1)); + Assert.That(build.Pretranslate[0].TextIds, Is.Null); + Assert.That(build.Pretranslate[0].ScriptureRange, Is.Null); + + build = await client.GetCurrentBuildAsync(NMT_ENGINE1_ID); + Assert.That(build, Is.Not.Null); + } + + [Test] + public async Task StartBuildAsync_ParallelCorpus_NoFilter() + { + TranslationEnginesClient client = 
_env.CreateTranslationEnginesClient(); + TranslationParallelCorpus addedCorpus = await client.AddParallelCorpusAsync( + NMT_ENGINE1_ID, + TestParallelCorpusConfig + ); + PretranslateCorpusConfig ptcc = + new() { ParallelCorpusId = addedCorpus.Id, SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1 }] }; + TrainingCorpusConfig tcc = + new() + { + ParallelCorpusId = addedCorpus.Id, + SourceFilters = [new() { CorpusId = SOURCE_CORPUS_ID_1 }], + TargetFilters = [new() { CorpusId = TARGET_CORPUS_ID }] + }; + ; + TranslationBuildConfig tbc = new TranslationBuildConfig + { + Pretranslate = [ptcc], + TrainOn = [tcc], + Options = """ + {"max_steps":10, + "use_key_terms":false, + "some_double":10.5, + "some_nested": {"more_nested": {"other_double":10.5}}, + "some_string":"string"} + """ + }; + TranslationBuild resultAfterStart; + Assert.ThrowsAsync(async () => + { + resultAfterStart = await client.GetCurrentBuildAsync(NMT_ENGINE1_ID); + }); + + TranslationBuild build = await client.StartBuildAsync(NMT_ENGINE1_ID, tbc); + Assert.That(build, Is.Not.Null); + Assert.That(build.TrainOn, Is.Not.Null); + Assert.That(build.TrainOn.Count, Is.EqualTo(1)); + Assert.That(build.TrainOn[0].TextIds, Is.Null); + Assert.That(build.TrainOn[0].ScriptureRange, Is.Null); + Assert.That(build.Pretranslate, Is.Not.Null); + Assert.That(build.Pretranslate.Count, Is.EqualTo(1)); + Assert.That(build.Pretranslate[0].TextIds, Is.Null); + Assert.That(build.Pretranslate[0].ScriptureRange, Is.Null); + + build = await client.GetCurrentBuildAsync(NMT_ENGINE1_ID); + Assert.That(build, Is.Not.Null); + } + [Test] public async Task StartBuildAsync_ParallelCorpus_PretranslateParallelAndNormalCorpus() { @@ -1704,7 +1824,7 @@ public async Task StartBuildAsync_ParallelCorpus_PretranslateNoCorpusSpecified() TranslationEnginesClient client = _env.CreateTranslationEnginesClient(); TranslationParallelCorpus addedParallelCorpus = await client.AddParallelCorpusAsync( NMT_ENGINE1_ID, - TestParallelCorpusConfig + 
TestMixedParallelCorpusConfig ); PretranslateCorpusConfig ptcc = new() { }; TrainingCorpusConfig tcc = new() { ParallelCorpusId = addedParallelCorpus.Id }; @@ -1716,6 +1836,32 @@ public async Task StartBuildAsync_ParallelCorpus_PretranslateNoCorpusSpecified() }); } + [Test] + public async Task StartBuildAsync_ParallelCorpus_PretranslateFilterOnMultipleSources() + { + TranslationEnginesClient client = _env.CreateTranslationEnginesClient(); + TranslationParallelCorpus addedParallelCorpus = await client.AddParallelCorpusAsync( + NMT_ENGINE1_ID, + TestParallelCorpusConfig + ); + PretranslateCorpusConfig ptcc = + new() + { + ParallelCorpusId = addedParallelCorpus.Id, + SourceFilters = + [ + new ParallelCorpusFilterConfig() { CorpusId = SOURCE_CORPUS_ID_1 }, + new ParallelCorpusFilterConfig() { CorpusId = SOURCE_CORPUS_ID_2 } + ] + }; + TrainingCorpusConfig tcc = new() { ParallelCorpusId = addedParallelCorpus.Id }; + TranslationBuildConfig tbc = new TranslationBuildConfig { Pretranslate = [ptcc], TrainOn = [tcc] }; + Assert.ThrowsAsync(async () => + { + await client.StartBuildAsync(NMT_ENGINE1_ID, tbc); + }); + } + [Test] public async Task StartBuildAsync_ParallelCorpus_TrainOnNoCorpusSpecified() { @@ -2235,3 +2381,5 @@ protected override void DisposeManagedResources() } } } + +#pragma warning restore CS0612 // Type or member is obsolete diff --git a/src/Serval/test/Serval.E2ETests/ServalApiTests.cs b/src/Serval/test/Serval.E2ETests/ServalApiTests.cs index 0589e53b..2fb9f86a 100644 --- a/src/Serval/test/Serval.E2ETests/ServalApiTests.cs +++ b/src/Serval/test/Serval.E2ETests/ServalApiTests.cs @@ -1,5 +1,7 @@ namespace Serval.E2ETests; +#pragma warning disable CS0612 // Type or member is obsolete + [TestFixture] [Category("E2E")] public class ServalApiTests @@ -115,14 +117,26 @@ public async Task NmtBatch() string[] books = ["MAT.txt", "1JN.txt", "2JN.txt"]; string cId1 = await _helperClient.AddTextCorpusToEngineAsync(engineId, books, "es", "en", false); 
_helperClient.TranslationBuildConfig.TrainOn = [new() { CorpusId = cId1, TextIds = ["1JN.txt"] }]; - string cId2 = await _helperClient.AddTextCorpusToEngineAsync(engineId, ["3JN.txt"], "es", "en", true); + string cId2 = await _helperClient.AddTextCorpusToEngineAsync( + engineId, + ["2JN.txt", "3JN.txt"], + "es", + "en", + true + ); + _helperClient.TranslationBuildConfig.Pretranslate = [new() { CorpusId = cId2, TextIds = ["2JN.txt"] }]; await _helperClient.BuildEngineAsync(engineId); await Task.Delay(1000); - IList lTrans = await _helperClient.TranslationEnginesClient.GetAllPretranslationsAsync( + IList lTrans1 = await _helperClient.TranslationEnginesClient.GetAllPretranslationsAsync( + engineId, + cId1 + ); + Assert.That(lTrans1, Has.Count.EqualTo(0)); // should be nothing + IList lTrans2 = await _helperClient.TranslationEnginesClient.GetAllPretranslationsAsync( engineId, cId2 ); - Assert.That(lTrans, Has.Count.EqualTo(14)); + Assert.That(lTrans2, Has.Count.EqualTo(13)); // just 2 John } [Test] @@ -131,14 +145,26 @@ public async Task NmtQueueMultiple() const int NUM_ENGINES = 10; const int NUM_WORKERS = 8; string[] engineIds = new string[NUM_ENGINES]; + string[] books = ["MAT.txt", "1JN.txt", "2JN.txt"]; + TranslationParallelCorpusConfig train_corpus = await _helperClient.MakeParallelTextCorpus( + books, + "es", + "en", + false + ); + TranslationParallelCorpusConfig pretranslate_corpus = await _helperClient.MakeParallelTextCorpus( + ["3JN.txt"], + "es", + "en", + true + ); for (int i = 0; i < NUM_ENGINES; i++) { _helperClient.InitTranslationBuildConfig(); engineIds[i] = await _helperClient.CreateNewEngineAsync("Nmt", "es", "en", $"NMT1_{i}"); string engineId = engineIds[i]; - string[] books = ["MAT.txt", "1JN.txt", "2JN.txt"]; - await _helperClient.AddTextCorpusToEngineAsync(engineId, books, "es", "en", false); - await _helperClient.AddTextCorpusToEngineAsync(engineId, ["3JN.txt"], "es", "en", true); + await 
_helperClient.AddParallelTextCorpusToEngineAsync(engineId, train_corpus, false); + await _helperClient.AddParallelTextCorpusToEngineAsync(engineId, pretranslate_corpus, true); await _helperClient.StartBuildAsync(engineId); //Ensure that tasks are enqueued roughly in order await Task.Delay(1_000); @@ -201,15 +227,27 @@ public async Task NmtLargeBatchAndDownload() TranslationEngine engine = await _helperClient.TranslationEnginesClient.GetAsync(engineId); Assert.That(engine.IsModelPersisted, Is.True); string[] books = ["bible_LARGEFILE.txt"]; - await _helperClient.AddTextCorpusToEngineAsync(engineId, books, "es", "en", false); - string cId = await _helperClient.AddTextCorpusToEngineAsync(engineId, ["3JN.txt"], "es", "en", true); + TranslationParallelCorpusConfig train_corpus = await _helperClient.MakeParallelTextCorpus( + books, + "es", + "en", + false + ); + TranslationParallelCorpusConfig pretranslate_corpus = await _helperClient.MakeParallelTextCorpus( + ["3JN.txt"], + "es", + "en", + true + ); + await _helperClient.AddParallelTextCorpusToEngineAsync(engineId, train_corpus, false); + string cId = await _helperClient.AddParallelTextCorpusToEngineAsync(engineId, pretranslate_corpus, true); await _helperClient.BuildEngineAsync(engineId); await Task.Delay(1000); IList lTrans = await _helperClient.TranslationEnginesClient.GetAllPretranslationsAsync( engineId, cId ); - TestContext.WriteLine(lTrans[0].Translation); + Assert.That(lTrans, Has.Count.EqualTo(14)); // Download the model from the s3 bucket ModelDownloadUrl url = await _helperClient.TranslationEnginesClient.GetModelDownloadUrlAsync(engineId); using Task s = new HttpClient().GetStreamAsync(url.Url); @@ -247,28 +285,18 @@ public async Task CircuitousRouteGetWordGraphAsync() Assert.That(ex.StatusCode, Is.EqualTo(409)); //Add corpus - string cId = await _helperClient.AddTextCorpusToEngineAsync( - smtEngineId, - ["2JN.txt", "3JN.txt"], - "es", - "en", - false - ); + var corpus1 = await 
_helperClient.MakeParallelTextCorpus(["2JN.txt", "3JN.txt"], "es", "en", false); + string cId = await _helperClient.AddParallelTextCorpusToEngineAsync(smtEngineId, corpus1, false); //Build the new engine await _helperClient.BuildEngineAsync(smtEngineId); //Remove added corpus (shouldn't affect translation) - await _helperClient.TranslationEnginesClient.DeleteCorpusAsync(smtEngineId, cId, deleteFiles: false); + await _helperClient.TranslationEnginesClient.DeleteParallelCorpusAsync(smtEngineId, cId); // Add corpus - await _helperClient.AddTextCorpusToEngineAsync( - smtEngineId, - ["1JN.txt", "2JN.txt", "3JN.txt"], - "es", - "en", - false - ); + var corpus2 = await _helperClient.MakeParallelTextCorpus(["1JN.txt", "2JN.txt", "3JN.txt"], "es", "en", false); + await _helperClient.AddParallelTextCorpusToEngineAsync(smtEngineId, corpus2, false); //Build the new engine await _helperClient.BuildEngineAsync(smtEngineId); @@ -424,6 +452,12 @@ public async Task ParatextProjectNmtJobAsync() corpus.Id ); Assert.That(lTrans, Is.Not.Empty); + string usfm = await _helperClient.TranslationEnginesClient.GetPretranslatedUsfmAsync( + engineId, + corpus.Id, + "JHN" + ); + Assert.That(usfm, Does.Contain("\\v 1")); } [TearDown] @@ -438,3 +472,5 @@ public async Task OneTimeTearDown() await _helperClient.DisposeAsync(); } } + +#pragma warning restore CS0612 // Type or member is obsolete diff --git a/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs b/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs index ae70f6ce..87f54a13 100644 --- a/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs +++ b/src/Serval/test/Serval.E2ETests/ServalClientHelper.cs @@ -1,8 +1,11 @@ namespace Serval.E2ETests; +#pragma warning disable CS0612 // Type or member is obsolete + public class ServalClientHelper : IAsyncDisposable { public DataFilesClient DataFilesClient { get; } + public CorporaClient CorporaClient { get; } public TranslationEnginesClient TranslationEnginesClient { get; } public 
TranslationEngineTypesClient TranslationEngineTypesClient { get; } @@ -32,6 +35,7 @@ public ServalClientHelper(string audience, string prefix = "SCE_", bool ignoreSS _httpClient.BaseAddress = new Uri(hostUrl); _httpClient.Timeout = TimeSpan.FromSeconds(60); DataFilesClient = new DataFilesClient(_httpClient); + CorporaClient = new CorporaClient(_httpClient); TranslationEnginesClient = new TranslationEnginesClient(_httpClient); TranslationEngineTypesClient = new TranslationEngineTypesClient(_httpClient); _prefix = prefix; @@ -175,12 +179,22 @@ public async Task AddTextCorpusToEngineAsync( bool pretranslate ) { - List sourceFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, sourceLanguage); + List sourceFiles = await UploadFilesAsync( + filesToAdd, + FileFormat.Text, + sourceLanguage, + isTarget: false + ); var targetFileConfig = new List(); if (!pretranslate) { - List targetFiles = await UploadFilesAsync(filesToAdd, FileFormat.Text, targetLanguage); + List targetFiles = await UploadFilesAsync( + filesToAdd, + FileFormat.Text, + targetLanguage, + isTarget: true + ); foreach (var item in targetFiles.Select((file, i) => new { i, file })) { targetFileConfig.Add( @@ -191,20 +205,11 @@ bool pretranslate var sourceFileConfig = new List(); - if (sourceLanguage == targetLanguage && !pretranslate) - { - // if it's the same language, and we are not pretranslating, do nothing (echo for suggestions) - // if pretranslating, we need to upload the source separately - // if different languages, we are not echoing. 
- } - else + for (int i = 0; i < sourceFiles.Count; i++) { - for (int i = 0; i < sourceFiles.Count; i++) - { - sourceFileConfig.Add( - new TranslationCorpusFileConfig { FileId = sourceFiles[i].Id, TextId = filesToAdd[i] } - ); - } + sourceFileConfig.Add( + new TranslationCorpusFileConfig { FileId = sourceFiles[i].Id, TextId = filesToAdd[i] } + ); } TranslationCorpus response = await TranslationEnginesClient.AddCorpusAsync( @@ -229,10 +234,91 @@ bool pretranslate return response.Id; } + public async Task MakeParallelTextCorpus( + string[] filesToAdd, + string sourceLanguage, + string targetLanguage, + bool pretranslate + ) + { + List sourceFiles = await UploadFilesAsync( + filesToAdd, + FileFormat.Text, + sourceLanguage, + isTarget: false + ); + + var targetFileConfig = new List(); + if (!pretranslate) + { + List targetFiles = await UploadFilesAsync( + filesToAdd, + FileFormat.Text, + targetLanguage, + isTarget: true + ); + foreach (var item in targetFiles.Select((file, i) => new { i, file })) + { + targetFileConfig.Add(new CorpusFileConfig { FileId = item.file.Id, TextId = filesToAdd[item.i] }); + } + } + + CorpusConfig targetCorpusConfig = + new() + { + Name = "None", + Language = targetLanguage, + Files = targetFileConfig + }; + + var targetCorpus = await CorporaClient.CreateAsync(targetCorpusConfig); + + var sourceFileConfig = new List(); + + for (int i = 0; i < sourceFiles.Count; i++) + { + sourceFileConfig.Add(new CorpusFileConfig { FileId = sourceFiles[i].Id, TextId = filesToAdd[i] }); + } + + CorpusConfig sourceCorpusConfig = + new() + { + Name = "None", + Language = sourceLanguage, + Files = sourceFileConfig + }; + + var sourceCorpus = await CorporaClient.CreateAsync(sourceCorpusConfig); + + TranslationParallelCorpusConfig parallelCorpusConfig = + new() { SourceCorpusIds = { sourceCorpus.Id }, TargetCorpusIds = { targetCorpus.Id } }; + + return parallelCorpusConfig; + } + + public async Task AddParallelTextCorpusToEngineAsync( + string engineId, + 
TranslationParallelCorpusConfig parallelCorpusConfig, + bool pretranslate + ) + { + var parallelCorpus = await TranslationEnginesClient.AddParallelCorpusAsync(engineId, parallelCorpusConfig); + + if (pretranslate) + { + TranslationBuildConfig.Pretranslate!.Add( + new PretranslateCorpusConfig { ParallelCorpusId = parallelCorpus.Id } + ); + } + + return parallelCorpus.Id; + } + public async Task> UploadFilesAsync( IEnumerable filesToAdd, FileFormat fileFormat, - string language + string language, + bool isTarget ) { string languageFolder = Path.GetFullPath( @@ -252,7 +338,7 @@ string language foreach (string fileName in filesToAdd) { - string fullName = _prefix + language + "_" + fileName; + string fullName = _prefix + language + "_" + fileName + (isTarget ? "_trg" : "_src"); //delete files that have the name name if (filenameToId.Contains(fullName)) @@ -335,3 +421,5 @@ public ValueTask DisposeAsync() return new ValueTask(Task.CompletedTask); } } + +#pragma warning restore CS0612 // Type or member is obsolete diff --git a/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs b/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs index b4dc6841..42d70339 100644 --- a/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs +++ b/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs @@ -17,7 +17,7 @@ public void GetZipParatextProjectTextUpdater() TestEnvironment env = new(); using ZipParatextProjectTextUpdater updater = env.Service.GetZipParatextProjectTextUpdater("file1.zip"); Assert.That( - updater.UpdateUsfm("MAT", [], preferExistingText: true).ReplaceLineEndings("\n"), + updater.UpdateUsfm("MAT", [], behavior: UpdateUsfmBehavior.PreferExisting).ReplaceLineEndings("\n"), Is.EqualTo( $@"\id MAT - PROJ \h {Canon.BookIdToEnglishName("MAT")} diff --git a/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs 
b/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs index 59d24d0c..0da83cf1 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs @@ -136,7 +136,9 @@ public async Task StartBuildAsync_TrainOnNotSpecified() Format = FileFormat.Text, TextId = "text1" } - } + }, + PretranslateAll = true, + TrainOnAll = true } } }, @@ -155,7 +157,9 @@ public async Task StartBuildAsync_TrainOnNotSpecified() Format = FileFormat.Text, TextId = "text1" } - } + }, + PretranslateAll = true, + TrainOnAll = true } } } @@ -206,7 +210,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Text, TextId = "text1" } - } + }, + PretranslateAll = true, + TrainOnAll = false } } }, @@ -226,7 +232,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Text, TextId = "text1" } - } + }, + PretranslateAll = true, + TrainOnAll = false } } } @@ -277,7 +285,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Text, TextId = "text1" } - } + }, + PretranslateAll = true, + TrainOnAll = false } } }, @@ -297,7 +307,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Text, TextId = "text1" } - } + }, + PretranslateAll = true, + TrainOnAll = false } } } @@ -347,7 +359,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Text, TextId = "text1" } - } + }, + PretranslateAll = true, + TrainOnAll = true } } }, @@ -366,7 +380,203 @@ await env.Service.StartBuildAsync( Format = FileFormat.Text, TextId = "text1" } - } + }, + PretranslateAll = true, + TrainOnAll = true + } + } + } + } + } + } + ); + } + + [Test] + public async Task StartBuildAsync_OneOfMultipleCorpora() + { + var env = new TestEnvironment(); + string engineId = (await env.CreateMultipleCorporaEngineWithTextFilesAsync()).Id; + await env.Service.StartBuildAsync( + new Build + { + Id = BUILD1_ID, + EngineRef = engineId, + TrainOn = [new TrainingCorpus { CorpusRef = "corpus1" }], + Pretranslate = [new 
PretranslateCorpus { CorpusRef = "corpus1" }] + } + ); + _ = env.TranslationServiceClient.Received() + .StartBuildAsync( + new StartBuildRequest + { + BuildId = BUILD1_ID, + EngineId = engineId, + EngineType = "Smt", + Corpora = + { + new V1.ParallelCorpus + { + Id = "corpus1", + SourceCorpora = + { + new List + { + new() + { + Language = "es", + Files = + { + new V1.CorpusFile + { + Location = "file1.txt", + Format = FileFormat.Text, + TextId = "text1" + } + }, + PretranslateAll = true, + TrainOnAll = true + } + } + }, + TargetCorpora = + { + new List + { + new() + { + Language = "en", + Files = + { + new V1.CorpusFile + { + Location = "file2.txt", + Format = FileFormat.Text, + TextId = "text1" + } + }, + PretranslateAll = true, + TrainOnAll = true + } + } + } + } + } + } + ); + } + + [Test] + public async Task StartBuildAsync_TrainOnOnePretranslateTheOther() + { + var env = new TestEnvironment(); + string engineId = (await env.CreateMultipleCorporaEngineWithTextFilesAsync()).Id; + await env.Service.StartBuildAsync( + new Build + { + Id = BUILD1_ID, + EngineRef = engineId, + TrainOn = [new TrainingCorpus { CorpusRef = "corpus1" }], + Pretranslate = [new PretranslateCorpus { CorpusRef = "corpus2" }] + } + ); + _ = env.TranslationServiceClient.Received() + .StartBuildAsync( + new StartBuildRequest + { + BuildId = BUILD1_ID, + EngineId = engineId, + EngineType = "Smt", + Corpora = + { + new V1.ParallelCorpus + { + Id = "corpus1", + SourceCorpora = + { + new List + { + new() + { + Language = "es", + Files = + { + new V1.CorpusFile + { + Location = "file1.txt", + Format = FileFormat.Text, + TextId = "text1" + } + }, + PretranslateAll = false, + TrainOnAll = true + } + } + }, + TargetCorpora = + { + new List + { + new() + { + Language = "en", + Files = + { + new V1.CorpusFile + { + Location = "file2.txt", + Format = FileFormat.Text, + TextId = "text1" + } + }, + PretranslateAll = false, + TrainOnAll = true + } + } + } + }, + new V1.ParallelCorpus + { + Id = "corpus2", + 
SourceCorpora = + { + new List + { + new() + { + Language = "es", + Files = + { + new V1.CorpusFile + { + Location = "file3.txt", + Format = FileFormat.Text, + TextId = "text1" + } + }, + PretranslateAll = true, + TrainOnAll = false + } + } + }, + TargetCorpora = + { + new List + { + new() + { + Language = "en", + Files = + { + new V1.CorpusFile + { + Location = "file4.txt", + Format = FileFormat.Text, + TextId = "text1" + } + }, + PretranslateAll = true, + TrainOnAll = false } } } @@ -445,7 +655,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file1.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = false } } }, @@ -475,7 +687,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file2.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = false } } } @@ -525,7 +739,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file1.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = false } } }, @@ -544,7 +760,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file2.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = false } } } @@ -618,11 +836,278 @@ await env.Service.StartBuildAsync( Format = FileFormat.Text, TextId = "MAT" } - } - }, + }, + PretranslateAll = true, + TrainOnAll = false + }, + new() + { + Id = "parallel-corpus1-source2", + Language = "es", + Files = + { + new V1.CorpusFile + { + Location = "file3.txt", + Format = FileFormat.Text, + TextId = "MRK" + } + }, + PretranslateAll = true, + TrainOnAll = false + } + } + }, + TargetCorpora = + { + new List + { + new() + { + Id = "parallel-corpus1-target1", + Language = "en", + TrainOnTextIds = { "MAT" }, + Files = + { + new V1.CorpusFile + { + Location = "file2.txt", + Format = FileFormat.Text, + TextId = "MAT" + } + }, + PretranslateAll = true, + TrainOnAll = false + }, + new() + { + Id = "parallel-corpus1-target2", + Language = "en", + Files = + { + new V1.CorpusFile + { + Location = 
"file4.txt", + Format = FileFormat.Text, + TextId = "MRK" + } + }, + PretranslateAll = true, + TrainOnAll = false + } + } + } + } + } + } + ); + } + + [Test] + public async Task StartBuildAsync_ParallelCorpus_OneOfMultipleCorpora() + { + var env = new TestEnvironment(); + string engineId = (await env.CreateMultipleParallelCorpusEngineWithTextFilesAsync()).Id; + await env.Service.StartBuildAsync( + new Build + { + Id = BUILD1_ID, + EngineRef = engineId, + TrainOn = + [ + new TrainingCorpus + { + ParallelCorpusRef = "parallel-corpus1", + SourceFilters = new List() + { + new() + { + CorpusRef = "parallel-corpus1-source1", + TextIds = new List { "MAT" } + } + }, + TargetFilters = new List() + { + new() + { + CorpusRef = "parallel-corpus1-target1", + TextIds = new List { "MAT" } + } + } + } + ], + Pretranslate = [new PretranslateCorpus { ParallelCorpusRef = "parallel-corpus1" }] + } + ); + _ = env.TranslationServiceClient.Received() + .StartBuildAsync( + new StartBuildRequest + { + BuildId = BUILD1_ID, + EngineId = engineId, + EngineType = "Smt", + Corpora = + { + new V1.ParallelCorpus + { + Id = "parallel-corpus1", + SourceCorpora = + { + new List + { + new() + { + Id = "parallel-corpus1-source1", + Language = "es", + TrainOnTextIds = { "MAT" }, + Files = + { + new V1.CorpusFile + { + Location = "file1.txt", + Format = FileFormat.Text, + TextId = "MAT" + } + }, + PretranslateAll = true, + TrainOnAll = false + } + } + }, + TargetCorpora = + { + new List + { + new() + { + Id = "parallel-corpus1-target1", + Language = "en", + TrainOnTextIds = { "MAT" }, + Files = + { + new V1.CorpusFile + { + Location = "file2.txt", + Format = FileFormat.Text, + TextId = "MAT" + } + }, + PretranslateAll = true, + TrainOnAll = false + } + } + } + } + } + } + ); + } + + [Test] + public async Task StartBuildAsync_ParallelCorpus_TrainOnOnePretranslateTheOther() + { + var env = new TestEnvironment(); + string engineId = (await env.CreateMultipleParallelCorpusEngineWithTextFilesAsync()).Id; + 
await env.Service.StartBuildAsync( + new Build + { + Id = BUILD1_ID, + EngineRef = engineId, + TrainOn = + [ + new TrainingCorpus + { + ParallelCorpusRef = "parallel-corpus1", + SourceFilters = new List() + { + new() + { + CorpusRef = "parallel-corpus1-source1", + TextIds = new List { "MAT" } + } + }, + TargetFilters = new List() + { + new() + { + CorpusRef = "parallel-corpus1-target1", + TextIds = new List { "MAT" } + } + } + } + ], + Pretranslate = [new PretranslateCorpus { ParallelCorpusRef = "parallel-corpus2" }] + } + ); + _ = env.TranslationServiceClient.Received() + .StartBuildAsync( + new StartBuildRequest + { + BuildId = BUILD1_ID, + EngineId = engineId, + EngineType = "Smt", + Corpora = + { + new V1.ParallelCorpus + { + Id = "parallel-corpus1", + SourceCorpora = + { + new List + { + new() + { + Id = "parallel-corpus1-source1", + Language = "es", + TrainOnTextIds = { "MAT" }, + Files = + { + new V1.CorpusFile + { + Location = "file1.txt", + Format = FileFormat.Text, + TextId = "MAT" + } + }, + PretranslateAll = false, + TrainOnAll = false + } + } + }, + TargetCorpora = + { + new List + { + new() + { + Id = "parallel-corpus1-target1", + Language = "en", + TrainOnTextIds = { "MAT" }, + Files = + { + new V1.CorpusFile + { + Location = "file2.txt", + Format = FileFormat.Text, + TextId = "MAT" + } + }, + PretranslateAll = false, + TrainOnAll = false + } + } + } + }, + new V1.ParallelCorpus + { + Id = "parallel-corpus2", + SourceCorpora = + { + new List + { new() { - Id = "parallel-corpus1-source2", + Id = "parallel-corpus2-source1", Language = "es", Files = { @@ -632,7 +1117,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Text, TextId = "MRK" } - } + }, + PretranslateAll = true, + TrainOnAll = false } } }, @@ -642,22 +1129,7 @@ await env.Service.StartBuildAsync( { new() { - Id = "parallel-corpus1-target1", - Language = "en", - TrainOnTextIds = { "MAT" }, - Files = - { - new V1.CorpusFile - { - Location = "file2.txt", - Format = FileFormat.Text, - 
TextId = "MAT" - } - } - }, - new() - { - Id = "parallel-corpus1-target2", + Id = "parallel-corpus2-target1", Language = "en", Files = { @@ -667,7 +1139,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Text, TextId = "MRK" } - } + }, + PretranslateAll = true, + TrainOnAll = false } } } @@ -741,7 +1215,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file1.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = false }, new() { @@ -755,7 +1231,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file3.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = false } } }, @@ -776,7 +1254,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file2.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = false }, new() { @@ -790,7 +1270,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file4.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = false } } } @@ -824,6 +1306,17 @@ await env.Service.StartBuildAsync( new() { CorpusRef = "parallel-corpus1-target1", ScriptureRange = "MAT 1;MRK" } } } + ], + Pretranslate = + [ + new PretranslateCorpus + { + ParallelCorpusRef = "parallel-corpus1", + SourceFilters = new List() + { + new() { CorpusRef = "parallel-corpus1-source1", ScriptureRange = "MAT 2" } + } + } ] } ); @@ -858,6 +1351,13 @@ await env.Service.StartBuildAsync( new ScriptureChapters { Chapters = { } } } }, + PretranslateChapters = + { + { + "MAT", + new ScriptureChapters { Chapters = { 2 } } + } + }, Files = { new V1.CorpusFile @@ -866,7 +1366,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file1.zip" } - } + }, + PretranslateAll = false, + TrainOnAll = false }, new() { @@ -880,7 +1382,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file3.zip" } - } + }, + PretranslateAll = false, + TrainOnAll = false } } }, @@ -911,7 +1415,9 @@ await env.Service.StartBuildAsync( Format = 
FileFormat.Paratext, TextId = "file2.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = false }, new() { @@ -925,7 +1431,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file4.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = false } } } @@ -1001,7 +1509,9 @@ await env.Service.StartBuildAsync( "MRK", new ScriptureChapters { Chapters = { 1, 2 } } } - } + }, + PretranslateAll = true, + TrainOnAll = false }, new V1.MonolingualCorpus() { @@ -1026,7 +1536,9 @@ await env.Service.StartBuildAsync( "MRK", new ScriptureChapters { Chapters = { 1 } } } - } + }, + PretranslateAll = true, + TrainOnAll = false } }, TargetCorpora = @@ -1054,7 +1566,9 @@ await env.Service.StartBuildAsync( "MRK", new ScriptureChapters { Chapters = { 2 } } } - } + }, + PretranslateAll = true, + TrainOnAll = false }, new V1.MonolingualCorpus() { @@ -1079,7 +1593,9 @@ await env.Service.StartBuildAsync( "MRK", new ScriptureChapters { Chapters = { 1, 2 } } } - } + }, + PretranslateAll = true, + TrainOnAll = false } } } @@ -1127,7 +1643,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file1.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = true }, new V1.MonolingualCorpus() { @@ -1141,7 +1659,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file3.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = true } }, TargetCorpora = @@ -1158,7 +1678,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file2.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = true }, new V1.MonolingualCorpus() { @@ -1172,7 +1694,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file4.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = true } } } @@ -1213,7 +1737,9 @@ public async Task StartBuildAsync_TrainOnNotSpecified_ParallelCorpus() Format = FileFormat.Paratext, TextId = "file1.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = true }, new 
V1.MonolingualCorpus() { @@ -1227,7 +1753,9 @@ public async Task StartBuildAsync_TrainOnNotSpecified_ParallelCorpus() Format = FileFormat.Paratext, TextId = "file3.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = true } }, TargetCorpora = @@ -1244,7 +1772,9 @@ public async Task StartBuildAsync_TrainOnNotSpecified_ParallelCorpus() Format = FileFormat.Paratext, TextId = "file2.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = true }, new V1.MonolingualCorpus() { @@ -1258,7 +1788,9 @@ public async Task StartBuildAsync_TrainOnNotSpecified_ParallelCorpus() Format = FileFormat.Paratext, TextId = "file4.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = true } } } @@ -1285,7 +1817,7 @@ await env.Service.StartBuildAsync( SourceFilters = new List() { new() { CorpusRef = "parallel-corpus1-source1", ScriptureRange = "MAT 1;MRK" } - } + }, } ] } @@ -1327,7 +1859,9 @@ await env.Service.StartBuildAsync( "MRK", new ScriptureChapters { } } - } + }, + PretranslateAll = true, + TrainOnAll = false }, new V1.MonolingualCorpus() { @@ -1341,7 +1875,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file3.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = false } }, TargetCorpora = @@ -1358,7 +1894,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file2.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = true }, new V1.MonolingualCorpus() { @@ -1372,7 +1910,9 @@ await env.Service.StartBuildAsync( Format = FileFormat.Paratext, TextId = "file4.zip" } - } + }, + PretranslateAll = true, + TrainOnAll = true } } } @@ -1626,6 +2166,75 @@ public async Task CreateEngineWithTextFilesAsync() return engine; } + public async Task CreateMultipleCorporaEngineWithTextFilesAsync() + { + var engine = new Engine + { + Id = "engine1", + Owner = "owner1", + SourceLanguage = "es", + TargetLanguage = "en", + Type = "Smt", + Corpora = new Models.Corpus[] + { + new() + { + Id = "corpus1", + SourceLanguage = "es", + 
TargetLanguage = "en", + SourceFiles = + [ + new() + { + Id = "file1", + Filename = "file1.txt", + Format = Shared.Contracts.FileFormat.Text, + TextId = "text1" + } + ], + TargetFiles = + [ + new() + { + Id = "file2", + Filename = "file2.txt", + Format = Shared.Contracts.FileFormat.Text, + TextId = "text1" + } + ], + }, + new() + { + Id = "corpus2", + SourceLanguage = "es", + TargetLanguage = "en", + SourceFiles = + [ + new() + { + Id = "file3", + Filename = "file3.txt", + Format = Shared.Contracts.FileFormat.Text, + TextId = "text1" + } + ], + TargetFiles = + [ + new() + { + Id = "file4", + Filename = "file4.txt", + Format = Shared.Contracts.FileFormat.Text, + TextId = "text1" + } + ], + } + } + }; + await Engines.InsertAsync(engine); + return engine; + } + public async Task CreateEngineWithParatextProjectAsync() { var engine = new Engine @@ -1760,6 +2369,107 @@ public async Task CreateParallelCorpusEngineWithTextFilesAsync() return engine; } + public async Task CreateMultipleParallelCorpusEngineWithTextFilesAsync() + { + var engine = new Engine + { + Id = "engine1", + Owner = "owner1", + SourceLanguage = "es", + TargetLanguage = "en", + Type = "Smt", + ParallelCorpora = new Models.ParallelCorpus[] + { + new() + { + Id = "parallel-corpus1", + SourceCorpora = new List() + { + new() + { + Id = "parallel-corpus1-source1", + Name = "", + Language = "es", + Files = + [ + new() + { + Id = "file1", + Filename = "file1.txt", + Format = Shared.Contracts.FileFormat.Text, + TextId = "MAT" + } + ] + } + }, + TargetCorpora = new List() + { + new() + { + Id = "parallel-corpus1-target1", + Name = "", + Language = "en", + Files = + [ + new() + { + Id = "file2", + Filename = "file2.txt", + Format = Shared.Contracts.FileFormat.Text, + TextId = "MAT" + } + ] + } + } + }, + new() + { + Id = "parallel-corpus2", + SourceCorpora = new List() + { + new() + { + Id = "parallel-corpus2-source1", + Name = "", + Language = "es", + Files = + [ + new() + { + Id = "file3", + Filename = 
"file3.txt", + Format = Shared.Contracts.FileFormat.Text, + TextId = "MRK" + } + ] + } + }, + TargetCorpora = new List() + { + new() + { + Id = "parallel-corpus2-target1", + Name = "", + Language = "en", + Files = + [ + new() + { + Id = "file4", + Filename = "file4.txt", + Format = Shared.Contracts.FileFormat.Text, + TextId = "MRK" + } + ] + } + } + } + } + }; + await Engines.InsertAsync(engine); + return engine; + } + public async Task CreateParallelCorpusEngineWithParatextProjectAsync() { var engine = new Engine diff --git a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs index cbdcb6ff..5aca4ed6 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs @@ -22,7 +22,7 @@ public class PretranslationServiceTests [Test] public async Task GetUsfmAsync_Source_PreferExisting() { - TestEnvironment env = new(); + using TestEnvironment env = new(); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferExisting, @@ -46,7 +46,7 @@ public async Task GetUsfmAsync_Source_PreferExisting() [Test] public async Task GetUsfmAsync_Source_PreferPretranslated() { - TestEnvironment env = new(); + using TestEnvironment env = new(); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferPretranslated, @@ -70,7 +70,7 @@ public async Task GetUsfmAsync_Source_PreferPretranslated() [Test] public async Task GetUsfmAsync_Source_OnlyExisting() { - TestEnvironment env = new(); + using TestEnvironment env = new(); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.OnlyExisting, @@ -94,7 +94,7 @@ public async Task GetUsfmAsync_Source_OnlyExisting() [Test] public async Task GetUsfmAsync_Source_OnlyPretranslated() { - TestEnvironment env = new(); + using TestEnvironment env = new(); string usfm = await 
env.GetUsfmAsync( PretranslationUsfmTextOrigin.OnlyPretranslated, @@ -118,7 +118,7 @@ public async Task GetUsfmAsync_Source_OnlyPretranslated() [Test] public async Task GetUsfmAsync_Target_PreferExisting() { - TestEnvironment env = new(); + using TestEnvironment env = new(); env.AddMatthewToTarget(); string usfm = await env.GetUsfmAsync( @@ -143,7 +143,7 @@ public async Task GetUsfmAsync_Target_PreferExisting() [Test] public async Task GetUsfmAsync_Target_PreferPretranslated() { - TestEnvironment env = new(); + using TestEnvironment env = new(); env.AddMatthewToTarget(); string usfm = await env.GetUsfmAsync( @@ -168,7 +168,7 @@ public async Task GetUsfmAsync_Target_PreferPretranslated() [Test] public async Task GetUsfmAsync_Target_TargetBookDoesNotExist() { - TestEnvironment env = new(); + using TestEnvironment env = new(); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferPretranslated, @@ -181,7 +181,7 @@ public async Task GetUsfmAsync_Target_TargetBookDoesNotExist() [Test] public async Task GetUsfmAsync_Auto_TargetBookDoesNotExist() { - TestEnvironment env = new(); + using TestEnvironment env = new(); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferPretranslated, @@ -205,7 +205,7 @@ public async Task GetUsfmAsync_Auto_TargetBookDoesNotExist() [Test] public async Task GetUsfmAsync_Auto_TargetBookExists() { - TestEnvironment env = new(); + using TestEnvironment env = new(); env.AddMatthewToTarget(); string usfm = await env.GetUsfmAsync( @@ -230,7 +230,7 @@ public async Task GetUsfmAsync_Auto_TargetBookExists() [Test] public async Task GetUsfmAsync_Target_OnlyExisting() { - TestEnvironment env = new(); + using TestEnvironment env = new(); env.AddMatthewToTarget(); string usfm = await env.GetUsfmAsync( @@ -244,7 +244,7 @@ public async Task GetUsfmAsync_Target_OnlyExisting() [Test] public async Task GetUsfmAsync_Target_OnlyPretranslated() { - TestEnvironment env = new(); + using TestEnvironment env = new(); 
env.AddMatthewToTarget(); string usfm = await env.GetUsfmAsync( @@ -266,10 +266,26 @@ public async Task GetUsfmAsync_Target_OnlyPretranslated() ); } - private class TestEnvironment + private class TestEnvironment : IDisposable { public TestEnvironment() { + CorpusFile file1 = + new() + { + Id = "file1", + Filename = "file1.zip", + Format = Shared.Contracts.FileFormat.Paratext, + TextId = "project1" + }; + CorpusFile file2 = + new() + { + Id = "file2", + Filename = "file2.zip", + Format = Shared.Contracts.FileFormat.Paratext, + TextId = "project1" + }; Engines = new MemoryRepository( [ new() @@ -287,29 +303,45 @@ public TestEnvironment() Id = "corpus1", SourceLanguage = "en", TargetLanguage = "en", - SourceFiles = - [ + SourceFiles = [file1], + TargetFiles = [file2], + } + ] + }, + new() + { + Id = "parallel_engine1", + Owner = "owner1", + SourceLanguage = "en", + TargetLanguage = "en", + Type = "nmt", + ModelRevision = 1, + ParallelCorpora = + [ + new() + { + Id = "parallel_corpus1", + SourceCorpora = new List() + { new() { - Id = "file1", - Filename = "file1.zip", - Format = Shared.Contracts.FileFormat.Paratext, - TextId = "project1" + Id = "src_1", + Language = "en", + Files = [file1], } - ], - TargetFiles = - [ + }, + TargetCorpora = new List() + { new() { - Id = "file2", - Filename = "file2.zip", - Format = Shared.Contracts.FileFormat.Paratext, - TextId = "project1" + Id = "trg_1", + Language = "es", + Files = [file2], } - ], + } } ] - } + }, ] ); @@ -334,6 +366,26 @@ public TestEnvironment() TextId = "MAT", Refs = ["MAT 1:2"], Translation = "Chapter 1, verse 2." + }, + new() + { + Id = "pt3", + EngineRef = "parallel_engine1", + ModelRevision = 1, + CorpusRef = "parallel_corpus1", + TextId = "MAT", + Refs = ["MAT 1:1"], + Translation = "Chapter 1, verse 1." + }, + new() + { + Id = "pt4", + EngineRef = "parallel_engine1", + ModelRevision = 1, + CorpusRef = "parallel_corpus1", + TextId = "MAT", + Refs = ["MAT 1:2"], + Translation = "Chapter 1, verse 2." 
} ] ); @@ -342,23 +394,37 @@ public TestEnvironment() ScriptureDataFileService.GetParatextProjectSettings("file2.zip").Returns(CreateProjectSettings("TRG")); var zipSubstituteSource = Substitute.For(); var zipSubstituteTarget = Substitute.For(); - zipSubstituteSource.OpenEntry("MATSRC.SFM").Returns(new MemoryStream(Encoding.UTF8.GetBytes(SourceUsfm))); - zipSubstituteTarget.OpenEntry("MATTRG.SFM").Returns(new MemoryStream(Encoding.UTF8.GetBytes(""))); + zipSubstituteSource + .OpenEntry("MATSRC.SFM") + .Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(SourceUsfm))); + zipSubstituteTarget.OpenEntry("MATTRG.SFM").Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(""))); zipSubstituteSource.EntryExists(Arg.Any()).Returns(false); zipSubstituteTarget.EntryExists(Arg.Any()).Returns(false); zipSubstituteSource.EntryExists("MATSRC.SFM").Returns(true); zipSubstituteTarget.EntryExists("MATTRG.SFM").Returns(true); TargetZipContainer = zipSubstituteTarget; - using var textUpdaterSource = new Shared.Services.ZipParatextProjectTextUpdater( - zipSubstituteSource, - CreateProjectSettings("SRC") - ); - using var textUpdaterTarget = new Shared.Services.ZipParatextProjectTextUpdater( - zipSubstituteTarget, - CreateProjectSettings("TRG") - ); - ScriptureDataFileService.GetZipParatextProjectTextUpdater("file1.zip").Returns(textUpdaterSource); - ScriptureDataFileService.GetZipParatextProjectTextUpdater("file2.zip").Returns(textUpdaterTarget); + TextUpdaters = new List(); + Shared.Services.ZipParatextProjectTextUpdater GetTextUpdater(string type) + { + var updater = type switch + { + "SRC" + => new Shared.Services.ZipParatextProjectTextUpdater( + zipSubstituteSource, + CreateProjectSettings("SRC") + ), + "TRG" + => new Shared.Services.ZipParatextProjectTextUpdater( + zipSubstituteTarget, + CreateProjectSettings("TRG") + ), + _ => throw new ArgumentException() + }; + TextUpdaters.Add(updater); + return updater; + } + 
ScriptureDataFileService.GetZipParatextProjectTextUpdater("file1.zip").Returns(x => GetTextUpdater("SRC")); + ScriptureDataFileService.GetZipParatextProjectTextUpdater("file2.zip").Returns(x => GetTextUpdater("TRG")); Service = new PretranslationService(Pretranslations, Engines, ScriptureDataFileService); } @@ -367,6 +433,7 @@ public TestEnvironment() public MemoryRepository Engines { get; } public IScriptureDataFileService ScriptureDataFileService { get; } public IZipContainer TargetZipContainer { get; } + public IList TextUpdaters { get; } public async Task GetUsfmAsync( PretranslationUsfmTextOrigin textOrigin, @@ -381,12 +448,25 @@ PretranslationUsfmTemplate template textOrigin: textOrigin, template: template ); - return usfm.Replace("\r\n", "\n"); + usfm = usfm.Replace("\r\n", "\n"); + string parallel_usfm = await Service.GetUsfmAsync( + engineId: "parallel_engine1", + modelRevision: 1, + corpusId: "parallel_corpus1", + textId: "MAT", + textOrigin: textOrigin, + template: template + ); + parallel_usfm = parallel_usfm.Replace("\r\n", "\n"); + Assert.That(parallel_usfm, Is.EqualTo(usfm)); + return usfm; } public void AddMatthewToTarget() { - TargetZipContainer.OpenEntry("MATTRG.SFM").Returns(new MemoryStream(Encoding.UTF8.GetBytes(TargetUsfm))); + TargetZipContainer + .OpenEntry("MATTRG.SFM") + .Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(TargetUsfm))); } private static ParatextProjectSettings CreateProjectSettings(string name) @@ -406,5 +486,13 @@ private static ParatextProjectSettings CreateProjectSettings(string name) languageCode: "en" ); } + + public void Dispose() + { + foreach (var updater in TextUpdaters) + { + updater.Dispose(); + } + } } } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IHealthChecksBuilderExtensions.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IHealthChecksBuilderExtensions.cs index 83fd6a21..14e4ba2a 100644 --- 
a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IHealthChecksBuilderExtensions.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IHealthChecksBuilderExtensions.cs @@ -1,6 +1,4 @@ -using SIL.ServiceToolkit.Services; - -namespace Microsoft.Extensions.DependencyInjection; +namespace Microsoft.Extensions.DependencyInjection; public static class IHealthChecksBuilderExtensions { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs new file mode 100644 index 00000000..d5a6424f --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs @@ -0,0 +1,11 @@ +namespace Microsoft.Extensions.DependencyInjection; + +public static class IServiceCollectionExtensions +{ + public static IServiceCollection AddParallelCorpusPreprocessor(this IServiceCollection services) + { + services.AddSingleton(); + services.AddSingleton(); + return services; + } +} diff --git a/src/Machine/src/Serval.Machine.Shared/Models/CorpusFile.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/CorpusFile.cs similarity index 84% rename from src/Machine/src/Serval.Machine.Shared/Models/CorpusFile.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Models/CorpusFile.cs index a84bf7f6..65e45202 100644 --- a/src/Machine/src/Serval.Machine.Shared/Models/CorpusFile.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/CorpusFile.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Models; +namespace SIL.ServiceToolkit.Models; public enum FileFormat { diff --git a/src/Machine/src/Serval.Machine.Shared/Models/MonolingualCorpus.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs similarity index 92% rename from src/Machine/src/Serval.Machine.Shared/Models/MonolingualCorpus.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs index 2b4a1612..c0323727 
100644 --- a/src/Machine/src/Serval.Machine.Shared/Models/MonolingualCorpus.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Models; +namespace SIL.ServiceToolkit.Models; public record MonolingualCorpus { diff --git a/src/Machine/src/Serval.Machine.Shared/Models/ParallelCorpus.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs similarity index 87% rename from src/Machine/src/Serval.Machine.Shared/Models/ParallelCorpus.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs index a28dfc14..83374162 100644 --- a/src/Machine/src/Serval.Machine.Shared/Models/ParallelCorpus.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Models; +namespace SIL.ServiceToolkit.Models; public record ParallelCorpus { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/Row.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/Row.cs new file mode 100644 index 00000000..5b43e1fe --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/Row.cs @@ -0,0 +1,3 @@ +namespace SIL.ServiceToolkit.Models; + +public record Row(string TextId, IReadOnlyList Refs, string SourceSegment, string TargetSegment, int RowCount); diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj index a84edf58..f9476b69 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/SIL.ServiceToolkit.csproj @@ -16,6 +16,12 @@ + + + + + + diff --git a/src/Machine/src/Serval.Machine.Shared/Services/CorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs similarity index 97% rename from src/Machine/src/Serval.Machine.Shared/Services/CorpusService.cs rename to 
src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs index 17d562ad..71d49a50 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/CorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusService.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Services; +namespace SIL.ServiceToolkit.Services; public class CorpusService : ICorpusService { diff --git a/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs similarity index 81% rename from src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs index bbcc9de3..babe8c9b 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/ICorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ICorpusService.cs @@ -1,4 +1,4 @@ -namespace Serval.Machine.Shared.Services; +namespace SIL.ServiceToolkit.Services; public interface ICorpusService { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs new file mode 100644 index 00000000..1556de6d --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs @@ -0,0 +1,11 @@ +namespace SIL.ServiceToolkit.Utils; + +public interface IParallelCorpusPreprocessingService +{ + void Preprocess( + IReadOnlyList corpora, + Action train, + Action pretranslate, + bool useKeyTerms = false + ); +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs new file mode 100644 index 00000000..e75a2d59 --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs @@ -0,0 +1,222 @@ +namespace 
SIL.ServiceToolkit.Services; + +public class ParallelCorpusPreprocessingService : IParallelCorpusPreprocessingService +{ + private readonly ICorpusService _corpusService; + private int _seed = 1234; + private Random _random; + + public ParallelCorpusPreprocessingService(ICorpusService corpusService) + { + _corpusService = corpusService; + _random = new Random(_seed); + } + + internal int Seed + { + get => _seed; + set + { + if (_seed != value) + { + _seed = value; + _random = new Random(_seed); + } + } + } + + public void Preprocess( + IReadOnlyList corpora, + Action train, + Action pretranslate, + bool useKeyTerms = false + ) + { + foreach (ParallelCorpus corpus in corpora) + { + (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] sourceCorpora = corpus + .SourceCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) + .ToArray(); + + if (sourceCorpora.Length == 0) + continue; + + ITextCorpus[] sourceTrainingCorpora = sourceCorpora + .Select(sc => FilterTrainingCorpora(sc.Corpus, sc.TextCorpus)) + .ToArray(); + + ITextCorpus[] sourcePretranslateCorpora = sourceCorpora + .Select(sc => FilterPretranslateCorpora(sc.Corpus, sc.TextCorpus)) + .ToArray(); + + (MonolingualCorpus Corpus, ITextCorpus TextCorpus)[] targetCorpora = corpus + .TargetCorpora.SelectMany(c => _corpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc))) + .ToArray(); + + ITextCorpus[] targetTrainingCorpora = targetCorpora + .Select(tc => FilterTrainingCorpora(tc.Corpus, tc.TextCorpus)) + .ToArray(); + + ITextCorpus sourceTrainingCorpus = sourceTrainingCorpora.ChooseRandom(Seed); + if (sourceTrainingCorpus.IsScripture()) + { + sourceTrainingCorpus = sourceTrainingCorpus.Where(IsScriptureRow); + } + + ITextCorpus targetCorpus = targetTrainingCorpora.ChooseFirst(); + + ITextCorpus targetTrainingCorpus = targetCorpus; + if (targetTrainingCorpus.IsScripture()) + { + targetTrainingCorpus = targetTrainingCorpus.Where(IsScriptureRow); + } + + ParallelTextRow[] 
trainingRows = sourceTrainingCorpus + .AlignRows(targetTrainingCorpus, allSourceRows: true, allTargetRows: true) + .ToArray(); + + foreach (Row row in CollapseRanges(trainingRows)) + { + train(row); + } + + if (useKeyTerms) + { + ITextCorpus? sourceTermCorpus = _corpusService + .CreateTermCorpora(corpus.SourceCorpora.SelectMany(sc => sc.Files).ToList()) + .FirstOrDefault(); + ITextCorpus? targetTermCorpus = _corpusService + .CreateTermCorpora(corpus.TargetCorpora.SelectMany(tc => tc.Files).ToList()) + .FirstOrDefault(); + if (sourceTermCorpus is not null && targetTermCorpus is not null) + { + IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpus.AlignRows(targetTermCorpus); + foreach (ParallelTextRow row in parallelKeyTermsCorpus) + { + train(new Row(row.TextId, row.Refs, row.SourceText, row.TargetText, 1)); + } + } + } + ITextCorpus sourcePretranslateCorpus = sourcePretranslateCorpora.ChooseFirst(); + + IParallelTextCorpus pretranslateCorpus = sourcePretranslateCorpus.AlignRows( + targetCorpus, + allSourceRows: true + ); + + foreach (Row row in CollapseRanges(pretranslateCorpus.ToArray())) + { + pretranslate(row, corpus); + } + } + } + + private static ITextCorpus FilterPretranslateCorpora(MonolingualCorpus corpus, ITextCorpus textCorpus) + { + textCorpus = textCorpus.Transform(CleanSegment); + if (corpus.PretranslateTextIds is not null) + { + return textCorpus.FilterTexts(corpus.PretranslateTextIds); + } + if (corpus.PretranslateChapters is not null) + { + return textCorpus + .FilterTexts(corpus.PretranslateChapters.Keys) + .Where(row => row.Ref is not ScriptureRef sr || IsInChapters(sr, corpus.PretranslateChapters)); + } + return textCorpus; + } + + private static ITextCorpus FilterTrainingCorpora(MonolingualCorpus corpus, ITextCorpus textCorpus) + { + textCorpus = textCorpus.Transform(CleanSegment); + if (corpus.TrainOnTextIds is not null) + { + return textCorpus.FilterTexts(corpus.TrainOnTextIds); + } + if (corpus.TrainOnChapters is not null) + { + 
return textCorpus + .FilterTexts(corpus.TrainOnChapters.Keys) + .Where(row => row.Ref is not ScriptureRef sr || IsInChapters(sr, corpus.TrainOnChapters)); + } + return textCorpus; + } + + private static IEnumerable CollapseRanges(ParallelTextRow[] rows) + { + StringBuilder srcSegBuffer = new(); + StringBuilder trgSegBuffer = new(); + List refs = []; + string textId = ""; + bool hasUnfinishedRange = false; + + foreach (ParallelTextRow row in rows) + { + if ( + hasUnfinishedRange + && (!row.IsTargetInRange || row.IsTargetRangeStart) + && (!row.IsSourceInRange || row.IsSourceRangeStart) + ) + { + yield return new Row(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); + + srcSegBuffer.Clear(); + trgSegBuffer.Clear(); + refs.Clear(); + + hasUnfinishedRange = false; + } + + textId = row.TextId; + refs.AddRange(row.TargetRefs); + if (row.SourceText.Length > 0) + { + if (srcSegBuffer.Length > 0) + srcSegBuffer.Append(' '); + srcSegBuffer.Append(row.SourceText); + } + if (row.TargetText.Length > 0) + { + if (trgSegBuffer.Length > 0) + trgSegBuffer.Append(' '); + trgSegBuffer.Append(row.TargetText); + } + + if (row.IsTargetInRange || row.IsSourceInRange) + { + hasUnfinishedRange = true; + continue; + } + + yield return new Row(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); + + srcSegBuffer.Clear(); + trgSegBuffer.Clear(); + refs.Clear(); + } + if (hasUnfinishedRange) + { + yield return new Row(textId, refs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); + } + } + + private static bool IsScriptureRow(TextRow parallelTextRow) + { + return parallelTextRow.Ref is ScriptureRef sr && sr.IsVerse; + } + + private static bool IsInChapters(ScriptureRef sr, Dictionary> selection) + { + return selection.TryGetValue(sr.Book, out HashSet? 
chapters) + && chapters != null + && (chapters.Count == 0 || chapters.Contains(sr.ChapterNum)); + } + + private static TextRow CleanSegment(TextRow row) + { + if (row.Text == "...") + row.Segment = []; + return row; + } +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs index 0d9630d6..a5800d9f 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs @@ -1,4 +1,5 @@ global using System.Diagnostics.CodeAnalysis; +global using System.Text; global using System.Text.Json.Nodes; global using System.Text.RegularExpressions; global using Grpc.Core; @@ -9,4 +10,8 @@ global using Microsoft.Extensions.Hosting; global using Microsoft.Extensions.Logging; global using Microsoft.Extensions.Options; +global using SIL.Machine.Corpora; +global using SIL.ServiceToolkit.Models; +global using SIL.ServiceToolkit.Services; +global using SIL.ServiceToolkit.Utils; global using SIL.WritingSystems; diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/SIL.ServiceToolkit.Tests.csproj b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/SIL.ServiceToolkit.Tests.csproj new file mode 100644 index 00000000..0b5ceff0 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/SIL.ServiceToolkit.Tests.csproj @@ -0,0 +1,33 @@ + + + + net8.0 + enable + enable + SIL.ServiceToolkit + + + + + runtime; build; native; contentfiles; analyzers; buildtransitive + all + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs new file mode 100644 index 00000000..543332e2 --- /dev/null +++ 
b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs @@ -0,0 +1,96 @@ +namespace SIL.ServiceToolkit.Services; + +[TestFixture] +public class ParallelCorpusPreprocessingServiceTests +{ + private static readonly string TestDataPath = Path.Combine( + AppContext.BaseDirectory, + "..", + "..", + "..", + "Services", + "data" + ); + + [Test] + public void TestParallelCorpusPreprocessor() + { + ParallelCorpusPreprocessingService processor = new(new CorpusService()); + List corpora = + [ + new() + { + Id = "corpus1", + SourceCorpora = + [ + new() + { + Id = "source-corpus1", + Language = "en", + Files = + [ + new() + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "source1.txt") + } + ] + }, + new() + { + Id = "source-corpus2", + Language = "en", + Files = + [ + new() + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "source2.txt") + } + ] + } + ], + TargetCorpora = + [ + new() + { + Id = "target-corpus1", + Language = "en", + Files = + [ + new() + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "target1.txt") + } + ] + } + ] + } + ]; + int trainCount = 0; + int pretranslateCount = 0; + processor.Preprocess( + corpora, + row => + { + if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) + trainCount++; + }, + (row, _) => + { + if (row.SourceSegment.Length > 0 && row.TargetSegment.Length == 0) + pretranslateCount++; + }, + false + ); + Assert.Multiple(() => + { + Assert.That(trainCount, Is.EqualTo(2)); + Assert.That(pretranslateCount, Is.EqualTo(3)); + }); + } +} diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source1.txt b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source1.txt new file mode 100644 index 00000000..2aeb971c --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source1.txt @@ -0,0 +1,7 @@ +Source 
one, Line 1 +Source one, Line 2 + +Source one, Line 4 + +Source one, Line 6 + diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source2.txt b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source2.txt new file mode 100644 index 00000000..7f4a0669 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/source2.txt @@ -0,0 +1,7 @@ +Source two, Line 1 +Source two, Line 2 + +Source two, Line 4 +Source two, Line 5 +Source two, Line 6 + diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/target1.txt b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/target1.txt new file mode 100644 index 00000000..816e9435 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/data/target1.txt @@ -0,0 +1,7 @@ +Target one, Line 1 + + +Target one, Line 4 + + +Target one, Line 7 diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs new file mode 100644 index 00000000..e1c24c5f --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs @@ -0,0 +1,2 @@ +global using NUnit.Framework; +global using SIL.ServiceToolkit.Models;