Skip to content

Commit

Permalink
SMT on ClearML:
Browse files Browse the repository at this point in the history
* Replace CPU, GPU types with just Hangfire vs ClearML as well as engine type
* Allow each engine type to have its own queue and docker image
* SMT now broken into preprocessing, train and postprocessing, just like Nmt
* SMT build defaults on ClearML
* NMT local train removed
* Use .zip for SMT model moving
* Update cleanup script for SMT models
  • Loading branch information
johnml1135 committed May 20, 2024
1 parent f4b27e5 commit 5aac7fa
Show file tree
Hide file tree
Showing 47 changed files with 1,337 additions and 1,137 deletions.
3 changes: 1 addition & 2 deletions src/SIL.Machine.AspNetCore/Configuration/BuildJobOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,5 @@ public class BuildJobOptions
{
public const string Key = "BuildJob";

public Dictionary<BuildJobType, BuildJobRunner> Runners { get; set; } =
new() { { BuildJobType.Cpu, BuildJobRunner.Hangfire }, { BuildJobType.Gpu, BuildJobRunner.ClearML } };
public IList<ClearMLBuildJobOptions> ClearML { get; set; } = new List<ClearMLBuildJobOptions>();
}
11 changes: 11 additions & 0 deletions src/SIL.Machine.AspNetCore/Configuration/ClearMLBuildJobOptions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
namespace SIL.Machine.AspNetCore.Configuration;

/// <summary>
/// ClearML build job settings for a single translation engine type.
/// </summary>
public class ClearMLBuildJobOptions
{
/// <summary>
/// Key name for these options.
/// NOTE(review): no configuration lookup using this key is visible here — confirm it is used.
/// </summary>
public const string Key = "ClearMLBuildJob";

/// <summary>The translation engine type these settings belong to.</summary>
public TranslationEngineType TranslationEngineType { get; set; }
/// <summary>The model type for this engine type; defaults to an empty string.</summary>
public string ModelType { get; set; } = "";
/// <summary>The queue name for this engine type; defaults to "default".</summary>
public string Queue { get; set; } = "default";
/// <summary>The docker image for this engine type; defaults to an empty string.</summary>
public string DockerImage { get; set; } = "";
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
namespace SIL.Machine.AspNetCore.Configuration;

/// <summary>
/// Options object carrying an engine type name together with a queue, model type and docker image.
/// NOTE(review): no consumer of this class is visible here, and its shape mirrors
/// ClearMLBuildJobOptions with a string engine type — confirm whether both are needed.
/// </summary>
public class ClearMLEngineTypeOptions
{
/// <summary>
/// Key name for these options.
/// NOTE(review): no configuration lookup using this key is visible here — confirm it is used.
/// </summary>
public const string Key = "ClearMLEngineType";

/// <summary>The engine type, as a string; defaults to an empty string.</summary>
public string EngineType { get; set; } = "";
/// <summary>The queue name; defaults to "default".</summary>
public string Queue { get; set; } = "default";
/// <summary>The model type; defaults to an empty string.</summary>
public string ModelType { get; set; } = "";
/// <summary>The docker image; defaults to an empty string.</summary>
public string DockerImage { get; set; } = "";
}
3 changes: 0 additions & 3 deletions src/SIL.Machine.AspNetCore/Configuration/ClearMLOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,10 @@ public class ClearMLOptions
{
public const string Key = "ClearML";

public string Queue { get; set; } = "default";
public string AccessKey { get; set; } = "";
public string SecretKey { get; set; } = "";
public bool BuildPollingEnabled { get; set; } = false;
public TimeSpan BuildPollingTimeout { get; set; } = TimeSpan.FromSeconds(10);
public string ModelType { get; set; } = "huggingface";
public string RootProject { get; set; } = "Machine";
public string Project { get; set; } = "dev";
public string DockerImage { get; set; } = "";
}
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,21 @@ public static IMachineBuilder AddSharedFileOptions(this IMachineBuilder builder,
return builder;
}

/// <summary>
/// Registers a delegate that configures <see cref="BuildJobOptions"/> on the builder's service collection.
/// </summary>
/// <param name="builder">The machine builder whose services are being configured.</param>
/// <param name="configureOptions">The delegate passed to <c>Services.Configure</c>.</param>
/// <returns>The same <paramref name="builder"/> instance, so calls can be chained.</returns>
public static IMachineBuilder AddBuildJobOptions(
this IMachineBuilder builder,
Action<BuildJobOptions> configureOptions
)
{
builder.Services.Configure(configureOptions);
return builder;
}

/// <summary>
/// Registers <see cref="BuildJobOptions"/> on the builder's service collection, bound to the given configuration.
/// </summary>
/// <param name="builder">The machine builder whose services are being configured.</param>
/// <param name="config">The configuration passed to <c>Services.Configure&lt;BuildJobOptions&gt;</c>.</param>
/// <returns>The same <paramref name="builder"/> instance, so calls can be chained.</returns>
public static IMachineBuilder AddBuildJobOptions(this IMachineBuilder builder, IConfiguration config)
{
builder.Services.Configure<BuildJobOptions>(config);
return builder;
}

public static IMachineBuilder AddThotSmtModel(this IMachineBuilder builder)
{
if (builder.Configuration is null)
Expand Down Expand Up @@ -131,26 +146,6 @@ public static IMachineBuilder AddClearMLService(this IMachineBuilder builder, st
return builder;
}

/// <summary>
/// Registers the ClearML build job runner, the NMT ClearML job factory, and the ClearML monitor service.
/// </summary>
/// <param name="builder">The machine builder whose services are being configured.</param>
/// <returns>The same <paramref name="builder"/> instance, so calls can be chained.</returns>
private static IMachineBuilder AddClearMLBuildJobRunner(this IMachineBuilder builder)
{
    builder
        .Services.AddScoped<IBuildJobRunner, ClearMLBuildJobRunner>()
        .AddScoped<IClearMLBuildJobFactory, NmtClearMLBuildJobFactory>()
        // The monitor is a singleton; the hosted-service registration resolves that same instance.
        .AddSingleton<ClearMLMonitorService>()
        .AddHostedService(provider => provider.GetRequiredService<ClearMLMonitorService>());

    return builder;
}

/// <summary>
/// Registers the Hangfire build job runner together with the SMT transfer and NMT Hangfire job factories.
/// </summary>
/// <param name="builder">The machine builder whose services are being configured.</param>
/// <returns>The same <paramref name="builder"/> instance, so calls can be chained.</returns>
private static IMachineBuilder AddHangfireBuildJobRunner(this IMachineBuilder builder)
{
    builder
        .Services.AddScoped<IBuildJobRunner, HangfireBuildJobRunner>()
        .AddScoped<IHangfireBuildJobFactory, SmtTransferHangfireBuildJobFactory>()
        .AddScoped<IHangfireBuildJobFactory, NmtHangfireBuildJobFactory>();

    return builder;
}

private static MongoStorageOptions GetMongoStorageOptions()
{
var mongoStorageOptions = new MongoStorageOptions
Expand Down Expand Up @@ -200,6 +195,7 @@ public static IMachineBuilder AddHangfireJobServer(
switch (engineType)
{
case TranslationEngineType.SmtTransfer:
builder.Services.AddSingleton<SmtTransferEngineStateService>();
builder.AddThotSmtModel().AddTransferEngine().AddUnigramTruecaser();
queues.Add("smt_transfer");
break;
Expand Down Expand Up @@ -360,34 +356,21 @@ public static IMachineBuilder AddServalTranslationEngineService(
return builder;
}

/// <summary>
/// Registers the build job service using options produced by a configuration delegate.
/// </summary>
/// <param name="builder">The machine builder whose services are being configured.</param>
/// <param name="configureOptions">
/// The delegate that populates <see cref="BuildJobOptions"/>. It is registered with the service collection
/// and also invoked once here on a fresh instance.
/// </param>
/// <returns>The result of the <c>AddBuildJobService</c> overload that takes a <see cref="BuildJobOptions"/>.</returns>
public static IMachineBuilder AddBuildJobService(
this IMachineBuilder builder,
Action<BuildJobOptions> configureOptions
)
{
builder.Services.Configure(configureOptions);
// Build a local copy of the options now so the options-based overload can be called during registration.
var options = new BuildJobOptions();
configureOptions(options);
return builder.AddBuildJobService(options);
}

/// <summary>
/// Registers the build job service using settings read from <paramref name="config"/>.
/// </summary>
/// <param name="builder">The machine builder whose services are being configured.</param>
/// <param name="config">
/// The configuration section that holds the <see cref="BuildJobOptions"/> values themselves (the same
/// object is handed to <c>Configure&lt;BuildJobOptions&gt;</c>), not a parent that contains that section.
/// </param>
/// <returns>The result of the <c>AddBuildJobService</c> overload that takes a <see cref="BuildJobOptions"/>.</returns>
public static IMachineBuilder AddBuildJobService(this IMachineBuilder builder, IConfiguration config)
{
    builder.Services.Configure<BuildJobOptions>(config);

    // Bind from the same section that was registered just above. Looking up BuildJobOptions.Key
    // inside it again would select a nested child section, leaving this instance at its defaults,
    // so the options handed to the overload below would not reflect the configured values.
    var buildJobOptions = new BuildJobOptions();
    config.Bind(buildJobOptions);
    return builder.AddBuildJobService(buildJobOptions);
}

public static IMachineBuilder AddBuildJobService(this IMachineBuilder builder)
{
if (builder.Configuration is null)
{
builder.AddBuildJobService(o => { });
}
else
if (builder.Configuration is not null)
{
builder.AddBuildJobService(builder.Configuration.GetSection(BuildJobOptions.Key));
builder.Services.AddScoped<IBuildJobService, BuildJobService>();

builder.Services.AddScoped<IBuildJobRunner, ClearMLBuildJobRunner>();
builder.Services.AddScoped<IClearMLBuildJobFactory, NmtClearMLBuildJobFactory>();
builder.Services.AddScoped<IClearMLBuildJobFactory, SmtTransferClearMLBuildJobFactory>();
builder.Services.AddSingleton<ClearMLMonitorService>();
builder.Services.AddHostedService(p => p.GetRequiredService<ClearMLMonitorService>());

builder.Services.AddScoped<IBuildJobRunner, HangfireBuildJobRunner>();
builder.Services.AddScoped<IHangfireBuildJobFactory, NmtHangfireBuildJobFactory>();
builder.Services.AddScoped<IHangfireBuildJobFactory, SmtTransferHangfireBuildJobFactory>();

var smtTransferEngineOptions = new SmtTransferEngineOptions();
builder.Configuration.GetSection(SmtTransferEngineOptions.Key).Bind(smtTransferEngineOptions);
Expand All @@ -412,23 +395,4 @@ public static IMachineBuilder AddModelCleanupService(this IMachineBuilder builde
builder.Services.AddHostedService<ModelCleanupService>();
return builder;
}

/// <summary>
/// Registers the build job service and one runner registration per distinct runner kind found in the options.
/// </summary>
/// <param name="builder">The machine builder whose services are being configured.</param>
/// <param name="options">The options whose <c>Runners</c> values decide which runners are registered.</param>
/// <returns>The same <paramref name="builder"/> instance, so calls can be chained.</returns>
private static IMachineBuilder AddBuildJobService(this IMachineBuilder builder, BuildJobOptions options)
{
    builder.Services.AddScoped<IBuildJobService, BuildJobService>();

    // Several job types may map to the same runner; Distinct ensures each runner is added only once.
    IEnumerable<BuildJobRunner> runnerKinds = options.Runners.Values.Distinct();
    foreach (BuildJobRunner kind in runnerKinds)
    {
        if (kind == BuildJobRunner.ClearML)
        {
            builder.AddClearMLBuildJobRunner();
        }
        else if (kind == BuildJobRunner.Hangfire)
        {
            builder.AddHangfireBuildJobRunner();
        }
    }

    return builder;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf
builder.AddSharedFileOptions(o => { });
builder.AddSmtTransferEngineOptions(o => { });
builder.AddClearMLOptions(o => { });
builder.AddBuildJobOptions(o => { });
}
else
{
builder.AddServiceOptions(configuration.GetSection(ServiceOptions.Key));
builder.AddSharedFileOptions(configuration.GetSection(SharedFileOptions.Key));
builder.AddSmtTransferEngineOptions(configuration.GetSection(SmtTransferEngineOptions.Key));
builder.AddClearMLOptions(configuration.GetSection(ClearMLOptions.Key));
builder.AddBuildJobOptions(configuration.GetSection(BuildJobOptions.Key));
}
return builder;
}
Expand Down
13 changes: 10 additions & 3 deletions src/SIL.Machine.AspNetCore/Models/Build.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,25 @@ public enum BuildJobState
Canceling
}

public enum BuildJobRunner
public enum JobRunnerType
{
Hangfire,
ClearML
}

/// <summary>
/// The stages of a build: preprocess, train, and postprocess.
/// </summary>
public enum BuildStage
{
Preprocess,
Train,
Postprocess
}

public record Build
{
public required string BuildId { get; init; }
public required BuildJobState JobState { get; init; }
public required string JobId { get; init; }
public required BuildJobRunner JobRunner { get; init; }
public required string Stage { get; init; }
public required JobRunnerType JobRunner { get; init; }
public required BuildStage Stage { get; init; }
public string? Options { get; set; }
}
1 change: 1 addition & 0 deletions src/SIL.Machine.AspNetCore/Models/TranslationEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ public record TranslationEngine : IEntity
public string Id { get; set; } = "";
public int Revision { get; set; } = 1;
public required string EngineId { get; init; }
public required TranslationEngineType Type { get; init; } = TranslationEngineType.Nmt;
public required string SourceLanguage { get; init; }
public required string TargetLanguage { get; init; }
public required bool IsModelPersisted { get; init; }
Expand Down
51 changes: 16 additions & 35 deletions src/SIL.Machine.AspNetCore/Services/BuildJobService.cs
Original file line number Diff line number Diff line change
@@ -1,31 +1,18 @@
namespace SIL.Machine.AspNetCore.Services;

public class BuildJobService : IBuildJobService
public class BuildJobService(IEnumerable<IBuildJobRunner> runners, IRepository<TranslationEngine> engines)
: IBuildJobService
{
private readonly Dictionary<BuildJobType, IBuildJobRunner> _runnersByJobType;
private readonly Dictionary<BuildJobRunner, IBuildJobRunner> _runners;
private readonly IRepository<TranslationEngine> _engines;

public BuildJobService(
IEnumerable<IBuildJobRunner> runners,
IRepository<TranslationEngine> engines,
IOptions<BuildJobOptions> options
)
{
_runners = runners.ToDictionary(r => r.Type);
_runnersByJobType = new Dictionary<BuildJobType, IBuildJobRunner>();
foreach (KeyValuePair<BuildJobType, BuildJobRunner> kvp in options.Value.Runners)
_runnersByJobType.Add(kvp.Key, _runners[kvp.Value]);
_engines = engines;
}
private readonly Dictionary<JobRunnerType, IBuildJobRunner> _runners = runners.ToDictionary(r => r.Type);
private readonly IRepository<TranslationEngine> _engines = engines;

public Task<bool> IsEngineBuilding(string engineId, CancellationToken cancellationToken = default)
{
return _engines.ExistsAsync(e => e.EngineId == engineId && e.CurrentBuild != null, cancellationToken);
}

public Task<IReadOnlyList<TranslationEngine>> GetBuildingEnginesAsync(
BuildJobRunner runner,
JobRunnerType runner,
CancellationToken cancellationToken = default
)
{
Expand All @@ -49,38 +36,32 @@ public Task<IReadOnlyList<TranslationEngine>> GetBuildingEnginesAsync(
}

public async Task CreateEngineAsync(
IEnumerable<BuildJobType> jobTypes,
string engineId,
string? name = null,
CancellationToken cancellationToken = default
)
{
foreach (BuildJobType jobType in jobTypes)
foreach (JobRunnerType runnerType in _runners.Keys)
{
IBuildJobRunner runner = _runnersByJobType[jobType];
IBuildJobRunner runner = _runners[runnerType];
await runner.CreateEngineAsync(engineId, name, cancellationToken);
}
}

public async Task DeleteEngineAsync(
IEnumerable<BuildJobType> jobTypes,
string engineId,
CancellationToken cancellationToken = default
)
public async Task DeleteEngineAsync(string engineId, CancellationToken cancellationToken = default)
{
foreach (BuildJobType jobType in jobTypes)
foreach (JobRunnerType runnerType in _runners.Keys)
{
IBuildJobRunner runner = _runnersByJobType[jobType];
IBuildJobRunner runner = _runners[runnerType];
await runner.DeleteEngineAsync(engineId, cancellationToken);
}
}

public async Task<bool> StartBuildJobAsync(
BuildJobType jobType,
TranslationEngineType engineType,
JobRunnerType runnerType,
string engineId,
string buildId,
string stage,
BuildStage stage,
object? data = null,
string? buildOptions = null,
CancellationToken cancellationToken = default
Expand All @@ -97,10 +78,10 @@ public async Task<bool> StartBuildJobAsync(
{
return false;
}

IBuildJobRunner runner = _runnersByJobType[jobType];
TranslationEngine engine = (await _engines.GetAsync(e => e.EngineId == engineId, cancellationToken))!;
IBuildJobRunner runner = _runners[runnerType];
string jobId = await runner.CreateJobAsync(
engineType,
engine.Type,
engineId,
buildId,
stage,
Expand All @@ -127,7 +108,7 @@ await _engines.UpdateAsync(
),
cancellationToken: cancellationToken
);
await runner.EnqueueJobAsync(jobId, cancellationToken);
await runner.EnqueueJobAsync(jobId, engine.Type, cancellationToken);
return true;
}
catch
Expand Down
27 changes: 21 additions & 6 deletions src/SIL.Machine.AspNetCore/Services/ClearMLBuildJobRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,18 @@

public class ClearMLBuildJobRunner(
IClearMLService clearMLService,
IEnumerable<IClearMLBuildJobFactory> buildJobFactories
IEnumerable<IClearMLBuildJobFactory> buildJobFactories,
IOptionsMonitor<BuildJobOptions> options
) : IBuildJobRunner
{
private readonly IClearMLService _clearMLService = clearMLService;
private readonly Dictionary<TranslationEngineType, IClearMLBuildJobFactory> _buildJobFactories =
buildJobFactories.ToDictionary(f => f.EngineType);

public BuildJobRunner Type => BuildJobRunner.ClearML;
private readonly Dictionary<TranslationEngineType, ClearMLBuildJobOptions> _options =
options.CurrentValue.ClearML.ToDictionary(o => o.TranslationEngineType);

public JobRunnerType Type => JobRunnerType.ClearML;

public async Task CreateEngineAsync(
string engineId,
Expand All @@ -31,7 +35,7 @@ public async Task<string> CreateJobAsync(
TranslationEngineType engineType,
string engineId,
string buildId,
string stage,
BuildStage stage,
object? data = null,
string? buildOptions = null,
CancellationToken cancellationToken = default
Expand All @@ -48,22 +52,33 @@ public async Task<string> CreateJobAsync(
string script = await buildJobFactory.CreateJobScriptAsync(
engineId,
buildId,
_options[engineType].ModelType,
stage,
data,
buildOptions,
cancellationToken
);
return await _clearMLService.CreateTaskAsync(buildId, projectId, script, cancellationToken);
return await _clearMLService.CreateTaskAsync(
buildId,
projectId,
script,
_options[engineType].DockerImage,
cancellationToken
);
}

public Task<bool> DeleteJobAsync(string jobId, CancellationToken cancellationToken = default)
{
return _clearMLService.DeleteTaskAsync(jobId, cancellationToken);
}

public Task<bool> EnqueueJobAsync(string jobId, CancellationToken cancellationToken = default)
public Task<bool> EnqueueJobAsync(
string jobId,
TranslationEngineType engineType,
CancellationToken cancellationToken = default
)
{
return _clearMLService.EnqueueTaskAsync(jobId, cancellationToken);
return _clearMLService.EnqueueTaskAsync(jobId, _options[engineType].Queue, cancellationToken);
}

public Task<bool> StopJobAsync(string jobId, CancellationToken cancellationToken = default)
Expand Down
Loading

0 comments on commit 5aac7fa

Please sign in to comment.