Skip to content

Commit

Permalink
Fix CHIfinder
Browse files Browse the repository at this point in the history
Flush and dispose results file when done writing; add unit test that result was reported.
  • Loading branch information
jas88 committed Nov 9, 2023
1 parent a77326b commit 4f63e34
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 38 deletions.
82 changes: 44 additions & 38 deletions HICPlugin/DataFlowComponents/CHIColumnFinder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public DataTable ProcessPipelineData(DataTable toProcess, IDataLoadEventListener
string fileLocation = null;
if (OutputFileDirectory?.Exists == true)
{
var CHIDir = System.IO.Path.Combine(OutputFileDirectory.FullName, "FoundCHIs");
var CHIDir = Path.Combine(OutputFileDirectory.FullName, "FoundCHIs");
if (!Directory.Exists(CHIDir)) Directory.CreateDirectory(CHIDir);
fileLocation = Path.Combine(CHIDir, $"{toProcess.TableName}{_potentialChiLocationFileDescriptor}");
if (File.Exists(fileLocation) && BailOutAfter>0)
Expand Down Expand Up @@ -104,30 +104,42 @@ public DataTable ProcessPipelineData(DataTable toProcess, IDataLoadEventListener
listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information,
$"You have chosen the following columns to be ignored: {string.Join(",", columnGreenList)}"));

foreach (var col in toProcess.Columns.Cast<DataColumn>().Where(c => !columnGreenList.Contains(c.ColumnName.Trim())))
try
{
foreach (var val in toProcess.Rows.Cast<DataRow>().Select(DeRef).AsParallel().Where(ContainsValidChi))
foreach (var col in toProcess.Columns.Cast<DataColumn>().Where(c => !columnGreenList.Contains(c.ColumnName.Trim())))
{
Interlocked.Increment(ref count);
if (BailOutAfter > 0 && count >= BailOutAfter) break;

listFile.Value?.WriteLine($"{col.ColumnName},{GetPotentialCHI(val)},{val}");
if (VerboseLogging || string.IsNullOrWhiteSpace(fileLocation))
foreach (var val in toProcess.Rows.Cast<DataRow>().Select(DeRef).AsParallel().Where(ContainsValidChi))
{
listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning,
$"Column {col.ColumnName} in Dataset {toProcess.TableName} appears to contain a CHI ({val})"));
if (!_isTableAlreadyNamed)
Interlocked.Increment(ref count);
if (BailOutAfter > 0 && count >= BailOutAfter) break;

listFile.Value?.WriteLine($"{col.ColumnName},{GetPotentialCHI(val)},{val}");
if (VerboseLogging || string.IsNullOrWhiteSpace(fileLocation))
{
listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning,
"DataTable has not been named. If you want to know the dataset that the error refers to please add an ExtractCatalogueMetadata to the extraction pipeline."));
$"Column {col.ColumnName} in Dataset {toProcess.TableName} appears to contain a CHI ({val})"));
if (!_isTableAlreadyNamed)
listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning,
"DataTable has not been named. If you want to know the dataset that the error refers to please add an ExtractCatalogueMetadata to the extraction pipeline."));
}
}
}
if (count != 0 && VerboseLogging) listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information, $"Have Written {count} Potential CHIs to {fileLocation}"));
if (count != 0 && VerboseLogging) listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information, $"Have Written {count} Potential CHIs to {fileLocation}"));

continue;
continue;

[NotNull]
string DeRef([NotNull] DataRow row) => row[col].ToString() ?? "";
[NotNull]
string DeRef([NotNull] DataRow row) => row[col].ToString() ?? "";
}
}
finally
{
if (listFile.IsValueCreated && listFile.Value is not null)
{
listFile.Value?.Flush();
listFile.Value?.Dispose();
}
}

if (count>0 && OutputFileDirectory?.Exists == true)
{
listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information, $"{count} CHIs have been found in your extraction. Find them in {OutputFileDirectory.FullName}"));
Expand Down Expand Up @@ -395,13 +407,14 @@ private static bool ContainsValidChi([CanBeNull] object toCheck)
public void PreInitialize(IExtractCommand value, IDataLoadEventListener listener)
{
if (value is not ExtractDatasetCommand edcs) return;

OutputFileDirectory = value.GetExtractionDirectory();
try
{
var hashOnReleaseColumns = edcs.Catalogue.CatalogueItems.Select(static ci => ci.ExtractionInformation)
.Where(static ei => ei?.HashOnDataRelease == true).Select(static ei => ei.GetRuntimeName()).ToArray();

if (!hashOnReleaseColumns.Any() && String.IsNullOrWhiteSpace(AllowListFile)) return;
if (!hashOnReleaseColumns.Any() && string.IsNullOrWhiteSpace(AllowListFile)) return;

if (hashOnReleaseColumns.Length > 0)
{
Expand All @@ -411,32 +424,25 @@ public void PreInitialize(IExtractCommand value, IDataLoadEventListener listener

if (File.Exists(AllowListFile) && _allowLists.Count == 0)
{
string allowListFileContent = File.ReadAllText(AllowListFile);
var allowListFileContent = File.ReadAllText(AllowListFile);
var deserializer = new DeserializerBuilder().Build();
var yamlObject = deserializer.Deserialize<Dictionary<Object, Object>>(allowListFileContent);
foreach (var kvp in yamlObject)
var yamlObject = deserializer.Deserialize<Dictionary<string, List<string>>>(allowListFileContent);
foreach (var (catalogue, columns) in yamlObject)
{
string catalogue = kvp.Key.ToString();
List<string> columns = new();
foreach (var column in kvp.Value as List<Object>)
{
columns.Add(column.ToString());
}
_allowLists.Add(catalogue, columns);
}
}
if (hashOnReleaseColumns.Any())

if (!hashOnReleaseColumns.Any()) return;

if (_allowLists.TryGetValue("RDMP_ALL", out var allowAllList))
{
allowAllList.AddRange(hashOnReleaseColumns);
_allowLists["RDMP_ALL"] = allowAllList;
}
else
{
bool exists = _allowLists.TryGetValue("RDMP_ALL", out var allowAllList);
if (exists)
{
allowAllList.AddRange(hashOnReleaseColumns);
_allowLists["RDMP_ALL"] = allowAllList;
}
else
{
_allowLists.Add("RDMP_ALL", hashOnReleaseColumns.ToList());
}
_allowLists.Add("RDMP_ALL", hashOnReleaseColumns.ToList());
}
}
catch (Exception e)
Expand Down
37 changes: 37 additions & 0 deletions HICPluginTests/Unit/CHIColumnFinderTests.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
using System;
using System.Data;
using System.IO;
using System.Linq;
using HICPluginInteractive.DataFlowComponents;
using NUnit.Framework;
using Rdmp.Core.Curation.Data;
using Rdmp.Core.DataExport.Data;
using Rdmp.Core.DataExport.DataExtraction.Commands;
using Rdmp.Core.DataExport.DataExtraction.UserPicks;
using Rdmp.Core.MapsDirectlyToDatabaseTable;
using Rdmp.Core.Repositories;
using Rdmp.Core.ReusableLibraryCode.Progress;
using Tests.Common.Scenarios;

Expand Down Expand Up @@ -57,4 +64,34 @@ public void TestDataWithCHIs(string toCheck, bool expectedToBeChi)
Assert.DoesNotThrow(() => _chiFinder.ProcessPipelineData(toProcess, _listener, null));
}

/// <summary>
/// Runs <see cref="CHIColumnFinder"/> over a one-row table, with the project's extraction directory
/// set to the NUnit work directory, and checks that the finding was written to a results file whose
/// name ends in "_Potential_CHI_Locations.csv".
/// </summary>
[Test]
public void TestFile()
{
    const string resultsSuffix = "_Potential_CHI_Locations.csv";
    var workDirectory = TestContext.CurrentContext.WorkDirectory;

    // Results files left behind by an earlier run would let the final assertion pass even if this
    // run wrote (or flushed) nothing, so remove them before exercising the component.
    foreach (var stale in FindResultsFiles())
    {
        File.Delete(stale);
    }

    var dataRepo = new MemoryDataExportRepository();
    var ds = new ExtractableDataSet(dataRepo, new Catalogue(dataRepo, "cat"));
    var project = new Project(dataRepo, "test")
    {
        ExtractionDirectory = workDirectory
    };
    var ec = new ExtractionConfiguration(dataRepo, project, "testConfig");
    ec.AddDatasetToConfiguration(ds);
    foreach (var ecSelectedDataSet in ec.SelectedDataSets)
    {
        ecSelectedDataSet.SaveToDatabase();
    }

    var cf = new CHIColumnFinder();
    var bundle = new ExtractableDatasetBundle(ds);
    var cmd = new ExtractDatasetCommand(ec, bundle);
    cf.PreInitialize(cmd, ThrowImmediatelyDataLoadEventListener.NoisyPicky);

    using var toProcess = new DataTable();
    toProcess.Columns.Add("CHI");
    toProcess.Rows.Add(new object[] { 1111111111 });
    Assert.DoesNotThrow(() => cf.ProcessPipelineData(toProcess, _listener, null));

    // Fail with a readable message, rather than an InvalidOperationException from First(),
    // when no results file was produced at all.
    var result = FindResultsFiles().FirstOrDefault();
    Assert.That(result, Is.Not.Null,
        $"No file ending in '{resultsSuffix}' was written under '{workDirectory}'");
    Assert.Contains("CHI,1111111111,1111111111", File.ReadLines(result).ToList());

    // Every CSV beneath the work directory whose name carries the results-file suffix.
    string[] FindResultsFiles() => Directory
        .GetFiles(workDirectory, "*.csv", SearchOption.AllDirectories)
        .Where(static name => name.EndsWith(resultsSuffix, StringComparison.Ordinal))
        .ToArray();
}
}

0 comments on commit 4f63e34

Please sign in to comment.