From eec69be88216dbe9946c27a08136c744d0b88328 Mon Sep 17 00:00:00 2001 From: James Friel Date: Wed, 23 Aug 2023 13:08:12 +0100 Subject: [PATCH] make parallel --- .../DataFlowComponents/CHIColumnFinder.cs | 65 ++++++++++++++----- 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/HICPlugin/DataFlowComponents/CHIColumnFinder.cs b/HICPlugin/DataFlowComponents/CHIColumnFinder.cs index 7283156..4dce092 100644 --- a/HICPlugin/DataFlowComponents/CHIColumnFinder.cs +++ b/HICPlugin/DataFlowComponents/CHIColumnFinder.cs @@ -13,6 +13,8 @@ using Rdmp.Core.ReusableLibraryCode; using Rdmp.Core.ReusableLibraryCode.Checks; using Rdmp.Core.ReusableLibraryCode.Progress; +using System.Threading.Tasks; +using System.Diagnostics; namespace HICPluginInteractive.DataFlowComponents; @@ -32,8 +34,8 @@ public class CHIColumnFinder : IPluginDataFlowComponent, IPipelineReq [DemandsInitialization("By default all columns from the source will be checked for valid CHIs. Set this to a list of headers (separated with a comma) to ignore the specified columns.", DemandType = DemandType.Unspecified)] public string IgnoreColumns { - get => string.Join(',',_columnWhitelist); - set => _columnWhitelist=(value ?? "").Split(',').Select(s=>s.Trim()).ToList(); + get => string.Join(',', _columnWhitelist); + set => _columnWhitelist = (value ?? "").Split(',').Select(s => s.Trim()).ToList(); } private bool _firstTime = true; @@ -45,6 +47,8 @@ public string IgnoreColumns public DataTable ProcessPipelineData(DataTable toProcess, IDataLoadEventListener listener, GracefulCancellationToken cancellationToken) { + // var sw = new Stopwatch(); + // sw.Start(); if (OverrideUntil.HasValue && OverrideUntil.Value > DateTime.Now) { if (_firstTime) @@ -69,20 +73,48 @@ public DataTable ProcessPipelineData(DataTable toProcess, IDataLoadEventListener _columnWhitelist.AddRange(ignoreColumnsArray); } - var batchRowCount = 0; - toProcess.BeginLoadData(); - string[] knownBadColumns = Array.Empty(); - var dtRows = toProcess.Rows.Cast().ToArray(); - foreach (var row in dtRows) + // var batchRowCount = 0; + // toProcess.BeginLoadData(); + // string[] knownBadColumns = Array.Empty(); + // var dtRows = toProcess.Rows.Cast().ToArray(); + // foreach (var row in dtRows) + // { + // foreach (var col in toProcess.Columns.Cast().Where(col => !_columnWhitelist.Contains(col.ColumnName.Trim()) && !knownBadColumns.Contains(col.ColumnName))) + // { + // bool containsValidCHI = ContainsValidChi(row[col]); + // if (!containsValidCHI) continue; + // knownBadColumns.Append(col.ColumnName); + + // if (_activator?.IsInteractive == true && ShowUIComponents) + // DoTheMessageBoxDance(toProcess, listener, col, row, batchRowCount); + // else + // { + // var message = + // $"Column {col.ColumnName} in Dataset {toProcess.TableName} appears to contain a CHI ({row[col]})"; + // _foundChiList.Add(message); + // listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning, message)); + // if (!_isTableAlreadyNamed) + // listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning, + // "DataTable has not been named. If you want to know the dataset that the error refers to please add an ExtractCatalogueMetadata to the extraction pipeline.")); + // } + // } + + // batchRowCount++; + // } + // toProcess.EndLoadData(); + // sw.Stop(); + // var sw2 = new Stopwatch(); + // sw2.Start(); + Parallel.For(0, toProcess.Rows.Count, rowIndex => { - foreach (var col in toProcess.Columns.Cast().Where(col => !_columnWhitelist.Contains(col.ColumnName.Trim()) && !knownBadColumns.Contains(col.ColumnName))) + DataRow row = toProcess.Rows[rowIndex]; + foreach (var col in toProcess.Columns.Cast().Where(col => !_columnWhitelist.Contains(col.ColumnName.Trim()))) { bool containsValidCHI = ContainsValidChi(row[col]); - if(!containsValidCHI) continue; - knownBadColumns.Append(col.ColumnName); + if (!containsValidCHI) continue; if (_activator?.IsInteractive == true && ShowUIComponents) - DoTheMessageBoxDance(toProcess, listener, col, row, batchRowCount); + DoTheMessageBoxDance(toProcess, listener, col, row, rowIndex); else { var message = @@ -94,15 +126,14 @@ public DataTable ProcessPipelineData(DataTable toProcess, IDataLoadEventListener "DataTable has not been named. If you want to know the dataset that the error refers to please add an ExtractCatalogueMetadata to the extraction pipeline.")); } } - - batchRowCount++; - } - toProcess.EndLoadData(); + }); + // sw2.Stop(); + // Console.WriteLine(string.Format("{0} {1} {2}", toProcess.TableName, sw.Elapsed, sw2.Elapsed)); return toProcess; } - - private void DoTheMessageBoxDance(DataTable toProcess, IDataLoadEventListener listener, DataColumn col, DataRow row, int batchRowCount) + + private void DoTheMessageBoxDance(DataTable toProcess, IDataLoadEventListener listener, DataColumn col, DataRow row, int batchRowCount) { if (_activator.IsInteractive && _activator.YesNo( $"Column {col.ColumnName} in Dataset {(_isTableAlreadyNamed ? toProcess.TableName : "UNKNOWN (you need an ExtractCatalogueMetadata in the pipeline to get a proper name)")} appears to contain a CHI ({row[col]})\n\nWould you like to view the current batch of data?", "Suspected CHI Column"))