Improved CSVsubsetSplit method

Additionally, a bug with the delimiter sniffer that prevented dialects from being properly determined has been fixed.
ws-garcia · Jan 19, 2024 · 51e3af6 · 51e3af6
1 parent e9f74af
commit 51e3af6
Show file tree

Hide file tree

Showing 12 changed files with 420 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -421,7 +421,7 @@ Visit [this site](https://ws-garcia.github.io/VBA-CSV-interface/limitations/csv_
 
 ## Licence
 
-Copyright (C) 2020-2023  [W. García](https://github.com/ws-garcia/).
+Copyright (C) 2020-2024  [W. García](https://github.com/ws-garcia/).
 
 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 

diff --git a/docs/api/methods/csvsubsetsplit.md b/docs/api/methods/csvsubsetsplit.md
@@ -15,7 +15,7 @@ Splits the CSV data into a set of files in which each piece has a related portio
 
 ## Syntax
 
-*expression*.`CSVsubsetSplit`*(filePath, \[subsetColumns:= 1\], \[headers:= True\], \[repeatHeaders:= True\], \[streamSize:= 20])*
+*expression*.`CSVsubsetSplit`*(filePath, \[subsetColumns:= 1\], \[headers:= True\], \[repeatHeaders:= True\], \[streamSize:= 20\], \[oConfig:=Nothing\])*
 
 ### Parameters
 
@@ -47,6 +47,10 @@ Splits the CSV data into a set of files in which each piece has a related portio
 <td style="text-align: left;"><em>streamSize</em></td>
 <td style="text-align: left;">Optional. Identifier specifying a <code>Long</code> Type variable representing the buffer size factor used to read the target CSV file.</td>
 </tr>
+<tr>
+<td style="text-align: left;"><em>oConfig</em></td>
+<td style="text-align: left;">Optional. Identifier specifying a <code>CSVparserConfig</code> Object variable holding all the configurations to parse the CSV file.</td>
+</tr>
 </tbody>
 </table>
 
@@ -60,6 +64,11 @@ Splits the CSV data into a set of files in which each piece has a related portio
 
 The `CSVsubsetSplit` method will create a file for each different value (data grouping) in the fields at the *subsetColumns* position, then all related data is appended to the respective file. Use the *headers* parameter to include a header record in each new CSV file. The *subsetColumns* parameter can be a single value or an array of `Long` values.  When the CSV file has a header record and the user sets the *header* parameter to `False`, the header row is saved in a separate file and the rest of CSV files will have no header record. The user can control when to include the headers by using the *repeatHeaders* parameter.
 
+>⚠️**Caution**
+>{: .text-grey-lt-000 .bg-green-000 }
+>The user shall verify that when subdividing a CSV file using a text field/column there are no records with special characters not supported by the file system of the operating system.
+{: .text-grey-dk-300 .bg-yellow-000 }
+
 >📝**Note**
 >{: .text-grey-lt-000 .bg-green-000 }
 >The result subsets will be saved in a folder named [\*-WorkDir], where (\*) denotes the name of the source CSV file.

diff --git a/docs/home/getting_started.md b/docs/home/getting_started.md
@@ -209,7 +209,7 @@ The images below shows the overall performance for the imports operations from t
 - The CSV syntax slow-down the performance. When the number of escaped fields are increased, the performance decrease.
 
 ## Licence
-Copyright (C) 2020-2023  [W. García](https://github.com/ws-garcia/).
+Copyright (C) 2020-2024  [W. García](https://github.com/ws-garcia/).
 
 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 

diff --git a/src/Access_version.zip b/src/Access_version.zip
diff --git a/src/All_Host_version.zip b/src/All_Host_version.zip
diff --git a/src/CSVSniffer.cls b/src/CSVSniffer.cls
@@ -486,19 +486,20 @@ End Function
 Private Function RecordScore(ByRef strArray As Variant) As Double
     Dim L0 As Long
     Dim tmpSUM As Double
-    Dim FielCount As Long
+    Dim SumLEN As Double
+    Dim FieldsCount As Long
 
-    FielCount = 1 + UBound(strArray) - LBound(strArray)
+    FieldsCount = 1 + UBound(strArray) - LBound(strArray)
     tmpSUM = 0
     For L0 = LBound(strArray) To UBound(strArray)
         Select Case DetectDataType(strArray(L0))
             Case FieldDataType.Known
                 tmpSUM = tmpSUM + 100
             Case Else
-                tmpSUM = tmpSUM + 20
+                tmpSUM = tmpSUM + 0.1 '20
         End Select
     Next L0
-    RecordScore = tmpSUM / FielCount
+    RecordScore = (tmpSUM / FieldsCount)
 End Function
 ''' <summary>
 ''' Calculates a factor for table scoring based in the standard
@@ -540,7 +541,7 @@ Public Function TableScore(ByRef ArrayList As CSVArrayList) As Double
             For L0 = 0 To ArrayList.count - 1
                 SumRecScores = SumRecScores + RecordScore(ArrayList(L0))
             Next L0
-            TableScore = RecordsConsistencyFactor(ArrayList) * SumRecScores
+            TableScore = RecordsConsistencyFactor(ArrayList) * SumRecScores / ArrayList.count
         End If
     End If
 End Function
diff --git a/src/CSVinterface.cls b/src/CSVinterface.cls
@@ -29,6 +29,8 @@ Private Const CHR_BACKSLASH As String = "\"
 Private Const CHR_DOUBLE_QUOTES As String = """"
 Private Const CHR_TILDE As String = "~"
 Private Const CHR_CARET As String = "^"
+Private Const CHR_LSQRB As String = "{"
+Private Const CHR_RSQRB As String = "}"
 '////////////////////////////////////////////////////////////////////////////////////////////
 '#
 '////////////////////////////////////////////////////////////////////////////////////////////
@@ -300,7 +302,8 @@ Public Function CSVsubsetSplit(filePath As String, _
                                 Optional subsetColumns As Variant = 1, _
                                 Optional Headers As Boolean = True, _
                                 Optional repeatHeaders As Boolean = True, _
-                                Optional streamSize As Long = 20) As Collection
+                                Optional streamSize As Long = 20, _
+                                Optional oConfig As CSVparserConfig = Nothing) As Collection
     Dim CreatedFiles As Collection
     Dim CSVhead As Variant
     Dim CSVreader As CSVinterface
@@ -322,6 +325,11 @@ Public Function CSVsubsetSplit(filePath As String, _
 
     Set CreatedFiles = New Collection
     Set CSVreader = New CSVinterface
+    If Not oConfig Is Nothing Then
+        Set CSVreader.parseConfig = oConfig.CopyConfig
+    Else
+        Set CSVreader.parseConfig = Me.parseConfig.CopyConfig
+    End If
     Set CSVwriter = New CSVinterface
     Set readerConf = CSVreader.parseConfig
     Set ExportSubSet = New CSVArrayList
@@ -348,6 +356,7 @@ Public Function CSVsubsetSplit(filePath As String, _
         'Sequential reading from file
         With CSVstream
             .endStreamOnLineBreak = True
+            .unifiedLFOutput = readerConf.multiEndOfLineCSV
             .OpenStream filePath
             .utf8EncodedFile = readerConf.utf8EncodedFile
             .bufferSize = streamSize
@@ -3162,9 +3171,11 @@ Private Function SniffInString(ByRef confObject As CSVparserConfig, _
                             '@--------------------------------------------------------------------------------
                             'Save results with keys
                             If InStrB(1, TmpCSVstr, GetQuoteChar(QuoteChar(k))) Then
-                                ScoreArray.AddIndexedItem DialectToString(.dialect), GuesserHelper.TableScore(ImportedTable)
+                                ScoreArray.AddIndexedItem AppendIndexesToKey(DialectToString(.dialect), i, j), _
+                                                            GuesserHelper.TableScore(ImportedTable)
                             Else
-                                ScoreArray.AddIndexedItem DialectToString(.dialect) & CHR_CARET, GuesserHelper.TableScore(ImportedTable) / 2
+                                ScoreArray.AddIndexedItem AppendIndexesToKey(DialectToString(.dialect) & CHR_CARET, i, j), _
+                                                            GuesserHelper.TableScore(ImportedTable) '/ 2
                             End If
                         Next k
                 Next j
@@ -3173,7 +3184,7 @@ Private Function SniffInString(ByRef confObject As CSVparserConfig, _
     With ScoreArray
         '@--------------------------------------------------------------------------------
         'Choose the maximum score
-        tmpResult = .keys()(MaxIndexVal(.indexedItems))
+        tmpResult = RemoveIndexesFromKey(.keys()(MaxIndexVal(.indexedItems)))
         '@--------------------------------------------------------------------------------
         'Returns
         Set SniffInString = StringToDialect(tmpResult)
@@ -3218,6 +3229,12 @@ Private Function DialectToString(ByRef dialectObj As CSVdialect) As String
     End With
     DialectToString = Join$(tmpResult, "ii")
 End Function
+Private Function AppendIndexesToKey(ByRef aKey As String, idx1 As Long, idx2 As Long) As String
+    AppendIndexesToKey = CHR_LSQRB & idx1 & idx2 & CHR_RSQRB & aKey
+End Function
+Private Function RemoveIndexesFromKey(ByRef aKey As Variant) As String
+    RemoveIndexesFromKey = MidB(aKey, InStrB(1, aKey, CHR_RSQRB) + 2)
+End Function
 Private Function StringToDialect(ByRef dialectString As String) As CSVdialect
     Dim tmpArr() As String
     Dim idx As Long

diff --git a/testing/tests/Test runner [CSVinterface-TDD].xlsm b/testing/tests/Test runner [CSVinterface-TDD].xlsm
diff --git a/testing/tests/results/CSV delimiter guessing test - 18-ene-2024 23-28-31.txt b/testing/tests/results/CSV delimiter guessing test - 18-ene-2024 23-28-31.txt
@@ -0,0 +1,22 @@
+=== Delimiters guessing test ===
++ Mixed comma and semicolon
++ File with multi-line field
++ Optional quoted fields
++ Mixed comma and semicolon - file B
++ Geometric CSV
++ Table embedded in the last record
++ Table embedded in the second record
++ Multiple commas in fields
++ Uncommon char as field delimiter
++ Wrong delimiters have been added to guessing operation
++ FEC data - [clevercsv issue #15]
++ Mixed comma and colon - [clevercsv issue #35]
++ Json data type - [clevercsv issue #37]
++ Undefined field delimiter
+X Rainbow CSV [issue #92]
+  Expected: ([,] & [2])Actual: ([|] & [2])
++ Pipe character is more frequent than the comma
++ Pipe character is more frequent than the semicolon
++ Short pipe separated table embedded
+= FAIL (1 of 18 failed) = 18/1/2024 11:28:32 p.�m. =
+
diff --git a/testing/tests/results/CSV import test - 18-ene-2024 20-39-49.txt b/testing/tests/results/CSV import test - 18-ene-2024 20-39-49.txt
@@ -0,0 +1,163 @@
+=== StreamCSVimport ===
++ Bad comments value specified
++ Comment with non-default character
++ Commented line at beginning
++ Commented line at end
++ Commented line in middle
++ Entire file is comment lines
++ Input is just a string (a single field)
++ Input is just empty fields
++ Input is just the delimiter (2 empty fields)
++ Input with only a commented line and blank line after
++ Input with only a commented line, without comments enabled
++ Input without comments with line starting with whitespace
++ Line ends with quoted field
++ Line starts with quoted field
++ Misplaced quotes in data, not as opening quotes
++ Multiple consecutive empty fields
++ Multiple rows, one column (no delimiter found)
++ One column input with empty fields
++ One Row
++ Pipe delimiter
++ Quoted field at end of row (but not at EOF) has quotes
++ Quoted field has no closing quote
++ Quoted field with 5 quotes in a row and a delimiter
++ Quoted field with delimiter
++ Quoted field with escaped quotes at boundaries
++ Quoted field with escaped quotes
++ Quoted field with extra whitespace on edges
++ Quoted field with line break
++ Quoted field with quotes around delimiter
++ Quoted field with quotes on left side of delimiter
++ Quoted field with quotes on right side of delimiter
++ Quoted field with Unix escaped quotes at boundaries
++ Quoted field with whitespace around quotes
++ Quoted field
++ Quoted fields at end of row with delimiter and line break
+X Quoted fields with line breaks
+  Expected 3 fields and 1 record
++ Row with enough fields but blank field at end
++ Row with too few fields
++ Row with too many fields
++ Skip empty lines, with empty input
++ Skip empty lines, with first line only whitespace
++ Skip empty lines, with newline at end of input
++ Tab delimiter
++ Three comment lines consecutively at beginning of file
++ Two comment lines consecutively at end of file
++ Two comment lines consecutively
++ Two rows
++ Unquoted field with quotes at end of field
++ Whitespace at edges of unquoted field
++ Complex CSV syntax
+= FAIL (1 of 50 failed) = 18/1/2024 8:40:43 p.�m. =
+
+=== StringCSVimport ===
++ Bad comments value specified
++ Comment with non-default character
++ Commented line at beginning
++ Commented line at end
++ Commented line in middle
++ Entire file is comment lines
++ Input is just a string (a single field)
++ Input is just empty fields
++ Input is just the delimiter (2 empty fields)
++ Input with only a commented line and blank line after
++ Input with only a commented line, without comments enabled
++ Input without comments with line starting with whitespace
++ Line ends with quoted field
++ Line starts with quoted field
++ Misplaced quotes in data, not as opening quotes
++ Multiple consecutive empty fields
++ Multiple rows, one column (no delimiter found)
++ One column input with empty fields
++ One Row
++ Pipe delimiter
++ Quoted field at end of row (but not at EOF) has quotes
++ Quoted field has no closing quote
++ Quoted field with 5 quotes in a row and a delimiter
++ Quoted field with delimiter
++ Quoted field with escaped quotes at boundaries
++ Quoted field with escaped quotes
++ Quoted field with extra whitespace on edges
++ Quoted field with line break
++ Quoted field with quotes around delimiter
++ Quoted field with quotes on left side of delimiter
++ Quoted field with quotes on right side of delimiter
++ Quoted field with Unix escaped quotes at boundaries
++ Quoted field with whitespace around quotes
++ Quoted field
++ Quoted fields at end of row with delimiter and line break
+X Quoted fields with line breaks
+  Expected 3 fields and 1 record
++ Row with enough fields but blank field at end
++ Row with too few fields
++ Row with too many fields
++ Skip empty lines, with empty input
++ Skip empty lines, with first line only whitespace
++ Skip empty lines, with newline at end of input
++ Tab delimiter
++ Three comment lines consecutively at beginning of file
++ Two comment lines consecutively at end of file
++ Two comment lines consecutively
++ Two rows
++ Unquoted field with quotes at end of field
++ Whitespace at edges of unquoted field
++ Complex CSV syntax
+= FAIL (1 of 50 failed) = 18/1/2024 8:40:47 p.�m. =
+
+=== SequentialCSVimport ===
++ Bad comments value specified
++ Comment with non-default character
++ Commented line at beginning
++ Commented line at end
++ Commented line in middle
++ Entire file is comment lines
++ Input is just a string (a single field)
++ Input is just empty fields
++ Input is just the delimiter (2 empty fields)
++ Input with only a commented line and blank line after
+X Input with only a commented line, without comments enabled
+  Expected 1 records with 1 fields
++ Input without comments with line starting with whitespace
++ Line ends with quoted field
++ Line starts with quoted field
++ Misplaced quotes in data, not as opening quotes
++ Multiple consecutive empty fields
++ Multiple rows, one column (no delimiter found)
++ One column input with empty fields
++ One Row
++ Pipe delimiter
++ Quoted field at end of row (but not at EOF) has quotes
++ Quoted field has no closing quote
++ Quoted field with 5 quotes in a row and a delimiter
++ Quoted field with delimiter
++ Quoted field with escaped quotes at boundaries
++ Quoted field with escaped quotes
++ Quoted field with extra whitespace on edges
++ Quoted field with line break
++ Quoted field with quotes around delimiter
++ Quoted field with quotes on left side of delimiter
++ Quoted field with quotes on right side of delimiter
++ Quoted field with Unix escaped quotes at boundaries
++ Quoted field with whitespace around quotes
++ Quoted field
++ Quoted fields at end of row with delimiter and line break
+X Quoted fields with line breaks
+  Expected 3 fields and 1 record
++ Row with enough fields but blank field at end
++ Row with too few fields
++ Row with too many fields
++ Skip empty lines, with empty input
++ Skip empty lines, with first line only whitespace
++ Skip empty lines, with newline at end of input
++ Tab delimiter
++ Three comment lines consecutively at beginning of file
++ Two comment lines consecutively at end of file
++ Two comment lines consecutively
++ Two rows
++ Unquoted field with quotes at end of field
++ Whitespace at edges of unquoted field
++ Complex CSV syntax
+= FAIL (2 of 50 failed) = 18/1/2024 8:40:50 p.�m. =
+
diff --git a/testing/tests/results/CSV import test - 18-ene-2024 20-45-54.txt b/testing/tests/results/CSV import test - 18-ene-2024 20-45-54.txt
@@ -0,0 +1,36 @@
+=== StreamCSVimport ===
++ Bad comments value specified
++ Comment with non-default character
++ Commented line at beginning
++ Commented line at end
++ Commented line in middle
++ Entire file is comment lines
++ Input is just a string (a single field)
++ Input is just empty fields
++ Input is just the delimiter (2 empty fields)
++ Input with only a commented line and blank line after
++ Input with only a commented line, without comments enabled
++ Input without comments with line starting with whitespace
++ Line ends with quoted field
++ Line starts with quoted field
++ Misplaced quotes in data, not as opening quotes
++ Multiple consecutive empty fields
++ Multiple rows, one column (no delimiter found)
++ One column input with empty fields
++ One Row
++ Pipe delimiter
++ Quoted field at end of row (but not at EOF) has quotes
++ Quoted field has no closing quote
++ Quoted field with 5 quotes in a row and a delimiter
++ Quoted field with delimiter
++ Quoted field with escaped quotes at boundaries
++ Quoted field with escaped quotes
++ Quoted field with extra whitespace on edges
++ Quoted field with line break
++ Quoted field with quotes around delimiter
++ Quoted field with quotes on left side of delimiter
++ Quoted field with quotes on right side of delimiter
++ Quoted field with Unix escaped quotes at boundaries
++ Quoted field with whitespace around quotes
++ Quoted field
++ Quoted fields at end of row with delimiter and line break