From 4042183973086c5a3cd131446b4d72e34a0b3f01 Mon Sep 17 00:00:00 2001
From: theo-barfoot <44871137+theo-barfoot@users.noreply.github.com>
Date: Wed, 15 May 2024 17:06:28 +0100
Subject: [PATCH] separate hashing with no concat and fix "hash" option (#11)
feat: Enhance CSV processing with individual column hashing and additional options
- Modified the existing functionality to hash each selected column separately instead of concatenating them.
- Added new columns with the original column name suffixed by "_hash" for each hashed column.
- Implemented options to manage columns as follows:
- "Hash": Adds a new column with the hashed value and retains the original column.
- "Hash and exclude": Adds a new column with the hashed value and removes the original column from the unidentifiable spreadsheet.
- "Exclude": Removes the original column only from the unidentifiable spreadsheet but retains it in the original_with_hash spreadsheet.
- "Keep": Retains the original column in both spreadsheets without any changes.
- Added a new section with a text input box and a button to hash the input using SHA-1. Displays the first 10 characters of the hash below the input box when the button is clicked.
- Updated the `myForm` submission event listener to handle separate hashing of each column.
- Updated the `updateSelect` function to handle UI changes for each option.
- Ensured that new columns with hashed values are added correctly based on the selected options.
This commit enhances the usability and flexibility of the CSV processing tool, allowing users to better manage and anonymize their data.
---
README.md | 4 +--
index.html | 72 ++++++++++++++++++++--------------------------------
testcsv2.csv | 3 +++
3 files changed, 32 insertions(+), 47 deletions(-)
create mode 100644 testcsv2.csv
diff --git a/README.md b/README.md
index 53fc7f4..cf72663 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Navigate to the [Pseudo-Anonymised ID Generator](https://cai4cai.ml/sha1inbrowse
The application currently supports `.csv` files only. Make sure your data is saved in this format before proceeding. Select your file by clicking on `Choose File`.
### Step 3: Configure Data Columns
-Assign roles to each column in your dataset, specifying which should be hashed, excluded, or kept. Specifically, assign the NHS number column to `Hash and Exclude` and all identifiable information to `Exclude`. Relevant clinical information, but not patient identifiable, can be assigned to `Keep`. Any junk that you do not want, in the anonymised spreadsheet, can be set to `Exclude`.
+Assign roles to each column in your dataset, specifying which should be hashed, excluded, or kept. Specifically, assign the NHS number column to `Hash and Exclude` and all identifiable information to `Exclude`. Relevant clinical information, but not patient identifiable, can be assigned to `Keep`. Any junk that you do not want, in the anonymised spreadsheet, can be set to `Exclude`. Columns that you want both hashed and kept in their original form in the anonymised spreadsheet can be set to `Hash`.
### Step 4: Process the Data
Click `Process` to start the local hashing. This generates two `.csv` files:
@@ -33,7 +33,7 @@ Click `Process` to start the local hashing. This generates two `.csv` files:
### Data Handling
- **Internal Use**: Keep `original_with_hash.csv` within the NHS trust.
-- **External Sharing**: `unidentifiable.csv` can be shared with researchers at KCL, following approved ethics and data sharing agreements.
+- **External Sharing**: `unidentifiable.csv` is the anonymised spreadsheet and can be shared with researchers at KCL, following approved ethics and data sharing agreements.
## Support and Contact
For any inquiries or support requests, please contact:
diff --git a/index.html b/index.html
index 6ca839e..8a779a7 100644
--- a/index.html
+++ b/index.html
@@ -25,7 +25,6 @@
margin-bottom: 10px;
text-align: center;
}
-
@@ -125,7 +124,7 @@ Hash Input
const csvFile = document.getElementById("csvFile");
const table = document.getElementById("previewTable");
- const hashColour = "#a3ffff"
+ const hashColour = "#a3ffff";
const excludeColour = "#ffa6a6";
const stripeWidth = 6;
@@ -134,59 +133,59 @@ Hash Input
const input = csvFile.files[0];
const reader = new FileReader();
- // 👇 executed when a file is loaded
reader.onload = async function (e) {
- // 👇 get the text from CSV file
const text = e.target.result;
- // 👇 parse it using D3.js
const data = d3.csvParse(text);
const dataWithHash = d3.csvParse(text);
- // Find what to do with each column
const toHash = [];
+ const toHashAndExclude = [];
const toExclude = [];
data.columns.forEach(function(column) {
const value = document.getElementById("select_" + column).value;
switch (value) {
- case "Hash and keep":
+ case "Hash":
toHash.push(column);
break;
case "Hash and exclude":
- toHash.push(column);
- toExclude.push(column);
+ toHashAndExclude.push(column);
break;
case "Exclude":
toExclude.push(column);
break;
+ case "Keep":
+ // Keep option, do nothing
+ break;
}
});
- console.log(toHash);
- console.log(toExclude);
for (let i = 0; i < data.length; i++) {
- // Concatenate strings for hashing
- let hashKey = "";
- toHash.forEach(function(column) {
- hashKey += data[i][column];
- })
- // Hash
- console.log("Hashing: " + hashKey);
- let hash = await sha1(hashKey);
- console.log("SHA-1: " + hash);
-
- data[i]["Hash"] = hash;
- dataWithHash[i]["Hash"] = hash;
+ // Hash and add columns for "Hash" and "Hash and exclude" options
+ for (let j = 0; j < toHash.length; j++) {
+ const column = toHash[j];
+ const hashKey = data[i][column];
+ const hash = await sha1(hashKey);
+ data[i][`${column}_hash`] = hash;
+ dataWithHash[i][`${column}_hash`] = hash;
+ }
+ for (let j = 0; j < toHashAndExclude.length; j++) {
+ const column = toHashAndExclude[j];
+ const hashKey = data[i][column];
+ const hash = await sha1(hashKey);
+ data[i][`${column}_hash`] = hash;
+ dataWithHash[i][`${column}_hash`] = hash;
+ }
- // Delete fields in toExclude
+ // Remove columns for "Exclude" and "Hash and exclude" options
toExclude.forEach(function(column) {
delete data[i][column];
- })
+ });
+ toHashAndExclude.forEach(function(column) {
+ delete data[i][column];
+ });
}
- // console.log(data);
- // console.log(dataWithHash);
-
let csvContent = "data:text/csv;charset=utf-8," + encodeURI(d3.csvFormat(data));
let originalCsvWithHash = "data:text/csv;charset=utf-8," + encodeURI(d3.csvFormat(dataWithHash));
download(csvContent, "unidentifiable.csv");
@@ -201,7 +200,6 @@ Hash Input
function updateSelect(e) {
const select = e.target;
let name = select.id.slice(7);
- // Update colour of table column
const numCells = table.rows.length;
let value = select.value.toLowerCase();
let toHash = value.includes("hash");
@@ -231,20 +229,13 @@ Hash Input
const reader = new FileReader();
- // 👇 executed when a file is loaded
reader.onload = async function (e) {
- // 👇 get the text from CSV file
const text = e.target.result;
- // 👇 parse it using D3.js
- console.log("Parsing started");
const data = d3.csvParse(text);
- console.log("Parsing done");
- // Get object for preview table
const previewTableBody = document.getElementById("previewTableBody");
- // Header rows for preview
const headerRow = document.getElementById("previewHeadRow");
for (let i = 0; i < data.columns.length; i++) {
let cell = headerRow.insertCell(i);
@@ -253,7 +244,6 @@ Hash Input
cell.innerHTML = column;
}
- // Add rows to preview table
const toShow = Math.min(data.length, 10);
for (let i = 0; i < toShow; i++) {
const row = previewTableBody.insertRow(i);
@@ -268,16 +258,13 @@ Hash Input
}
}
- // Add drop-down for each field
const options = ["Keep", "Exclude", "Hash", "Hash and exclude"];
data.columns.forEach(function(name) {
- // Label
const label = document.createElement("label");
const id = "select_" + name;
label.for = id;
label.textContent = name + ": ";
selectsParent.appendChild(label);
- // Select
const select = document.createElement("select");
select.id = id;
select.addEventListener("change", updateSelect)
@@ -288,22 +275,17 @@ Hash Input
option.text = optionName;
select.appendChild(option);
});
- // Line break
selectsParent.appendChild(document.createElement("br"));
});
- // Show size of dataset
document.getElementById("previewLbl").innerHTML = "Showing " + toShow + " of " + data.length;
- // Show options
setOptionsVisible(true);
setPreviewVisible(true);
setLoadingVisible(false);
};
- // 👇 load the input file to the reader
setLoadingVisible(true);
- console.log("Starting file reading");
reader.readAsText(input);
});
diff --git a/testcsv2.csv b/testcsv2.csv
new file mode 100644
index 0000000..423452a
--- /dev/null
+++ b/testcsv2.csv
@@ -0,0 +1,3 @@
+nhs number,accession number,some other stuff,junk
+12345,RJZ12345,good,beans
+54321,RJZ54321,bad,toast