diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index f3911d957..d3c4fa4a8 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -35,6 +35,7 @@ env:
POM_VERSION: 2023-SNAPSHOT
JAVA_VERSION: 21
ERRORS_THRESHOLD: 0.01
+ PYTHON_VERSION: "3.10"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
@@ -265,7 +266,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- benchmark: [ 'auctionmark', 'chbenchmark', 'epinions', 'hyadapt', 'noop', 'otmetrics', 'resourcestresser', 'seats', 'sibench', 'smallbank', 'tatp', 'templated', 'tpcc', 'tpcc-with-reconnects', 'tpch', 'twitter', 'voter', 'wikipedia', 'ycsb' ]
+ benchmark: [ 'anonymization', 'auctionmark', 'chbenchmark', 'epinions', 'hyadapt', 'noop', 'otmetrics', 'resourcestresser', 'seats', 'sibench', 'smallbank', 'tatp', 'templated', 'tpcc', 'tpcc-with-reconnects', 'tpch', 'twitter', 'voter', 'wikipedia', 'ycsb' ]
services:
mysql: # https://hub.docker.com/_/mysql
image: mysql:latest
@@ -301,6 +302,21 @@ jobs:
java-version: ${{env.JAVA_VERSION}}
distribution: 'temurin'
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: ${{env.PYTHON_VERSION}}
+
+ - name: Install Python dependencies
+ working-directory: ./scripts/anonymization
+ run: |
+ if [[ ${{matrix.benchmark}} == anonymization ]]; then
+ python -m pip install --upgrade pip
+ pip install -r requirements.txt
+ else
+ echo "Dependency installation not necessary for benchmark"
+ fi
+
- name: Run benchmark
env:
MYSQL_PORT: ${{ job.services.mysql.ports[3306] }}
@@ -312,6 +328,13 @@ jobs:
if [[ ${{matrix.benchmark}} == templated ]]; then
java -jar benchbase.jar -b tpcc -c config/mysql/sample_tpcc_config.xml --create=true --load=true --execute=false --json-histograms results/histograms.json
java -jar benchbase.jar -b ${{matrix.benchmark}} -c config/mysql/sample_${{matrix.benchmark}}_config.xml --create=false --load=false --execute=true --json-histograms results/histograms.json
+ # For anonymization, we load tpcc and anonymize a single table. The workload itself is not executed
+ # FIXME: 'exit 0' is called because there is no benchmark executed and analyzed. Must be removed once the Anonymization script is
+ # fully implemented. See Pull Request 455.
+ elif [[ ${{matrix.benchmark}} == anonymization ]]; then
+ java -jar benchbase.jar -b tpcc -c config/mysql/sample_tpcc_config.xml --create=true --load=true --execute=false --json-histograms results/histograms.json
+ java -jar benchbase.jar -b tpcc -c config/mysql/sample_${{matrix.benchmark}}_config.xml --anonymize=true
+ exit 0
elif [[ ${{matrix.benchmark}} == tpcc-with-reconnects ]]; then
# See Also: WITH_SERVICE_INTERRUPTIONS=true docker/build-run-benchmark-with-docker.sh
java -jar benchbase.jar -b tpcc -c config/mysql/sample_tpcc_config.xml --create=true --load=true
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
new file mode 100644
index 000000000..a9e0d2f1e
--- /dev/null
+++ b/.github/workflows/python.yml
@@ -0,0 +1,34 @@
+name: BenchBase (Python)
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+    tags: ["v*"]  # any tag starting with "v"
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version:
+          - "3.10"  # quoted so YAML does not read it as the float 3.1
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        working-directory: ./scripts/anonymization
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+      - name: Check anonymization files with pylint
+        run: |
+          pylint --rcfile=.pylintrc ./scripts/anonymization/src
+      - name: Test anonymization with pytest
+        working-directory: ./scripts/anonymization/src
+        run: pytest test.py
diff --git a/.gitignore b/.gitignore
index 756c1fb4f..d425de6bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,5 +56,8 @@ build/
.*.swp
.env
-
docker-compose-*.tar.gz
+
+# Python
+__pycache__/
+*.py[cod]
\ No newline at end of file
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 000000000..c3daa833d
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,45 @@
+[MAIN]
+
+# Score threshold (0-10 scale) below which pylint exits with an error. NOTE(review): 0.9 is very lenient; was 9.0 intended?
+fail-under=0.9
+
+[DESIGN]
+
+# Maximum number of arguments for function / method.
+max-args=9
+
+# Maximum number of locals for function / method body.
+max-locals=20
+
+# Maximum number of return / yield for function / method body.
+max-returns=6
+
+# Maximum number of statements in function / method body.
+max-statements=50
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=0
+
+
+[FORMAT]
+
+# Maximum number of characters on a single line.
+max-line-length=120
+
+# Maximum number of lines in a module.
+max-module-lines=1000
+
+[REPORTS]
+
+# Python expression which should return a score less than or equal to 10. You
+# have access to the variables 'fatal', 'error', 'warning', 'refactor',
+# 'convention', and 'info' which contain the number of messages in each
+# category, as well as 'statement' which is the total number of statements
+# analyzed. This score is used by the global evaluation report (RP0004).
+evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
+
+# Tells whether to display a full report or only the messages.
+reports=no
+
+# Activate the evaluation score.
+score=yes
diff --git a/config/mysql/sample_anonymization_config.xml b/config/mysql/sample_anonymization_config.xml
new file mode 100644
index 000000000..08843f66a
--- /dev/null
+++ b/config/mysql/sample_anonymization_config.xml
@@ -0,0 +1,42 @@
+
+
+
+
+ MYSQL
+ com.mysql.cj.jdbc.Driver
+ jdbc:mysql://localhost:3306/benchbase?rewriteBatchedStatements=true&allowPublicKeyRetrieval=True&sslMode=DISABLED
+ admin
+ password
+ true
+ TRANSACTION_SERIALIZABLE
+ 128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/scripts/anonymization/README.md b/scripts/anonymization/README.md
new file mode 100644
index 000000000..57f478889
--- /dev/null
+++ b/scripts/anonymization/README.md
@@ -0,0 +1,352 @@
+# Anonymization
+
+**NOTE: ANONYMIZATION IS A WORK IN PROGRESS AND CURRENTLY DOES NOT ACTUALLY ANONYMIZE THE DATA. THIS FEATURE WILL BE ADDED LATER.**
+
+The anonymization module allows applying privacy mechanisms such as differential privacy or column faking to the data.
+The system will pull data from the JDBC connection, anonymize the data and push it back to the DBMS by creating a new table.
+
+## Setup
+
+### Requirements
+
+**Python**: Python 3.10
+
+Requirements can be found inside of the `requirements.txt` file.
+
+Install with pip:
+
+```bash
+pip install -r requirements.txt
+```
+
+### Pylint - Linting
+
+Pylint is automatically added by the requirements and can be run from the repository root (where `.pylintrc` is located) as follows:
+```bash
+pylint --rcfile=.pylintrc ./scripts/anonymization/src
+```
+
+### Pytest - Testing
+Pytest is automatically added by the requirements and can be run as follows:
+
+```bash
+pytest ./src/test.py
+```
+
+
+
+## Configuration files (XML)
+
+### Table selection
+
+Multiple tables can be anonymized sequentially by specifying a `<table>`-tag for each of them and defining the table name.
+
+```xml
+
+
+
+
+```
+
+### Differential Privacy anonymization
+
+In order to apply differential privacy, a corresponding `<differential_privacy>`-tag must be added to each table.
+
+Differential privacy can be configured with the following parameters:
+
+| Name | Default | Possible Value | Description |
+| ----------- | ------- | -------------------------------------------------- | -------------------------------------------------------------------------------------------- |
+| epsilon | 1.0 | Any value > 0 | The privacy budget. Higher values will decrease the privacy guarantee. |
+| pre_epsilon | 0.5 | Any value between 0 and epsilon (epsilon excluded) | The privacy budget spent on preprocessing of the data. Only necessary for continuous columns. |
+| algorithm | mst | One of: `mst,aim,dpctgan,patectgan` | The differential privacy mechanism applied to the data. |
+
+```xml
+
+
+
+```
+
+### Column Information
+
+The columns of a table can be split into four categories:
+
+1. `ignore` - Columns that are ignored by the DP-mechanism. Left untouched.
+2. `categorical` - Columns that contain categorical data.
+3. `continuous` - Columns of data on a continuous domain. Only numerical columns.
+4. `ordinal` - Columns of ordinal meaning. Typically numerical columns with low numbers of distinct values.
+
+Each type of column requires its own XML-tag with each member of a type given as a separate tag. (See examples below for clarification)
+
+If no column type information is given:
+
+- The anonymization process will automatically try to infer the category best suitable for each column.
+- Automatic anonymization can lead to undesired results or failures in the process.
+- To counter this, each column can be defined by hand with the suitable tags.
+
+---
+
+**Ignore the ID-column and anonymize automatically:**
+
+```xml
+
+
+
+```
+
+The ID-column will be removed from the anonymization process and be added back to the data in its original form before pushing back to the DB! This is useful for primary key columns or columns that contain only NULL values.
+
+---
+
+**Define the type of each column individually:**
+
+```xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+```
+
+ **Disclaimer**: As soon as the column types are defined by hand, all columns must be defined. It is not possible to only specify some of the categorical columns and let the algorithms do the rest!
+
+### Continuous Columns
+
+The continuous columns will automatically be preprocessed by the algorithm. The lower and upper bounds of the values are inferred in a differentially private way and therefore use some of the preprocessing epsilon budget. Further, values will be binned in order to be applicable to the DP-mechanism. By default, the system uses only 10 bins. In order to achieve better data utility, we suggest manually fine-tuning the settings in the config.
+
+In the following example, the number of bins is increased to 1000 and the lower and upper bounds are given by hand such that no preprocessing epsilon must be spent!
+
+```xml
+
+
+
+```
+
+The parameters must be added inside of the `<column>`-tag. (See below)
+
+| Name | Default | Possible Value | Description |
+| ----- | ------- | ----------------- | ------------------------------------------- |
+| name | - | Any string | The name of the column. |
+| bins | 10 | Any value | The amount of distinct values in the output.|
+| lower | - | Any value | Lower bound of the values. |
+| upper | - | Any value > lower | Upper bound of the values. |
+
+---
+
+**A full working example could look like this:**
+
+```xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+```
+
+## Value faking anonymization
+
+The anonymization module supports value faking to handle sensitive values. The process is decoupled from differential privacy anonymization, allowing for separate or combined anonymization efforts.
+
+**How to handle sensitive values?**
+
+- A `<value_faking>`-tag must be created to signify that column faking should be applied.
+- Column faking can be defined individually on each column with the `<column>`-tag.
+
+The following parameters are available for sensitive values:
+
+| Name | Default | Possible Value | Description |
+| ------- | ------- | ------------------------------------------------ | ---------------------------------------------------------------------------------------------- |
+| name | - | Any string | The name of the column. |
+| method | pystr | Any method provided by the python faker library | The faking method of the library. If the method is not found, a random string will be inserted. |
+| locales | - | A string of supported locales, separated by comma | The locale to produce localized values like Chinese street names or English first names. |
+| seed | 0 | Any integer value | The randomization seed used when generating the fake values. |
+
+### Faker categories
+
+Possible methods for faking can be found on the official [Faker documentation](https://faker.readthedocs.io/en/master/)
+
+### Faking example
+
+```xml
+
+
+
+```
+
+## Examples
+
+### Basic
+
+The most basic config will need only the name of the table and the privacy mechanism. All necessary information is collected automatically.
+
+```xml
+
+
+
+```
+
+It is possible to specify multiple tables for the anonymization. Each table will be anonymized on its own
+
+```xml
+
+
+
+
+```
+
+The parameters of differential privacy anonymization can be tuned manually
+
+```xml
+
+
+
+```
+
+### Full config
+
+It is possible to add information about the columns to enable fine-tuning.
+
+```xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+```
+
+### Sensitive value handling
+
+Sensitive columns can be anonymized further by replacing the values with fake values. A combination of differential privacy and value faking could look as follows:
+
+```xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+```
+
+## Architecture Image
+
+![Architecture](architecture.png)
+
+## More Information
+
+[Differential Privacy Fundamentals](https://www.cis.upenn.edu/~aaroth/Papers/privacybook.pdf)
+
+[Smartnoise Library (Algorithm implementations)](https://github.com/opendp/smartnoise-sdk)
+
+[Python Faker documentation](https://faker.readthedocs.io/en/master/)
diff --git a/scripts/anonymization/architecture.png b/scripts/anonymization/architecture.png
new file mode 100644
index 000000000..fced6c8c4
Binary files /dev/null and b/scripts/anonymization/architecture.png differ
diff --git a/scripts/anonymization/requirements.txt b/scripts/anonymization/requirements.txt
new file mode 100644
index 000000000..1702d2858
--- /dev/null
+++ b/scripts/anonymization/requirements.txt
@@ -0,0 +1,11 @@
+Faker~=15.3.4
+JayDeBeApi~=1.2.3
+JPype1~=1.5.0
+lxml~=4.9.3
+numpy~=1.26.4
+pandas~=2.2.1
+pytest~=8.1.1
+pylint~=3.0.2
+smartnoise-sql~=1.0.3
+smartnoise-synth~=1.0.3
+private-pgm @ git+https://github.com/ryan112358/private-pgm.git@5b9126295c110b741e5426ddbff419ea1e60e788
diff --git a/scripts/anonymization/src/anonymizer.py b/scripts/anonymization/src/anonymizer.py
new file mode 100644
index 000000000..924d61a32
--- /dev/null
+++ b/scripts/anonymization/src/anonymizer.py
@@ -0,0 +1,107 @@
+"""Module that handles the full Anonymization pipeline
+"""
+
+import sys
+import xml.etree.ElementTree as ET
+import pandas as pd
+from modules.jdbc_handler import JDBCHandler
+from configuration.config_parser import XMLParser
+from configuration.configurations import DPConfig, SensitiveConfig, ContinuousConfig
+#from modules.dp_anonymizer import DifferentialPrivacyAnonymizer
+#from modules.sensitive_anonymizer import SensitiveAnonymizer
+
+
+def anonymize(
+    dataset: pd.DataFrame,
+    anon_config: DPConfig,
+    cont_config: ContinuousConfig,
+    sens_config: SensitiveConfig,
+    templates_path: str,
+):
+    """Placeholder for the per-table anonymization step; currently returns None.
+
+    The intended pipeline is not wired up yet (the anonymizer imports of this
+    module are commented out). Planned logic, kept for reference:
+
+        dp_data = dataset
+        if anon_config:   # DP anonymization via DifferentialPrivacyAnonymizer
+            dp_data = DifferentialPrivacyAnonymizer(dataset, anon_config, cont_config).run_anonymization()
+        if sens_config:   # value faking via SensitiveAnonymizer
+            dp_data = SensitiveAnonymizer(dp_data, sens_config, templates_path).run_anonymization()
+        return dp_data
+    """
+    return
+
+
+def anonymize_db(
+    jdbc_handler: JDBCHandler,
+    anon_config: DPConfig,
+    sens_config: SensitiveConfig,
+    cont_config: ContinuousConfig,
+    templates_path: str,
+):
+    """Placeholder for anonymizing one table end-to-end; currently returns None.
+
+    Args:
+        jdbc_handler (JDBCHandler): Handler used to reach the database
+        anon_config (DPConfig): DP settings; also carries the table name
+        sens_config (SensitiveConfig): Value-faking settings
+        cont_config (ContinuousConfig): Settings for continuous columns
+        templates_path (str): Path to a query templates file ("" if none)
+
+    Planned logic, kept for reference (names adjusted to match this module and
+    JDBCHandler, e.g. start_jvm and the snake_case config parameters):
+
+        jdbc_handler.start_jvm()
+        conn = jdbc_handler.get_connection()
+        table = anon_config.table_name
+        dataset, timestamps = jdbc_handler.data_from_table(conn, table)
+        dataset_anon = anonymize(dataset, anon_config, cont_config, sens_config, templates_path)
+        # TODO: Throw in sensitive anonymization
+        anon_table_name = jdbc_handler.create_anonymized_table(conn, table)
+        jdbc_handler.populate_anonymized_table(conn, dataset_anon, anon_table_name, timestamps)
+        conn.close()
+
+    NOTE(review): anon_config may be None when a table has no DP section.
+    """
+    return
+
+
+
+def main():
+    """Entry method: parse CLI arguments and the XML config, then process each table.
+
+    Usage: anonymizer.py <xml_config_path> [templates_path]
+
+    Raises:
+        SystemExit: With a non-zero status on a wrong argument count, so that a
+            calling process can detect the failure from the exit code.
+    """
+    # sys.argv[0] is the script itself; the templates path is optional
+    if len(sys.argv) not in (2, 3):
+        sys.exit("Usage: anonymizer.py <xml_config_path> [templates_path]")
+    xml_config_path = sys.argv[1]
+    templates_path = sys.argv[2] if len(sys.argv) == 3 else ""
+
+    parameters = ET.parse(xml_config_path).getroot()
+
+    jdbc_handler = JDBCHandler(
+        parameters.find("driver").text,
+        parameters.find("url").text,
+        parameters.find("username").text,
+        parameters.find("password").text,
+    )
+
+    anonymization = parameters.find("anonymization")
+    if anonymization is None:
+        print("No <anonymization> section found in the config. Nothing to do.")
+        return
+
+    # Loop over all specified tables and anonymize them one-by-one
+    for table in anonymization.findall("table"):
+        anon_config, sens_config, cont_config = XMLParser(table).get_config()
+        anonymize_db(jdbc_handler, anon_config, sens_config, cont_config, templates_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/anonymization/src/configuration/config_parser.py b/scripts/anonymization/src/configuration/config_parser.py
new file mode 100644
index 000000000..bbe6e492a
--- /dev/null
+++ b/scripts/anonymization/src/configuration/config_parser.py
@@ -0,0 +1,118 @@
+"""Module that handles all things related to configuration parsing
+"""
+
+import sys
+import xml.etree.ElementTree as ET
+
+from configuration.configurations import (
+ DPConfig,
+ DPColumnConfig,
+ SensitiveConfig,
+ SensitiveEntry,
+ ContinuousConfig,
+ ContinuousEntry,
+)
+
+
+class XMLParser:
+    """A class to represent a specific XML parser for BenchBase configuration files
+
+    Attributes
+    ----------
+    table : ET.Element
+        XML element with the <table>-tag
+    """
+
+    def __init__(self, table: ET.Element):
+        self.table = table
+
+    def get_config(self):
+        """Function that extracts the different types of configuration classes from the XML tree
+
+        Returns:
+            (DPConfig,SensitiveConfig,ContinuousConfig): The three configs in this order; None where not configured
+        """
+
+        anon_config = None
+        cont_config = None
+        sens_config = None
+        continuous_entries = []
+        sensitive_entries = []
+
+        table_name = self.table.get("name")
+
+        # Exit the program if not enough basic information (name of a table) is available
+        if table_name is None:
+            sys.exit(
+                "There was no name provided for the table that should be anonymized. Program is exiting now!"
+            )
+
+        print(f"Parsing config for table: {table_name}")
+
+        dp_info = self.table.find("differential_privacy")
+
+        if dp_info is not None:
+            # Each column class is read from the child tag of the same name
+            ignore = self.__get_column_type("ignore", dp_info)
+            cat = self.__get_column_type("categorical", dp_info)
+            ordi = self.__get_column_type("ordinal", dp_info)
+
+            eps = dp_info.get("epsilon", "1.0")
+            pre_eps = dp_info.get("pre_epsilon", "0.5")
+            alg = dp_info.get("algorithm", "mst")
+
+            # Specific handling for continuous columns to incorporate a separate config
+            cont = []
+            if dp_info.find("continuous") is not None:
+                for column in dp_info.find("continuous").findall("column"):
+                    cont.append(column.get("name"))
+
+                    if column.get("bins") or column.get("lower") or column.get("upper"):
+                        continuous_entries.append(
+                            ContinuousEntry(
+                                column.get("name"),
+                                column.get("bins", "10"),
+                                column.get("lower"),
+                                column.get("upper"),
+                            )
+                        )
+
+            column_classes = DPColumnConfig(ignore, cat, cont, ordi)
+
+            anon_config = DPConfig(
+                table_name, eps, pre_eps, alg, column_classes
+            )
+
+            if len(cont) > 0:
+                cont_config = ContinuousConfig(continuous_entries)
+
+        sens = self.table.find("value_faking")
+        if sens is not None:
+            for column in sens.findall("column"):
+                sensitive_entries.append(
+                    SensitiveEntry(
+                        column.get("name"),
+                        column.get("method"),
+                        column.get("locales"),
+                        column.get("seed", "0"),
+                    )
+                )
+            sens_config = SensitiveConfig(sensitive_entries)
+
+        return anon_config, sens_config, cont_config
+
+    def __get_column_type(self, keyword: str, subtree: ET.Element):
+        """Helper method to extract column types
+
+        Args:
+            keyword (str): Column type keyword
+            subtree (ET.Element): The subtree in which to search
+
+        Returns:
+            list[str]: A list of column names with the specific type
+        """
+        tmp = []
+        if subtree.find(keyword) is not None:
+            for column in subtree.find(keyword).findall("column"):
+                tmp.append(column.get("name"))
+        return tmp
diff --git a/scripts/anonymization/src/configuration/configurations.py b/scripts/anonymization/src/configuration/configurations.py
new file mode 100644
index 000000000..5938116df
--- /dev/null
+++ b/scripts/anonymization/src/configuration/configurations.py
@@ -0,0 +1,136 @@
+"""Module that contains all configuration classes
+"""
+
+
+class DPColumnConfig:
+    """Groups a table's column names by the role they play in DP anonymization
+
+    Attributes
+    ----------
+    hidden : list[str]
+        Names of the columns that are left out of the anonymization
+    categorical : list[str]
+        Names of the categorical columns
+    continuous : list[str]
+        Names of the continuous columns
+    ordinal : list[str]
+        Names of the ordinal columns
+
+    """
+
+    def __init__(
+        self,
+        hidden: list[str],
+        categorical: list[str],
+        continuous: list[str],
+        ordinal: list[str],
+    ):
+        # Plain value holder: the lists are stored exactly as passed in,
+        # without copying or validation
+        self.hidden, self.categorical = hidden, categorical
+        self.continuous, self.ordinal = continuous, ordinal
+
+
+class DPConfig:
+    """A class to represent a config that handles DP-Anonymization
+
+    Attributes
+    ----------
+    table_name : str
+        Table name
+    epsilon : str
+        Privacy budget (kept as a string)
+    preproc_eps : str
+        Privacy budget for preprocessing (kept as a string)
+    algorithm : str
+        Name of the DP-algorithm
+    column_classification : DPColumnConfig
+        Classification of table columns
+
+    """
+
+    def __init__(
+        self,
+        table_name: str,
+        epsilon: str,
+        preproc_eps: str,
+        algorithm: str,
+        column_classification: DPColumnConfig,
+    ):
+        self.epsilon = epsilon
+        self.table_name = table_name
+        self.preproc_eps = preproc_eps
+        self.algorithm = algorithm
+        self.column_classification = column_classification
+
+
+class ContinuousEntry:
+    """Settings of a single continuous column
+
+    Attributes
+    ----------
+    name : str
+        Column name
+    bins : str
+        How many bins to use
+    lower : str
+        Lower bound of the column's values
+    upper : str
+        Upper bound of the column's values
+
+    """
+
+    def __init__(self, name: str, bins: str, lower: str, upper: str):
+        # Values are stored exactly as passed in;
+        # no conversion to numbers happens here
+        self.name, self.bins = name, bins
+        self.lower, self.upper = lower, upper
+
+
+class ContinuousConfig:
+    """A class to represent a continuous column config (a collection of entries)
+
+    Attributes
+    ----------
+    columns : list[ContinuousEntry]
+        A list of continuous entries, stored as passed in
+    """
+
+    def __init__(self, columns: list[ContinuousEntry]):
+        self.columns = columns
+
+
+class SensitiveEntry:
+    """A class to represent a sensitive column entry
+
+    Attributes
+    ----------
+    name : str
+        Name of the column
+    method : str
+        Faking method
+    locales : list[str]
+        Locales; NOTE(review): XMLParser passes the raw attribute string - confirm the intended type
+    seed : str
+        Randomization seed
+
+    """
+
+    def __init__(self, name: str, method: str, locales: list[str], seed: str):
+        self.name = name
+        self.method = method
+        self.locales = locales
+        self.seed = seed
+
+
+class SensitiveConfig:
+    """A class to represent a sensitive column config
+
+    Attributes
+    ----------
+    columns : list[SensitiveEntry]
+        A list of sensitive entries
+    """
+
+    def __init__(self, columns: list[SensitiveEntry]):
+        self.columns = columns
diff --git a/scripts/anonymization/src/modules/jdbc_handler.py b/scripts/anonymization/src/modules/jdbc_handler.py
new file mode 100644
index 000000000..79132aa91
--- /dev/null
+++ b/scripts/anonymization/src/modules/jdbc_handler.py
@@ -0,0 +1,152 @@
+"""JDBCHandler. A class that handles all things related to database connections
+"""
+
+import jpype
+import jpype.imports
+import jaydebeapi
+import pandas as pd
+import numpy as np
+
+
+class JDBCHandler:
+    """
+    A class to represent a JDBC connection handler
+
+    Attributes
+    ----------
+    driver : str
+        jdbc driver
+    url : str
+        database url
+    username : str
+        database username
+    password : str
+        database password
+    """
+
+    JAR_PATH = "benchbase.jar"
+
+    def __init__(self, driver: str, url: str, username: str, password: str):
+        self.driver = driver
+        self.url = url
+        self.username = username
+        self.password = password
+
+    def start_jvm(self):
+        """Function that starts the Java Virtual Machine based on the JAR file created by BenchBase"""
+        jpype.startJVM(classpath=[self.JAR_PATH])
+
+    def get_connection(self):
+        """Function that returns a database connection based on the class attributes
+
+        Returns:
+            jaydebeapi.Connection: A connection to the database service
+        """
+        return jaydebeapi.connect(self.driver, self.url, [self.username, self.password])
+
+    def data_from_table(self, conn: jaydebeapi.Connection, table: str):
+        """Function that pulls data from a specific table of the database
+
+        Args:
+            conn (jaydebeapi.Connection): Connection to the database
+            table (str): Name of the table
+
+        Returns:
+            (pd.DataFrame,list[int]): The table as a DataFrame and a list of indexes for all time-related columns
+        """
+        curs = conn.cursor()
+        try:
+            # NOTE(review): the table name is interpolated unescaped; it must come from a trusted config
+            curs.execute(f"SELECT * FROM {table}")
+            res = curs.fetchall()
+            meta = curs.description
+        finally:
+            # Release the cursor even if the query fails
+            curs.close()
+
+        # Each description entry starts with (column name, type code)
+        cols = [str(entry[0]) for entry in meta]
+        col_types = [entry[1] for entry in meta]
+        timestamp_indexes = self.__get_timestamp_indexes(col_types)
+
+        return pd.DataFrame(res, columns=cols), timestamp_indexes
+
+    def __get_timestamp_indexes(self, col_types: list):
+        """Function that analyzes table metadata and returns a list of indexes of time-related columns
+
+        Args:
+            col_types (list): A list of column types (objects exposing a `values` collection of type names)
+
+        Returns:
+            list[int]: A list of indexes; each column index is listed at most once
+        """
+        time_types = ("TIMESTAMP", "DATE", "TIME")
+        # Entries without `values` (e.g. None) are treated as non-temporal
+        return [
+            i for i, entry in enumerate(col_types)
+            if any(d_type in time_types for d_type in getattr(entry, "values", None) or ())
+        ]
+
+    def create_anonymized_table(self, conn: jaydebeapi.Connection, table: str):
+        """Function that creates an empty copy of an existing table on the database
+
+        NOTE(review): `CREATE TABLE ... AS TABLE ... WITH NO DATA` looks PostgreSQL-style; confirm for MySQL etc.
+
+        Args:
+            conn (jaydebeapi.Connection): Connection to the database
+            table (str): Name of the original table
+
+        Returns:
+            str: The name of the copied version
+        """
+        anon_table_name = table + "_anonymized"
+        curs = conn.cursor()
+        try:
+            curs.execute(f"DROP TABLE IF EXISTS {anon_table_name}")
+            curs.execute(f"CREATE TABLE {anon_table_name} AS TABLE {table} WITH NO DATA")
+        finally:
+            curs.close()
+        return anon_table_name
+
+    def populate_anonymized_table(
+        self,
+        conn: jaydebeapi.Connection,
+        df: pd.DataFrame,
+        table: str,
+        timestamp_indexes,
+    ):
+        """Function that pushes data to a table on the database
+
+        Args:
+            conn (jaydebeapi.Connection): Connection to the database
+            df (pd.DataFrame): Data to push
+            table (str): Name of the table that receives the data
+            timestamp_indexes (list[int]): A list of indexes of time-related columns
+        """
+        # NaN replacement with NONE
+        df = df.replace(np.nan, None)
+
+        # Parsing timestamps to datetime format
+        for ind in timestamp_indexes:
+            name = df.columns[ind]
+            df[name] = pd.to_datetime(df[name], format="mixed")
+
+        tuples = [tuple(x) for x in df.values]
+        if timestamp_indexes:
+            # This is a dynamic import that only works once the JVM is running
+            import java  # pylint: disable=import-outside-toplevel,import-error
+
+            for i, tup in enumerate(tuples):
+                li = list(tup)
+                for j in timestamp_indexes:
+                    # Missing values stay NULL; everything else is cast to java.sql.Timestamp
+                    li[j] = None if pd.isnull(li[j]) else java.sql.Timestamp @ li[j]
+                tuples[i] = tuple(li)
+
+        column_slots = f"({','.join('?' for _ in df.columns)})"
+        insert_query = f"insert into {table} values {column_slots}"
+
+        curs = conn.cursor()
+        try:
+            curs.executemany(insert_query, tuples)
+        finally:
+            curs.close()
diff --git a/scripts/anonymization/src/test.py b/scripts/anonymization/src/test.py
new file mode 100644
index 000000000..f1f973f05
--- /dev/null
+++ b/scripts/anonymization/src/test.py
@@ -0,0 +1,74 @@
+"""Testing suite
+"""
+
+import xml.etree.ElementTree as ET
+from configuration.config_parser import XMLParser
+
+
+MINIMAL_CONFIG = """
+
+
+
+"""
+
+FULL_CONFIG = """
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+"""
+
+
+def test_full_config():
+    """Parses FULL_CONFIG and checks that all three configs exist with the expected DP settings"""
+    table_element = ET.fromstring(FULL_CONFIG).find("table")
+    dp_cfg, faking_cfg, continuous_cfg = XMLParser(table_element).get_config()
+
+    for cfg in (dp_cfg, faking_cfg, continuous_cfg):
+        assert cfg is not None
+
+    assert anon_settings(dp_cfg) == ("item", "1.0", "0.0", "aim")
+
+
+def anon_settings(dp_cfg):
+    """Helper that collects the scalar DP settings of a DPConfig into a tuple"""
+    return (dp_cfg.table_name, dp_cfg.epsilon, dp_cfg.preproc_eps, dp_cfg.algorithm)
+
+
+def test_minimal_config():
+    """Parses MINIMAL_CONFIG and checks that only the dp-anonymization config is produced"""
+    table_element = ET.fromstring(MINIMAL_CONFIG).find("table")
+    dp_cfg, faking_cfg, continuous_cfg = XMLParser(table_element).get_config()
+
+    assert dp_cfg is not None
+    assert (faking_cfg, continuous_cfg) == (None, None)
diff --git a/src/main/java/com/oltpbenchmark/DBWorkload.java b/src/main/java/com/oltpbenchmark/DBWorkload.java
index d5af2f5d6..0c2ddc5cb 100644
--- a/src/main/java/com/oltpbenchmark/DBWorkload.java
+++ b/src/main/java/com/oltpbenchmark/DBWorkload.java
@@ -538,6 +538,20 @@ public static void main(String[] args) throws Exception {
LOG.debug("Skipping loading benchmark database records");
}
+ // Anonymize Datasets
+ // Currently, the system only parses the config but does not run any anonymization!
+ // Will be added in the future
+ if (isBooleanOptionSet(argsLine, "anonymize")) {
+ try {
+ if (xmlConfig.configurationsAt("/anonymization/table").size() > 0) {
+ applyAnonymization(xmlConfig, configFile);
+ }
+ } catch (Throwable ex) {
+ LOG.error("Unexpected error when anonymizing datasets", ex);
+ System.exit(1);
+ }
+ }
+
// Execute Workload
if (isBooleanOptionSet(argsLine, "execute")) {
// Bombs away!
@@ -579,6 +593,8 @@ private static Options buildOptions(XMLConfiguration pluginConfig) {
options.addOption(null, "create", true, "Initialize the database for this benchmark");
options.addOption(null, "clear", true, "Clear all records in the database for this benchmark");
options.addOption(null, "load", true, "Load data using the benchmark's data loader");
+ options.addOption(
+ null, "anonymize", true, "Anonymize specified datasets using differential privacy");
options.addOption(null, "execute", true, "Execute the benchmark workload");
options.addOption("h", "help", false, "Print this help");
options.addOption("s", "sample", true, "Sampling window");
@@ -799,4 +815,41 @@ private static boolean isBooleanOptionSet(CommandLine argsLine, String key) {
}
return (false);
}
+
+  /**
+   * Runs the anonymization script (scripts/anonymization/src/anonymizer.py) as a child process
+   * on the given configuration file, forwarding its output, and waits for it to finish.
+   *
+   * @param xmlConfig parsed configuration; its optional query_templates_file entry is passed on
+   * @param configFile path of the configuration file handed to the script
+   * @throws IllegalStateException if the script cannot be started, is interrupted or exits non-zero
+   */
+  private static void applyAnonymization(XMLConfiguration xmlConfig, String configFile) {
+    String templatesPath = "";
+    if (xmlConfig.containsKey("query_templates_file")) {
+      templatesPath = xmlConfig.getString("query_templates_file");
+    }
+
+    LOG.info("Starting the Anonymization process");
+    LOG.info(SINGLE_LINE);
+    String osCommand = System.getProperty("os.name").startsWith("Windows") ? "python" : "python3";
+    ProcessBuilder processBuilder =
+        new ProcessBuilder(
+            osCommand, "scripts/anonymization/src/anonymizer.py", configFile, templatesPath);
+    // Redirect Output stream of the script to get live feedback
+    processBuilder.inheritIO();
+    try {
+      int exitCode = processBuilder.start().waitFor();
+      if (exitCode != 0) {
+        throw new IllegalStateException("Anonymization exited with status " + exitCode);
+      }
+    } catch (java.io.IOException e) {
+      throw new IllegalStateException("Could not start the anonymization program", e);
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt(); // keep the interrupt visible to callers
+      throw new IllegalStateException("Interrupted while waiting for anonymization", e);
+    }
+    LOG.info("Finished the Anonymization process for all tables");
+    LOG.info(SINGLE_LINE);
+  }
}