diff --git a/CITATION.cff b/CITATION.cff index 887fb45..b0f2557 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -9,5 +9,10 @@ authors: orcid: "https://orcid.org/0000-0002-0013-4602" title: "ANJANA" version: 0.2.1 -date-released: 2024-04-18 +date-released: 2024-05-13 url: "https://github.com/IFCA-Advanced-Computing/anjana" +identifiers: + - type: doi + value: 10.5281/zenodo.11186382 + + diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 6ada495..7ce3da6 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -124,6 +124,29 @@ Then, in order to create the hierarchies we can define the following dictionary: 1: np.array(["*"] * len(data["city"].values))} # Suppression } +In addition, we can use the function ``generate_intervals()`` from ``utils`` to create the interval-based hierarchy as follows: + +.. code-block:: python + + import numpy as np + from anjana.anonymity import utils + + age = data['age'].values + + hierarchies = { + "age": { + 0: data["age"].values, + 1: utils.generate_intervals(data["age"].values, 0, 100, 5), + 2: utils.generate_intervals(data["age"].values, 0, 100, 10), + }, + "gender": { + 0: data["gender"].values, + 1: np.array(["*"] * len(data["gender"].values)) # Suppression + }, + "city": {0: data["city"].values, + 1: np.array(["*"] * len(data["city"].values))} # Suppression + } + .. _adult dataset: https://archive.ics.uci.edu/ml/datasets/adult ..
_examples folder of the repository: https://gitlab.ifca.es/privacy-security/siesta-anonymity/-/tree/main/examples diff --git a/docs/source/index.rst b/docs/source/index.rst index 1ff9b1c..6df587c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,9 +1,9 @@ ANJANA ============================================================================= -|License| |codecov| |DOI| |PyPI| |Downloads| |Documentation Status| +|License| |codecov| |DOI| |Downloads| |Documentation Status| |release-please| |Publish Package in PyPI| |CI/CD Pipeline| |Code Coverage| -|Python version| +|Python version| |PyPI| ANJANA is a `Python`_ library which allows the application of different anonymity techniques based on a set of identifiers, quasi-identifiers (QI) and a sensitive diff --git a/examples/hospital.py b/examples/hospital.py index 8cdf7e9..62a9126 100644 --- a/examples/hospital.py +++ b/examples/hospital.py @@ -16,7 +16,7 @@ import numpy as np import pandas as pd -from anjana.anonymity import k_anonymity, l_diversity, utils +from anjana.anonymity import k_anonymity, l_diversity, utils, basic_beta_likeness data = pd.read_csv("data/hospital_extended.csv") diff --git a/tests/test_anonymity.py b/tests/test_anonymity.py index 49a30ae..67d5a8d 100644 --- a/tests/test_anonymity.py +++ b/tests/test_anonymity.py @@ -188,6 +188,21 @@ def test_entropy_l(self): ) assert len(data_anon) == 0 + def test_entropy_l1(self): + data_anon = anonymity.entropy_l_diversity( + self.data, + self.ident, + self.quasi_ident, + self.sens_att, + self.k, + 1, + self.supp_level, + self.hierarchies, + ) + assert 1 == pycanon.anonymity.entropy_l_diversity( + data_anon, self.quasi_ident, [self.sens_att] + ) + def test_rec_c_l(self): data_anon = anonymity.recursive_c_l_diversity( self.data, @@ -365,7 +380,11 @@ class TestHospital: l_div = 2 supp_level = 0 hierarchies = { - "age": dict(pd.read_csv("./examples/hierarchies/age.csv", header=None)), + "age": { + 0: data["age"].values, + 1: 
utils.generate_intervals(data["age"].values, 0, 100, 5), + 2: utils.generate_intervals(data["age"].values, 0, 100, 10), + }, "gender": { 0: data["gender"].values, 1: np.array(["*"] * len(data["gender"].values)), @@ -392,6 +411,18 @@ def test_k_anon(self): data_anon_real["age"] = hierarchy_age[2].values[pos] assert data_anon_real.equals(data_anon) + def test_k_anon_big(self): + data_anon = anonymity.k_anonymity( + self.data, + self.ident, + self.quasi_ident, + 30, + self.supp_level, + self.hierarchies, + ) + + assert data_anon.equals(pd.DataFrame()) + def test_l_div(self): data_anon = anonymity.l_diversity( self.data, @@ -414,6 +445,32 @@ def test_l_div(self): data_anon_real["city"] = "*" assert data_anon_real.equals(data_anon) + def test_basic_beta0_supp0(self): + data_anon = anonymity.basic_beta_likeness( + self.data, + self.ident, + self.quasi_ident, + self.sens_att, + self.k, + 0, + 0, + self.hierarchies, + ) + assert data_anon.equals(pd.DataFrame()) + + def test_enhanced_beta0_supp0(self): + data_anon = anonymity.enhanced_beta_likeness( + self.data, + self.ident, + self.quasi_ident, + self.sens_att, + self.k, + 0, + 0, + self.hierarchies, + ) + assert data_anon.equals(pd.DataFrame()) + def test_get_transformation(self): data_anon = anonymity.k_anonymity( self.data, @@ -431,7 +488,15 @@ def test_get_transformation(self): def test_get_transformation_2qi(self): hierarchies = { - "age": dict(pd.read_csv("./examples/hierarchies/age.csv", header=None)), + "age": { + 0: self.data["age"].values, + 1: utils.generate_intervals( + self.data["age"].values, 0, 100, 5 + ), + 2: utils.generate_intervals( + self.data["age"].values, 0, 100, 10 + ), + }, "city": { 0: self.data["city"].values, 1: np.array(["*"] * len(self.data["city"].values)), @@ -447,7 +512,7 @@ def test_get_transformation_2qi(self): ) transformation = utils.get_transformation( - data_anon, self.quasi_ident, self.hierarchies + data_anon, self.quasi_ident, hierarchies ) assert [2, 0, 0] == transformation