From 6a1f0ee3259fd12d152826b474c2e5b2c6ede6c1 Mon Sep 17 00:00:00 2001
From: Brian Thorne
Date: Wed, 10 Mar 2021 22:50:35 +1300
Subject: [PATCH] Update docs and version bump

Includes updating the blocking tutorial.
---
 docs/blocking-schema.rst              |   2 +-
 docs/development.rst                  |  15 +-
 docs/index.rst                        |   5 +-
 docs/tutorial/tutorial_blocking.ipynb | 506 +++++++++++---------------
 poetry.lock                          |  44 ++-
 pyproject.toml                       |   2 +-
 6 files changed, 263 insertions(+), 311 deletions(-)

diff --git a/docs/blocking-schema.rst b/docs/blocking-schema.rst
index 9c93b82..413c2d5 100644
--- a/docs/blocking-schema.rst
+++ b/docs/blocking-schema.rst
@@ -40,7 +40,7 @@ Example Schema
         "type": "lambda-fold",
         "version": 1,
         "config": {
-            "blocking-features": [1, 2],
+            "blocking-features": ["name", "suburb"],
             "Lambda": 30,
             "bf-len": 2048,
             "num-hash-funcs": 5,
diff --git a/docs/development.rst b/docs/development.rst
index aab2d5a..aae29c4 100644
--- a/docs/development.rst
+++ b/docs/development.rst
@@ -4,25 +4,22 @@ Development
 Testing
 -------
 
-Make sure you have all the required modules before running the tests
-(modules that are only needed for tests are not included during
-installation)::
+Make sure you have all the required dependencies before running the tests::
 
-    $ pip install -r requirements.txt
+    $ poetry install
 
-Now run the unit tests and print out code coverage with `py.test`::
+Now run the unit tests and print out code coverage with `pytest`::
 
-    $ python -m pytest --cov=blocklib
+    $ poetry run pytest --cov=blocklib
 
 
 Type Checking
 -------------
 
-``blocklib`` uses static typechecking with ``mypy``. To run the type checker (in Python 3.5 or later)::
+``blocklib`` uses static typechecking with ``mypy``. To run the type checker with the same options used in CI::
 
-    $ pip install mypy
-    $ mypy blocklib --ignore-missing-imports --strict-optional --no-implicit-optional --disallow-untyped-calls
+    $ poetry run mypy blocklib --ignore-missing-imports --strict-optional --no-implicit-optional --disallow-untyped-calls
 
 
diff --git a/docs/index.rst b/docs/index.rst
index 19e42b1..1237e8a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -11,13 +11,14 @@ record linkage scalable. It is achieved by partitioning datasets into groups, ca
 records in corresponding blocks. This can reduce the number of comparisons that need
 to be conducted to find which pairs of records should be linked.
 
-Note that it is part of anonlink system which includes libraries for encoding, command line tools and Rest API:
+Note that it is part of the anonlink system, which includes libraries for encoding, command line tools and a REST API:
 
 * `clkhash <https://github.com/data61/clkhash>`_
 * `anonlink-client <https://github.com/data61/anonlink-client>`_
+* `anonlink <https://github.com/data61/anonlink>`_
 * `anonlink-entity-service <https://github.com/data61/anonlink-entity-service>`_
 
-Blocklib is Apache 2.0 licensed, supports Python version 3.5+ and run on Windows, OSX and Linux.
+Blocklib is Apache 2.0 licensed, supports Python versions 3.6+ and runs on Windows, OSX and Linux.
 
 Install with pip::
 
diff --git a/docs/tutorial/tutorial_blocking.ipynb b/docs/tutorial/tutorial_blocking.ipynb
index 8603621..e194d45 100644
--- a/docs/tutorial/tutorial_blocking.ipynb
+++ b/docs/tutorial/tutorial_blocking.ipynb
@@ -26,97 +26,34 @@
     "\n",
     "In this tutorial, we demonstrate how to use blocking in privacy preserving record linkage.\n",
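+    "\n",
+    "This tutorial uses `blocklib` together with `pandas` for loading data; to run it locally you will need both installed. One way to do that (a suggestion only — any recent versions should work):\n",
+    "\n",
+    "```\n",
+    "pip install blocklib pandas\n",
+    "```\n",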
\n", "\n", - "Load example Nothern Carolina voter registration dataset:" + "Load example Northern Carolina voter registration dataset:" ] }, { "cell_type": "code", "execution_count": 1, + "outputs": [], + "source": [ + "import blocklib" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
recidgivennamesurnamesuburbpc
0761859katechapmanbrighton4017
11384455lianhursecarisbrook3464
21933333matthewrussobardon4065
31564695lorrainezammitminchinbury2770
45971993ingorichardsonwoolsthorpe3276
\n", - "
" - ], - "text/plain": [ - " recid givenname surname suburb pc\n", - "0 761859 kate chapman brighton 4017\n", - "1 1384455 lian hurse carisbrook 3464\n", - "2 1933333 matthew russo bardon 4065\n", - "3 1564695 lorraine zammit minchinbury 2770\n", - "4 5971993 ingo richardson woolsthorpe 3276" - ] + "text/plain": " recid givenname surname suburb pc\n0 761859 kate chapman brighton 4017\n1 1384455 lian hurse carisbrook 3464\n2 1933333 matthew russo bardon 4065\n3 1564695 lorraine zammit minchinbury 2770\n4 5971993 ingo richardson woolsthorpe 3276", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
recidgivennamesurnamesuburbpc
0761859katechapmanbrighton4017
11384455lianhursecarisbrook3464
21933333matthewrussobardon4065
31564695lorrainezammitminchinbury2770
45971993ingorichardsonwoolsthorpe3276
\n
" }, - "execution_count": 1, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -133,27 +70,29 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this dataset, `recid` is the voter registration number. So we are able to verify the quality of a linkage between snapshots of this dataset taken at different times. `pc` refers to postcode.\n", + "In this dataset, `recid` is the voter registration number, with that we are able to verify the quality of a linkage\n", + "between snapshots of this dataset taken at different times. `pc` refers to postcode.\n", "\n", - "Next step is to config a blocking job. Before we do that, let's look at the blocking methods we are currently supporting:\n", + "The next step is to configure how to block the data. There are two privacy preserving blocking methods currently\n", + "supported by `blocklib`:\n", "\n", "1. Probabilistic signature (p-sig)\n", "2. LSH based $\\Lambda$-fold redundant (lambda-fold)\n", "\n", - "Let's firstly look at P-sig\n", + "This tutorial will demonstrate using both of these, starting with probabilistic signatures.\n", "\n", - "### Blocking Methods - Probabilistic signature (p-sig)\n", + "## Blocking Methods - Probabilistic signature (p-sig)\n", "\n", - "The high level idea behind this blocking method is that it uses signatures as the blocking key and place only records having same signatures into the same block. You can find the original paper here: [Scalable Entity Resolution Using Probabilistic Signatures on Parallel Databases](https://arxiv.org/abs/1712.09691).\n", + "The high level idea behind this blocking method is that it uses signatures as the blocking key and places records\n", + "having the same signatures into the same block. You can find the original paper here:\n", + "[Scalable Entity Resolution Using Probabilistic Signatures on Parallel Databases](https://arxiv.org/abs/1712.09691).\n", "\n", - "Detailed steps and explanations are in the following.\n", - "\n", - "Let's see an example of configuration for `p-sig`" + "An example blocking configuration using probabilistic signatures:" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -161,8 +100,7 @@ " \"type\": \"p-sig\",\n", " \"version\": 1,\n", " \"config\": {\n", - " \"blocking-features\": [1, 2],\n", - "# \"record-id-col\": 0,\n", + " \"blocking-features\": ['givenname', 'surname'],\n", " \"filter\": {\n", " \"type\": \"ratio\",\n", " \"max\": 0.02,\n", @@ -170,100 +108,149 @@ " },\n", " \"blocking-filter\": {\n", " \"type\": \"bloom filter\",\n", - " \"number-hash-functions\": 4,\n", + " \"number-hash-functions\": 20,\n", " \"bf-len\": 2048,\n", " },\n", " \"signatureSpecs\": [\n", " [\n", - " {\"type\": \"characters-at\", \"config\": {\"pos\": [0]}, \"feature\": 1},\n", - " {\"type\": \"characters-at\", \"config\": {\"pos\": [0]}, \"feature\": 2},\n", + " {\"type\": \"characters-at\", \"config\": {\"pos\": [0]}, \"feature\": \"givenname\"},\n", + " {\"type\": \"characters-at\", \"config\": {\"pos\": [0]}, \"feature\": \"surname\"},\n", " ],\n", " [\n", - " {\"type\": \"metaphone\", \"feature\": 1},\n", - " {\"type\": \"metaphone\", \"feature\": 2},\n", + " {\"type\": \"metaphone\", \"feature\": \"givenname\"},\n", + " {\"type\": \"metaphone\", \"feature\": \"surname\"},\n", " ]\n", " ]\n", " }\n", - "}" + "}\n" ] }, + { + "cell_type": "markdown", + "source": [ + "The blocking config can be fully validated to ensure all required types are present." 
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "BlockingSchemaModel(version=1, type=, config=PSigConfig(record_id_column=None, blocking_features=['givenname', 'surname'], filter=PSigFilterRatioConfig(type='ratio', max=0.02, min=0.0), blocking_filter=PSigBlockingBFFilterConfig(type='bloom filter', number_of_hash_functions=20, bloom_filter_length=2048), signatures=[[PSigCharsAtSignatureSpec(type=, feature='givenname', config=PSigCharsAtSignatureConfig(pos=[0])), PSigCharsAtSignatureSpec(type=, feature='surname', config=PSigCharsAtSignatureConfig(pos=[0]))], [PSigMetaphoneSignatureSpec(type=, feature='givenname'), PSigMetaphoneSignatureSpec(type=, feature='surname')]]))"
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "blocklib.validation.validate_signature_config(blocking_config)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "**Step1 - Generate Signature**\n",
+    "### Step 1 - Generate Signatures\n",
     "\n",
-    "For a record `r`, a signature is a sub-record derived from record `r` with a signature strategy. An example signature strategy is to concatenate the initials of first and last name, e.g., the signature for record `\"John White\"` is `\"JW\"`.\n",
+    "For a record `r`, a signature is a sub-record derived from record `r` with a signature strategy. An example signature\n",
+    "strategy is to concatenate the initials of the first and last name, e.g., the signature for the record `\"John White\"` is `\"JW\"`.\n",
     "\n",
-    "We provide the following signature strategies:\n",
+    "`blocklib` provides the following signature strategies:\n",
     "\n",
-    "* feature-value: the signature is generated by returning the selected feature\n",
-    "* characters-at: the signature is generated by selecting a single character or a sequence of characters from selected feature\n",
-    "* metaphone: the signature is generated by phonetic encoding the selected feature using metaphone\n",
+    "* `feature-value`: the signature is generated by returning the selected feature\n",
+    "* `characters-at`: the signature is generated by selecting a single character or a sequence of characters from the selected feature\n",
+    "* `metaphone`: the signature is generated by phonetically encoding the selected feature using metaphone\n",
     "\n",
-    "The output of this step is a reversed index where keys are generated signatures / blocking key and the values are list of corresponding record IDs. A record ID could be row index or the actual record identifier if it is available in the dataset.\n",
+    "The output of this step is a reversed index where the keys are the generated signatures / blocking keys, and the\n",
+    "values are lists of corresponding record IDs. A record ID could be the row index or the actual record identifier\n",
+    "if it is available in the dataset.\n",
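+    "\n",
+    "As a toy illustration of such a reversed index (plain Python, not blocklib's actual implementation), here is the initials strategy applied by hand to three made-up records:\n",
+    "\n",
+    "```python\n",
+    "records = [[0, 'john', 'white'], [1, 'kate', 'chapman'], [2, 'jane', 'west']]\n",
+    "\n",
+    "reversed_index = {}\n",
+    "for rec_id, givenname, surname in records:\n",
+    "    signature = givenname[0] + surname[0]  # 'characters-at' pos [0] on both features\n",
+    "    reversed_index.setdefault(signature, []).append(rec_id)\n",
+    "\n",
+    "# reversed_index == {'jw': [0, 2], 'kc': [1]}\n",
+    "```\n",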
     "\n",
-    "Signature strategies are defined in the `signatureSpecs` section. For example, in the above configuration, we are going to generate two signatures for each record. The first signature is a combination of 3 different signature strategies\n",
+    "Signature strategies are defined in the `signatureSpecs` section. For example, in the above configuration, we are\n",
+    "going to generate two signatures for each record. The first signature produces initials:\n",
     "\n",
-    "```\n",
-    "    {\"type\": \"characters-at\", \"config\": {\"pos\": [0]}, \"feature\": 1},\n",
-    "    {\"type\": \"characters-at\", \"config\": {\"pos\": [0]}, \"feature\": 2},\n",
-    "    {\"type\": \"feature-value\", \"feature_idx\": 4}\n",
-    "```\n",
-    "It combines the initials of first and last name and postcode.\n",
+    "```json\n",
+    "[\n",
+    "    {\"type\": \"characters-at\", \"config\": {\"pos\": [0]}, \"feature\": \"givenname\"},\n",
+    "    {\"type\": \"characters-at\", \"config\": {\"pos\": [0]}, \"feature\": \"surname\"}\n",
+    "]\n",
+    "```\n",
     "\n",
-    "The second signature is generated by a combination of 2 signature strategies:\n",
-    "```\n",
-    "    {\"type\": \"metaphone\", \"feature\": 1},\n",
-    "    {\"type\": \"metaphone\", \"feature\": 2},\n",
+    "The second signature is generated by a combination of how the two components of a person's name sound:\n",
+    "\n",
+    "```json\n",
+    "[\n",
+    "    {\"type\": \"metaphone\", \"feature\": \"givenname\"},\n",
+    "    {\"type\": \"metaphone\", \"feature\": \"surname\"}\n",
+    "]\n",
     "```\n",
     "That is, the phonetic encoding of the first and last name.\n",
     "\n",
     "*One signature corresponds to one block; the terms signature and block are used interchangeably from here on.*\n",
     "\n",
-    "**Step2 - Filter Too Frequent Signatures**\n",
+    "### Step 2 - Filter Signatures\n",
     "\n",
-    "A signature is assumed to identify a record as uniquely as possible. Therefore, we need to filter out some too frequent signatures since they can uniquely identify the record. On the otherside, we want to be resilient to frequency attack, so we need to filter out too rare signature that only contains very few records. The configuration of filtering is in the `filter` part. For example, in the above configuration, the filter section is configured as:\n",
+    "Signature strategies can create blocks with many records, and blocks with just one record. To impose limits\n",
+    "on the minimum and maximum block size, `blocklib` provides configurable filtering.\n",
     "\n",
-    "```\n",
-    "    \"filter\": {\n",
-    "        \"type\": \"ratio\",\n",
-    "        \"max\": 0.02,\n",
-    "        \"min\": 0.001,\n",
-    "    }\n",
-    "```\n",
-    "Then we will filter out all signatures / blocks whose number of records is greater than 2% of number of total records or is less than 0.1% of number of total records. \n",
+    "For example, in the above configuration, the filter is configured as:\n",
+    "\n",
+    "```json\n",
+    "{\n",
+    "    \"type\": \"ratio\",\n",
+    "    \"max\": 0.02,\n",
+    "    \"min\": 0.001\n",
+    "}\n",
+    "```\n",
     "\n",
+    "`blocklib` will filter out all signatures / blocks that contain more than 2% or fewer than 0.1% of the total\n",
+    "number of records. Note these percentages are relative to the data each party provides to `blocklib`, so ratio\n",
+    "filtering should only be used when the datasets being linked are of roughly similar size.\n",
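+    "\n",
+    "To make the ratio filter concrete, here is the arithmetic for a hypothetical dataset of 10,000 records:\n",
+    "\n",
+    "```python\n",
+    "n_records = 10_000                  # hypothetical dataset size\n",
+    "max_block_size = 0.02 * n_records   # blocks with more than 200 records get filtered out\n",
+    "min_block_size = 0.001 * n_records  # blocks with fewer than 10 records get filtered out\n",
+    "```\n",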
     "\n",
-    "Note that we also support absoulte filtering configuration i.e. filter by number of counts. For example:\n",
+    "Absolute filtering is also supported to filter by number of records. An example `filter` configuration:\n",
     "\n",
-    "```\n",
-    "    \"filter\": {\n",
-    "        \"type\": \"count\",\n",
-    "        \"max\": 100,\n",
-    "        \"min\": 5,\n",
-    "    }\n",
-    "```\n",
+    "```json\n",
+    "{\n",
+    "    \"type\": \"count\",\n",
+    "    \"max\": 100,\n",
+    "    \"min\": 5\n",
+    "}\n",
+    "```\n",
     "\n",
-    "**Step3 - Anonymization**\n",
+    "### Step 3 - Anonymization\n",
     "\n",
-    "Given we want to do privacy preserving record linkage, the signatures need to be hashed to avoid leaking of PII information. The most frequent used data structure of such encoding is Bloom Filter. Here we use one Bloom Filter and map all filtered signatures into that Bloom Filter. The configuration of Bloom Filter is in `block-filter` section:\n",
+    "Given the aim of privacy preserving record linkage, the signatures themselves (e.g. `\"JW\"`) are not going to be\n",
+    "shared. Instead, following the `p-sig` paper, the signatures all get encoded into a Bloom Filter. Here we use one\n",
+    "Bloom Filter and map all filtered signatures into that Bloom Filter.\n",
     "\n",
-    "```\n",
-    "    \"blocking-filter\": {\n",
-    "        \"type\": \"bloom filter\",\n",
-    "        \"number-hash-functions\": 20,\n",
-    "        \"bf-len\": 2048,\n",
-    "    }\n",
-    "```\n",
+    "```\n",
+    "\"blocking-filter\": {\n",
+    "    \"type\": \"bloom filter\",\n",
+    "    \"number-hash-functions\": 20,\n",
+    "    \"bf-len\": 2048,\n",
+    "}\n",
+    "```\n",
     "\n",
-    "After anonymization, the signature becomes the set of indices of bits 1 in the bloom filter and hence can preseve the privacy of data for each data provider.\n",
+    "After anonymization, the signature becomes the set of indices of the bits set to 1 in the Bloom Filter, and hence\n",
+    "preserves the privacy of the data for each data provider.\n",
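+    "\n",
+    "For intuition, here is one common construction for mapping a signature to bit indices (a sketch only — not necessarily the exact hashing scheme blocklib uses internally):\n",
+    "\n",
+    "```python\n",
+    "import hashlib\n",
+    "\n",
+    "def bloom_indices(signature, num_hash_funcs=20, bf_len=2048):\n",
+    "    # derive each 'hash function' by salting a SHA-256 digest with its index\n",
+    "    return {int(hashlib.sha256(f'{i}:{signature}'.encode()).hexdigest(), 16) % bf_len\n",
+    "            for i in range(num_hash_funcs)}\n",
+    "\n",
+    "bloom_indices('JW')  # a set of up to 20 bit positions in [0, 2048)\n",
+    "```\n",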
     "\n",
-    "### Carry out Blocking Job\n",
+    "### Blocking Data\n",
     "\n",
-    "Okay, once you have a good understanding of the P-Sig blocking, we can carry out our blocking job with `blocklib`. First, we need to process the data since `blocklib` only accept list of tuples or lists as input data. An example data input for blocklib is\n",
+    "Now that we have configured how the P-Sig blocking will work, we can carry out our blocking job with `blocklib`.\n",
+    "Note `blocklib` only accepts lists of tuples or lists as input data, so some pre-processing may be necessary. Example\n",
+    "data input for `blocklib`:\n",
     "\n",
-    "```\n",
+    "```python\n",
     "[\n",
     "    [761859, 'kate', 'chapman', 'brighton', 4017],\n",
     "    [1384455, 'lian', 'hurse', 'carisbrook', 3464],\n",
     "    [1933333, 'matthew', 'russo', 'bardon', 4065],\n",
     "    [1564695, 'lorraine', 'zammit', 'minchinbury', 2770],\n",
     "    [5971993, 'ingo', 'richardson', 'woolsthorpe', 3276]\n",
     "]\n",
     "```\n",
     "\n",
-    "**Step1 - Generate Candidate Blocks for Party A - Alice**"
+    "**Step 1 - Generate Candidate Blocks for Party A - Alice**"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Example PII ['761859', 'kate', 'chapman', 'brighton', '4017']\n"
      ]
     }
    ],
    "source": [
     "# NBVAL_IGNORE_OUTPUT\n",
-    "data_alice = df_alice.to_dict(orient='split')['data']\n",
-    "print(\"Example PII\", data_alice[0])"
+    "alice = df_alice.to_dict(orient='split')\n",
+    "print(\"Example PII\", alice['data'][0])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "P-Sig: 100.0% records are covered in blocks\n",
       "Statistics for the generated blocks:\n",
-      "\tNumber of Blocks: 5029\n",
+      "\tNumber of Blocks: 5028\n",
       "\tMinimum Block Size: 1\n",
       "\tMaximum Block Size: 61\n",
-      "\tAverage Block Size: 1.8337641678266057\n",
+      "\tAverage Block Size: 1.834128878281623\n",
       "\tMedian Block Size: 1\n",
-      "\tStandard Deviation of Block Size: 3.8368431973204213\n"
+      "\tStandard Deviation of Block Size: 3.8371894627245102\n"
      ]
     }
    ],
    "source": [
     "# NBVAL_IGNORE_OUTPUT\n",
     "from blocklib import generate_candidate_blocks\n",
     "\n",
-    "block_obj_alice = generate_candidate_blocks(data_alice, blocking_config)"
+    "block_obj_alice = generate_candidate_blocks(alice['data'], blocking_config, header=alice['columns'])"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The statistics of blocks are printed for you to inspect the block distribution and decide if this is a good blocking result. Here both average and median block sizes are 1 which is resilient to frequency attack. \n",
+    "The statistics of blocks are printed for you to inspect the block distribution and decide if this is a good blocking\n",
+    "result. Here both average and median block sizes are 1, which is resilient to frequency attacks.\n",
+    "\n",
+    "`generate_candidate_blocks` returns a `CandidateBlockingResult`; the attribute we are most interested in is `blocks`,\n",
+    "a `dict` that maps signatures to lists of record IDs.\n",
     "\n",
-    "You can get the blocking instance and blocks/reversed indice in the `block_obj_alice`. Let's look at the first block in the reversed indcies:"
+    "Let's look at the first block:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    },
     {
      "data": {
-      "text/plain": [
-       "'(1560, 401, 491, 1470)'"
-      ]
+      "text/plain": "'(1920, 1031, 142, 401, 1560, 671, 1830, 941, 52, 1211, 1470, 581, 1740, 851, 2010, 1121, 232, 491, 1650, 761)'"
      },
-     "execution_count": 5,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# NBVAL_IGNORE_OUTPUT\n",
-    "print(block_obj_alice.state)\n",
     "list(block_obj_alice.blocks.keys())[0]"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To protect the privacy of data, the signature / blocking key is not the original signature such as `JW`. Instead, it is a list of mapped indices of bits 1 in Bloom Filter of `JW`. Next we want to do the same thing for another party - Bob.\n",
+    "To protect privacy, the signature / blocking key is not the original signature such as `JW`. Instead, it is a list of\n",
+    "mapped indices of bits set to 1 in the Bloom Filter for the original signature. Next we want to do the same thing for\n",
+    "another party - _enter Bob_.\n",
     "\n",
-    "**Step2 - Generate Candidate Blocks for Party B - Bob**"
+    "**Step 2 - Generate Candidate Blocks for Party B - Bob**"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "P-Sig: 100.0% records are covered in blocks\n",
       "Statistics for the generated blocks:\n",
       "\tNumber of Blocks: 5018\n",
       "\tMinimum Block Size: 1\n",
       "\tMaximum Block Size: 59\n",
       "\tAverage Block Size: 1.8377839776803508\n",
       "\tMedian Block Size: 1\n",
-      "\tStandard Deviation of Block Size: 3.838423809405143\n",
-      "\n",
-      "(1098, 707, 316, 1973)\n",
+      "\tStandard Deviation of Block Size: 3.8382680217493017\n",
+      "(1284, 1675, 18, 409, 800, 1191, 1582, 1973, 316, 707, 1861, 1098, 204, 595, 986, 1377, 1768, 111, 502, 893)\n",
       "[1, 25, 765, 1078, 1166, 1203, 1273, 1531, 1621, 1625, 1755, 1965, 2027, 2824, 3106, 3125, 3414, 3501, 3610, 4033, 4139, 4472, 4579]\n"
      ]
     }
    ],
    "source": [
     "# NBVAL_IGNORE_OUTPUT\n",
     "df_bob = pd.read_csv('data/bob.csv')\n",
-    "data_bob = df_bob.to_dict(orient='split')['data']\n",
-    "block_obj_bob = generate_candidate_blocks(data_bob, blocking_config)\n",
-    "print(block_obj_bob.state)\n",
+    "bob = df_bob.to_dict(orient='split')\n",
+    "block_obj_bob = generate_candidate_blocks(bob['data'], blocking_config, header=bob['columns'])\n",
+    "\n",
     "print(list(block_obj_bob.blocks.keys())[0])\n",
     "print(list(block_obj_bob.blocks.values())[1])"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Generate Final Blocks\n",
     "\n",
-    "Now we have candidate blocks from both parties, we can generate final blocks by only including signatures that appear in both parties. 
Instead of directly comparing signature, the algorithm will firstly map the list of signatures into a Bloom Filter for for each party called the candidate blocking filter, and then creates the combined blocking filter by only retaining the bits that are present in all candidate filters." + "Now we have _candidate_ blocks from both parties, we can generate final blocks by only including signatures that appear\n", + "in both parties. Instead of directly comparing signatures, the algorithm maps the list of signatures into a\n", + "Bloom Filter for each party called the candidate blocking filter, and then creates the combined blocking filter by only\n", + "retaining the bits that are present in both candidate filters." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Alice: 2793 out of 5029 blocks are in common\n", - "Bob: 2793 out of 5018 blocks are in common\n" + "Alice: 2794 out of 5028 blocks are in common\n", + "Bob: 2794 out of 5018 blocks are in common\n" ] } ], @@ -440,7 +425,8 @@ "source": [ "### Assess Blocking\n", "\n", - "We can assess the blocking result when we have ground truth. There are two main metrics to assess blocking result as we mentioned in the beginning of this tutorial. Here is a recap:\n", + "We can assess the blocking result when we have ground truth. There are two main metrics to assess blocking result as\n", + "mentioned in the beginning of this tutorial. Here is a recap:\n", "\n", "* reduction ratio: relative reduction in the number of record pairs to be compared.\n", "* pair completeness: the percentage of true matches after blocking\n" @@ -448,14 +434,21 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "assessing blocks: 100%|██████████| 2793/2793 [00:00<00:00, 97204.45key/s]\n" + "assessing blocks: 100%|██████████| 2794/2794 [00:00<00:00, 152397.17key/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reduction Ratio: 0.996, Pair Completeness: 1.000\n" ] } ], @@ -463,108 +456,46 @@ "# NBVAL_IGNORE_OUTPUT\n", "from blocklib.evaluation import assess_blocks_2party\n", "\n", + "subdata1 = [x[0] for x in alice['data']]\n", + "subdata2 = [x[0] for x in bob['data']]\n", "\n", - "subdata1 = [x[0] for x in data_alice]\n", - "subdata2 = [x[0] for x in data_bob]\n", + "rr, pc = assess_blocks_2party(\n", + " [filtered_blocks_alice, filtered_blocks_bob],\n", + " [subdata1, subdata2]\n", + ")\n", "\n", - "rr, pc = assess_blocks_2party([filtered_blocks_alice, filtered_blocks_bob],\n", - " [subdata1, subdata2])" + "print(f\"Reduction Ratio: {rr:.3f}, Pair Completeness: {pc:.3f}\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Feature Name are also Supported!\n", + "### Blocking Methods - LSH Based $\\Lambda$-fold Redundant\n", "\n", - "When there are many columns in the data, it is a bit inconvenient to use feature index. 
Luckily, blocklib also supports feature name in the blocking schema:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "blocking_config = {\n", - " \"type\": \"p-sig\",\n", - " \"version\": 1,\n", - " \"config\": {\n", - " \"blocking-features\": ['givenname', 'surname'],\n", - " \"filter\": {\n", - " \"type\": \"ratio\",\n", - " \"max\": 0.02,\n", - " \"min\": 0.00,\n", - " },\n", - " \"blocking-filter\": {\n", - " \"type\": \"bloom filter\",\n", - " \"number-hash-functions\": 4,\n", - " \"bf-len\": 2048,\n", - " },\n", - " \"signatureSpecs\": [\n", - " [\n", - " {\"type\": \"characters-at\", \"config\": {\"pos\": [0]}, \"feature\": 'givenname'},\n", - " {\"type\": \"characters-at\", \"config\": {\"pos\": [0]}, \"feature\": 'surname'},\n", - " ],\n", - " [\n", - " {\"type\": \"metaphone\", \"feature\": 'givenname'},\n", - " {\"type\": \"metaphone\", \"feature\": 'surname'},\n", - " ]\n", - " ]\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When generating candidate blocks, a header is required to pass through:" + "Now we look the other blocking method that we support - LSH Based $\\Lambda$-fold Redundant blocking. This blocking\n", + "method uses a list of selected bits selected randomly from Bloom Filter for each record as block keys.\n", + "$\\Lambda$ refers the degree of redundancy i.e. we will conduct LSH-based blocking $\\Lambda$ times, each forms a\n", + "blocking group. Then those blocking groups are combined into one blocking results. This will make a record\n", + "redundant $\\Lambda$ times but will increase the recall.\n", + "\n", + "Let's see an example config, this time selecting the blocking features using column indices instead of column names:" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "P-Sig: 100.0% records are covered in blocks\n", - "Statistics for the generated blocks:\n", - "\tNumber of Blocks: 5029\n", - "\tMinimum Block Size: 1\n", - "\tMaximum Block Size: 61\n", - "\tAverage Block Size: 1.8337641678266057\n", - "\tMedian Block Size: 1\n", - "\tStandard Deviation of Block Size: 3.8368431973204213\n" - ] + "data": { + "text/plain": "BlockingSchemaModel(version=1, type=, config=LambdaConfig(record_id_column=None, blocking_features=[1, 2], Lambda=5, bloom_filter_length=2048, number_of_hash_functions=10, K=40, block_encodings=False, random_state=0))" + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" } ], - "source": [ - "data_alice = df_alice.to_dict(orient='split')['data']\n", - "header = list(df_alice.columns)\n", - "\n", - "block_obj_alice = generate_candidate_blocks(data_alice, blocking_config, header=header)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Blocking Methods - LSH Based $\\Lambda$-fold Redundant\n", - "\n", - "Now we look the other blocking method that we support - LSH Based $\\Lambda$-fold Redundant blocking.This blocking method uses the a list of selected bits selected randomly from Bloom Filter for each record as block keys. $\\Lambda$ refers the degree of redundancy i.e. we will conduct LSH-based blocking $\\Lambda$ times, each forms a blocking group. Then those blocking groups are combined into one blocking results. 
This will make a record redundant $\\Lambda$ times but will increase the recall.\n", - "\n", - "Let's see an example config of it:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], "source": [ "blocking_config = {\n", " \"type\": \"lambda-fold\",\n", @@ -578,15 +509,15 @@ " \"random_state\": 0,\n", " \"input-clks\": False\n", " }\n", - "}" + "}\n", + "\n", + "blocklib.validation.validate_signature_config(blocking_config)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "Now let's explain the meaning of each argument:\n", "\n", "* blocking-features: a list of feature indice that we are going to use to generate blocks\n", @@ -601,7 +532,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -631,15 +562,15 @@ "source": [ "# NBVAL_IGNORE_OUTPUT\n", "print('Generating candidate blocks for Alice:')\n", - "block_obj_alice = generate_candidate_blocks(data_alice, blocking_config)\n", + "block_obj_alice = generate_candidate_blocks(alice['data'], blocking_config)\n", "print()\n", "print('Generating candidate blocks for Bob: ')\n", - "block_obj_bob = generate_candidate_blocks(data_bob, blocking_config)" + "block_obj_bob = generate_candidate_blocks(bob['data'], blocking_config)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -660,29 +591,21 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "assessing blocks: 100%|██████████| 4167/4167 [00:00<00:00, 7690.70key/s] " + "assessing blocks: 100%|██████████| 4167/4167 [00:00<00:00, 10746.77key/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "RR=0.8823915973988634\n", - "PC=1.0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" + "Reduction Ratio: 0.882, Pair Completeness: 1.000\n" ] } ], @@ -690,8 +613,8 @@ "# NBVAL_IGNORE_OUTPUT\n", "rr, pc = assess_blocks_2party([filtered_blocks_alice, filtered_blocks_bob],\n", " [subdata1, subdata2])\n", - "print('RR={}'.format(rr))\n", - "print('PC={}'.format(pc))" + "\n", + "print(f\"Reduction Ratio: {rr:.3f}, Pair Completeness: {pc:.3f}\")" ] } ], @@ -712,17 +635,8 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } } }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 1f31a13..9b23f55 100644 --- a/poetry.lock +++ b/poetry.lock @@ -145,7 +145,7 @@ toml = ["toml"] name = "dataclasses" version = "0.8" description = "A backport of the dataclasses module for Python 3.6" -category = "dev" +category = "main" optional = false python-versions = ">=3.6, <3.7" @@ -521,6 +521,22 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +[[package]] +name = "pydantic" +version = "1.8.1" +description = "Data validation and settings management using python 3.6 type hinting" +category = "main" +optional = false +python-versions = ">=3.6.1" + +[package.dependencies] +dataclasses = {version = ">=0.6", markers = "python_version < \"3.7\""} +typing-extensions = ">=3.7.4.3" + +[package.extras] +dotenv = ["python-dotenv (>=0.10.4)"] +email = ["email-validator (>=1.0.3)"] + [[package]] name = "pyflakes" version = "2.2.0" @@ 
-763,7 +779,7 @@ testing = ["pytest (>=4.6)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pyt [metadata] lock-version = "1.1" python-versions = ">=3.6.1, <4" -content-hash = "af042e912e9bc8c495bad0998467a8804b807b980480bacf428b896ff7856070" +content-hash = "08d8d2a2fffd59429ecc946ccc9db9ee54a3f51e5db825b512fc86cc3a7e6ecd" [metadata.files] appdirs = [ @@ -1144,6 +1160,30 @@ pycparser = [ {file = "pycparser-2.20-py2.py3-none-any.whl", hash = "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"}, {file = "pycparser-2.20.tar.gz", hash = "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0"}, ] +pydantic = [ + {file = "pydantic-1.8.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:0c40162796fc8d0aa744875b60e4dc36834db9f2a25dbf9ba9664b1915a23850"}, + {file = "pydantic-1.8.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:fff29fe54ec419338c522b908154a2efabeee4f483e48990f87e189661f31ce3"}, + {file = "pydantic-1.8.1-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:fbfb608febde1afd4743c6822c19060a8dbdd3eb30f98e36061ba4973308059e"}, + {file = "pydantic-1.8.1-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:eb8ccf12295113ce0de38f80b25f736d62f0a8d87c6b88aca645f168f9c78771"}, + {file = "pydantic-1.8.1-cp36-cp36m-win_amd64.whl", hash = "sha256:20d42f1be7c7acc352b3d09b0cf505a9fab9deb93125061b376fbe1f06a5459f"}, + {file = "pydantic-1.8.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dde4ca368e82791de97c2ec019681ffb437728090c0ff0c3852708cf923e0c7d"}, + {file = "pydantic-1.8.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:3bbd023c981cbe26e6e21c8d2ce78485f85c2e77f7bab5ec15b7d2a1f491918f"}, + {file = "pydantic-1.8.1-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:830ef1a148012b640186bf4d9789a206c56071ff38f2460a32ae67ca21880eb8"}, + {file = "pydantic-1.8.1-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:fb77f7a7e111db1832ae3f8f44203691e15b1fa7e5a1cb9691d4e2659aee41c4"}, + {file = "pydantic-1.8.1-cp37-cp37m-win_amd64.whl", hash = "sha256:3bcb9d7e1f9849a6bdbd027aabb3a06414abd6068cb3b21c49427956cce5038a"}, + {file = "pydantic-1.8.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2287ebff0018eec3cc69b1d09d4b7cebf277726fa1bd96b45806283c1d808683"}, + {file = "pydantic-1.8.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:4bbc47cf7925c86a345d03b07086696ed916c7663cb76aa409edaa54546e53e2"}, + {file = "pydantic-1.8.1-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:6388ef4ef1435364c8cc9a8192238aed030595e873d8462447ccef2e17387125"}, + {file = "pydantic-1.8.1-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:dd4888b300769ecec194ca8f2699415f5f7760365ddbe243d4fd6581485fa5f0"}, + {file = "pydantic-1.8.1-cp38-cp38-win_amd64.whl", hash = "sha256:8fbb677e4e89c8ab3d450df7b1d9caed23f254072e8597c33279460eeae59b99"}, + {file = "pydantic-1.8.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2f2736d9a996b976cfdfe52455ad27462308c9d3d0ae21a2aa8b4cd1a78f47b9"}, + {file = "pydantic-1.8.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:3114d74329873af0a0e8004627f5389f3bb27f956b965ddd3e355fe984a1789c"}, + {file = "pydantic-1.8.1-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:258576f2d997ee4573469633592e8b99aa13bda182fcc28e875f866016c8e07e"}, + {file = "pydantic-1.8.1-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:c17a0b35c854049e67c68b48d55e026c84f35593c66d69b278b8b49e2484346f"}, + {file = "pydantic-1.8.1-cp39-cp39-win_amd64.whl", hash = "sha256:e8bc082afef97c5fd3903d05c6f7bb3a6af9fc18631b4cc9fedeb4720efb0c58"}, + {file = "pydantic-1.8.1-py3-none-any.whl", 
hash = "sha256:e3f8790c47ac42549dc8b045a67b0ca371c7f66e73040d0197ce6172b385e520"}, + {file = "pydantic-1.8.1.tar.gz", hash = "sha256:26cf3cb2e68ec6c0cfcb6293e69fb3450c5fd1ace87f46b64f678b0d29eac4c3"}, +] pyflakes = [ {file = "pyflakes-2.2.0-py2.py3-none-any.whl", hash = "sha256:0d94e0e05a19e57a99444b6ddcf9a6eb2e5c68d3ca1e98e90707af8152c90a92"}, {file = "pyflakes-2.2.0.tar.gz", hash = "sha256:35b2d75ee967ea93b55750aa9edbbf72813e06a66ba54438df2cfac9e3c27fc8"}, diff --git a/pyproject.toml b/pyproject.toml index ef223a2..69c1a01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "blocklib" -version = "0.1.7" +version = "0.1.8" description = "A library for blocking in record linkage" license = "Apache-2.0" authors = [