-
Notifications
You must be signed in to change notification settings - Fork 5
/
Makefile
100 lines (74 loc) · 3.54 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
## --- echo commands (for debugging)
## SHELL = sh -xv
##-------------------------------------
## Set up main path variables
##-------------------------------------
## set basic variables
## All variables below are simply expanded (`:=`): each right-hand side is
## evaluated once while the Makefile is parsed, instead of being re-evaluated
## (re-running pwd / wildcard) every time the variable is referenced.
PYTHON := python
## absolute path of the directory make runs in (make's built-in CURDIR,
## no extra shell process needed)
QUIPP_ROOT := $(CURDIR)
## construct lists of run input .json files and their base prefixes
## (e.g. run-inputs/foo.json -> foo)
RUN_INPUTS := $(wildcard run-inputs/*.json)
RUN_INPUTS_BASE_PREFIX := $(patsubst %.json,%,$(notdir $(RUN_INPUTS)))
## construct list of synthetic output directories using the input file names
## (e.g. foo -> synth-output/foo)
SYNTH_OUTPUTS_PREFIX := $(addprefix synth-output/,$(RUN_INPUTS_BASE_PREFIX))
## construct list of the synthetic output .csv file names
## (only the 1st synthetic .csv is used)
SYNTH_OUTPUTS_CSV := $(addsuffix /synthetic_data_1.csv,$(SYNTH_OUTPUTS_PREFIX))
## Construct a list of .json file names for each utility and privacy metric
SYNTH_OUTPUTS_PRIV_DISCL_RISK := $(addsuffix /privacy_disclosure_risk.json,$(SYNTH_OUTPUTS_PREFIX))
SYNTH_OUTPUTS_UTIL_CLASS := $(addsuffix /utility_classifiers.json,$(SYNTH_OUTPUTS_PREFIX))
SYNTH_OUTPUTS_UTIL_CORR := $(addsuffix /utility_correlations.json,$(SYNTH_OUTPUTS_PREFIX))
## If a recipe fails, delete the target it was building so that a partially
## written file is never mistaken for an up-to-date one by later rules.
.DELETE_ON_ERROR:
## These targets are commands, not files.
.PHONY: all all-synthetic generated-data clean
## Default goal: every privacy and utility metric file for every run input.
all: $(SYNTH_OUTPUTS_PRIV_DISCL_RISK) $(SYNTH_OUTPUTS_UTIL_CLASS) $(SYNTH_OUTPUTS_UTIL_CORR)
## Only synthesize the data (no metrics).
all-synthetic: $(SYNTH_OUTPUTS_CSV)
##-------------------------------------
## Generate input data
##-------------------------------------
## set data file paths
## The two files that make up the "A&E deidentified" generated dataset.
AE_DEIDENTIFIED_DATA = \
  generator-outputs/odi-nhs-ae/hospital_ae_data_deidentify.csv \
  generator-outputs/odi-nhs-ae/hospital_ae_data_deidentify.json
## Location of the London postcodes .csv (the space in the file name is
## backslash-escaped so make treats the path as a single word).
LONDON_POSTCODES = generators/odi-nhs-ae/data/London\ postcodes.csv
## Convenience target: build every generated input dataset.
generated-data: $(AE_DEIDENTIFIED_DATA)
# Download the London Postcodes dataset used by the A&E generated
# dataset (this is about 133 MB).
#   - the data/ directory is created first in case it does not exist yet
#   - --fail makes curl exit non-zero on an HTTP error instead of saving the
#     server's error page as the .csv; --location follows redirects
#   - the URL is quoted so the shell never treats its `?` as a glob character
$(LONDON_POSTCODES):
	mkdir -p generators/odi-nhs-ae/data && \
	cd generators/odi-nhs-ae/ && \
	curl --fail --location -o "./data/London postcodes.csv" \
		"https://www.doogal.co.uk/UKPostcodesCSV.ashx?region=E12000007"
# Make the "A&E deidentified" generated dataset.
# This is currently the only generated dataset, so it is handled with
# its own rule.
# `&:` declares a grouped target (GNU Make 4.3 or newer): a single run of the
# recipe is declared to produce both files listed in AE_DEIDENTIFIED_DATA.
# The recipe runs generate.py and then deidentify.py from inside the output
# directory; $(@D) is the directory part of the target being built.
# Script paths are quoted so a checkout location containing spaces still works.
$(AE_DEIDENTIFIED_DATA) &: $(LONDON_POSTCODES)
	mkdir -p "$(@D)" && \
	cd "$(@D)" && \
	$(PYTHON) "$(QUIPP_ROOT)/generators/odi-nhs-ae/generate.py" && \
	$(PYTHON) "$(QUIPP_ROOT)/generators/odi-nhs-ae/deidentify.py"
##-------------------------------------
## Generate synthetic data
##-------------------------------------
## synthesize data - this rule also builds "synth-output/%/data_description.json"
## Static pattern rule: for each run input run-inputs/NAME.json, create the
## output directory synth-output/NAME, copy the input .json into it, and run
## synthesize.py with that directory as its output location.
##   $<    = run-inputs/NAME.json (first prerequisite)
##   $(@D) = synth-output/NAME    (directory part of the target)
## The interpreter comes from $(PYTHON) so it can be overridden in one place.
$(SYNTH_OUTPUTS_CSV) : \
synth-output/%/synthetic_data_1.csv : run-inputs/%.json $(AE_DEIDENTIFIED_DATA)
	mkdir -p $(@D) && \
	cp $< $(@D) && \
	$(PYTHON) synthesize.py -i $< -o $(@D)
##-------------------------------------
## Calculate privacy and utility
##-------------------------------------
## compute privacy and utility metrics
## privacy metric: disclosure risk
## Runs disclosure_risk.py with the run input .json ($<) as input and the
## target's directory ($(@D)) as output location; rebuilt when either the run
## input or the synthetic .csv changes.
$(SYNTH_OUTPUTS_PRIV_DISCL_RISK) : \
synth-output/%/privacy_disclosure_risk.json : \
run-inputs/%.json synth-output/%/synthetic_data_1.csv
	$(PYTHON) metrics/privacy-metrics/disclosure_risk.py -i $< -o $(@D)
## utility metric: classifiers
## Runs classifiers.py with the run input .json ($<) as input and the
## target's directory ($(@D)) as output location; rebuilt when either the run
## input or the synthetic .csv changes.
$(SYNTH_OUTPUTS_UTIL_CLASS) : \
synth-output/%/utility_classifiers.json : \
run-inputs/%.json synth-output/%/synthetic_data_1.csv
	$(PYTHON) metrics/utility-metrics/classifiers.py -i $< -o $(@D)
## utility metric: correlations
## Runs correlations.py with the run input .json ($<) as input and the
## target's directory ($(@D)) as output location; rebuilt when either the run
## input or the synthetic .csv changes.
$(SYNTH_OUTPUTS_UTIL_CORR) : \
synth-output/%/utility_correlations.json : \
run-inputs/%.json synth-output/%/synthetic_data_1.csv
	$(PYTHON) metrics/utility-metrics/correlations.py -i $< -o $(@D)
##-------------------------------------
## Clean
##-------------------------------------
## Remove the two output trees written by the rules in this file
## ($(RM) is make's built-in `rm -f`).
clean:
	$(RM) -r generator-outputs synth-output