# review_public_geo.R
# Perform a privacy impact review on the COVID case surveillance public data file
# with geography, checking the 10 suppression rules:
#   - k-anonymity 1,000 for state/county (rules #1, #2)
#   - confirm suppression of low-population counties (rule #3)
#   - confirm suppression of low-population demographics within counties (rule #4)
#   - confirm suppression of demographics where the case count is too high for a county (rule #5)
#   - k-anonymity 11 across all quasi-identifiers (rule #6)
#   - confirm suppression of linked variables (rules #7, #8, #9)
#   - confirm suppression where only one county in a state is suppressed (rule #10)
# If we had confidential variables we would also check l-diversity, but we don't, so we don't.
# The actual suppression logic lives in the HHSProtect code repository; this script
# only confirms that published files meet the privacy rules, to reduce the risk of
# reidentification.
# Note that this assumes missing values aren't factored into k-anonymity; to make
# them count as wildcards, set the alpha parameter to the fraction at which they
# should count as wildcards.
# The census file is processed in Protect based on
# https://www.census.gov/data/tables/time-series/demo/popest/2010s-counties-detail.html,
# using the 2019 estimates for consistency with other parts of the response. The
# version stored in this repo is the summed and formatted version, to make life
# easier. If we ever use a year other than 2019, regenerate it and update the repo.
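# For intuition: k-anonymity over a set of quasi-identifiers means every observed
# combination of values appears in at least k records. A minimal sketch of the same
# idea in base R (illustration only, not used by this script; `toy` is a
# hypothetical data frame with the location quasi-identifiers):
#   counts <- aggregate(list(fk = rep(1L, nrow(toy))), by = toy[c("res_state", "res_county")], FUN = sum)
#   counts[counts$fk < 1000, ]   # any row here is a k-anonymity-1000 violation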
# debug helpers (uncomment as needed):
#sdcApp(maxRequestSize = 2000)
#View(data)
# timestamp the start of the run
cat(toString(Sys.time()))
source("functions.R")
library(arrow)
library(sdcMicro)
getwd()
report_dir <- "../reports"
out_dir <- "../output"
data_dir <- "../data/raw"
# thresholds for the checks; each constant is echoed so it appears in the run log
KANON_LEVEL_LOCATION <- 1000
KANON_LEVEL_LOCATION
KANON_LEVEL <- 11
KANON_LEVEL
COUNTY_POPULATION_LEVEL <- 20000
COUNTY_POPULATION_LEVEL
COUNTY_DEMO_POPULATION_LEVEL <- 20 * KANON_LEVEL
COUNTY_DEMO_POPULATION_LEVEL
COUNTY_POP_FILE_NAME <- paste0(data_dir, "/county_pop_demo_for_verify.csv")
location_quasi_identifiers <- c("res_state", "res_county")
quasi_identifiers <- c("case_month", location_quasi_identifiers, "age_group", "sex", "race", "ethnicity", "death_yn")
# no confidential attributes in this data set (hence no l-diversity check)
confidential_attributes <- c()
# where attributes are related, the linked attribute must be suppressed whenever
# its source is suppressed; this format matches what sdcMicro expects for ghostVars
linked_attributes <- list(
  list("res_state", c("county_fips_code", "state_fips_code", "res_county")),
  list("res_county", c("county_fips_code"))
)
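# aside: this structure matches the ghostVars argument of createSdcObj() (each
# element pairs one key variable with the variables linked to it), so it could in
# principle be passed as createSdcObj(..., ghostVars = linked_attributes) to have
# sdcMicro suppress linked variables itself; here the rules are only verified, via
# summarize_linked_attribute_violations() below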
# if the input is a CSV rather than parquet, switch to the commented-out read.csv() logic below
file_name <- "public_county_geography_2021-11-09.parquet"
suppressed_file_name <- paste(out_dir, "/", file_name, ".suppressed.csv", sep = "")
detailed_file_name <- paste(data_dir, "/", file_name, sep = "")
print(detailed_file_name)
#data = read.csv(detailed_file_name, fileEncoding="UTF-8-BOM", na.strings=c('NA',''))
df <- read_parquet(detailed_file_name, as_data_frame = TRUE)
#df = read_parquet_parts(detailed_file_name)
# for some reason the data frame returned by arrow makes sdcMicro run very slowly
# and error out, but copying it into a fresh data.frame works; TODO: figure out why
data <- data.frame(df)
# checkpoint the arrow copy to disk, then drop it to free memory
save(df, file = "Geo_df.Rdata")
remove(df)
# summarize the dataset
result <- quick_summary(data, label = "all_fields", qis = quasi_identifiers)
# summarize the utility of the data as published
summary <- summarize_utility(data, quasi_identifiers)
# are any linked variables left unsuppressed when they should be suppressed? (rules #7, #8, #9)
summarize_linked_attribute_violations(data, linked_attributes)
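# summarize_linked_attribute_violations() is defined in functions.R; roughly, the
# required check looks like the sketch below (an assumption, since the helper
# isn't shown here): whenever a source attribute is suppressed to "NA", every
# attribute linked to it must be suppressed too.
#   for (pair in linked_attributes) {
#     src <- pair[[1]]
#     for (lnk in pair[[2]]) {
#       bad <- sum(data[[src]] == "NA" & data[[lnk]] != "NA", na.rm = TRUE)
#       cat(src, "->", lnk, ":", bad, "linked-suppression violations\n")
#     }
#   }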
# review locations first (rules #1, #2)
cat('\n\nProcessing check for states and counties having at least 1,000 cases (rules #1,2), should be 0.\n')
# recode the "NA" (already suppressed), Missing, and Unknown categories to real NA
# for the purposes of k-anonymity
data_na <- recode_to_na(data, quasi_identifiers, BLANK_CATEGORIES)
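# recode_to_na() and BLANK_CATEGORIES come from functions.R; a rough sketch of the
# assumed behavior (the real helper may differ):
#   recode_to_na <- function(d, cols, blanks) {
#     for (col in cols) d[[col]][d[[col]] %in% blanks] <- NA
#     d
#   }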
sdcObj <- createSdcObj(dat = data_na,
                       keyVars = location_quasi_identifiers,
                       numVars = NULL,
                       weightVar = NULL,
                       hhId = NULL,
                       strataVar = NULL,
                       pramVars = NULL,
                       excludeVars = NULL,
                       seed = 0,
                       randomizeRecords = FALSE,
                       alpha = c(0),
                       ghostVars = NULL)
# print to confirm observations, number of variables, quasi-identifiers and their
# summaries, and risk info
sdc_print(sdcObj, KANON_LEVEL_LOCATION)
# should be zero
fk <- summarize_violations(data_na, sdcObj, KANON_LEVEL_LOCATION, location_quasi_identifiers)
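# summarize_violations() comes from functions.R; for reference, the per-record
# equivalence-class sizes can also be read straight off the sdcMicro object:
#   fk_all <- get.sdcMicroObj(sdcObj, type = "risk")$individual[, "fk"]
#   sum(fk_all < KANON_LEVEL_LOCATION)   # records violating k-anonymity 1,000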
# now let's check k-anonymity using the full quasi-identifier set (rule #6)
cat('\n\nProcessing check for k-anonymity 11 across all quasi-identifiers (rule #6), should be 0.\n')
sdcObj <- createSdcObj(dat = data_na,
                       keyVars = quasi_identifiers,
                       numVars = NULL,
                       weightVar = NULL,
                       hhId = NULL,
                       strataVar = NULL,
                       pramVars = NULL,
                       excludeVars = NULL,
                       seed = 0,
                       randomizeRecords = FALSE,
                       # alpha = 1 so nulls are treated as wildcards rather than as their own
                       # category; this is new for the geo file because the larger number of
                       # quasi-identifiers calls for more accurate suppression to meet the
                       # privacy threshold
                       alpha = c(1),
                       ghostVars = NULL)
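# alpha in sdcMicro controls how much keys containing missing values contribute to
# the frequency counts fk: at alpha = 0 an NA only matches other NAs, while at
# alpha = 1 an NA matches every category (a full wildcard). E.g. with alpha = 1 a
# record whose age_group is NA counts toward the equivalence class of every
# age_group sharing its other key values, which is the conservative choice here.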
# print to confirm observations, number of variables, quasi-identifiers and their
# summaries, and risk info
sdc_print(sdcObj, KANON_LEVEL)
# this should be zero
fk <- summarize_violations(data_na, sdcObj, KANON_LEVEL, quasi_identifiers)
# suppression isn't actually performed here; if needed to help debug, uncomment the
# lines below to generate an sdcMicro-suppressed file (note that importance needs
# one entry per quasi-identifier, eight here)
#sdcObj <- kAnon(sdcObj, importance=c(1,1,1,1,1,1,1,2), combs=NULL, k=c(KANON_LEVEL))
#writeSafeFile(sdcObj,"csv","simple",suppressed_file_name, quote=FALSE, sep=",",row.names=FALSE)
cat("\n\nWriting out a privacy eval report to:", paste(report_dir,"/",file_name,".html",sep = ""),"\n\n")
report(sdcObj, outdir = report_dir, filename = file_name,
title = "SRRG Privacy Evaluation Report for Case Surveillance Public Data Set with Geography", internal = TRUE, verbose = FALSE)
# checkpoint the large objects and free memory before the census merge
save(sdcObj, file = "Geo_sdcObj.Rdata")
remove(sdcObj)
save(data, file = "Geo_data.Rdata")
remove(data)
cat('Processing check for low population counties (rule #3), should be 0.\n')
# roundabout way to read the file so UTF-8 characters (and the byte-order mark)
# in the dataset are handled; TODO: fix
fileIn <- file(COUNTY_POP_FILE_NAME, open = "rb", encoding = "UTF-8-BOM")
lines <- readLines(fileIn)
close(fileIn)
county_data <- read.csv(text = lines, na.strings = c('NA', ''), colClasses = c("ï..state_county_combined_fips" = "character"))
#county_data = read.csv(COUNTY_POP_FILE_NAME, fileEncoding="UTF-8-BOM", na.strings=c('NA',''), colClasses=c("state_county_combined_fips"="character"))
# the BOM can leave an "ï.." prefix on the first column name; rename either variant
names(county_data)[names(county_data) == 'ï..state_county_combined_fips'] <- 'county_fips_code'
names(county_data)[names(county_data) == 'state_county_combined_fips'] <- 'county_fips_code'
names(county_data) <- tolower(names(county_data))
# map full state names to two-letter abbreviations via base R's state data sets
county_data['state_abbr'] <- state.abb[match(county_data$stname, state.name)]
data_with_census <- merge(x = data_na, y = county_data, by = 'county_fips_code', all.x = TRUE)
# a left join must not change the row count; fail fast if the merge fans out
stopifnot(nrow(data_na) == nrow(data_with_census))
# where census data is missing, assume zero population so the record is flagged
# unless the county was suppressed
data_with_census$sum_of_tot_pop[is.na(data_with_census$sum_of_tot_pop)] <- 0
violations <- subset(data_with_census, sum_of_tot_pop < COUNTY_POPULATION_LEVEL & res_county != "NA")
num_v <- nrow(violations)
cat("Low population county violations (", num_v, "). If greater than zero, the list of counties with violations and 5 sample violations follow.\n")
if (num_v > 0){
print(unique(violations$res_county))
print(violations[sample(num_v,min(num_v,5)),c(quasi_identifiers, 'sum_of_tot_pop')])
}
save(data_na, file = "Geo_data_na.Rdata")
remove(data_na)
# logic for rule #4: every county subpopulation by sex, race, and/or ethnicity must
# be at least COUNTY_DEMO_POPULATION_LEVEL (20 * 11 = 220), so check each record
# whose county isn't suppressed
cat('\n\nProcessing check for low subpopulation demographics in counties (rule #4), should be 0, twice.\n')
# FYI: this apply() takes roughly 10-15 minutes
data_with_census['census_subpopulation'] <- apply(data_with_census, 1, function(x) what_subpopulation_value(x))
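# what_subpopulation_value() comes from functions.R; presumably it looks up, per
# record, the census population of the county restricted to that record's
# sex/race/ethnicity combination. A rough sketch of that idea (the column naming
# is an assumption):
#   what_subpopulation_value <- function(row) {
#     col <- tolower(paste(row[["sex"]], row[["race"]], row[["ethnicity"]], sep = "_"))
#     if (col %in% names(row)) as.numeric(row[[col]]) else NA
#   }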
violations <- subset(data_with_census, res_county != 'NA' & census_subpopulation < COUNTY_DEMO_POPULATION_LEVEL)
num_v <- nrow(violations)
cat("Subpopulation county violations, part 1, checking subpopulations for counties (", num_v, "). If greater than zero, 5 sample violations follow.\n")
if (num_v > 0){
print(violations[sample(num_v,min(num_v,5)),])
}
# also check records where the subpopulation couldn't be determined: for those,
# either the county should be NA or all the demographic fields should be NA
potential_violations <- subset(data_with_census, is.na(census_subpopulation))
# any of these with an actual res_county value AND demographics is bad; rule #3
# should already have caught this, since a county with no total population
# shouldn't have a subpopulation either
violations <- subset(potential_violations,
                     (res_county != 'NA')
                     & ((sex != 'NA' & sex != 'Missing' & sex != 'Unknown')
                        | (race != 'NA' & race != 'Missing' & race != 'Unknown')
                        | (ethnicity != 'NA' & ethnicity != 'Missing' & ethnicity != 'Unknown')
                     )
)
num_v <- nrow(violations)
cat("Subpopulation county violations, part 2, checking that records with no census subpopulation have res_county or all demographics suppressed (", num_v, "). If greater than zero, 5 sample violations follow.\n")
if (num_v > 0){
print(violations[sample(num_v,min(num_v,5)),])
}
cat('\n\nProcessing check for case counts by subpopulation (rule #5), should be 0.\n')
# count cases per (county fips, sex, race, ethnicity) group; FUN=length gives the
# group size, so keep only the four grouping columns plus one count column
agg <- aggregate(data_with_census, by = list(data_with_census$county_fips_code, data_with_census$sex, data_with_census$race, data_with_census$ethnicity), FUN = length)[, 1:5]
colnames(agg) <- c('county_fips_code', 'sex', 'race', 'ethnicity', 'cases_subpopulation')
# left join the counts back onto the record-level data
data_with_census_case <- merge(x = data_with_census, y = agg, by = c('county_fips_code', 'sex', 'race', 'ethnicity'), all.x = TRUE)
# again, a left join must not change the row count
stopifnot(nrow(data_with_census_case) == nrow(data_with_census))
# rule #5: if reported cases make up half or more of a census subpopulation,
# membership in the data set can be inferred for too much of that group, so flag
# cases_subpopulation >= census_subpopulation / 2
violations <- subset(data_with_census_case, county_fips_code != 'NA' & (cases_subpopulation >= census_subpopulation / 2))
num_v <- nrow(violations)
cat("Census subpopulation too small for its case count (", num_v, "). If greater than zero, 5 sample violations follow.\n")
if (num_v > 0){
print(violations[sample(num_v,min(num_v,5)),])
}
save(data_with_census_case, file = "Geo_data_with_census_case.Rdata")
remove(data_with_census_case)
load(file = "Geo_data.Rdata")
cat('\n\nProcessing check for county/state complementary offsets (rule #10), should be 0.\n')
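# rationale: if exactly one county in a state is withheld, its records could be
# reconstructed by subtracting the published counties from state-level totals
# (complementary disclosure), so the number of suppressed counties per state must
# be zero or at least two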
census_counties_per_state <- aggregate(data = county_data, ctyname ~ state_abbr, function(x) length(unique(x)))
names(census_counties_per_state) <- c('census_state', 'census_num_counties')
case_counties_per_state <- aggregate(data = subset(data_with_census, county_fips_code != 'NA'), county_fips_code ~ res_state, function(x) length(unique(x)))
names(case_counties_per_state) <- c('case_state', 'case_num_counties')
combined_counties_per_state <- merge(x = case_counties_per_state, y = census_counties_per_state, by.x = 'case_state', by.y = 'census_state', all.x = TRUE)
combined_counties_per_state['county_diff'] <- combined_counties_per_state['census_num_counties'] - combined_counties_per_state['case_num_counties']
violations <- subset(combined_counties_per_state, county_diff == 1)
num_v <- nrow(violations)
cat("County/state complementary violations (", num_v, "). If greater than zero, the states with violations follow.\n")
if (num_v > 0){
print(violations)
}
# timestamp the end of the run
cat(toString(Sys.time()))
#debug stuff