-
Notifications
You must be signed in to change notification settings - Fork 0
/
sts_compare.ado
267 lines (220 loc) · 8.44 KB
/
sts_compare.ado
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
*! version 1.0 Ishmail Azindoo Baako (IPA) Apr, 2018
cap version 13.1
program define sts_compare
syntax using/, DATAbase(string) OUTfile(string)
qui {
cls
noi disp "Comparing Datasets ... "
noi disp
tempfile sts_request sts_database
* IMPORT AND SAVE REQUEST
*------------------------
import excel using "`using'", sheet("Request") cellrange(E5) allstring clear
keep E I K P W
* rename variables and assign value labels to them
rename (E I K P W ) ///
(fullname phonenumber email nhis unique_id)
* label variables
label variable fullname "Full name (automatically filled)"
label variable phonenumber "Phone number"
label variable email "Email address"
label variable nhis "NHIS number"
label variable unique_id "IPA Unique Staff ID"
* Mark observation row numbers
gen request_row = _n + 4
* trim string variables
foreach var of varlist fullname email {
replace `var' = trim(itrim(`var'))
}
* delete empty rows
drop if missing(fullname)
* Check for missing values in phonenumber, email, nhis
foreach var of varlist phonenumber email nhis {
cap assert !missing(`var')
if _rc {
sort request_row
noi disp in red "The following are missing for `var' in Contract Request"
noi list request_row fullname if missing(`var'), abbrev(32) noobs sep(1)
}
}
* Check that phone numbers are at least 9 digits.
gen len_check = length(phonenumber) >= 9 | missing(phonenumber)
if _rc {
noi disp in red "The following phonenumbers in Contract Request are less than 9 digits"
noi list request_row fullname phonenumber if !len_check, abbrev(32) noobs sep(1)
}
drop len_check
* destring phone, nhis numbers and unique_id
destring phonenumber, ignore("+") replace
destring unique_id, force replace
format %10.0f phonenumber
* Check for duplicates on uniqueid, phonenumber, email, nhis
foreach var of varlist unique_id phonenumber email nhis {
duplicates tag `var' if !missing(`var'), gen (_dup)
count if _dup & !missing(`var')
if `r(N)' > 0 {
sort `var' request_row
noi disp in red "The following are duplicate on `var' in Contract Request"
noi list request_row fullname `var' if _dup & !missing(`var'), ///
abbrev(32) noobs sepby(`var')
}
drop _dup
}
* save data
sort request_row
save `sts_request'
* save values of request in locals
loc request_count `=_N'
forval i = 1/`request_count' {
loc request_row_`i' = request_row[`i']
loc request_unique_id_`i' = unique_id[`i']
loc request_fullname_`i' = fullname[`i']
loc request_phonenumber_`i' = phonenumber[`i']
loc request_email_`i' = email[`i']
loc request_nhis_`i' = nhis[`i']
}
* IMPORT AND SAVE DATABASE
*-------------------------
import excel using "`database'", sheet("Staff details") cellrange(B3) clear
* keep only relevant variables
keep L M N Q G AW
* rename variables and assign value labels to them
rename (L M N Q G AW ) ///
(fullname phonenumber email nhis unique_id correct_id )
* prefix all database variables with database
rename (*) (database_*)
* replace unique_id if the unique_id is duplicates and was corrected
destring database_correct_id, replace
replace database_correct_id = . if length(string(database_correct_id)) != 6
replace database_unique_id = database_correct_id if !missing(database_correct_id)
drop database_correct_id
* label variables
label variable database_fullname "Full name (automatically filled)"
label variable database_phonenumber "Phone number"
label variable database_email "Email address"
label variable database_nhis "NHIS number"
label variable database_unique_id "IPA Unique Staff ID"
* trim string variables
tostring database_nhis, replace
foreach var of varlist database_fullname database_email database_nhis {
replace `var' = trim(itrim(`var'))
}
* Drop empty rows
drop if missing(database_fullname)
* Mark observation row numbers
gen database_row = _n + 2
save `sts_database'
* COMPARE DATASETS
*-----------------
*generate missing variables for request dataset
generate request_row = .
generate request_unique_id = .
generate unique_id_matched = .
generate request_fullname = ""
generate request_phonenumber = ""
generate phonenumber_matched = .
generate request_email = ""
generate email_matched = .
generate request_nhis = ""
generate nhis_matched = .
* re-order variables
#d;
order request_row
database_row
request_unique_id
database_unique_id
unique_id_matched
request_fullname
database_fullname
request_phonenumber
database_phonenumber
phonenumber_matched
request_email
database_email
email_matched
request_nhis
database_nhis
nhis_matched
;
#d cr
* loop through all request rows and flag matches
forval i = 1/`request_count' {
if !missing("`request_unique_id_`i''") {
* look for matches in unique_id
replace request_row = `request_row_`i'' ///
if database_unique_id == `request_unique_id_`i''
}
* look for matches in phone
if !missing("`request_phonenumber_`i''") {
replace request_row = `request_row_`i'' ///
if regexm(database_phonenumber, "`request_phonenumber_`i''")
}
* look for matches in email
if !missing("`request_email_`i''") {
replace request_row = `request_row_`i'' ///
if database_email == "`request_email_`i''"
}
* look for matches in nhis
* for string nhis, check for a continues 4 digit or more sequence of numbers
gen _nhis_fmt = regexs(0) if regexm("`request_nhis_`i''", "[0-9][0-9][0-9][0-9][0-9]+")
loc request_nhis_`i'_fmt = _nhis_fmt[1]
if !missing("`request_nhis_`i'_fmt'") {
replace request_row = `request_row_`i'' ///
if regexm(database_nhis, "`request_nhis_`i'_fmt'")
}
* Replace Request Variables
cap replace request_unique_id = `request_unique_id_`i'' if request_row == `request_row_`i''
cap replace request_phonenumber = "`request_phonenumber_`i''" if request_row == `request_row_`i''
cap replace request_email = "`request_email_`i''" if request_row == `request_row_`i''
cap replace request_nhis = "`request_nhis_`i''" if request_row == `request_row_`i''
replace request_fullname = "`request_fullname_`i''" if request_row == `request_row_`i''
* Replace match markers
replace unique_id_matched = 1 if database_unique_id == request_unique_id & request_row == `request_row_`i''
* For the variables that can be missing, check that it is not before replacing
if !missing("`request_email_`i''") replace email_matched = 1 if regexm(database_email, "`request_email_`i''") & ///
request_row == `request_row_`i''
if !missing("`request_nhis_`i'_fmt'") replace nhis_matched = 1 if regexm(database_nhis, "`request_nhis_`i'_fmt'") & ///
request_row == `request_row_`i''
if !missing("`request_phonenumber_`i''") replace phonenumber_matched = 1 if regexm(database_phonenumber, "`request_phonenumber_`i''") & ///
request_row == `request_row_`i''
* drop _nhis_fmt
drop _nhis_fmt
}
foreach var of varlist *_matched {
replace `var' = 0 if missing(`var')
}
* generate serial number for each request row
sort request_row database_row
bysort request_row: gen index = _n
bysort request_row: egen last_submission = max(index)
replace last_submission = last_submission == index
* gen matches
egen matches = rowtotal(*_matched)
* convert matches to string yes
label define yesno 0 "no" 1 "yes"
label values *_matched last_submission yesno
keep if matches >= 1
* generate variable to mark the number of uniqueids matched per request
* Export Matched
if `=_N' > 0 {
sort request_row request_unique_id database_row
export excel using "`outfile'", sheet("matches") firstrow(variable) replace
}
* Get the request_row numbers of matched observations
levelsof request_row, loc (matched_rows) clean
* delete matched rows from request data
use `sts_request', clear
gen matched = 0
foreach row in `matched_rows' {
replace matched = 1 if request_row == `row'
}
keep if !matched
* Export unmatched
if `=_N' > 0 {
sort request_row
export excel request_row fullname email nhis using "`outfile'", sheet("no matches", replace) firstrow(variable)
}
noi disp
noi disp "Comparison Completed ... "
}
end