-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsfg_data_analysis_v2.php
155 lines (131 loc) · 4.71 KB
/
sfg_data_analysis_v2.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
<?php
$st_file = $argv[1];
$sfg_file = $argv[2];
$sfg_email_col = $argv[3];
$sfg_amount = $argv[4];
$testing = $argv[5];
//rename files to show they've been hashed
$st_file_hash = str_replace(".csv", "_hashed.csv", $st_file);
$sfg_file_hash = str_replace(".csv", "_hashed.csv", $sfg_file);
//concatenate command tot run hash.pl
$hash_st = "perl hash.pl -i " . $st_file . " -o " . "hashed/" . $st_file_hash;
$hash_sfg = "perl hash.pl -i " . $sfg_file . " -o " . "hashed/" . $sfg_file_hash;
$test = false;
if($testing != null && $testing = "t") {
$test = true;
}
//execute command to use hash.pl
echo "\nHashing emails\n";
exec("mkdir hashed");
exec($hash_st);
exec($hash_sfg);
//get rid of first line of each file
$st_first_line = "sed -i '' 1d hashed/$st_file_hash";
$sfg_first_line = "sed -i '' 1d hashed/$sfg_file_hash";
exec($st_first_line);
exec($sfg_first_line);
//cut st email & codes column & sfg email column
echo "\nCutting email columns\n";
exec("mkdir cut_files");
$st_file_hash_cut = str_replace(".csv", "_cut.csv", $st_file_hash);
$sfg_file_hash_cut = str_replace(".csv", "_cut.csv", $sfg_file_hash);
$st_cut = "cut -f 2-3 hashed/" . $st_file_hash . " > cut_files/" . $st_file_hash_cut;
$sfg_cut = "cut -f " . $sfg_email_col . "," . $sfg_amount . "-" . $sfg_amount . " hashed/" . $sfg_file_hash . " > cut_files/" . $sfg_file_hash_cut;
exec($st_cut);
exec($sfg_cut);
//remove rows with no source codes in ST file
echo "\nRemoving rows with no source codes in ST file\n";
$st_cut_cleansed = str_replace(".csv", "_cleansed.csv", $st_file_hash_cut);
$st_cut_clean = "awk -F'\t' '$2!=\"\"'" . " cut_files/" . $st_file_hash_cut . " > cut_files/" . $st_cut_cleansed;
exec($st_cut_clean);
//remove duplicate emailds
echo "\nRemoving duplicate emails in both ST and SFG files\n";
exec("mkdir uniq_files");
$st_uniq = str_replace(".csv", "_uniq.csv", $st_cut_cleansed);
$sfg_uniq = str_replace(".csv", "_uniq.csv", $sfg_file_hash_cut);
$st_uniq_sort = "sort -u -t '\t' -k1 cut_files/" . $st_cut_cleansed . " > " . "uniq_files/" . $st_uniq;
$sfg_uniq_sort = "sort -u -t '\t' -k1 cut_files/" . $sfg_file_hash_cut . " > " . "uniq_files/" . $sfg_uniq;
exec($st_uniq_sort);
exec($sfg_uniq_sort);
//get source code matches from ST to SFG file
$st_file = "uniq_files/" . $st_uniq;
$sfg_file = "uniq_files/" . $sfg_uniq;
//files with all source codes in ST file
echo "\nFinding matches between ST and SFG files\n";
/*
*to get matches and return fields from both files
awk -F"[, ]" 'NR==FNR{a[$1]=$1","$2; next} ($2 in a){print a[$2]","$1}' file1 file2
*/
exec("mkdir final_files");
$complete_file = str_replace(".csv", "_complete.csv", $sfg_uniq);
$get_matches = "join -t $'\t' -1 1 -2 1 $st_file $sfg_file > final_files/$complete_file";
exec($get_matches);
echo "\nFinalizing files and counting source codes\n";
//open file with all sources
$st_file_all_sources = fopen($st_file, "r");
$st_count_arr = array();
$counter_arr = array();
$complete_file_name = "final_files/" . $complete_file;
$read_file = fopen($complete_file_name, "r");
$rename_final = str_replace(".csv", "_final.csv", $complete_file);
$final_file_name = "final_files/" . $rename_final;
$final_file = fopen($final_file_name, "w+");
while($row = fgetcsv($st_file_all_sources, 0, "\t")) {
$counter_arr[$row[1]] = 0;
}
$st_all_sources = fopen($st_file, "r");
//$sfg_amount = fopen($sfg_file, "r");
while($read = fgetcsv($st_all_sources, 0, "\t")) {
if(array_key_exists($read[1], $st_count_arr)) {
$st_count_arr[$read[1]][0] += 1;
}
else {
$st_count_arr[$read[1]][0] = 1;
}
}
while($line = fgetcsv($read_file, 0, "\t")) {
echo $line[1] . "\n";
if(array_key_exists($line[1], $st_count_arr)) {
$st_count_arr[$line[1]][1] += 1;
$st_count_arr[$line[1]][2] += $line[2];
}
}
$headers[0] = "Source";
$headers[1] = "Totals by Source in SailThru";
$headers[2] = "Total Matches in SFG";
$headers[3] = "Total Dollar Amount";
$headers[4] = "Average Dollar Amount";
fputcsv($final_file, $headers);
foreach($st_count_arr as $line => $value) {
$temp_arr[0] = $line;
$temp_arr[1] = $value[0];
$temp_arr[2] = $value[1];
$temp_arr[3] = $value[2];
//compute average dollar amount
if($value[1] > 0) {
$temp_amt = $value[2] / $value[1];
}
else {
$temp_amt = 0;
}
$temp_arr[4] = $temp_amt;
fputcsv($final_file, $temp_arr);
}
fclose($read_file);
fclose($final_file);
//if not a test, remove all directories except the final one and final file
if(!$test) {
//rm -rf directoryname
$rm_hashed = "rm -rf hashed";
$rm_cut_files = "rm -rf cut_files";
$rm_uniq_files = "rm -rf uniq_files";
$rm_complete_file = "rm -f " . $complete_file_name;
echo "Removing test directories\n";
exec($rm_hashed);
exec($rm_cut_files);
exec($rm_uniq_files);
exec($rm_complete_file);
echo "Test directories removed\n";
}
echo "All done!\n";
?>