-
Notifications
You must be signed in to change notification settings - Fork 0
/
ph-xls2csv.sh
175 lines (140 loc) · 4.69 KB
/
ph-xls2csv.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/bin/bash --posix
# Script to process raw Penny Harvest data provided in XLSX format
# The output is a series of normalized, cleaned up and geocoded CSV files
# INSTRUCTIONS
# $ bash ph-xls2csv.sh -i [file-name]
# Example: bash ph-xls2csv.sh -i Map_data.xls
# TODO
# review //TEMP
# Add check for XLS or XLSX
# Check format of date last updated
# Check not to overwrite existing files
set -u
typeset -r start_time=$SECONDS
error()
{
echo >&2 $*
exit 1
}
usage()
{
cat >&2 <<-EOF
Usage : $0 -i map_data.xls
-i input file
-h help
EOF
}
typeset var_input=""
while getopts "i:h" option
do
case $option in
i)
var_input="$OPTARG"
;;
h)
usage
exit 0
;;
*)
usage
exit 1
;;
esac
done
# check that args not empty
# //TEMP we could make better checks
[[ $var_input != "" ]] || { usage; exit 1;}
# The folder should not contain the final file already
#[[ ! -f $var_output ]] || error "It seems you already have a $var_output in this folder. Remove it and run this script again."
# The sheets in the Excel that need to be processed
typeset -r sheets=(SCHOOLS GRANTS ORGS)
# The years the data is about
typeset -r years=(5yrs 2013 2012 2011 2010 2009)
# The MapQuest API key
typeset -r mq_api=Fmjtd%7Cluur290y2h%2Cr5%3Do5-90zl54
# Change Internal Field Separator to new line. Otherwise, it will think spaces
# in filenames are field separators
typeset -r IFS=$'\n'
# Checking if the dependencies are met.
# check that geocode.py is available
typeset -r cmd_geocode="geocode.py"
[[ -f $cmd_geocode ]] || error "This script needs $cmd_geocode"
# Some of the tools used are part of csvkit, like in2csv, csvjoin, csvstack, etc
# Based on: http://www.snabelb.net/content/bash_support_function_check_dependencies
# for portability and just in case which is not available
typeset -r cmd_which="/usr/bin/which"
[[ -x $cmd_which ]] || error "$cmd_which command not found"
# check that every command is available and executable
for command in in2csv csvcut csvjoin python wget unzip sed
do
if [[ $command == "in2csv" ]]
then
typeset -r cmd_in2csv=$($cmd_which in2csv)
if [[ ! -x $cmd_in2csv ]]
then
# but keep initial echo
echo -e "\nThis script requires a couple of tools that are provided by csvkit."
echo -e "You might be able to install csvkit by using:"
echo -e "\t$ sudo pip install csvkit"
echo -e "More info: http://csvkit.readthedocs.org/en/latest/index.html#installation"
exit 1
fi
else
typeset -r cmd_$command=$($cmd_which $command)
[[ -x $(eval echo \$cmd_$command) ]] || error "$cmd_$command command not found"
fi
done
# Check if the input file is actually present
[[ -f $var_input ]] || error "It seems the input file is not present."
##############################################################################
# Convert each relevant sheet in the Excel file to its own CSV file + some
# other cleaning up.
for sheet in ${sheets[*]}
do
echo "Start processing data on $sheet"
#Xls -> csv on the sheet that matters
$cmd_in2csv --sheet $sheet $var_input > $sheet.csv
elapsed_time=$(($SECONDS - $start_time))
echo "$elapsed_time seconds. done"
done
# Clean up the school sheet
# First line doesn't contain the actual headers
$cmd_sed -i -e "1D" SCHOOLS.csv
# Clean up the column headers of the schools and add an indication of the year.
for year in ${years[*]}
do
$cmd_sed -i "0,/# of Grants/{s/# of Grants/number_grants_$year/}" SCHOOLS.csv
$cmd_sed -i "0,/# of Service Projects/{s/# of Service Projects/number_service_projects_$year/}" SCHOOLS.csv
$cmd_sed -i "0,/$ Grants/{s/$ Grants/amount_grants_$year/}" SCHOOLS.csv
$cmd_sed -i "0,/PH (Held Collection)/{s/PH (Held Collection)/ph_held_collection_$year/}" SCHOOLS.csv
$cmd_sed -i "0,/RT (Held Roundtable)/{s/RT (Held Roundtable)/rt_held_roundtable_$year/}" SCHOOLS.csv
done
# Clean up of the Grant sheet
# Remove all the organization related information that is available on the other
# sheets as well.
$cmd_csvcut -C 12,13,14,15,16,17,18,19 GRANTS.csv > GRANTS-tmp.csv
rm GRANTS.csv
mv GRANTS-tmp.csv GRANTS.csv
elapsed_time=$(($SECONDS - $start_time))
echo "$elapsed_time seconds. Initial conversion and some housekeeping taken care of."
##############################################################################
# Geocode
# @param string $1
# Name of file to be geocoded (without extension)
# @param string $2
# Name of the column where the unique ID is stored
function geocode {
touch latlng_$1.csv
$cmd_python $cmd_geocode latlng_$1.csv $1.csv $mq_api
#Merge the files
$cmd_csvjoin $1.csv latlng_$1.csv -c $2,id > $1-tmp.csv
rm $1.csv
rm latlng_$1.csv
mv $1-tmp.csv $1.csv
}
geocode SCHOOLS 'School ID'
geocode ORGS 'ORG ID'
elapsed_time=$(($SECONDS - $start_time))
echo "$elapsed_time seconds. Geocoding done."
exit 0
# EOF