-
Notifications
You must be signed in to change notification settings - Fork 0
/
ph-org-csv2md.py
169 lines (143 loc) · 6.86 KB
/
ph-org-csv2md.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Script to generate Penny Harvest organisation profiles in Markdown format
# The data should be provided in CSV format and the script outputs one md file
# per organization.
#
# @param string folder_out
# The name of the output folder
#
# @param string file_in
# The csv file containing the data
# TODO
# Checks on the output folder // If not empty, ask user if we should continue anyway
# Replace spaces for hyphens in the title (just in case)
import csv
import sys
import arrow
folder_out = sys.argv[1]
file_in = sys.argv[2]
def indent(n = 1):
base_indentation = ' '
return base_indentation * n
def sanitize_br(text):
"Replace the _x000D_ for proper line-breaks and returns. First we remove the line-breaks, any remaining _x000D_ is assumed to be an actual return."
return text.replace(' _x000D_\n', ' ').replace('_x000D_\n','\n\n').replace('_x000D_','')
def yaml_array(text, separator = ','):
"Splits a string by a separator and stores the result as a YAML array"
if text.strip(' \n'):
array = ''
for substring in text.split(separator):
array += '\n' + indent() + '- ' + substring
return array
else:
# If empty, we return nothing
return ''
def yaml_multiline_string(text, is_array = False):
"Turns a string into a properly formatted multiline string to be used as a YAML variable. If is_array is set to true, the multiline string will be formatted as a YAML array"
if text.strip(' \n'):
# Add pipe and indent first paragraph
# 1. replace line-breaks, 2. replace return + indent paragraph,
# 3. clean up remaining -_x000D_."
prepend = ''
indentation = 1
# The syntax for multiline string array, is slightly different and requires
# a hyphen before the pipe and extra indentation
if is_array:
prepend = '\n' + indent(indentation) + '- '
indentation = 2
text = prepend + '|\n' + indent(indentation) + text.replace(' _x000D_\n', ' ').replace('_x000D_\n','\n\n' + indent(indentation)).replace('_x000D_','')
return text
else:
# If empty, we return nothing
return ''
md_linebreak = ' '
with open(file_in, 'rb') as ifile:
# Fetch the index of each relevant column. This allows for some flexibility
# in column position and does a check if all mandatory columns are present
reader = csv.reader(ifile)
header = reader.next()
# Dict with the variable names (key) and the relevant column names (value)
cols = { 'oid' : 'ORG ID', 'name' : 'Organization_name', 'add' : 'Address', 'cit' : 'City', 'st' : 'State', 'zip' : 'Zip', 'nat' : 'National', 'locs' : 'Location_of_services', 'loco' : 'Location_of_offices', 'web': 'Website', 'keyw' : 'Keywords', 'imp' : 'Impact Area', 'miss' : 'Org Mission', 'descr' : 'Organization_overview', 'cash' : 'Cash_grants', 'cash1' : 'Description_of_cash_grants_1', 'cash2' : 'Description_of_cash_grants_2', 'serv' : 'Service_opportunities', 'serv1' : 'Description_of_service_opportunity_1', 'serv2' : 'Description_of_service_opportunity_2', 'learn' : 'How_can_you_help_students_learn_more_about_you', 'cont' : 'How_can_you_continue_the_relationship_afterwards', 'sal' : 'Salutation', 'first' : 'First_Name', 'last' : 'Last_Name', 'title' : 'Title', 'intro' : 'Introduction', 'ph' : 'Phone', 'ext' : 'Extension', 'fax' : 'Fax', 'email' : 'Email', 'meth' : 'Preferred_Method_of_Contact', 'time' : 'Best_Time_to_Contact', 'lat' : 'lat', 'lng' : 'lng' }
fields = { }
error = False
# Loop over key & value of the 'cols' dict and build the 'fields' dict with
# convenient variable name (key) and the position of the column in the CSV
# (value)
for key,value in cols.iteritems():
try:
fields[key] = header.index(value)
except ValueError:
print "A column '" + value + "' was not found."
error = True
if error == True:
print "Please check your CSV and try again."
sys.exit(0)
utc = arrow.utcnow()
timestamp = utc.format('YYYY-MM-DD')
# Generate a markdown file for each row in the CSV
for row in reader:
# Replace any colon in the title for a hyphen
row[fields['name']] = row[fields['name']].replace(':', '-')
# Define the note category
if row[fields['nat']]:
category = "national"
else:
category = "local"
# Sanitize the prefered contact method. We have to account for multiple
# contact methods.
dirty_method = row[fields['meth']]
sane_method = [ ]
if dirty_method.find('mail') != -1 or dirty_method.find('@') != -1:
sane_method.append('email')
if dirty_method.find('phone') != -1:
sane_method.append('phone')
sane_method = ",".join(sane_method)
# Create file with proper file-name. Jekyll requires every file to start
# with a date
file_out = folder_out + '/' + timestamp + '-' + row[fields['oid']] + '.md'
f = open(file_out, 'w')
f.write('---\n')
f.write('layout: organization\n')
f.write('category: ' + category + '\n')
f.write('\n')
f.write('title: ' + row[fields['name']] + '\n')
f.write('impact_area: ' + row[fields['imp']] + '\n')
f.write('keywords: ' + yaml_array(row[fields['keyw']]) + '\n')
f.write('location_services: ' + row[fields['locs']] + '\n')
f.write('location_offices: ' + row[fields['loco']] + '\n')
f.write('website: ' + row[fields['web']] + '\n')
f.write('\n')
f.write('description: ' + yaml_multiline_string(row[fields['descr']]) + '\n')
f.write('mission: ' + yaml_multiline_string(row[fields['miss']]) + '\n')
f.write('\n')
f.write('cash_grants: ' + row[fields['cash']] + '\n')
f.write('grants: ' + yaml_multiline_string(row[fields['cash1']], True) + yaml_multiline_string(row[fields['cash2']], True) + '\n')
f.write('service_opp: ' + row[fields['serv']] + '\n')
f.write('services: ' + yaml_multiline_string(row[fields['serv1']], True) + yaml_multiline_string(row[fields['serv2']], True) + '\n')
f.write('\n')
f.write('learn: ' + yaml_array(row[fields['learn']]) + '\n')
f.write('cont_relationship: ' + yaml_array(row[fields['cont']]) + '\n')
f.write('\n')
f.write('salutation: ' + row[fields['sal']] + '\n')
f.write('first_name: ' + row[fields['first']] + '\n')
f.write('last_name: ' + row[fields['last']] + '\n')
f.write('title_contact_person: ' + row[fields['title']] + '\n')
f.write('\n')
f.write('city: ' + row[fields['cit']] + '\n')
f.write('state: ' + row[fields['st']] + '\n')
f.write('address: |\n')
f.write(indent() + row[fields['add']] + md_linebreak + '\n')
f.write(indent() + row[fields['cit']] + ' ' + row[fields['st']] + ' ' + row[fields['zip']] + '\n')
f.write('lat: ' + row[fields['lat']] + '\n')
f.write('lng: ' + row[fields['lng']] + '\n')
f.write('phone: ' + row[fields['ph']] + '\n')
f.write('ext: ' + row[fields['ext']] + '\n')
f.write('fax: ' + row[fields['fax']] + '\n')
f.write('email: ' + row[fields['email']] + '\n')
f.write('preferred_contact: ' + str(sane_method) + '\n')
f.write('contact_person_intro: ' + yaml_multiline_string(row[fields['intro']]) + '\n')
f.write('---\n')
f.write(sanitize_br(row[fields['descr']]))
# Done. Close the file.
f.close()