-
Notifications
You must be signed in to change notification settings - Fork 52
/
CSVKMD.py
executable file
·165 lines (147 loc) · 5.58 KB
/
CSVKMD.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python3
"""
Purpose: Process a CSV file to combine data fields for unique key fields
Customize: Change QUOTE_CHAR, DATA_DELIMITER, LINE_TERMINATOR as required/desired
Define: KEYFIELD, DATAFIELD, SAVEFIELDS, MATCHFIELDS, SKIPFIELDS
Python: Use python or python3 below as appropriate to your system; verify that you have version 3
$ python -V or python3 -V
Python 3.x.y
Usage:
$ python3 CSVKMD.py ./Input.csv ./Output.csv
Example:
You have a CSV file CourseStudent.csv with three columns: id,name,student
The columns are: course ID, course Name, student email address
id,name,student
57121690282,science,[email protected]
57121690282,science,[email protected]
57121690282,science,[email protected]
56941282690,english,[email protected]
56941282690,english,[email protected]
56941282690,english,[email protected]
47491913641,math,[email protected]
47491913641,math,[email protected]
47491913641,math,[email protected]
Process all courses
KEYFIELD = 'id'
DATAFIELD = 'student'
SAVEFIELDS = ['name']
MATCHFIELDS = {}
SKIPFIELDS = {}
$ python3 CSVKMD.py CourseStudent.csv CourseStudentCombined.csv
$ more CourseStudentCombined.csv
id,name,student
$ gam csv CourseStudentCombined.csv gam courses "~id" add students users "~student"
Process only english course
KEYFIELD = 'id'
DATAFIELD = 'student'
SAVEFIELDS = ['name']
MATCHFIELDS = {'name': re.compile(r'english')}
SKIPFIELDS = {}
$ python3 CSVKMD.py CourseStudent.csv CourseStudentCombined.csv
$ more CourseStudentCombined.csv
id,name,student
$ gam csv CourseStudentCombined.csv gam courses "~id" add students users "~student"
Process all courses except english
KEYFIELD = 'id'
DATAFIELD = 'student'
SAVEFIELDS = ['name']
MATCHFIELDS = {}
SKIPFIELDS = {'name': re.compile(r'english')}
$ python3 CSVKMD.py CourseStudent.csv CourseStudentCombined.csv
$ more CourseStudentCombined.csv
id,name,student
$ gam csv CourseStudentCombined.csv gam courses "~id" add students users "~student"
"""
import csv
import re
import sys
QUOTE_CHAR = '"' # Adjust as needed to properly read CSV files
DATA_DELIMITER = ' '# Delimiter between data field items
LINE_TERMINATOR = '\n' # On Windows, you probably want '\r\n'
ignore = re.compile(r'') # Keep pylint happy
# Key field name
# e.g., KEYFIELD = 'id'
KEYFIELD = ''
# Data field name
# e.g., DATAFIELD = 'student'
DATAFIELD = ''
# List of additional field names to include in output
# e.g., SAVEFIELDS = ['name']
SAVEFIELDS = []
# A dictionary of fields and associated Python Regular Expressions. Rows where all fields match will be processed
# e.g., MATCHFIELDS = {'name': re.compile(r'english')}
MATCHFIELDS = {}
# A dictionary of fields and associated Python Regular Expressions. Rows where with all fields don't match will be processed
# SKIPFIELDS = {'name': re.compile(r'english')}
SKIPFIELDS = {}
def fieldError(category, fieldName):
sys.stderr.write(f'Error: {category}field "{fieldName}" not in file {sys.argv[1]} field names: {",".join(inputFieldNames)}\n')
def checkMatchSkipFields(row, matchFields, skipFields):
for matchField, matchPattern in iter(matchFields.items()):
if (matchField not in row) or not matchPattern.search(row[matchField]):
return False
for skipField, matchPattern in iter(skipFields.items()):
if (skipField in row) and matchPattern.search(row[skipField]):
return False
return True
data = {}
if (len(sys.argv) > 1) and (sys.argv[1] != '-'):
inputFile = open(sys.argv[1], 'r', encoding='utf-8')
else:
inputFile = sys.stdin
inputCSV = csv.DictReader(inputFile, quotechar=QUOTE_CHAR)
inputFieldNames = inputCSV.fieldnames
fieldErrors = 0
if KEYFIELD not in inputFieldNames:
fieldError('key', KEYFIELD)
fieldErrors += 1
if DATAFIELD not in inputFieldNames:
fieldError('data', DATAFIELD)
fieldErrors += 1
for field in SAVEFIELDS:
if field not in inputFieldNames:
fieldError('save', field)
fieldErrors += 1
for field in MATCHFIELDS:
if field not in inputFieldNames:
fieldError('match', field)
fieldErrors += 1
for field in SKIPFIELDS:
if field not in inputFieldNames:
fieldError('skip', field)
fieldErrors += 1
if fieldErrors:
sys.exit(1)
outputFieldNames = []
for field in inputFieldNames:
if field == KEYFIELD or field == DATAFIELD or field in SAVEFIELDS:
outputFieldNames.append(field)
for irow in inputCSV:
keyfield = irow[KEYFIELD]
datafield = irow[DATAFIELD]
if keyfield and datafield and checkMatchSkipFields(irow, MATCHFIELDS, SKIPFIELDS):
data.setdefault(keyfield, {DATAFIELD: set()})
data[keyfield][DATAFIELD].add(datafield)
for field in SAVEFIELDS:
data[keyfield][field] = irow[field]
if inputFile != sys.stdin:
inputFile.close()
if (len(sys.argv) > 2) and (sys.argv[2] != '-'):
outputFile = open(sys.argv[2], 'w', newline='')
else:
outputFile = sys.stdout
outputCSV = csv.DictWriter(outputFile, outputFieldNames, lineterminator=LINE_TERMINATOR, quotechar=QUOTE_CHAR)
outputCSV.writeheader()
for kp, kv in sorted(data.items()):
orow = {KEYFIELD: kp, DATAFIELD: DATA_DELIMITER.join(kv[DATAFIELD])}
for field in SAVEFIELDS:
orow[field] = kv[field]
outputCSV.writerow(orow)
if outputFile != sys.stdout:
outputFile.close()