forked from james-atkinson/speedcomplainer
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcsv_common.py
828 lines (683 loc) · 28.9 KB
/
csv_common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
"""
This contains the base_csv_file class, and other auxiliary functions that
the standards package uses.
The base_csv_file is the base for any of the Character Separated
Value files that this standards library contains.
Examples:
.. code-block:
# example reading of basic csv file
from standards.common import base_csv_file
headers=['id', 'name', 'data']
test = base_csv_file("test.csv", headers, headers)
test.setup_read()
for value in test.readrow():
print(value)
# example reading of a clarity file
from standards.clarity_extract2 import extract_file
clarity = extract_file("hannon_provider_patients_in_epic.csv")
clarity.setup_read()
clarity_data = clarity.read_by_name_ssn()
print ("# of records: %s" % len(clarity_data))
for entry in clarity_data:
print ("Contact Date: ", entry["CONTACT_DATE"])
print ("Visit Prov Id: ", entry["VISIT_PROV_ID"])
Versions:
v1.56 - Default CSV Writer to use QUOTE_MINIMAL, controlled by quote argument
v1.55 - Added in_sep & out_sep to mdy_to_ymd_str
- Added out_sep to float_to_ymd_str
v1.5 - Added ConflictCount, and readCount in BaseCsvFile.
v1.0 - General Release
"""
__author__ = "Benjamin Schollnick"
__status__ = "Production"
# Keep in sync with the "Versions" changelog in the module docstring.
# (The docstring's latest entry is v1.56 - QUOTE_MINIMAL control - and that
# feature is present below, so the previous "1.55" value was stale.)
__version__ = "1.56"
import datetime
import csv
import pathlib
import os
import sys
from dateutil.parser import parse
from dateutil.parser._parser import ParserError
def ensure_dirs(dpath):
    """
    Create the directory path ``dpath`` (including parents) if it does not
    already exist.

    Args:
        dpath (string): The directory path to create.

    Bug fix: the original wrapped ``os.makedirs`` in
    ``except WindowsError: pass``.  ``WindowsError`` is only defined on
    Windows, so on any other platform a failing ``makedirs`` raised
    ``NameError`` from the except clause itself.  ``exist_ok=True`` is the
    portable way to ignore an already-existing directory.
    """
    os.makedirs(dpath, exist_ok=True)
def force_add_seps(datestring, sep="-"):
    """
    Insert separators into a compact date string.

    Args:
        datestring (string): A date in yyyymmdd form (must be yyyymmdd).
        sep (string): Separator to insert between the components.

    Returns:
        string: The date as yyyy<sep>mm<sep>dd.
    """
    year, month, day = datestring[:4], datestring[4:6], datestring[6:]
    return sep.join((year, month, day))
def mdy_to_ymd_str(mdy, in_sep="-", out_sep="-"):
    """
    Convert a mm-dd-yyyy date string into yyyy-mm-dd form.

    Args:
        mdy (string): The date string (%m<in_sep>%d<in_sep>%Y) to convert.
        in_sep (string): Separator character in the inbound string ("-" by
            default).
        out_sep (string): Separator character for the outbound string ("-"
            by default).

    Returns:
        string: The date in yyyy-mm-dd form (separator controlled by
        out_sep); an empty string when ``mdy`` is None or ""; None when the
        string cannot be parsed.

    .. code-block:
        >>> mdy_to_ymd_str("08-26-2019")
        '2019-08-26'
        >>> mdy_to_ymd_str("08-24-2019", out_sep="/")
        '2019/08/24'
        >>> mdy_to_ymd_str("08*24*2019", in_sep="*", out_sep="/")
        '2019/08/24'
    """
    if mdy in (None, ""):
        return ""
    in_fmt = "%m{0}%d{0}%Y".format(in_sep)
    out_fmt = "%Y{0}%m{0}%d".format(out_sep)
    try:
        return datetime.datetime.strptime(mdy, in_fmt).strftime(out_fmt)
    except ValueError:
        # Unparseable input is signalled with None.
        return None
def mdy_to_ymd_flex_str(mdy, in_sep=r"-", out_sep="-", yearFirst=False, dayFirst=False, reject_blank=False, default_yr=2000):
    """
    Flexibly convert a month-day-year date string into yyyy-mm-dd form.

    Uses dateutil's parser, with bare two-digit years forced into the
    century given by ``default_yr`` (see
    https://github.com/dateutil/dateutil/issues/703 regarding dateutil's
    defaulting of century).

    Args:
        mdy (string): The date string to be converted.
        in_sep (string): Separator character in the inbound string; it is
            normalized to "-" before parsing.
        out_sep (string): Separator character for the outbound string ("-"
            by default).
        yearFirst (boolean): Passed to dateutil; ambiguous dates are read
            year-first.
        dayFirst (boolean): Passed to dateutil; ambiguous dates are read
            day-first.
        reject_blank (Boolean): If True, return None when the date can not
            be parsed.  If False, the original mdy value is returned, thus
            invalid data can be passed back (it's the original data that
            was sent into the function).
        default_yr (integer): Century base added to bare two-digit years.

    Returns:
        string: The date in yyyy-mm-dd format (separator controlled by
        out_sep), or None / the original input on a parse failure,
        depending on reject_blank.

    .. code-block:
        >>> mdy_to_ymd_flex_str("08-26-2019")
        '2019-08-26'
        >>> mdy_to_ymd_flex_str("08*24*2019", in_sep="*", out_sep="/")
        '2019/08/24'
    """
    from dateutil.parser import parserinfo, parser
    from dateutil.parser._parser import ParserError

    class _ParserInfo20C(parserinfo):
        # Force bare two-digit years into default_yr's century instead of
        # dateutil's "closest to the current year" heuristic.
        def convertyear(self, year, century_specified=False):
            if not century_specified and year < 100:
                year += default_yr
            return year

    original = mdy
    mdy = mdy.replace(in_sep, "-")  # normalize separators for the parser
    if mdy not in [None, ""]:
        try:
            parsed = parser(_ParserInfo20C()).parse(mdy, yearfirst=yearFirst, dayfirst=dayFirst)
            mdy = parsed.strftime("%Y{0}%m{0}%d".format(out_sep))
        except ParserError:
            # Bug fix: the original returned the literal False here when
            # reject_blank was False, contradicting its documented contract
            # of returning the original input unchanged.
            print("Parser Error")
            return None if reject_blank else original
    return mdy
def float_to_ymd_str(float_value, out_sep="-"):
"""
Convert from floating point string (epoch) to a yyyy-mm-dd string.
Args:
float_value (float): The DateTime floating point (epoch timestamp) value
Returns:
datetime: string in yyyy-mm-dd format
.. code-block:
>>> float_to_ymd_str(1514786400.00)
'2018-01-01'
"""
output = datetime.datetime.fromtimestamp(float_value)
output = output.strftime("%Y{}%m{}%d".format(out_sep, out_sep))
return output
class BaseCsvFile():
    """
    Base class for xSV (character-separated value) file handling.

    Wraps csv.DictReader / csv.DictWriter behind a small state machine:
    configure the instance exactly once with setup_read(), setup_write(),
    or setup_append(), then use readrow() / readrawline() /
    read_by_key(s) for input, or writerow() for output.  close() releases
    the file handle.
    """

    def __init__(self, fqpn, input_headers=None,
                 output_headers=None, padlength=8, padchar='0',
                 lineterminator='\n'):
        """
        Args:
            fqpn (string): Fully Qualified PathName.  Any path that is
                valid from the app directory is usable, e.g.
                "c:/test.csv", "c:/users/text.csv", "test.csv", "p:t.csv".
            input_headers (list): Override autodetection of csv headers;
                contains the headers in the order they appear in the file.
            output_headers (list): Headers, in order, for the output file.
            padlength (integer): Number of characters in the EMRN,
                defaults to 8.
            padchar (string): The character(s) to pad with.
            lineterminator (string): Character used to terminate csv lines.

        __init__ only records settings; call setup_read(), setup_write(),
        or setup_append() to configure the object for actual I/O.
        """
        self.path = pathlib.Path(fqpn)
        self.__fh = None                 # underlying file handle (None until setup_*)
        self.reading = False             # True once configured for reading
        self.writing = False             # True once configured for writing/appending
        self.csv_handler = None          # csv.DictReader or csv.DictWriter
        self.input_headers = input_headers
        self.output_headers = output_headers
        self.padlength = padlength
        self.padchar = padchar
        self.quoting = csv.QUOTE_ALL     # default quote level; change via quote_*()
        self.lineterm = lineterminator
        self.conflictCount = 0           # duplicate-key collisions seen by _read_by_key(s)
        self.readCount = None            # rows stored by the most recent _read_by_key(s)
        self.source = None               # read source (file handle, possibly remapped)
        self.allow_append = False

    def quote_all(self):
        """Quote every field when writing (csv.QUOTE_ALL)."""
        self.quoting = csv.QUOTE_ALL

    def quote_minimal(self):
        """Quote only fields that require it (csv.QUOTE_MINIMAL)."""
        self.quoting = csv.QUOTE_MINIMAL

    def quote_nonnumeric(self):
        """Quote all non-numeric fields (csv.QUOTE_NONNUMERIC)."""
        self.quoting = csv.QUOTE_NONNUMERIC

    def quote_none(self):
        """Never quote fields (csv.QUOTE_NONE)."""
        self.quoting = csv.QUOTE_NONE

    def close(self):
        """Close the file handle, if one has been opened."""
        if self.__fh is not None:
            self.__fh.close()

    def setup_read(self, delimiter=',', force_headers=False,
                   remap_source=False, encoding='utf-8-sig'):
        """
        Configure for Read only.

        Args:
            delimiter (String): The delimiting character for the CSV file.
                By default this is set to a Comma (,).
            force_headers (boolean): If True, do not auto-detect the
                headers, use the headers from self.input_headers.  If
                False, auto-detect.
            remap_source (function): If set to False (Boolean), then do not
                remap; use the file handle normally based off self.path.
                If remap is intended, pass the function you wish to act as
                the file handle.  (Generally used for Memory IO, instead of
                File IO.)
            encoding (string): Encoding used to open the file.  The default
                'utf-8-sig' transparently strips a UTF-8 BOM (see
                https://stackoverflow.com/questions/34399172/); pass None
                for the platform default.

        Returns:
            Boolean: True if successfully configured.

        Raises:
            RuntimeError: if already configured for reading or writing
                (this prevents accidental reconfiguration to a different
                delimiter), or if the file does not exist.
        """
        if self.reading:
            raise RuntimeError("Duplicate Request - Already setup for Reading.")
        elif self.writing:
            raise RuntimeError("Configured for Writing - unable to Read.")

        if self.path.exists() is False:
            raise RuntimeError("File Does not Exist")

        if encoding is None:
            self.__fh = self.path.open(mode='r', newline='')
        else:
            self.__fh = self.path.open(mode='r', newline='', encoding=encoding)

        if remap_source:
            self.source = remap_source(self.__fh)
        else:
            self.source = self.__fh

        if force_headers:
            self.csv_handler = csv.DictReader(self.source,
                                              delimiter=delimiter,
                                              fieldnames=self.input_headers,
                                              quoting=self.quoting,
                                              lineterminator=self.lineterm)
        else:
            self.csv_handler = csv.DictReader(self.source,
                                              delimiter=delimiter,
                                              quoting=self.quoting,
                                              lineterminator=self.lineterm)
        self.allow_append = False
        self.reading = True
        return True

    def setup_write(self, delimiter=',',
                    overwrite=True,
                    writeheader=True,
                    quoting=True):
        """
        Configure for Write only.

        Args:
            delimiter (string): The delimiting character for the CSV file.
                By default this is set to a comma (,).
            overwrite (boolean): Allow overwriting of files.  If set to
                False and the file already exists, False is returned.  *By
                default this is set to True, and will overwrite files
                without warning.*
            writeheader (Boolean): Write a header to the CSV file; if set
                to False, the header will be suppressed.
            quoting (Boolean): If True (default), use the instance quote
                level (self.quoting); if False, csv.QUOTE_NONE.

        Returns:
            Boolean: True if successfully configured, False if the file
            already exists and overwrite is set to False.

        Raises:
            RuntimeError: if already configured for writing or reading
                (this prevents accidental reconfiguration to a different
                delimiter).
        """
        if quoting:
            quote_level = self.quoting
        else:
            quote_level = csv.QUOTE_NONE

        if self.writing:
            raise RuntimeError("Duplicate Request - Already setup for Writing.")
        elif self.reading:
            raise RuntimeError("Configured for Reading - unable to Write.")

        if self.path.exists() and overwrite is False:
            return False

        self.__fh = self.path.open(mode='w', newline='')
        self.csv_handler = csv.DictWriter(self.__fh,
                                          delimiter=delimiter,
                                          fieldnames=self.output_headers,
                                          quoting=quote_level)
        if writeheader:
            self.csv_handler.writeheader()
        self.allow_append = False
        self.writing = True
        return True

    def setup_append(self, delimiter=',',
                     writeheader=True,
                     quoting=True):
        """
        Configure for appending to an existing (or new) file.

        Args:
            delimiter (string): The delimiting character for the CSV file.
                By default this is set to a comma (,).
            writeheader (Boolean): Write a header row, but only when the
                file does not already exist (an existing file is assumed to
                already contain one).
            quoting (Boolean): If True (default), use the instance quote
                level (self.quoting); if False, csv.QUOTE_NONE.

        Returns:
            Boolean: True if successfully configured.

        Raises:
            RuntimeError: if already configured for reading.
        """
        self.allow_append = True
        if quoting:
            quote_level = self.quoting
        else:
            quote_level = csv.QUOTE_NONE

        if self.reading:
            raise RuntimeError("Configured for Reading - unable to Write.")

        # NOTE: the original guard "if self.path.exists() and
        # self.allow_append is False: return False" was dead code
        # (allow_append is unconditionally True at that point) and has been
        # removed.
        already_exists = self.path.exists()
        self.__fh = self.path.open(mode='a', newline='')
        self.csv_handler = csv.DictWriter(self.__fh,
                                          delimiter=delimiter,
                                          fieldnames=self.output_headers,
                                          quoting=quote_level)
        if writeheader and not already_exists:
            self.csv_handler.writeheader()
        self.writing = True
        return True

    def clear_record(self):
        """
        Return a dictionary that contains the output headers (as keys) and
        empty string values.

        This way, you can populate the fields that need to be populated
        without the worry of missing a field.
        """
        return {field: "" for field in self.output_headers}

    def flush(self):
        """Flush the underlying file handle."""
        self.__fh.flush()

    def return_beginning(self):
        """
        Forcibly reset the file offset pointer to the beginning of the
        file.
        """
        self.__fh.seek(0)

    def readrow(self):
        """
        Return the CSV reader for the reading file; iterating it yields one
        dictionary per CSV row.

        Returns:
            csv.DictReader: iterate it to receive row dictionaries.

        Raises:
            RuntimeError: if not configured for reading.

        .. code-block:
            for row in index_reader.readrow():
                ...
        """
        if not self.reading:
            raise RuntimeError('Attempted to Read without Setup')
        return self.csv_handler

    def write_cleaning(self, datadict):
        """
        *Subclass* if you need to force the data dictionary to follow
        specific rules.

        This allows you to clean or replace/substitute the raw data before
        it is written (e.g. pad data if not handled at read time, etc.).

        Args:
            datadict (dictionary): The row data being examined for
                cleaning.

        Returns:
            dictionary: The cleaned data.

        Make sure to pass clean=True to writerow to enable this
        functionality.  Example override:

        .. code-block:
            def write_cleaning(self, datadict):
                datadict["dob"] = datadict["dob"].replace("-", "")
                return datadict
        """
        # Base implementation: pass the data through unchanged.
        return datadict

    def readrawline(self, clean_func=None):
        """
        Allow manual loading of the file, line by line, as a generator.

        This way you can still use the framework, but read in a specialized
        manner (e.g. not via read_by_key(s)).

        Args:
            clean_func (function): Defaults to None (unused).  If passed a
                function, it is called to process and manipulate each row
                before the row is yielded.

        Yields:
            dictionary: The dictionary for each row that is read.
        """
        for row in self.csv_handler:
            if clean_func is not None:
                row = clean_func(row)
            yield row

    def writerow(self, datadict, clean=False):
        """
        Write a CSV row to the output file.

        Args:
            datadict (dictionary): The row of data to be written.
            clean (Boolean): If True, pass the row through write_cleaning
                first.

        Returns:
            The return value of csv.DictWriter.writerow.

        Raises:
            RuntimeError: if not configured for writing.
        """
        if not self.writing:
            raise RuntimeError('Attempted to Write without Setup')
        if clean:
            datadict = self.write_cleaning(datadict)
        return self.csv_handler.writerow(datadict)

    def getReadCount(self):
        """
        Please note, this will be reset if you perform any read operations.
        Store the value if you need to retain the readCount.

        Returns:
            None: if no keyed read has occurred yet.
            Integer: the number of rows stored by the last keyed read.
        """
        return self.readCount

    def conflictsOccurred(self):
        """
        Returns:
            Boolean: True if a primary key collision occurred during the
            last keyed read, otherwise False.
        """
        return self.conflictCount > 0

    def RemapValue(self, remapPool, SourceValue, MappingColumn, debug=False):
        """
        Look up SourceValue in remapPool and return the stripped value of
        MappingColumn from the matching record.

        Args:
            remapPool (dict): Mapping of source value -> row dictionary
                (typically built by a keyed read).
            SourceValue: The key to look up in remapPool.
            MappingColumn (string): The column of the matched row whose
                value is returned.
            debug (boolean): Unused; retained for interface compatibility.

        Returns:
            The stripped replacement value, or None when SourceValue is not
            in the pool or the mapped value is blank.

        NOTE(review): an invalid MappingColumn prints a message and calls
        sys.exit(1); a library would normally raise instead, but the
        original behavior is retained for compatibility.
        """
        if SourceValue in remapPool:
            try:
                NewValue = remapPool[SourceValue][MappingColumn].strip()
                if NewValue.strip() not in ["", None]:
                    # The value is not "" or None; return it.
                    return NewValue
            except KeyError:
                print()
                print("%s is an invalid Column in the Remapping File" % MappingColumn)
                print()
                sys.exit(1)
        # SourceValue was not found in the pool, *OR* the mapped value was
        # "" or None (invalid value).
        return None

    def _read_by_key(self, key=None, restrictFields=None, clean_func=None, revealConflicts=False):
        """
        Read the whole file into a dictionary keyed on a single column.

        Args:
            key (string): The column (header name) to use as the dictionary
                key.  Key values are upper-cased.  For data integrity the
                *KEY* must be unique: a duplicate increments conflictCount
                and the duplicate row is discarded (the FIRST row read
                wins - note the original docstring claimed the last row
                won, which did not match the code).
            restrictFields (list): Column names to keep; all other columns
                are dropped from each stored row.  Mainly supplied to
                reduce memory consumption for larger csv files.
                NOTE(review): the comparison uses str.title(), so the
                entries in restrictFields must be Title-cased to match.
            clean_func (func): If None, do not perform cleaning.  Otherwise
                a callable applied to each row before keying/storing.
            revealConflicts (boolean): If True, print each conflicting key.

        Returns:
            Dictionary: {upper-cased key value: row dictionary}.

        Raises:
            RuntimeError: if no key is specified.
        """
        data = {}
        self.readCount = 0
        self.conflictCount = 0
        if key is None:
            raise RuntimeError("No key specified.")

        for row in self.csv_handler:
            if clean_func is not None:
                row = clean_func(row)
            keyvalue = str(row[key]).upper()
            if keyvalue in data:
                self.conflictCount += 1
                if revealConflicts:
                    print("Conflict: ", keyvalue)
            else:
                if restrictFields is not None:
                    for fieldname in list(row.keys()):
                        if fieldname.title() not in restrictFields and fieldname in row:
                            del row[fieldname]
                data[keyvalue] = row
                self.readCount += 1
        return data

    def _read_by_keys(self, keys=None, clean_func=None, revealConflicts=False):
        """
        Read the whole file into a dictionary keyed on several columns.

        The key for each row is the '_'-joined, stripped values of the
        requested columns, upper-cased (e.g. keys=["first", "last"] gives
        'JOHN_DOE').  Keys must be unique: a duplicate increments
        conflictCount and the duplicate row is discarded (first row wins).

        Args:
            keys (list): The columns (header names) combined to form the
                dictionary key.
            clean_func (func): If None, do not perform cleaning.  Otherwise
                a callable applied to each row before keying/storing.
            revealConflicts (boolean): If True, print each conflicting key.

        Returns:
            Dictionary: {upper-cased combined key: row dictionary}.

        Raises:
            RuntimeError: if no keys are specified.
        """
        data = {}
        self.readCount = 0
        self.conflictCount = 0
        if keys is None:
            raise RuntimeError("No key specified.")

        for row in self.csv_handler:
            if clean_func is not None:
                row = clean_func(row)
            # Bug fix: upper-case BEFORE the membership test.  The original
            # compared the raw joined key against the stored (upper-cased)
            # keys, so duplicates differing only in case silently
            # overwrote earlier rows without being counted as conflicts.
            keyvalue = '_'.join(row[key].strip() for key in keys).upper().strip()
            if keyvalue in data:
                self.conflictCount += 1
                if revealConflicts:
                    print("Conflict: ", keyvalue)
            else:
                self.readCount += 1
                data[keyvalue] = row
        return data

    # Public aliases for the keyed readers.
    read_by_key = _read_by_key
    read_by_keys = _read_by_keys