forked from surajpaib/dicomsort
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dicomsort.py
executable file
·556 lines (482 loc) · 19.4 KB
/
dicomsort.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
#!/usr/bin/env python
"""
https://github.com/pieper/dicomsort
Sorts directories containing dicom files into directories
with human-readable names for easy organization and manipulation.
See --help for options
Steve Pieper [email protected]
Stefan Baumann [email protected]
This software is released under the terms of the
3D Slicer License version 1.0 (December 20, 2005).
See the License.txt file or http://slicer.org for full text.
"""
# {{{ packages and logging utilities
# standard python includes
import sys
import os
import traceback
import shutil
import tempfile
import urllib.request
import zipfile
from multiprocessing import Pool
# special public packages
try:
import dicom
from dicom.filereader import InvalidDicomError
except ImportError:
import pydicom
from pydicom.filereader import InvalidDicomError
dicom = pydicom
# }}}
# {{{ DICOMSorter
class DICOMSorter(object):
    """Implements the logic for sorting dicom files from
    a source directory into a target directory tree
    according to a given set of options.

    This is meant to be somewhat independent of the command line
    wrapper so that it could be used as a library in other
    code if needed.
    """

    # characters that safeFileName replaces with "_"
    _unsafeCharacters = r"""+`~!@#$%^&*(){}[]/=\|<>,.":' """
    _safeTranslation = str.maketrans(_unsafeCharacters,
                                     "_" * len(_unsafeCharacters))

    def __init__(self):
        # maps each command line flag to the option name it controls
        self.flagToOptions = {
            '-v': 'verbose',
            '--verbose': 'verbose',
            '-z': 'compressTargets',
            '--compressTargets': 'compressTargets',
            '-d': 'deleteSource',
            '--deleteSource': 'deleteSource',
            '-f': 'forceDelete',
            '--forceDelete': 'forceDelete',
            '-k': 'keepGoing',
            '--keepGoing': 'keepGoing',
            '-s': 'symlink',
            '--symlink': 'symlink',
            '-t': 'test',
            '--test': 'test',
            '-u': 'unsafe',
            '--unsafe': 'unsafe',
            '-r': 'truncateTime',
            '--truncateTime': 'truncateTime',
            '-j': 'jobs',
            '--jobs': 'jobs'
        }
        # values used by setOptions for any option the caller leaves out
        self.defaultOptions = {
            'sourceDir': None,
            'targetPattern': None,
            'compressTargets': False,
            'deleteSource': False,
            'forceDelete': False,
            'keepGoing': False,
            'verbose': False,
            'symlink': False,
            'test': False,
            'unsafe': False,
            'truncateTime': False,
            'jobs': 1
        }
        self.requiredOptions = ['sourceDir', 'targetPattern', ]
        # each dict key is a directory path used while sorting
        # values are lists of new filenames within directory
        self.renamedFiles = {}

    def setOptions(self, options):
        """Set the member variable options based on passed dictionary,
        complaining if required options are missing, and filling in
        optional options with default values if not specified.

        The passed dictionary is updated in place (callers read the
        defaults back from it) and stored as self.options.
        Returns True on success, False if a required option is missing
        or None.
        """
        for option in self.requiredOptions:
            # a None value would only fail later with a TypeError
            if options.get(option) is None:
                return False
        for option, default in self.defaultOptions.items():
            options.setdefault(option, default)
        if '%' not in options['targetPattern']:
            # implement the default sort
            pattern = "%PatientID/%AccessionNumber/%SeriesNumber_%SeriesDescription/%StudyInstanceUID/%InstanceNumber_%SOPInstanceUID.dcm"
            options['targetPattern'] = os.path.join(options['targetPattern'], pattern)
        self.options = options
        return True

    def safeFileName(self, fileName):
        """Remove any potentially dangerous or confusing characters from
        the file name by mapping them to underscores"""
        return fileName.translate(self._safeTranslation)

    def pathFromDatasetPattern(self, ds, safe=True):
        """Given a dicom dataset, use the targetPattern option
        to define a file path.

        ds is any object exposing the pattern's keys as attributes.
        Missing, empty or None attributes become "Unknown<key>".
        When safe is true every substituted value is passed through
        safeFileName.
        """
        replacements = {}
        fmt, keys = self.formatFromPattern()
        for key in keys:
            value = getattr(ds, key, "")
            if value is None or value == "":
                value = "Unknown%s" % key
            if self.options['truncateTime'] and key.endswith("Time"):
                # drop an all-zero fraction; values without a '.' are
                # left alone (slicing on find() == -1 would eat a digit)
                whole, dot, fraction = str(value).partition('.')
                if dot and fraction == '000000':
                    value = whole
            if safe:
                try:
                    replacements[key] = self.safeFileName(str(value))
                except UnicodeEncodeError as why:
                    print('Encoding target path segment value failed. Exception: %s' % why)
                    value = "Unknown_%s_" % key
                    replacements[key] = self.safeFileName(str(value))
            else:
                replacements[key] = str(value)
        return fmt % replacements

    def formatFromPattern(self):
        """Translate the targetPattern option into a %-style format.

        Every "%Name" (a percent sign followed by letters) becomes
        "%(Name)s".  Returns (fmt, keys) where keys lists the names in
        order of appearance.  A percent sign that is not followed by a
        letter, including one at the very end of the pattern, is kept
        as a literal percent.
        """
        pattern = self.options['targetPattern']
        end = len(pattern)
        keys = []
        pieces = []
        i = 0
        while i < end:
            c = pattern[i]
            i += 1
            if c != "%":
                pieces.append(c)
                continue
            start = i
            while i < end and pattern[i].isalpha():
                i += 1
            key = pattern[start:i]
            if key:
                keys.append(key)
                pieces.append("%(" + key + ")s")
            else:
                # escaped so that "fmt % replacements" emits a single '%'
                pieces.append("%%")
        return ("".join(pieces), keys)

    def _collectSourceFiles(self):
        """Return the list of candidate files: every file below sourceDir,
        or the existing files named on stdin when sourceDir is ""."""
        sourceDir = self.options['sourceDir']
        if sourceDir == "":
            candidates = (line.strip() for line in sys.stdin)
            return [path for path in candidates if os.path.isfile(path)]
        return [os.path.join(root, name)
                for root, _subFolders, names in os.walk(sourceDir)
                for name in names]

    def _renameFileInWorker(self, file):
        """Run renameFile inside a pool worker.

        Returns (renamed, exitCode, newFiles).  newFiles holds the
        renamedFiles entries produced by this one file so the parent can
        merge them.  exitCode is None unless renameFile called sys.exit;
        it is reported instead of raised because an exiting worker would
        leave the parent waiting for a result.
        """
        self.renamedFiles = {}
        try:
            renamed = self.renameFile(file)
        except SystemExit as stop:
            return False, (0 if stop.code is None else stop.code), {}
        return renamed, None, self.renamedFiles

    def _renameFilesInParallel(self, files, jobs):
        """Rename files with a pool of `jobs` workers, yielding one
        success flag per file as results arrive (in arbitrary order)."""
        collected = {}
        with Pool(processes=jobs) as pool:
            results = pool.imap_unordered(self._renameFileInWorker, files)
            for renamed, exitCode, newFiles in results:
                if exitCode is not None:
                    sys.exit(exitCode)
                for targetDir, names in newFiles.items():
                    collected.setdefault(targetDir, []).extend(names)
                yield renamed
        # merged only once the pool is done so self stays unchanged
        # while work is still being handed to the workers
        for targetDir, names in collected.items():
            self.renamedFiles.setdefault(targetDir, []).extend(names)

    def renameFiles(self):
        """Perform the sorting operation by renaming all
        the files in the source directory and all its children.

        Sets self.filesRenamed and self.filesSkipped and returns True.
        With the jobs option above 1 the work is spread over a process
        pool, which requires the keepGoing option (exits with status 1
        otherwise).  A tqdm progress bar is shown when tqdm is installed.
        """
        self.filesRenamed = 0
        self.filesSkipped = 0
        verbose = self.options['verbose']
        if verbose:
            print("Preparing the list of files ...")
        allFiles = self._collectSourceFiles()
        if verbose:
            print("Sorting files ...")

        jobs = self.options['jobs']
        if jobs > 1 and not self.options['keepGoing']:
            # Right now parallel processing is only supported when keepGoing is true
            print("Parallel processing is only supported when keepGoing is true")
            sys.exit(1)

        try:
            from tqdm import tqdm
            pbar = tqdm(total=len(allFiles))
        except ImportError:
            pbar = None

        if jobs > 1:
            results = self._renameFilesInParallel(allFiles, jobs)
        else:
            results = (self.renameFile(file) for file in allFiles)
        try:
            for renamed in results:
                if renamed:
                    self.filesRenamed += 1
                else:
                    self.filesSkipped += 1
                if pbar is not None:
                    pbar.update(1)
        finally:
            if pbar is not None:
                pbar.close()
        if verbose:
            print("Renamed %d, skipped %d" % (self.filesRenamed, self.filesSkipped))
        return True

    def renameFile(self, file):
        """Rename a single file according to the current options.

        Return True when the file was copied (or symlinked) to its
        target path, False when it is not a readable dicom file or the
        copy failed.  Exits the program when the target already exists
        and keepGoing is off, or when the copy fails while one of the
        delete options is set.
        """
        # check for dicom file
        if self.options['verbose']:
            print("Considering file %s" % file)
        # newer pydicom releases name the reader dcmread and deprecate
        # read_file; the legacy "dicom" package only has read_file
        readDicom = getattr(dicom, 'dcmread', None) or dicom.read_file
        try:
            ds = readDicom(file, stop_before_pixels=True)
        except OSError as why:
            print("dicom.read_file() IO error on file %s, exception %s" % (file, str(why)))
            return False
        except InvalidDicomError:
            return False
        except KeyError:
            # needed for issue with pydicom 0.9.9 and some dicomdir files
            return False
        # check for valid path - abort program to avoid overwrite
        path = self.pathFromDatasetPattern(ds, safe=(not self.options['unsafe']))
        if os.path.exists(path):
            print('\nSource file: %s' % file)
            print('Target file: %s' % path)
            print('\nTarget file already exists - pattern is probably not unique')
            if not self.options['keepGoing']:
                print('Aborting to avoid data loss.')
                sys.exit(-3)
            # TODO: handle duplicate files for parallel processing
        # make new directories to hold file if needed
        targetDir = os.path.dirname(path)
        targetFileName = os.path.basename(path)
        if targetDir:
            # a pattern without a directory part yields "", which
            # os.makedirs rejects
            os.makedirs(targetDir, exist_ok=True)
        try:
            if self.options['symlink']:
                os.symlink(file, path)
                if self.options['verbose']:
                    print("Symlinked %s, to %s" % (file, path))
            else:
                shutil.copyfile(file, path)
                if self.options['verbose']:
                    print("Copied %s, to %s" % (file, path))
        except OSError as why:
            print("Dicom file copy/symlink IO error on output pathname >%s< Exception >%s<" % (path, str(why)))
            if self.options['deleteSource'] or self.options['forceDelete']:
                print("Halting execution on IO error because deleteSource or forceDelete options could cause data loss.")
                sys.exit(1)
            # nothing was written, so do not record or count this file
            return False
        # keep track of files and new directories
        self.renamedFiles.setdefault(targetDir, []).append(targetFileName)
        return True

    def zipRenamedFiles(self):
        """For each directory that had files added while sorting,
        create a zipfile containing the newly sorted files
        that were added to that directory.

        The zipfile is written next to the directory as <directory>.zip.
        The sorted files are removed only after the archive has been
        written and closed.
        """
        verbose = self.options['verbose']
        for targetDir, names in self.renamedFiles.items():
            dirBase = os.path.basename(targetDir)
            # os.path.join keeps a relative target relative
            # ("" + "/" would point at the filesystem root)
            zipFilePath = os.path.join(os.path.dirname(targetDir), dirBase + ".zip")
            # a file sorted twice onto the same target is listed twice
            uniqueNames = list(dict.fromkeys(names))
            if verbose:
                print('Creating %s' % zipFilePath)
            with zipfile.ZipFile(zipFilePath, "w") as zfp:
                for name in uniqueNames:
                    zipPath = dirBase + '/' + name
                    if verbose:
                        print('Adding %s' % zipPath)
                    zfp.write(os.path.join(targetDir, name), zipPath, zipfile.ZIP_DEFLATED)
            for name in uniqueNames:
                os.remove(os.path.join(targetDir, name))
            if verbose:
                print('Finished %s' % zipFilePath)
            # remove the stub directory - it will only succeed if it is
            # empty, meaning that all the files were moved to the zipfile
            try:
                os.rmdir(targetDir)
            except OSError:
                pass
# DICOMSorter }}}
# {{{ Download helper
class DownloadHelper(object):
    """Class to help download data for testing"""

    def __init__(self):
        # last percentage that downloadReportHook printed
        self.downloadPercent = 0

    def humanFormatSize(self, size):
        """Format a byte count with a unit suffix, e.g. 2048 -> "2.0KB".

        from http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
        """
        for unit in ('bytes', 'KB', 'MB', 'GB'):
            if -1024.0 < size < 1024.0:
                return "%3.1f%s" % (size, unit)
            size /= 1024.0
        return "%3.1f%s" % (size, 'TB')

    def downloadReportHook(self, blocksSoFar, blockSize, totalSize):
        """Progress callback for urllib.request.urlretrieve.

        Prints a line at every 10% step and at 100%.  Nothing is
        reported when the total size is unknown (urlretrieve passes a
        non-positive totalSize in that case).
        """
        if totalSize <= 0:
            return
        # the last block is usually partial, so blocks * blockSize can
        # overshoot the real size
        bytesSoFar = min(blocksSoFar * blockSize, totalSize)
        percent = int((100. * bytesSoFar) / totalSize)
        if percent == 100 or (percent - self.downloadPercent >= 10):
            humanSizeSoFar = self.humanFormatSize(bytesSoFar)
            humanSizeTotal = self.humanFormatSize(totalSize)
            print('Downloaded %s (%d%% of %s)...' %
                  (humanSizeSoFar, percent, humanSizeTotal))
            self.downloadPercent = percent

    def downloadFileIfNeeded(self, url, destination, expectedSize):
        """Download url to destination unless a file of expectedSize
        bytes is already there.  Returns True when the file is in place,
        False when the download failed."""
        if os.path.exists(destination) and os.stat(destination).st_size == expectedSize:
            print('File exists %s and is correct size, not downloading' % destination)
            return True
        self.downloadPercent = 0
        print('Requesting download of %s from %s...\n' % (destination, url))
        try:
            urllib.request.urlretrieve(url, destination, self.downloadReportHook)
        except OSError as e:
            # urllib's URLError and ContentTooShortError derive from OSError
            print('Download failed: %s' % e)
            return False
        print('Download finished')
        return True
# Download helper}}}
# {{{ main, test, and arg parse
def usage():
    """Print the command line help text to stdout."""
    s = """
% dicomsort.py --help

dicomsort [options...] sourceDir targetDir/<patterns>

 where [options...] can be:
    [-z,--compressTargets] - create a .zip file in the target directory
    [-d,--deleteSource] - remove source files/directories after sorting
    [-f,--forceDelete] - remove source without confirmation
    [-k,--keepGoing] - report but ignore duplicate target files
    [-v,--verbose] - print diagnostics while processing
    [-s,--symlink] - create a symlink to dicom files in sourceDir instead of copying them
    [-t,--test] - run the built in self test (requires internet)
    [-u,--unsafe] - do not replace unsafe characters with '_' in the path
    [-r,--truncateTime] - drop an all-zero '.000000' fraction from Time tags used in the path
    [-j,--jobs] - number of jobs to run in parallel (default 1, more than 1 requires --keepGoing)
    [--help] - print this message

 where sourceDir is directory to be scanned or "" (null string) to read file list from stdin

 where targetDir/<patterns...> is a string defining the output file and directory
 names based on the dicom tags in the file.

 If patterns are not specified, the following default is used:

 %PatientID/%AccessionNumber/%SeriesNumber_%SeriesDescription/%StudyInstanceUID/%InstanceNumber_%SOPInstanceUID.dcm

 Example 1:

  dicomsort data sorted/%PatientName/%StudyDate/%SeriesDescription-%InstanceNumber.dcm

  could create a folder structure like:

  sorted/JohnDoe/20130418/FLAIR-2.dcm

 Example 2:

  find DicomSourceDir/ | grep "IMA$" | dicomsort -s "" DicomTargetDir

  would scan DicomSourceDir for file path names ending in IMA and create an
  output directory DicomTargetDir. The folder structure will be created using
  the default pattern with symbolic links to the source dicom data files.
"""
    print(s)
def selfTest(sorter):
    """Run a self test of the DICOMSorter
    - download a zipfile of test dicom data
    - extract it in a temp location
    - sort it into a fresh output directory
    - check that at least one file was sorted

    sorter must already have its options set; sourceDir and
    targetPattern are overwritten.  Exits with status 1 when the test
    data cannot be downloaded or nothing gets sorted.
    """
    # perform the download
    print('Downloading...')
    testDataURL = "https://s3.amazonaws.com/ec2.isomics.com/dicomsort-testdata.zip"
    tempDir = tempfile.gettempdir()
    destination = os.path.join(tempDir, 'dicomsort-testdata.zip')
    expectedSize = 65916934
    downloader = DownloadHelper()
    if not downloader.downloadFileIfNeeded(testDataURL, destination, expectedSize):
        print('\nSelf-Test Failed: test data could not be downloaded')
        sys.exit(1)

    # unzip the data
    print('Extracting...')
    dataDir = os.path.join(tempDir, 'dicomsort-testdata')
    expectedFileCount = 1062
    fileCount = 0
    if os.path.exists(dataDir):
        for root, subFolders, files in os.walk(dataDir):
            fileCount += len(files)
        print('Found %d files in %s' % (fileCount, dataDir))
    if fileCount != expectedFileCount:
        # missing or incomplete extraction - start over from the archive
        if os.path.exists(dataDir):
            shutil.rmtree(dataDir)
        with zipfile.ZipFile(destination) as archive:
            archive.extractall(dataDir)

    # now run the tests on the downloaded data
    targetDir = os.path.join(tempDir, 'dicomsort-output')
    if os.path.exists(targetDir):
        shutil.rmtree(targetDir)
    targetPattern = targetDir + '/%PatientName/%StudyDescription-%StudyDate/%SeriesDescription-%SeriesNumber-%InstanceNumber.dcm'

    options = sorter.options
    options['sourceDir'] = dataDir
    options['targetPattern'] = targetPattern
    sorter.setOptions(options)
    sorter.renameFiles()
    if sorter.filesRenamed == 0:
        print('\nSelf-Test Failed: no files were sorted')
        sys.exit(1)
    print('\nSelf-Test Passed!')
def parseArgs(sorter, args):
    """Parse the command line args into the sorter.

    args is the argument list without the program name; it is not
    modified.  On success the parsed options are installed through
    sorter.setOptions.  Prints the usage text and exits with status 1
    on any invalid command line (status 0 for --help).
    """
    options = {}
    remainingArgs = []
    argIter = iter(list(args))
    for arg in argIter:
        if arg == '--help':
            usage()
            sys.exit()
        if arg in ('-j', '--jobs'):
            # the only flag that consumes the following argument
            try:
                options['jobs'] = int(next(argIter))
            except (StopIteration, ValueError):
                print("The %s option requires an integer value" % arg)
                usage()
                sys.exit(1)
        elif arg in sorter.flagToOptions:
            options[sorter.flagToOptions[arg]] = True
        elif arg.startswith('-'):
            usage()
            sys.exit(1)
        else:
            remainingArgs.append(arg)

    if 'test' in options:
        # the self test supplies its own source and target
        remainingArgs = ["", ""]
    if len(remainingArgs) != 2:
        usage()
        sys.exit(1)
    options['sourceDir'], options['targetPattern'] = remainingArgs

    if not sorter.setOptions(options):
        usage()
        sys.exit(1)

    # setOptions has filled in the defaults, so every key exists from here on
    if options['sourceDir'] == "":
        print("Reading file list from stdin.")
    elif not os.path.exists(options['sourceDir']):
        print("Source directory does not exist: %s" % options['sourceDir'])
        sys.exit(1)
    if options['symlink'] and (options['compressTargets'] or options['deleteSource'] or options['forceDelete']):
        print("symlink option is not compatible with compressTargets, deleteSource, or forceDelete options")
        sys.exit(1)
def confirmDelete(sorter):
    """Return True when the source directory may be deleted.

    With the forceDelete option no question is asked.  Otherwise the
    user is prompted; only "y" or "yes" (any case, surrounding
    whitespace ignored) confirms.  End of input counts as "no", which
    is what happens after stdin was used up by a file list.
    """
    if sorter.options['forceDelete']:
        return True
    print("Source directory is: %s" % sorter.options['sourceDir'])
    try:
        response = input('Delete source directory? [y/N] ')
    except EOFError:
        return False
    return response.strip().lower() in ('y', 'yes')
def main():
    """Command line entry point: parse sys.argv, then sort, optionally
    compress the targets and delete the source.

    Always leaves through an exit call: status 0 on success, 2 when
    sorting reports failure, 1 on an unexpected exception.
    """
    sorter = DICOMSorter()
    try:
        parseArgs(sorter, sys.argv[1:])
        if sorter.options['test']:
            selfTest(sorter)
            # sys.exit rather than the builtin exit(), which only
            # exists when the site module is loaded
            sys.exit(0)
        if not sorter.renameFiles():
            sys.exit(2)
        print("Files sorted")
        if sorter.options['compressTargets']:
            sorter.zipRenamedFiles()
            print('Target files compressed')
        if sorter.options['deleteSource']:
            if sorter.options['sourceDir'] == "":
                # the file list came from stdin, so there is no
                # directory that could be removed
                print('No source directory given, nothing deleted')
            elif confirmDelete(sorter):
                shutil.rmtree(sorter.options['sourceDir'])
                print('Source directory deleted')
            else:
                print('Source directory not deleted')
        sys.exit()
    except (KeyboardInterrupt, SystemExit):
        # Ctrl-C and sys.exit() pass through untouched
        raise
    except Exception as e:
        print('ERROR, UNEXPECTED EXCEPTION')
        print(str(e))
        traceback.print_exc()
        # os._exit skips interpreter cleanup, so flush by hand or the
        # report above can be lost when output is piped
        sys.stdout.flush()
        sys.stderr.flush()
        os._exit(1)


if __name__ == '__main__':
    main()
# }}}
# vim:set sr et ts=4 sw=4 ft=python fenc=utf-8: // See Vim, :help 'modeline
# vim: foldmethod=marker