-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbioformat_extractor.py
executable file
·132 lines (116 loc) · 4.85 KB
/
bioformat_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import pyclowder
import re
import logging
import tempfile
import os, json
import subprocess
import csv
from pyclowder.extractors import Extractor
import pyclowder.files
import javabridge
import bioformats
import bioformats.formatreader
from xml.dom.minidom import parseString
# Start the Java VM as soon as this module is imported, with the Bio-Formats
# jars on the class path and in headless mode.
javabridge.start_vm(class_path=bioformats.JARS, run_headless=True)
def get_good_bioformats(filepath="/home/bioformats.tsv"):
    """Return the file extensions of the well-rated formats listed in a TSV table.

    The table is tab-separated and its first line is a header.  Every other
    row is read as: format name, comma-separated extension list, pixels
    rating, metadata rating.  Each rating must start with an integer that may
    be followed by ``-`` and free text (for example ``"4 - Very good"``).

    A row contributes its extensions when both ratings are at least 3.  Only
    extension entries that start with ``.`` are kept.

    Args:
        filepath: Path of the TSV table to read.

    Returns:
        A list of extensions (each including the leading dot) in the order
        they are first encountered, without duplicates.

    Raises:
        OSError: If ``filepath`` cannot be opened.
    """
    log = logging.getLogger(__name__)
    extensions = []
    seen = set()
    # newline="" lets the csv module do its own line-ending handling.
    with open(filepath, newline="") as fd:
        rows = csv.reader(fd, delimiter="\t", quotechar='"')
        next(rows, None)  # skip header
        for row in rows:
            # Blank or truncated lines cannot be rated; ignore them instead
            # of failing on a missing column.
            if len(row) < 4:
                log.debug("Skipping incomplete row in %s: %r", filepath, row)
                continue
            try:
                pixels_quality = int(row[2].split("-")[0].strip())
                metadata_quality = int(row[3].split("-")[0].strip())
            except ValueError:
                log.warning("Skipping row with non-numeric rating in %s: %r",
                            filepath, row)
                continue
            if pixels_quality < 3 or metadata_quality < 3:
                continue
            for ext in row[1].split(","):
                ext = ext.strip()
                if ext.startswith(".") and ext not in seen:
                    seen.add(ext)
                    extensions.append(ext)
    return extensions
def get_info(obj):
    """Collect the JSON-serialisable attributes of ``obj`` as strings.

    Every name reported by ``dir(obj)`` is inspected.  Attributes named
    ``__weakref__``, ``__module__`` or ``__doc__`` are ignored, as are values
    whose type is a method, method-wrapper, builtin function, type, dict or
    ``None``.  Each remaining value is rendered with ``json.dumps``; values
    that cannot be serialised are left out.

    Args:
        obj: Any object.

    Returns:
        A dict mapping attribute name to the JSON text of its value.
    """
    skipped_names = {'__weakref__', '__module__', '__doc__'}
    skipped_types = {'method', 'method-wrapper', 'type', 'NoneType',
                     'builtin_function_or_method', 'dict'}
    attr = dict()
    for prop_name in dir(obj):
        if prop_name in skipped_names:
            continue
        prop_val = getattr(obj, prop_name)
        if type(prop_val).__name__ in skipped_types:
            continue
        try:
            # Serialise inside a one-element list and strip the brackets so
            # that multi-line values keep json's indentation.
            val_as_str = json.dumps([prop_val], indent=2)[1:-1]
        except (TypeError, ValueError, RecursionError):
            # Not representable as JSON: leave this attribute out.
            continue
        attr[prop_name] = val_as_str.strip()
    return attr
def parse_element(element):
    """Recursively convert an XML DOM node into nested dictionaries.

    Text and CDATA nodes yield ``{'data': <text>}``.  Element nodes yield
    their attributes as key/value pairs plus one entry per child node name;
    when several children share a name, the entry becomes a list holding one
    item per child, in document order.  Comment and processing-instruction
    children are skipped, since they carry neither attributes nor child
    nodes.

    Args:
        element: A DOM node, e.g. the document returned by
            ``xml.dom.minidom.parseString``.

    Returns:
        A ``(node_name, dict_data)`` tuple for ``element``.
    """
    dict_data = dict()
    if element.nodeType in (element.TEXT_NODE, element.CDATA_SECTION_NODE):
        dict_data['data'] = element.data
    # Only element nodes own an attribute map; on other node types
    # ``attributes`` is None and must not be iterated.
    if element.nodeType == element.ELEMENT_NODE and element.attributes is not None:
        for attr_name, attr_value in element.attributes.items():
            dict_data[attr_name] = attr_value
    if element.nodeType not in (element.TEXT_NODE, element.DOCUMENT_TYPE_NODE):
        ignored_children = (element.COMMENT_NODE,
                            element.PROCESSING_INSTRUCTION_NODE)
        for child in element.childNodes:
            if child.nodeType in ignored_children:
                continue
            child_name, child_dict = parse_element(child)
            if child_name not in dict_data:
                dict_data[child_name] = child_dict
                continue
            existing = dict_data[child_name]
            if isinstance(existing, list):
                existing.append(child_dict)
            else:
                # Second occurrence of this name: promote the entry to a list.
                dict_data[child_name] = [existing, child_dict]
    return element.nodeName, dict_data
class BioformatExtractor(Extractor):
    """Extractor that reads OME-XML metadata from a file and uploads it.

    Only files whose extension is in the list returned by
    ``get_good_bioformats()`` are processed.
    """

    def __init__(self):
        """Run the base set-up, load the accepted extensions and set log levels."""
        Extractor.__init__(self)

        # add any additional arguments to parser
        # self.parser.add_argument('--max', '-m', type=int, nargs='?', default=-1,
        #                          help='maximum number (default=-1)')

        # parse command line and load default logging configuration
        self.setup()

        self.bioformat_extensions = get_good_bioformats()

        # setup logging for the extractor
        logging.getLogger('pyclowder').setLevel(logging.DEBUG)
        logging.getLogger('__main__').setLevel(logging.DEBUG)

    def process_message(self, connector, host, secret_key, resource, parameters):
        """Extract metadata from the resource's first local file and upload it.

        Args:
            connector: Passed through to ``pyclowder.files.upload_metadata``.
            host: Passed to ``self.get_metadata`` and to the upload call.
            secret_key: Passed through to the upload call.
            resource: Mapping providing ``"local_paths"`` (the first entry is
                the file to read) and ``"id"`` (the file id).
            parameters: Not used by this method.

        Any exception raised while reading or uploading is logged and not
        re-raised.
        """
        logger = logging.getLogger(__name__)
        inputfile = resource["local_paths"][0]
        file_id = resource['id']
        file_extension = os.path.splitext(inputfile)[1]

        if file_extension not in self.bioformat_extensions:
            logger.error(f"File format not supprted, Supported list:{self.bioformat_extensions}")
            return

        # this part handles the metadata
        try:
            javabridge.attach()
            omexmlstr = bioformats.get_omexml_metadata(inputfile)
            if omexmlstr:
                dom = parseString(omexmlstr)
                if dom:
                    (__, result) = parse_element(dom)
                    logger.debug(result)
                    # NOTE(review): get_metadata is not defined in this class;
                    # presumably inherited from Extractor - confirm.
                    metadata = self.get_metadata(result, 'file', file_id, host)
                    logger.debug(metadata)

                    # upload metadata
                    pyclowder.files.upload_metadata(connector, host, secret_key, file_id, metadata)
                else:
                    logger.debug("Cannot parse xml")
            else:
                logger.debug("Bioformat cannot read the file")
        except Exception:
            # logger.exception records the message together with the traceback;
            # passing the exception as a stray positional argument made the
            # logging call itself fail to format.
            logger.exception("Error getting metadata from file")
        finally:
            javabridge.detach()
if __name__ == "__main__":
    # Shut the Java VM down even when the extractor fails to start or exits
    # with an exception, so it is not left running.
    try:
        extractor = BioformatExtractor()
        extractor.start()
    finally:
        javabridge.kill_vm()