-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #284 from datalad/issue-283-adapt-to-thin-datalad
Move the extractors that were removed from datalad-core to metalad
- Loading branch information
Showing
34 changed files
with
1,721 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the datalad package for the
#   copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Legacy metadata extractors"""

from os.path import join

from datalad.consts import DATALAD_DOTDIR

# directory (relative to a dataset root, inside the DataLad dot-dir) where
# aggregated metadata is stored
METADATA_DIR = join(DATALAD_DOTDIR, 'metadata')
# JSON file with dataset-level metadata, located inside METADATA_DIR
DATASET_METADATA_FILE = join(METADATA_DIR, 'dataset.json')
# dataset-scope configuration file inside the DataLad dot-dir
DATASET_CONFIG_FILE = join(DATALAD_DOTDIR, 'config')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- | ||
# ex: set sts=4 ts=4 sw=4 et: | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
# | ||
# See COPYING file distributed along with the datalad package for the | ||
# copyright and license terms. | ||
# | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
"""Metadata extractor for Git-annex metadata""" | ||
|
||
from ..base import BaseMetadataExtractor | ||
|
||
import logging | ||
lgr = logging.getLogger('datalad.metadata.extractors.annexmeta') | ||
from datalad.log import log_progress | ||
|
||
from datalad.support.annexrepo import AnnexRepo | ||
# use main version as core version | ||
|
||
# this must stay, despite being a seemingly unused import, each extractor defines a version | ||
from .definitions import version as vocabulary_version | ||
|
||
|
||
class AnnexMetadataExtractor(BaseMetadataExtractor):
    """Report per-file metadata stored in git-annex."""

    # metadata lives in the annex branch; file content is not required
    NEEDS_CONTENT = False

    def _get_dataset_metadata(self):
        """This extractor reports no dataset-level metadata."""
        return {}

    def _get_content_metadata(self):
        """Yield ``(path, metadata_dict)`` for annexed files.

        Single-element metadata value lists are unwrapped to scalars, and
        each file's annex key is added under ``'key'`` when available.
        """
        log_progress(
            lgr.info,
            'extractorannex',
            'Start annex metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='Annex metadata extraction',
            unit=' Files',
        )
        repo = self.ds.repo  # OPT: .repo could be relatively expensive
        if not isinstance(repo, AnnexRepo):
            # plain git repo: there is no annex metadata to report
            log_progress(
                lgr.info,
                'extractorannex',
                'Finished annex metadata extraction from %s', self.ds
            )
            return

        # when the combined path list is huge, avoid passing it to git-annex
        # as arguments; query everything and filter locally instead
        wanted = None
        if self.paths and sum(len(p) for p in self.paths) > 500000:
            wanted = set(self.paths)
        query = self.paths if self.paths and wanted is None else '.'
        for fpath, fmeta in repo.get_metadata(query):
            if fpath.startswith('.datalad') or wanted and fpath not in wanted:
                # do not report on our own internal annexed files (e.g. metadata blobs)
                continue
            log_progress(
                lgr.info,
                'extractorannex',
                'Extracted annex metadata from %s', fpath,
                update=1,
                increment=True)
            # unwrap single-item value lists to plain scalars
            fmeta = {
                field: vals[0] if isinstance(vals, list) and len(vals) == 1 else vals
                for field, vals in fmeta.items()
            }
            annex_key = repo.get_file_annexinfo(fpath).get('key')
            if annex_key:
                fmeta['key'] = annex_key
            yield (fpath, fmeta)
        # we need to make sure that batch processes are terminated
        # otherwise they might cause trouble on windows
        repo.precommit()
        log_progress(
            lgr.info,
            'extractorannex',
            'Finished annex metadata extraction from %s', self.ds
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- | ||
# ex: set sts=4 ts=4 sw=4 et: | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
# | ||
# See COPYING file distributed along with the datalad package for the | ||
# copyright and license terms. | ||
# | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
"""Audio metadata extractor""" | ||
from __future__ import absolute_import | ||
|
||
from os.path import join as opj | ||
import logging | ||
lgr = logging.getLogger('datalad.metadata.extractors.audio') | ||
from datalad.log import log_progress | ||
|
||
from mutagen import File as audiofile | ||
|
||
from .definitions import vocabulary_id | ||
from ..base import BaseMetadataExtractor | ||
|
||
|
||
# how properties reported by mutagen map onto our vocabulary
# (keys not listed here pass through unchanged via vocab_map.get(k, k))
vocab_map = {
    'album': 'music:album',
    'artist': 'music:artist',
    'channels': 'music:channels',
    'composer': 'music:Composer',
    'copyright': 'dcterms:rights',
    'genre': 'music:Genre',
    # duration is reported in seconds (rounded to ms by the extractor)
    'length': 'duration(s)',
    'sample_rate': 'music:sample_rate',
    # mutagen's 'title' maps onto the generic 'name' term
    'title': 'name',
}
|
||
|
||
class AudioMetadataExtractor(BaseMetadataExtractor):
    """Extract audio metadata (tags and stream properties) via mutagen."""

    # bitrate differs between encodings of identical content; keep it out of
    # unique-value aggregation
    _unique_exclude = {'bitrate'}

    def get_metadata(self, dataset, content):
        """Return a ``(dataset_metadata, content_metadata)`` pair.

        ``dataset_metadata`` carries the vocabulary context; per-file
        metadata is a list of ``(path, metadata_dict)`` tuples.
        """
        if not content:
            # without file content there is nothing mutagen could parse
            return {}, []
        log_progress(
            lgr.info,
            'extractoraudio',
            'Start audio metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='audio metadata extraction',
            unit=' Files',
        )
        contentmeta = []
        for relpath in self.paths:
            fullpath = opj(self.ds.path, relpath)
            log_progress(
                lgr.info,
                'extractoraudio',
                'Extract audio metadata from %s', fullpath,
                update=1,
                increment=True)
            info = audiofile(fullpath, easy=True)
            if info is None:
                # not a (supported) audio file
                continue
            meta = {}
            # map tag names into our vocabulary; unwrap single-item lists
            for tag in info:
                value = info[tag]
                if isinstance(value, list) and len(value) == 1:
                    value = value[0]
                meta[vocab_map.get(tag, tag)] = value
            if hasattr(info, 'mime') and len(info.mime):
                meta['format'] = 'mime:{}'.format(info.mime[0])
            # stream-level properties live on info.info
            for prop in ('length', 'channels', 'bitrate', 'sample_rate'):
                if hasattr(info.info, prop):
                    propval = getattr(info.info, prop)
                    if prop == 'length':
                        # duration comes in seconds, cap at millisecond level
                        propval = round(propval, 3)
                    meta[vocab_map.get(prop, prop)] = propval
            contentmeta.append((relpath, meta))

        log_progress(
            lgr.info,
            'extractoraudio',
            'Finished audio metadata extraction from %s', self.ds
        )
        dsmeta = {
            '@context': {
                'music': {
                    '@id': 'http://purl.org/ontology/mo/',
                    'description': 'Music Ontology with main concepts and properties for describing music',
                    'type': vocabulary_id,
                },
                'duration(s)': {
                    "@id": 'time:Duration',
                    "unit": "uo:0000010",
                    'unit_label': 'second',
                },
            },
        }
        return dsmeta, contentmeta
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- | ||
# ex: set sts=4 ts=4 sw=4 et: | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
# | ||
# See COPYING file distributed along with the datalad package for the | ||
# copyright and license terms. | ||
# | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
"""Extractor for datacite xml records, currently for CRCNS datasets | ||
""" | ||
|
||
import re | ||
import os.path as op | ||
from collections import OrderedDict | ||
import logging | ||
lgr = logging.getLogger('datalad.metadata.extractors.datacite') | ||
|
||
try: | ||
import xml.etree.cElementTree as ET | ||
except ImportError: | ||
import xml.etree.ElementTree as ET | ||
|
||
from ..base import BaseMetadataExtractor | ||
|
||
|
||
def _merge(iterable): | ||
"""Merge multiple items into a single one separating with a newline""" | ||
return "\n".join(iterable) | ||
|
||
|
||
def _unwrap(text): | ||
"""Basic unwrapping of text separated by newlines""" | ||
return re.sub(r'\n\s*', ' ', text) | ||
|
||
|
||
def _process_tree(tree, nstag):
    """Process XML tree for a record and return a dictionary for our standard
    """
    rec = OrderedDict()
    # each rule: output key, XML tag, collect-all flag, per-item transform,
    # whole-value transform
    rules = [
        ('author', 'creatorName', True, None, None),
        ('name', "title[@titleType='AlternativeTitle']", False, None, None),
        # actually it seems we have no title but "ShortDescription"!!! TODO
        #('title', "title", False, _unwrap, None),
        ('shortdescription', "title", False, _unwrap, None),
        ('description', 'description', True, _unwrap, _merge),
        ('version', 'version', False, None, None),
        ('sameas', "identifier[@identifierType='DOI']", False, None, None),
        # conflicts with our notion for having a "type" to be internal and to demarkate a Dataset
        # here might include the field e.g. Dataset/Neurophysiology, so skipping for now
        # ('type', "resourceType[@resourceTypeGeneral='Dataset']", False, None, None),
        ('citation', "relatedIdentifier", True, None, None),
        ('tag', "subject", True, None, None),
        ('formats', "format", True, None, None),
    ]
    for key, tagname, getall, item_trans, value_trans in rules:
        transform = item_trans if item_trans else (lambda x: x)

        # bind `transform` as a default argument so each rule uses its own
        def text(node, _t=transform):
            return _t(node.text.strip())

        tag = nstag(tagname)
        try:
            if getall:
                value = [text(n) for n in tree.findall(tag)]
            else:
                value = text(tree.find(tag))
        except AttributeError:
            # tag absent (find() returned None) or element had no text
            continue
        if not value or value == ['']:
            continue
        if value_trans:
            value = value_trans(value)
        rec[key] = value
    return rec
|
||
|
||
class DataciteMetadataExtractor(BaseMetadataExtractor):
    """Extract dataset metadata from a ``meta.datacite.xml`` record."""

    def _get_dataset_metadata(self):
        """Parse the dataset's datacite XML record into a metadata dict.

        Prefers ``.datalad/meta.datacite.xml`` in the dataset root; otherwise
        takes the first ``meta.datacite.xml`` found among ``self.paths``.
        Returns an empty dict when no such file exists.
        """
        canonical = op.join(self.ds.path, '.datalad', 'meta.datacite.xml')

        # look for the first matching filename and go with it
        fname = [canonical] if op.lexists(canonical) else \
            [op.join(self.ds.path, f) for f in self.paths
             if op.basename(f) == 'meta.datacite.xml']
        if not fname or not op.lexists(fname[0]):
            return {}
        fname = fname[0]
        # XML namespaces complicate tag matching; harvest the declared
        # prefix -> URI mappings from the document itself
        # TODO: avoid reading file twice
        # use a context manager so the handle is closed deterministically;
        # the previous bare open() leaked it (can keep the file locked on
        # windows)
        with open(fname) as xmlfile:
            namespaces = dict(
                node for _, node in ET.iterparse(
                    xmlfile, events=('start-ns',)
                )
            )
        # default (unprefixed) namespace of the record
        ns = namespaces['']

        def nstag(tag):
            # namespace-qualified "match anywhere" XPath for `tag`
            return './/{%s}%s' % (ns, tag)

        tree = ET.ElementTree(file=fname)
        return _process_tree(tree, nstag)

    def _get_content_metadata(self):
        """This extractor reports no per-file metadata."""
        return []  # no content metadata provided
Oops, something went wrong.