Skip to content

Commit

Permalink
Merge pull request #284 from datalad/issue-283-adapt-to-thin-datalad
Browse files Browse the repository at this point in the history
Move the extractors that were removed from datalad-core to metalad
  • Loading branch information
christian-monch authored Sep 16, 2022
2 parents b47b593 + 332a427 commit a4c8519
Show file tree
Hide file tree
Showing 34 changed files with 1,721 additions and 13 deletions.
2 changes: 1 addition & 1 deletion datalad_metalad/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@
from datalad.interface.base import Interface
from datalad.interface.base import build_doc
from datalad.interface.utils import eval_results
from datalad.metadata.extractors.base import BaseMetadataExtractor
from datalad.support.annexrepo import AnnexRepo
from datalad.ui import ui

from .extractors.base import (
BaseMetadataExtractor,
DataOutputCategory,
DatasetMetadataExtractor,
FileInfo,
Expand Down
4 changes: 1 addition & 3 deletions datalad_metalad/extractors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ def get_state(self, dataset):

# XXX this is the legacy-legacy interface, keep around for a bit more and then
# remove
class BaseMetadataExtractor(metaclass=abc.ABCMeta):
class BaseMetadataExtractor:

NEEDS_CONTENT = True # majority of the extractors need data content

Expand Down Expand Up @@ -366,7 +366,6 @@ def get_metadata(self, dataset=True, content=True):
self._get_dataset_metadata() if dataset else None, \
((k, v) for k, v in self._get_content_metadata()) if content else None

@abc.abstractmethod
def _get_dataset_metadata(self):
"""
Returns
Expand All @@ -376,7 +375,6 @@ def _get_dataset_metadata(self):
"""
raise NotImplementedError

@abc.abstractmethod
def _get_content_metadata(self):
"""Get ALL metadata for all dataset content.
Expand Down
17 changes: 17 additions & 0 deletions datalad_metalad/extractors/legacy/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Legacy metadata extractors"""

from os.path import join

from datalad.consts import DATALAD_DOTDIR

METADATA_DIR = join(DATALAD_DOTDIR, 'metadata')
DATASET_METADATA_FILE = join(METADATA_DIR, 'dataset.json')
DATASET_CONFIG_FILE = join(DATALAD_DOTDIR, 'config')
76 changes: 76 additions & 0 deletions datalad_metalad/extractors/legacy/annex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Metadata extractor for Git-annex metadata"""

from ..base import BaseMetadataExtractor

import logging
lgr = logging.getLogger('datalad.metadata.extractors.annexmeta')
from datalad.log import log_progress

from datalad.support.annexrepo import AnnexRepo
# use main version as core version

# this must stay, despite being a seemingly unused import, each extractor defines a version
from .definitions import version as vocabulary_version


class AnnexMetadataExtractor(BaseMetadataExtractor):
    """Report git-annex metadata for each queried file."""

    # metadata lives in the annex branch, file content is not required
    NEEDS_CONTENT = False

    def _get_dataset_metadata(self):
        """This extractor reports no dataset-level metadata."""
        return {}

    def _get_content_metadata(self):
        """Yield ``(path, metadata_dict)`` pairs from git-annex metadata."""
        log_progress(
            lgr.info,
            'extractorannex',
            'Start annex metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='Annex metadata extraction',
            unit=' Files',
        )
        annex = self.ds.repo  # OPT: .repo could be relatively expensive
        if not isinstance(annex, AnnexRepo):
            # plain Git repository -- no annex metadata to report
            log_progress(
                lgr.info,
                'extractorannex',
                'Finished annex metadata extraction from %s', self.ds
            )
            return

        # with a very long list of paths, query everything and filter via a
        # set instead of handing all paths to git-annex on the command line
        restrict_to = None
        if self.paths and sum(len(p) for p in self.paths) > 500000:
            restrict_to = set(self.paths)
        query = self.paths if self.paths and restrict_to is None else '.'
        for fpath, fmeta in annex.get_metadata(query):
            if fpath.startswith('.datalad') or restrict_to and fpath not in restrict_to:
                # do not report on our own internal annexed files (e.g. metadata blobs)
                continue
            log_progress(
                lgr.info,
                'extractorannex',
                'Extracted annex metadata from %s', fpath,
                update=1,
                increment=True)
            # unwrap single-item value lists for compactness
            fmeta = {k: (v[0] if isinstance(v, list) and len(v) == 1 else v)
                     for k, v in fmeta.items()}
            annex_key = annex.get_file_annexinfo(fpath).get('key')
            if annex_key:
                fmeta['key'] = annex_key
            yield (fpath, fmeta)
        # we need to make sure that batch processes are terminated
        # otherwise they might cause trouble on windows
        annex.precommit()
        log_progress(
            lgr.info,
            'extractorannex',
            'Finished annex metadata extraction from %s', self.ds
        )
97 changes: 97 additions & 0 deletions datalad_metalad/extractors/legacy/audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Audio metadata extractor"""
from __future__ import absolute_import

from os.path import join as opj
import logging
lgr = logging.getLogger('datalad.metadata.extractors.audio')
from datalad.log import log_progress

from mutagen import File as audiofile

from .definitions import vocabulary_id
from ..base import BaseMetadataExtractor


# how properties reported by mutagen map onto our vocabulary;
# properties not listed here are passed through with their mutagen name
vocab_map = {
    'album': 'music:album',
    'artist': 'music:artist',
    'channels': 'music:channels',
    'composer': 'music:Composer',
    'copyright': 'dcterms:rights',
    'genre': 'music:Genre',
    'length': 'duration(s)',
    'sample_rate': 'music:sample_rate',
    'title': 'name',
}


class AudioMetadataExtractor(BaseMetadataExtractor):
    """Extract audio (e.g. ID3) metadata from files via mutagen."""

    # bitrate varies per file and is not useful as a unique dataset property
    _unique_exclude = {'bitrate'}

    def get_metadata(self, dataset, content):
        """Return ``(dataset_metadata, content_metadata)`` for queried paths."""
        if not content:
            # without content access there is nothing this extractor can do
            return {}, []
        log_progress(
            lgr.info,
            'extractoraudio',
            'Start audio metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='audio metadata extraction',
            unit=' Files',
        )
        per_file_meta = []
        for relpath in self.paths:
            fullpath = opj(self.ds.path, relpath)
            log_progress(
                lgr.info,
                'extractoraudio',
                'Extract audio metadata from %s', fullpath,
                update=1,
                increment=True)
            audio = audiofile(fullpath, easy=True)
            if audio is None:
                # not a (supported) audio file
                continue
            meta = {}
            for tag in audio:
                tagval = audio[tag]
                # unwrap single-item value lists for compactness
                if isinstance(tagval, list) and len(tagval) == 1:
                    tagval = tagval[0]
                meta[vocab_map.get(tag, tag)] = tagval
            if hasattr(audio, 'mime') and len(audio.mime):
                meta['format'] = 'mime:{}'.format(audio.mime[0])
            for prop in ('length', 'channels', 'bitrate', 'sample_rate'):
                if not hasattr(audio.info, prop):
                    continue
                propval = getattr(audio.info, prop)
                if prop == 'length':
                    # duration comes in seconds, cap at millisecond level
                    propval = round(propval, 3)
                meta[vocab_map.get(prop, prop)] = propval
            per_file_meta.append((relpath, meta))

        log_progress(
            lgr.info,
            'extractoraudio',
            'Finished audio metadata extraction from %s', self.ds
        )
        dsmeta = {
            '@context': {
                'music': {
                    '@id': 'http://purl.org/ontology/mo/',
                    'description': 'Music Ontology with main concepts and properties for describing music',
                    'type': vocabulary_id,
                },
                'duration(s)': {
                    "@id": 'time:Duration',
                    "unit": "uo:0000010",
                    'unit_label': 'second',
                },
            },
        }
        return dsmeta, per_file_meta
101 changes: 101 additions & 0 deletions datalad_metalad/extractors/legacy/datacite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Extractor for datacite xml records, currently for CRCNS datasets
"""

import re
import os.path as op
from collections import OrderedDict
import logging
lgr = logging.getLogger('datalad.metadata.extractors.datacite')

try:
import xml.etree.cElementTree as ET
except ImportError:
import xml.etree.ElementTree as ET

from ..base import BaseMetadataExtractor


def _merge(iterable):
"""Merge multiple items into a single one separating with a newline"""
return "\n".join(iterable)


def _unwrap(text):
"""Basic unwrapping of text separated by newlines"""
return re.sub(r'\n\s*', ' ', text)


def _process_tree(tree, nstag):
    """Map a datacite XML tree onto a flat record in our standard vocabulary.

    ``nstag`` must turn a bare tag name into a namespaced XPath expression.
    Fields whose elements (or text) are absent are silently skipped.
    """
    rec = OrderedDict()
    # (record key, datacite tag/xpath, collect-all?, per-item transform,
    #  whole-value transform)
    field_specs = [
        ('author', 'creatorName', True, None, None),
        ('name', "title[@titleType='AlternativeTitle']", False, None, None),
        # actually it seems we have no title but "ShortDescription"!!! TODO
        #('title', "title", False, _unwrap, None),
        ('shortdescription', "title", False, _unwrap, None),
        ('description', 'description', True, _unwrap, _merge),
        ('version', 'version', False, None, None),
        ('sameas', "identifier[@identifierType='DOI']", False, None, None),
        # conflicts with our notion for having a "type" to be internal and to demarkate a Dataset
        # here might include the field e.g. Dataset/Neurophysiology, so skipping for now
        # ('type', "resourceType[@resourceTypeGeneral='Dataset']", False, None, None),
        ('citation', "relatedIdentifier", True, None, None),
        ('tag', "subject", True, None, None),
        ('formats', "format", True, None, None),
    ]
    for key, raw_tag, multi, item_trans, combine in field_specs:
        transform = item_trans if item_trans else (lambda x: x)

        def extract_text(node):
            # raises AttributeError for a missing node or missing text,
            # which is how absent fields are skipped below
            return transform(node.text.strip())

        xpath = nstag(raw_tag)
        try:
            if multi:
                value = [extract_text(n) for n in tree.findall(xpath)]
            else:
                value = extract_text(tree.find(xpath))
        except AttributeError:
            continue
        if not value or value == ['']:
            continue
        if combine:
            value = combine(value)
        rec[key] = value
    return rec


class DataciteMetadataExtractor(BaseMetadataExtractor):
    """Extract dataset-level metadata from a ``meta.datacite.xml`` record."""

    def _get_dataset_metadata(self):
        """Parse the first matching datacite XML file into a metadata dict.

        Returns an empty dict when no ``meta.datacite.xml`` is present.
        """
        canonical = op.join(self.ds.path, '.datalad', 'meta.datacite.xml')

        # look for the first matching filename and go with it
        fname = [canonical] if op.lexists(canonical) else \
            [op.join(self.ds.path, f) for f in self.paths
             if op.basename(f) == 'meta.datacite.xml']
        if not fname or not op.lexists(fname[0]):
            return {}
        fname = fname[0]
        # those namespaces are a b.ch
        # TODO: avoid reading file twice
        # FIX: close the file deterministically -- the previous code handed an
        # open() result to iterparse() and never closed it (handle leak,
        # ResourceWarning, and file-lock trouble on Windows)
        with open(fname) as xml_file:
            namespaces = dict(
                node for _, node in ET.iterparse(
                    xml_file, events=('start-ns',)
                )
            )
        ns = namespaces['']

        def nstag(tag):
            return './/{%s}%s' % (ns, tag)

        tree = ET.ElementTree(file=fname)
        return _process_tree(tree, nstag)

    def _get_content_metadata(self):
        return []  # no content metadata provided
Loading

0 comments on commit a4c8519

Please sign in to comment.