-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #284 from datalad/issue-283-adapt-to-thin-datalad
Move the extractors that were removed from datalad-core to metalad
- Loading branch information
Showing
34 changed files
with
1,721 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the datalad package for the
#   copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Legacy metadata extractors"""

from os.path import join

from datalad.consts import DATALAD_DOTDIR

# directory (relative to a dataset root, inside the DataLad dot-dir) where
# aggregated metadata is stored
METADATA_DIR = join(DATALAD_DOTDIR, 'metadata')
# JSON file with dataset-level metadata, located inside METADATA_DIR
DATASET_METADATA_FILE = join(METADATA_DIR, 'dataset.json')
# dataset-scope configuration file inside the DataLad dot-dir
DATASET_CONFIG_FILE = join(DATALAD_DOTDIR, 'config')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- | ||
# ex: set sts=4 ts=4 sw=4 et: | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
# | ||
# See COPYING file distributed along with the datalad package for the | ||
# copyright and license terms. | ||
# | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
"""Metadata extractor for Git-annex metadata""" | ||
|
||
from ..base import BaseMetadataExtractor | ||
|
||
import logging | ||
lgr = logging.getLogger('datalad.metadata.extractors.annexmeta') | ||
from datalad.log import log_progress | ||
|
||
from datalad.support.annexrepo import AnnexRepo | ||
# use main version as core version | ||
|
||
# this must stay, despite being a seemingly unused import, each extractor defines a version | ||
from .definitions import version as vocabulary_version | ||
|
||
|
||
class AnnexMetadataExtractor(BaseMetadataExtractor):
    """Report per-file metadata stored in git-annex."""

    # metadata lives in the annex branch; file content is not required
    NEEDS_CONTENT = False

    def _get_dataset_metadata(self):
        """This extractor reports no dataset-level metadata."""
        return {}

    def _get_content_metadata(self):
        """Yield ``(path, metadata_dict)`` for annexed files.

        Single-element metadata value lists are unwrapped to scalars, and
        each file's annex key is added under ``'key'`` when available.
        """
        log_progress(
            lgr.info,
            'extractorannex',
            'Start annex metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='Annex metadata extraction',
            unit=' Files',
        )
        repo = self.ds.repo  # OPT: .repo could be relatively expensive
        if not isinstance(repo, AnnexRepo):
            # plain git repo: there is no annex metadata to report
            log_progress(
                lgr.info,
                'extractorannex',
                'Finished annex metadata extraction from %s', self.ds
            )
            return

        # when the combined path list is huge, avoid passing it to git-annex
        # as arguments; query everything and filter locally instead
        wanted = None
        if self.paths and sum(len(p) for p in self.paths) > 500000:
            wanted = set(self.paths)
        query = self.paths if self.paths and wanted is None else '.'
        for fpath, fmeta in repo.get_metadata(query):
            if fpath.startswith('.datalad') or wanted and fpath not in wanted:
                # do not report on our own internal annexed files (e.g. metadata blobs)
                continue
            log_progress(
                lgr.info,
                'extractorannex',
                'Extracted annex metadata from %s', fpath,
                update=1,
                increment=True)
            # unwrap single-item value lists to plain scalars
            fmeta = {
                field: vals[0] if isinstance(vals, list) and len(vals) == 1 else vals
                for field, vals in fmeta.items()
            }
            annex_key = repo.get_file_annexinfo(fpath).get('key')
            if annex_key:
                fmeta['key'] = annex_key
            yield (fpath, fmeta)
        # we need to make sure that batch processes are terminated
        # otherwise they might cause trouble on windows
        repo.precommit()
        log_progress(
            lgr.info,
            'extractorannex',
            'Finished annex metadata extraction from %s', self.ds
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- | ||
# ex: set sts=4 ts=4 sw=4 et: | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
# | ||
# See COPYING file distributed along with the datalad package for the | ||
# copyright and license terms. | ||
# | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
"""Audio metadata extractor""" | ||
from __future__ import absolute_import | ||
|
||
from os.path import join as opj | ||
import logging | ||
lgr = logging.getLogger('datalad.metadata.extractors.audio') | ||
from datalad.log import log_progress | ||
|
||
from mutagen import File as audiofile | ||
|
||
from .definitions import vocabulary_id | ||
from ..base import BaseMetadataExtractor | ||
|
||
|
||
# how properties reported by mutagen map onto our vocabulary
# (keys not listed here pass through unchanged via vocab_map.get(k, k))
vocab_map = {
    'album': 'music:album',
    'artist': 'music:artist',
    'channels': 'music:channels',
    'composer': 'music:Composer',
    'copyright': 'dcterms:rights',
    'genre': 'music:Genre',
    # duration is reported in seconds (rounded to ms by the extractor)
    'length': 'duration(s)',
    'sample_rate': 'music:sample_rate',
    # mutagen's 'title' maps onto the generic 'name' term
    'title': 'name',
}
|
||
|
||
class AudioMetadataExtractor(BaseMetadataExtractor):
    """Extract audio metadata (tags and stream properties) via mutagen."""

    # bitrate differs between encodings of identical content; keep it out of
    # unique-value aggregation
    _unique_exclude = {'bitrate'}

    def get_metadata(self, dataset, content):
        """Return a ``(dataset_metadata, content_metadata)`` pair.

        ``dataset_metadata`` carries the vocabulary context; per-file
        metadata is a list of ``(path, metadata_dict)`` tuples.
        """
        if not content:
            # without file content there is nothing mutagen could parse
            return {}, []
        log_progress(
            lgr.info,
            'extractoraudio',
            'Start audio metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='audio metadata extraction',
            unit=' Files',
        )
        contentmeta = []
        for relpath in self.paths:
            fullpath = opj(self.ds.path, relpath)
            log_progress(
                lgr.info,
                'extractoraudio',
                'Extract audio metadata from %s', fullpath,
                update=1,
                increment=True)
            info = audiofile(fullpath, easy=True)
            if info is None:
                # not a (supported) audio file
                continue
            meta = {}
            # map tag names into our vocabulary; unwrap single-item lists
            for tag in info:
                value = info[tag]
                if isinstance(value, list) and len(value) == 1:
                    value = value[0]
                meta[vocab_map.get(tag, tag)] = value
            if hasattr(info, 'mime') and len(info.mime):
                meta['format'] = 'mime:{}'.format(info.mime[0])
            # stream-level properties live on info.info
            for prop in ('length', 'channels', 'bitrate', 'sample_rate'):
                if hasattr(info.info, prop):
                    propval = getattr(info.info, prop)
                    if prop == 'length':
                        # duration comes in seconds, cap at millisecond level
                        propval = round(propval, 3)
                    meta[vocab_map.get(prop, prop)] = propval
            contentmeta.append((relpath, meta))

        log_progress(
            lgr.info,
            'extractoraudio',
            'Finished audio metadata extraction from %s', self.ds
        )
        dsmeta = {
            '@context': {
                'music': {
                    '@id': 'http://purl.org/ontology/mo/',
                    'description': 'Music Ontology with main concepts and properties for describing music',
                    'type': vocabulary_id,
                },
                'duration(s)': {
                    "@id": 'time:Duration',
                    "unit": "uo:0000010",
                    'unit_label': 'second',
                },
            },
        }
        return dsmeta, contentmeta
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- | ||
# ex: set sts=4 ts=4 sw=4 et: | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
# | ||
# See COPYING file distributed along with the datalad package for the | ||
# copyright and license terms. | ||
# | ||
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## | ||
"""Extractor for datacite xml records, currently for CRCNS datasets | ||
""" | ||
|
||
import re | ||
import os.path as op | ||
from collections import OrderedDict | ||
import logging | ||
lgr = logging.getLogger('datalad.metadata.extractors.datacite') | ||
|
||
try: | ||
import xml.etree.cElementTree as ET | ||
except ImportError: | ||
import xml.etree.ElementTree as ET | ||
|
||
from ..base import BaseMetadataExtractor | ||
|
||
|
||
def _merge(iterable): | ||
"""Merge multiple items into a single one separating with a newline""" | ||
return "\n".join(iterable) | ||
|
||
|
||
def _unwrap(text): | ||
"""Basic unwrapping of text separated by newlines""" | ||
return re.sub(r'\n\s*', ' ', text) | ||
|
||
|
||
def _process_tree(tree, nstag):
    """Process XML tree for a record and return a dictionary for our standard
    """
    rec = OrderedDict()
    # each rule: output key, XML tag, collect-all flag, per-item transform,
    # whole-value transform
    rules = [
        ('author', 'creatorName', True, None, None),
        ('name', "title[@titleType='AlternativeTitle']", False, None, None),
        # actually it seems we have no title but "ShortDescription"!!! TODO
        #('title', "title", False, _unwrap, None),
        ('shortdescription', "title", False, _unwrap, None),
        ('description', 'description', True, _unwrap, _merge),
        ('version', 'version', False, None, None),
        ('sameas', "identifier[@identifierType='DOI']", False, None, None),
        # conflicts with our notion for having a "type" to be internal and to demarkate a Dataset
        # here might include the field e.g. Dataset/Neurophysiology, so skipping for now
        # ('type', "resourceType[@resourceTypeGeneral='Dataset']", False, None, None),
        ('citation', "relatedIdentifier", True, None, None),
        ('tag', "subject", True, None, None),
        ('formats', "format", True, None, None),
    ]
    for key, tagname, getall, item_trans, value_trans in rules:
        transform = item_trans if item_trans else (lambda x: x)

        # bind `transform` as a default argument so each rule uses its own
        def text(node, _t=transform):
            return _t(node.text.strip())

        tag = nstag(tagname)
        try:
            if getall:
                value = [text(n) for n in tree.findall(tag)]
            else:
                value = text(tree.find(tag))
        except AttributeError:
            # tag absent (find() returned None) or element had no text
            continue
        if not value or value == ['']:
            continue
        if value_trans:
            value = value_trans(value)
        rec[key] = value
    return rec
|
||
|
||
class DataciteMetadataExtractor(BaseMetadataExtractor):
    """Extract dataset metadata from a ``meta.datacite.xml`` record."""

    def _get_dataset_metadata(self):
        """Parse the dataset's datacite XML record into a metadata dict.

        Prefers ``.datalad/meta.datacite.xml`` in the dataset root; otherwise
        takes the first ``meta.datacite.xml`` found among ``self.paths``.
        Returns an empty dict when no such file exists.
        """
        canonical = op.join(self.ds.path, '.datalad', 'meta.datacite.xml')

        # look for the first matching filename and go with it
        fname = [canonical] if op.lexists(canonical) else \
            [op.join(self.ds.path, f) for f in self.paths
             if op.basename(f) == 'meta.datacite.xml']
        if not fname or not op.lexists(fname[0]):
            return {}
        fname = fname[0]
        # XML namespaces complicate tag matching; harvest the declared
        # prefix -> URI mappings from the document itself
        # TODO: avoid reading file twice
        # use a context manager so the handle is closed deterministically;
        # the previous bare open() leaked it (can keep the file locked on
        # windows)
        with open(fname) as xmlfile:
            namespaces = dict(
                node for _, node in ET.iterparse(
                    xmlfile, events=('start-ns',)
                )
            )
        # default (unprefixed) namespace of the record
        ns = namespaces['']

        def nstag(tag):
            # namespace-qualified "match anywhere" XPath for `tag`
            return './/{%s}%s' % (ns, tag)

        tree = ET.ElementTree(file=fname)
        return _process_tree(tree, nstag)

    def _get_content_metadata(self):
        """This extractor reports no per-file metadata."""
        return []  # no content metadata provided
Oops, something went wrong.