Skip to content

Commit

Permalink
salvage extractors removed from datalad-core
Browse files Browse the repository at this point in the history
This commit adds to datalad-metalad the extractors,
together with their tests, that were removed from
datalad core in PR datalad/datalad#7014. This is
done to keep them available with only `datalad`
and `datalad-metalad` installed.
  • Loading branch information
christian-monch committed Sep 8, 2022
1 parent 3a953fc commit 492c228
Show file tree
Hide file tree
Showing 24 changed files with 132 additions and 247 deletions.
56 changes: 1 addition & 55 deletions datalad_metalad/extractors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,58 +42,6 @@ class ExtractorResult:
immediate_data: Optional[Dict[str, Any]] = None


# Legacy extractor base from datalad-core
class BaseMetadataExtractor(object):

NEEDS_CONTENT = True # majority of the extractors need data content

def __init__(self, ds, paths):
"""
Parameters
----------
ds : dataset instance
Dataset to extract metadata from.
paths : list
Paths to investigate when extracting content metadata
"""

self.ds = ds
self.paths = paths

def get_metadata(self, dataset=True, content=True):
"""
Returns
-------
dict or None, dict or None
Dataset metadata dict, dictionary of filepath regexes with metadata,
dicts, each return value could be None if there is no such metadata
"""
# default implementation
return \
self._get_dataset_metadata() if dataset else None, \
((k, v) for k, v in self._get_content_metadata()) if content else None

def _get_dataset_metadata(self):
"""
Returns
-------
dict
keys and values are arbitrary
"""
raise NotImplementedError

def _get_content_metadata(self):
"""Get ALL metadata for all dataset content.
Possibly limited to the paths given to the extractor.
Returns
-------
generator((location, metadata_dict))
"""
raise NotImplementedError


class DataOutputCategory(enum.Enum):
"""
Describe how extractors output metadata.
Expand Down Expand Up @@ -388,7 +336,7 @@ def get_state(self, dataset):

# XXX this is the legacy-legacy interface, keep around for a bit more and then
# remove
class BaseMetadataExtractor(metaclass=abc.ABCMeta):
class BaseMetadataExtractor:

NEEDS_CONTENT = True # majority of the extractors need data content

Expand Down Expand Up @@ -418,7 +366,6 @@ def get_metadata(self, dataset=True, content=True):
self._get_dataset_metadata() if dataset else None, \
((k, v) for k, v in self._get_content_metadata()) if content else None

@abc.abstractmethod
def _get_dataset_metadata(self):
"""
Returns
Expand All @@ -428,7 +375,6 @@ def _get_dataset_metadata(self):
"""
raise NotImplementedError

@abc.abstractmethod
def _get_content_metadata(self):
"""Get ALL metadata for all dataset content.
Expand Down
8 changes: 8 additions & 0 deletions datalad_metalad/extractors/legacy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,11 @@
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Legacy metadata extractors"""

from os.path import join

from datalad.consts import DATALAD_DOTDIR

# Legacy metadata store inside a dataset, relative to the dataset root
# (under the datalad dot-directory — DATALAD_DOTDIR comes from datalad.consts).
METADATA_DIR = join(DATALAD_DOTDIR, 'metadata')
# Aggregated dataset-level metadata file within METADATA_DIR.
DATASET_METADATA_FILE = join(METADATA_DIR, 'dataset.json')
# Dataset-scope (branch) configuration file in the datalad dot-directory.
DATASET_CONFIG_FILE = join(DATALAD_DOTDIR, 'config')
2 changes: 1 addition & 1 deletion datalad_metalad/extractors/legacy/annex.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from .definitions import version as vocabulary_version


class MetadataExtractor(BaseMetadataExtractor):
class AnnexMetadataExtractor(BaseMetadataExtractor):

NEEDS_CONTENT = False

Expand Down
2 changes: 1 addition & 1 deletion datalad_metalad/extractors/legacy/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
}


class MetadataExtractor(BaseMetadataExtractor):
class AudioMetadataExtractor(BaseMetadataExtractor):

_unique_exclude = {'bitrate'}

Expand Down
2 changes: 1 addition & 1 deletion datalad_metalad/extractors/legacy/datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def _process_tree(tree, nstag):
return rec


class MetadataExtractor(BaseMetadataExtractor):
class DataciteMetadataExtractor(BaseMetadataExtractor):
def _get_dataset_metadata(self):
canonical = op.join(self.ds.path, '.datalad', 'meta.datacite.xml')

Expand Down
14 changes: 8 additions & 6 deletions datalad_metalad/extractors/legacy/datalad_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,23 @@
from os.path import join as opj
from os.path import exists

from datalad.consts import (
DATASET_METADATA_FILE,
DATALAD_DOTDIR,
WEB_SPECIAL_REMOTE_UUID,
)
from datalad.consts import WEB_SPECIAL_REMOTE_UUID
from datalad.support.json_py import load as jsonload
from datalad.support.annexrepo import AnnexRepo
from datalad.coreapi import subdatasets

from . import (
DATASET_METADATA_FILE,
DATALAD_DOTDIR,
)

# use main version as core version

# this must stay, despite being a seemingly unused import, each extractor defines a version
from .definitions import version as vocabulary_version


class MetadataExtractor(BaseMetadataExtractor):
class DataladCoreMetadataExtractor(BaseMetadataExtractor):

NEEDS_CONTENT = False

Expand Down
2 changes: 1 addition & 1 deletion datalad_metalad/extractors/legacy/datalad_rfc822.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def _beautify_multiline_field(content):
return title, content


class MetadataExtractor(BaseMetadataExtractor):
class DataladRFC822MetadataExtractor(BaseMetadataExtractor):
_metadata_compliance = "http://docs.datalad.org/metadata.html#v0-1"
_core_metadata_filename = opj('.datalad', 'meta.rfc822')

Expand Down
2 changes: 1 addition & 1 deletion datalad_metalad/extractors/legacy/exif.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def _return_as_appropriate_dtype(val):
return val


class MetadataExtractor(BaseMetadataExtractor):
class ExifMetadataExtractor(BaseMetadataExtractor):
def get_metadata(self, dataset, content):
if not content:
return {}, []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _compact_license(obj):
return obj


class MetadataExtractor(BaseMetadataExtractor):
class FRDPMetadataExtractor(BaseMetadataExtractor):
metadatasrc_fname = 'datapackage.json'

_key2stdkey = {
Expand Down
2 changes: 1 addition & 1 deletion datalad_metalad/extractors/legacy/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
}


class MetadataExtractor(BaseMetadataExtractor):
class ImageMetadataExtractor(BaseMetadataExtractor):

_extractors = {
'format': lambda x: x.format_description,
Expand Down
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Binary file not shown.
30 changes: 6 additions & 24 deletions datalad_metalad/extractors/legacy/tests/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test audio extractor"""

from pathlib import Path

from datalad.tests.utils_pytest import (
SkipTest,
assert_in,
Expand All @@ -24,8 +26,6 @@
except ImportError:
raise SkipTest

from os.path import dirname
from os.path import join as opj
from shutil import copy

from datalad.api import Dataset
Expand All @@ -47,32 +47,14 @@
@with_tempfile(mkdir=True)
def test_audio(path=None):
ds = Dataset(path).create()
ds.config.add('datalad.metadata.nativetype', 'audio', scope='branch')
copy(
opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
path)
copy(Path(__file__).parent / 'data' / 'audio.mp3', path)
ds.save()
assert_repo_status(ds.path)
res = ds.aggregate_metadata()
assert_status('ok', res)
res = ds.metadata('audio.mp3')

res = ds.meta_extract('audio', str(Path(path) / 'audio.mp3'))
assert_result_count(res, 1)

# from this extractor
meta = res[0]['metadata']['audio']
meta = res[0]['metadata_record']['extracted_metadata']
for k, v in target.items():
eq_(meta[k], v)

assert_in('@context', meta)

uniques = ds.metadata(
reporton='datasets', return_type='item-or-list')['metadata']['datalad_unique_content_properties']
# test file has it, but uniques have it blanked out, because the extractor considers it worthless
# for discovering whole datasets
assert_in('bitrate', meta)
eq_(uniques['audio']['bitrate'], None)

# 'date' field carries no value, hence gets excluded from the unique report
assert_in('date', meta)
assert(not meta['date'])
assert_not_in('date', uniques['audio'])
53 changes: 5 additions & 48 deletions datalad_metalad/extractors/legacy/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test all extractors at a basic level"""

from inspect import isgenerator
from __future__ import annotations

from datalad.api import Dataset
from datalad.support.entrypoints import iter_entrypoints
from datalad.tests.utils_pytest import (
SkipTest,
Expand All @@ -21,57 +20,15 @@
)


@with_tree(tree={'file.dat': ''})
def check_api(annex, path):
ds = Dataset(path).create(force=True, annex=annex)
ds.save()
assert_repo_status(ds.path)

processed_extractors, skipped_extractors = [], []
def check_api(use_annex: bool):
processed_extractors = []
for ename, emod, eload in iter_entrypoints('datalad.metadata.extractors'):
# we need to be able to query for metadata, even if there is none
# from any extractor
try:
extractor_cls = eload()
except Exception as exc:
exc_ = str(exc)
skipped_extractors += [exc_]
continue
extractor = extractor_cls(
ds, paths=['file.dat'])
meta = extractor.get_metadata(
dataset=True,
content=True)
# we also get something for the dataset and something for the content
# even if any of the two is empty
assert_equal(len(meta), 2)
dsmeta, contentmeta = meta
assert (isinstance(dsmeta, dict))
assert hasattr(contentmeta, '__len__') or isgenerator(contentmeta)
# verify that generator does not blow and has an entry for our
# precious file
cm = dict(contentmeta)
# datalad_core does provide some (not really) information about our
# precious file
if ename == 'datalad_core':
assert 'file.dat' in cm
elif ename == 'annex':
if annex:
# verify correct key, which is the same for all files of 0 size
assert_equal(
cm['file.dat']['key'],
'MD5E-s0--d41d8cd98f00b204e9800998ecf8427e.dat'
)
else:
# no metadata on that file
assert not cm
processed_extractors.append(ename)
assert "datalad_core" in processed_extractors, \
assert \
"datalad_core" in processed_extractors, \
"Should have managed to find at least the core extractor extractor"
if skipped_extractors:
raise SkipTest(
"Not fully tested/succeeded since some extractors failed"
" to load:\n%s" % ("\n".join(skipped_extractors)))


@known_failure_githubci_win
Expand Down
Loading

0 comments on commit 492c228

Please sign in to comment.