WIP Rework mne bids path match #1355

Open
wants to merge 16 commits into base: main
3 changes: 2 additions & 1 deletion doc/authors.rst
@@ -5,10 +5,12 @@
.. _Amaia Benitez: https://github.com/AmaiaBA
.. _Anand Saini: https://github.com/anandsaini024
.. _Ariel Rokem: https://github.com/arokem
.. _Arne Gottwald: https://github.com/waldie11
.. _Austin Hurst: https://github.com/a-hurst
.. _Berk Gerçek: https://github.com/berkgercek
.. _Bruno Hebling Vieira: https://github.com/bhvieira
.. _Chris Holdgraf: https://bids.berkeley.edu/people/chris-holdgraf
.. _Christian O'Reilly: https://github.com/christian-oreilly
.. _Clemens Brunner: https://github.com/cbrnr
.. _Daniel McCloy: http://dan.mccloy.info
.. _Denis Engemann: https://github.com/dengemann
@@ -52,4 +54,3 @@
.. _Tom Donoghue: https://github.com/TomDonoghue
.. _William Turner: https://bootstrapbill.github.io/
.. _Yorguin Mantilla: https://github.com/yjmantilla
.. _Christian O'Reilly: https://github.com/christian-oreilly
4 changes: 4 additions & 0 deletions doc/whats_new.rst
@@ -19,6 +19,7 @@ The following authors contributed for the first time. Thank you so much! 🤩

* `Christian O'Reilly`_
* `Berk Gerçek`_
* `Arne Gottwald`_

The following authors had contributed before. Thank you for sticking around! 🤘

@@ -35,6 +36,8 @@ Detailed list of changes
- :func:`mne_bids.write_raw_bids()` can now handle mne `Raw` objects with `eyegaze` and `pupil` channels, by `Christian O'Reilly`_ (:gh:`1344`)
- :func:`mne_bids.get_entity_vals()` has a new parameter ``ignore_suffixes`` to easily ignore sidecar files, by `Daniel McCloy`_ (:gh:`1362`)
- Empty-room matching now preferentially finds recordings in the subject directory tagged as `task-noise` before looking in the `sub-emptyroom` directories. This adds support for a part of the BIDS specification for ER recordings, by `Berk Gerçek`_ (:gh:`1364`)
- Path matching is now implemented more efficiently within `_return_root_paths`, by `Arne Gottwald`_ (:gh:`1355`)
- :func:`mne_bids.get_entity_vals()` has a new parameter ``include_match`` to prefilter matches and skip non-matching items from the beginning of the directory scan, by `Arne Gottwald`_ (:gh:`1355`)


🧐 API and behavior changes
@@ -53,6 +56,7 @@ Detailed list of changes
- :func:`mne_bids.read_raw_bids` can optionally return an ``event_id`` dictionary suitable for use with :func:`mne.events_from_annotations`, and if a ``values`` column is present in ``events.tsv`` it will be used as the source of the integer event ID codes, by `Daniel McCloy`_ (:gh:`1349`)
- BIDS dictates that the recording entity should be displayed as "_recording-" in the filename. This PR makes :class:`mne_bids.BIDSPath` correctly display "_recording-" (instead of "_rec-") in BIDSPath.fpath. By `Scott Huberty`_ (:gh:`1348`)
- :func:`mne_bids.make_dataset_description` now correctly encodes the dataset description as UTF-8 on disk, by `Scott Huberty`_ (:gh:`1357`)
- The extension match in `_filter_fnames` is now anchored to the end of the filename, by `Arne Gottwald`_ (:gh:`1355`)

⚕️ Code health
^^^^^^^^^^^^^^
80 changes: 58 additions & 22 deletions mne_bids/path.py
@@ -12,6 +12,7 @@
from copy import deepcopy
from datetime import datetime
from io import StringIO
from itertools import chain as iter_chain
from os import path as op
from pathlib import Path
from textwrap import indent
@@ -1462,9 +1463,11 @@ def _truncate_tsv_line(line, lim=10):
"""Truncate a line to the specified number of characters."""
return "".join(
[
str(val) + (lim - len(val)) * " "
if len(val) < lim
else f"{val[: lim - 1]} "
(
str(val) + (lim - len(val)) * " "
if len(val) < lim
else f"{val[: lim - 1]} "
)
for val in line.split("\t")
]
)
@@ -1946,6 +1949,7 @@ def get_entity_vals(
ignore_datatypes=None,
ignore_dirs=("derivatives", "sourcedata"),
ignore_suffixes=None,
include_match=None,
with_key=False,
verbose=None,
):
@@ -2006,12 +2010,15 @@
Directories nested directly within ``root`` to ignore. If ``None``,
include all directories in the search.

.. versionadded:: 0.9
.. versionadded:: 0.17
ignore_suffixes : str | array-like of str | None
Suffixes to ignore. If ``None``, include all suffixes. This can be helpful for
ignoring non-data sidecars such as `*_scans.tsv` or `*_coordsystem.json`.
include_match : str | array-like of str | None
Glob-style pattern(s), using the Unix shell syntax of the ``glob`` module,
that are prepended to the search pattern to prefilter the directory scan.
If ``None``, the whole ``root`` tree is searched.

.. versionadded:: 0.17
.. versionadded:: 0.9
with_key : bool
If ``True``, returns the full entity with the key and the value. This
will for example look like ``['sub-001', 'sub-002']``.
@@ -2107,7 +2114,13 @@

p = re.compile(rf"{entity_long_abbr_map[entity_key]}-(.*?)_")
values = list()
filenames = root.glob(f"**/*{entity_long_abbr_map[entity_key]}-*_*")
search_str = f"**/*{entity_long_abbr_map[entity_key]}-*_*"
if include_match is not None:
include_match = _ensure_tuple(include_match)
filenames = [root.glob(im + search_str) for im in include_match]
filenames = iter_chain(*filenames)
else:
filenames = root.glob(search_str)

for filename in filenames:
# Skip ignored directories
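The ``include_match`` handling above runs one glob per pattern and chains the resulting iterators. A standalone sketch of the same idea (hypothetical file layout; ``glob_with_prefilter`` is an illustrative helper, not the mne-bids API):

```python
import tempfile
from itertools import chain
from pathlib import Path

# Hypothetical mini-tree: one subject folder and one derivatives copy.
root = Path(tempfile.mkdtemp())
(root / "sub-01").mkdir()
(root / "derivatives" / "sub-01").mkdir(parents=True)
(root / "sub-01" / "sub-01_ses-01_eeg.vhdr").touch()
(root / "derivatives" / "sub-01" / "sub-01_ses-01_eeg.vhdr").touch()

search_str = "**/*ses-*_*"  # same shape as the entity search pattern

def glob_with_prefilter(root, search_str, include_match=None):
    """Chain one glob per prefix pattern, as in the patched code path."""
    if include_match is not None:
        if isinstance(include_match, str):
            include_match = (include_match,)
        # each prefix pattern restricts which subtrees are descended into
        return chain(*(root.glob(im + search_str) for im in include_match))
    return root.glob(search_str)

all_hits = sorted(glob_with_prefilter(root, search_str))
sub_hits = sorted(glob_with_prefilter(root, search_str, include_match="sub-*/"))
print(len(all_hits), len(sub_hits))
```

Prefiltering skips whole subtrees (here ``derivatives``) instead of discarding their matches afterwards, which is where the speedup comes from.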
@@ -2358,7 +2371,7 @@ def _filter_fnames(
r"_desc-(" + "|".join(description) + ")" if description else r"(|_desc-([^_]+))"
)
suffix_str = r"_(" + "|".join(suffix) + ")" if suffix else r"_([^_]+)"
ext_str = r"(" + "|".join(extension) + ")" if extension else r".([^_]+)"
ext_str = r"(" + "|".join(extension) + ")$" if extension else r".([^_]+)"

regexp = (
leading_path_str
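The one-character fix in ``ext_str`` (appending ``$``) matters because an unanchored alternation in ``re.search`` also accepts filenames whose extension merely starts with a requested one. A minimal illustration with hypothetical filenames:

```python
import re

fnames = ["sub-01_task-rest_eeg.fif", "sub-01_task-rest_eeg.fif.gz"]
extension = [".fif"]

unanchored = r"(" + "|".join(extension) + r")"   # old pattern: matches anywhere
anchored = r"(" + "|".join(extension) + r")$"    # fixed pattern: end of name only

# The unanchored pattern accepts both names; the anchored one only the first.
loose = [f for f in fnames if re.search(unanchored, f)]
strict = [f for f in fnames if re.search(anchored, f)]
print(loose)
print(strict)
```

(Note that the leading ``.`` in ``.fif`` is itself a regex wildcard; that is a separate quirk, untouched by this change.)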
@@ -2502,7 +2515,7 @@


def _return_root_paths(root, datatype=None, ignore_json=True, ignore_nosub=False):
"""Return all file paths in root.
"""Return all file paths + .ds paths in root.

Can be filtered by datatype (which is present in the path but not in
the BIDSPath basename). Can also be list of datatypes.
@@ -2523,29 +2536,52 @@ def _return_root_paths(root, datatype=None, ignore_json=True, ignore_nosub=False
Returns
-------
paths : list of pathlib.Path
All paths in `root`, filtered according to the function parameters.
All files + .ds paths in `root`, filtered according to the function parameters.
"""
root = Path(root) # if root is str

if datatype is not None:
datatype = _ensure_tuple(datatype)
search_str = f"*/{'|'.join(datatype)}/*"
else:
if datatype is None and not ignore_nosub:
search_str = "*.*"
paths = root.rglob(search_str)
else:
if datatype is not None:
datatype = _ensure_tuple(datatype)
search_str = f"**/{'|'.join(datatype)}/*"

# append `.*` so that only names containing an extension match
search_str = search_str + ".*"
else:
search_str = "**/*.*"

# only browse files which are of the form root/sub-*,
# such that we truly only look in 'sub'-folders:

if ignore_nosub:
search_str = "sub-*/" + search_str

paths = [
Path(root, fn)
for fn in glob.iglob(search_str, root_dir=root, recursive=True)
]

paths = root.rglob(search_str)
# Only keep files (not directories), ...
# and omit the JSON sidecars if `ignore_json` is True.
if ignore_json:
paths = [p for p in paths if p.is_file() and p.suffix != ".json"]
paths = [
p
for p in paths
if (p.is_file() and p.suffix != ".json")
# open question: should CTF `.ds` directories be included here?
or (p.is_dir() and p.suffix == ".ds")
]
else:
paths = [p for p in paths if p.is_file()]

# only keep files which are of the form root/sub-*,
# such that we only look in 'sub'-folders:
if ignore_nosub:
root_sub = str(root / "sub-")
paths = [p for p in paths if str(p).startswith(root_sub)]
paths = [
p
for p in paths
if p.is_file()
# open question: should CTF `.ds` directories be included here?
or (p.is_dir() and p.suffix == ".ds")
]

return paths

114 changes: 114 additions & 0 deletions mne_bids/tests/test_path.py
@@ -7,10 +7,12 @@
import os.path as op
import shutil
import shutil as sh
import timeit
from datetime import datetime, timezone
from pathlib import Path

import mne
import numpy as np
import pytest
from mne.datasets import testing
from mne.io import anonymize_info
@@ -166,6 +168,118 @@ def test_get_entity_vals(entity, expected_vals, kwargs, return_bids_test_dir):
shutil.rmtree(deriv_path)


def test_path_benchmark(tmp_path_factory):
"""Benchmark exploring bids tree."""
tmp_bids_root = tmp_path_factory.mktemp("abc")

derivatives = [Path("derivatives", "derivatives" + str(i)) for i in range(17)]

bids_subdirectories = ["", "sourcedata", *derivatives]

def equal_length_subj_ids(v):
assert all(v > 0)
n_digits = len(str(np.max(v))) + 1
return list(map(lambda i: "0" * (n_digits - len(str(i))) + str(i), v))

# Create a BIDS compliant directory tree with high number of branches
for i in equal_length_subj_ids(np.arange(1, 20, dtype=int)):
for j in range(1, 9):
for subdir in bids_subdirectories:
for datatype in ["eeg", "meg"]:
bids_subdir = BIDSPath(
subject=i,
session="0" + str(j),
datatype=datatype,
task="audvis",
root=str(tmp_bids_root / subdir),
)
bids_subdir.mkdir(exist_ok=True)
Path(bids_subdir.root / "participants.tsv").touch()
Path(bids_subdir.root / "participants.csv").touch()
Path(bids_subdir.root / "README").touch()

# os.makedirs(bids_subdir.directory, exist_ok=True)
Path(
bids_subdir.directory, bids_subdir.basename + "_events.tsv"
).touch()
Path(
bids_subdir.directory, bids_subdir.basename + "_events.csv"
).touch()

if datatype == "meg":
ctf_path = Path(
bids_subdir.directory, bids_subdir.basename + "_meg.ds"
)
ctf_path.mkdir(exist_ok=True)
Path(ctf_path, bids_subdir.basename + ".meg4").touch()
Path(ctf_path, bids_subdir.basename + ".hc").touch()
Path(ctf_path / "hz.ds").mkdir(exist_ok=True)
Path(ctf_path / "hz.ds" / "hz.meg4").touch()
Path(ctf_path / "hz.ds" / "hz.hc").touch()

# applying ignore_nosub to find_matching_paths on a root-level BIDS directory
# should yield a performance boost on the order of len(bids_subdirectories).
timed_all, timed_ignored_nosub = (
timeit.timeit(
"mne_bids.find_matching_paths(tmp_bids_root)",
setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
number=1,
),
timeit.timeit(
"mne_bids.find_matching_paths(tmp_bids_root, ignore_nosub=True)",
setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
number=10,
)
/ 10,
)

assert (
timed_all / (10 * np.maximum(1, np.floor(len(bids_subdirectories) / 10)))
# while this should be of the same order, allow some slack (factor of 2)
> 0.5 * timed_ignored_nosub
)

# applying include_match to get_entity_vals on a root-level BIDS directory
# should yield a performance boost on the order of len(bids_subdirectories).
timed_entity, timed_entity_match = (
timeit.timeit(
"mne_bids.get_entity_vals(tmp_bids_root, 'session')",
setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
number=1,
),
timeit.timeit(
"mne_bids.get_entity_vals(tmp_bids_root, 'session', include_match='sub-*/')", # noqa: E501
setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
number=10,
)
/ 10,
)

assert (
timed_entity / (10 * np.maximum(1, np.floor(len(bids_subdirectories) / 10)))
# while this should be of the same order, allow some slack (factor of 2)
> 0.5 * timed_entity_match
)

# This should yield the same performance gain, but currently it does not.
# # apply include_match on get_entity_vals with root level bids directory should
# # yield a performance boost of order of length from bids_subdirectories.
# timed_entity_ignore = timeit.timeit(
# "mne_bids.get_entity_vals(tmp_bids_root, 'session', ignore_dirs=['derivatives', 'sourcedata'])", # noqa: E501
# setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
# number=1,
# )

# assert (
# timed_entity / (10 * np.maximum(1, np.floor(len(bids_subdirectories) / 10)))
# # while this should be of same order, lets give it some space by a factor of 2
# > 0.5 * timed_entity_ignore
# )

# Clean up
shutil.rmtree(tmp_bids_root)


def test_search_folder_for_text(capsys):
"""Test finding entries."""
with pytest.raises(ValueError, match="is not a directory"):
Expand Down