2.9.1 (#128)
* HOTFIX: When downloading the cache, only authenticate with Alyx when necessary
* HOTFIX: Ensure the HTTP data server URL does not end in a slash
* HOTFIX: No longer warns in silent mode when no param conflicts are present
* Explicit kwargs in load_* methods to avoid user confusion (e.g. no 'namespace' kwarg for `load_dataset`)
* Account for 'public' being in rel_path (#127)
---------
Co-authored-by: mayofaulkner <[email protected]>
k1o0 authored Sep 24, 2024
1 parent 7236111 commit 26bea63
Showing 8 changed files with 88 additions and 48 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,15 @@
# Changelog
## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.9.0]
## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.9.1]

### Modified

- HOTFIX: When downloading the cache, only authenticate with Alyx when necessary
- HOTFIX: Ensure the HTTP data server URL does not end in a slash
- HOTFIX: Handle public aggregate dataset relative paths
- HOTFIX: No longer warns in silent mode when no param conflicts are present
- Explicit kwargs in load_* methods to avoid user confusion (e.g. no 'namespace' kwarg for `load_dataset`)

## [2.9.0]
This version adds a couple of new ALF functions.

### Added
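To make the "Explicit kwargs in load_* methods" entry above concrete: `load_dataset` and related methods now spell out their keyword arguments instead of accepting `**kwargs`, so a stray filter such as `namespace` fails loudly rather than being silently misinterpreted. A minimal sketch, assuming a configured ONE instance; the eid and dataset names are placeholders:

```python
from one.api import ONE

one = ONE()  # assumes ONE has already been set up against a database
eid = 'aad23144-0e52-4eac-80c5-c4ee2decb198'  # placeholder experiment ID

# Supported keyword arguments are now explicit in the signature.
spikes_times = one.load_dataset(eid, 'spikes.times.npy', collection='alf/probe00')

# A keyword that load_dataset does not support now raises instead of being swallowed.
try:
    one.load_dataset(eid, 'spikes.times.npy', namespace='ibl')
except TypeError as ex:
    print(ex)  # e.g. "got an unexpected keyword argument 'namespace'"

# load_object still takes extra dataset filters (namespace, timescale) via **kwargs.
trials = one.load_object(eid, 'trials', namespace='ibl')
```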
2 changes: 1 addition & 1 deletion one/__init__.py
@@ -1,2 +1,2 @@
"""The Open Neurophysiology Environment (ONE) API."""
__version__ = '2.9.0'
__version__ = '2.9.1'
64 changes: 44 additions & 20 deletions one/api.py
@@ -563,6 +563,9 @@ def _check_filesystem(self, datasets, offline=None, update_exists=True, check_ha
repository
update_exists : bool
If true, the cache is updated to reflect the filesystem
check_hash : bool
Consider dataset missing if local file hash does not match. In online mode, the dataset
will be re-downloaded.
Returns
-------
@@ -924,6 +927,7 @@ def load_object(self,
revision: Optional[str] = None,
query_type: Optional[str] = None,
download_only: bool = False,
check_hash: bool = True,
**kwargs) -> Union[alfio.AlfBunch, List[Path]]:
"""
Load all attributes of an ALF object from a Session ID and an object name.
@@ -951,6 +955,9 @@
download_only : bool
When true the data are downloaded and the file path is returned. NB: The order of the
file path list is undefined.
check_hash : bool
Consider dataset missing if local file hash does not match. In online mode, the dataset
will be re-downloaded.
kwargs
Additional filters for datasets, including namespace and timescale. For full list
see the :func:`one.alf.spec.describe` function.
@@ -996,7 +1003,7 @@

# For those that don't exist, download them
offline = None if query_type == 'auto' else self.mode == 'local'
files = self._check_filesystem(datasets, offline=offline)
files = self._check_filesystem(datasets, offline=offline, check_hash=check_hash)
files = [x for x in files if x]
if not files:
raise alferr.ALFObjectNotFound(f'ALF object "{obj}" not found on disk')
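The new `check_hash` flag documented in the hunks above decides whether a local file whose hash no longer matches the cache record is treated as missing and, in online mode, re-downloaded. ONE's actual hashing helpers are not shown in this diff, so the sketch below only illustrates the idea with `hashlib`; the function name and the use of MD5 are assumptions:

```python
import hashlib
from pathlib import Path


def local_file_matches(path: Path, expected_hash: str) -> bool:
    """Illustrative only: compare a local file's MD5 digest to the cached hash."""
    if not path.exists():
        return False
    digest = hashlib.md5()
    with open(path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(1024 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest() == expected_hash

# With check_hash=True a mismatching file counts as missing (and is re-downloaded
# in online mode); with check_hash=False the file on disk is used as-is.
```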
@@ -1015,7 +1022,7 @@ def load_dataset(self,
revision: Optional[str] = None,
query_type: Optional[str] = None,
download_only: bool = False,
**kwargs) -> Any:
check_hash: bool = True) -> Any:
"""
Load a single dataset for a given session id and dataset name.
@@ -1040,6 +1047,9 @@
Query cache ('local') or Alyx database ('remote')
download_only : bool
When true the data are downloaded and the file path is returned.
check_hash : bool
Consider dataset missing if local file hash does not match. In online mode, the dataset
will be re-downloaded.
Returns
-------
@@ -1099,7 +1109,8 @@ def load_dataset(self,
raise alferr.ALFObjectNotFound(f'Dataset "{dataset}" not found')

# Check files exist / download remote files
file, = self._check_filesystem(datasets, **kwargs)
offline = None if query_type == 'auto' else self.mode == 'local'
file, = self._check_filesystem(datasets, offline=offline, check_hash=check_hash)

if not file:
raise alferr.ALFObjectNotFound('Dataset not found')
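Besides dropping the forwarded `**kwargs`, this hunk derives the `offline` flag explicitly and threads `check_hash` through to `_check_filesystem`. A hedged usage sketch; the eid and dataset name are placeholders:

```python
from one.api import ONE

one = ONE()  # assumes a configured ONE instance
eid = 'aad23144-0e52-4eac-80c5-c4ee2decb198'  # placeholder experiment ID

# Skip hash verification, e.g. when a file was deliberately modified locally and
# should not be flagged as missing or re-downloaded.
wheel_position = one.load_dataset(eid, 'wheel.position.npy', collection='alf',
                                  check_hash=False)

# The offline flag now mirrors the other load methods:
# offline = None if query_type == 'auto' else self.mode == 'local'
```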
@@ -1117,7 +1128,7 @@ def load_datasets(self,
query_type: Optional[str] = None,
assert_present=True,
download_only: bool = False,
**kwargs) -> Any:
check_hash: bool = True) -> Any:
"""
Load datasets for a given session id. Returns two lists the length of datasets. The
first is the data (or file paths if download_data is false), the second is a list of
@@ -1146,6 +1157,9 @@
If true, missing datasets raise an error, otherwise None is returned
download_only : bool
When true the data are downloaded and the file path is returned.
check_hash : bool
Consider dataset missing if local file hash does not match. In online mode, the dataset
will be re-downloaded.
Returns
-------
@@ -1257,7 +1271,8 @@ def _verify_specifiers(specifiers):
_logger.warning(message)

# Check files exist / download remote files
files = self._check_filesystem(present_datasets, **kwargs)
offline = None if query_type == 'auto' else self.mode == 'local'
files = self._check_filesystem(present_datasets, offline=offline, check_hash=check_hash)

if any(x is None for x in files):
missing_list = ', '.join(x for x, y in zip(present_datasets.rel_path, files) if not y)
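`load_datasets` gets the same treatment. A sketch of typical use with placeholder dataset names, showing the two equal-length return lists and `assert_present=False`:

```python
from one.api import ONE

one = ONE()  # assumes a configured ONE instance
eid = 'aad23144-0e52-4eac-80c5-c4ee2decb198'  # placeholder experiment ID

# Two lists of the same length come back: the data and the matching dataset records.
# With assert_present=False a missing dataset yields None instead of raising.
data, records = one.load_datasets(eid, ['trials.table.pqt', 'wheel.position.npy'],
                                  assert_present=False, check_hash=True)
n_missing = sum(d is None for d in data)
print(f'{n_missing} of {len(data)} requested datasets not found')
```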
@@ -1284,7 +1299,8 @@ def _verify_specifiers(specifiers):
def load_dataset_from_id(self,
dset_id: Union[str, UUID],
download_only: bool = False,
details: bool = False) -> Any:
details: bool = False,
check_hash: bool = True) -> Any:
"""
Load a dataset given a dataset UUID.
@@ -1296,6 +1312,9 @@ def load_dataset_from_id(self,
If true the dataset is downloaded (if necessary) and the filepath returned.
details : bool
If true a pandas Series is returned in addition to the data.
check_hash : bool
Consider dataset missing if local file hash does not match. In online mode, the dataset
will be re-downloaded.
Returns
-------
@@ -1314,7 +1333,7 @@ def load_dataset_from_id(self,
except KeyError:
raise alferr.ALFObjectNotFound('Dataset not found')

filepath, = self._check_filesystem(dataset)
filepath, = self._check_filesystem(dataset, check_hash=check_hash)
if not filepath:
raise alferr.ALFObjectNotFound('Dataset not found')
output = filepath if download_only else alfio.load_file_content(filepath)
@@ -1332,6 +1351,7 @@ def load_collection(self,
revision: Optional[str] = None,
query_type: Optional[str] = None,
download_only: bool = False,
check_hash: bool = True,
**kwargs) -> Union[Bunch, List[Path]]:
"""
Load all objects in an ALF collection from a Session ID. Any datasets with matching object
@@ -1357,6 +1377,9 @@ def load_collection(self,
Query cache ('local') or Alyx database ('remote')
download_only : bool
When true the data are downloaded and the file path is returned.
check_hash : bool
Consider dataset missing if local file hash does not match. In online mode, the dataset
will be re-downloaded.
kwargs
Additional filters for datasets, including namespace and timescale. For full list
see the one.alf.spec.describe function.
@@ -1397,7 +1420,7 @@ def load_collection(self,

# For those that don't exist, download them
offline = None if query_type == 'auto' else self.mode == 'local'
files = self._check_filesystem(datasets, offline=offline)
files = self._check_filesystem(datasets, offline=offline, check_hash=check_hash)
if not any(files):
raise alferr.ALFObjectNotFound(f'ALF collection "{collection}" not found on disk')
# Remove missing items
@@ -1456,7 +1479,8 @@ def setup(cache_dir=None, silent=False, **kwargs):

@lru_cache(maxsize=1)
def ONE(*, mode='auto', wildcards=True, **kwargs):
"""ONE API factory
"""ONE API factory.
Determine which class to instantiate depending on parameters passed.
Parameters
@@ -1509,10 +1533,10 @@ def ONE(*, mode='auto', wildcards=True, **kwargs):


class OneAlyx(One):
"""An API for searching and loading data through the Alyx database"""
"""An API for searching and loading data through the Alyx database."""
def __init__(self, username=None, password=None, base_url=None, cache_dir=None,
mode='auto', wildcards=True, tables_dir=None, **kwargs):
"""An API for searching and loading data through the Alyx database
"""An API for searching and loading data through the Alyx database.
Parameters
----------
@@ -1803,13 +1827,13 @@ def list_aggregates(self, relation: str, identifier: str = None,
.reset_index(level=0)
.drop('eid', axis=1)
.rename_axis(index={'id': 'did'}))
# Since rel_path for public FI file records starts with public/aggregates instead of just
# aggregates as is the case for internal FI file records, as well as public and internal
# AWS file records, we need to make sure to use the file path part after 'aggregates'
# and not simply the second part, as relation.
# Since rel_path for public FI file records starts with 'public/aggregates' instead of just
# 'aggregates', we should discard the file path parts before 'aggregates' (if present)
records['rel_path'] = records['rel_path'].str.replace(
r'^[\w\/]+(?=aggregates\/)', '', n=1, regex=True)
# The relation is the first part after 'aggregates', i.e. the second part
records['relation'] = records['rel_path'].map(
lambda x: x.split('aggregates')[-1].split('/')[1].lower()
)
lambda x: x.split('aggregates')[-1].split('/')[1].lower())
records = records[records['relation'] == relation.lower()]

def path2id(p) -> str:
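The replacement regex strips any leading path components before 'aggregates/' (such as the 'public/' prefix on public FI file records), so the relation can always be read from the segment that follows. A standalone sketch of that substitution; the example paths are modelled on the test changes in test_one.py further down:

```python
import re

pattern = r'^[\w\/]+(?=aggregates\/)'
rel_paths = [
    'public/aggregates/Subjects/mainenlab/ZM_1085/_ibl_subjectTraining.table.pqt',
    'aggregates/Subjects/mainenlab/ZM_1085/_ibl_subjectTraining.table.pqt',
]
for rel_path in rel_paths:
    cleaned = re.sub(pattern, '', rel_path, count=1)
    relation = cleaned.split('aggregates')[-1].split('/')[1].lower()
    print(cleaned, relation)  # both rows give an 'aggregates/...' path and 'subjects'
```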
@@ -2563,14 +2587,14 @@ def path2eid(self, path_obj: Union[str, Path], query_type=None) -> util.Listable
Parameters
----------
path_obj : str, pathlib.Path, list
Local path or list of local paths
Local path or list of local paths.
query_type : str
If set to 'remote', will force database connection
If set to 'remote', will force database connection.
Returns
-------
str, list
An eid or list of eids
An eid or list of eids.
"""
# If path_obj is a list recurse through it and return a list
if isinstance(path_obj, list):
4 changes: 2 additions & 2 deletions one/converters.py
@@ -187,12 +187,12 @@ def path2eid(self, path_obj):
Parameters
----------
path_obj : pathlib.Path, str
Local path or list of local paths
Local path or list of local paths.
Returns
-------
eid, list
Experiment ID (eid) string or list of eids
Experiment ID (eid) string or list of eids.
"""
# else ensure the path ends with mouse, date, number
session_path = get_session_path(path_obj)
35 changes: 19 additions & 16 deletions one/params.py
@@ -122,22 +122,25 @@ def setup(client=None, silent=False, make_default=None, username=None, cache_dir
cache_map = iopar.read(f'{_PAR_ID_STR}/{_CLIENT_ID_STR}', {'CLIENT_MAP': dict()})

if not silent:
prompt = 'Param %s, current value is ["%s"]:'
par = iopar.as_dict(par_default)
for k in par.keys():
# Iterate through non-password pars
for k in filter(lambda k: 'PWD' not in k, par.keys()):
cpar = _get_current_par(k, par_current)
# Prompt for database URL; skip if client url already provided
if k == 'ALYX_URL':
if not client:
par[k] = input(f'Param {k}, current value is ["{str(cpar)}"]:') or cpar
if '://' not in par[k]:
par[k] = 'https://' + par[k]
url_parsed = urlsplit(par[k])
if not (url_parsed.netloc and re.match('https?', url_parsed.scheme)):
raise ValueError(f'{k} must be valid HTTP URL')
# Prompt for database and FI URL
if 'URL' in k:
if k == 'ALYX_URL' and client:
continue # skip if client url already provided
par[k] = input(prompt % (k, cpar)).strip().rstrip('/') or cpar
if '://' not in par[k]:
par[k] = 'https://' + par[k]
url_parsed = urlsplit(par[k])
if not (url_parsed.netloc and re.match('https?', url_parsed.scheme)):
raise ValueError(f'{k} must be valid HTTP URL')
if k == 'ALYX_URL':
client = par[k]
# Iterate through other non-password pars
elif 'PWD' not in k:
par[k] = input(f'Param {k}, current value is ["{str(cpar)}"]:') or cpar
else:
par[k] = input(prompt % (k, cpar)).strip() or cpar

cpar = _get_current_par('HTTP_DATA_SERVER_PWD', par_current)
prompt = f'Enter the FlatIron HTTP password for {par["HTTP_DATA_SERVER_LOGIN"]} '\
@@ -179,12 +182,12 @@ def setup(client=None, silent=False, make_default=None, username=None, cache_dir
print('SETUP ABANDONED. Please re-run.')
return par_current
else:
par = par_current
if any(v for k, v in cache_map.CLIENT_MAP.items() if k != client_key):
warnings.warn('Warning: the directory provided is already a cache for another URL.')
# Precedence: user provided cache_dir; previously defined; the default location
default_cache_dir = Path(CACHE_DIR_DEFAULT, client_key)
cache_dir = cache_dir or cache_map.CLIENT_MAP.get(client_key, default_cache_dir)
par = par_current
if any(v for k, v in cache_map.CLIENT_MAP.items() if k != client_key and v == cache_dir):
warnings.warn('Warning: the directory provided is already a cache for another URL.')

# Update and save parameters
Path(cache_dir).mkdir(exist_ok=True, parents=True)
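The reworked prompt loop above treats every *_URL parameter the same way: the response is stripped, any trailing slash is removed (the HTTP data server hotfix), a missing scheme defaults to https, and non-HTTP(S) URLs are rejected. A standalone sketch of that normalisation outside the interactive prompt; the example URLs are illustrative:

```python
import re
from urllib.parse import urlsplit


def normalise_url(answer: str, current: str = '') -> str:
    """Mirror the prompt-loop handling: strip, drop trailing slash, validate scheme."""
    url = answer.strip().rstrip('/') or current
    if '://' not in url:
        url = 'https://' + url
    parsed = urlsplit(url)
    if not (parsed.netloc and re.match('https?', parsed.scheme)):
        raise ValueError('URL must be a valid HTTP URL')
    return url


print(normalise_url('ibl.flatironinstitute.org/public/'))  # https://ibl.flatironinstitute.org/public
try:
    normalise_url('ftp://foo.bar.org')
except ValueError as ex:
    print(ex)  # rejected, as exercised in test_params.py
```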
7 changes: 4 additions & 3 deletions one/tests/test_one.py
@@ -1249,14 +1249,15 @@ def test_list_aggregates(self):
"""Test OneAlyx.list_aggregates"""
# Test listing by relation
datasets = self.one.list_aggregates('subjects')
self.assertTrue(all(datasets['rel_path'].str.startswith('cortexlab/Subjects')))
self.assertTrue(all(datasets['rel_path'].str.startswith('aggregates/Subjects')))
self.assertIn('exists_aws', datasets.columns)
self.assertIn('session_path', datasets.columns)
self.assertTrue(all(datasets['session_path'] == ''))
self.assertTrue(self.one.list_aggregates('foobar').empty)
# Test filtering with an identifier
datasets = self.one.list_aggregates('subjects', 'ZM_1085')
self.assertTrue(all(datasets['rel_path'].str.startswith('cortexlab/Subjects/ZM_1085')))
expected = 'aggregates/Subjects/mainenlab/ZM_1085'
self.assertTrue(all(datasets['rel_path'].str.startswith(expected)))
self.assertTrue(self.one.list_aggregates('subjects', 'foobar').empty)
# Test that additional parts of data path are correctly removed
# specifically /public in FI openalyx file rec
@@ -1292,7 +1293,7 @@ def test_load_aggregate(self):

# Touch a file to ensure that we do not try downloading
expected = self.one.cache_dir.joinpath(
'cortexlab/Subjects/ZM_1085/_ibl_subjectTraining.table.pqt')
'aggregates/Subjects/mainenlab/ZM_1085/_ibl_subjectTraining.table.pqt')
expected.parent.mkdir(parents=True), expected.touch()

# Test loading with different input dataset formats
2 changes: 1 addition & 1 deletion one/tests/test_params.py
@@ -61,7 +61,7 @@ def test_setup(self, _):
self.assertNotEqual(cache.ALYX_LOGIN, 'mistake')

# Check that raises ValueError when bad URL provided
self.url = 'ftp://'
self.url = 'ftp://foo.bar.org'
with self.assertRaises(ValueError), mock.patch('one.params.input', new=self._mock_input):
one.params.setup()

10 changes: 6 additions & 4 deletions one/webclient.py
@@ -791,14 +791,16 @@ def download_cache_tables(self, source=None, destination=None):
-------
List of parquet table file paths.
"""
# query the database for the latest cache; expires=None overrides cached response
if not self.is_logged_in:
self.authenticate()
source = str(source or f'{self.base_url}/cache.zip')
destination = destination or self.cache_dir
Path(destination).mkdir(exist_ok=True, parents=True)

headers = self._headers if source.startswith(self.base_url) else None
headers = None
if source.startswith(self.base_url):
if not self.is_logged_in:
self.authenticate()
headers = self._headers

with tempfile.TemporaryDirectory(dir=destination) as tmp:
file = http_download_file(source,
headers=headers,
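The `download_cache_tables` change defers authentication: an Alyx token is only requested when the cache archive is actually served from the client's own base URL, not when it comes from elsewhere (e.g. a public HTTP server). A hedged sketch of the pattern with a made-up helper; the attribute names mirror those used in the hunk above:

```python
def cache_download_headers(client, source: str):
    """Return auth headers only when the download targets the client's base URL.

    ``client`` is assumed to expose base_url, is_logged_in, authenticate() and
    _headers, as the Alyx client instance does in the hunk above.
    """
    if not source.startswith(client.base_url):
        return None  # e.g. a public cache URL needs no Alyx credentials
    if not client.is_logged_in:
        client.authenticate()
    return client._headers
```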
