2.9.1 (#128)
* HOTFIX: When downloading the cache, only authenticate with Alyx when necessary
* HOTFIX: Ensure the HTTP data server URL does not end in a slash
* HOTFIX: No longer warns in silent mode when no param conflicts are present
* Explicit kwargs in load_* methods to avoid user confusion (e.g. no 'namespace' kwarg for `load_dataset`)
* Account for 'public' being in rel_path (#127)
---------
Co-authored-by: mayofaulkner <[email protected]>
k1o0 authored Sep 24, 2024
1 parent 7236111 commit 26bea63
Showing 8 changed files with 88 additions and 48 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,15 @@
# Changelog
## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.9.0]
## [Latest](https://github.com/int-brain-lab/ONE/commits/main) [2.9.1]

### Modified

- HOTFIX: When downloading the cache, only authenticate with Alyx when necessary
- HOTFIX: Ensure the HTTP data server URL does not end in a slash
- HOTFIX: Handle public aggregate dataset relative paths
- HOTFIX: No longer warns in silent mode when no param conflicts are present
- Explicit kwargs in load_* methods to avoid user confusion (e.g. no 'namespace' kwarg for `load_dataset`)

## [2.9.0]
This version adds a couple of new ALF functions.

### Added
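To make the "Explicit kwargs in load_* methods" entry above concrete: `load_dataset` and related methods now spell out their keyword arguments instead of accepting `**kwargs`, so a stray filter such as `namespace` fails loudly rather than being silently misinterpreted. A minimal sketch, assuming a configured ONE instance; the eid and dataset names are placeholders:

```python
from one.api import ONE

one = ONE()  # assumes ONE has already been set up against a database
eid = 'aad23144-0e52-4eac-80c5-c4ee2decb198'  # placeholder experiment ID

# Supported keyword arguments are now explicit in the signature.
spikes_times = one.load_dataset(eid, 'spikes.times.npy', collection='alf/probe00')

# A keyword that load_dataset does not support now raises instead of being swallowed.
try:
    one.load_dataset(eid, 'spikes.times.npy', namespace='ibl')
except TypeError as ex:
    print(ex)  # e.g. "got an unexpected keyword argument 'namespace'"

# load_object still takes extra dataset filters (namespace, timescale) via **kwargs.
trials = one.load_object(eid, 'trials', namespace='ibl')
```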
2 changes: 1 addition & 1 deletion one/__init__.py
@@ -1,2 +1,2 @@
"""The Open Neurophysiology Environment (ONE) API."""
__version__ = '2.9.0'
__version__ = '2.9.1'
64 changes: 44 additions & 20 deletions one/api.py
@@ -563,6 +563,9 @@ def _check_filesystem(self, datasets, offline=None, update_exists=True, check_ha
repository
update_exists : bool
If true, the cache is updated to reflect the filesystem
check_hash : bool
Consider dataset missing if local file hash does not match. In online mode, the dataset
will be re-downloaded.
Returns
-------
@@ -924,6 +927,7 @@ def load_object(self,
revision: Optional[str] = None,
query_type: Optional[str] = None,
download_only: bool = False,
check_hash: bool = True,
**kwargs) -> Union[alfio.AlfBunch, List[Path]]:
"""
Load all attributes of an ALF object from a Session ID and an object name.
@@ -951,6 +955,9 @@
download_only : bool
When true the data are downloaded and the file path is returned. NB: The order of the
file path list is undefined.
check_hash : bool
Consider dataset missing if local file hash does not match. In online mode, the dataset
will be re-downloaded.
kwargs
Additional filters for datasets, including namespace and timescale. For full list
see the :func:`one.alf.spec.describe` function.
@@ -996,7 +1003,7 @@

# For those that don't exist, download them
offline = None if query_type == 'auto' else self.mode == 'local'
files = self._check_filesystem(datasets, offline=offline)
files = self._check_filesystem(datasets, offline=offline, check_hash=check_hash)
files = [x for x in files if x]
if not files:
raise alferr.ALFObjectNotFound(f'ALF object "{obj}" not found on disk')
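The new `check_hash` flag documented in the hunks above decides whether a local file whose hash no longer matches the cache record is treated as missing and, in online mode, re-downloaded. ONE's actual hashing helpers are not shown in this diff, so the sketch below only illustrates the idea with `hashlib`; the function name and the use of MD5 are assumptions:

```python
import hashlib
from pathlib import Path


def local_file_matches(path: Path, expected_hash: str) -> bool:
    """Illustrative only: compare a local file's MD5 digest to the cached hash."""
    if not path.exists():
        return False
    digest = hashlib.md5()
    with open(path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(1024 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest() == expected_hash

# With check_hash=True a mismatching file counts as missing (and is re-downloaded
# in online mode); with check_hash=False the file on disk is used as-is.
```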
@@ -1015,7 +1022,7 @@ def load_dataset(self,
revision: Optional[str] = None,
query_type: Optional[str] = None,
download_only: bool = False,
**kwargs) -> Any:
check_hash: bool = True) -> Any:
"""
Load a single dataset for a given session id and dataset name.
@@ -1040,6 +1047,9 @@
Query cache ('local') or Alyx database ('remote')
download_only : bool
When true the data are downloaded and the file path is returned.
check_hash : bool
Consider dataset missing if local file hash does not match. In online mode, the dataset
will be re-downloaded.
Returns
-------
@@ -1099,7 +1109,8 @@ def load_dataset(self,
raise alferr.ALFObjectNotFound(f'Dataset "{dataset}" not found')

# Check files exist / download remote files
file, = self._check_filesystem(datasets, **kwargs)
offline = None if query_type == 'auto' else self.mode == 'local'
file, = self._check_filesystem(datasets, offline=offline, check_hash=check_hash)

if not file:
raise alferr.ALFObjectNotFound('Dataset not found')
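Besides dropping the forwarded `**kwargs`, this hunk derives the `offline` flag explicitly and threads `check_hash` through to `_check_filesystem`. A hedged usage sketch; the eid and dataset name are placeholders:

```python
from one.api import ONE

one = ONE()  # assumes a configured ONE instance
eid = 'aad23144-0e52-4eac-80c5-c4ee2decb198'  # placeholder experiment ID

# Skip hash verification, e.g. when a file was deliberately modified locally and
# should not be flagged as missing or re-downloaded.
wheel_position = one.load_dataset(eid, 'wheel.position.npy', collection='alf',
                                  check_hash=False)

# The offline flag now mirrors the other load methods:
# offline = None if query_type == 'auto' else self.mode == 'local'
```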
@@ -1117,7 +1128,7 @@ def load_datasets(self,
query_type: Optional[str] = None,
assert_present=True,
download_only: bool = False,
**kwargs) -> Any:
check_hash: bool = True) -> Any:
"""
Load datasets for a given session id. Returns two lists the length of datasets. The
first is the data (or file paths if download_data is false), the second is a list of
@@ -1146,6 +1157,9 @@
If true, missing datasets raise an error, otherwise None is returned
download_only : bool
When true the data are downloaded and the file path is returned.
check_hash : bool
Consider dataset missing if local file hash does not match. In online mode, the dataset
will be re-downloaded.
Returns
-------
@@ -1257,7 +1271,8 @@ def _verify_specifiers(specifiers):
_logger.warning(message)

# Check files exist / download remote files
files = self._check_filesystem(present_datasets, **kwargs)
offline = None if query_type == 'auto' else self.mode == 'local'
files = self._check_filesystem(present_datasets, offline=offline, check_hash=check_hash)

if any(x is None for x in files):
missing_list = ', '.join(x for x, y in zip(present_datasets.rel_path, files) if not y)
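`load_datasets` gets the same treatment. A sketch of typical use with placeholder dataset names, showing the two equal-length return lists and `assert_present=False`:

```python
from one.api import ONE

one = ONE()  # assumes a configured ONE instance
eid = 'aad23144-0e52-4eac-80c5-c4ee2decb198'  # placeholder experiment ID

# Two lists of the same length come back: the data and the matching dataset records.
# With assert_present=False a missing dataset yields None instead of raising.
data, records = one.load_datasets(eid, ['trials.table.pqt', 'wheel.position.npy'],
                                  assert_present=False, check_hash=True)
n_missing = sum(d is None for d in data)
print(f'{n_missing} of {len(data)} requested datasets not found')
```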
@@ -1284,7 +1299,8 @@ def _verify_specifiers(specifiers):
def load_dataset_from_id(self,
dset_id: Union[str, UUID],
download_only: bool = False,
details: bool = False) -> Any:
details: bool = False,
check_hash: bool = True) -> Any:
"""
Load a dataset given a dataset UUID.
@@ -1296,6 +1312,9 @@ def load_dataset_from_id(self,
If true the dataset is downloaded (if necessary) and the filepath returned.
details : bool
If true a pandas Series is returned in addition to the data.
check_hash : bool
Consider dataset missing if local file hash does not match. In online mode, the dataset
will be re-downloaded.
Returns
-------
@@ -1314,7 +1333,7 @@ def load_dataset_from_id(self,
except KeyError:
raise alferr.ALFObjectNotFound('Dataset not found')

filepath, = self._check_filesystem(dataset)
filepath, = self._check_filesystem(dataset, check_hash=check_hash)
if not filepath:
raise alferr.ALFObjectNotFound('Dataset not found')
output = filepath if download_only else alfio.load_file_content(filepath)
@@ -1332,6 +1351,7 @@ def load_collection(self,
revision: Optional[str] = None,
query_type: Optional[str] = None,
download_only: bool = False,
check_hash: bool = True,
**kwargs) -> Union[Bunch, List[Path]]:
"""
Load all objects in an ALF collection from a Session ID. Any datasets with matching object
@@ -1357,6 +1377,9 @@ def load_collection(self,
Query cache ('local') or Alyx database ('remote')
download_only : bool
When true the data are downloaded and the file path is returned.
check_hash : bool
Consider dataset missing if local file hash does not match. In online mode, the dataset
will be re-downloaded.
kwargs
Additional filters for datasets, including namespace and timescale. For full list
see the one.alf.spec.describe function.
@@ -1397,7 +1420,7 @@ def load_collection(self,

# For those that don't exist, download them
offline = None if query_type == 'auto' else self.mode == 'local'
files = self._check_filesystem(datasets, offline=offline)
files = self._check_filesystem(datasets, offline=offline, check_hash=check_hash)
if not any(files):
raise alferr.ALFObjectNotFound(f'ALF collection "{collection}" not found on disk')
# Remove missing items
@@ -1456,7 +1479,8 @@ def setup(cache_dir=None, silent=False, **kwargs):

@lru_cache(maxsize=1)
def ONE(*, mode='auto', wildcards=True, **kwargs):
"""ONE API factory
"""ONE API factory.
Determine which class to instantiate depending on parameters passed.
Parameters
@@ -1509,10 +1533,10 @@ def ONE(*, mode='auto', wildcards=True, **kwargs):


class OneAlyx(One):
"""An API for searching and loading data through the Alyx database"""
"""An API for searching and loading data through the Alyx database."""
def __init__(self, username=None, password=None, base_url=None, cache_dir=None,
mode='auto', wildcards=True, tables_dir=None, **kwargs):
"""An API for searching and loading data through the Alyx database
"""An API for searching and loading data through the Alyx database.
Parameters
----------
@@ -1803,13 +1827,13 @@ def list_aggregates(self, relation: str, identifier: str = None,
.reset_index(level=0)
.drop('eid', axis=1)
.rename_axis(index={'id': 'did'}))
# Since rel_path for public FI file records starts with public/aggregates instead of just
# aggregates as is the case for internal FI file records, as well as public and internal
# AWS file records, we need to make sure to use the file path part after 'aggregates'
# and not simply the second part, as relation.
# Since rel_path for public FI file records starts with 'public/aggregates' instead of just
# 'aggregates', we should discard the file path parts before 'aggregates' (if present)
records['rel_path'] = records['rel_path'].str.replace(
r'^[\w\/]+(?=aggregates\/)', '', n=1, regex=True)
# The relation is the first part after 'aggregates', i.e. the second part
records['relation'] = records['rel_path'].map(
lambda x: x.split('aggregates')[-1].split('/')[1].lower()
)
lambda x: x.split('aggregates')[-1].split('/')[1].lower())
records = records[records['relation'] == relation.lower()]

def path2id(p) -> str:
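The replacement regex strips any leading path components before 'aggregates/' (such as the 'public/' prefix on public FI file records), so the relation can always be read from the segment that follows. A standalone sketch of that substitution; the example paths are modelled on the test changes in test_one.py further down:

```python
import re

pattern = r'^[\w\/]+(?=aggregates\/)'
rel_paths = [
    'public/aggregates/Subjects/mainenlab/ZM_1085/_ibl_subjectTraining.table.pqt',
    'aggregates/Subjects/mainenlab/ZM_1085/_ibl_subjectTraining.table.pqt',
]
for rel_path in rel_paths:
    cleaned = re.sub(pattern, '', rel_path, count=1)
    relation = cleaned.split('aggregates')[-1].split('/')[1].lower()
    print(cleaned, relation)  # both rows give an 'aggregates/...' path and 'subjects'
```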
@@ -2563,14 +2587,14 @@ def path2eid(self, path_obj: Union[str, Path], query_type=None) -> util.Listable
Parameters
----------
path_obj : str, pathlib.Path, list
Local path or list of local paths
Local path or list of local paths.
query_type : str
If set to 'remote', will force database connection
If set to 'remote', will force database connection.
Returns
-------
str, list
An eid or list of eids
An eid or list of eids.
"""
# If path_obj is a list recurse through it and return a list
if isinstance(path_obj, list):
4 changes: 2 additions & 2 deletions one/converters.py
@@ -187,12 +187,12 @@ def path2eid(self, path_obj):
Parameters
----------
path_obj : pathlib.Path, str
Local path or list of local paths
Local path or list of local paths.
Returns
-------
eid, list
Experiment ID (eid) string or list of eids
Experiment ID (eid) string or list of eids.
"""
# else ensure the path ends with mouse, date, number
session_path = get_session_path(path_obj)
35 changes: 19 additions & 16 deletions one/params.py
@@ -122,22 +122,25 @@ def setup(client=None, silent=False, make_default=None, username=None, cache_dir
cache_map = iopar.read(f'{_PAR_ID_STR}/{_CLIENT_ID_STR}', {'CLIENT_MAP': dict()})

if not silent:
prompt = 'Param %s, current value is ["%s"]:'
par = iopar.as_dict(par_default)
for k in par.keys():
# Iterate through non-password pars
for k in filter(lambda k: 'PWD' not in k, par.keys()):
cpar = _get_current_par(k, par_current)
# Prompt for database URL; skip if client url already provided
if k == 'ALYX_URL':
if not client:
par[k] = input(f'Param {k}, current value is ["{str(cpar)}"]:') or cpar
if '://' not in par[k]:
par[k] = 'https://' + par[k]
url_parsed = urlsplit(par[k])
if not (url_parsed.netloc and re.match('https?', url_parsed.scheme)):
raise ValueError(f'{k} must be valid HTTP URL')
# Prompt for database and FI URL
if 'URL' in k:
if k == 'ALYX_URL' and client:
continue # skip if client url already provided
par[k] = input(prompt % (k, cpar)).strip().rstrip('/') or cpar
if '://' not in par[k]:
par[k] = 'https://' + par[k]
url_parsed = urlsplit(par[k])
if not (url_parsed.netloc and re.match('https?', url_parsed.scheme)):
raise ValueError(f'{k} must be valid HTTP URL')
if k == 'ALYX_URL':
client = par[k]
# Iterate through other non-password pars
elif 'PWD' not in k:
par[k] = input(f'Param {k}, current value is ["{str(cpar)}"]:') or cpar
else:
par[k] = input(prompt % (k, cpar)).strip() or cpar

cpar = _get_current_par('HTTP_DATA_SERVER_PWD', par_current)
prompt = f'Enter the FlatIron HTTP password for {par["HTTP_DATA_SERVER_LOGIN"]} '\
@@ -179,12 +182,12 @@ def setup(client=None, silent=False, make_default=None, username=None, cache_dir
print('SETUP ABANDONED. Please re-run.')
return par_current
else:
par = par_current
if any(v for k, v in cache_map.CLIENT_MAP.items() if k != client_key):
warnings.warn('Warning: the directory provided is already a cache for another URL.')
# Precedence: user provided cache_dir; previously defined; the default location
default_cache_dir = Path(CACHE_DIR_DEFAULT, client_key)
cache_dir = cache_dir or cache_map.CLIENT_MAP.get(client_key, default_cache_dir)
par = par_current
if any(v for k, v in cache_map.CLIENT_MAP.items() if k != client_key and v == cache_dir):
warnings.warn('Warning: the directory provided is already a cache for another URL.')

# Update and save parameters
Path(cache_dir).mkdir(exist_ok=True, parents=True)
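The reworked prompt loop above treats every *_URL parameter the same way: the response is stripped, any trailing slash is removed (the HTTP data server hotfix), a missing scheme defaults to https, and non-HTTP(S) URLs are rejected. A standalone sketch of that normalisation outside the interactive prompt; the example URLs are illustrative:

```python
import re
from urllib.parse import urlsplit


def normalise_url(answer: str, current: str = '') -> str:
    """Mirror the prompt-loop handling: strip, drop trailing slash, validate scheme."""
    url = answer.strip().rstrip('/') or current
    if '://' not in url:
        url = 'https://' + url
    parsed = urlsplit(url)
    if not (parsed.netloc and re.match('https?', parsed.scheme)):
        raise ValueError('URL must be a valid HTTP URL')
    return url


print(normalise_url('ibl.flatironinstitute.org/public/'))  # https://ibl.flatironinstitute.org/public
try:
    normalise_url('ftp://foo.bar.org')
except ValueError as ex:
    print(ex)  # rejected, as exercised in test_params.py
```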
7 changes: 4 additions & 3 deletions one/tests/test_one.py
@@ -1249,14 +1249,15 @@ def test_list_aggregates(self):
"""Test OneAlyx.list_aggregates"""
# Test listing by relation
datasets = self.one.list_aggregates('subjects')
self.assertTrue(all(datasets['rel_path'].str.startswith('cortexlab/Subjects')))
self.assertTrue(all(datasets['rel_path'].str.startswith('aggregates/Subjects')))
self.assertIn('exists_aws', datasets.columns)
self.assertIn('session_path', datasets.columns)
self.assertTrue(all(datasets['session_path'] == ''))
self.assertTrue(self.one.list_aggregates('foobar').empty)
# Test filtering with an identifier
datasets = self.one.list_aggregates('subjects', 'ZM_1085')
self.assertTrue(all(datasets['rel_path'].str.startswith('cortexlab/Subjects/ZM_1085')))
expected = 'aggregates/Subjects/mainenlab/ZM_1085'
self.assertTrue(all(datasets['rel_path'].str.startswith(expected)))
self.assertTrue(self.one.list_aggregates('subjects', 'foobar').empty)
# Test that additional parts of data path are correctly removed
# specifically /public in FI openalyx file rec
@@ -1292,7 +1293,7 @@ def test_load_aggregate(self):

# Touch a file to ensure that we do not try downloading
expected = self.one.cache_dir.joinpath(
'cortexlab/Subjects/ZM_1085/_ibl_subjectTraining.table.pqt')
'aggregates/Subjects/mainenlab/ZM_1085/_ibl_subjectTraining.table.pqt')
expected.parent.mkdir(parents=True), expected.touch()

# Test loading with different input dataset formats
2 changes: 1 addition & 1 deletion one/tests/test_params.py
@@ -61,7 +61,7 @@ def test_setup(self, _):
self.assertNotEqual(cache.ALYX_LOGIN, 'mistake')

# Check that raises ValueError when bad URL provided
self.url = 'ftp://'
self.url = 'ftp://foo.bar.org'
with self.assertRaises(ValueError), mock.patch('one.params.input', new=self._mock_input):
one.params.setup()

10 changes: 6 additions & 4 deletions one/webclient.py
@@ -791,14 +791,16 @@ def download_cache_tables(self, source=None, destination=None):
-------
List of parquet table file paths.
"""
# query the database for the latest cache; expires=None overrides cached response
if not self.is_logged_in:
self.authenticate()
source = str(source or f'{self.base_url}/cache.zip')
destination = destination or self.cache_dir
Path(destination).mkdir(exist_ok=True, parents=True)

headers = self._headers if source.startswith(self.base_url) else None
headers = None
if source.startswith(self.base_url):
if not self.is_logged_in:
self.authenticate()
headers = self._headers

with tempfile.TemporaryDirectory(dir=destination) as tmp:
file = http_download_file(source,
headers=headers,
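The `download_cache_tables` change defers authentication: an Alyx token is only requested when the cache archive is actually served from the client's own base URL, not when it comes from elsewhere (e.g. a public HTTP server). A hedged sketch of the pattern with a made-up helper; the attribute names mirror those used in the hunk above:

```python
def cache_download_headers(client, source: str):
    """Return auth headers only when the download targets the client's base URL.

    ``client`` is assumed to expose base_url, is_logged_in, authenticate() and
    _headers, as the Alyx client instance does in the hunk above.
    """
    if not source.startswith(client.base_url):
        return None  # e.g. a public cache URL needs no Alyx credentials
    if not client.is_logged_in:
        client.authenticate()
    return client._headers
```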
