From 7873db2c434fb71cb83d83d9b7f14db03ef3a8a2 Mon Sep 17 00:00:00 2001 From: Jochem Smit Date: Wed, 21 Feb 2024 14:21:00 +0100 Subject: [PATCH] feat: get remote index dataframe --- hdxms_datasets/datavault.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/hdxms_datasets/datavault.py b/hdxms_datasets/datavault.py index b0ce513..3150f70 100644 --- a/hdxms_datasets/datavault.py +++ b/hdxms_datasets/datavault.py @@ -6,6 +6,8 @@ from functools import cached_property from pathlib import Path from typing import Optional, Union +import warnings +import pandas as pd import requests import yaml @@ -26,22 +28,33 @@ def __init__( self.cache_dir.mkdir(exist_ok=True, parents=True) self.remote_url = remote_url + self.remote_index: Optional[pd.DataFrame] = None def filter(self, *spec: dict): # filters list of available datasets raise NotImplementedError("Not yet implemented") - @cached_property - def remote_index(self) -> list[str]: - """List of available datasets in the remote database""" + def get_index(self, on_error="ignore") -> Optional[pd.DataFrame]: + """Retrieves the index of available datasets - url = urllib.parse.urljoin(self.remote_url, "index.txt") - response = requests.get(url) - if response.ok: - index = response.text.split("\n")[1:] - return index - else: - return [] + on success, returns the index dataframe and + stores as `remote_index` attribute. + + """ + + url = urllib.parse.urljoin(self.remote_url, "index.csv") + try: + index_df = pd.read_csv(url) + self.remote_index = index_df + return index_df + + except urllib.error.HTTPError as err: + if on_error == "ignore": + pass + elif on_error == "warn": + warnings.warn(f"Error loading index: {err}") + else: + raise err @property def datasets(self) -> list[str]: