From f28ac144aa8d30ac0c1a051e1d90950990345b5c Mon Sep 17 00:00:00 2001
From: Ben Young
Date: Wed, 15 Nov 2023 14:59:05 -0500
Subject: [PATCH 1/2] avoid SettingWithCopyWarning

---
 esupy/dqi.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/esupy/dqi.py b/esupy/dqi.py
index 971e353..9873e62 100644
--- a/esupy/dqi.py
+++ b/esupy/dqi.py
@@ -119,10 +119,10 @@ def get_weighted_average(df, data_col, weight_col, agg_cols):
     df_agg[data_col] = get_weighted_average(df, data_col, weight_col, agg_cols)
     """
-    df.loc[:, '_data_times_weight'] = df[data_col] * df[weight_col]
-    df.loc[:, '_weight_where_notnull'] = df[weight_col] * pd.notnull(df[data_col])
-    calc_cols = ['_weight_where_notnull', '_data_times_weight']
-    df[calc_cols] = df[calc_cols].applymap(float)
+    df = (df.assign(_data_times_weight = lambda x: x[data_col] * x[weight_col])
+          .assign(_weight_where_notnull = lambda x:
+                  x[weight_col] * pd.notnull(x[data_col]))
+          )
     g = df.groupby(agg_cols)
     wt_avg = np.divide(g['_data_times_weight'].sum(),
                        g['_weight_where_notnull'].sum(),
                        out=np.zeros_like(g['_data_times_weight'].sum()),

From fef05a709ba0bb4813cc9a39c10525f35c96ebae Mon Sep 17 00:00:00 2001
From: catherinebirney
Date: Fri, 8 Dec 2023 11:53:32 -0700
Subject: [PATCH 2/2] subset list of data available on data commons with
 file_name instead of "name" so we can specify a version and/or hash

---
 esupy/processed_data_mgmt.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/esupy/processed_data_mgmt.py b/esupy/processed_data_mgmt.py
index fbd5097..260b363 100644
--- a/esupy/processed_data_mgmt.py
+++ b/esupy/processed_data_mgmt.py
@@ -70,10 +70,6 @@ def download_from_remote(file_meta, paths, **kwargs):
     base_url = paths.remote_path + file_meta.tool + '/'
     if file_meta.category != '':
         base_url = base_url + file_meta.category + '/'
-    ## TODO: re-implement URL handling via f-strings and/or urllib
-    # base_url = f'{paths.remote_path}/{file_meta.tool}'
-    # if not file_meta.category == '':
-    #     base_url = f'{base_url}/{file_meta.category}'
     files = get_most_recent_from_index(file_meta, paths)
     if files is None:
         log.info(f'{file_meta.name_data} not found in {base_url}')
@@ -173,7 +169,9 @@ def get_most_recent_from_index(file_meta, paths):
     if file_df is None:
         return None
     file_df = parse_data_commons_index(file_df)
-    df = file_df[file_df['name'].str.startswith(file_meta.name_data)]
+    # subset using "file_name" instead of "name" to work when a user
+    # includes a GitHub version and hash
+    df = file_df[file_df['file_name'].str.startswith(file_meta.name_data)]
     df_ext = df[df['ext'] == file_meta.ext]
     if len(df_ext) == 0:
         return None
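
Note on PATCH 1/2 (editor's sketch, not part of the patch): the snippet below mirrors the assign()-based rework of get_weighted_average() on a toy frame. The column names and data are invented, and the final guarded divide is adapted from the np.divide(..., out=np.zeros_like(...)) call visible in the hunk context, here applied to plain NumPy arrays, so treat it as an illustration of the idea rather than the library code.

import numpy as np
import pandas as pd

# Toy frame: 'sector', 'dqi', and 'flow' are made-up names for this note.
df = pd.DataFrame({'sector': ['A', 'A', 'B', 'B'],
                   'dqi':    [3.0, np.nan, 2.0, 4.0],
                   'flow':   [10.0, 5.0, 1.0, 3.0]})
data_col, weight_col, agg_cols = 'dqi', 'flow', ['sector']

# assign() builds the temporary columns on a fresh DataFrame, so nothing is
# written through a possible view of the caller's frame and pandas has no
# reason to raise SettingWithCopyWarning.
df = (df.assign(_data_times_weight=lambda x: x[data_col] * x[weight_col])
        .assign(_weight_where_notnull=lambda x:
                x[weight_col] * pd.notnull(x[data_col])))

g = df.groupby(agg_cols)
num = g['_data_times_weight'].sum().to_numpy()
den = g['_weight_where_notnull'].sum().to_numpy()
# Guarded divide: groups with no usable weight fall back to 0 instead of NaN.
wt_avg = np.divide(num, den, out=np.zeros_like(num), where=den != 0)
print(wt_avg)  # [3.  3.5] -> A ignores its NaN row, B is (2*1 + 4*3) / 4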
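
Note on PATCH 2/2 (editor's sketch, not part of the patch): a toy illustration of why get_most_recent_from_index() now filters the Data Commons index on 'file_name' rather than 'name'. The index contents below are invented; one plausible reading of the commit message is that when name_data itself carries a version and/or hash suffix, str.startswith() can only match against the full file name.

import pandas as pd

# Invented stand-in for the parsed index; real values come from
# parse_data_commons_index() and may differ.
file_df = pd.DataFrame({
    'name':      ['Example_FBA_2016', 'Example_FBA_2016'],
    'file_name': ['Example_FBA_2016_v1.1.0_abc1234.parquet',
                  'Example_FBA_2016_v1.2.0_def5678.parquet'],
    'ext':       ['parquet', 'parquet'],
})

# A user pinning a version (and optionally a hash) in name_data:
name_data = 'Example_FBA_2016_v1.2.0'

# Pre-patch: 'name' never starts with the versioned string -> empty frame.
old = file_df[file_df['name'].str.startswith(name_data)]
# Patched: the full file name does -> the pinned file is found.
new = file_df[file_df['file_name'].str.startswith(name_data)]

print(len(old), len(new))        # 0 1
print(new['file_name'].iloc[0])  # Example_FBA_2016_v1.2.0_def5678.parquet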