From bfd532bef68fc2e3dabbfbcc3facd5a979522d10 Mon Sep 17 00:00:00 2001 From: Jan Sebastian Rothe Date: Wed, 30 Oct 2024 20:02:22 +0100 Subject: [PATCH 1/3] Removed read latest function, use ssb-fagfunksjoner instead. --- src/arbmark/__init__.py | 1 - src/arbmark/functions/files.py | 71 ------------------------------ tests/test_data/dataset_v1.parquet | 0 tests/test_data/dataset_v2.parquet | 0 tests/test_data/dataset_v3.parquet | 0 tests/test_files.py | 13 ------ 6 files changed, 85 deletions(-) delete mode 100644 src/arbmark/functions/files.py delete mode 100644 tests/test_data/dataset_v1.parquet delete mode 100644 tests/test_data/dataset_v2.parquet delete mode 100644 tests/test_data/dataset_v3.parquet delete mode 100644 tests/test_files.py diff --git a/src/arbmark/__init__.py b/src/arbmark/__init__.py index b6200a9..46e3545 100644 --- a/src/arbmark/__init__.py +++ b/src/arbmark/__init__.py @@ -2,7 +2,6 @@ from arbmark.functions.aggregation import proc_sums from arbmark.functions.categorize_ranges import categorize_ranges -from arbmark.functions.files import read_latest from arbmark.functions.interval import pinterval from arbmark.functions.merge import indicate_merge from arbmark.functions.quarter import first_last_date_quarter diff --git a/src/arbmark/functions/files.py b/src/arbmark/functions/files.py deleted file mode 100644 index 97a9f46..0000000 --- a/src/arbmark/functions/files.py +++ /dev/null @@ -1,71 +0,0 @@ -# Glob for Unix style pathname pattern expansion. -import glob - -# OS for interacting with the operating system -import os - -# Dapla for cloud file client -from dapla import FileClient - - -def read_latest(path: str, name: str, dottype: str = ".parquet") -> str | None: - """Finds the latest version of a specified file in a given directory and returns its name. - - This function searches for files in the specified path that match the given name and file - type, sorts them by modification time, and returns the path of the latest version. If no - files are found, it returns None. - - Args: - path (str): The directory path where the files are located. - name (str): The base name of the files to search for. - dottype (str): The file extension to look for. Defaults to ".parquet". - - Returns: - Optional[str]: The path of the latest version of the file if found, None otherwise. - """ - # Inform the user about the file versions being checked - print(f"Checking versions of file: {name}") - - # Define the pattern to search for files based on the provided name and file type - file_name_pattern = f"{name}*{dottype}" - - # Join directory and file name - file_path = os.path.join(path, file_name_pattern) - - # If path is a google cloud bucket - if path[:4] in ["ssb-", "gs:/"]: - - # Get filesystem - fs = FileClient.get_gcs_file_system() - - # Use glob to find all files matching the pattern - file_list = fs.glob(file_path) - - else: - - # Use glob to find all files matching the pattern - file_list = glob.glob(file_path) - - # Sorting key based on file version - file_versions = sorted( - file_list, - key=lambda x: int(x.split("_v")[-1].split(".")[0]), - ) - - # Check if any files were found. If not, inform the user and return None - if not file_versions: - print("No files found.") - return None - - # Select the last file from the sorted list as it is the most recently modified one - latest_file = os.path.normpath(file_versions[-1]) - - # Extract the name of the latest file for reporting. - latest_file_name = os.path.basename(latest_file) - - # Inform the user about the number of versions found and the latest one being read - print(f"Found {len(file_versions)} version(s).") - print(f"Reading latest version: {latest_file_name}") - - # Return the path of the latest file - return latest_file diff --git a/tests/test_data/dataset_v1.parquet b/tests/test_data/dataset_v1.parquet deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_data/dataset_v2.parquet b/tests/test_data/dataset_v2.parquet deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_data/dataset_v3.parquet b/tests/test_data/dataset_v3.parquet deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_files.py b/tests/test_files.py deleted file mode 100644 index a146a33..0000000 --- a/tests/test_files.py +++ /dev/null @@ -1,13 +0,0 @@ -import os - -from arbmark import read_latest - - -def test_read_latest() -> None: - cwd = os.getcwd() - result = read_latest( - path=os.path.normpath(f"{cwd}/tests/test_data"), name="dataset" - ) - expected = os.path.normpath(f"{cwd}/tests/test_data/dataset_v3.parquet") - - assert result == expected, f"Expected {expected}, but got {result}." From 75b3e47b577828af4ed05d2c6d3d217fc23af4a6 Mon Sep 17 00:00:00 2001 From: Jan Sebastian Rothe Date: Wed, 30 Oct 2024 20:24:04 +0100 Subject: [PATCH 2/3] Changes to sector groups --- src/arbmark/groups/sector.py | 17 +++++------------ tests/test_sector.py | 5 ++--- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/arbmark/groups/sector.py b/src/arbmark/groups/sector.py index c2a44cd..4947a68 100644 --- a/src/arbmark/groups/sector.py +++ b/src/arbmark/groups/sector.py @@ -9,44 +9,37 @@ import pandas as pd if TYPE_CHECKING: - PdSeriesInt = pd.Series[int] # type: ignore[misc] PdSeriesStr = pd.Series[str] # type: ignore[misc] - NpArrayInt = npt.NDArray[np.int_] # type: ignore[misc] NpArrayStr = npt.NDArray[np.str_] # type: ignore[misc] else: - PdSeriesInt = pd.Series PdSeriesStr = pd.Series - NpArrayInt = npt.NDArray NpArrayStr = npt.NDArray def sektor2_grp( - sektor: PdSeriesStr, undersektor: PdSeriesStr, display: str = "label" + sektor: PdSeriesStr,display: str = "label" ) -> NpArrayStr: - """Categorize a pandas Series of sectors and subsectors into predefined groups. + """Categorize a pandas Series of sectors into predefined groups. Parameters: sektor: A pandas Series containing the sector codes. - undersektor: A pandas Series containing the subsector codes. display: If 'label', returns group labels; if 'number', returns keys; for any other string, returns a combination of keys and labels. Returns: - A numpy Array where the original sector and subsectors are replaced by group labels or keys. + A numpy Array where the original sector is replaced by group labels or keys. """ # Define the conditions for each group conditions = [ (sektor == "6100").to_numpy(), - np.logical_and(sektor == "6500", undersektor != "007"), - np.logical_and(sektor == "6500", undersektor == "007"), + (sektor == "6500").to_numpy(), (sektor == "1510").to_numpy(), (sektor == "1520").to_numpy(), ] groups = { "110": "Statlig forvaltning", - "550": "Kommunal forvaltning", - "510": "Fylkeskommunal forvaltning", + "550": "Kommunal forvaltning" "660": "Kommunale foretak med ubegrenset ansvar", "680": "Kommunalt eide aksjeselskaper m.v.", } diff --git a/tests/test_sector.py b/tests/test_sector.py index 9231386..40daf0a 100644 --- a/tests/test_sector.py +++ b/tests/test_sector.py @@ -10,20 +10,19 @@ def sample_df() -> pd.DataFrame: return pd.DataFrame( { "sektor": np.random.choice(["6100", "6500", "1510", "1520"], size=100), - "undersektor": np.random.choice(["007", "008", "009"], size=100), } ) def test_sektor2_grp(sample_df): df = sample_df - df["sektor2_grp"] = sektor2_grp(df["sektor"], df["undersektor"]).astype(str) + df["sektor2_grp"] = sektor2_grp(df["sektor"]).astype(str) assert not df["sektor2_grp"].isnull().any(), "Sector 2 group contains null values" def test_sektor2_grp_number(sample_df): df = sample_df df["sektor2_grp"] = sektor2_grp( - df["sektor"], df["undersektor"], display="number" + df["sektor"], display="number" ).astype(str) assert not df["sektor2_grp"].isnull().any(), "Sector 2 group contains null values" From 503772ae9f5d3e634f26ca03ad5fbb9704a78b33 Mon Sep 17 00:00:00 2001 From: Jan Sebastian Rothe Date: Wed, 30 Oct 2024 20:40:07 +0100 Subject: [PATCH 3/3] update --- pyproject.toml | 2 +- src/arbmark/groups/sector.py | 6 ++---- tests/test_sector.py | 4 +--- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index be9af71..a53a8d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ssb-arbmark-fagfunksjoner" -version = "0.0.24" +version = "0.0.25" description = "SSB Arbeidsmarked og lønn Fag-fellesfunksjoner" authors = ["Jan Sebastian Rothe "] license = "MIT" diff --git a/src/arbmark/groups/sector.py b/src/arbmark/groups/sector.py index 4947a68..d7fe78c 100644 --- a/src/arbmark/groups/sector.py +++ b/src/arbmark/groups/sector.py @@ -16,9 +16,7 @@ NpArrayStr = npt.NDArray -def sektor2_grp( - sektor: PdSeriesStr,display: str = "label" -) -> NpArrayStr: +def sektor2_grp(sektor: PdSeriesStr, display: str = "label") -> NpArrayStr: """Categorize a pandas Series of sectors into predefined groups. Parameters: @@ -39,7 +37,7 @@ def sektor2_grp( groups = { "110": "Statlig forvaltning", - "550": "Kommunal forvaltning" + "550": "Kommunal forvaltning", "660": "Kommunale foretak med ubegrenset ansvar", "680": "Kommunalt eide aksjeselskaper m.v.", } diff --git a/tests/test_sector.py b/tests/test_sector.py index 40daf0a..dae37d1 100644 --- a/tests/test_sector.py +++ b/tests/test_sector.py @@ -22,7 +22,5 @@ def test_sektor2_grp(sample_df): def test_sektor2_grp_number(sample_df): df = sample_df - df["sektor2_grp"] = sektor2_grp( - df["sektor"], display="number" - ).astype(str) + df["sektor2_grp"] = sektor2_grp(df["sektor"], display="number").astype(str) assert not df["sektor2_grp"].isnull().any(), "Sector 2 group contains null values"