From bfd532bef68fc2e3dabbfbcc3facd5a979522d10 Mon Sep 17 00:00:00 2001
From: Jan Sebastian Rothe <jsr@ssb.no>
Date: Wed, 30 Oct 2024 20:02:22 +0100
Subject: [PATCH 1/3] Remove read_latest function; use ssb-fagfunksjoner
 instead.

---
 src/arbmark/__init__.py            |  1 -
 src/arbmark/functions/files.py     | 71 ------------------------------
 tests/test_data/dataset_v1.parquet |  0
 tests/test_data/dataset_v2.parquet |  0
 tests/test_data/dataset_v3.parquet |  0
 tests/test_files.py                | 13 ------
 6 files changed, 85 deletions(-)
 delete mode 100644 src/arbmark/functions/files.py
 delete mode 100644 tests/test_data/dataset_v1.parquet
 delete mode 100644 tests/test_data/dataset_v2.parquet
 delete mode 100644 tests/test_data/dataset_v3.parquet
 delete mode 100644 tests/test_files.py

diff --git a/src/arbmark/__init__.py b/src/arbmark/__init__.py
index b6200a9..46e3545 100644
--- a/src/arbmark/__init__.py
+++ b/src/arbmark/__init__.py
@@ -2,7 +2,6 @@
 
 from arbmark.functions.aggregation import proc_sums
 from arbmark.functions.categorize_ranges import categorize_ranges
-from arbmark.functions.files import read_latest
 from arbmark.functions.interval import pinterval
 from arbmark.functions.merge import indicate_merge
 from arbmark.functions.quarter import first_last_date_quarter
diff --git a/src/arbmark/functions/files.py b/src/arbmark/functions/files.py
deleted file mode 100644
index 97a9f46..0000000
--- a/src/arbmark/functions/files.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Glob for Unix style pathname pattern expansion.
-import glob
-
-# OS for interacting with the operating system
-import os
-
-# Dapla for cloud file client
-from dapla import FileClient
-
-
-def read_latest(path: str, name: str, dottype: str = ".parquet") -> str | None:
-    """Finds the latest version of a specified file in a given directory and returns its name.
-
-    This function searches for files in the specified path that match the given name and file
-    type, sorts them by modification time, and returns the path of the latest version. If no
-    files are found, it returns None.
-
-    Args:
-        path (str): The directory path where the files are located.
-        name (str): The base name of the files to search for.
-        dottype (str): The file extension to look for. Defaults to ".parquet".
-
-    Returns:
-        Optional[str]: The path of the latest version of the file if found, None otherwise.
-    """
-    # Inform the user about the file versions being checked
-    print(f"Checking versions of file: {name}")
-
-    # Define the pattern to search for files based on the provided name and file type
-    file_name_pattern = f"{name}*{dottype}"
-
-    # Join directory and file name
-    file_path = os.path.join(path, file_name_pattern)
-
-    # If path is a google cloud bucket
-    if path[:4] in ["ssb-", "gs:/"]:
-
-        # Get filesystem
-        fs = FileClient.get_gcs_file_system()
-
-        # Use glob to find all files matching the pattern
-        file_list = fs.glob(file_path)
-
-    else:
-
-        # Use glob to find all files matching the pattern
-        file_list = glob.glob(file_path)
-
-    # Sorting key based on file version
-    file_versions = sorted(
-        file_list,
-        key=lambda x: int(x.split("_v")[-1].split(".")[0]),
-    )
-
-    # Check if any files were found. If not, inform the user and return None
-    if not file_versions:
-        print("No files found.")
-        return None
-
-    # Select the last file from the sorted list as it is the most recently modified one
-    latest_file = os.path.normpath(file_versions[-1])
-
-    # Extract the name of the latest file for reporting.
-    latest_file_name = os.path.basename(latest_file)
-
-    # Inform the user about the number of versions found and the latest one being read
-    print(f"Found {len(file_versions)} version(s).")
-    print(f"Reading latest version: {latest_file_name}")
-
-    # Return the path of the latest file
-    return latest_file
diff --git a/tests/test_data/dataset_v1.parquet b/tests/test_data/dataset_v1.parquet
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/test_data/dataset_v2.parquet b/tests/test_data/dataset_v2.parquet
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/test_data/dataset_v3.parquet b/tests/test_data/dataset_v3.parquet
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/test_files.py b/tests/test_files.py
deleted file mode 100644
index a146a33..0000000
--- a/tests/test_files.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import os
-
-from arbmark import read_latest
-
-
-def test_read_latest() -> None:
-    cwd = os.getcwd()
-    result = read_latest(
-        path=os.path.normpath(f"{cwd}/tests/test_data"), name="dataset"
-    )
-    expected = os.path.normpath(f"{cwd}/tests/test_data/dataset_v3.parquet")
-
-    assert result == expected, f"Expected {expected}, but got {result}."

From 75b3e47b577828af4ed05d2c6d3d217fc23af4a6 Mon Sep 17 00:00:00 2001
From: Jan Sebastian Rothe <jsr@ssb.no>
Date: Wed, 30 Oct 2024 20:24:04 +0100
Subject: [PATCH 2/3] Remove undersektor from sektor2_grp; drop sector group 510

---
 src/arbmark/groups/sector.py | 17 +++++------------
 tests/test_sector.py         |  5 ++---
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/src/arbmark/groups/sector.py b/src/arbmark/groups/sector.py
index c2a44cd..4947a68 100644
--- a/src/arbmark/groups/sector.py
+++ b/src/arbmark/groups/sector.py
@@ -9,44 +9,37 @@
 import pandas as pd
 
 if TYPE_CHECKING:
-    PdSeriesInt = pd.Series[int]  # type: ignore[misc]
     PdSeriesStr = pd.Series[str]  # type: ignore[misc]
-    NpArrayInt = npt.NDArray[np.int_]  # type: ignore[misc]
     NpArrayStr = npt.NDArray[np.str_]  # type: ignore[misc]
 else:
-    PdSeriesInt = pd.Series
     PdSeriesStr = pd.Series
-    NpArrayInt = npt.NDArray
     NpArrayStr = npt.NDArray
 
 
 def sektor2_grp(
-    sektor: PdSeriesStr, undersektor: PdSeriesStr, display: str = "label"
+    sektor: PdSeriesStr,display: str = "label"
 ) -> NpArrayStr:
-    """Categorize a pandas Series of sectors and subsectors into predefined groups.
+    """Categorize a pandas Series of sectors into predefined groups.
 
     Parameters:
         sektor: A pandas Series containing the sector codes.
-        undersektor: A pandas Series containing the subsector codes.
         display: If 'label', returns group labels; if 'number', returns keys;
                        for any other string, returns a combination of keys and labels.
 
     Returns:
-        A numpy Array where the original sector and subsectors are replaced by group labels or keys.
+        A numpy Array where the original sector is replaced by group labels or keys.
     """
     # Define the conditions for each group
     conditions = [
         (sektor == "6100").to_numpy(),
-        np.logical_and(sektor == "6500", undersektor != "007"),
-        np.logical_and(sektor == "6500", undersektor == "007"),
+        (sektor == "6500").to_numpy(),
         (sektor == "1510").to_numpy(),
         (sektor == "1520").to_numpy(),
     ]
 
     groups = {
         "110": "Statlig forvaltning",
-        "550": "Kommunal forvaltning",
-        "510": "Fylkeskommunal forvaltning",
+        "550": "Kommunal forvaltning"
         "660": "Kommunale foretak med ubegrenset ansvar",
         "680": "Kommunalt eide aksjeselskaper m.v.",
     }
diff --git a/tests/test_sector.py b/tests/test_sector.py
index 9231386..40daf0a 100644
--- a/tests/test_sector.py
+++ b/tests/test_sector.py
@@ -10,20 +10,19 @@ def sample_df() -> pd.DataFrame:
     return pd.DataFrame(
         {
             "sektor": np.random.choice(["6100", "6500", "1510", "1520"], size=100),
-            "undersektor": np.random.choice(["007", "008", "009"], size=100),
         }
     )
 
 
 def test_sektor2_grp(sample_df):
     df = sample_df
-    df["sektor2_grp"] = sektor2_grp(df["sektor"], df["undersektor"]).astype(str)
+    df["sektor2_grp"] = sektor2_grp(df["sektor"]).astype(str)
     assert not df["sektor2_grp"].isnull().any(), "Sector 2 group contains null values"
 
 
 def test_sektor2_grp_number(sample_df):
     df = sample_df
     df["sektor2_grp"] = sektor2_grp(
-        df["sektor"], df["undersektor"], display="number"
+        df["sektor"], display="number"
     ).astype(str)
     assert not df["sektor2_grp"].isnull().any(), "Sector 2 group contains null values"

From 503772ae9f5d3e634f26ca03ad5fbb9704a78b33 Mon Sep 17 00:00:00 2001
From: Jan Sebastian Rothe <jsr@ssb.no>
Date: Wed, 30 Oct 2024 20:40:07 +0100
Subject: [PATCH 3/3] Bump version to 0.0.25; fix missing comma in
 sektor2_grp groups and reformat

---
 pyproject.toml               | 2 +-
 src/arbmark/groups/sector.py | 6 ++----
 tests/test_sector.py         | 4 +---
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index be9af71..a53a8d2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ssb-arbmark-fagfunksjoner"
-version = "0.0.24"
+version = "0.0.25"
 description = "SSB Arbeidsmarked og lønn Fag-fellesfunksjoner"
 authors = ["Jan Sebastian Rothe <jsr@ssb.no>"]
 license = "MIT"
diff --git a/src/arbmark/groups/sector.py b/src/arbmark/groups/sector.py
index 4947a68..d7fe78c 100644
--- a/src/arbmark/groups/sector.py
+++ b/src/arbmark/groups/sector.py
@@ -16,9 +16,7 @@
     NpArrayStr = npt.NDArray
 
 
-def sektor2_grp(
-    sektor: PdSeriesStr,display: str = "label"
-) -> NpArrayStr:
+def sektor2_grp(sektor: PdSeriesStr, display: str = "label") -> NpArrayStr:
     """Categorize a pandas Series of sectors into predefined groups.
 
     Parameters:
@@ -39,7 +37,7 @@ def sektor2_grp(
 
     groups = {
         "110": "Statlig forvaltning",
-        "550": "Kommunal forvaltning"
+        "550": "Kommunal forvaltning",
         "660": "Kommunale foretak med ubegrenset ansvar",
         "680": "Kommunalt eide aksjeselskaper m.v.",
     }
diff --git a/tests/test_sector.py b/tests/test_sector.py
index 40daf0a..dae37d1 100644
--- a/tests/test_sector.py
+++ b/tests/test_sector.py
@@ -22,7 +22,5 @@ def test_sektor2_grp(sample_df):
 
 def test_sektor2_grp_number(sample_df):
     df = sample_df
-    df["sektor2_grp"] = sektor2_grp(
-        df["sektor"], display="number"
-    ).astype(str)
+    df["sektor2_grp"] = sektor2_grp(df["sektor"], display="number").astype(str)
     assert not df["sektor2_grp"].isnull().any(), "Sector 2 group contains null values"