Merge pull request #27 from statisticsnorway/klass-function

Klass function
statisticsnorway · Feb 29, 2024 · 80c65a0 · 80c65a0
2 parents de706a0 + a70de68
commit 80c65a0
Show file tree

Hide file tree

Showing 9 changed files with 136 additions and 119 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ssb-arbmark-fagfunksjoner"
-version = "0.0.8"
+version = "0.0.9"
 description = "SSB Arbeidsmarked og lønn Fag-fellesfunksjoner"
 authors = ["Jan Sebastian Rothe <[email protected]>"]
 license = "MIT"
@@ -25,6 +25,7 @@ holidays = "^0.37"
 pandas-stubs = "^2.1.1.230928"
 poetry-plugin-export = "^1.6.0"
 dapla-toolbelt = "^2.0.6"
+ssb-klass-python = "^0.0.9"
 
 [tool.poetry.group.dev.dependencies]
 pygments = ">=2.10.0"

diff --git a/src/ssb_arbmark_fagfunksjoner/functions.py b/src/ssb_arbmark_fagfunksjoner/functions.py
@@ -570,11 +570,8 @@ def read_latest(path: str, name: str, dottype: str = ".parquet") -> str | None:
     # Join directory and file name
     file_path = os.path.join(path, file_name_pattern)
 
-    # Checking environment
-    wenv = os.environ.get("DAPLA_REGION")
-
-    # If environment is Dapla
-    if wenv == "BIP":
+    # If path is a google cloud bucket
+    if path[:4] in ["ssb-", "gs:/"]:
 
         # Get filesystem
         fs = FileClient.get_gcs_file_system()
@@ -587,14 +584,10 @@ def read_latest(path: str, name: str, dottype: str = ".parquet") -> str | None:
         # Use glob to find all files matching the pattern
         file_list = glob.glob(file_path)
 
-    # Sorting key based on file modification time
+    # Sorting key based on file version
     file_versions = sorted(
         file_list,
-        key=lambda x: (
-            os.path.getmtime(x),
-            # Fallback to filename sorting
-            x,
-        ),
+        key=lambda x: int(x.split("_v")[-1].split(".")[0]),
     )
 
     # Check if any files were found. If not, inform the user and return None

diff --git a/src/ssb_arbmark_fagfunksjoner/groups.py b/src/ssb_arbmark_fagfunksjoner/groups.py
@@ -10,6 +10,9 @@
 # Pandas for table management
 import pandas as pd
 
+# Klass for standard classifications
+from klass.classes.variant import KlassVariant
+
 if TYPE_CHECKING:
     PdSeriesInt = pd.Series[int]  # type: ignore[misc]
     PdSeriesStr = pd.Series[str]  # type: ignore[misc]
@@ -270,93 +273,41 @@ def nace_sn07_47grp(nace_sn07: PdSeriesStr, display: str = "label") -> NpArraySt
     return grouped
 
 
-def nace_sn07_17grp(nace_sn07: PdSeriesStr, display: str = "label") -> NpArrayStr:
-    """Categorize a pandas Series of NACE-codes (SN07) into predefined groups.
+def nace_to_17_groups(nace: PdSeriesStr, label: bool = False) -> PdSeriesStr:
+    """Converts NACE codes in a Pandas Series to their corresponding group codes or labels.
+
+    NACE (Nomenclature of Economic Activities) is the European industry standard classification system.
+    This function maps the first two characters of each NACE code (a level 2 code) to its parent group (level 1) and optionally returns the group's name instead of its code.
 
     Parameters:
-        nace_sn07: A pandas Series containing the NACE-codes.
-        display: If 'label', returns group labels; if 'number', returns keys;
-                       for any other string, returns a combination of keys and labels.
+        nace: A Pandas Series containing NACE codes.
+        label: If True, returns the names of the groups instead of their codes. Defaults to False.
 
     Returns:
-        A numpy Array where the original NACE-codes are replaced by group labels or keys.
-    """
-    # Split the series by space and take the first part
-    first_parts = nace_sn07.str.split(" ", n=1).str[0]
-
-    # Check if the maximum length of the first parts exceeds 2
-    max_length = first_parts.str.len().max()
-
-    # Check if nace codes are already grouped into 47-groups
-    if max_length > 2:
-        print(
-            "Warning: The function first groups the input into the 47 groups standard."
-        )
-        nace_str2_np = pd.to_numeric(nace_sn07_47grp(nace_sn07, "number"))
-    else:
-        # Convert series to numpy array
-        nace_str2_np = pd.to_numeric(first_parts).to_numpy()
-
-    # Define the conditions for each group
-    conditions = [
-        (nace_str2_np == 1),  # 01-03 Jordbruk, skogbruk og fiske
-        np.isin(nace_str2_np, [2, 3]),  # 05-09 Bergverksdrift og utvinning
-        np.logical_and(nace_str2_np >= 4, nace_str2_np <= 16),  # 10-33 Industri
-        np.isin(nace_str2_np, [17, 18]),  # 35-39 Elektrisitet, vann og renovasjon
-        (nace_str2_np == 19),  # 41-43 Bygge- og anleggsvirksomhet
-        np.isin(
-            nace_str2_np, [20, 21, 22]
-        ),  # 45-47 Varehandel, reparasjon av motorvogner
-        np.logical_and(
-            nace_str2_np >= 23, nace_str2_np <= 27
-        ),  # 49-53 Transport og lagring
-        np.isin(nace_str2_np, [28, 29]),  # 55-56 Overnattings- og serveringsvirksomhet
-        np.isin(nace_str2_np, [30, 31]),  # 58-63 Informasjon og kommunikasjon
-        np.isin(nace_str2_np, [32, 33, 34]),  # 64-66 Finansiering og forsikring
-        np.logical_and(
-            nace_str2_np >= 35, nace_str2_np <= 38
-        ),  # 68-75 Teknisk tjenesteyting, eiendomsdrift
-        (nace_str2_np == 39),  # 77-82 Forretningsmessig tjenesteyting
-        (nace_str2_np == 40),  # 84 Off.adm., forsvar, sosialforsikring
-        (nace_str2_np == 41),  # 85 Undervisning
-        np.isin(nace_str2_np, [42, 43]),  # 86-88 Helse- og sosialtjenester
-        np.logical_and(
-            nace_str2_np >= 44, nace_str2_np <= 47
-        ),  # 90-99 Personlig tjenesteyting
-    ]
+        A Pandas Series with the mapped group codes or names, depending on the 'label' argument.
 
-    # Define the group labels with string keys
-    groups = {
-        "01-03": "Jordbruk, skogbruk og fiske",
-        "05-09": "Bergverksdrift og utvinning",
-        "10-33": "Industri",
-        "35-39": "Elektrisitet, vann og renovasjon",
-        "41-43": "Bygge- og anleggsvirksomhet",
-        "45-47": "Varehandel, reparasjon av motorvogner",
-        "49-53": "Transport og lagring",
-        "55-56": "Overnattings- og serveringsvirksomhet",
-        "58-63": "Informasjon og kommunikasjon",
-        "64-66": "Finansiering og forsikring",
-        "68-75": "Teknisk tjenesteyting, eiendomsdrift",
-        "77-82": "Forretningsmessig tjenesteyting",
-        "84": "Off.adm., forsvar, sosialforsikring",
-        "85": "Undervisning",
-        "86-88": "Helse- og sosialtjenester",
-        "90-99": "Personlig tjenesteyting",
-    }
-
-    # Determine and apply the selected format based on the labels parameter
-    if display == "label":
-        results = [str(value) for value in groups.values()]
-        default_code = "Uoppgitt"
-    elif display == "number":
-        results = [str(key) for key in groups.keys()]
-        default_code = "999"
+    Note:
+        The function relies on a predefined mapping ('KlassVariant("1616").data') to perform the conversion.
+        It assumes that this mapping has a specific structure, with 'level', 'code', and 'parentCode' columns, plus a 'name' column when labels are requested.
+    """
+    # Retrieve the predefined mapping data for NACE codes
+    kv = KlassVariant("1616").data
+    # Filter the mapping to include only level 2 categories
+    kv_level = kv.query('level == "2"')
+    # Create a mapping dictionary from NACE codes to their parent codes
+    mapping = kv_level.set_index("code").to_dict()
+    # Map the first two characters of each NACE code in the input series to their corresponding group codes
+    nace_groups = nace.str[0:2].map(mapping["parentCode"])
+
+    if label:
+        # If labels are requested, create a mapping for NACE code names at level 1
+        kv_label = kv.query('level == "1"')
+        mapping_label = kv_label.set_index("code").to_dict()
+        # Map the group codes to their names, filling in 'Uoppgitt' for any missing mappings
+        return nace_groups.map(mapping_label["name"]).fillna("Uoppgitt")
     else:
-        results = [f"{key} {value}" for key, value in groups.items()]
-        default_code = "999 Uoppgitt"
-    grouped = np.select(conditions, results, default=default_code)
-    return grouped
+        # If labels are not requested, return the group codes directly
+        return nace_groups
 
 
 def sektor2_grp(

diff --git a/tests/test_data/dataset1.parquet → tests/test_data/dataset_v1.parquet b/tests/test_data/dataset1.parquet → tests/test_data/dataset_v1.parquet
diff --git a/tests/test_data/dataset2.parquet → tests/test_data/dataset_v2.parquet b/tests/test_data/dataset2.parquet → tests/test_data/dataset_v2.parquet
diff --git a/tests/test_data/dataset3.parquet → tests/test_data/dataset_v3.parquet b/tests/test_data/dataset3.parquet → tests/test_data/dataset_v3.parquet
diff --git a/tests/test_functions.py b/tests/test_functions.py
@@ -230,6 +230,6 @@ def test_read_latest() -> None:
     result = read_latest(
         path=os.path.normpath(f"{cwd}/tests/test_data"), name="dataset"
     )
-    expected = os.path.normpath(f"{cwd}/tests/test_data/dataset3.parquet")
+    expected = os.path.normpath(f"{cwd}/tests/test_data/dataset_v3.parquet")
 
     assert result == expected, f"Expected {expected}, but got {result}."
diff --git a/tests/test_groups.py b/tests/test_groups.py
@@ -5,8 +5,8 @@
 from ssb_arbmark_fagfunksjoner.groups import alder_5grp
 from ssb_arbmark_fagfunksjoner.groups import alder_grp
 from ssb_arbmark_fagfunksjoner.groups import landbakgrunn_grp
-from ssb_arbmark_fagfunksjoner.groups import nace_sn07_17grp
 from ssb_arbmark_fagfunksjoner.groups import nace_sn07_47grp
+from ssb_arbmark_fagfunksjoner.groups import nace_to_17_groups
 from ssb_arbmark_fagfunksjoner.groups import sektor2_grp
 from ssb_arbmark_fagfunksjoner.groups import virk_str_8grp
 
@@ -94,35 +94,20 @@ def test_nace_sn07_47grp_combined(sample_df):
     ), "NACE SN07 47 group contains null values"
 
 
-def test_nace_sn07_17grp(sample_df):
+def test_nace_to_17_groups(sample_df):
     df = sample_df
-    df["nace_sn07_17grp"] = nace_sn07_17grp(df["nace_sn07"]).astype(str)
+    df["nace_sn07_17grp"] = nace_to_17_groups(df["nace_sn07"]).astype(str)
     assert (
-        not df["nace_sn07_17grp"].isnull().any()
-    ), "NACE SN07 17 group contains null values"
-
-
-def test_nace_sn07_17grp_number(sample_df):
-    df = sample_df
-    df["nace_sn07_17grp"] = nace_sn07_17grp(df["nace_sn07"], display="number").astype(
-        str
-    )
-    assert (
-        not df["nace_sn07_17grp"].isnull().any()
-    ), "NACE SN07 17 group contains null values"
+        df["nace_sn07_17grp"].nunique() != 1  # nunique() must be called; the bare method never equals 1
+    ), "NACE 17 group only found 1 group, likely did not find any matches to map"
 
 
-def test_nace_sn07_17grp_combined(sample_df):
+def test_nace_to_17_groups_label(sample_df):
     df = sample_df
-    df["nace_sn07_47grp"] = nace_sn07_47grp(df["nace_sn07"], display="combined").astype(
-        str
-    )
-    df["nace_sn07_17grp"] = nace_sn07_17grp(
-        df["nace_sn07_47grp"], display="combined"
-    ).astype(str)
+    df["nace_sn07_17grp"] = nace_to_17_groups(df["nace_sn07"], label=True).astype(str)
     assert (
         not df["nace_sn07_17grp"].isnull().any()
-    ), "NACE SN07 17 group contains null values"
+    ), "NACE 17 group contains null values"
 
 
 def test_sektor2_grp(sample_df):