Merge pull request #58 from statisticsnorway/clean-nace-17-groups

Clean nace 17 groups
statisticsnorway · Jun 20, 2024 · 158713e · 158713e
2 parents fae40c8 + 4caa500
commit 158713e
Show file tree

Hide file tree

Showing 7 changed files with 1,317 additions and 980 deletions.
diff --git a/docs/reference.md b/docs/reference.md
@@ -129,3 +129,11 @@ arbmark.groups.sector module
    :members:
    :undoc-members:
    :show-inheritance:
+
+arbmark.groups.shift_work module
+--------------------------------
+
+.. automodule:: arbmark.groups.shift_work
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ssb-arbmark-fagfunksjoner"
-version = "0.0.18"
+version = "0.0.19"
 description = "SSB Arbeidsmarked og lønn Fag-fellesfunksjoner"
 authors = ["Jan Sebastian Rothe <[email protected]>"]
 license = "MIT"
@@ -18,10 +18,10 @@ Changelog = "https://github.com/statisticsnorway/ssb-arbmark-fagfunksjoner/relea
 python = "^3.10"
 click = ">=8.0.1"
 pandas = ">=1.5.3"
-numpy = ">=1.26.2"
-holidays = ">=0.37"
+numpy = "^2.0.0"
+holidays = "<1.0"
 pandas-stubs = ">=2.1.1.230928"
-dapla-toolbelt = ">=2.0.6"
+dapla-toolbelt = "^2.0.18"
 ssb-klass-python = ">=0.0.9"
 
 [tool.poetry.group.dev.dependencies]

diff --git a/src/arbmark/__init__.py b/src/arbmark/__init__.py
@@ -24,6 +24,7 @@
 from arbmark.groups.age import alder_grp
 from arbmark.groups.company_size import virk_str_8grp
 from arbmark.groups.country_origin import landbakgrunn_grp
+from arbmark.groups.nace import clean_nace_17_groups
 from arbmark.groups.nace import nace_sn07_47grp
 from arbmark.groups.nace import nace_to_17_groups
 from arbmark.groups.sector import sektor2_grp
@@ -55,6 +56,7 @@
     "alder_5grp",
     "virk_str_8grp",
     "landbakgrunn_grp",
+    "clean_nace_17_groups",
     "nace_to_17_groups",
     "nace_sn07_47grp",
     "sektor2_grp",

diff --git a/src/arbmark/functions/workdays.py b/src/arbmark/functions/workdays.py
@@ -6,24 +6,23 @@
 
 # Numpy for data wrangling
 import numpy as np
+import numpy.typing as npt
 
 # Pandas for table management
 import pandas as pd
 
 if TYPE_CHECKING:
-    from numpy.typing import NDArray
-
     PdSeriesTimestamp = pd.Series[pd.Timestamp]  # type: ignore[misc]
     PdSeriesInt = pd.Series[int]  # type: ignore[misc]
-    NpArrayInt = NDArray[np.int_]  # type: ignore[misc]
-    NpArrayDate = NDArray[np.datetime64]  # type: ignore[misc]
-    NpArrayBoolean = NDArray[np.bool_]  # type: ignore[misc]
+    NpArrayInt = npt.NDArray[np.int_]  # type: ignore[misc]
+    NpArrayDate = npt.NDArray[np.datetime64]  # type: ignore[misc]
+    NpArrayBoolean = npt.NDArray[np.bool_]  # type: ignore[misc]
 else:
     PdSeriesTimestamp = pd.Series
     PdSeriesInt = pd.Series
-    NpArrayInt = np.ndarray
-    NpArrayDate = np.ndarray
-    NpArrayBoolean = np.ndarray
+    NpArrayInt = npt.NDArray
+    NpArrayDate = npt.NDArray
+    NpArrayBoolean = npt.NDArray
 
 
 def numpy_dates(dates: PdSeriesTimestamp) -> NpArrayDate:

diff --git a/src/arbmark/groups/nace.py b/src/arbmark/groups/nace.py
@@ -23,6 +23,32 @@
     NpArrayStr = npt.NDArray
 
 
+def clean_nace_17_groups(val: str) -> str:
+    """Collapse a redundant NACE group range such as ``"XX-XX"`` to ``"XX"``.
+
+    The value is split at its first hyphen. If the text before and the text
+    after that hyphen are identical, only the text before it is returned;
+    any other value (including one with several hyphens) is returned as is.
+
+    Args:
+        val: A string containing the NACE code to be cleaned.
+
+    Returns:
+        A string with the cleaned NACE code.
+
+    """
+    # Pass non-strings through (presumably NaN in a Series; confirm upstream)
+    if not isinstance(val, str):
+        return val
+    # partition() splits at the first hyphen only, so "A-A-B" keeps its tail
+    start, hyphen, end = val.partition("-")
+    # Only collapse when a hyphen is present and both sides are identical
+    if hyphen and start == end:
+        return start
+    # Return original value if no modifications are made
+    return val
+
+
 def nace_to_17_groups(nace: PdSeriesStr, label: bool = False) -> PdSeriesStr:
     """Converts NACE codes in a Pandas Series to their corresponding group codes or labels.
 
@@ -57,7 +83,7 @@ def nace_to_17_groups(nace: PdSeriesStr, label: bool = False) -> PdSeriesStr:
         return nace_groups.map(mapping_label["name"]).fillna("Uoppgitt")
     else:
         # If labels are not requested, return the group codes directly
-        return nace_groups
+        return nace_groups.apply(clean_nace_17_groups)
 
 
 def nace_sn07_47grp(nace_sn07: PdSeriesStr, display: str = "label") -> NpArrayStr:

diff --git a/tests/test_nace.py b/tests/test_nace.py
@@ -11,7 +11,7 @@ def sample_df() -> pd.DataFrame:
     return pd.DataFrame(
         {
             "nace_sn07": np.random.choice(
-                ["49.100", "56.101", "84.110", "93.130", "95.110"], size=100
+                ["49.100", "56.101", "84.110", "85.421", "93.130", "95.110"], size=100
             ),
         }
     )