Skip to content

Commit

Permalink
Merge pull request #58 from statisticsnorway/clean-nace-17-groups
Browse files Browse the repository at this point in the history
Clean nace 17 groups
  • Loading branch information
sebrothe authored Jun 20, 2024
2 parents fae40c8 + 4caa500 commit 158713e
Show file tree
Hide file tree
Showing 7 changed files with 1,317 additions and 980 deletions.
8 changes: 8 additions & 0 deletions docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,11 @@ arbmark.groups.sector module
:members:
:undoc-members:
:show-inheritance:
arbmark.groups.shift_work module
--------------------------------
.. automodule:: arbmark.groups.shift_work
:members:
:undoc-members:
:show-inheritance:
2,234 changes: 1,268 additions & 966 deletions poetry.lock

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ssb-arbmark-fagfunksjoner"
version = "0.0.18"
version = "0.0.19"
description = "SSB Arbeidsmarked og lønn Fag-fellesfunksjoner"
authors = ["Jan Sebastian Rothe <[email protected]>"]
license = "MIT"
Expand All @@ -18,10 +18,10 @@ Changelog = "https://github.com/statisticsnorway/ssb-arbmark-fagfunksjoner/relea
python = "^3.10"
click = ">=8.0.1"
pandas = ">=1.5.3"
numpy = ">=1.26.2"
holidays = ">=0.37"
numpy = "^2.0.0"
holidays = "<1.0"
pandas-stubs = ">=2.1.1.230928"
dapla-toolbelt = ">=2.0.6"
dapla-toolbelt = "^2.0.18"
ssb-klass-python = ">=0.0.9"

[tool.poetry.group.dev.dependencies]
Expand Down
2 changes: 2 additions & 0 deletions src/arbmark/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from arbmark.groups.age import alder_grp
from arbmark.groups.company_size import virk_str_8grp
from arbmark.groups.country_origin import landbakgrunn_grp
from arbmark.groups.nace import clean_nace_17_groups
from arbmark.groups.nace import nace_sn07_47grp
from arbmark.groups.nace import nace_to_17_groups
from arbmark.groups.sector import sektor2_grp
Expand Down Expand Up @@ -55,6 +56,7 @@
"alder_5grp",
"virk_str_8grp",
"landbakgrunn_grp",
"clean_nace_17_groups",
"nace_to_17_groups",
"nace_sn07_47grp",
"sektor2_grp",
Expand Down
15 changes: 7 additions & 8 deletions src/arbmark/functions/workdays.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,23 @@

# Numpy for data wrangling
import numpy as np
import numpy.typing as npt

# Pandas for table management
import pandas as pd

if TYPE_CHECKING:
from numpy.typing import NDArray

PdSeriesTimestamp = pd.Series[pd.Timestamp] # type: ignore[misc]
PdSeriesInt = pd.Series[int] # type: ignore[misc]
NpArrayInt = NDArray[np.int_] # type: ignore[misc]
NpArrayDate = NDArray[np.datetime64] # type: ignore[misc]
NpArrayBoolean = NDArray[np.bool_] # type: ignore[misc]
NpArrayInt = npt.NDArray[np.int_] # type: ignore[misc]
NpArrayDate = npt.NDArray[np.datetime64] # type: ignore[misc]
NpArrayBoolean = npt.NDArray[np.bool_] # type: ignore[misc]
else:
PdSeriesTimestamp = pd.Series
PdSeriesInt = pd.Series
NpArrayInt = np.ndarray
NpArrayDate = np.ndarray
NpArrayBoolean = np.ndarray
NpArrayInt = npt.NDArray
NpArrayDate = npt.NDArray
NpArrayBoolean = npt.NDArray


def numpy_dates(dates: PdSeriesTimestamp) -> NpArrayDate:
Expand Down
28 changes: 27 additions & 1 deletion src/arbmark/groups/nace.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,32 @@
NpArrayStr = npt.NDArray


def clean_nace_17_groups(val: str) -> str:
    """Collapse a redundant ``"X-X"`` NACE group range into ``"X"``.

    The value is split at its first hyphen. If the text before that hyphen is
    identical to everything after it, the range is redundant and only the
    leading part is returned. Any other value is returned unchanged,
    including values without a hyphen and values with more than one hyphen
    (for example ``"45-45-47"``), so that no part of the code is silently
    discarded.

    Args:
        val: A string containing the NACE code to be cleaned.

    Returns:
        A string with the cleaned NACE code.
    """
    # partition() splits on the first hyphen only; `sep` is empty when the
    # value contains no hyphen at all.
    head, sep, tail = val.partition("-")
    # Compare against the whole remainder rather than just the second
    # split piece, so "a-a-b" is not truncated to "a".
    if sep and head == tail:
        return head
    # Return original value if no modifications are made
    return val


def nace_to_17_groups(nace: PdSeriesStr, label: bool = False) -> PdSeriesStr:
"""Converts NACE codes in a Pandas Series to their corresponding group codes or labels.
Expand Down Expand Up @@ -57,7 +83,7 @@ def nace_to_17_groups(nace: PdSeriesStr, label: bool = False) -> PdSeriesStr:
return nace_groups.map(mapping_label["name"]).fillna("Uoppgitt")
else:
# If labels are not requested, return the group codes directly
return nace_groups
return nace_groups.apply(clean_nace_17_groups)


def nace_sn07_47grp(nace_sn07: PdSeriesStr, display: str = "label") -> NpArrayStr:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_nace.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def sample_df() -> pd.DataFrame:
return pd.DataFrame(
{
"nace_sn07": np.random.choice(
["49.100", "56.101", "84.110", "93.130", "95.110"], size=100
["49.100", "56.101", "84.110", "85.421", "93.130", "95.110"], size=100
),
}
)
Expand Down

0 comments on commit 158713e

Please sign in to comment.