Skip to content

Commit

Permalink
Merge pull request #27 from statisticsnorway/klass-function
Browse files Browse the repository at this point in the history
Klass function
  • Loading branch information
vilderov authored Feb 29, 2024
2 parents de706a0 + a70de68 commit 80c65a0
Show file tree
Hide file tree
Showing 9 changed files with 136 additions and 119 deletions.
91 changes: 89 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ssb-arbmark-fagfunksjoner"
version = "0.0.8"
version = "0.0.9"
description = "SSB Arbeidsmarked og lønn Fag-fellesfunksjoner"
authors = ["Jan Sebastian Rothe <[email protected]>"]
license = "MIT"
Expand All @@ -25,6 +25,7 @@ holidays = "^0.37"
pandas-stubs = "^2.1.1.230928"
poetry-plugin-export = "^1.6.0"
dapla-toolbelt = "^2.0.6"
ssb-klass-python = "^0.0.9"

[tool.poetry.group.dev.dependencies]
pygments = ">=2.10.0"
Expand Down
15 changes: 4 additions & 11 deletions src/ssb_arbmark_fagfunksjoner/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,11 +570,8 @@ def read_latest(path: str, name: str, dottype: str = ".parquet") -> str | None:
# Join directory and file name
file_path = os.path.join(path, file_name_pattern)

# Checking environment
wenv = os.environ.get("DAPLA_REGION")

# If environment is Dapla
if wenv == "BIP":
# If path is a google cloud bucket
if path[:4] in ["ssb-", "gs:/"]:

# Get filesystem
fs = FileClient.get_gcs_file_system()
Expand All @@ -587,14 +584,10 @@ def read_latest(path: str, name: str, dottype: str = ".parquet") -> str | None:
# Use glob to find all files matching the pattern
file_list = glob.glob(file_path)

# Sorting key based on file modification time
# Sorting key based on file version
file_versions = sorted(
file_list,
key=lambda x: (
os.path.getmtime(x),
# Fallback to filename sorting
x,
),
key=lambda x: int(x.split("_v")[-1].split(".")[0]),
)

# Check if any files were found. If not, inform the user and return None
Expand Down
113 changes: 32 additions & 81 deletions src/ssb_arbmark_fagfunksjoner/groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
# Pandas for table management
import pandas as pd

# Klass for standard classifications
from klass.classes.variant import KlassVariant

if TYPE_CHECKING:
PdSeriesInt = pd.Series[int] # type: ignore[misc]
PdSeriesStr = pd.Series[str] # type: ignore[misc]
Expand Down Expand Up @@ -270,93 +273,41 @@ def nace_sn07_47grp(nace_sn07: PdSeriesStr, display: str = "label") -> NpArraySt
return grouped


def nace_sn07_17grp(nace_sn07: PdSeriesStr, display: str = "label") -> NpArrayStr:
"""Categorize a pandas Series of NACE-codes (SN07) into predefined groups.
def nace_to_17_groups(nace: PdSeriesStr, label: bool = False) -> PdSeriesStr:
"""Converts NACE codes in a Pandas Series to their corresponding group codes or labels.
NACE (Nomenclature of Economic Activities) is the European industry standard classification system.
This function maps the first two characters of each NACE code (a level 2 code) to its parent group code, and optionally returns the parent group's name (level 1) instead of its code.
Parameters:
nace_sn07: A pandas Series containing the NACE-codes.
display: If 'label', returns group labels; if 'number', returns keys;
for any other string, returns a combination of keys and labels.
nace: A Pandas Series containing NACE codes.
label: If True, returns the names of the groups instead of their codes. Defaults to False.
Returns:
A numpy Array where the original NACE-codes are replaced by group labels or keys.
"""
# Split the series by space and take the first part
first_parts = nace_sn07.str.split(" ", n=1).str[0]

# Check if the maximum length of the first parts exceeds 2
max_length = first_parts.str.len().max()

# Check if nace codes are already grouped into 47-groups
if max_length > 2:
print(
"Warning: The function first groups the input into the 47 groups standard."
)
nace_str2_np = pd.to_numeric(nace_sn07_47grp(nace_sn07, "number"))
else:
# Convert series to numpy array
nace_str2_np = pd.to_numeric(first_parts).to_numpy()

# Define the conditions for each group
conditions = [
(nace_str2_np == 1), # 01-03 Jordbruk, skogbruk og fiske
np.isin(nace_str2_np, [2, 3]), # 05-09 Bergverksdrift og utvinning
np.logical_and(nace_str2_np >= 4, nace_str2_np <= 16), # 10-33 Industri
np.isin(nace_str2_np, [17, 18]), # 35-39 Elektrisitet, vann og renovasjon
(nace_str2_np == 19), # 41-43 Bygge- og anleggsvirksomhet
np.isin(
nace_str2_np, [20, 21, 22]
), # 45-47 Varehandel, reparasjon av motorvogner
np.logical_and(
nace_str2_np >= 23, nace_str2_np <= 27
), # 49-53 Transport og lagring
np.isin(nace_str2_np, [28, 29]), # 55-56 Overnattings- og serveringsvirksomhet
np.isin(nace_str2_np, [30, 31]), # 58-63 Informasjon og kommunikasjon
np.isin(nace_str2_np, [32, 33, 34]), # 64-66 Finansiering og forsikring
np.logical_and(
nace_str2_np >= 35, nace_str2_np <= 38
), # 68-75 Teknisk tjenesteyting, eiendomsdrift
(nace_str2_np == 39), # 77-82 Forretningsmessig tjenesteyting
(nace_str2_np == 40), # 84 Off.adm., forsvar, sosialforsikring
(nace_str2_np == 41), # 85 Undervisning
np.isin(nace_str2_np, [42, 43]), # 86-88 Helse- og sosialtjenester
np.logical_and(
nace_str2_np >= 44, nace_str2_np <= 47
), # 90-99 Personlig tjenesteyting
]
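A Pandas Series with the mapped group codes or names, depending on the 'label' argument. Codes without a match become 'Uoppgitt' when 'label' is True and are left as missing values otherwise.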
# Define the group labels with string keys
groups = {
"01-03": "Jordbruk, skogbruk og fiske",
"05-09": "Bergverksdrift og utvinning",
"10-33": "Industri",
"35-39": "Elektrisitet, vann og renovasjon",
"41-43": "Bygge- og anleggsvirksomhet",
"45-47": "Varehandel, reparasjon av motorvogner",
"49-53": "Transport og lagring",
"55-56": "Overnattings- og serveringsvirksomhet",
"58-63": "Informasjon og kommunikasjon",
"64-66": "Finansiering og forsikring",
"68-75": "Teknisk tjenesteyting, eiendomsdrift",
"77-82": "Forretningsmessig tjenesteyting",
"84": "Off.adm., forsvar, sosialforsikring",
"85": "Undervisning",
"86-88": "Helse- og sosialtjenester",
"90-99": "Personlig tjenesteyting",
}

# Determine and apply the selected format based on the labels parameter
if display == "label":
results = [str(value) for value in groups.values()]
default_code = "Uoppgitt"
elif display == "number":
results = [str(key) for key in groups.keys()]
default_code = "999"
Note:
The function relies on the Klass classification variant 1616 ('KlassVariant("1616").data') to perform the conversion.
It assumes that this table has 'level' (stored as the strings "1" and "2"), 'code', and 'parentCode' columns, plus a 'name' column when labels are requested.
"""
# Retrieve the predefined mapping data for NACE codes
kv = KlassVariant("1616").data
# Filter the mapping to include only level 2 categories
kv_level = kv.query('level == "2"')
# Create a mapping dictionary from NACE codes to their parent codes
mapping = kv_level.set_index("code").to_dict()
# Map the first two characters of each NACE code in the input series to their corresponding group codes
nace_groups = nace.str[0:2].map(mapping["parentCode"])

if label:
# If labels are requested, create a mapping for NACE code names at level 1
kv_label = kv.query('level == "1"')
mapping_label = kv_label.set_index("code").to_dict()
# Map the group codes to their names, filling in 'Uoppgitt' for any missing mappings
return nace_groups.map(mapping_label["name"]).fillna("Uoppgitt")
else:
results = [f"{key} {value}" for key, value in groups.items()]
default_code = "999 Uoppgitt"
grouped = np.select(conditions, results, default=default_code)
return grouped
# If labels are not requested, return the group codes directly
return nace_groups


def sektor2_grp(
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,6 @@ def test_read_latest() -> None:
result = read_latest(
path=os.path.normpath(f"{cwd}/tests/test_data"), name="dataset"
)
expected = os.path.normpath(f"{cwd}/tests/test_data/dataset3.parquet")
expected = os.path.normpath(f"{cwd}/tests/test_data/dataset_v3.parquet")

assert result == expected, f"Expected {expected}, but got {result}."
31 changes: 8 additions & 23 deletions tests/test_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from ssb_arbmark_fagfunksjoner.groups import alder_5grp
from ssb_arbmark_fagfunksjoner.groups import alder_grp
from ssb_arbmark_fagfunksjoner.groups import landbakgrunn_grp
from ssb_arbmark_fagfunksjoner.groups import nace_sn07_17grp
from ssb_arbmark_fagfunksjoner.groups import nace_sn07_47grp
from ssb_arbmark_fagfunksjoner.groups import nace_to_17_groups
from ssb_arbmark_fagfunksjoner.groups import sektor2_grp
from ssb_arbmark_fagfunksjoner.groups import virk_str_8grp

Expand Down Expand Up @@ -94,35 +94,20 @@ def test_nace_sn07_47grp_combined(sample_df):
), "NACE SN07 47 group contains null values"


def test_nace_sn07_17grp(sample_df):
def test_nace_to_17_groups(sample_df):
df = sample_df
df["nace_sn07_17grp"] = nace_sn07_17grp(df["nace_sn07"]).astype(str)
df["nace_sn07_17grp"] = nace_to_17_groups(df["nace_sn07"]).astype(str)
assert (
not df["nace_sn07_17grp"].isnull().any()
), "NACE SN07 17 group contains null values"


def test_nace_sn07_17grp_number(sample_df):
df = sample_df
df["nace_sn07_17grp"] = nace_sn07_17grp(df["nace_sn07"], display="number").astype(
str
)
assert (
not df["nace_sn07_17grp"].isnull().any()
), "NACE SN07 17 group contains null values"
not df["nace_sn07_17grp"].nunique() == 1
), "NACE 17 group only found 1 group, likely did not find any matches to map"


def test_nace_sn07_17grp_combined(sample_df):
def test_nace_to_17_groups_label(sample_df):
df = sample_df
df["nace_sn07_47grp"] = nace_sn07_47grp(df["nace_sn07"], display="combined").astype(
str
)
df["nace_sn07_17grp"] = nace_sn07_17grp(
df["nace_sn07_47grp"], display="combined"
).astype(str)
df["nace_sn07_17grp"] = nace_to_17_groups(df["nace_sn07"], label=True).astype(str)
assert (
not df["nace_sn07_17grp"].isnull().any()
), "NACE SN07 17 group contains null values"
), "NACE 17 group contains null values"


def test_sektor2_grp(sample_df):
Expand Down

0 comments on commit 80c65a0

Please sign in to comment.