Skip to content

Commit

Permalink
Merge pull request #20 from statisticsnorway/fix-series
Browse files Browse the repository at this point in the history
Fix series
  • Loading branch information
vilderov authored Feb 5, 2024
2 parents 7e037ee + b90c448 commit 6ae586f
Show file tree
Hide file tree
Showing 9 changed files with 1,125 additions and 372 deletions.
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Sphinx configuration."""

# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
Expand Down
1 change: 1 addition & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Nox sessions."""

import os
import shlex
import shutil
Expand Down
1,339 changes: 1,049 additions & 290 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ssb-arbmark-fagfunksjoner"
version = "0.0.4"
version = "0.0.5"
description = "SSB Arbeidsmarked og lønn Fag-fellesfunksjoner"
authors = ["Jan Sebastian Rothe <[email protected]>"]
license = "MIT"
Expand All @@ -23,6 +23,7 @@ pandas = ">=1.5.3"
numpy = "^1.26.2"
holidays = "^0.37"
pandas-stubs = "^2.1.1.230928"
poetry-plugin-export = "^1.6.0"

[tool.poetry.group.dev.dependencies]
pygments = ">=2.10.0"
Expand Down
1 change: 1 addition & 0 deletions src/ssb_arbmark_fagfunksjoner/__main__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Command-line interface."""

import click


Expand Down
1 change: 0 additions & 1 deletion src/ssb_arbmark_fagfunksjoner/functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""A collection of useful functions."""


# Itertools for functions creating iterators for efficient looping
import itertools

Expand Down
49 changes: 27 additions & 22 deletions src/ssb_arbmark_fagfunksjoner/groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,24 @@

# Numpy for data wrangling
import numpy as np
import numpy.typing as npt

# Pandas for table management
import pandas as pd

if TYPE_CHECKING:
PdSeriesInt = pd.Series[int] # type: ignore[misc]
PdSeriesStr = pd.Series[str] # type: ignore[misc]
NpArrayInt = npt.NDArray[np.int_] # type: ignore[misc]
NpArrayStr = npt.NDArray[np.str_] # type: ignore[misc]
else:
PdSeriesInt = pd.Series
PdSeriesStr = pd.Series
NpArrayInt = npt.NDArray
NpArrayStr = npt.NDArray


def alder_grp(alder: PdSeriesInt, display: str = "label") -> PdSeriesStr:
def alder_grp(alder: PdSeriesInt, display: str = "label") -> NpArrayStr:
"""Categorize a pandas Series of person ages into predefined groups used in SYKEFR.
Parameters:
Expand All @@ -26,7 +31,7 @@ def alder_grp(alder: PdSeriesInt, display: str = "label") -> PdSeriesStr:
for any other string, returns a combination of keys and labels.
Returns:
A pandas Series where the original person ages are replaced by group labels, keys, or a combination.
A NumPy array where the original person ages are replaced by group labels, keys, or a combination.
"""
# Define the conditions for each group
conditions = [
Expand Down Expand Up @@ -73,10 +78,10 @@ def alder_grp(alder: PdSeriesInt, display: str = "label") -> PdSeriesStr:
results = [f"{key} {value}" for key, value in groups.items()]

# Apply the selected format to the series
return pd.Series(np.select(conditions, results, default="."), dtype="string")
return np.select(conditions, results, default=".")


def alder_5grp(alder: PdSeriesInt, display: str = "label") -> PdSeriesStr:
def alder_5grp(alder: PdSeriesInt, display: str = "label") -> NpArrayStr:
"""Categorize a pandas Series of person ages into predefined groups used in ARBLONN.
Parameters:
Expand All @@ -85,7 +90,7 @@ def alder_5grp(alder: PdSeriesInt, display: str = "label") -> PdSeriesStr:
for any other string, returns a combination of keys and labels.
Returns:
A pandas Series where the original person ages are replaced by group labels, keys, or a combination.
A NumPy array where the original person ages are replaced by group labels, keys, or a combination.
"""
# Define the conditions for each group
conditions = [
Expand Down Expand Up @@ -114,10 +119,10 @@ def alder_5grp(alder: PdSeriesInt, display: str = "label") -> PdSeriesStr:
results = [f"{key} {value}" for key, value in groups.items()]

# Apply the selected format to the series
return pd.Series(np.select(conditions, results, default=""), dtype="string")
return np.select(conditions, results, default="")


def nace_sn07_47grp(nace_sn07: PdSeriesStr, display: str = "label") -> PdSeriesStr:
def nace_sn07_47grp(nace_sn07: PdSeriesStr, display: str = "label") -> NpArrayStr:
"""Categorize a pandas Series of NACE-codes (SN07) into predefined groups.
Parameters:
Expand All @@ -126,7 +131,7 @@ def nace_sn07_47grp(nace_sn07: PdSeriesStr, display: str = "label") -> PdSeriesS
for any other string, returns a combination of keys and labels.
Returns:
A pandas Series where the original NACE-codes are replaced by group labels or keys.
A NumPy array where the original NACE-codes are replaced by group labels, keys, or a combination of both.
"""
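# Remove periods from the NACE codes (if any).
# NOTE(review): Series.replace(".", "") only replaces values that are exactly ".";
# confirm whether .str.replace(".", "", regex=False) was intended here.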
nace_sn07 = nace_sn07.replace(".", "")
Expand Down Expand Up @@ -262,10 +267,10 @@ def nace_sn07_47grp(nace_sn07: PdSeriesStr, display: str = "label") -> PdSeriesS
results = [f"{key} {value}" for key, value in groups.items()]
default_code = "99 Uoppgitt"
grouped = np.select(conditions, results, default=default_code)
return pd.Series(grouped, dtype="string")
return grouped


def nace_sn07_17grp(nace_sn07: PdSeriesStr, display: str = "label") -> PdSeriesStr:
def nace_sn07_17grp(nace_sn07: PdSeriesStr, display: str = "label") -> NpArrayStr:
"""Categorize a pandas Series of NACE-codes (SN07) into predefined groups.
Parameters:
Expand All @@ -274,7 +279,7 @@ def nace_sn07_17grp(nace_sn07: PdSeriesStr, display: str = "label") -> PdSeriesS
for any other string, returns a combination of keys and labels.
Returns:
A pandas Series where the original NACE-codes are replaced by group labels or keys.
A NumPy array where the original NACE-codes are replaced by group labels, keys, or a combination of both.
"""
# Split the series by space and take the first part
first_parts = nace_sn07.str.split(" ", n=1).str[0]
Expand All @@ -287,7 +292,7 @@ def nace_sn07_17grp(nace_sn07: PdSeriesStr, display: str = "label") -> PdSeriesS
print(
"Warning: The function first groups the input into the 47 groups standard."
)
nace_str2_np = pd.to_numeric(nace_sn07_47grp(nace_sn07, "number")).to_numpy()
nace_str2_np = pd.to_numeric(nace_sn07_47grp(nace_sn07, "number"))
else:
# Convert series to numpy array
nace_str2_np = pd.to_numeric(first_parts).to_numpy()
Expand Down Expand Up @@ -351,12 +356,12 @@ def nace_sn07_17grp(nace_sn07: PdSeriesStr, display: str = "label") -> PdSeriesS
results = [f"{key} {value}" for key, value in groups.items()]
default_code = "999 Uoppgitt"
grouped = np.select(conditions, results, default=default_code)
return pd.Series(grouped, dtype="string")
return grouped


def sektor2_grp(
sektor: PdSeriesStr, undersektor: PdSeriesStr, display: str = "label"
) -> PdSeriesStr:
) -> NpArrayStr:
"""Categorize a pandas Series of sectors and subsectors into predefined groups.
Parameters:
Expand All @@ -366,7 +371,7 @@ def sektor2_grp(
for any other string, returns a combination of keys and labels.
Returns:
A pandas Series where the original sector and subsectors are replaced by group labels or keys.
A NumPy array where the original sectors and subsectors are replaced by group labels, keys, or a combination of both.
"""
# Define the conditions for each group
conditions = [
Expand Down Expand Up @@ -396,10 +401,10 @@ def sektor2_grp(
results = [f"{key} {value}" for key, value in groups.items()]
default_code = "999 Uoppgitt"
grouped = np.select(conditions, results, default=default_code)
return pd.Series(grouped, dtype="string")
return grouped


def virk_str_8grp(ansatte: PdSeriesInt, display: str = "label") -> PdSeriesStr:
def virk_str_8grp(ansatte: PdSeriesInt, display: str = "label") -> NpArrayStr:
"""Categorize a pandas Series of employee counts into predefined groups.
Parameters:
Expand All @@ -408,7 +413,7 @@ def virk_str_8grp(ansatte: PdSeriesInt, display: str = "label") -> PdSeriesStr:
for any other string, returns a combination of keys and labels.
Returns:
A pandas Series where the original employee counts are replaced by group labels or keys.
A NumPy array where the original employee counts are replaced by group labels, keys, or a combination of both.
"""
# Define the conditions for each group
conditions = [
Expand Down Expand Up @@ -445,10 +450,10 @@ def virk_str_8grp(ansatte: PdSeriesInt, display: str = "label") -> PdSeriesStr:
results = [f"{key} {value}" for key, value in groups.items()]
default_code = "99 Uoppgitt"
grouped = np.select(conditions, results, default=default_code)
return pd.Series(grouped, dtype="string")
return grouped


def landbakgrunn_grp(landbakgrunn: PdSeriesStr, display: str = "label") -> PdSeriesStr:
def landbakgrunn_grp(landbakgrunn: PdSeriesStr, display: str = "label") -> NpArrayStr:
"""Categorize a pandas Series of country origins from 3 generations into world regions.
Parameters:
Expand All @@ -458,7 +463,7 @@ def landbakgrunn_grp(landbakgrunn: PdSeriesStr, display: str = "label") -> PdSer
for any other string, returns a combination of keys and labels.
Returns:
A pandas Series where the original country origins are replaced by group labels or keys.
A NumPy array where the original country origins are replaced by group labels, keys, or a combination of both.
"""
# Convert Series to Numpy array
landbakgrunn_np = pd.to_numeric(landbakgrunn).to_numpy()
Expand Down Expand Up @@ -555,4 +560,4 @@ def landbakgrunn_grp(landbakgrunn: PdSeriesStr, display: str = "label") -> PdSer
results = [f"{key} {value}" for key, value in groups.items()]
default_code = "999 Ukjent"
grouped = np.select(conditions, results, default=default_code)
return pd.Series(grouped, dtype="string")
return grouped
Loading

0 comments on commit 6ae586f

Please sign in to comment.