"""String formatters for numeric pandas Series: ``sb_integer`` and ``sb_percent``."""

# Type hints
from typing import TYPE_CHECKING
from typing import Any

# Pandas for table management
import pandas as pd

if TYPE_CHECKING:
    # Subscripted form is only evaluated by static type checkers.
    PdSeriesAny = pd.Series[Any]  # type: ignore[misc]
else:
    PdSeriesAny = pd.Series


def sb_integer(number: PdSeriesAny, unit: int = 0) -> PdSeriesAny:
    """Format a pandas Series of numbers as rounded integer strings.

    Args:
        number: A pandas Series containing numeric values.
        unit: The power of 10 to round to. The sign is ignored, so ``2`` and
            ``-2`` both round to the nearest hundred. Default is 0, which
            rounds to the nearest whole number.

    Returns:
        A pandas Series of strings holding the rounded integers, with an
        empty string wherever the input was missing (NaN/None).

    Examples:
        >>> sb_integer(pd.Series([None, 1.00001, 975, 25]), unit=1).to_list()
        ['', '0', '980', '20']
        >>> sb_integer(pd.Series([-1.0, None])).to_list()
        ['-1', '']
    """
    # Remember which entries are missing *before* converting. Marking them
    # with an in-band sentinel (e.g. -1) and blanking that sentinel afterwards
    # would also blank genuine values that happen to round to the sentinel.
    missing = number.isna()
    exponent = abs(unit)

    formatted = (
        number.fillna(0)  # Temporary filler so the integer cast cannot hit NaN
        .round(-exponent)  # Round to the nearest 10**exponent
        .astype(int)  # Drop the fractional part left by float rounding
        .astype(str)  # Render as text
    )
    # Blank out exactly the positions that were missing in the input.
    return formatted.mask(missing, "")


def sb_percent(fraction: PdSeriesAny, decimals: int = 1) -> PdSeriesAny:
    """Convert a pandas Series of fractions to percentage strings.

    Args:
        fraction: A pandas Series containing fractional values
            (e.g., 0.25 for 25%).
        decimals: Number of decimal places to round the percentage values to.
            Default is 1. Trailing zeros beyond the first decimal are not
            padded (0.5 with ``decimals=2`` gives ``"50,0"``).

    Returns:
        A pandas Series of strings with a comma as the decimal separator and
        an empty string for NaN, positive infinity and negative infinity.

    Examples:
        >>> sb_percent(pd.Series([None, 0.9999, 0.3246588]), decimals=2).to_list()
        ['', '99,99', '32,47']
        >>> sb_percent(pd.Series([float("inf"), float("-inf"), 0.25])).to_list()
        ['', '', '25,0']
    """
    percent = fraction.multiply(100).round(decimals)

    # Decide what to blank on the numeric values, not on their string form:
    # matching the text "inf" would let "-inf" slip through.
    blank = percent.isna() | percent.isin([float("inf"), float("-inf")])

    # regex=False: "." must be a literal period regardless of pandas version.
    text = percent.astype(str).str.replace(".", ",", regex=False)
    return text.mask(blank, "")