Merge pull request #63 from statisticsnorway/statbank-formats

Added functions for statbank formats
statisticsnorway · Jul 5, 2024 · 0e3a71d · 0e3a71d
2 parents 4002053 + ece61b8
commit 0e3a71d
Show file tree

Hide file tree

Showing 4 changed files with 115 additions and 2 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ssb-arbmark-fagfunksjoner"
-version = "0.0.20"
+version = "0.0.21"
 description = "SSB Arbeidsmarked og lønn Fag-fellesfunksjoner"
 authors = ["Jan Sebastian Rothe <[email protected]>"]
 license = "MIT"

diff --git a/src/arbmark/__init__.py b/src/arbmark/__init__.py
@@ -8,6 +8,8 @@
 from arbmark.functions.reference import ref_day
 from arbmark.functions.reference import ref_tuesday
 from arbmark.functions.reference import ref_week
+from arbmark.functions.statbank_formats import sb_integer
+from arbmark.functions.statbank_formats import sb_percent
 from arbmark.functions.workdays import count_days
 from arbmark.functions.workdays import count_holidays
 from arbmark.functions.workdays import count_weekend_days
@@ -37,8 +39,10 @@
     "indicate_merge",
     "first_last_date_quarter",
     "ref_day",
-    "ref_week",
     "ref_tuesday",
+    "ref_week",
+    "sb_integer",
+    "sb_percent",
     "count_days",
     "count_holidays",
     "count_weekend_days",

diff --git a/src/arbmark/functions/statbank_formats.py b/src/arbmark/functions/statbank_formats.py
@@ -0,0 +1,52 @@
+# Type hints
+from typing import TYPE_CHECKING
+from typing import Any
+
+# Pandas for table management
+import pandas as pd
+
+# Series is subscripted only for static type checkers; at runtime the alias is
+# the plain class, so ``pd.Series[Any]`` is never evaluated when the module runs.
+if TYPE_CHECKING:
+    PdSeriesAny = pd.Series[Any]  # type: ignore[misc]
+else:
+    PdSeriesAny = pd.Series
+
+
+def sb_integer(number: PdSeriesAny, unit: int = 0) -> PdSeriesAny:
+    """Format a numeric Series as integer strings rounded to a power of ten.
+
+    Args:
+        number: A pandas Series containing numeric values.
+        unit: Power of ten to round to; the sign is ignored, so 2 and -2 both
+            round to the nearest hundred. Default is 0 (nearest integer).
+
+    Returns:
+        A pandas Series of strings holding the rounded integers, with an
+        empty string wherever the input value was missing.
+    """
+    # Record missing cells up front rather than filling them with a sentinel:
+    # a sentinel such as -1 would also blank out genuine values equal to it.
+    missing = number.isna()
+    # The 0 fill only lets the integer cast succeed; it is masked out below.
+    rounded = number.fillna(0).round(-abs(unit)).astype(int)
+    return rounded.astype(str).mask(missing, "")
+
+
+def sb_percent(fraction: PdSeriesAny, decimals: int = 1) -> PdSeriesAny:
+    """Convert a pandas Series of fractions to percentages, formatted as strings.
+
+    Args:
+        fraction: A pandas Series containing fractional values
+            (e.g., 0.25 for 25%).
+        decimals: Number of decimal places to round the percentage values to.
+            Default is 1.
+
+    Returns:
+        A pandas Series with the percentage values formatted as strings,
+        with a comma as the decimal separator and empty strings for NaNs and
+        for infinities of either sign.
+    """
+    percent = fraction.multiply(100).round(decimals)
+    # Blank non-finite cells through a mask computed on the numbers: matching
+    # the text "inf" afterwards would leave negative infinity as "-inf".
+    blank = percent.isna() | percent.isin([float("inf"), float("-inf")])
+    # regex=False keeps "." a literal period on pandas versions where
+    # str.replace treats the pattern as a regular expression by default.
+    text = percent.astype(str).str.replace(".", ",", regex=False)
+    return text.mask(blank, "")
diff --git a/tests/test_statbank_formats.py b/tests/test_statbank_formats.py
@@ -0,0 +1,57 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from arbmark import sb_integer
+from arbmark import sb_percent
+
+
+@pytest.fixture
+def sample_df() -> pd.DataFrame:
+    """Return a two-column numeric frame whose first two rows are missing."""
+    raw_columns = {
+        "antall_ansatte": [np.nan, None, 0, 1.00001, 975, 25],
+        "sykefravaersprosent": [np.nan, None, 0, 0.9999, 0.5, 0.3246588],
+    }
+    return pd.DataFrame(
+        {name: pd.to_numeric(values) for name, values in raw_columns.items()}
+    )
+
+
+def test_sb_integer(sample_df: pd.DataFrame) -> None:
+    """Check sb_integer on the fixture column for units 2, 1 and 0, in turn."""
+    column = sample_df["antall_ansatte"]
+    # Maps each rounding unit to the strings expected for the fixture column.
+    expected_by_unit = {
+        2: ["", "", "0", "0", "1000", "0"],
+        1: ["", "", "0", "0", "980", "20"],
+        0: ["", "", "0", "1", "975", "25"],
+    }
+
+    for unit, expected in expected_by_unit.items():
+        result = sb_integer(column, unit=unit).to_list()
+        assert result == expected, f"Expected {expected}, but got {result}"
+
+
+def test_sb_percent(sample_df: pd.DataFrame) -> None:
+    """Check sb_percent on the fixture column for one and two decimals."""
+    column = sample_df["sykefravaersprosent"]
+    # Maps each decimals setting to the strings expected for the fixture column.
+    expected_by_decimals = {
+        1: ["", "", "0,0", "100,0", "50,0", "32,5"],
+        2: ["", "", "0,0", "99,99", "50,0", "32,47"],
+    }
+
+    for decimals, expected in expected_by_decimals.items():
+        result = sb_percent(column, decimals=decimals).to_list()
+        assert result == expected, f"Expected {expected}, but got {result}"