Skip to content

Commit

Permalink
Merge pull request #63 from statisticsnorway/statbank-formats
Browse files Browse the repository at this point in the history
Added functions for statbank formats
  • Loading branch information
vilderov authored Jul 5, 2024
2 parents 4002053 + ece61b8 commit 0e3a71d
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ssb-arbmark-fagfunksjoner"
version = "0.0.20"
version = "0.0.21"
description = "SSB Arbeidsmarked og lønn Fag-fellesfunksjoner"
authors = ["Jan Sebastian Rothe <[email protected]>"]
license = "MIT"
Expand Down
6 changes: 5 additions & 1 deletion src/arbmark/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from arbmark.functions.reference import ref_day
from arbmark.functions.reference import ref_tuesday
from arbmark.functions.reference import ref_week
from arbmark.functions.statbank_formats import sb_integer
from arbmark.functions.statbank_formats import sb_percent
from arbmark.functions.workdays import count_days
from arbmark.functions.workdays import count_holidays
from arbmark.functions.workdays import count_weekend_days
Expand Down Expand Up @@ -37,8 +39,10 @@
"indicate_merge",
"first_last_date_quarter",
"ref_day",
"ref_week",
"ref_tuesday",
"ref_week",
"sb_integer",
"sb_percent",
"count_days",
"count_holidays",
"count_weekend_days",
Expand Down
52 changes: 52 additions & 0 deletions src/arbmark/functions/statbank_formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Type hints
from typing import TYPE_CHECKING
from typing import Any

# Pandas for table management
import pandas as pd

# Alias for "a pandas Series holding values of any type", used in the function
# signatures of this module. Static type checkers see the subscripted form;
# at runtime TYPE_CHECKING is False, so the alias is simply the pd.Series class.
if TYPE_CHECKING:
    PdSeriesAny = pd.Series[Any]  # type: ignore[misc]
else:
    PdSeriesAny = pd.Series


def sb_integer(number: PdSeriesAny, unit: int = 0) -> PdSeriesAny:
    """Format a pandas Series of numbers as rounded integer strings.

    Args:
        number: A pandas Series containing numeric values.
        unit: The power of 10 to round to; its sign is ignored. ``0`` rounds to
            whole numbers, ``1`` to tens, ``2`` to hundreds, and so on.
            Default is 0.

    Returns:
        A pandas Series of strings holding the rounded values, with missing
        values (NaN/None) rendered as empty strings. Rounding is done by
        ``Series.round``, so exact halves go to the nearest even multiple
        (25 becomes "20" and 975 becomes "980" for ``unit=1``).
    """
    # Record the missing rows up front instead of encoding them as a magic
    # number: a sentinel such as -1 (or -100 for unit=2) would also blank out
    # genuine observations that happen to round to that same value.
    missing = number.isna()
    formatted = (
        number.fillna(0)  # Temporary filler so the integer cast cannot hit NaN
        .round(-abs(unit))  # Negative decimals round to tens, hundreds, ...
        .astype(int)  # Drop the trailing ".0" of the float representation
        .astype(str)
    )
    # Blank out the rows that were missing in the input.
    return formatted.mask(missing, "")


def sb_percent(fraction: PdSeriesAny, decimals: int = 1) -> PdSeriesAny:
    """Convert a pandas Series of fractions to percentage strings.

    Args:
        fraction: A pandas Series containing fractional values
            (e.g., 0.25 for 25%).
        decimals: Number of decimal places to round the percentage values to.
            Default is 1.

    Returns:
        A pandas Series of strings with a comma as the decimal separator.
        Missing values and both positive and negative infinity are rendered
        as empty strings. Values are rounded, not zero-padded, so 0.5 with
        ``decimals=2`` is rendered as "50,0".
    """
    percent = fraction.multiply(100).round(decimals)
    # Decide which rows to blank while the data is still numeric; matching on
    # the text "inf" afterwards would let "-inf" slip through.
    unusable = percent.isna() | percent.isin([float("inf"), float("-inf")])
    # regex=False makes "." a literal dot regardless of the pandas version;
    # as a regular expression it would match (and replace) every character.
    text = percent.astype(str).str.replace(".", ",", regex=False)
    return text.mask(unusable, "")
57 changes: 57 additions & 0 deletions tests/test_statbank_formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import numpy as np
import pandas as pd
import pytest

from arbmark import sb_integer
from arbmark import sb_percent


@pytest.fixture
def sample_df() -> pd.DataFrame:
    """Build a small frame with one count column and one fraction column.

    Both columns start with two missing entries (``np.nan`` and ``None``)
    followed by four numeric values.
    """
    employee_counts = pd.to_numeric([np.nan, None, 0, 1.00001, 975, 25])
    sick_leave_fractions = pd.to_numeric([np.nan, None, 0, 0.9999, 0.5, 0.3246588])
    columns = {
        "antall_ansatte": employee_counts,
        "sykefravaersprosent": sick_leave_fractions,
    }
    return pd.DataFrame(columns)


def test_sb_integer(sample_df: pd.DataFrame) -> None:
    """Check sb_integer on the count column for units 2, 1 and 0, in that order."""
    counts = sample_df["antall_ansatte"]
    # (unit, expected strings) pairs; the first two entries are missing values.
    cases = [
        (2, ["", "", "0", "0", "1000", "0"]),
        (1, ["", "", "0", "0", "980", "20"]),
        (0, ["", "", "0", "1", "975", "25"]),
    ]
    for unit, expected in cases:
        result = sb_integer(counts, unit=unit).to_list()
        assert result == expected, f"Expected {expected}, but got {result}"


def test_sb_percent(sample_df: pd.DataFrame) -> None:
    """Check sb_percent on the fraction column with 1 and then 2 decimals."""
    fractions = sample_df["sykefravaersprosent"]
    # (decimals, expected strings) pairs; the first two entries are missing values.
    cases = [
        (1, ["", "", "0,0", "100,0", "50,0", "32,5"]),
        (2, ["", "", "0,0", "99,99", "50,0", "32,47"]),
    ]
    for decimals, expected in cases:
        result = sb_percent(fractions, decimals=decimals).to_list()
        assert result == expected, f"Expected {expected}, but got {result}"

0 comments on commit 0e3a71d

Please sign in to comment.