Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow the synthesized output to be seeded #353

Merged
merged 7 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ The ``synthesize`` command also takes two optional arguments:

- ``-n [rows]`` or ``--num_rows [rows]``: To generate a specific number of data rows.
- ``-p`` or ``--preview``: To preview the first six rows of synthesized data. This can be extremely useful for quick data validation without saving it to a file.
- ``-s [seed]`` or ``--seed [seed]``: Set the seed for the generation of synthetic data.

.. note::

Expand Down
19 changes: 19 additions & 0 deletions docs/source/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,22 @@ Metasyn's synthetically generated datasets are classified as `Synthetically-Augm
* Disclosure control must be evaluated case by case; take special care with names and other directly identifying fields.
* Suitable for extended code testing; minimal analytical value and a non-negligible disclosure risk.

**Can I make the generation of synthetic data reproducible?**
-------------------------------------------------------------
To some extent, the answer is yes. You can set the seed for the generation of synthetic data as follows:

.. tab:: Python

.. code-block:: python

mf.synthesize(10, seed=1234)

.. tab:: CLI

.. code-block:: bash

metasyn synthesize gmf_file.json --preview --seed 1234

This should give the same results when you run it multiple times on your machine. However,
we cannot guarantee reproducibility across different versions of Python, NumPy, or Faker.
Different CPU architectures will also most likely produce different results.
2 changes: 1 addition & 1 deletion examples/basic_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@
mf_out = MetaFrame.load_json(gmf_path)

# create a fake dataset
df_syn = mf_out.synthesize(10)
df_syn = mf_out.synthesize(10, seed=1234)
4 changes: 2 additions & 2 deletions examples/gmf_files/example_gmf_simple.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
"provenance": {
"created by": {
"name": "metasyn",
"version": "1.0.2.dev34+gd68929e"
"version": "1.1.0"
},
"creation time": "2024-10-01T09:57:15.595769"
"creation time": "2024-12-18T14:54:05.300334"
},
"vars": [
{
Expand Down
11 changes: 9 additions & 2 deletions metasyn/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,13 @@ def synthesize() -> None:
type=int,
required=False,
)
parser.add_argument(
"-s", "--seed",
help="Seed for the generation of synthetic data.",
type=int,
default=None,
required=False,
)
parser.add_argument(
"-p", "--preview",
help="preview six-row synthesized data frame in console and exit",
Expand All @@ -192,11 +199,11 @@ def synthesize() -> None:

if args.preview:
# only print six rows and exit
print(meta_frame.synthesize(6))
print(meta_frame.synthesize(6, seed=args.seed))
return

# Generate a data frame
data_frame = meta_frame.synthesize(args.num_rows)
data_frame = meta_frame.synthesize(args.num_rows, seed=args.seed)

# Store the dataframe to file
if args.output.suffix == ".csv":
Expand Down
7 changes: 5 additions & 2 deletions metasyn/metaframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

from metasyn.config import MetaConfig
from metasyn.privacy import BasePrivacy, get_privacy
from metasyn.util import set_global_seeds
from metasyn.validation import validate_gmf_dict
from metasyn.var import MetaVar
from metasyn.varspec import VarSpec
Expand Down Expand Up @@ -449,7 +450,7 @@ def load_toml(cls, fp: Union[pathlib.Path, str],
meta_vars = [MetaVar.from_dict(d) for d in self_dict["vars"]]
return cls(meta_vars, n_rows)

def synthesize(self, n: Optional[int] = None) -> pl.DataFrame:
def synthesize(self, n: Optional[int] = None, seed: Optional[int] = None) -> pl.DataFrame:
"""Create a synthetic Polars dataframe.

Parameters
Expand All @@ -467,7 +468,9 @@ def synthesize(self, n: Optional[int] = None) -> pl.DataFrame:
raise ValueError("Cannot synthesize DataFrame, since number of rows is unknown."
"Please specify the number of rows to synthesize.")
n = self.n_rows
synth_dict = {var.name: var.draw_series(n) for var in self.meta_vars}
if seed is not None:
set_global_seeds(seed)
synth_dict = {var.name: var.draw_series(n, seed=None) for var in self.meta_vars}
return pl.DataFrame(synth_dict)

def __repr__(self) -> str:
Expand Down
2 changes: 1 addition & 1 deletion metasyn/testutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def create_md_report(file_name, out_md_file):
examples = np.random.permutation([str(var.distribution.draw()) for _ in range(3)] +
["NA", "NA"])
else:
examples = [str(x) for x in var.draw_series(5)]
examples = [str(x) for x in var.draw_series(5, None)]

if "privacy" in var_dict["creation_method"]:
partition_size = var_dict["creation_method"]["privacy"]["parameters"]["partition_size"]
Expand Down
17 changes: 17 additions & 0 deletions metasyn/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
"""Utility module for metasyn."""
from __future__ import annotations

import random

import faker
import numpy as np

try:
import tomllib
except ImportError:
Expand All @@ -27,3 +32,15 @@ def get_registry() -> dict:
with open(registry_fp, "rb") as handle:
registry = tomllib.load(handle)
return registry

def set_global_seeds(seed: int):
    """Seed every random number generator that metasyn relies on.

    Synthetic data generation draws from three independent sources of
    randomness: Python's built-in ``random`` module, NumPy's legacy global
    generator, and Faker's shared generator. All three are seeded here so
    that a single call makes generation reproducible.

    Parameters
    ----------
    seed
        The seed to use for the random number generators.
    """
    # Seed the stdlib generator first, then the two library-level ones.
    random.seed(seed)
    np.random.seed(seed)
    # Faker.seed is a class method: it re-seeds the generator shared by
    # every Faker instance, existing and future.
    faker.Faker.seed(seed)
6 changes: 5 additions & 1 deletion metasyn/var.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from metasyn.distribution.base import BaseDistribution
from metasyn.privacy import BasePrivacy, BasicPrivacy
from metasyn.provider import BaseDistributionProvider, DistributionProviderList
from metasyn.util import set_global_seeds
from metasyn.varspec import DistributionSpec


Expand Down Expand Up @@ -232,7 +233,7 @@ def draw(self) -> Any:
return None
return self.distribution.draw()

def draw_series(self, n: int) -> pl.Series:
def draw_series(self, n: int, seed: Optional[int]) -> pl.Series:
"""Draw a new synthetic series from the metadata.

Parameters
Expand All @@ -245,6 +246,9 @@ def draw_series(self, n: int) -> pl.Series:
polars.Series:
Polars series with the synthetic data.
"""
if seed is not None:
set_global_seeds(seed)

self.distribution.draw_reset()
value_list = [self.draw() for _ in range(n)]
pl_type = self.dtype.split("(")[0]
Expand Down
4 changes: 3 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@ def test_cli(tmp_dir, ext):
"-n 25", # only generate 25 samples
tmp_dir / "titanic.json", # the input file
"-o",
out_file # the output file
out_file, # the output file
"--seed",
str(1234),
]

# Run the cli with different extensions
Expand Down
4 changes: 4 additions & 0 deletions tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ def test_demo_datasets(tmp_path, dataset_name):
mf = MetaFrame.load_json(tmp_file)

df_syn = mf.synthesize(100)
df_syn_1 = mf.synthesize(100, seed=1234)
df_syn_2 = mf.synthesize(100, seed=1234)
for col in df_syn.columns:
assert all(df_syn_1[col].drop_nulls() == df_syn_2[col].drop_nulls())

for col, dtype in demo_class.schema.items():
assert dtype == df_syn[col].dtype
Expand Down
22 changes: 15 additions & 7 deletions tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,22 @@
from pytest import mark

from metasyn.distribution.string import FakerDistribution, FreeTextDistribution
from metasyn.var import MetaVar


@mark.parametrize("series_type", [pd.Series, pl.Series])
def test_faker(series_type):
"""Test the faker distribution."""
var = FakerDistribution.fit(series_type([1, 2, 3]))
assert isinstance(var.to_dict(), dict)
assert isinstance(var.draw(), str)
assert 'faker' in str(var)
assert var.locale == "en_US"
assert var.faker_type == "city"
dist = FakerDistribution.fit(series_type([1, 2, 3]))
assert isinstance(dist.to_dict(), dict)
assert isinstance(dist.draw(), str)
assert 'faker' in str(dist)
assert dist.locale == "en_US"
assert dist.faker_type == "city"
var = MetaVar("some_city", "string", dist, prop_missing=0.0)
series_1 = var.draw_series(100, seed=1234)
series_2 = var.draw_series(100, seed=1234)
assert all(series_1 == series_2)


@mark.parametrize(
Expand All @@ -35,4 +40,7 @@ def test_free_text(series, lang, avg_sentences, avg_words):
assert dist.locale == lang
assert dist.avg_sentences == avg_sentences
assert dist.avg_words == avg_words
dist.draw()
var = MetaVar("some_var", "string", dist, prop_missing=0.0)
series_1 = var.draw_series(100, seed=1234)
series_2 = var.draw_series(100, seed=1234)
assert all(series_1 == series_2)
20 changes: 13 additions & 7 deletions tests/test_var.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,22 @@ def check_similar(series_a, series_b):
assert len(series_a) == len(series_b)
base_type_a = _series_element_classname(series_a, all_nan)
base_type_b = _series_element_classname(series_b, all_nan)
if type(series_a) == type(series_b):
if type(series_a) is type(series_b):
assert base_type_a == base_type_b
has_nans_a = len(series_a) - len(_series_drop_nans(series_a)) > 0
has_nans_b = len(series_b) - len(_series_drop_nans(series_b)) > 0
assert has_nans_a == has_nans_b

def check_random_draw(var, n_series):
    # Two draws with the same explicit seed must reproduce the exact
    # same series (ignoring null positions, which are compared separately).
    first, second = (var.draw_series(n_series, 1234) for _ in range(2))
    assert all(_series_drop_nans(first) == _series_drop_nans(second))

assert isinstance(series, (pd.Series, pl.Series))

var = MetaVar.fit(series)
new_series = var.draw_series(len(series))
new_series = var.draw_series(len(series), 5123)
check_random_draw(var, len(series))
print(new_series)
check_similar(series, new_series)
assert var.var_type == var_type
Expand All @@ -76,10 +82,10 @@ def check_similar(series_a, series_b):
var_dict["distribution"].update({"implements": "unknown"})
MetaVar.from_dict(var_dict)

newer_series = new_var.draw_series(len(series))
newer_series = new_var.draw_series(len(series), 6789)
check_similar(newer_series, series)

assert type(new_var) == type(var)
assert type(new_var) is type(var)
assert new_var.dtype == var.dtype
assert var_type == new_var.var_type

Expand All @@ -90,9 +96,9 @@ def check_similar(series_a, series_b):

with open(tmp_fp, "r") as f:
new_var = MetaVar.from_dict(json.load(f))
check_similar(series, new_var.draw_series(len(series)))
check_similar(series, new_var.draw_series(len(series), 8234))

assert type(new_var) == type(var)
assert type(new_var) is type(var)
assert new_var.dtype == var.dtype
assert new_var.var_type == var_type
assert new_var.creation_method["created_by"] == "metasyn"
Expand Down Expand Up @@ -220,7 +226,7 @@ def test_bool(tmp_path, series_type):
series = series_type(np.random.choice([True, False], size=100))
check_var(series, "categorical", tmp_path)
var = MetaVar.fit(series)
new_series = var.draw_series(10)
new_series = var.draw_series(10, 1234)
assert new_series.dtype == pl.Boolean


Expand Down
Loading