Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft pr1 #325

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions causaltune/dataset_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class CausalityDatasetProcessor(BaseEstimator, TransformerMixin):
outcome (str): The target variable used for encoding.
encoder: Encoder object used during feature transformations.
"""

def __init__(self):
"""
Initializes CausalityDatasetProcessor with default attributes for encoder_type, outcome, and encoder.
Expand Down
121 changes: 72 additions & 49 deletions causaltune/datasets.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pandas as pd
import numpy as np
import pickle
import os
from scipy import special

# from scipy.stats import betabinom
Expand All @@ -12,10 +14,8 @@


def linear_multi_dataset(
n_points=10000,
impact=None,
include_propensity=False,
include_control=False) -> CausalityDataset:
n_points=10000, impact=None, include_propensity=False, include_control=False
) -> CausalityDataset:
if impact is None:
impact = {0: 0.0, 1: 2.0, 2: 1.0}
df = pd.DataFrame(
Expand Down Expand Up @@ -80,8 +80,9 @@ def nhefs() -> CausalityDataset:
df = df.loc[~missing]

df = df[covariates + ["qsmk"] + ["wt82_71"]]
df.rename(columns={c: "x" + str(i + 1)
for i, c in enumerate(covariates)}, inplace=True)
df.rename(
columns={c: "x" + str(i + 1) for i, c in enumerate(covariates)}, inplace=True
)

return CausalityDataset(df, treatment="qsmk", outcomes=["wt82_71"])

Expand Down Expand Up @@ -172,8 +173,7 @@ def amazon_reviews(rating="pos") -> CausalityDataset:
gdown.download(url, "amazon_" + rating + ".csv", fuzzy=True)
df = pd.read_csv("amazon_" + rating + ".csv")
df.drop(df.columns[[2, 3, 4]], axis=1, inplace=True)
df.columns = ["treatment", "y_factual"] + \
["x" + str(i) for i in range(1, 301)]
df.columns = ["treatment", "y_factual"] + ["x" + str(i) for i in range(1, 301)]
return CausalityDataset(df, "treatment", ["y_factual"])
else:
print(
Expand Down Expand Up @@ -226,14 +226,10 @@ def synth_ihdp(return_df=False) -> CausalityDataset:
data.columns = col
# drop the columns we don't care about
ignore_patterns = ["y_cfactual", "mu"]
ignore_cols = [c for c in data.columns if any(
[s in c for s in ignore_patterns])]
ignore_cols = [c for c in data.columns if any([s in c for s in ignore_patterns])]
data = data.drop(columns=ignore_cols)

return CausalityDataset(
data,
"treatment",
["y_factual"]) if not return_df else data
return CausalityDataset(data, "treatment", ["y_factual"]) if not return_df else data


def synth_acic(condition=1) -> CausalityDataset:
Expand Down Expand Up @@ -347,6 +343,7 @@ def generate_synthetic_data(
noisy_outcomes: bool = False,
effect_size: Union[int, None] = None,
add_instrument: bool = False,
known_propensity: bool = False,
) -> CausalityDataset:
"""Generates synthetic dataset with conditional treatment effect (CATE) and optional instrumental variable.
Supports RCT (unconfounded) and observational (confounded) data.
Expand Down Expand Up @@ -385,11 +382,15 @@ def generate_synthetic_data(
p = np.clip(p, 0.1, 0.9)
C = p > np.random.rand(n_samples)
# print(min(p), max(p))

else:
p = 0.5 * np.ones(n_samples)
C = np.random.binomial(n=1, p=0.5, size=n_samples)

if known_propensity:
known_p = np.random.beta(2, 5, size=n_samples)
else:
known_p = p

if add_instrument:
Z = np.random.binomial(n=1, p=0.5, size=n_samples)
C0 = np.random.binomial(n=1, p=0.006, size=n_samples)
Expand All @@ -416,18 +417,11 @@ def mu(X):
Y = tau * T + Y_base

features = [f"X{i+1}" for i in range(n_covariates)]
df = pd.DataFrame(np.array([*X.T,
T,
Y,
tau,
p,
Y_base]).T,
columns=features + ["treatment",
"outcome",
"true_effect",
"propensity",
"base_outcome"],
)
df = pd.DataFrame(
np.array([*X.T, T, Y, tau, known_p, Y_base]).T,
columns=features
+ ["treatment", "outcome", "true_effect", "propensity", "base_outcome"],
)
data = CausalityDataset(
data=df,
treatment="treatment",
Expand All @@ -450,6 +444,7 @@ def generate_linear_synthetic_data(
noisy_outcomes: bool = False,
effect_size: Union[int, None] = None,
add_instrument: bool = False,
known_propensity: bool = False,
) -> CausalityDataset:
"""Generates synthetic dataset with linear treatment effect (CATE) and optional instrumental variable.
Supports RCT (unconfounded) and observational (confounded) data.
Expand Down Expand Up @@ -494,6 +489,11 @@ def generate_linear_synthetic_data(
p = 0.5 * np.ones(n_samples)
C = np.random.binomial(n=1, p=0.5, size=n_samples)

if known_propensity:
known_p = np.random.beta(2, 5, size=n_samples)
else:
known_p = p

if add_instrument:
Z = np.random.binomial(n=1, p=0.5, size=n_samples)
C0 = np.random.binomial(n=1, p=0.006, size=n_samples)
Expand All @@ -520,18 +520,11 @@ def mu(X):
Y = tau * T + Y_base

features = [f"X{i+1}" for i in range(n_covariates)]
df = pd.DataFrame(np.array([*X.T,
T,
Y,
tau,
p,
Y_base]).T,
columns=features + ["treatment",
"outcome",
"true_effect",
"propensity",
"base_outcome"],
)
df = pd.DataFrame(
np.array([*X.T, T, Y, tau, known_p, Y_base]).T,
columns=features
+ ["treatment", "outcome", "true_effect", "propensity", "base_outcome"],
)
data = CausalityDataset(
data=df,
treatment="treatment",
Expand Down Expand Up @@ -641,16 +634,8 @@ def generate_non_random_dataset(num_samples=1000):
)
treatment = np.random.binomial(1, propensity)
outcome = (
0.2
* treatment
+ 0.5
* x1
- 0.2
* x2
+ np.random.normal(
0,
1,
num_samples))
0.2 * treatment + 0.5 * x1 - 0.2 * x2 + np.random.normal(0, 1, num_samples)
)

dataset = {
"T": treatment,
Expand Down Expand Up @@ -729,3 +714,41 @@ def mlrate_experiment_synth_dgp(
cd = CausalityDataset(data=df, outcomes=["Y"], treatment="T")

return cd


def save_dataset(dataset: CausalityDataset, filename: str):
    """Serialize a CausalityDataset to disk with pickle.

    Args:
        dataset (CausalityDataset): The dataset object to write out.
        filename (str): Destination path. The file is opened in binary
            write mode, so an existing file at that path is overwritten.
    """
    with open(filename, "wb") as out_file:
        # Explicit Pickler form of pickle.dump(dataset, out_file).
        pickler = pickle.Pickler(out_file)
        pickler.dump(dataset)
    print(f"Dataset saved to {filename}")


def load_dataset(filename: str) -> CausalityDataset:
    """
    Load a CausalityDataset object from a file using pickle.

    Warning:
        Unpickling can execute arbitrary code. Only call this on files that
        come from a trusted source (e.g. ones written by ``save_dataset``).

    Args:
        filename (str): The name of the file to load the dataset from.

    Returns:
        CausalityDataset: The loaded dataset.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        ValueError: If the unpickled object is not a CausalityDataset.
    """
    # Open the file directly rather than pre-checking with os.path.exists():
    # the pre-check can race with the open() call, and it reports paths that
    # merely cannot be stat'ed (e.g. permission problems) as "not found".
    try:
        f = open(filename, "rb")
    except FileNotFoundError:
        # Keep the historical error message that callers may rely on.
        raise FileNotFoundError(f"File {filename} not found.") from None

    with f:
        # SECURITY: pickle.load runs code embedded in the file; the caller is
        # responsible for making sure `filename` is trusted.
        dataset = pickle.load(f)

    # Type check happens only after unpickling, so it guards against loading
    # the wrong kind of object, not against malicious payloads.
    if not isinstance(dataset, CausalityDataset):
        raise ValueError(
            f"The file {filename} does not contain a valid CausalityDataset object."
        )

    print(f"Dataset loaded from {filename}")
    return dataset
Loading
Loading