-
Notifications
You must be signed in to change notification settings - Fork 1
/
test_model.py
58 lines (45 loc) · 1.84 KB
/
test_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import tempfile
from pathlib import Path

import joblib
import pandas as pd
import pytest
from sklearn.linear_model import LogisticRegression

from model import (
    lr,
    model_columns,
)
def test_data_loading():
    """Verify that the remote training CSV loads into a non-empty frame."""
    loaded = pd.read_csv(
        "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
    )
    has_no_rows = loaded.empty
    assert not has_no_rows, "Dataframe should not be empty after loading data"
def test_preprocessing():
    """Check that filling missing ``Age`` values leaves no nulls behind.

    Loads the remote training CSV, selects the Age, Sex, Embarked and
    Survived columns, replaces missing ages with 0 and asserts that the
    Age column no longer contains null values.
    """
    include = ["Age", "Sex", "Embarked", "Survived"]
    df = pd.read_csv(
        "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
    )
    # Take an explicit copy so the selection is independent of ``df``.
    df_ = df[include].copy()
    # Example preprocessing step. Assign the result back instead of calling
    # ``fillna(..., inplace=True)`` on the column: the chained in-place form
    # acts on a temporary under pandas Copy-on-Write and would leave ``df_``
    # unchanged (and triggers chained-assignment warnings before that).
    df_["Age"] = df_["Age"].fillna(0)
    assert (
        df_["Age"].isnull().sum() == 0
    ), "No null values should be in Age column after preprocessing"
def test_model_training():
    """Fit a logistic regression on the training CSV and check it is fitted.

    Loads the remote CSV, selects the Age, Sex, Embarked and Survived
    columns, fills missing values, one-hot encodes the two categorical
    columns and fits ``LogisticRegression`` with Survived as the target.
    """
    df = pd.read_csv(
        "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
    )
    # Fill NaN values in all columns to handle both categorical and numerical
    # columns. The non-in-place form returns a new frame, which avoids
    # mutating a selection of ``df`` (and the associated pandas warnings).
    df_ = df[["Age", "Sex", "Embarked", "Survived"]].fillna(
        {"Age": 0, "Sex": "unknown", "Embarked": "unknown"}
    )
    df_ohe = pd.get_dummies(df_, columns=["Sex", "Embarked"], dummy_na=False)
    x = df_ohe[df_ohe.columns.difference(["Survived"])]
    y = df_ohe["Survived"]
    model = LogisticRegression()
    model.fit(x, y)
    # An estimator object is always truthy, so ``assert model`` would pass
    # even for an unfitted model. ``coef_`` only exists after a successful fit.
    assert hasattr(model, "coef_"), "Model should be trained"
    assert len(model.predict(x)) == len(y), "Model should predict one label per row"
def test_model_saving_loading():
    """Round-trip an unfitted ``LogisticRegression`` through joblib.

    The pickle is written inside a temporary directory so the test leaves
    no ``test_model.pkl`` artifact in the working directory.
    """
    model = LogisticRegression()
    with tempfile.TemporaryDirectory() as tmp_dir:
        model_path = Path(tmp_dir) / "test_model.pkl"
        joblib.dump(model, model_path)
        loaded_model = joblib.load(model_path)
    # Check type and hyper-parameters; a bare truthiness check passes for
    # any object and would not detect a wrong or corrupted payload.
    assert isinstance(
        loaded_model, LogisticRegression
    ), "Model should be loaded successfully"
    assert (
        loaded_model.get_params() == model.get_params()
    ), "Loaded model should keep the original parameters"
def test_model_columns_saving():
    """Round-trip a list of column names through joblib.

    The pickle is written inside a temporary directory so the test leaves
    no ``test_model_columns.pkl`` artifact in the working directory.
    """
    columns = ["col1", "col2", "col3"]  # Example columns
    with tempfile.TemporaryDirectory() as tmp_dir:
        columns_path = Path(tmp_dir) / "test_model_columns.pkl"
        joblib.dump(columns, columns_path)
        loaded_columns = joblib.load(columns_path)
    assert loaded_columns == columns, "Model columns should be loaded successfully"