diff --git a/data_science/data_preprocessing.py b/data_science/data_preprocessing.py index 181699f..bbd562b 100644 --- a/data_science/data_preprocessing.py +++ b/data_science/data_preprocessing.py @@ -1,5 +1,6 @@ -import pandas as pd import numpy as np +import pandas as pd + def preprocess_data(data): # Preprocess data using Pandas or NumPy @@ -9,14 +10,18 @@ def preprocess_data(data): data = data.fillna(data.mean()) # Normalize numerical data - numerical_data = data.select_dtypes(include=['int64', 'float64']) - numerical_data = (numerical_data - numerical_data.min()) / (numerical_data.max() - numerical_data.min()) + numerical_data = data.select_dtypes(include=["int64", "float64"]) + numerical_data = (numerical_data - numerical_data.min()) / ( + numerical_data.max() - numerical_data.min() + ) # Encode categorical data - categorical_data = data.select_dtypes(include=['object']) + categorical_data = data.select_dtypes(include=["object"]) encoded_data = pd.get_dummies(categorical_data) # Concatenate the preprocessed data - preprocessed_data = np.concatenate([numerical_data.values, encoded_data.values], axis=1) + preprocessed_data = np.concatenate( + [numerical_data.values, encoded_data.values], axis=1 + ) return preprocessed_data