From 232542bfa5037f4912731f3ac6fea4ee043b97e3 Mon Sep 17 00:00:00 2001 From: Joel Parr Date: Fri, 25 Oct 2024 15:39:03 +0200 Subject: [PATCH 1/2] General updates. - Custom Distance Function for KNN - New PCA - Create the new base for a .python file where we can create time based functions. Hoping to create progress bars so that when the main programs/functions are run we can see how much longer until they are finsihed. - general updates. For instance I have added options so that you can skip the creation of the 'create datafiles' stage. --- src/functions/create_datafiles.py | 53 +- src/functions/ml_modeller.py | 1144 +++++++++++++++++------------ src/functions/oppdateringsfil.py | 111 ++- src/functions/time.py | 83 +++ src/notebooks/Master.ipynb | 91 ++- 5 files changed, 963 insertions(+), 519 deletions(-) create mode 100644 src/functions/time.py diff --git a/src/functions/create_datafiles.py b/src/functions/create_datafiles.py index fc7a8c2..a901402 100644 --- a/src/functions/create_datafiles.py +++ b/src/functions/create_datafiles.py @@ -49,14 +49,16 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F # start_year = 2018 # else: # start_year = 2017 - + prior_year= year - 1 start_year = 2017 all_good_dataframes = [] # List to store good dataframes for each year all_bad_dataframes = [] # List to store bad dataframes for each year all_training_dataframes = [] # List to store training dataframes for each year all_time_series_dataframes = [] # List to store time series dataframes for each year - + + start_datafile_loop = time.time() + for current_year in range(start_year, year + 1): fjor = current_year - 1 # Previous year @@ -156,12 +158,13 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F ] # Use the ParquetDataset to read multiple files - dataset = pq.ParquetDataset(fil_path, filesystem=fs) + # dataset = pq.ParquetDataset(fil_path, filesystem=fs) + foretak_pub = pd.read_parquet(fil_path, filesystem=fs) - table = dataset.read() +# table = dataset.read() - # Convert to Pandas DataFrame - foretak_pub = table.to_pandas() +# # Convert to Pandas DataFrame +# foretak_pub = table.to_pandas() # Check if current_year is 2022 or higher if current_year >= 2023: @@ -484,7 +487,9 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F del onlygooddriftskostnader - if uu_data and current_year == year: + if uu_data and (current_year == year or current_year == prior_year): + + start_uu = time.time() print("uu_data for:", {current_year}, "is True, proceeding with data processing...") @@ -584,6 +589,11 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F del bedrift_pub + + # Calculate processing time + processing_time_uu = time.time() - start_uu + print(f"Time taken to process uu data for {current_year}: {processing_time_uu:.2f} seconds") + else: print("uu_data is False, skipping data processing.") @@ -595,6 +605,9 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F # kpi_df = kpi.process_kpi_data(current_year) # Get kommune population growth , income trends and inflation data + + api_time = time.time() + try: kommune_befolk = kommune_pop.befolkning_behandling(current_year, fjor) except Exception as e: @@ -624,6 +637,11 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F except Exception as e: print(f"Failed to fetch kpi_df for {current_year - 1} as well.") kpi_df = None + + processing_time_api = time.time() - api_time + 
print(f"Time taken to process kommune data, population data and inflation data for {current_year}: {processing_time_api:.2f} seconds") + + # Convert string columns to numeric merged_df["gjeldende_bdr_syss"] = pd.to_numeric( merged_df["gjeldende_bdr_syss"], errors="coerce" @@ -924,7 +942,13 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F current_year_good_oms = good_data[good_data['year'] == year] current_year_bad_oms = bad_data[bad_data['year'] == year] v_orgnr_list_for_imputering = current_year_bad_oms['v_orgnr'].tolist() - unique_id_list = current_year_bad_oms[current_year_bad_oms['nacef_5'].str.startswith(tosiffernaring)]['id'].unique().tolist() + # unique_id_list = current_year_bad_oms[current_year_bad_oms['nacef_5'].str.startswith(tosiffernaring)]['id'].unique().tolist() + # If tosiffernaring contains multiple categories, filter by checking if 'nacef_5' starts with any of them + + unique_id_list = current_year_bad_oms[ + current_year_bad_oms['nacef_5'].str[:2].isin(tosiffernaring) + ]['id'].unique().tolist() + # Easy solution for filling Nan Values - only for training, not for editing real data training_data['tmp_sn2007_5'].fillna(training_data['nacef_5'], inplace=True) @@ -935,6 +959,12 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F # Create trend data + # Calculate processing time + processing_time_datafile_loop = time.time() - start_datafile_loop + print(f"Time taken to create base training data: {processing_time_datafile_loop:.2f} seconds") + + oms_trend_time = time.time() + print("starting regression line function") # Determine the number of CPU cores available num_cores = multiprocessing.cpu_count() @@ -946,7 +976,7 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F # Sort the data training_data = training_data.sort_values(by=["v_orgnr", "year"]) - + # Function to process each group def process_group(v_orgnr, group): group_forecast = group[["v_orgnr", "year"]].copy() @@ -975,6 +1005,9 @@ def process_group(v_orgnr, group): # Concatenate results trend_forecasts = pd.concat(results, ignore_index=True) + + processing_time_trends = time.time() - start_datafile_loop + print(f"Time taken to create base training data: {processing_time_trends:.2f} seconds") # Merge the trend forecasts with the original training data training_data = pd.merge(training_data, trend_forecasts, on=["v_orgnr", "year"], how="left") @@ -1117,3 +1150,5 @@ def geo(training_data): training_data = training_data[~training_data['v_orgnr'].isin(v_orgnr_list_for_imputering)] return current_year_good_oms, current_year_bad_oms, v_orgnr_list_for_imputering, training_data, imputatable_df, time_series_df, unique_id_list + + diff --git a/src/functions/ml_modeller.py b/src/functions/ml_modeller.py index 4925d73..5bef2f2 100644 --- a/src/functions/ml_modeller.py +++ b/src/functions/ml_modeller.py @@ -176,15 +176,21 @@ def xgboost_model(training_df, scaler, df_estimeres, year, GridSearch=True): y = df["new_oms"] # Define categorical and numerical features - categorical_features = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] + # categorical_features = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] + # numerical_features = [ + # "inntekt_delta_oms", + # "emp_delta_oms", + # "befolkning_delta_oms", + # "inflation_rate_oms", + # "gjeldende_bdr_syss", + # "new_oms_trendForecast", + # 'oms_syssmean_basedOn_naring', + # 'oms_syssmean_basedOn_naring_kommune' + # ] + + categorical_features = ["nacef_5", "b_kommunenr"] numerical_features = [ 
- "inntekt_delta_oms", - "emp_delta_oms", - "befolkning_delta_oms", - "inflation_rate_oms", - "gjeldende_bdr_syss", "new_oms_trendForecast", - 'oms_syssmean_basedOn_naring', 'oms_syssmean_basedOn_naring_kommune' ] @@ -370,6 +376,190 @@ def xgboost_model(training_df, scaler, df_estimeres, year, GridSearch=True): return imputed_df +def xgboost_model_with_pca(training_df, scaler, df_estimeres, year, GridSearch=True, apply_pca=True, n_components=6): + """ + Trains an XGBoost model for predicting new_oms values with optional PCA for dimensionality reduction + and GridSearch for hyperparameter tuning. Includes visualizations for explained variance, learning history, + and SHAP values. + + Parameters: + training_df (pd.DataFrame): DataFrame containing the training data. + scaler (object): Scaler object for numerical features (e.g., StandardScaler, RobustScaler). + df_estimeres (pd.DataFrame): DataFrame containing the data to be imputed. + GridSearch (bool): Whether to perform GridSearch for hyperparameter tuning. Default is True. + apply_pca (bool): Whether to apply PCA for dimensionality reduction. Default is True. + n_components (int, float, or None): Number of components to keep after applying PCA. If None, it will not reduce dimensions. + + Returns: + pd.DataFrame: DataFrame with predicted new_oms values. + """ + import numpy as np + import xgboost as xgb + from sklearn.model_selection import train_test_split, GridSearchCV + from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error + from sklearn.preprocessing import OneHotEncoder + from sklearn.compose import ColumnTransformer + from sklearn.decomposition import PCA + import matplotlib.pyplot as plt + import pandas as pd + import shap + import plotly.graph_objects as go + + print('Starting the XGBoost model with PCA...') + + # Make copies of the input DataFrames + df = training_df.copy() + imputed_df = df_estimeres.copy() + + # Drop rows with NaN values in the target column + df = df.dropna(subset=['new_oms']) + + # Convert specified columns to category type + categorical_columns = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] + for col in categorical_columns: + df[col] = df[col].astype("category") + + # Define features and target variable + X = df.drop(columns=["new_oms"]) + y = df["new_oms"] + + # Define categorical and numerical features + categorical_features = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] + numerical_features = [ + "inntekt_delta_oms", + "emp_delta_oms", + "befolkning_delta_oms", + "inflation_rate_oms", + "gjeldende_bdr_syss", + "new_oms_trendForecast", + 'oms_syssmean_basedOn_naring', + 'oms_syssmean_basedOn_naring_kommune' + ] + + # Preprocessing pipeline + preprocessor = ColumnTransformer( + transformers=[ + ("num", scaler, numerical_features), # Apply scaling to numerical features + ("cat", OneHotEncoder(categories="auto", handle_unknown="ignore"), categorical_features), # One-hot encoding for categorical features + ] + ) + + # Split the dataset into training and testing sets + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + + # Fit the preprocessor on the training data + preprocessor.fit(X_train) + + # Transform the training and testing data + X_train_transformed = preprocessor.transform(X_train).toarray() + X_test_transformed = preprocessor.transform(X_test).toarray() + + # Apply PCA if requested + if apply_pca: + pca = PCA(n_components=n_components) + X_train_transformed = pca.fit_transform(X_train_transformed) + X_test_transformed = 
pca.transform(X_test_transformed) + + # Visualize explained variance + explained_variance = np.cumsum(pca.explained_variance_ratio_) + fig = go.Figure() + fig.add_trace(go.Scatter( + x=np.arange(1, len(explained_variance) + 1), + y=explained_variance, + mode='lines+markers', + marker=dict(size=10, color='lightgreen'), + line=dict(color='lightgreen', width=3), + hovertemplate='Component %{x}
Cumulative Explained Variance: %{y:.2f}' + )) + fig.update_layout( + title='Cumulative Explained Variance by PCA Components', + xaxis_title='Number of Components', + yaxis_title='Cumulative Explained Variance', + template="plotly_white", + font=dict(size=14) + ) + fig.show() + + # Define the model and perform GridSearch if requested + if GridSearch: + regressor = xgb.XGBRegressor(eval_metric="rmse", random_state=42) + param_grid = { + 'n_estimators': [50, 100, 200], + 'max_depth': [3, 5, 7], + 'learning_rate': [0.01, 0.1, 0.2], + 'subsample': [0.8, 1.0], + 'colsample_bytree': [0.8, 1.0] + } + grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, verbose=1) + grid_search.fit(X_train_transformed, y_train) + print("Best parameters found by GridSearch:", grid_search.best_params_) + regressor = grid_search.best_estimator_ + else: + regressor = xgb.XGBRegressor(eval_metric="rmse", random_state=42) + eval_set = [(X_train_transformed, y_train), (X_test_transformed, y_test)] + regressor.fit(X_train_transformed, y_train, eval_set=eval_set, verbose=False) + + # Evaluate the model + y_pred = regressor.predict(X_test_transformed) + mse = mean_squared_error(y_test, y_pred) + r_squared = r2_score(y_test, y_pred) + mae = mean_absolute_error(y_test, y_pred) + print("Mean Squared Error:", mse) + print("R-squared:", r_squared) + print("Mean Absolute Error:", mae) + + # Plot the learning history + results = regressor.evals_result() + epochs = len(results["validation_0"]["rmse"]) + x_axis = range(0, epochs) + plt.figure(figsize=(10, 5)) + plt.plot(x_axis, results["validation_0"]["rmse"], label="Train") + plt.plot(x_axis, results["validation_1"]["rmse"], label="Test") + plt.legend() + plt.xlabel("Epochs") + plt.ylabel("RMSE") + plt.title("XGBoost Learning History") + plt.show() + + # Plot Predicted vs. Actual Values + plt.figure(figsize=(10, 5)) + plt.scatter(y_test, y_pred, alpha=0.3) + plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--", lw=2) + plt.xlabel("Actual") + plt.ylabel("Predicted") + plt.title("Predicted vs. 
Actual Values") + plt.show() + + # Plot Residuals + residuals = y_test - y_pred + plt.figure(figsize=(10, 5)) + plt.scatter(y_test, residuals, alpha=0.3) + plt.hlines(0, y_test.min(), y_test.max(), colors="r", linestyles="dashed") + plt.xlabel("Actual") + plt.ylabel("Residuals") + plt.title("Residuals Plot") + plt.show() + + # SHAP values if PCA was not applied + if not apply_pca: + explainer = shap.TreeExplainer(regressor, X_train_transformed) + shap_values = explainer.shap_values(X_test_transformed) + feature_names = preprocessor.get_feature_names_out() + shap.summary_plot(shap_values, X_test_transformed, feature_names=feature_names) + + # Impute the missing data + imputed_X = imputed_df.drop(columns=["new_oms"]) + imputed_X_transformed = preprocessor.transform(imputed_X) + + if apply_pca: + imputed_X_transformed = pca.transform(imputed_X_transformed) + + imputed_df["predicted_oms"] = regressor.predict(imputed_X_transformed) + imputed_df['predicted_oms'] = imputed_df['predicted_oms'].clip(lower=0).astype(float) + + return imputed_df + + def knn_model(training_df, scaler, df_estimeres, year, GridSearch=True): @@ -584,12 +774,218 @@ def knn_model(training_df, scaler, df_estimeres, year, GridSearch=True): return imputed_df -def knn_model_with_pca(training_df, scaler, df_estimeres, year, GridSearch=True, apply_pca=True, n_components=5): +# def knn_model_with_pca(training_df, scaler, df_estimeres, year, GridSearch=True, apply_pca=True, n_components=5): + +# """ +# Trains a K-Nearest Neighbors model for predicting new_oms values with optional PCA for dimensionality reduction +# and optional GridSearch for hyperparameter tuning. Includes interactive plot for explained variance. + +# Parameters: +# training_df (pd.DataFrame): DataFrame containing the training data. +# scaler (object): Scaler object for numerical features (e.g., StandardScaler, RobustScaler). +# df_estimeres (pd.DataFrame): DataFrame containing the data to be imputed. +# GridSearch (bool): Whether to perform GridSearch for hyperparameter tuning. Default is True. +# apply_pca (bool): Whether to apply PCA for dimensionality reduction. Default is True. +# n_components (int, float, or None): Number of components to keep after applying PCA. If None, it will not reduce dimensions. +# If a float is given (e.g., 0.95), PCA will select the number of components that explain that proportion of variance. + +# Returns: +# pd.DataFrame: DataFrame with predicted new_oms values. 
+# """ +# from sklearn.decomposition import PCA +# import matplotlib.pyplot as plt +# import plotly.graph_objects as go +# import numpy as np +# from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score +# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error +# from sklearn.preprocessing import OneHotEncoder +# from sklearn.compose import ColumnTransformer +# from sklearn.neighbors import KNeighborsRegressor +# import pandas as pd + +# # Make copies of the input DataFrames +# df = training_df.copy() +# imputed_df = df_estimeres.copy() + +# categorical_columns = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] +# df[categorical_columns] = df[categorical_columns].astype(str) +# imputed_df[categorical_columns] = imputed_df[categorical_columns].astype(str) + +# columns_to_fill = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] +# numeric_columns_to_fill = [ +# "inntekt_delta_oms", "emp_delta_oms", "befolkning_delta_oms", +# "inflation_rate_oms", "gjeldende_bdr_syss", "new_oms_trendForecast", +# 'oms_syssmean_basedOn_naring', 'oms_syssmean_basedOn_naring_kommune' +# ] + +# # Fill missing values +# df[columns_to_fill] = df[columns_to_fill].fillna('missing') +# imputed_df[columns_to_fill] = imputed_df[columns_to_fill].fillna('missing') +# df[numeric_columns_to_fill] = df[numeric_columns_to_fill].fillna(0) +# imputed_df[numeric_columns_to_fill] = imputed_df[numeric_columns_to_fill].fillna(0) + +# # Define features and target +# X = df.drop(columns=["new_oms"]) +# y = df["new_oms"] + +# categorical_features = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] +# numerical_features = [ +# "inntekt_delta_oms", "emp_delta_oms", "befolkning_delta_oms", +# "inflation_rate_oms", "gjeldende_bdr_syss", "new_oms_trendForecast", +# 'oms_syssmean_basedOn_naring', 'oms_syssmean_basedOn_naring_kommune' +# ] + +# # Preprocessing pipeline +# preprocessor = ColumnTransformer( +# transformers=[ +# ("num", scaler, numerical_features), +# ("cat", OneHotEncoder(categories="auto", handle_unknown="ignore"), categorical_features), +# ] +# ) + +# # Split the data into training and testing sets +# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + +# # Fit the preprocessor and transform the training and testing data +# preprocessor.fit(X_train) +# X_train_transformed = preprocessor.transform(X_train) +# X_test_transformed = preprocessor.transform(X_test) + +# pca = PCA(n_components=n_components) +# X_train_pca = pca.fit_transform(X_train_transformed) +# X_test_pca = pca.transform(X_test_transformed) + +# # Automatically create and display the interactive PCA plot +# explained_variance = np.cumsum(pca.explained_variance_ratio_) +# fig = go.Figure() +# fig.add_trace(go.Scatter( +# x=np.arange(1, len(explained_variance) + 1), +# y=explained_variance, +# mode='lines+markers', +# marker=dict(size=8), +# hovertemplate='Component %{x}
Cumulative Explained Variance: %{y:.2f}', +# line=dict(dash='dash', color='blue') +# )) + +# # Update layout +# fig.update_layout( +# title='Cumulative Explained Variance by PCA Components', +# xaxis_title='Number of Components', +# yaxis_title='Cumulative Explained Variance', +# template="plotly_white" +# ) + +# # Show the plot +# fig.show() + +# # Get the PCA components and their corresponding feature importance +# pca_components = pd.DataFrame( +# pca.components_, +# columns=numerical_features + list(preprocessor.named_transformers_['cat'].get_feature_names_out()), +# index=[f"PC{i+1}" for i in range(pca.n_components_)] +# ) + +# # Display the top contributing features for each component +# for i in range(pca.n_components_): +# print(f"\nTop features for PC{i+1}:") +# component = pca_components.iloc[i] +# sorted_component = component.abs().sort_values(ascending=False) +# top_features = sorted_component.head(5).index.tolist() +# print(f"Top contributing features: {top_features}") +# print(component.loc[top_features]) + +# X_train_transformed = X_train_pca +# X_test_transformed = X_test_pca + +# if GridSearch: +# # Define the model and perform GridSearch with cross-validation +# regressor = KNeighborsRegressor() +# param_grid = {'n_neighbors': [2, 3, 5, 7]} +# grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1) +# grid_search.fit(X_train_transformed, y_train) +# print("Best parameters found by GridSearch:", grid_search.best_params_) +# regressor = grid_search.best_estimator_ +# else: +# regressor = KNeighborsRegressor(n_neighbors=2) +# regressor.fit(X_train_transformed, y_train) + +# # Evaluate with cross-validation +# cv_scores = cross_val_score(regressor, X_train_transformed, y_train, cv=5, scoring='neg_mean_absolute_error') +# mean_mae = -np.mean(cv_scores) +# std_mae = np.std(cv_scores) +# print(f"Cross-Validated Mean MAE: {mean_mae}") +# print(f"Cross-Validated MAE Standard Deviation: {std_mae}") + +# # Predict on test data +# y_pred = regressor.predict(X_test_transformed) +# mse = mean_squared_error(y_test, y_pred) +# r_squared = r2_score(y_test, y_pred) +# mae = mean_absolute_error(y_test, y_pred) +# print("Mean Squared Error:", mse) +# print("R-squared:", r_squared) +# print("Mean Absolute Error:", mae) + +# # Calculate MAE per year and print it +# results = X_test.copy() +# results['actual'] = y_test +# results['predicted'] = y_pred + +# if 'year' in results.columns: +# mae_per_year = results.groupby('year').apply(lambda group: mean_absolute_error(group['actual'], group['predicted'])) +# print("\nMean Absolute Error per Year:") +# print(mae_per_year) + +# # Create the n3 class by taking the first 4 characters of nacef_5 +# X_test['n3'] = X_test['nacef_5'].str[:4] + +# # Evaluate performance based on the n3 class +# results = pd.DataFrame({'n3': X_test['n3'], 'actual': y_test, 'predicted': y_pred}) + +# metrics_per_n3 = results.groupby('n3').apply(lambda group: pd.Series({ +# 'mse': mean_squared_error(group['actual'], group['predicted']), +# 'r_squared': r2_score(group['actual'], group['predicted']), +# 'mae': mean_absolute_error(group['actual'], group['predicted']) +# })).reset_index() + +# print("Metrics per 'n3':") +# print(metrics_per_n3) + +# # Plot Predicted vs. 
Actual Values +# plt.figure(figsize=(10, 5)) +# plt.scatter(y_test, y_pred, alpha=0.3) +# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--", lw=2) +# plt.xlabel("Actual") +# plt.ylabel("Predicted") +# plt.title("Predicted vs. Actual Values") +# plt.show() + +# # Plot Residuals +# residuals = y_test - y_pred +# plt.figure(figsize=(10, 5)) +# plt.scatter(y_test, residuals, alpha=0.3) +# plt.hlines(0, y_test.min(), y_test.max(), colors="r", linestyles="dashed") +# plt.xlabel("Actual") +# plt.ylabel("Residuals") +# plt.title("Residuals Plot") +# plt.show() + +# # Impute the missing data +# imputed_X = imputed_df.drop(columns=["new_oms"]) +# imputed_X_transformed = preprocessor.transform(imputed_X) + +# imputed_X_transformed = pca.transform(imputed_X_transformed) + +# imputed_df["predicted_oms"] = regressor.predict(imputed_X_transformed) + +# return imputed_df + +def knn_model_with_pca(training_df, scaler, df_estimeres, year, GridSearch=False, apply_pca=True, n_components=6): + """ Trains a K-Nearest Neighbors model for predicting new_oms values with optional PCA for dimensionality reduction and optional GridSearch for hyperparameter tuning. Includes interactive plot for explained variance. - + Parameters: training_df (pd.DataFrame): DataFrame containing the training data. scaler (object): Scaler object for numerical features (e.g., StandardScaler, RobustScaler). @@ -598,11 +994,12 @@ def knn_model_with_pca(training_df, scaler, df_estimeres, year, GridSearch=True, apply_pca (bool): Whether to apply PCA for dimensionality reduction. Default is True. n_components (int, float, or None): Number of components to keep after applying PCA. If None, it will not reduce dimensions. If a float is given (e.g., 0.95), PCA will select the number of components that explain that proportion of variance. - + Returns: pd.DataFrame: DataFrame with predicted new_oms values. 
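+
+    Example (illustrative values only; any sklearn scaler can be passed):
+        from sklearn.preprocessing import RobustScaler
+        imputed = knn_model_with_pca(training_data, RobustScaler(), imputatable_df,
+                                     year=2023, GridSearch=False, apply_pca=True)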
""" from sklearn.decomposition import PCA + import plotly.express as px import matplotlib.pyplot as plt import plotly.graph_objects as go import numpy as np @@ -617,6 +1014,9 @@ def knn_model_with_pca(training_df, scaler, df_estimeres, year, GridSearch=True, df = training_df.copy() imputed_df = df_estimeres.copy() + def filter_previous_years(df, current_year): + return df[df['year'] <= current_year] + categorical_columns = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] df[categorical_columns] = df[categorical_columns].astype(str) imputed_df[categorical_columns] = imputed_df[categorical_columns].astype(str) @@ -634,55 +1034,103 @@ def knn_model_with_pca(training_df, scaler, df_estimeres, year, GridSearch=True, df[numeric_columns_to_fill] = df[numeric_columns_to_fill].fillna(0) imputed_df[numeric_columns_to_fill] = imputed_df[numeric_columns_to_fill].fillna(0) - # Define features and target - X = df.drop(columns=["new_oms"]) - y = df["new_oms"] - - categorical_features = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] - numerical_features = [ - "inntekt_delta_oms", "emp_delta_oms", "befolkning_delta_oms", - "inflation_rate_oms", "gjeldende_bdr_syss", "new_oms_trendForecast", - 'oms_syssmean_basedOn_naring', 'oms_syssmean_basedOn_naring_kommune' - ] - - # Preprocessing pipeline - preprocessor = ColumnTransformer( - transformers=[ - ("num", scaler, numerical_features), - ("cat", OneHotEncoder(categories="auto", handle_unknown="ignore"), categorical_features), + # Loop through each unique year, train a model for each year + unique_years = df['year'].unique() + + for year in unique_years: + print("--------------------------------") + print(f"Training model for year: {year}") + + # Filter data to include only the current year and previous years + df_filtered = filter_previous_years(df, year) + + # Define features and target + X = df_filtered.drop(columns=["new_oms"]) + y = df_filtered["new_oms"] + + categorical_features = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] + numerical_features = [ + "inntekt_delta_oms", "emp_delta_oms", "befolkning_delta_oms", + "inflation_rate_oms", "gjeldende_bdr_syss", "new_oms_trendForecast", + 'oms_syssmean_basedOn_naring', 'oms_syssmean_basedOn_naring_kommune' ] - ) - # Split the data into training and testing sets - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + # Preprocessing pipeline + preprocessor = ColumnTransformer( + transformers=[ + ("num", scaler, numerical_features), + ("cat", OneHotEncoder(categories="auto", handle_unknown="ignore"), categorical_features), + ] + ) - # Fit the preprocessor and transform the training and testing data - preprocessor.fit(X_train) - X_train_transformed = preprocessor.transform(X_train) - X_test_transformed = preprocessor.transform(X_test) + # Split into training and testing sets (only train on past and current data) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + + # Fit the preprocessor and transform the training and testing data + preprocessor.fit(X_train) + X_train_transformed = preprocessor.transform(X_train) + X_test_transformed = preprocessor.transform(X_test) + + if apply_pca: + pca = PCA(n_components=n_components) + X_train_pca = pca.fit_transform(X_train_transformed) + X_test_pca = pca.transform(X_test_transformed) + X_train_transformed = X_train_pca + X_test_transformed = X_test_pca + + if GridSearch: + # Define the model and perform GridSearch with cross-validation + regressor = KNeighborsRegressor() + param_grid = {'n_neighbors': [2, 
3, 5, 7]} + grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1) + grid_search.fit(X_train_transformed, y_train) + print("Best parameters found by GridSearch:", grid_search.best_params_) + regressor = grid_search.best_estimator_ + else: + regressor = KNeighborsRegressor(n_neighbors=2) + regressor.fit(X_train_transformed, y_train) + + # Predict on test data + y_pred = regressor.predict(X_test_transformed) + mse = mean_squared_error(y_test, y_pred) + r_squared = r2_score(y_test, y_pred) + mae = mean_absolute_error(y_test, y_pred) + print(f"Year: {year} - Mean Squared Error:", mse) + print(f"Year: {year} - R-squared:", r_squared) + print(f"Year: {year} - Mean Absolute Error:", mae) + + # Create the n3 class by taking the first 4 characters of nacef_5 + X_test['n3'] = X_test['nacef_5'].str[:4] - pca = PCA(n_components=n_components) - X_train_pca = pca.fit_transform(X_train_transformed) - X_test_pca = pca.transform(X_test_transformed) + # Evaluate performance based on the n3 class + results = pd.DataFrame({'n3': X_test['n3'], 'actual': y_test, 'predicted': y_pred}) - # Automatically create and display the interactive PCA plot + metrics_per_n3 = results.groupby('n3').apply(lambda group: pd.Series({ + 'mse': mean_squared_error(group['actual'], group['predicted']), + 'r_squared': r2_score(group['actual'], group['predicted']), + 'mae': mean_absolute_error(group['actual'], group['predicted']) + })).reset_index() + explained_variance = np.cumsum(pca.explained_variance_ratio_) fig = go.Figure() + + # Create a smooth light green curve with larger markers and no dash fig.add_trace(go.Scatter( x=np.arange(1, len(explained_variance) + 1), y=explained_variance, mode='lines+markers', - marker=dict(size=8), - hovertemplate='Component %{x}
Cumulative Explained Variance: %{y:.2f}', - line=dict(dash='dash', color='blue') + marker=dict(size=10, color='lightgreen'), # Larger markers with light green color + line=dict(color='lightgreen', width=3), # Light green curve, thicker line + hovertemplate='Component %{x}
Cumulative Explained Variance: %{y:.2f}' )) - # Update layout + # Update layout for cleaner presentation fig.update_layout( title='Cumulative Explained Variance by PCA Components', xaxis_title='Number of Components', yaxis_title='Cumulative Explained Variance', - template="plotly_white" + template="plotly_white", # Use a clean white template + font=dict(size=14) # Adjust font size for readability ) # Show the plot @@ -704,59 +1152,6 @@ def knn_model_with_pca(training_df, scaler, df_estimeres, year, GridSearch=True, print(f"Top contributing features: {top_features}") print(component.loc[top_features]) - X_train_transformed = X_train_pca - X_test_transformed = X_test_pca - - if GridSearch: - # Define the model and perform GridSearch with cross-validation - regressor = KNeighborsRegressor() - param_grid = {'n_neighbors': [2, 3, 5, 7]} - grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1) - grid_search.fit(X_train_transformed, y_train) - print("Best parameters found by GridSearch:", grid_search.best_params_) - regressor = grid_search.best_estimator_ - else: - regressor = KNeighborsRegressor(n_neighbors=2) - regressor.fit(X_train_transformed, y_train) - - # Evaluate with cross-validation - cv_scores = cross_val_score(regressor, X_train_transformed, y_train, cv=5, scoring='neg_mean_absolute_error') - mean_mae = -np.mean(cv_scores) - std_mae = np.std(cv_scores) - print(f"Cross-Validated Mean MAE: {mean_mae}") - print(f"Cross-Validated MAE Standard Deviation: {std_mae}") - - # Predict on test data - y_pred = regressor.predict(X_test_transformed) - mse = mean_squared_error(y_test, y_pred) - r_squared = r2_score(y_test, y_pred) - mae = mean_absolute_error(y_test, y_pred) - print("Mean Squared Error:", mse) - print("R-squared:", r_squared) - print("Mean Absolute Error:", mae) - - # Calculate MAE per year and print it - results = X_test.copy() - results['actual'] = y_test - results['predicted'] = y_pred - - if 'year' in results.columns: - mae_per_year = results.groupby('year').apply(lambda group: mean_absolute_error(group['actual'], group['predicted'])) - print("\nMean Absolute Error per Year:") - print(mae_per_year) - - # Create the n3 class by taking the first 4 characters of nacef_5 - X_test['n3'] = X_test['nacef_5'].str[:4] - - # Evaluate performance based on the n3 class - results = pd.DataFrame({'n3': X_test['n3'], 'actual': y_test, 'predicted': y_pred}) - - metrics_per_n3 = results.groupby('n3').apply(lambda group: pd.Series({ - 'mse': mean_squared_error(group['actual'], group['predicted']), - 'r_squared': r2_score(group['actual'], group['predicted']), - 'mae': mean_absolute_error(group['actual'], group['predicted']) - })).reset_index() - print("Metrics per 'n3':") print(metrics_per_n3) @@ -766,7 +1161,7 @@ def knn_model_with_pca(training_df, scaler, df_estimeres, year, GridSearch=True, plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--", lw=2) plt.xlabel("Actual") plt.ylabel("Predicted") - plt.title("Predicted vs. Actual Values") + plt.title("Predicted vs. Actual Values. Static Plot") plt.show() # Plot Residuals @@ -776,347 +1171,229 @@ def knn_model_with_pca(training_df, scaler, df_estimeres, year, GridSearch=True, plt.hlines(0, y_test.min(), y_test.max(), colors="r", linestyles="dashed") plt.xlabel("Actual") plt.ylabel("Residuals") - plt.title("Residuals Plot") + plt.title("Residuals Plot. 
Static") plt.show() - # Impute the missing data + # Create a DataFrame to hold all information + plot_data = pd.DataFrame({ + 'Actual': y_test, + 'Predicted': y_pred, + 'Residuals': y_test - y_pred + }) + + + # Impute the missing data for the current year imputed_X = imputed_df.drop(columns=["new_oms"]) imputed_X_transformed = preprocessor.transform(imputed_X) - - imputed_X_transformed = pca.transform(imputed_X_transformed) - + + if apply_pca: + imputed_X_transformed = pca.transform(imputed_X_transformed) + imputed_df["predicted_oms"] = regressor.predict(imputed_X_transformed) return imputed_df +from sklearn.neighbors import KNeighborsRegressor +from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error +from sklearn.preprocessing import OneHotEncoder +from sklearn.compose import ColumnTransformer +from sklearn.decomposition import PCA +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import plotly.graph_objects as go +from sklearn.model_selection import train_test_split, GridSearchCV -def knn_model_new(training_df, scaler, df_estimeres, current_year, GridSearch=True): - import numpy as np - from sklearn.model_selection import GridSearchCV - from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error - from sklearn.preprocessing import OneHotEncoder - from sklearn.compose import ColumnTransformer - from sklearn.neighbors import KNeighborsRegressor - import matplotlib.pyplot as plt - import pandas as pd - +def knn_model_with_pca_custom_distance(training_df, scaler, df_estimeres, year, GridSearch=False, apply_pca=True, n_components=6): + """ + Trains a K-Nearest Neighbors model for predicting new_oms values with optional PCA for dimensionality reduction + and custom distance metric to penalize future years. + + Parameters: + training_df (pd.DataFrame): DataFrame containing the training data. + scaler (object): Scaler object for numerical features (e.g., StandardScaler, RobustScaler). + df_estimeres (pd.DataFrame): DataFrame containing the data to be imputed. + GridSearch (bool): Whether to perform GridSearch for hyperparameter tuning. Default is True. + apply_pca (bool): Whether to apply PCA for dimensionality reduction. Default is True. + n_components (int, float, or None): Number of components to keep after applying PCA. If None, it will not reduce dimensions. + + Returns: + pd.DataFrame: DataFrame with predicted new_oms values. 
+ """ + + # Make copies of the input DataFrames df = training_df.copy() imputed_df = df_estimeres.copy() - print('Preparing the data') - - print("training_data shape:", df.shape) - print("imputed_df shape:", imputed_df.shape) + # Include year as a feature categorical_columns = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] df[categorical_columns] = df[categorical_columns].astype(str) imputed_df[categorical_columns] = imputed_df[categorical_columns].astype(str) columns_to_fill = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] numeric_columns_to_fill = [ - "inntekt_delta_oms", - "emp_delta_oms", - "befolkning_delta_oms", - "inflation_rate_oms", - "gjeldende_bdr_syss", - "new_oms_trendForecast", - 'oms_syssmean_basedOn_naring', - 'oms_syssmean_basedOn_naring_kommune' + "inntekt_delta_oms", "emp_delta_oms", "befolkning_delta_oms", + "inflation_rate_oms", "gjeldende_bdr_syss", "new_oms_trendForecast", + 'oms_syssmean_basedOn_naring', 'oms_syssmean_basedOn_naring_kommune', + 'year' # Add 'year' to the list of numeric columns ] - + + # Fill missing values df[columns_to_fill] = df[columns_to_fill].fillna('missing') imputed_df[columns_to_fill] = imputed_df[columns_to_fill].fillna('missing') df[numeric_columns_to_fill] = df[numeric_columns_to_fill].fillna(0) imputed_df[numeric_columns_to_fill] = imputed_df[numeric_columns_to_fill].fillna(0) - X = df.drop(columns=["new_oms"]) - y = df["new_oms"] - - print('Transforming the data') + # Preprocessing pipeline + categorical_features = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] + numerical_features = [ + "inntekt_delta_oms", "emp_delta_oms", "befolkning_delta_oms", + "inflation_rate_oms", "gjeldende_bdr_syss", "new_oms_trendForecast", + 'oms_syssmean_basedOn_naring', 'oms_syssmean_basedOn_naring_kommune', + 'year' # Include 'year' in the list of features for training + ] preprocessor = ColumnTransformer( transformers=[ - ("num", scaler, numeric_columns_to_fill), - ("cat", OneHotEncoder(categories="auto", handle_unknown="ignore"), categorical_columns), + ("num", scaler, numerical_features), + ("cat", OneHotEncoder(categories="auto", handle_unknown="ignore"), categorical_features), ] ) - - X_transformed = preprocessor.fit_transform(X) - - print('Converting from dense to sparse') - - # Convert to dense if sparse - if hasattr(X_transformed, "toarray"): - X_transformed = X_transformed.toarray() - - year_column = df[['year']].values - - def custom_distance(x, y): - year_diff = np.abs(x[-1] - y[-1]) - if year_diff > 0: - return np.inf - return np.linalg.norm(x[:-1] - y[:-1]) - - if GridSearch: - print('GridSearch is on. Performing GridSearch') - param_grid = {'n_neighbors': [2, 3, 5, 7]} - knn = KNeighborsRegressor(metric=custom_distance) - grid_search = GridSearchCV(knn, param_grid, scoring='neg_mean_squared_error', cv=5) - grid_search.fit(np.hstack([X_transformed, year_column]), y) - knn = grid_search.best_estimator_ - else: - ('GridSearch is off. Training the model') - knn = KNeighborsRegressor(n_neighbors=2, metric=custom_distance) - knn.fit(np.hstack([X_transformed, year_column]), y) - - X_imputed = imputed_df.drop(columns=["new_oms"]) - ('Preprocessor.transform started. 
X imputed') - X_imputed_transformed = preprocessor.transform(X_imputed) - - if hasattr(X_imputed_transformed, "toarray"): - ('X imputed - toarray') - X_imputed_transformed = X_imputed_transformed.toarray() - - current_year_column = np.full((X_imputed_transformed.shape[0], 1), current_year) - ('predict for imputed') - imputed_df["predicted_oms"] = knn.predict(np.hstack([X_imputed_transformed, current_year_column])) - - X_test = df[df['year'] == current_year].drop(columns=["new_oms", "year"]) - y_test = df[df['year'] == current_year]["new_oms"] - X_test_transformed = preprocessor.transform(X_test) - - if hasattr(X_test_transformed, "toarray"): - ('X test transformed to array') - X_test_transformed = X_test_transformed.toarray() - - current_year_column_test = np.full((X_test_transformed.shape[0], 1), current_year) - ('predict y_pred') - y_pred = knn.predict(np.hstack([X_test_transformed, current_year_column_test])) - - print('Evaluate model:') - mse = mean_squared_error(y_test, y_pred) - r2 = r2_score(y_test, y_pred) - mae = mean_absolute_error(y_test, y_pred) - print(f"Mean Squared Error: {mse}") - print(f"R-squared: {r2}") - print(f"Mean Absolute Error: {mae}") - - plt.figure(figsize=(10, 5)) - plt.scatter(y_test, y_pred, alpha=0.3) - plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--", lw=2) - plt.xlabel("Actual") - plt.ylabel("Predicted") - plt.title("Predicted vs Actual Values") - plt.show() - - - return imputed_df - - -def knn_model_fast(training_df, scaler, df_estimeres, current_year, GridSearch=True): - import numpy as np - from sklearn.model_selection import GridSearchCV - from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error - from sklearn.preprocessing import OneHotEncoder - from sklearn.compose import ColumnTransformer - from sklearn.neighbors import KNeighborsRegressor - from sklearn.decomposition import PCA - import matplotlib.pyplot as plt - import pandas as pd - import scipy.sparse as sp - import time - - print('Preparing the data') - df = training_df.copy() - imputed_df = df_estimeres.copy() - - categorical_columns = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] - df[categorical_columns] = df[categorical_columns].astype(str) - imputed_df[categorical_columns] = imputed_df[categorical_columns].astype(str) - - df[categorical_columns] = df[categorical_columns].fillna('missing') - imputed_df[categorical_columns] = imputed_df[categorical_columns].fillna('missing') - numeric_columns_to_fill = [ - "inntekt_delta_oms", "emp_delta_oms", "befolkning_delta_oms", - "inflation_rate_oms", "gjeldende_bdr_syss", "new_oms_trendForecast", - 'oms_syssmean_basedOn_naring', 'oms_syssmean_basedOn_naring_kommune' - ] - df[numeric_columns_to_fill] = df[numeric_columns_to_fill].fillna(0) - imputed_df[numeric_columns_to_fill] = imputed_df[numeric_columns_to_fill].fillna(0) - + # Split the data into training and testing sets X = df.drop(columns=["new_oms"]) y = df["new_oms"] - - print('Scaling and OneHotEncoding the data') - preprocessor = ColumnTransformer( - transformers=[ - ("num", scaler, numeric_columns_to_fill), - ("cat", OneHotEncoder(categories="auto", handle_unknown="ignore", sparse_output=True), categorical_columns), - ] - ) - - print('Fitting and transforming the data') - X_transformed = preprocessor.fit_transform(X) - year_column = sp.csr_matrix(df[['year']].values) - - # Reduce dimensionality with PCA - print('Applying PCA for dimensionality reduction') - pca = PCA(n_components=50) - X_transformed_pca = pca.fit_transform(X_transformed.toarray()) + 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - # Include year as a sparse column - X_transformed_pca = sp.hstack([sp.csr_matrix(X_transformed_pca), year_column], format='csr') - - def custom_distance(x, y): - # Extract the year directly from sparse matrices without converting the full matrix - x_year = x[0, -1] if sp.issparse(x) else x[-1] - y_year = y[0, -1] if sp.issparse(y) else y[-1] - - # Only apply penalty if y's year is in the future compared to x's year - if y_year > x_year: - return np.inf - - # Extract the non-year feature vectors - x_no_year = x[0, :-1] if sp.issparse(x) else x[:-1] - y_no_year = y[0, :-1] if sp.issparse(y) else y[:-1] + # Fit the preprocessor and transform the training and testing data + preprocessor.fit(X_train) + X_train_transformed = preprocessor.transform(X_train) + X_test_transformed = preprocessor.transform(X_test) - # Use the sparse matrix norm for efficient calculation on non-year features - return sp.linalg.norm(x_no_year - y_no_year) if sp.issparse(x_no_year) else np.linalg.norm(x_no_year - y_no_year) + if apply_pca: + pca = PCA(n_components=n_components) + X_train_pca = pca.fit_transform(X_train_transformed) + X_test_pca = pca.transform(X_test_transformed) + X_train_transformed = X_train_pca + X_test_transformed = X_test_pca + # Define a custom distance metric that penalizes future years + def custom_distance_metric(x1, x2): + year_diff = x1[-1] - x2[-1] # Compare the year feature, assuming it's the last column + if year_diff > 0: + # Penalize if the current point is from a future year + penalty = 1e10 # A very large number to ensure it's not chosen as a neighbor + else: + penalty = 0 + return np.linalg.norm(x1[:-1] - x2[:-1]) + penalty + # Initialize the KNN model with the custom distance metric + regressor = KNeighborsRegressor(n_neighbors=2, metric=custom_distance_metric) if GridSearch: - print('GridSearch is on. Performing GridSearch') - knn = KNeighborsRegressor(metric=custom_distance, n_jobs=-1) param_grid = {'n_neighbors': [2, 3, 5, 7]} - grid_search = GridSearchCV(knn, param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1) - grid_search.fit(X_transformed_pca, y) - knn = grid_search.best_estimator_ + grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1) + grid_search.fit(X_train_transformed, y_train) + print("Best parameters found by GridSearch:", grid_search.best_params_) + regressor = grid_search.best_estimator_ else: - print('GridSearch is off. 
Training the model') - knn = KNeighborsRegressor(n_neighbors=2, metric=custom_distance, n_jobs=-1) - knn.fit(X_transformed_pca, y) - - print('Imputing data...') - start_time = time.time() - X_imputed = imputed_df.drop(columns=["new_oms"]) - X_imputed_transformed = preprocessor.transform(X_imputed) - X_imputed_transformed_pca = pca.transform(X_imputed_transformed.toarray()) - current_year_column = sp.csr_matrix(np.full((X_imputed_transformed_pca.shape[0], 1), current_year)) - X_imputed_combined = sp.hstack([sp.csr_matrix(X_imputed_transformed_pca), current_year_column], format='csr') - imputed_df["predicted_oms"] = knn.predict(X_imputed_combined) - end_time = time.time() - - elapsed_time = end_time - start_time - print(f"Elapsed time for imputation prediction: {elapsed_time:.2f} seconds") - - print('Evaluating the model') - X_test = df[df['year'] == current_year].drop(columns=["new_oms", "year"]) - y_test = df[df['year'] == current_year]["new_oms"] - X_test_transformed = preprocessor.transform(X_test) - X_test_transformed_pca = pca.transform(X_test_transformed.toarray()) - current_year_column_test = sp.csr_matrix(np.full((X_test_transformed_pca.shape[0], 1), current_year)) - X_test_combined = sp.hstack([sp.csr_matrix(X_test_transformed_pca), current_year_column_test], format='csr') - start_time = time.time() - y_pred = knn.predict(X_test_combined) - - elapsed_time = end_time - start_time - print(f"Elapsed time for test prediction: {elapsed_time:.2f} seconds") + regressor.fit(X_train_transformed, y_train) + # Predict on test data + y_pred = regressor.predict(X_test_transformed) mse = mean_squared_error(y_test, y_pred) - r2 = r2_score(y_test, y_pred) + r_squared = r2_score(y_test, y_pred) mae = mean_absolute_error(y_test, y_pred) print(f"Mean Squared Error: {mse}") - print(f"R-squared: {r2}") + print(f"R-squared: {r_squared}") print(f"Mean Absolute Error: {mae}") + # Plot Predicted vs. Actual Values plt.figure(figsize=(10, 5)) plt.scatter(y_test, y_pred, alpha=0.3) plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--", lw=2) plt.xlabel("Actual") plt.ylabel("Predicted") - plt.title("Predicted vs Actual Values") + plt.title("Predicted vs. Actual Values") plt.show() - return imputed_df + # Plot Residuals + residuals = y_test - y_pred + plt.figure(figsize=(10, 5)) + plt.scatter(y_test, residuals, alpha=0.3) + plt.hlines(0, y_test.min(), y_test.max(), colors="r", linestyles="dashed") + plt.xlabel("Actual") + plt.ylabel("Residuals") + plt.title("Residuals Plot") + plt.show() + + # Impute the missing data + imputed_X = imputed_df.drop(columns=["new_oms"]) + imputed_X_transformed = preprocessor.transform(imputed_X) + if apply_pca: + imputed_X_transformed = pca.transform(imputed_X_transformed) + imputed_df["predicted_oms"] = regressor.predict(imputed_X_transformed) + + return imputed_df -def knn_model_filtered_for_current_year(training_df, scaler, df_estimeres, year, GridSearch=True): - """ - Trains a K-Nearest Neighbors model for predicting new_oms values with an optional GridSearch for hyperparameter tuning. +def knn_model_with_pca_mae_per_year(training_df, scaler, df_estimeres, year, GridSearch=False, apply_pca=True, n_components=6): + """ + Trains a K-Nearest Neighbors model for predicting new_oms values with optional PCA for dimensionality reduction + and optional GridSearch for hyperparameter tuning. Calculates MAE per year after final model training. + Parameters: training_df (pd.DataFrame): DataFrame containing the training data. 
scaler (object): Scaler object for numerical features (e.g., StandardScaler, RobustScaler). df_estimeres (pd.DataFrame): DataFrame containing the data to be imputed. GridSearch (bool): Whether to perform GridSearch for hyperparameter tuning. Default is True. - + apply_pca (bool): Whether to apply PCA for dimensionality reduction. Default is True. + n_components (int, float, or None): Number of components to keep after applying PCA. If None, it will not reduce dimensions. + Returns: pd.DataFrame: DataFrame with predicted new_oms values. """ - import numpy as np - from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score + from sklearn.decomposition import PCA + from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error from sklearn.preprocessing import OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.neighbors import KNeighborsRegressor - import matplotlib.pyplot as plt import pandas as pd + import numpy as np + import matplotlib.pyplot as plt # Make copies of the input DataFrames df = training_df.copy() imputed_df = df_estimeres.copy() - df = df[df['year'] == year] - + # Define categorical and numerical columns categorical_columns = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] df[categorical_columns] = df[categorical_columns].astype(str) imputed_df[categorical_columns] = imputed_df[categorical_columns].astype(str) - # Columns to fill with 'missing' and 0 respectively columns_to_fill = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] numeric_columns_to_fill = [ - "inntekt_delta_oms", - "emp_delta_oms", - "befolkning_delta_oms", - "inflation_rate_oms", - "gjeldende_bdr_syss", - "new_oms_trendForecast", - 'oms_syssmean_basedOn_naring', - 'oms_syssmean_basedOn_naring_kommune' + "inntekt_delta_oms", "emp_delta_oms", "befolkning_delta_oms", + "inflation_rate_oms", "gjeldende_bdr_syss", "new_oms_trendForecast", + 'oms_syssmean_basedOn_naring', 'oms_syssmean_basedOn_naring_kommune' ] - # Fill NaN values with 'missing' for the specified columns + # Fill missing values df[columns_to_fill] = df[columns_to_fill].fillna('missing') imputed_df[columns_to_fill] = imputed_df[columns_to_fill].fillna('missing') - - # Fill NaN values with 0 for the specified columns df[numeric_columns_to_fill] = df[numeric_columns_to_fill].fillna(0) imputed_df[numeric_columns_to_fill] = imputed_df[numeric_columns_to_fill].fillna(0) - # Convert specified columns to category type - # categorical_columns = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] - for col in categorical_columns: - df[col] = df[col].astype("category") - - # Define features and target - X = df.drop(columns=["new_oms"]) - y = df["new_oms"] - - # Define preprocessor + # Preprocessing pipeline categorical_features = ["nacef_5", "tmp_sn2007_5", "b_kommunenr"] - numerical_features = [ - "inntekt_delta_oms", - "emp_delta_oms", - "befolkning_delta_oms", - "inflation_rate_oms", - "gjeldende_bdr_syss", - "new_oms_trendForecast", - 'oms_syssmean_basedOn_naring', - 'oms_syssmean_basedOn_naring_kommune' + "inntekt_delta_oms", "emp_delta_oms", "befolkning_delta_oms", + "inflation_rate_oms", "gjeldende_bdr_syss", "new_oms_trendForecast", + 'oms_syssmean_basedOn_naring', 'oms_syssmean_basedOn_naring_kommune' ] preprocessor = ColumnTransformer( @@ -1125,131 +1402,82 @@ def knn_model_filtered_for_current_year(training_df, scaler, df_estimeres, year, ("cat", OneHotEncoder(categories="auto", handle_unknown="ignore"), categorical_features), ] ) - - # 
Split the data into training and testing sets - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - - # Fit the preprocessor and transform the training and testing data - preprocessor.fit(X_train) - X_train_transformed = preprocessor.transform(X_train) - X_test_transformed = preprocessor.transform(X_test) - - if GridSearch: - # Define the model - regressor = KNeighborsRegressor() - - # Define parameter grid for GridSearch - param_grid = { - 'n_neighbors': [2, 3, 5, 7] - } - - # Perform GridSearch with cross-validation - grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1) - grid_search.fit(X_train_transformed, y_train) - - # Print best parameters - print("Best parameters found by GridSearch:", grid_search.best_params_) - - # Use best estimator from grid search - regressor = grid_search.best_estimator_ - else: - # Define the model with default parameters - regressor = KNeighborsRegressor(n_neighbors=2) - - # Train the model - regressor.fit(X_train_transformed, y_train) - - # Perform cross-validation using MAE as the scoring metric - cv_scores = cross_val_score(regressor, X_train_transformed, y_train, cv=5, scoring='neg_mean_absolute_error') - - # Since cross_val_score returns negative values for error metrics, we negate them to get the actual MAE - mean_mae = -np.mean(cv_scores) - std_mae = np.std(cv_scores) - - print(f"Cross-Validated Mean MAE: {mean_mae}") - print(f"Cross-Validated MAE Standard Deviation: {std_mae}") - - - # Predict on test data - y_pred = regressor.predict(X_test_transformed) - - # Calculate metrics - mse = mean_squared_error(y_test, y_pred) - r_squared = r2_score(y_test, y_pred) - mae = mean_absolute_error(y_test, y_pred) - print("Mean Squared Error:", mse) - print("R-squared:", r_squared) - print("Mean Absolute Error:", mae) - - # Create the n3 class by taking the first 4 characters of nacef_5 - X_test['n3'] = X_test['nacef_5'].str[:4] - # Evaluate performance based on the n3 class - results = pd.DataFrame({'n3': X_test['n3'], 'actual': y_test, 'predicted': y_pred}) + # Calculate MAE per year + mae_per_year = [] + start_year = df['year'].min() + end_year = df['year'].max() - # Define the n3 categories to exclude - n3_to_exclude = ['45.1', '45.2', '46.3', '46.4', '46.5', '46.7', '46.9', '10.4', '02.4'] + for current_year in range(start_year, end_year + 1): + # Split into training and testing based on the year + train_df = df[df["year"] < current_year] + test_df = df[df["year"] == current_year] + + if train_df.empty or test_df.empty: + print(f"No data available for training or testing for the year {current_year}. 
Skipping this year.") + continue - # Check if there are any n3 categories not in the excluded list - if not results['n3'].isin(n3_to_exclude).all(): - # Filter out the rows where the n3 is in the excluded list - filtered_results = results[~results['n3'].isin(n3_to_exclude)] + # Define features and target for train and test + X_train = train_df.drop(columns=["new_oms", "year"]) + y_train = train_df["new_oms"] + X_test = test_df.drop(columns=["new_oms", "year"]) + y_test = test_df["new_oms"] - # Extract the actual and predicted values after filtering - filtered_y_test = filtered_results['actual'] - filtered_y_pred = filtered_results['predicted'] + # Transform the data + preprocessor.fit(X_train) + X_train_transformed = preprocessor.transform(X_train) + X_test_transformed = preprocessor.transform(X_test) - # Recalculate the evaluation metrics excluding the specified n3 categories - filtered_mse = mean_squared_error(filtered_y_test, filtered_y_pred) - filtered_mae = mean_absolute_error(filtered_y_test, filtered_y_pred) - filtered_r_squared = r2_score(filtered_y_test, filtered_y_pred) - filtered_rmse = np.sqrt(filtered_mse) + if apply_pca: + pca = PCA(n_components=n_components) + X_train_pca = pca.fit_transform(X_train_transformed) + X_test_pca = pca.transform(X_test_transformed) + X_train_transformed = X_train_pca + X_test_transformed = X_test_pca - # Print out the filtered metrics - print(f"Filtered Mean Squared Error (MSE): {filtered_mse}") - print(f"Filtered Mean Absolute Error (MAE): {filtered_mae}") - print(f"Filtered R-squared score: {filtered_r_squared}") - print(f"Filtered Root Mean Squared Error (RMSE): {filtered_rmse}") - else: - print("No valid n3 categories found after exclusion. Skipping filtered metrics calculation.") + # Define and fit the model + if GridSearch: + param_grid = {'n_neighbors': [2, 3, 5, 7]} + grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, scoring='neg_mean_squared_error', cv=5) + grid_search.fit(X_train_transformed, y_train) + model = grid_search.best_estimator_ + else: + model = KNeighborsRegressor(n_neighbors=2) + model.fit(X_train_transformed, y_train) - metrics_per_n3 = results.groupby('n3').apply(lambda group: pd.Series({ - 'mse': mean_squared_error(group['actual'], group['predicted']), - 'r_squared': r2_score(group['actual'], group['predicted']), - 'mae': mean_absolute_error(group['actual'], group['predicted']) - })).reset_index() - - print("Metrics per 'n3':") - print(metrics_per_n3) - - # Plot Predicted vs. Actual Values - plt.figure(figsize=(10, 5)) - plt.scatter(y_test, y_pred, alpha=0.3) - plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "r--", lw=2) - plt.xlabel("Actual") - plt.ylabel("Predicted") - plt.title("Predicted vs. 
Actual Values") - plt.show() + # Make predictions and calculate MAE + y_pred = model.predict(X_test_transformed) + mae = mean_absolute_error(y_test, y_pred) + mae_per_year.append((current_year, mae)) + print(f"Year {current_year}: MAE = {mae}") - # Plot Residuals - residuals = y_test - y_pred - plt.figure(figsize=(10, 5)) - plt.scatter(y_test, residuals, alpha=0.3) - plt.hlines(0, y_test.min(), y_test.max(), colors="r", linestyles="dashed") - plt.xlabel("Actual") - plt.ylabel("Residuals") - plt.title("Residuals Plot") - plt.show() + # Print MAE for each year + print("\nMAE per Year:") + for year, mae in mae_per_year: + print(f"{year}: {mae}") - # Impute the missing data + # Optionally, plot MAE over the years + if mae_per_year: + years, mae_values = zip(*mae_per_year) + plt.figure(figsize=(10, 5)) + plt.plot(years, mae_values, marker='o') + plt.xlabel("Year") + plt.ylabel("Mean Absolute Error (MAE)") + plt.title("MAE per Year") + plt.grid(True) + plt.show() + + # Final training with the most recent data + # Impute the missing data for the current year imputed_X = imputed_df.drop(columns=["new_oms"]) imputed_X_transformed = preprocessor.transform(imputed_X) - imputed_df["predicted_oms"] = regressor.predict(imputed_X_transformed) - - return imputed_df - + if apply_pca: + imputed_X_transformed = pca.transform(imputed_X_transformed) + imputed_df["predicted_oms"] = model.predict(imputed_X_transformed) + + return imputed_df def evaluate_year_based_mae(training_df, scaler, df_estimeres, start_year=2017, end_year=2023, GridSearch=True): diff --git a/src/functions/oppdateringsfil.py b/src/functions/oppdateringsfil.py index 816a3df..eb54396 100644 --- a/src/functions/oppdateringsfil.py +++ b/src/functions/oppdateringsfil.py @@ -67,14 +67,66 @@ -def create_bedrift_fil(year, model, rate, scaler, skjema, tosiffernaring, distribtion_percent, rerun_ml=False, geo_data=False, uu_data=False, GridSearch=False): +def create_bedrift_fil(year, model, rate, scaler, skjema, tosiffernaring, distribtion_percent, rerun_ml=False, geo_data=False, uu_data=False, GridSearch=False, collect_data=False): start_time = time.time() - print('starting to collect data') + tosiffernaring_str = '_'.join(tosiffernaring) - # Generate the data required for processing using the create_datafiles.main function - current_year_good_oms, current_year_bad_oms, v_orgnr_list_for_imputering, training_data, imputatable_df, time_series_df, unique_id_list = create_datafiles.main(year, rate, skjema, distribtion_percent, tosiffernaring, geo_data=geo_data, uu_data=uu_data) + if collect_data: + + print('starting to collect data') + + # Generate the data required for processing using the create_datafiles.main function + current_year_good_oms, current_year_bad_oms, v_orgnr_list_for_imputering, training_data, imputatable_df, time_series_df, unique_id_list = create_datafiles.main(year, rate, skjema, distribtion_percent, tosiffernaring, geo_data=geo_data, uu_data=uu_data) + + current_year_good_oms['orgnr_n_1'] = current_year_good_oms['orgnr_n_1'].astype(str) + current_year_bad_oms['orgnr_n_1'] = current_year_bad_oms['orgnr_n_1'].astype(str) + training_data['orgnr_n_1'] = training_data['orgnr_n_1'].astype(str) + imputatable_df['orgnr_n_1'] = imputatable_df['orgnr_n_1'].astype(str) + current_year_good_oms['tmp_no_p4005'] = pd.to_numeric(current_year_good_oms['tmp_no_p4005'], errors='coerce') + current_year_bad_oms['tmp_no_p4005'] = pd.to_numeric(current_year_bad_oms['tmp_no_p4005'], errors='coerce') + time_series_df['orgnr_n_1'] = 
time_series_df['orgnr_n_1'].astype(str) + time_series_df['tmp_no_p4005'] = pd.to_numeric(time_series_df['tmp_no_p4005'], errors='coerce') + + v_orgnr_list_for_imputering_df = pd.DataFrame(v_orgnr_list_for_imputering, columns=['v_orgnr']) + unique_id_list_df = pd.DataFrame(unique_id_list, columns=['id']) + + + current_year_good_oms.to_parquet( + f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/good_oms/skjema={skjema}/aar={year}/current_year_good_oms_{tosiffernaring_str}.parquet", filesystem=fs) + + current_year_bad_oms.to_parquet( + f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/bad_oms/skjema={skjema}/aar={year}/current_year_bad_oms.parquet_{tosiffernaring_str}", filesystem=fs) + + v_orgnr_list_for_imputering_df.to_parquet( + f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/imputering_list/skjema={skjema}/aar={year}/v_orgnr_list_for_imputering_{tosiffernaring_str}.parquet", filesystem=fs) + + training_data.to_parquet( + f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/training_data/skjema={skjema}/aar={year}/training_data_{tosiffernaring_str}.parquet", filesystem=fs) + + imputatable_df.to_parquet( + f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/trenger_imputering_dfs/skjema={skjema}/aar={year}/imputable_df_{tosiffernaring_str}.parquet", filesystem=fs) + + unique_id_list_df.to_parquet( + f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/unique_id_list/skjema={skjema}/aar={year}/unique_id_list_{tosiffernaring_str}.parquet", filesystem=fs) + + time_series_df.to_parquet( + f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/time_series/skjema={skjema}/aar={year}/time_series_{tosiffernaring_str}.parquet", filesystem=fs) + + else: + + current_year_good_oms = pd.read_parquet(f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/good_oms/skjema={skjema}/aar={year}/current_year_good_oms_{tosiffernaring_str}.parquet", filesystem=fs) + current_year_bad_oms = pd.read_parquet(f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/bad_oms/skjema={skjema}/aar={year}/current_year_bad_oms.parquet_{tosiffernaring_str}", filesystem=fs) + v_orgnr_list_for_imputering_df = pd.read_parquet(f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/imputering_list/skjema={skjema}/aar={year}/v_orgnr_list_for_imputering_{tosiffernaring_str}.parquet", filesystem=fs) + training_data = pd.read_parquet(f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/training_data/skjema={skjema}/aar={year}/training_data_{tosiffernaring_str}.parquet", filesystem=fs) + imputatable_df = pd.read_parquet(f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/trenger_imputering_dfs/skjema={skjema}/aar={year}/imputable_df_{tosiffernaring_str}.parquet", filesystem=fs) + unique_id_list_df = pd.read_parquet(f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/unique_id_list/skjema={skjema}/aar={year}/unique_id_list_{tosiffernaring_str}.parquet", filesystem=fs) + time_series_df = pd.read_parquet(f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/arbeidsfiler/time_series/skjema={skjema}/aar={year}/time_series_{tosiffernaring_str}.parquet", 
filesystem=fs) + + v_orgnr_list_for_imputering = v_orgnr_list_for_imputering_df['v_orgnr'].tolist() + unique_id_list = unique_id_list_df['id'].tolist() + # Construct the function name dynamically based on the model parameter function_name = f"{model}" @@ -96,20 +148,23 @@ def create_bedrift_fil(year, model, rate, scaler, skjema, tosiffernaring, distri else: # Call the function without the additional parameters imputed_df = function_to_call(training_data, scaler, imputatable_df, year, GridSearch=GridSearch) + + # Ensure the problematic column is of string type + imputed_df['orgnr_n_1'] = imputed_df['orgnr_n_1'].astype(str) imputed_df.to_parquet( - f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/imputert-skjema-data/aar={year}/skjema={skjema}/imputed_{tosiffernaring}_{model}.parquet", + f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/imputert-skjema-data/aar={year}/skjema={skjema}/imputed_{tosiffernaring_str}_{model}.parquet", storage_options={"token": AuthClient.fetch_google_credentials()}, ) - print("finsihed machine learning model training, starting final treatment of update file") + print("finished machine learning model training, starting final treatment of update file") else: print('not rerunning ml model, read in data from GCP file path') fil_path = [ f for f in fs.glob( - f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/imputert-skjema-data/aar={year}/skjema={skjema}/imputed_{tosiffernaring}_{model}.parquet" + f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/imputert-skjema-data/aar={year}/skjema={skjema}/imputed_{tosiffernaring_str}_{model}.parquet" ) if f.endswith(".parquet") ] @@ -1117,36 +1172,62 @@ def create_bedrift_fil(year, model, rate, scaler, skjema, tosiffernaring, distri timeseries_knn_agg = pd.DataFrame({'explanation': ['placeholder until code is set up for this skjema']}) timeseries_knn__kommune_agg = pd.DataFrame({'explanation': ['placeholder until code is set up for this skjema']}) - - filtered_df = oppdateringsfil[oppdateringsfil['v_orgnr'].isin(v_orgnr_list_for_imputering)] + # Filter based on 'nacef_5' starting with values in 'tosiffernaring' and excluding rows where 'regtype' is '01' + unique_id_list = current_year_bad_oms[ + current_year_bad_oms['nacef_5'].str[:2].isin(tosiffernaring) & + (current_year_bad_oms['regtype'] != '01') + ]['id'].unique().tolist() + + + print(unique_id_list) + + # filtered_df = oppdateringsfil[oppdateringsfil['v_orgnr'].isin(unique_id_list)] + + filtered_df = oppdateringsfil[oppdateringsfil['id'].isin(unique_id_list)] - unique_id_count = filtered_df[filtered_df['n2_f'] == {tosiffernaring}]['id'].nunique() + print(filtered_df.head()) + + # Use .isin() to filter by multiple categories in 'tosiffernaring' + unique_id_count = filtered_df[filtered_df['n2_f'].isin(tosiffernaring)]['id'].nunique() print(f"Number of unique 'id' where 'n2_f' is {tosiffernaring}: {unique_id_count}") # Get the unique 'id' values as a list where 'n2_f' is 45 # unique_ids_list = filtered_df[filtered_df['n2_f'] == {tosiffernaring}]['id'].unique().tolist() - unique_ids_list = filtered_df[(filtered_df['n2_f'] == tosiffernaring) & (filtered_df['regtype'] != '01')]['id'].unique().tolist() + + # Use .isin() in your filter to match multiple categories + # unique_ids_list = filtered_df[(filtered_df['n2_f'].isin(tosiffernaring)) & (filtered_df['regtype'] != '01')]['id'].unique().tolist() # Print the list of unique IDs - print(f"Unique 'id' values where 'n2_f' is 
{tosiffernaring}: {unique_ids_list}")
+    print(f"Unique 'id' values where 'n2_f' is {tosiffernaring}: {unique_id_list}")
 
     # rename variables in filtered_df
-    filtered_df = filtered_df[filtered_df['n2_f'] == {tosiffernaring}]
+    # filtered_df = filtered_df[filtered_df['n2_f'].isin(tosiffernaring)]
 
     til_bakken = filtered_df[['id', 'v_orgnr', 'oms', 'new_drkost']]
 
     til_bakken = til_bakken.rename(columns={'id': 'enhets_id', 'v_orgnr': 'orgnr_bedrift', 'oms': 'gjeldende_omsetn_kr', 'new_drkost': 'gjeldende_driftsk_kr'})
 
+    import datetime
+
+    # Get the current date and format it as 'DDMMYYYY'
+    current_date = datetime.datetime.now().strftime("%d%m%Y")  # Format as 'DDMMYYYY'
+
+    # Define your destination path with the current date dynamically inserted
+    destination_path = f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/maskin-laering/oppdaterte_filer/skjema={skjema}/aar={year}/til_bakken_{current_date}_{model}_{tosiffernaring_str}.parquet"
+
+    # Save the parquet file with the dynamic date in the file name
+    til_bakken.to_parquet(destination_path, filesystem=fs, index=False)
+
     # Calculate processing time
     processing_time = time.time() - start_time
-    print(f"Time taken to create training data: {processing_time:.2f} seconds")
+    print(f"Time taken to run program with selected criteria: {processing_time:.2f} seconds")
 
     # Return the processed dataframes
-    return oppdateringsfil, timeseries_knn_agg, timeseries_knn__kommune_agg, check_totals, check_manually, v_orgnr_list_for_imputering, til_bakken, unique_id_list
+    return oppdateringsfil, timeseries_knn_agg, timeseries_knn__kommune_agg, check_totals, check_manually, v_orgnr_list_for_imputering, til_bakken, unique_id_list, current_year_bad_oms
diff --git a/src/functions/time.py b/src/functions/time.py
new file mode 100644
index 0000000..4df6aaf
--- /dev/null
+++ b/src/functions/time.py
@@ -0,0 +1,83 @@
+import time
+from tqdm import tqdm
+
+def entire_program(duration):
+    """
+    Simulate a program running for a specific duration, showing a progress bar.
+
+    Parameters:
+    duration (int): The total expected time for the program to run (in seconds).
+    """
+    start_time = time.time()  # Track the start time
+    end_time = start_time + duration  # Calculate when the program should end
+
+    # Initialize the progress bar
+    with tqdm(total=duration, desc='Running Time', unit='s', ncols=100) as pbar:
+        while time.time() < end_time:
+            # Calculate elapsed time
+            elapsed_time = int(time.time() - start_time)
+
+            # Update the progress bar
+            pbar.update(elapsed_time - pbar.n)  # Update only the difference
+
+            # Simulate some work
+            time.sleep(0.1)  # Adjust the sleep time as necessary
+
+    print("\nProgram completed!")
+
+# Example: Run a simulated program for 20 seconds
+# entire_program(20)
+
+def just_create_datafiles(duration):
+    """
+    Simulate a program running for a specific duration, showing a progress bar.
+
+    Parameters:
+    duration (int): The total expected time for the program to run (in seconds).
+ """ + start_time = time.time() # Track the start time + end_time = start_time + duration # Calculate when the program should end + + # Initialize the progress bar + with tqdm(total=duration, desc='Running Time', unit='s', ncols=100) as pbar: + while time.time() < end_time: + # Calculate elapsed time + elapsed_time = int(time.time() - start_time) + + # Update the progress bar + pbar.update(elapsed_time - pbar.n) # Update only the difference + + # Simulate some work + time.sleep(0.1) # Adjust the sleep time as necessary + + print("\nProgram completed!") + +# Example: Run a simulated program for 20 seconds +# just_create_datafiles(20) + +def just_machine_learning(duration): + """ + Simulate a program running for a specific duration, showing a progress bar. + + Parameters: + duration (int): The total expected time for the program to run (in seconds). + """ + start_time = time.time() # Track the start time + end_time = start_time + duration # Calculate when the program should end + + # Initialize the progress bar + with tqdm(total=duration, desc='Running Time', unit='s', ncols=100) as pbar: + while time.time() < end_time: + # Calculate elapsed time + elapsed_time = int(time.time() - start_time) + + # Update the progress bar + pbar.update(elapsed_time - pbar.n) # Update only the difference + + # Simulate some work + time.sleep(0.1) # Adjust the sleep time as necessary + + print("\nProgram completed!") + +# Example: Run a simulated program for 20 seconds +# just_create_datafiles(20) \ No newline at end of file diff --git a/src/notebooks/Master.ipynb b/src/notebooks/Master.ipynb index f26de6a..4795b34 100644 --- a/src/notebooks/Master.ipynb +++ b/src/notebooks/Master.ipynb @@ -487,15 +487,31 @@ "# Hente data\n", "from imports import *\n", "\n", - "training_data, imputatable_df, foretak_pub = ml_modeller.hente_training_data()" + "year = 2022\n", + "\n", + "training_data, imputatable_df, foretak_pub = ml_modeller.hente_training_data(year)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "id": "38", "metadata": { "tags": [] }, + "outputs": [], + "source": [ + "# print all columns option\n", + "pd.set_option('display.max_columns', None)\n", + "imputatable_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "39", + "metadata": { + "tags": [] + }, "source": [ "# \n", "### [Regression Problemer](#regression-problemer)" @@ -503,7 +519,7 @@ }, { "cell_type": "markdown", - "id": "39", + "id": "40", "metadata": {}, "source": [ "# \n", @@ -513,7 +529,7 @@ { "cell_type": "code", "execution_count": null, - "id": "40", + "id": "41", "metadata": { "tags": [] }, @@ -525,14 +541,14 @@ "# Turn off GridSearch for faster run time\n", "GridSearch=False\n", "\n", - "result = ml_modeller.xgboost_model(training_data, scaler, imputatable_df, GridSearch=GridSearch)\n", + "result = ml_modeller.xgboost_model(training_data, scaler, imputatable_df, year, GridSearch=GridSearch)\n", "\n", "# Best result for GridSearch so far:" ] }, { "cell_type": "markdown", - "id": "41", + "id": "42", "metadata": {}, "source": [ "# \n", @@ -542,7 +558,7 @@ { "cell_type": "code", "execution_count": null, - "id": "42", + "id": "43", "metadata": { "tags": [] }, @@ -566,7 +582,7 @@ }, { "cell_type": "markdown", - "id": "43", + "id": "44", "metadata": {}, "source": [ "# \n", @@ -576,7 +592,7 @@ { "cell_type": "code", "execution_count": null, - "id": "44", + "id": "45", "metadata": { "tags": [] }, @@ -598,7 +614,7 @@ { "cell_type": "code", "execution_count": null, - "id": "45", + "id": "46", "metadata": { 
"tags": [] }, @@ -611,7 +627,7 @@ { "cell_type": "code", "execution_count": null, - "id": "46", + "id": "47", "metadata": { "tags": [] }, @@ -633,7 +649,7 @@ { "cell_type": "code", "execution_count": null, - "id": "47", + "id": "48", "metadata": { "tags": [] }, @@ -645,7 +661,7 @@ }, { "cell_type": "markdown", - "id": "48", + "id": "49", "metadata": { "tags": [] }, @@ -656,7 +672,7 @@ }, { "cell_type": "markdown", - "id": "49", + "id": "50", "metadata": { "tags": [] }, @@ -668,7 +684,7 @@ { "cell_type": "code", "execution_count": null, - "id": "50", + "id": "51", "metadata": { "tags": [] }, @@ -679,7 +695,7 @@ }, { "cell_type": "markdown", - "id": "51", + "id": "52", "metadata": { "tags": [] }, @@ -691,7 +707,7 @@ { "cell_type": "code", "execution_count": null, - "id": "52", + "id": "53", "metadata": { "tags": [] }, @@ -702,7 +718,7 @@ }, { "cell_type": "markdown", - "id": "53", + "id": "54", "metadata": { "tags": [] }, @@ -714,7 +730,7 @@ { "cell_type": "code", "execution_count": null, - "id": "54", + "id": "55", "metadata": { "tags": [] }, @@ -725,7 +741,7 @@ "pd.set_option('display.max_rows', None)\n", "year = 2023\n", "\n", - "model = 'knn_model_with_pca' # Choose between knn_model, xgboost_model, lstm_model or, nn_model_1, nn_model_2, evaluate_year_based_mae, knn_model_fast, knn_model_new, knn_model_with_pca\n", + "model = 'knn_model_with_pca_custom_distance' # Choose between knn_model, xgboost_model, lstm_model or, nn_model_1, nn_model_2, evaluate_year_based_mae, knn_model_with_pca, knn_model_with_pca_custom_distance, knn_model_with_pca_mae_per_year, xgboost_model_with_pca\n", "rate = 0.65 # rate will determine which survey responses are sorted into a good/bad df. The bad df will be updated/corrected using machine learning, while the good_df will go through more tradional cleaning\n", "scaler = RobustScaler() # Choose between StandardScaler(), MinMaxScaler(), RobustScaler(), etc\n", "GridSearch=False # Turn on to fine tune parameters, but this will take a lot more time - maybe even days for some models. \n", @@ -736,6 +752,7 @@ "geo_data=False # turn on to build latitude and longitude data into the dfs\n", "uu_data=True # turn on to add reg_type 01 utenfor utvalg data to the dfs\n", "rerun_ml=True # Turn on in order to run a machine learning model on the data. This may take time depending on the model, so if output exists already, turn this to false. 
\n", + "collect_data=False\n", "\n", "# skjema = 'RA-0174-1' for varehandel\n", "# skjema = 'RA-0255-1' for Tjenesteyting - delreg 59\n", @@ -746,13 +763,13 @@ "\n", "tosiffernaring = ['45', '46', '47']\n", "\n", - "update_file, timeseries_knn_agg, timeseries_knn__kommune_agg, check_totals, check_manually, v_orgnr_list_for_imputering, til_bakken, unique_id_list, current_year_bad_oms = oppdateringsfil.create_bedrift_fil(year, model, rate, scaler, skjema, tosiffernaring, distribtion_percent, rerun_ml=rerun_ml, geo_data=geo_data, uu_data=uu_data, GridSearch=GridSearch)" + "update_file, timeseries_knn_agg, timeseries_knn__kommune_agg, check_totals, check_manually, v_orgnr_list_for_imputering, til_bakken, unique_id_list, current_year_bad_oms = oppdateringsfil.create_bedrift_fil(year, model, rate, scaler, skjema, tosiffernaring, distribtion_percent, rerun_ml=rerun_ml, geo_data=geo_data, uu_data=uu_data, GridSearch=GridSearch, collect_data=collect_data)" ] }, { "cell_type": "code", "execution_count": null, - "id": "55", + "id": "56", "metadata": { "tags": [] }, @@ -763,7 +780,7 @@ }, { "cell_type": "markdown", - "id": "56", + "id": "57", "metadata": { "tags": [] }, @@ -775,7 +792,7 @@ { "cell_type": "code", "execution_count": null, - "id": "57", + "id": "58", "metadata": { "tags": [] }, @@ -789,7 +806,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58", + "id": "59", "metadata": { "tags": [] }, @@ -801,7 +818,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59", + "id": "60", "metadata": { "tags": [] }, @@ -813,7 +830,7 @@ { "cell_type": "code", "execution_count": null, - "id": "60", + "id": "61", "metadata": { "tags": [] }, @@ -873,7 +890,7 @@ { "cell_type": "code", "execution_count": null, - "id": "61", + "id": "62", "metadata": { "tags": [] }, @@ -887,7 +904,7 @@ }, { "cell_type": "markdown", - "id": "62", + "id": "63", "metadata": {}, "source": [ "---------------------------------------------------------------------------------------------------------------------------------" @@ -895,7 +912,7 @@ }, { "cell_type": "markdown", - "id": "63", + "id": "64", "metadata": {}, "source": [ "# \n", @@ -904,7 +921,7 @@ }, { "cell_type": "markdown", - "id": "64", + "id": "65", "metadata": {}, "source": [ "#### 2021 XGBOOST:\n", @@ -935,7 +952,7 @@ }, { "cell_type": "markdown", - "id": "65", + "id": "66", "metadata": {}, "source": [ "--------------------------------------------------------------------------------------------------------------------------------\n", @@ -950,7 +967,7 @@ }, { "cell_type": "markdown", - "id": "66", + "id": "67", "metadata": {}, "source": [ "# Co-operation with other sections" @@ -958,7 +975,7 @@ }, { "cell_type": "markdown", - "id": "67", + "id": "68", "metadata": {}, "source": [ "#### A lot of potential here:\n", @@ -977,7 +994,7 @@ }, { "cell_type": "markdown", - "id": "68", + "id": "69", "metadata": {}, "source": [ "# Network Analysis" @@ -986,7 +1003,7 @@ { "cell_type": "code", "execution_count": null, - "id": "69", + "id": "70", "metadata": { "tags": [] }, @@ -1011,7 +1028,7 @@ { "cell_type": "code", "execution_count": null, - "id": "70", + "id": "71", "metadata": { "tags": [] }, From 7720fa8776151132f6820fa1fc5442bc5e3e5d56 Mon Sep 17 00:00:00 2001 From: Joel Parr Date: Mon, 2 Dec 2024 10:02:26 +0100 Subject: [PATCH 2/2] Commit final changes. Basic changes to parameters. Some changes made to workbooks such as 'File Transfers' which have no use in production but might be useful to exist in the main branch anyways. 
No material changes - can now delete this branch. --- poetry.lock | 261 ++----- pyproject.toml | 3 +- src/extra/fun/banneord.py | 89 ++- src/extra/nni tester/NNI Master.ipynb | 8 + src/functions/create_datafiles.py | 150 ++-- src/functions/oppdateringsfil.py | 17 +- src/notebooks/AO.ipynb | 165 ++--- src/notebooks/File Transfers.ipynb | 831 ++++++++++++++++++++++- src/notebooks/ML function builder.ipynb | 376 ++++++---- src/notebooks/Master.ipynb | 50 +- "src/notebooks/klargj\303\270ring.ipynb" | 438 ++++++++---- 11 files changed, 1714 insertions(+), 674 deletions(-) diff --git a/poetry.lock b/poetry.lock index 85e0ef1..86dc02e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "absl-py" @@ -39,91 +39,11 @@ files = [ [[package]] name = "aiohttp" - version = "3.10.3" - description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - - {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"}, - {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"}, - {file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"}, - {file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"}, - {file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"}, - {file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"}, - {file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"}, - {file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"}, - {file = "aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"}, - {file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"}, - {file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"}, - {file = 
"aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"}, - {file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"}, - {file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"}, - {file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"}, -======= {file = "aiohttp-3.10.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cc36cbdedf6f259371dbbbcaae5bb0e95b879bc501668ab6306af867577eb5db"}, {file = "aiohttp-3.10.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:85466b5a695c2a7db13eb2c200af552d13e6a9313d7fa92e4ffe04a2c0ea74c1"}, {file = "aiohttp-3.10.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:71bb1d97bfe7e6726267cea169fdf5df7658831bb68ec02c9c6b9f3511e108bb"}, @@ -200,7 +120,6 @@ files = [ {file = "aiohttp-3.10.3-cp39-cp39-win32.whl", hash = "sha256:b69d832e5f5fa15b1b6b2c8eb6a9fd2c0ec1fd7729cb4322ed27771afc9fc2ac"}, {file = "aiohttp-3.10.3-cp39-cp39-win_amd64.whl", hash = "sha256:673bb6e3249dc8825df1105f6ef74e2eab779b7ff78e96c15cadb78b04a83752"}, {file = "aiohttp-3.10.3.tar.gz", hash = "sha256:21650e7032cc2d31fc23d353d7123e771354f2a3d5b05a5647fc30fea214e696"}, - ] [package.dependencies] @@ -213,7 +132,6 @@ multidict = ">=4.5,<7.0" yarl = ">=1.0,<2.0" [package.extras] - speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] [[package]] @@ -1276,76 +1194,6 @@ tornado = ">=6.0.4" urllib3 = ">=1.24.3" zict = ">=3.0.0" -[[package]] -name = "docstring-inheritance" -version = "2.2.0" -description = "Avoid writing and maintaining duplicated docstrings." 
-optional = false -python-versions = "<3.13,>=3.8" -files = [ - {file = "docstring-inheritance-2.2.0.tar.gz", hash = "sha256:30df77a04f32940000136b6a1cb20718b34291ba8905aacbf8375b85d7f85f9a"}, - {file = "docstring_inheritance-2.2.0-py3-none-any.whl", hash = "sha256:70a15972a6ee456576db4392c4ab806736b62892b1307f846b4a19200f06781e"}, -] - -[package.extras] -test = ["covdefaults", "pytest", "pytest-cov"] - -[[package]] -name = "duckdb" -version = "0.10.3" -description = "DuckDB in-process database" -optional = false -python-versions = ">=3.7.0" -files = [ - {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd25cc8d001c09a19340739ba59d33e12a81ab285b7a6bed37169655e1cefb31"}, - {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f9259c637b917ca0f4c63887e8d9b35ec248f5d987c886dfc4229d66a791009"}, - {file = "duckdb-0.10.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b48f5f1542f1e4b184e6b4fc188f497be8b9c48127867e7d9a5f4a3e334f88b0"}, - {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e327f7a3951ea154bb56e3fef7da889e790bd9a67ca3c36afc1beb17d3feb6d6"}, - {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d8b20ed67da004b4481973f4254fd79a0e5af957d2382eac8624b5c527ec48c"}, - {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d37680b8d7be04e4709db3a66c8b3eb7ceba2a5276574903528632f2b2cc2e60"}, - {file = "duckdb-0.10.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d34b86d6a2a6dfe8bb757f90bfe7101a3bd9e3022bf19dbddfa4b32680d26a9"}, - {file = "duckdb-0.10.3-cp310-cp310-win_amd64.whl", hash = "sha256:73b1cb283ca0f6576dc18183fd315b4e487a545667ffebbf50b08eb4e8cdc143"}, - {file = "duckdb-0.10.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d917dde19fcec8cadcbef1f23946e85dee626ddc133e1e3f6551f15a61a03c61"}, - {file = "duckdb-0.10.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46757e0cf5f44b4cb820c48a34f339a9ccf83b43d525d44947273a585a4ed822"}, - {file = "duckdb-0.10.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:338c14d8ac53ac4aa9ec03b6f1325ecfe609ceeb72565124d489cb07f8a1e4eb"}, - {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:651fcb429602b79a3cf76b662a39e93e9c3e6650f7018258f4af344c816dab72"}, - {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3ae3c73b98b6215dab93cc9bc936b94aed55b53c34ba01dec863c5cab9f8e25"}, - {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56429b2cfe70e367fb818c2be19f59ce2f6b080c8382c4d10b4f90ba81f774e9"}, - {file = "duckdb-0.10.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b46c02c2e39e3676b1bb0dc7720b8aa953734de4fd1b762e6d7375fbeb1b63af"}, - {file = "duckdb-0.10.3-cp311-cp311-win_amd64.whl", hash = "sha256:bcd460feef56575af2c2443d7394d405a164c409e9794a4d94cb5fdaa24a0ba4"}, - {file = "duckdb-0.10.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:e229a7c6361afbb0d0ab29b1b398c10921263c52957aefe3ace99b0426fdb91e"}, - {file = "duckdb-0.10.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:732b1d3b6b17bf2f32ea696b9afc9e033493c5a3b783c292ca4b0ee7cc7b0e66"}, - {file = "duckdb-0.10.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f5380d4db11fec5021389fb85d614680dc12757ef7c5881262742250e0b58c75"}, - {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", 
hash = "sha256:468a4e0c0b13c55f84972b1110060d1b0f854ffeb5900a178a775259ec1562db"}, - {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fa1e7ff8d18d71defa84e79f5c86aa25d3be80d7cb7bc259a322de6d7cc72da"}, - {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed1063ed97c02e9cf2e7fd1d280de2d1e243d72268330f45344c69c7ce438a01"}, - {file = "duckdb-0.10.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:22f2aad5bb49c007f3bfcd3e81fdedbc16a2ae41f2915fc278724ca494128b0c"}, - {file = "duckdb-0.10.3-cp312-cp312-win_amd64.whl", hash = "sha256:8f9e2bb00a048eb70b73a494bdc868ce7549b342f7ffec88192a78e5a4e164bd"}, - {file = "duckdb-0.10.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a6c2fc49875b4b54e882d68703083ca6f84b27536d57d623fc872e2f502b1078"}, - {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a66c125d0c30af210f7ee599e7821c3d1a7e09208196dafbf997d4e0cfcb81ab"}, - {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d99dd7a1d901149c7a276440d6e737b2777e17d2046f5efb0c06ad3b8cb066a6"}, - {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5ec3bbdb209e6095d202202893763e26c17c88293b88ef986b619e6c8b6715bd"}, - {file = "duckdb-0.10.3-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:2b3dec4ef8ed355d7b7230b40950b30d0def2c387a2e8cd7efc80b9d14134ecf"}, - {file = "duckdb-0.10.3-cp37-cp37m-win_amd64.whl", hash = "sha256:04129f94fb49bba5eea22f941f0fb30337f069a04993048b59e2811f52d564bc"}, - {file = "duckdb-0.10.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:d75d67024fc22c8edfd47747c8550fb3c34fb1cbcbfd567e94939ffd9c9e3ca7"}, - {file = "duckdb-0.10.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f3796e9507c02d0ddbba2e84c994fae131da567ce3d9cbb4cbcd32fadc5fbb26"}, - {file = "duckdb-0.10.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:78e539d85ebd84e3e87ec44d28ad912ca4ca444fe705794e0de9be3dd5550c11"}, - {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a99b67ac674b4de32073e9bc604b9c2273d399325181ff50b436c6da17bf00a"}, - {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1209a354a763758c4017a1f6a9f9b154a83bed4458287af9f71d84664ddb86b6"}, - {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b735cea64aab39b67c136ab3a571dbf834067f8472ba2f8bf0341bc91bea820"}, - {file = "duckdb-0.10.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:816ffb9f758ed98eb02199d9321d592d7a32a6cb6aa31930f4337eb22cfc64e2"}, - {file = "duckdb-0.10.3-cp38-cp38-win_amd64.whl", hash = "sha256:1631184b94c3dc38b13bce4045bf3ae7e1b0ecbfbb8771eb8d751d8ffe1b59b3"}, - {file = "duckdb-0.10.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:fb98c35fc8dd65043bc08a2414dd9f59c680d7e8656295b8969f3f2061f26c52"}, - {file = "duckdb-0.10.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7e75c9f5b6a92b2a6816605c001d30790f6d67ce627a2b848d4d6040686efdf9"}, - {file = "duckdb-0.10.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae786eddf1c2fd003466e13393b9348a44b6061af6fe7bcb380a64cac24e7df7"}, - {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9387da7b7973707b0dea2588749660dd5dd724273222680e985a2dd36787668"}, - {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:538f943bf9fa8a3a7c4fafa05f21a69539d2c8a68e557233cbe9d989ae232899"}, - {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6930608f35025a73eb94252964f9f19dd68cf2aaa471da3982cf6694866cfa63"}, - {file = "duckdb-0.10.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:03bc54a9cde5490918aad82d7d2a34290e3dfb78d5b889c6626625c0f141272a"}, - {file = "duckdb-0.10.3-cp39-cp39-win_amd64.whl", hash = "sha256:372b6e3901d85108cafe5df03c872dfb6f0dbff66165a0cf46c47246c1957aa0"}, - {file = "duckdb-0.10.3.tar.gz", hash = "sha256:c5bd84a92bc708d3a6adffe1f554b94c6e76c795826daaaf482afc3d9c636971"}, -] - [[package]] name = "et-xmlfile" version = "1.1.0" @@ -1802,12 +1650,12 @@ files = [ google-auth = ">=2.14.1,<3.0.dev0" googleapis-common-protos = ">=1.56.2,<2.0.dev0" grpcio = [ - {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, + {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, ] grpcio-status = [ - {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, + {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, ] proto-plus = ">=1.22.3,<2.0.0dev" protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" @@ -1895,8 +1743,8 @@ grpc-google-iam-v1 = ">=0.12.4,<1.0.0dev" grpcio = ">=1.51.3,<2.0dev" grpcio-status = ">=1.33.2" proto-plus = [ - {version = ">=1.22.2,<2.0.0dev", markers = "python_version >= \"3.11\""}, {version = ">=1.22.0,<2.0.0dev", markers = "python_version < \"3.11\""}, + {version = ">=1.22.2,<2.0.0dev", markers = "python_version >= \"3.11\""}, ] protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0dev" @@ -3697,8 +3545,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.23.3", markers = "python_version >= \"3.11\""}, {version = ">=1.21.2", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.3", markers = "python_version >= \"3.11\""}, ] [package.extras] @@ -4341,8 +4189,8 @@ files = [ [package.dependencies] lxml = {version = ">=4.9.2", optional = true, markers = "extra == \"xml\""} numpy = [ - {version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, ] odfpy = {version = ">=1.4.1", optional = true, markers = "extra == \"excel\""} openpyxl = {version = ">=3.1.0", optional = true, markers = "extra == \"excel\""} @@ -4443,20 +4291,6 @@ tests-deser = ["odfpy", "openpyxl", "pyarrow", "tables", "xlrd"] tests-examples = ["aiohttp", "nbval", "pandas", "panel", "pytest", "pytest-asyncio", "pytest-xdist"] tests-full = ["aiohttp", "cloudpickle", "coverage[toml]", "gmpy", "ipython", "jsonschema", "nbval", "nest-asyncio", "numpy", "odfpy", "openpyxl", "pandas", "panel", "pyarrow", "pytest", "pytest-asyncio", "pytest-xdist", "tables", "xlrd"] -[[package]] -name = "parameterized" -version = "0.9.0" 
-description = "Parameterized testing with any Python test framework" -optional = false -python-versions = ">=3.7" -files = [ - {file = "parameterized-0.9.0-py2.py3-none-any.whl", hash = "sha256:4e0758e3d41bea3bbd05ec14fc2c24736723f243b28d702081aef438c9372b1b"}, - {file = "parameterized-0.9.0.tar.gz", hash = "sha256:7fc905272cefa4f364c1a3429cbbe9c0f98b793988efb5bf90aac80f08db09b1"}, -] - -[package.extras] -dev = ["jinja2"] - [[package]] name = "parso" version = "0.8.4" @@ -4507,7 +4341,6 @@ ptyprocess = ">=0.5" [[package]] name = "pillow" version = "10.4.0" - description = "Python Imaging Library (Fork)" optional = false python-versions = ">=3.8" @@ -4596,7 +4429,6 @@ files = [ [package.extras] docs = ["furo", "olefile", "sphinx (>=7.3)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] - fpx = ["olefile"] mic = ["olefile"] tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] @@ -4649,6 +4481,47 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "polars" +version = "1.14.0" +description = "Blazingly fast DataFrame library" +optional = false +python-versions = ">=3.9" +files = [ + {file = "polars-1.14.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f346177c6f3442e8e61eadc4830d588348bf3383b0100d1c942b5615813be16e"}, + {file = "polars-1.14.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:588b5622b3a73be874a8e432d45c8a122662c09ce5ba2d5e5966f6dacce2b914"}, + {file = "polars-1.14.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0bc46ad6ceeec5d9d881f09c7c1811844e851980735f8455981cdea456e08f5c"}, + {file = "polars-1.14.0-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:3fc0cf084f848799379e8eba14733ae0e9d66a0fa8ec41719df82ed29c827003"}, + {file = "polars-1.14.0-cp39-abi3-win_amd64.whl", hash = "sha256:5ca507d162f88a44e1a945feecfa474fda0b66f378336d69b9ee23917da670c3"}, + {file = "polars-1.14.0.tar.gz", hash = "sha256:e34fbeca4664fba754a12d0a66b36569c4c9e5a0116108d9362067a0ca596b4d"}, +] + +[package.extras] +adbc = ["adbc-driver-manager[dbapi]", "adbc-driver-sqlite[dbapi]"] +all = ["polars[async,cloudpickle,database,deltalake,excel,fsspec,graph,iceberg,numpy,pandas,plot,pyarrow,pydantic,style,timezone]"] +async = ["gevent"] +calamine = ["fastexcel (>=0.9)"] +cloudpickle = ["cloudpickle"] +connectorx = ["connectorx (>=0.3.2)"] +database = ["nest-asyncio", "polars[adbc,connectorx,sqlalchemy]"] +deltalake = ["deltalake (>=0.15.0)"] +excel = ["polars[calamine,openpyxl,xlsx2csv,xlsxwriter]"] +fsspec = ["fsspec"] +gpu = ["cudf-polars-cu12"] +graph = ["matplotlib"] +iceberg = ["pyiceberg (>=0.5.0)"] +numpy = ["numpy (>=1.16.0)"] +openpyxl = ["openpyxl (>=3.0.0)"] +pandas = ["pandas", "polars[pyarrow]"] +plot = ["altair (>=5.4.0)"] +pyarrow = ["pyarrow (>=7.0.0)"] +pydantic = ["pydantic"] +sqlalchemy = ["polars[pandas]", "sqlalchemy"] +style = ["great-tables (>=0.8.0)"] +timezone = ["backports-zoneinfo", "tzdata"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] +xlsxwriter = ["xlsxwriter"] + [[package]] name = "prometheus-client" version = "0.20.0" @@ -5156,6 +5029,10 @@ files = [ {file = "pyreadr-0.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3f200d4bd7dcfa37b9d244f05b3708b7183bb7978c6865a1364f39727021fb0"}, {file = "pyreadr-0.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aa4919beb4cfa9c915e704b41d32d8c40a4f505a7c9bfdfc4930b3b4da5d2b8"}, {file = 
"pyreadr-0.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:6bae17a8970b62c2af257ec8c5aad9d759a1bdc2a763e299ff82826d7140afe4"}, + {file = "pyreadr-0.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f8e0b0db674c3a25b4d38016a4a2fe18c5107ccab08b31bfdc1e328c184d8b17"}, + {file = "pyreadr-0.5.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d77ddc6a7ac98ea61394f349aa90bbda0513fdde8b27e73116cb077dff8d5a85"}, + {file = "pyreadr-0.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d31b4adcef91a8c75818097851dfdce9d44ad46c763c8a2dc6ffbca0b62c4675"}, + {file = "pyreadr-0.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b11fc281cd6e3a689de1962a60c812f5138c369bb25530fd0bcd91164566cb"}, {file = "pyreadr-0.5.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:286a29fe24e319325655455b265bf1acf34541ff77d0ad8dd56900da3ab72fb8"}, {file = "pyreadr-0.5.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ed98fbaae0198a73efa29f2d2aa9af77d8f1cb8ad9c2ac23ecc6fd70a75ca092"}, {file = "pyreadr-0.5.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:444190968331437863fff09e64ba003665dd075c3fe3736c638083ed1b16d8ff"}, @@ -6375,29 +6252,6 @@ postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"] pymysql = ["pymysql"] sqlcipher = ["sqlcipher3_binary"] -[[package]] -name = "ssb-eimerdb" -version = "0.1.9" -description = "EimerDB" -optional = false -python-versions = "<3.13,>=3.10" -files = [ - {file = "ssb_eimerdb-0.1.9-py3-none-any.whl", hash = "sha256:68bd03506169eb7b5b9fe8f2b2d6d323b7d1b541035abbbace16266498bf8c5c"}, - {file = "ssb_eimerdb-0.1.9.tar.gz", hash = "sha256:9c7219f29c7847ba7fca39002063b8b274cbb549e4659355aa68ca2f006095f4"}, -] - -[package.dependencies] -click = ">=8.0.1" -dapla-toolbelt = ">=2.0.8,<3.0.0" -docstring-inheritance = ">=2.2.0,<3.0.0" -duckdb = ">=0.10.0,<0.11.0" -google-cloud-storage = ">=2.15.0,<3.0.0" -ipykernel = ">=6.29.3,<7.0.0" -pandas = ">=2.2.1,<3.0.0" -parameterized = {version = ">=0.9.0,<0.10.0", extras = ["typed"]} -pyarrow = "14.0.2" -uuid = ">=1.30,<2.0" - [[package]] name = "ssb-sgis" version = "1.0.3" @@ -6822,7 +6676,6 @@ dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake [[package]] name = "urllib3" version = "2.2.2" - description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.8" @@ -6837,16 +6690,6 @@ h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] -[[package]] -name = "uuid" -version = "1.30" -description = "UUID object and generation functions (Python 2.3 or higher)" -optional = false -python-versions = "*" -files = [ - {file = "uuid-1.30.tar.gz", hash = "sha256:1f87cc004ac5120466f36c5beae48b4c48cc411968eed0eaecd3da82aa96193f"}, -] - [[package]] name = "wcwidth" version = "0.2.13" @@ -7221,4 +7064,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.12" -content-hash = "63d0095df7e781c968e50e334070b3f5826b466648fd50da27113bca99044f5a" +content-hash = "11f2caf485577c4018fa237fb7d1105164ed07c1e1b3d3a4ded7c1bacdd41350" diff --git a/pyproject.toml b/pyproject.toml index 6e1218b..ca1f5d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,6 @@ graphviz = "^0.20.1" xgboost = "^2.0.3" numpy = "^1.26.4" shap = "^0.45.1" -ssb-eimerdb = "^0.1.8" tensorflow = "^2.16.1" scikeras = "^0.13.0" pyreadr = "^0.5.0" @@ -34,6 +33,8 @@ dash = "^2.17.1" holoviews = "^1.19.1" dash-bootstrap-components = "^1.6.0" dash-leaflet = "^1.0.15" +pyarrow = "14.0.2" +polars = "^1.14.0" [tool.poetry.group.dev.dependencies] pytest = ">=7.1.3" diff --git a/src/extra/fun/banneord.py b/src/extra/fun/banneord.py index aa902db..41d1fcc 100644 --- a/src/extra/fun/banneord.py +++ b/src/extra/fun/banneord.py @@ -100,10 +100,11 @@ def holymoly(year, start_year, words): skjema_df.columns = skjema_df.columns.str.lower() # Convert column names to lower case happy_days.append(skjema_df) - + # Concatenate all DataFrames into a single DataFrame happy_days = pd.concat(happy_days, ignore_index=True) + def count_words_in_row(row, words): count = 0 exclamation_count = 0 @@ -142,6 +143,61 @@ def count_words_in_row(row, words): happy_days['n2'] = happy_days['nacef_5'].str[:2] happy_days = happy_days.reset_index(drop=True) + + # Dictionary for hard-keyed replacements + replacement_dict = { + '41': 'Oppføring av bygninger (41)', + '42': 'Anleggsvirksomhet (42)', + '43': 'Spesialisert bygge-og anleggsvirksomhet (43)', + '45': 'Handel med reparasjon av motorvogner (45)', + '46': 'Agentur- og engroshandel, unntatt med motorvogner (46)', + '47': 'Detaljhandel, unntatt med motorvogner (47)', + '56': 'Serveringsvirksomhet (56)', + '62': 'Tjenester tilknyttet informasjonsteknologi (62)', + '68': 'Omsetning og drift av fast eiendom (68)', + '69': 'Juridisk og regnskapsmessig tjenesteyting (69)', + '70': 'Hovedkontortjenester, administrativ rådgivning (70)', + '71': 'Arkitektvirksomhet og teknisk konsulentvirksomhet, og teknisk prøving og analyse (71)', + '72': 'Forskning og utviklingarbeid (72)', + '73': 'Annonse- og reklamevirksomhet og markedsundersøkelser (73)', + '74': 'Annen faglig, vitenskapelig og teknisk virksomhet (74)', + '77': 'Utlei- og leasingvirksomhet (77)', + '78': 'Arbeidskrafttjenester (78)', + '79': 'Reisebyrå- og reisarrangeørvirksomhet og tilknyttede tjenester (79)', + '80': 'Vakttjeneste og etterforsking (80)', + '81': 'Tjenester tilknyttet eiendomsdrift (81)', + '82': 'Annen forretningsmessig tjenesteyting (82)', + '95': 'Reparasjon av datamaskiner, husholdingsvarer og varer til personlig bruk (95)', + '96': 'Annen personlig tjenesteyting (96)' + } + + replacement_dict_n3 = { + '41.1': 'Utvikling av byggeprosekter (41.1)', + '41.2': 'Oppføring av bygninger (41.2)', + '43.1': 'Riving og grunnarbeid (43.1)', + 
'43.2': 'Elektrisk installasjonsarbeid, VVS-arbeid og annet installasjonsarbeid (43.2)', + '45.2': 'Vedlikehold og reparasjon av motorvogner, unntatt motorsykler (45.2)', + '46.2': 'Engroshandel med jordbruksråvarer og levende dyr (46.2)', + '46.3': 'Engroshandel med nærings-og sytelsesmidler (46.3)', + '46.4': 'Engroshandel med husholdningsvarer og varer til personlig bruk (46.4)', + '46.6': 'Engroshandel med andre maskiner og annet utstyr (46.6)', + '46.7': 'Engroshandel med spesialisert vareutvalg ellers (46.7)', + '47.1': 'Butikkhandel med bredt vareutvalg (47.1)', + '47.3': 'Detaljhandel med drivstoff til motorvogner (47.3)', + '47.5': 'Butikkhandel med andre husholdingsvarer i spesialforretninger (47.5)', + '47.6': 'Butikkhandel med bøker, musikkartikler og andre fritidsartikler i spesialforretninger (47.6)', + '47.7': 'Annen butikkhandel i spesialforretninger (47.7)', + '68.2': 'Utleie av egen eller leid fast eiendom (68.2)', + '69.2': 'Regnskap, revisjon og skatterådgivning (69.2)', + '71.1': 'Arkitektvirksomhet og teknisk konsulentvirksomhet (71.1)', + '77.3': 'Utleie og leasing av andre maskiner, og annet utstyr og materiell (77.3)', + '81.2': 'Rengjøringsvirksomhet (81.2)' + } + + # Replace values in the 'n2' column + happy_days['n2'] = happy_days['n2'].replace(replacement_dict) + + happy_days['n3'] = happy_days['n3'].replace(replacement_dict_n3) # sort exclamation_count and count happy_days = happy_days.sort_values(by=["exclamation_count", "count"], ascending=False) @@ -270,6 +326,7 @@ def static_barchart(df): # Sort the DataFrame by 'count' in descending order df = df.sort_values(by='count', ascending=False) + # Create a horizontal bar chart with a predefined color sequence fig = px.bar( @@ -279,9 +336,9 @@ def static_barchart(df): color='n2', # Color bars by 'n2' color_discrete_sequence=px.colors.sequential.Viridis, # Use the Viridis color sequence orientation='h', # Make the bars horizontal - height=1200, # Set height of the plot - width=900, # Set width of the plot - title='The total amount of swear words used per n2 from 2017 to 2021' # Add title + height=1400, # Set height of the plot + width=2800, # Set width of the plot + title='Total banneord brukt per n2 fra 2017 til 2021' # Add title ) # Ensure y-axis categories are sorted by 'count' @@ -309,9 +366,9 @@ def static_barchart_n3(df): color='n3', # Color bars by 'n2' color_discrete_sequence=px.colors.sequential.Viridis, # Use the Viridis color sequence orientation='h', # Make the bars horizontal - height=1200, # Set height of the plot - width=900, # Set width of the plot - title='The total amount of swear words used per n3 from 2017 to 2021' # Add title + height=1400, # Set height of the plot + width=2800, # Set width of the plot + title='Total banneord brukt per n3 fra 2017 til 2021' # Add title ) # Ensure y-axis categories are sorted by 'count' @@ -337,9 +394,9 @@ def static_barchart_exclamation(df): color='n2', # Color bars by 'n2' color_discrete_sequence=px.colors.sequential.Viridis, # Use the Viridis color sequence orientation='h', # Make the bars horizontal - height=1200, # Set height of the plot - width=900, # Set width of the plot - title='The Total Explanation Points used per n2 from 2017 to 2021' # Add title + height=1400, # Set height of the plot + width=2800, # Set width of the plot + title='Total utropstegn brukt per n2 fra 2017 til 2021' # Add title ) # Apply a logarithmic scale to the x-axis @@ -370,9 +427,9 @@ def static_barchart_exclamation_n3(df): color='n3', # Color bars by 'n2' 
color_discrete_sequence=px.colors.sequential.Viridis, # Use the Viridis color sequence orientation='h', # Make the bars horizontal - height=900, # Set height of the plot - width=800, # Set width of the plot - title='The Total Explanation Points used per n3 from 2017 to 2021' # Add title + height=1400, # Set height of the plot + width=2800, # Set width of the plot + title='Total utropstegn brukt per n3 fra 2017 til 2021' # Add title ) # Apply a logarithmic scale to the x-axis @@ -442,9 +499,9 @@ def middle_finger_barchart(df): # Update layout fig.update_layout( yaxis=dict(range=[-10, df['count'].max() + 20]), # Extend y-axis to make space for knuckles - title='Is the distribution of swear words across industries trying to tell us something? 🤷‍♂️', - height=800, - width=1000, + title='Prøver fordelingen av banneord på tvers av bransjer å fortelle oss noe?? 🤷‍♂️', + height=1500, + width=900, showlegend=False # Hide legend if you don't want to show it ) diff --git a/src/extra/nni tester/NNI Master.ipynb b/src/extra/nni tester/NNI Master.ipynb index 3eae829..e6414e4 100644 --- a/src/extra/nni tester/NNI Master.ipynb +++ b/src/extra/nni tester/NNI Master.ipynb @@ -159,6 +159,14 @@ "\n", "nni.evaluate_varehandel(year, start_year)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/src/functions/create_datafiles.py b/src/functions/create_datafiles.py index a901402..51f9e06 100644 --- a/src/functions/create_datafiles.py +++ b/src/functions/create_datafiles.py @@ -39,6 +39,8 @@ import multiprocessing import time import kommune_translate +import polars as pl +import fsspec def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=False, uu_data=False): @@ -61,25 +63,28 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F for current_year in range(start_year, year + 1): + print("starting data collection for:", {current_year}, "...") + fjor = current_year - 1 # Previous year # skjema_list = ['RA-0174-1', 'RA-0174A3', 'RA-0827A3'] # skjema_list = 'RA-0174-1' skjema_list = skjema_nr - fil_path = [ - f - for f in fs.glob( - f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema={skjema_list}/aar={current_year}/*" - ) - # if f.endswith(".parquet") - ] - - # Assuming there's only one file in fil_path - if fil_path: - skjema = pd.read_parquet(fil_path[0], filesystem=fs) - else: - raise FileNotFoundError(f"No Parquet files found for year {current_year}") - print(fil_path) + +# fil_path = [ +# f +# for f in fs.glob( +# f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema={skjema_list}/aar={current_year}/*" +# ) +# # if f.endswith(".parquet") +# ] + +# # Assuming there's only one file in fil_path +# if fil_path: +# skjema = pd.read_parquet(fil_path[0], filesystem=fs) +# else: +# raise FileNotFoundError(f"No Parquet files found for year {current_year}") +# print(fil_path) felt_id_values = [ "V_ORGNR", @@ -126,8 +131,22 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F "REG_TYPE_BEDRIFT" ] - # Filter the DataFrame for the specified field values - skjema = skjema[skjema["feltnavn"].isin(felt_id_values)] +# # Filter the DataFrame for the specified field values +# skjema = skjema[skjema["feltnavn"].isin(felt_id_values)] + + + file_path = f"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema={skjema_list}/aar={current_year}/skjemadata_data_0.parquet" + + 
f = FileClient.gcs_open(file_path) + + skjema = ( + pl.read_parquet(f) + .filter(pl.col("feltnavn").is_in(felt_id_values)) + ) + + skjema = skjema.to_pandas() + + skjema.columns = skjema.columns.str.lower() # Pivot the DataFrame skjema = skjema.pivot_table( @@ -181,7 +200,7 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F # fill tmp_no_p4005 nan with 0 skjema['tmp_no_p4005'].fillna(0, inplace=True) - del foretak_pub, dataset, table + del foretak_pub if skjema_list == 'RA-1100': @@ -341,8 +360,12 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F bedrift.drop(columns_to_drop, axis=1, inplace=True) + + # Replace commas with dots in the specified columns columns_to_fill = ["gjeldende_omsetn_kr", "driftskost_kr"] + + bedrift[columns_to_fill] = bedrift[columns_to_fill].replace(',', '.', regex=True) # Convert columns to numeric, replacing non-convertible values with NaN bedrift[columns_to_fill] = bedrift[columns_to_fill].apply( @@ -420,6 +443,23 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F good_temp_df["distribution_count"] = good_temp_df.groupby("orgnr_n_1")[ "gjeldende_omsetn_kr" ].transform(lambda x: (x > 0).sum()) + + # Calculate 'bedrift_count' where 'gjeldende_bdr_syss' is not equal to 0 + # Calculate 'bedrift_count' per 'orgnr_n_1' +# bedrift_counts = good_temp_df.groupby('orgnr_n_1')['gjeldende_bdr_syss'].apply(lambda x: (x != 0).sum()) + +# # Map the counts back to the DataFrame +# good_temp_df['bedrift_count'] = good_temp_df['orgnr_n_1'].map(bedrift_counts) + +# # Calculate 'distribution_count' per 'orgnr_n_1' +# distribution_counts = good_temp_df.groupby('orgnr_n_1').apply( +# lambda g: ((g['gjeldende_bdr_syss'] != 0) & (g['gjeldende_omsetn_kr'] > 0)).sum() +# ) + +# # Map the counts back to the DataFrame +# good_temp_df['distribution_count'] = good_temp_df['orgnr_n_1'].map(distribution_counts) + + # Create 'bad_temp' DataFrame based on conditions # bad_temp = good_temp_df[ @@ -493,25 +533,40 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F print("uu_data for:", {current_year}, "is True, proceeding with data processing...") - fil_path = [ - f - for f in fs.glob( - f"gs://ssb-strukt-naering-data-produkt-prod/naringer/klargjorte-data/statistikkfiler/aar={year}/statistikkfil_bedrifter_pub.parquet" - ) - if f.endswith(".parquet") + temp_prior_year= current_year - 1 + +# fil_path = [ +# f +# for f in fs.glob( +# f"gs://ssb-strukt-naering-data-produkt-prod/naringer/klargjorte-data/statistikkfiler/aar={current_year}/statistikkfil_bedrifter_pub.parquet" +# ) +# if f.endswith(".parquet") +# ] + + + +# # Use the ParquetDataset to read multiple files +# dataset = pq.ParquetDataset(fil_path, filesystem=fs) +# table = dataset.read() + +# # Convert to Pandas DataFrame +# bedrift_pub = table.to_pandas() + + columns_needed = [ + 'ts_forbruk', 'naring_f', 'orgnr_foretak', 'ts_salgsint', 'omsetning', + 'nopost_p4005', 'nopost_driftskostnader', 'kommune', 'sysselsetting_syss', + 'naring', 'orgnr_bedrift', 'reg_type_f', 'type' ] - # Use the ParquetDataset to read multiple files - dataset = pq.ParquetDataset(fil_path, filesystem=fs) - table = dataset.read() + file_path = f"gs://ssb-strukt-naering-data-produkt-prod/naringer/klargjorte-data/statistikkfiler/aar={current_year}/statistikkfil_bedrifter_pub.parquet" - # Convert to Pandas DataFrame - bedrift_pub = table.to_pandas() + f = FileClient.gcs_open(file_path) + bedrift_pub = pl.read_parquet(f, 
columns=columns_needed) + + bedrift_pub = bedrift_pub.to_pandas() bedrift_pub.columns = bedrift_pub.columns.str.lower() - del table, dataset - # filter for when reg_type_f = 01 bedrift_pub = bedrift_pub[bedrift_pub['reg_type_f'] == '01'] bedrift_pub = bedrift_pub[bedrift_pub['type'] != 'S'] @@ -540,25 +595,34 @@ def main(year, limit, skjema_nr, distribtion_percent, tosiffernaring, geo_data=F bedrift_pub['tmp_salgsint_bed'] = bedrift_pub['salgsint'] bedrift_pub['id'] = bedrift_pub['orgnr_n_1'] - fil_path = [ - f - for f in fs.glob( - f"gs://ssb-strukt-naering-data-produkt-prod/naringer/klargjorte-data/statistikkfiler/aar={fjor}/statistikkfil_bedrifter_pub.parquet" - ) - if f.endswith(".parquet") +# fil_path = [ +# f +# for f in fs.glob( +# f"gs://ssb-strukt-naering-data-produkt-prod/naringer/klargjorte-data/statistikkfiler/aar={temp_prior_year}/statistikkfil_bedrifter_pub.parquet" +# ) +# if f.endswith(".parquet") +# ] + +# # Use the ParquetDataset to read multiple files +# dataset = pq.ParquetDataset(fil_path, filesystem=fs) +# table = dataset.read() + +# # Convert to Pandas DataFrame +# bedrift_pub_x = table.to_pandas() + + columns_needed_x = [ + 'reg_type_f', 'orgnr_bedrift', 'sysselsetting_syss' ] - # Use the ParquetDataset to read multiple files - dataset = pq.ParquetDataset(fil_path, filesystem=fs) - table = dataset.read() + file_path = f"gs://ssb-strukt-naering-data-produkt-prod/naringer/klargjorte-data/statistikkfiler/aar={temp_prior_year}/statistikkfil_bedrifter_pub.parquet" - # Convert to Pandas DataFrame - bedrift_pub_x = table.to_pandas() + f = FileClient.gcs_open(file_path) + bedrift_pub_x = pl.read_parquet(f, columns=columns_needed_x) + + bedrift_pub_x = bedrift_pub_x.to_pandas() bedrift_pub_x.columns = bedrift_pub_x.columns.str.lower() - del dataset, table - bedrift_pub_x = bedrift_pub_x[bedrift_pub_x['reg_type_f'] == '01'] diff --git a/src/functions/oppdateringsfil.py b/src/functions/oppdateringsfil.py index eb54396..a260493 100644 --- a/src/functions/oppdateringsfil.py +++ b/src/functions/oppdateringsfil.py @@ -181,14 +181,27 @@ def create_bedrift_fil(year, model, rate, scaler, skjema, tosiffernaring, distri # Extract the relevant columns from the imputed DataFrame for merging df_to_merge = imputed_df[['v_orgnr', 'year', 'id', 'predicted_oms']] - - test8 = imputed_df.copy() # Merge the imputed DataFrame with the current year's bad data on 'v_orgnr', 'id', and 'year' bad_df = pd.merge(current_year_bad_oms, df_to_merge, on=['v_orgnr', 'id', 'year'], how='left') # Assign the 'predicted_oms' values to a new column 'new_oms' bad_df['new_oms'] = bad_df['predicted_oms'] + + # Set 'new_oms' to 0 where 'gjeldende_bdr_syss' is 0 + # bad_df.loc[bad_df['gjeldende_bdr_syss'] == 0, 'new_oms'] = 0 + + bad_df["n3"] = bad_df["nacef_5"].str[:4] + bad_df["n2"] = bad_df["nacef_5"].str[:2] + + # Set 'new_oms' to 0 where 'gjeldende_bdr_syss' is 0 AND 'n3' is not '47.3' AND 'n2' is not 68 + bad_df.loc[ + (bad_df['gjeldende_bdr_syss'] == 0) & + (bad_df['n3'] != '47.3') & + (bad_df['n2'] != '68'), + 'new_oms' + ] = 0 + # Drop the 'predicted_oms' column as it is no longer needed bad_df.drop(['predicted_oms'], axis=1, inplace=True) diff --git a/src/notebooks/AO.ipynb b/src/notebooks/AO.ipynb index 9985785..f67df4b 100644 --- a/src/notebooks/AO.ipynb +++ b/src/notebooks/AO.ipynb @@ -81,8 +81,21 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "id": "4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pyarrow\n", + 
"print(pyarrow.__version__)\n" + ] + }, + { + "cell_type": "markdown", + "id": "5", "metadata": {}, "source": [ "# Data exploration" @@ -90,7 +103,7 @@ }, { "cell_type": "markdown", - "id": "5", + "id": "6", "metadata": {}, "source": [ "This step requires the filtering of dataframes so that the correct nærings are included. All bedfrifter will be included in analysis, not just those belonging to reg_type 2. Also, and very importantly, if running for an entire 'delreg' (or an equivalent) then a backup file needs to be saved. \n", @@ -114,7 +127,7 @@ }, { "cell_type": "markdown", - "id": "6", + "id": "7", "metadata": { "tags": [] }, @@ -124,7 +137,7 @@ }, { "cell_type": "markdown", - "id": "7", + "id": "8", "metadata": { "tags": [] }, @@ -139,7 +152,7 @@ }, { "cell_type": "markdown", - "id": "8", + "id": "9", "metadata": {}, "source": [ "# Basic function\n" @@ -147,7 +160,7 @@ }, { "cell_type": "markdown", - "id": "9", + "id": "10", "metadata": {}, "source": [ "Were basically talking about a constrained optimisation problem. We need to investigate several functions and determine which one is better. It may in fact be that some functions work better depending on the situtation - ideally an algorythm would determine which one is better - but for fun perhaps we can have a toggle option so users can look through which one they think works best. \n", @@ -159,7 +172,7 @@ }, { "cell_type": "markdown", - "id": "10", + "id": "11", "metadata": {}, "source": [ "Constrained optimization in the context of machine learning typically involves defining an objective function that you want to minimize or maximize while keeping certain constraints satisfied. In your case, the constraint is that the sum of all estimated revenues for 'bedrifter' under a single 'foretak' must equal the known total revenue for that 'foretak'.\n", @@ -232,7 +245,7 @@ }, { "cell_type": "markdown", - "id": "11", + "id": "12", "metadata": {}, "source": [ "# Choosing the right function:" @@ -240,7 +253,7 @@ }, { "cell_type": "markdown", - "id": "12", + "id": "13", "metadata": {}, "source": [ "We can build several ourselves and test them directly. Or we can use machine learning to detmerine what the correct one is. " @@ -248,7 +261,7 @@ }, { "cell_type": "markdown", - "id": "13", + "id": "14", "metadata": { "tags": [] }, @@ -258,7 +271,7 @@ }, { "cell_type": "markdown", - "id": "14", + "id": "15", "metadata": { "tags": [] }, @@ -271,7 +284,7 @@ }, { "cell_type": "markdown", - "id": "15", + "id": "16", "metadata": { "tags": [] }, @@ -281,7 +294,7 @@ }, { "cell_type": "markdown", - "id": "16", + "id": "17", "metadata": {}, "source": [ "If the optimal function varies by industry and the goal is to minimize error across different segments such as 'bedrifter' under various 'foretak' and industries, then one approach is to use a machine learning model that can learn the function from the data. 
This model would be trained to minimize the prediction error and could adapt to different industries by learning from industry-specific patterns in the training data.\n", @@ -356,7 +369,7 @@ }, { "cell_type": "markdown", - "id": "17", + "id": "18", "metadata": {}, "source": [ "# Get user input" @@ -365,7 +378,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18", + "id": "19", "metadata": { "tags": [] }, @@ -381,7 +394,7 @@ }, { "cell_type": "markdown", - "id": "19", + "id": "20", "metadata": { "tags": [] }, @@ -391,7 +404,7 @@ }, { "cell_type": "markdown", - "id": "20", + "id": "21", "metadata": {}, "source": [ "### Current year and month" @@ -400,7 +413,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21", + "id": "22", "metadata": { "tags": [] }, @@ -436,7 +449,7 @@ { "cell_type": "code", "execution_count": null, - "id": "22", + "id": "23", "metadata": { "tags": [] }, @@ -483,7 +496,7 @@ }, { "cell_type": "markdown", - "id": "23", + "id": "24", "metadata": {}, "source": [ "# Get bedrift data" @@ -492,7 +505,7 @@ { "cell_type": "code", "execution_count": null, - "id": "24", + "id": "25", "metadata": { "tags": [] }, @@ -572,7 +585,7 @@ }, { "cell_type": "markdown", - "id": "25", + "id": "26", "metadata": {}, "source": [ "# Get foretak data" @@ -581,7 +594,7 @@ { "cell_type": "code", "execution_count": null, - "id": "26", + "id": "27", "metadata": { "tags": [] }, @@ -610,7 +623,7 @@ }, { "cell_type": "markdown", - "id": "27", + "id": "28", "metadata": { "tags": [] }, @@ -621,7 +634,7 @@ { "cell_type": "code", "execution_count": null, - "id": "28", + "id": "29", "metadata": { "tags": [] }, @@ -638,7 +651,7 @@ { "cell_type": "code", "execution_count": null, - "id": "29", + "id": "30", "metadata": { "tags": [] }, @@ -650,7 +663,7 @@ { "cell_type": "code", "execution_count": null, - "id": "30", + "id": "31", "metadata": { "tags": [] }, @@ -675,7 +688,7 @@ { "cell_type": "code", "execution_count": null, - "id": "31", + "id": "32", "metadata": { "tags": [] }, @@ -686,7 +699,7 @@ }, { "cell_type": "markdown", - "id": "32", + "id": "33", "metadata": {}, "source": [ "# Explore the map" @@ -695,7 +708,7 @@ { "cell_type": "code", "execution_count": null, - "id": "33", + "id": "34", "metadata": { "tags": [] }, @@ -707,7 +720,7 @@ { "cell_type": "code", "execution_count": null, - "id": "34", + "id": "35", "metadata": { "tags": [] }, @@ -720,7 +733,7 @@ { "cell_type": "code", "execution_count": null, - "id": "35", + "id": "36", "metadata": { "tags": [] }, @@ -733,7 +746,7 @@ { "cell_type": "code", "execution_count": null, - "id": "36", + "id": "37", "metadata": { "tags": [] }, @@ -745,7 +758,7 @@ { "cell_type": "code", "execution_count": null, - "id": "37", + "id": "38", "metadata": { "tags": [] }, @@ -757,7 +770,7 @@ }, { "cell_type": "markdown", - "id": "38", + "id": "39", "metadata": { "tags": [] }, @@ -768,7 +781,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39", + "id": "40", "metadata": { "tags": [] }, @@ -794,7 +807,7 @@ { "cell_type": "code", "execution_count": null, - "id": "40", + "id": "41", "metadata": { "tags": [] }, @@ -858,7 +871,7 @@ }, { "cell_type": "markdown", - "id": "41", + "id": "42", "metadata": {}, "source": [ "# Train the data" @@ -867,7 +880,7 @@ { "cell_type": "code", "execution_count": null, - "id": "42", + "id": "43", "metadata": { "tags": [] }, @@ -1051,7 +1064,7 @@ { "cell_type": "code", "execution_count": null, - "id": "43", + "id": "44", "metadata": { "tags": [] }, @@ -1075,7 +1088,7 @@ { "cell_type": "code", "execution_count": 
null, - "id": "44", + "id": "45", "metadata": { "tags": [] }, @@ -1087,7 +1100,7 @@ { "cell_type": "code", "execution_count": null, - "id": "45", + "id": "46", "metadata": { "tags": [] }, @@ -1107,7 +1120,7 @@ { "cell_type": "code", "execution_count": null, - "id": "46", + "id": "47", "metadata": { "tags": [] }, @@ -1121,7 +1134,7 @@ { "cell_type": "code", "execution_count": null, - "id": "47", + "id": "48", "metadata": { "tags": [] }, @@ -1133,7 +1146,7 @@ { "cell_type": "code", "execution_count": null, - "id": "48", + "id": "49", "metadata": { "tags": [] }, @@ -1151,7 +1164,7 @@ { "cell_type": "code", "execution_count": null, - "id": "49", + "id": "50", "metadata": { "tags": [] }, @@ -1163,7 +1176,7 @@ { "cell_type": "code", "execution_count": null, - "id": "50", + "id": "51", "metadata": { "tags": [] }, @@ -1181,7 +1194,7 @@ { "cell_type": "code", "execution_count": null, - "id": "51", + "id": "52", "metadata": { "tags": [] }, @@ -1194,7 +1207,7 @@ { "cell_type": "code", "execution_count": null, - "id": "52", + "id": "53", "metadata": { "tags": [] }, @@ -1210,7 +1223,7 @@ { "cell_type": "code", "execution_count": null, - "id": "53", + "id": "54", "metadata": { "tags": [] }, @@ -1239,7 +1252,7 @@ { "cell_type": "code", "execution_count": null, - "id": "54", + "id": "55", "metadata": { "tags": [] }, @@ -1261,7 +1274,7 @@ { "cell_type": "code", "execution_count": null, - "id": "55", + "id": "56", "metadata": { "tags": [] }, @@ -1273,7 +1286,7 @@ { "cell_type": "code", "execution_count": null, - "id": "56", + "id": "57", "metadata": { "tags": [] }, @@ -1293,7 +1306,7 @@ }, { "cell_type": "markdown", - "id": "57", + "id": "58", "metadata": { "tags": [] }, @@ -1304,7 +1317,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58", + "id": "59", "metadata": { "tags": [] }, @@ -1345,7 +1358,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59", + "id": "60", "metadata": { "tags": [] }, @@ -1379,7 +1392,7 @@ }, { "cell_type": "markdown", - "id": "60", + "id": "61", "metadata": { "tags": [] }, @@ -1390,7 +1403,7 @@ { "cell_type": "code", "execution_count": null, - "id": "61", + "id": "62", "metadata": { "tags": [] }, @@ -1452,7 +1465,7 @@ { "cell_type": "code", "execution_count": null, - "id": "62", + "id": "63", "metadata": { "tags": [] }, @@ -1465,7 +1478,7 @@ { "cell_type": "code", "execution_count": null, - "id": "63", + "id": "64", "metadata": { "tags": [] }, @@ -1529,7 +1542,7 @@ { "cell_type": "code", "execution_count": null, - "id": "64", + "id": "65", "metadata": { "tags": [] }, @@ -1571,7 +1584,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65", + "id": "66", "metadata": { "tags": [] }, @@ -1587,7 +1600,7 @@ { "cell_type": "code", "execution_count": null, - "id": "66", + "id": "67", "metadata": { "tags": [] }, @@ -1598,7 +1611,7 @@ }, { "cell_type": "markdown", - "id": "67", + "id": "68", "metadata": { "tags": [] }, @@ -1609,7 +1622,7 @@ { "cell_type": "code", "execution_count": null, - "id": "68", + "id": "69", "metadata": { "tags": [] }, @@ -1780,7 +1793,7 @@ }, { "cell_type": "markdown", - "id": "69", + "id": "70", "metadata": { "tags": [] }, @@ -1848,14 +1861,14 @@ { "cell_type": "code", "execution_count": null, - "id": "70", + "id": "71", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", - "id": "71", + "id": "72", "metadata": {}, "source": [ "# Train data only for reg_type 2" @@ -1864,7 +1877,7 @@ { "cell_type": "code", "execution_count": null, - "id": "72", + "id": "73", "metadata": { "tags": [] }, @@ -1876,7 +1889,7 @@ { 
"cell_type": "code", "execution_count": null, - "id": "73", + "id": "74", "metadata": { "tags": [] }, @@ -1972,7 +1985,7 @@ { "cell_type": "code", "execution_count": null, - "id": "74", + "id": "75", "metadata": {}, "outputs": [], "source": [ @@ -2036,7 +2049,7 @@ { "cell_type": "code", "execution_count": null, - "id": "75", + "id": "76", "metadata": { "tags": [] }, @@ -2069,7 +2082,7 @@ }, { "cell_type": "markdown", - "id": "76", + "id": "77", "metadata": {}, "source": [ "# Linear Regression" @@ -2078,7 +2091,7 @@ { "cell_type": "code", "execution_count": null, - "id": "77", + "id": "78", "metadata": { "tags": [] }, @@ -2172,7 +2185,7 @@ { "cell_type": "code", "execution_count": null, - "id": "78", + "id": "79", "metadata": { "tags": [] }, @@ -2209,7 +2222,7 @@ { "cell_type": "code", "execution_count": null, - "id": "79", + "id": "80", "metadata": {}, "outputs": [], "source": [] diff --git a/src/notebooks/File Transfers.ipynb b/src/notebooks/File Transfers.ipynb index 7ecf95a..70f757c 100644 --- a/src/notebooks/File Transfers.ipynb +++ b/src/notebooks/File Transfers.ipynb @@ -36,7 +36,9 @@ "cell_type": "code", "execution_count": null, "id": "3", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "aar = '2023'\n", @@ -45,7 +47,7 @@ "fil_path = [\n", " f\n", " for f in fs.glob(\n", - " f\"gs://ssb-prod-skatt-naering-data-synk-opp/test/dynadata_5823.parquet\"\n", + " f\"gs://ssb-prod-skatt-naering-data-synk-opp/test/dynadata_6023.parquet\"\n", " )\n", " # if f.endswith(\".parquet\")\n", "]\n", @@ -99,7 +101,7 @@ "\n", "pq.write_table(\n", " table_renamed,\n", - " f\"gs://ssb-prod-noeku-data-produkt/eimerdb/nokubasen/skjemadata/aar={aar}/skjema=RA-1403/skjemadata_data_0.parquet\",\n", + " f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-0255-1/aar=2023/skjemadata_data_0.parquet\",\n", " filesystem=fs\n", ")" ] @@ -360,16 +362,62 @@ "table = dataset.read()\n", "\n", "# Convert to Pandas DataFrame\n", - "# skjema = table.to_pandas()\n", + "skjema = table.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "columns = {\n", + " \"ENHETS_ID\": \"id\",\n", + " \"RAD_NR\": \"radnr\",\n", + " \"FELT_ID\": \"feltnavn\",\n", + " \"FELT_VERDI\": \"feltverdi\",\n", + " \"SKJEMA\": \"skjema\",\n", + " \"LOPENR\": \"lopenr\",\n", + " \"AKTIV\": \"aktiv\",\n", + " \"DELREG_NR\": \"delreg\",\n", + " \"ENHETS_TYPE\": \"enhets_type\"\n", + "}\n", "\n", + "skjema = skjema.rename(columns=columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "skjema.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ "# Create a PyArrow array filled with the value '2022' for each row\n", "# Create the 'aar' column as a categorical (dictionary-encoded) string\n", - "num_rows = table.num_rows\n", - "year_array = pa.array(['2023'] * num_rows, type=pa.string())\n", - "year_array = pa.DictionaryArray.from_arrays(pa.array(range(num_rows), type=pa.int32()), year_array)\n", + "# num_rows = table.num_rows\n", + "# year_array = pa.array(['2023'] * num_rows, type=pa.string())\n", + "# year_array = pa.DictionaryArray.from_arrays(pa.array(range(num_rows), type=pa.int32()), year_array)\n", "\n", - "# Add the new column to your existing table\n", - "table_with_aar = 
table.append_column('aar', year_array)\n", + "# # Add the new column to your existing table\n", + "# table_with_aar = table.append_column('aar', year_array)\n", "\n", "columns = {\n", " \"ENHETS_ID\": \"id\",\n", @@ -385,7 +433,7 @@ "}\n", "\n", "# Get the current schema of the table with the 'aar' column\n", - "schema = table_with_aar.schema\n", + "# schema = table_with_aar.schema\n", "\n", "# Create a list to hold the new fields with renamed column names\n", "new_fields = []\n", @@ -406,7 +454,7 @@ "\n", "pq.write_table(\n", " table_renamed,\n", - " f\"gs://ssb-prod-noeku-data-produkt/eimerdb/nokubasen/skjemadata/aar=2023/skjema=RA-0174-1/skjemadata_data_0.parquet\",\n", + " f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-0174-1/aar=2023/skjemadata_data_0.parquet\",\n", " filesystem=fs\n", ")" ] @@ -414,7 +462,7 @@ { "cell_type": "code", "execution_count": null, - "id": "14", + "id": "17", "metadata": { "tags": [] }, @@ -487,7 +535,7 @@ { "cell_type": "code", "execution_count": null, - "id": "15", + "id": "18", "metadata": {}, "outputs": [], "source": [ @@ -511,7 +559,7 @@ { "cell_type": "code", "execution_count": null, - "id": "16", + "id": "19", "metadata": {}, "outputs": [], "source": [ @@ -573,7 +621,7 @@ { "cell_type": "code", "execution_count": null, - "id": "17", + "id": "20", "metadata": { "tags": [] }, @@ -587,7 +635,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18", + "id": "21", "metadata": {}, "outputs": [], "source": [ @@ -599,7 +647,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19", + "id": "22", "metadata": {}, "outputs": [], "source": [ @@ -622,7 +670,7 @@ { "cell_type": "code", "execution_count": null, - "id": "20", + "id": "23", "metadata": { "tags": [] }, @@ -637,7 +685,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21", + "id": "24", "metadata": { "tags": [] }, @@ -648,7 +696,7 @@ }, { "cell_type": "markdown", - "id": "22", + "id": "25", "metadata": {}, "source": [ "# Transfer from old bucks to new:" @@ -657,7 +705,7 @@ { "cell_type": "code", "execution_count": null, - "id": "23", + "id": "26", "metadata": { "tags": [] }, @@ -669,14 +717,14 @@ { "cell_type": "code", "execution_count": null, - "id": "24", + "id": "27", "metadata": { "tags": [] }, "outputs": [], "source": [ "start_year = 2017\n", - "end_year = 2021\n", + "end_year = 2022\n", "skjema_list = 'RA-1407'\n", "\n", "\n", @@ -703,7 +751,7 @@ { "cell_type": "code", "execution_count": null, - "id": "25", + "id": "28", "metadata": { "tags": [] }, @@ -715,7 +763,7 @@ { "cell_type": "code", "execution_count": null, - "id": "26", + "id": "29", "metadata": { "tags": [] }, @@ -804,7 +852,7 @@ { "cell_type": "code", "execution_count": null, - "id": "27", + "id": "30", "metadata": { "tags": [] }, @@ -885,7 +933,7 @@ { "cell_type": "code", "execution_count": null, - "id": "28", + "id": "31", "metadata": { "tags": [] }, @@ -910,7 +958,7 @@ { "cell_type": "code", "execution_count": null, - "id": "29", + "id": "32", "metadata": { "tags": [] }, @@ -926,7 +974,7 @@ { "cell_type": "code", "execution_count": null, - "id": "30", + "id": "33", "metadata": { "tags": [] }, @@ -938,7 +986,730 @@ { "cell_type": "code", "execution_count": null, - "id": "31", + "id": "34", + "metadata": {}, + "outputs": [], + "source": [ + "# NO" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35", + "metadata": {}, + "outputs": [], + "source": [ + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " 
f\"gs://ssb-prod-skatt-naering-data-synk-opp/test/no2_6022.parquet.parquet\"\n", + " )\n", + " # if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Use the ParquetDataset to read multiple files\n", + "dataset = pq.ParquetDataset(fil_path, filesystem=fs)\n", + "table1 = dataset.read()\n", + "\n", + "\n", + "pq.write_table(\n", + " table1,\n", + " f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/nobasen/an_registrering2/aar=2022/no2.parquet\",\n", + " filesystem=fs\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Define the year value\n", + "aar = '2023'\n", + "\n", + "# List of Parquet files to load (assuming you're using a local filesystem or remote one with PyArrow)\n", + "# fil_path = [\n", + "# f\n", + "# for f in fs.glob(\n", + "# f\"gs://ssb-prod-skatt-naering-data-synk-opp/test/dynadata_5923.parquet\"\n", + "# )\n", + "# ]\n", + "\n", + "# Read the Parquet file into a pandas DataFrame\n", + "fil_path = \"gs://ssb-prod-skatt-naering-data-synk-opp/test/dynadata_5923.parquet\"\n", + "df = pd.read_parquet(fil_path, filesystem=fs)\n", + "# df = table.to_pandas()\n", + "\n", + "# Add the 'aar' column with the year value, make it categorical if needed\n", + "# df['aar'] = pd.Categorical([aar] * len(df))\n", + "\n", + "# Rename columns based on the provided mapping\n", + "columns = {\n", + " \"ENHETS_ID\": \"id\",\n", + " \"RAD_NR\": \"radnr\",\n", + " \"FELT_ID\": \"feltnavn\",\n", + " \"FELT_VERDI\": \"feltverdi\",\n", + " \"SKJEMA\": \"skjema\",\n", + " \"LOPENR\": \"lopenr\",\n", + " \"AKTIV\": \"aktiv\",\n", + " \"DELREG_NR\": \"delreg\",\n", + "}\n", + "\n", + "df.rename(columns=columns, inplace=True)\n", + "\n", + "# Save the modified DataFrame back to Parquet\n", + "destination_path = f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-0255-1/aar=2023/skjemadata_data_0.parquet\"\n", + "\n", + "df.to_parquet(destination_path, filesystem=fs, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-0255-1/aar=2022/*\"\n", + " )\n", + " # if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Assuming there's only one file in fil_path\n", + "if fil_path:\n", + " skjema = pd.read_parquet(fil_path[0], filesystem=fs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "skjema.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "destination_path = f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-0174-1/aar=2023/skjemadata_data_0.parquet\"\n", + "\n", + "skjema.to_parquet(destination_path, filesystem=fs, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + 
"#remove aar from skjema\n", + "skjema = skjema.drop(columns=['aar'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-0255-1/aar=2023/*\"\n", + " )\n", + " # if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Assuming there's only one file in fil_path\n", + "if fil_path:\n", + " skjema = pd.read_parquet(fil_path[0], filesystem=fs)\n", + " \n", + " # ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-0255-1/aar=2022" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "skjema.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " f\"gs://ssb-prod-noeku-data-produkt/eimerdb/nokubasen/skjemadata/aar=2022/skjema=RA-0255-1/*\",\n", + " )\n", + " # if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Use the ParquetDataset to read multiple files\n", + "if fil_path:\n", + " skjema = pd.read_parquet(fil_path[0], filesystem=fs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "skjema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-0174-1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " f\"gs://ssb-prod-skatt-naering-data-synk-opp/test/dynadata_6023.parquet\",\n", + " )\n", + " # if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Use the ParquetDataset to read multiple files\n", + "if fil_path:\n", + " skjema = pd.read_parquet(fil_path[0], filesystem=fs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "destination_path = f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-0174-1/aar=2023/skjemadata_data_0.parquet\"\n", + "skjema.to_parquet(destination_path, filesystem=fs, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "skjema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "year = 2023\n", + "\n", + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " f\"gs://ssb-prod-noeku-data-produkt/statistikkfiler/g{year}/statistikkfil_bedrifter_pub.parquet\"\n", + " )\n", + " if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Use the ParquetDataset to read multiple files\n", + "# dataset = pq.ParquetDataset(fil_path, filesystem=fs)\n", + "foretak_pub = pd.read_parquet(fil_path, filesystem=fs)\n", + "\n", + "# ssb-prod-noeku-data-produkt/statistikkfiler/g2017\n", + "\n", + "print(foretak_pub.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + 
"foretak_pub['aar'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# drop 'aar' in df\n", + "foretak_pub = foretak_pub.drop(columns=['aar'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "destination_path = f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/klargjorte-data/statistikkfiler/aar={year}/statistikkfil_bedrifter_pub.parquet\"\n", + "foretak_pub.to_parquet(destination_path, filesystem=fs, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "\n", + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/klargjorte-data/statistikkfiler/aar={year}/statistikkfil_bedrifter_pub.parquet\"\n", + " )\n", + " if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Use the ParquetDataset to read multiple files\n", + "# dataset = pq.ParquetDataset(fil_path, filesystem=fs)\n", + "foretak_pub = pd.read_parquet(fil_path, filesystem=fs)\n", + "\n", + "print(foretak_pub.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Skjema test\n", + "\n", + "year = 2023" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " f\"gs://ssb-prod-skatt-naering-data-synk-opp/test/dynadata_5922.parquet\"\n", + " )\n", + " if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Use the ParquetDataset to read multiple files\n", + "# dataset = pq.ParquetDataset(fil_path, filesystem=fs)\n", + "test1 = pd.read_parquet(fil_path, filesystem=fs)\n", + "\n", + "# ssb-prod-noeku-data-produkt/statistikkfiler/g2017\n", + "\n", + "print(test1.shape)\n", + "\n", + "# test1 = test1.rename(columns={'SKJEMA': 'skjema', 'AKTIV': 'aktiv', 'DELREG_NR': 'delreg_nr', 'ENHETS_ID': 'id', 'LOPENR': 'lopenr', 'RAD_NR': 'radnr', 'FELT_ID': 'feltnavn', 'FELT_VERDI': 'feltverdi'})\n", + "\n", + "# test1['skjema'] = test1['skjema'].astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# print all rows\n", + "pd.set_option('display.max_rows', None)\n", + "test1['FELT_ID'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "year = 2023\n", + "\n", + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-0255-1/aar={year}/skjemadata_data_0.parquet\"\n", + " )\n", + " if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Use the ParquetDataset to read multiple files\n", + "# dataset = pq.ParquetDataset(fil_path, filesystem=fs)\n", + "test1 = pd.read_parquet(fil_path, filesystem=fs)\n", + "\n", + "# ssb-prod-noeku-data-produkt/statistikkfiler/g2017\n", + "\n", + "print(test1.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " 
f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/delregistre/60/6023.parquet\"\n", + " )\n", + " if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Use the ParquetDataset to read multiple files\n", + "# dataset = pq.ParquetDataset(fil_path, filesystem=fs)\n", + "test2 = pd.read_parquet(fil_path, filesystem=fs)\n", + "\n", + "# ssb-prod-noeku-data-produkt/statistikkfiler/g2017\n", + "\n", + "print(test2.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "6023\n", + "\n", + "2017: (3412469, 9)\n", + "2018: (4674219, 9)\n", + "2019: (4785879, 9)\n", + "2020: (4841254, 9)\n", + "2021: (4137853, 9)\n", + "2022: (4223800, 10)\n", + "2023: (1751297, 10)\n", + "\n", + "5923\n", + "\n", + "2017: (2585284, 9)\n", + "2018: (2689164, 9)\n", + "2019: (2415684, 9)\n", + "2020: (2393381, 9)\n", + "2021: (2121860, 9)88\n", + "2022: (3049512, 10)6\n", + "2023: (977892, 10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "test1.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "test2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# remove aar\n", + "test1 = test1.drop(columns=['aar'])\n", + "\n", + "# test1['aar'] = year\n", + "\n", + "# test1.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# fil_path = [\n", + "# f\n", + "# for f in fs.glob(\n", + "# f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-1100/aar={year}/skjemadata_data_0.parquet\"\n", + "# )\n", + "# if f.endswith(\".parquet\")\n", + "# ]\n", + "\n", + "# # Use the ParquetDataset to read multiple files\n", + "# # dataset = pq.ParquetDataset(fil_path, filesystem=fs)\n", + "# test2 = pd.read_parquet(fil_path, filesystem=fs)\n", + "\n", + "# # ssb-prod-noeku-data-produkt/statistikkfiler/g2017\n", + "\n", + "# print(test2.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "destination_path = f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-0174-1/aar=2023/skjemadata_data_0.parquet\"\n", + "test2.to_parquet(destination_path, filesystem=fs, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-1403/aar={year}/skjemadata_data_0.parquet\"\n", + " )\n", + " if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Use the ParquetDataset to read multiple files\n", + "# dataset = pq.ParquetDataset(fil_path, filesystem=fs)\n", + "test3 = pd.read_parquet(fil_path, filesystem=fs)\n", + "\n", + "# ssb-prod-noeku-data-produkt/statistikkfiler/g2017\n", + "\n", + "print(test3.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "year = 2021" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69", + "metadata": {}, + "outputs": 
[], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pyarrow.parquet as pq\n", + "import pyarrow as pa\n", + "\n", + "# Step 1: Read the Parquet file\n", + "parquet_file = pq.ParquetDataset('gs://ssb-prod-skatt-naering-data-synk-opp/test/dynadata_8923.parquet', filesystem=fs)\n", + "\n", + "# Convert it to a table\n", + "table = parquet_file.read()\n", + "\n", + "if 'aar' in table.column_names:\n", + " table = table.drop(['aar'])\n", + "\n", + "# Step 2: Create a new schema with renamed columns\n", + "# Rename the columns as needed (from original names to new names)\n", + "new_column_names = {\n", + " 'AKTIV': 'aktiv',\n", + " 'DELREG_NR': 'delreg_nr',\n", + " 'ENHETS_ID': 'id',\n", + " 'LOPENR': 'lopenr',\n", + " 'RAD_NR': 'radnr',\n", + " 'FELT_ID': 'feltnavn',\n", + " 'FELT_VERDI': 'feltverdi'\n", + "}\n", + "\n", + "# Apply renaming by creating a new table with updated column names\n", + "renamed_table = table.rename_columns([new_column_names.get(col, col) for col in table.column_names])\n", + "\n", + "# Step 3: Save the renamed Parquet file\n", + "destination_path = f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/inndata/skjemadata/skjema=RA-1100/aar=2023/skjemadata_data_0.parquet\"\n", + "pq.write_table(renamed_table, destination_path, filesystem=fs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "renamed_table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "skjema.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73", "metadata": {}, "outputs": [], "source": [] diff --git a/src/notebooks/ML function builder.ipynb b/src/notebooks/ML function builder.ipynb index e11f25d..24f1a81 100644 --- a/src/notebooks/ML function builder.ipynb +++ b/src/notebooks/ML function builder.ipynb @@ -59,67 +59,12 @@ }, "outputs": [], "source": [ - "# import datetime\n", - "\n", - "# # Get the current date\n", - "# current_date = datetime.datetime.now()\n", - "\n", - "# # Format the year and month\n", - "# current_year = current_date.strftime(\"%Y\")\n", - "# current_month = current_date.strftime(\"%m\")\n", - "\n", - "# # Subtract one day from the first day of the current month to get the last day of the previous month\n", - "# last_day_of_previous_month = datetime.datetime(\n", - "# current_date.year, current_date.month, 1\n", - "# ) - datetime.timedelta(days=1)\n", - "\n", - "# # Now we can get the month number of the previous month\n", - "# previous_month = last_day_of_previous_month.strftime(\"%m\")\n", - "\n", - "# VOFSTI = (\n", - "# \"ssb-vof-data-delt-prod/stedfesting-situasjonsuttak_data/klargjorte-data/parquet\"\n", - "# )\n", - "# file_path = (\n", - "# f\"{VOFSTI}/stedfesting-situasjonsuttak_p{current_year}-{previous_month}_v1.parquet\"\n", - "# )\n", - "\n", - "# vof_df = dp.read_pandas(f\"{file_path}\")\n", - "# vof_gdf = gpd.GeoDataFrame(\n", - "# vof_df,\n", - "# geometry=gpd.points_from_xy(\n", - "# vof_df[\"y_koordinat\"],\n", - "# vof_df[\"x_koordinat\"],\n", - "# ),\n", - "# crs=25833,\n", - "# )\n", - "\n", - "# vof_gdf = vof_gdf.rename(\n", - "# columns={\n", - "# \"orgnrbed\": \"orgnr_bedrift\",\n", - "# \"org_nr\": \"orgnr_foretak\",\n", - "# \"nace1_sn07\": \"naring\",\n", - "# }\n", - "# )\n", - "\n", - "\n", - "# vof_gdf = vof_gdf[\n", - 
"# [\n", - "# \"orgnr_bedrift\",\n", - "# \"orgnr_foretak\",\n", - "# \"naring\",\n", - "# \"x_koordinat\",\n", - "# \"y_koordinat\",\n", - "# \"rute_100m\",\n", - "# \"rute_1000m\",\n", - "# \"geometry\",\n", - "# ]\n", - "# ]\n", - "# pd.set_option(\"display.max_columns\", None)\n", - "\n", - "# vof_gdf = vof_gdf.dropna(subset=[\"x_koordinat\"])\n", - "# vof_gdf = vof_gdf.drop_duplicates(subset=\"orgnr_bedrift\")\n", - "# vof_gdf = vof_gdf.drop(\"orgnr_foretak\", axis=1)\n", - "# vof_gdf.head()" + "# Hente data\n", + "from imports import *\n", + "\n", + "year = 2022\n", + "\n", + "training_data, imputatable_df, foretak_pub = ml_modeller.hente_training_data(year)" ] }, { @@ -131,46 +76,19 @@ }, "outputs": [], "source": [ - "import pandas as pd\n", - "\n", - "# Define a list of years\n", - "years_to_process = [2020, 2021]\n", - "\n", - "# Define the values for naring_f_3 to filter on\n", - "varehandel = [\"45\", \"46\", \"47\"]\n", - "\n", - "# Initialize an empty list to store dataframes for each iteration\n", - "dfs = []\n", - "\n", - "for year in years_to_process:\n", - " fil_path = f\"gs://ssb-prod-noeku-data-produkt/statistikkfiler/g{year}/statistikkfil_bedrifter_pub.parquet\"\n", - "\n", - " # Read data for the current year\n", - " bedrifter = pd.read_parquet(fil_path, filesystem=fs)\n", - "\n", - " # Create 'nace4' by slicing the first 5 characters of 'naring'\n", - " bedrifter[\"naring4\"] = bedrifter[\"naring\"].str[:5]\n", - "\n", - " bedrifter[\"naring_f_4\"] = bedrifter[\"naring_f\"].str[:5]\n", - "\n", - " # Create 'nace3' by slicing the first 4 characters of 'naring'\n", - " bedrifter[\"naring3\"] = bedrifter[\"naring\"].str[:4]\n", - "\n", - " bedrifter[\"naring_f_3\"] = bedrifter[\"naring_f\"].str[:4]\n", - "\n", - " bedrifter[\"naring_f_2\"] = bedrifter[\"naring_f\"].str[:2]\n", - "\n", - " # Filter rows based on desired naring_f_3 values\n", - " filtered_bedrifter = bedrifter[bedrifter[\"naring_f_2\"].isin(varehandel)]\n", - "\n", - " # Append the resulting dataframe to the list\n", - " dfs.append(filtered_bedrifter)\n", - "\n", - "# Merge all dataframes in the list into a single dataframe\n", - "merged_dataframe = pd.concat(dfs, ignore_index=True)\n", - "\n", - "# Display the resulting merged dataframe\n", - "print(merged_dataframe.head())" + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " f\"gs://ssb-strukt-naering-data-produkt-prod/naringer/klargjorte-data/statistikkfiler/aar=2022/statistikkfil_foretak_pub.parquet\"\n", + " )\n", + " if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Use the ParquetDataset to read multiple files\n", + "# dataset = pq.ParquetDataset(fil_path, filesystem=fs)\n", + "foretak_pub = pd.read_parquet(fil_path, filesystem=fs)\n", + "\n", + "print(foretak_pub.shape)" ] }, { @@ -181,53 +99,231 @@ "tags": [] }, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "import pandas as pd\n", - "\n", - "# Define a list of years\n", - "years_to_process = [2020, 2021]\n", - "\n", - "# Define the values for naring_f_3 to filter on\n", - "varehandel = [45, 46, 47]\n", - "\n", - "# Initialize an empty list to store dataframes for each iteration\n", - "dfs = []\n", - "\n", - "for year in years_to_process:\n", - " fil_path = f\"gs://ssb-prod-noeku-data-produkt/statistikkfiler/g{year}/statistikkfil_bedrifter_pub.parquet\"\n", - "\n", - " # Read data for the current year\n", - " bedrifter = pd.read_parquet(fil_path, filesystem=fs)\n", - 
"\n", - " # Create 'nace4' by slicing the first 5 characters of 'naring'\n", - " bedrifter[\"naring4\"] = bedrifter[\"naring\"].str[:5]\n", - "\n", - " bedrifter[\"naring_f_4\"] = bedrifter[\"naring_f\"].str[:5]\n", - "\n", - " # Create 'nace3' by slicing the first 4 characters of 'naring'\n", - " bedrifter[\"naring3\"] = bedrifter[\"naring\"].str[:4]\n", + "# print all columns\n", + "# for col in foretak_pub.columns:\n", + "# print(col)\n", "\n", - " bedrifter[\"naring_f_3\"] = bedrifter[\"naring_f\"].str[:4]\n", - "\n", - " # Filter rows based on desired naring_f_3 values\n", - " filtered_bedrifter = bedrifter[\n", - " bedrifter[\"naring_f_3\"].isin(desired_naring_f_3_values)\n", - " ]\n", - "\n", - " # Append the resulting dataframe to the list\n", - " dfs.append(filtered_bedrifter)\n", - "\n", - "# Merge all dataframes in the list into a single dataframe\n", - "merged_dataframe = pd.concat(dfs, ignore_index=True)\n", + "foretak_pub = foretak_pub[['omsetning', 'enhets_id', 'sysselsetting_syss', 'naring_f']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "foretak_pub.head() df['n3'] = df['naring_f'].str[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import mean_absolute_error\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "# Keep rows where the substring (characters 2 and 3) of 'nacef_5' is '45', '46', or '47'\n", + "df = foretak_pub[foretak_pub['naring_f'].str[:2].isin(['45', '46', '47'])]\n", + "\n", + "df['n3'] = df['naring_f'].str[:3]\n", + "\n", + "# Initialize a dictionary to store the results for each 'naring_f'\n", + "results = {}\n", + "\n", + "# Iterate over each unique 'naring_f'\n", + "for category in df['n3'].unique():\n", + " # Filter data for the current category\n", + " category_data = df[df['n3'] == category]\n", + " \n", + " # Define features and target\n", + " X = category_data[['sysselsetting_syss']]\n", + " y = category_data['omsetning']\n", + "\n", + " # Fit a linear regression model\n", + " model = LinearRegression()\n", + " model.fit(X, y)\n", + "\n", + " # Make predictions\n", + " y_pred = model.predict(X)\n", + " \n", + " # Calculate MAE for the current category\n", + " mae = mean_absolute_error(y, y_pred)\n", + " \n", + " # Store the model and evaluation metric\n", + " results[category] = {\n", + " 'model': model,\n", + " 'MAE': mae,\n", + " 'y_true': y,\n", + " 'y_pred': y_pred\n", + " }\n", + " \n", + " # Visualize residuals\n", + " plt.figure(figsize=(10, 6))\n", + " sns.residplot(x=y, y=y_pred - y, lowess=True)\n", + " plt.title(f'Residual Plot for {category}')\n", + " plt.xlabel('Actual Values')\n", + " plt.ylabel('Residuals')\n", + " plt.show()\n", + " \n", + " # Visualize prediction vs actual\n", + " plt.figure(figsize=(10, 6))\n", + " plt.scatter(y, y_pred)\n", + " plt.plot([y.min(), y.max()], [y.min(), y.max()], color='red', linestyle='--') # Line of perfect prediction\n", + " plt.title(f'Prediction vs Actual for {category}')\n", + " plt.xlabel('Actual Values')\n", + " plt.ylabel('Predicted Values')\n", + " plt.show()\n", + "\n", + "# Calculate the overall MAE for the entire dataset\n", + "overall_y_true = df['omsetning']\n", + "overall_y_pred = pd.concat([pd.Series(results[cat]['y_pred'], 
index=results[cat]['y_true'].index) for cat in results])\n", + "overall_mae = mean_absolute_error(overall_y_true, overall_y_pred)\n", + "\n", + "print(f'Overall MAE: {overall_mae}')\n", + "\n", + "# Show MAE per 'naring_f'\n", + "mae_df = pd.DataFrame({'n3': results.keys(), 'MAE': [results[cat]['MAE'] for cat in results]})\n", + "print(mae_df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import mean_absolute_error\n", + "\n", + "# Load your data\n", + "# Assuming training_data is already defined and loaded\n", + "# training_data = pd.read_csv('your_file.csv') # Uncomment this if you need to load data\n", + "\n", + "# Step 1: Filter the data\n", + "# Keep rows where the substring (characters 2 and 3) of 'nacef_5' is '45', '46', or '47'\n", + "df = foretak_pub[foretak_pub['naring_f'].str[:2].isin(['45', '46', '47'])]\n", + "\n", + "df['n3'] = df['naring_f'].str[:3]\n", + "\n", + "# Step 2: Prepare the data\n", + "X = df[['sysselsetting_syss']] # Feature\n", + "y = df['omsetning'] # Target\n", + "\n", + "# Step 3: Train the Linear Regression Model\n", + "model = LinearRegression()\n", + "model.fit(X, y)\n", + "\n", + "# Predict new_oms\n", + "df['predicted_new_oms'] = model.predict(X)\n", + "\n", + "# Step 4: Calculate MAE\n", + "# Overall MAE\n", + "overall_mae = mean_absolute_error(y, df['predicted_new_oms'])\n", + "print(f\"Overall MAE: {overall_mae}\")\n", + "\n", + "# MAE per 'nacef_5'\n", + "mae_per_nacef_5 = df.groupby('naring_f').apply(\n", + " lambda group: mean_absolute_error(group['omsetning'], group['predicted_new_oms'])\n", + ").reset_index().rename(columns={0: 'MAE'})\n", + "\n", + "# Display the MAE per 'nacef_5'\n", + "print(\"MAE per 'nacef_5':\")\n", + "print(mae_per_nacef_5)\n", + "\n", + "# Step 5: Visualization\n", + "# Calculate residuals\n", + "df['residuals'] = df['omsetning'] - df['predicted_new_oms']\n", + "\n", + "# Visualization - Residual Plot\n", + "plt.figure(figsize=(10, 6))\n", + "sns.scatterplot(data=df, x='predicted_new_oms', y='residuals')\n", + "plt.axhline(0, color='red', linestyle='--')\n", + "plt.title('Residual Plot')\n", + "plt.xlabel('Predicted new_oms')\n", + "plt.ylabel('Residuals')\n", + "plt.show()\n", + "\n", + "# Visualization - Prediction vs Actual Plot\n", + "plt.figure(figsize=(10, 6))\n", + "sns.scatterplot(data=df, x='omsetning', y='predicted_new_oms', hue='naring_f', palette='viridis')\n", + "plt.plot([y.min(), y.max()], [y.min(), y.max()], color='red', linestyle='--') # 45-degree line for reference\n", + "plt.title('Prediction vs Actual Plot')\n", + "plt.xlabel('Actual new_oms')\n", + "plt.ylabel('Predicted new_oms')\n", + "plt.legend(title='naring_f', bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plt.show()\n", + "\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "sns.scatterplot(data=df, x='omsetning', y='sysselsetting_syss')\n", + "plt.axhline(0, color='red', linestyle='--')\n", + "plt.title('Residual Plot')\n", + "plt.xlabel('oms')\n", + "plt.ylabel('sysselsetting_syss')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# filter training_data for gjeldende_bdr_syss > 600\n", + "training_data = 
training_data[training_data['gjeldende_bdr_syss'] > 600]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# turn on option to print all columns\n", + "pd.set_option('display.max_columns', None)\n", "\n", - "# Display the resulting merged dataframe\n", - "print(merged_dataframe.head())" + "training_data.head()" ] }, { "cell_type": "code", "execution_count": null, - "id": "5", + "id": "11", "metadata": {}, "outputs": [], "source": [] diff --git a/src/notebooks/Master.ipynb b/src/notebooks/Master.ipynb index 4795b34..c0f431f 100644 --- a/src/notebooks/Master.ipynb +++ b/src/notebooks/Master.ipynb @@ -577,7 +577,7 @@ "# Best result for GridSearch so far:\n", "# Best parameters found by GridSearch: {'n_neighbors': 2}\n", "# knn_model_new\n", - "#knn_model_fast" + "# knn_model_fast" ] }, { @@ -741,7 +741,7 @@ "pd.set_option('display.max_rows', None)\n", "year = 2023\n", "\n", - "model = 'knn_model_with_pca_custom_distance' # Choose between knn_model, xgboost_model, lstm_model or, nn_model_1, nn_model_2, evaluate_year_based_mae, knn_model_with_pca, knn_model_with_pca_custom_distance, knn_model_with_pca_mae_per_year, xgboost_model_with_pca\n", + "model = 'knn_model' # Choose between knn_model, xgboost_model, lstm_model or, nn_model_1, nn_model_2, evaluate_year_based_mae, knn_model_with_pca, knn_model_with_pca_custom_distance, knn_model_with_pca_mae_per_year, xgboost_model_with_pca\n", "rate = 0.65 # rate will determine which survey responses are sorted into a good/bad df. The bad df will be updated/corrected using machine learning, while the good_df will go through more tradional cleaning\n", "scaler = RobustScaler() # Choose between StandardScaler(), MinMaxScaler(), RobustScaler(), etc\n", "GridSearch=False # Turn on to fine tune parameters, but this will take a lot more time - maybe even days for some models. \n", @@ -750,8 +750,8 @@ "distribtion_percent = 0.5 \n", "\n", "geo_data=False # turn on to build latitude and longitude data into the dfs\n", - "uu_data=True # turn on to add reg_type 01 utenfor utvalg data to the dfs\n", - "rerun_ml=True # Turn on in order to run a machine learning model on the data. This may take time depending on the model, so if output exists already, turn this to false. \n", + "uu_data=False # turn on to add reg_type 01 utenfor utvalg data to the dfs\n", + "rerun_ml=False # Turn on in order to run a machine learning model on the data. This may take time depending on the model, so if output exists already, turn this to false. 
\n", "collect_data=False\n", "\n", "# skjema = 'RA-0174-1' for varehandel\n", @@ -766,21 +766,9 @@ "update_file, timeseries_knn_agg, timeseries_knn__kommune_agg, check_totals, check_manually, v_orgnr_list_for_imputering, til_bakken, unique_id_list, current_year_bad_oms = oppdateringsfil.create_bedrift_fil(year, model, rate, scaler, skjema, tosiffernaring, distribtion_percent, rerun_ml=rerun_ml, geo_data=geo_data, uu_data=uu_data, GridSearch=GridSearch, collect_data=collect_data)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "56", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "til_bakken.head()" - ] - }, { "cell_type": "markdown", - "id": "57", + "id": "56", "metadata": { "tags": [] }, @@ -792,7 +780,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58", + "id": "57", "metadata": { "tags": [] }, @@ -806,7 +794,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59", + "id": "58", "metadata": { "tags": [] }, @@ -818,7 +806,7 @@ { "cell_type": "code", "execution_count": null, - "id": "60", + "id": "59", "metadata": { "tags": [] }, @@ -830,7 +818,7 @@ { "cell_type": "code", "execution_count": null, - "id": "61", + "id": "60", "metadata": { "tags": [] }, @@ -890,7 +878,7 @@ { "cell_type": "code", "execution_count": null, - "id": "62", + "id": "61", "metadata": { "tags": [] }, @@ -904,7 +892,7 @@ }, { "cell_type": "markdown", - "id": "63", + "id": "62", "metadata": {}, "source": [ "---------------------------------------------------------------------------------------------------------------------------------" @@ -912,7 +900,7 @@ }, { "cell_type": "markdown", - "id": "64", + "id": "63", "metadata": {}, "source": [ "# \n", @@ -921,7 +909,7 @@ }, { "cell_type": "markdown", - "id": "65", + "id": "64", "metadata": {}, "source": [ "#### 2021 XGBOOST:\n", @@ -952,7 +940,7 @@ }, { "cell_type": "markdown", - "id": "66", + "id": "65", "metadata": {}, "source": [ "--------------------------------------------------------------------------------------------------------------------------------\n", @@ -967,7 +955,7 @@ }, { "cell_type": "markdown", - "id": "67", + "id": "66", "metadata": {}, "source": [ "# Co-operation with other sections" @@ -975,7 +963,7 @@ }, { "cell_type": "markdown", - "id": "68", + "id": "67", "metadata": {}, "source": [ "#### A lot of potential here:\n", @@ -994,7 +982,7 @@ }, { "cell_type": "markdown", - "id": "69", + "id": "68", "metadata": {}, "source": [ "# Network Analysis" @@ -1003,7 +991,7 @@ { "cell_type": "code", "execution_count": null, - "id": "70", + "id": "69", "metadata": { "tags": [] }, @@ -1028,7 +1016,7 @@ { "cell_type": "code", "execution_count": null, - "id": "71", + "id": "70", "metadata": { "tags": [] }, diff --git "a/src/notebooks/klargj\303\270ring.ipynb" "b/src/notebooks/klargj\303\270ring.ipynb" index 4e59557..c8fa77e 100644 --- "a/src/notebooks/klargj\303\270ring.ipynb" +++ "b/src/notebooks/klargj\303\270ring.ipynb" @@ -580,7 +580,7 @@ }, "outputs": [], "source": [ - "merged_df.shape" + "merged_df.dtypes" ] }, { @@ -592,7 +592,7 @@ }, "outputs": [], "source": [ - "good_df.shape" + "merged_df.head()" ] }, { @@ -604,7 +604,49 @@ }, "outputs": [], "source": [ - "onlygoodoms.shape" + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " f\"gs://ssb-prod-noeku-data-produkt/statistikkfiler/g2023/statistikkfil_bedrifter_pub.parquet\"\n", + " )\n", + " if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Use the ParquetDataset to read multiple files\n", + "dataset = pq.ParquetDataset(fil_path, 
filesystem=fs)\n", + "table = dataset.read()\n", + "\n", + "# Convert to Pandas DataFrame\n", + "bedrift_pub = table.to_pandas()\n", + "\n", + "del table, dataset\n", + "\n", + "# filter for when reg_type_f = 01\n", + "bedrift_pub = bedrift_pub[bedrift_pub['reg_type_f'] == '01']\n", + "bedrift_pub = bedrift_pub[bedrift_pub['type'] != 'S']\n", + "\n", + "bedrift_pub = bedrift_pub[['ts_forbruk', 'naring_f', 'orgnr_foretak', 'ts_salgsint', 'omsetning', 'nopost_p4005', 'nopost_driftskostnader', 'kommune', 'sysselsetting_syss', 'naring', 'orgnr_bedrift']]\n", + "\n", + "bedrift_pub['lopenr'] = 1\n", + "bedrift_pub['radnr'] = 1\n", + "\n", + "bedrift_pub['driftskostnader_percentage'] = 1\n", + "bedrift_pub['omsetning_percentage'] = 1\n", + "\n", + "# rename variables\n", + "\n", + "bedrift_pub.rename(columns={'ts_forbruk': 'forbruk', 'naring_f': 'nacef_5', 'orgnr_foretak': 'orgnr_n_1', 'ts_salgsint': 'salgsint', 'omsetning': 'foretak_omsetning', 'nopost_p4005': 'tmp_no_p4005', 'nopost_driftskostnader': 'foretak_driftskostnad', 'kommune': 'b_kommunenr', 'sysselsetting_syss': 'b_sysselsetting_syss', 'reg_type_f': 'regtype', 'naring': 'tmp_sn2007_5', 'orgnr_bedrift': 'v_orgnr'}, inplace=True)\n", + "\n", + "bedrift_pub['gjeldende_omsetn_kr'] = bedrift_pub['foretak_omsetning']\n", + "bedrift_pub['omsetn_kr'] = bedrift_pub['foretak_omsetning']\n", + "bedrift_pub['tot_oms_fordelt'] = bedrift_pub['foretak_omsetning']\n", + "bedrift_pub['driftskost_kr'] = bedrift_pub['foretak_driftskostnad']\n", + "bedrift_pub['gjeldende_driftsk_kr'] = bedrift_pub['foretak_driftskostnad']\n", + "bedrift_pub['tot_driftskost_fordelt'] = bedrift_pub['foretak_driftskostnad']\n", + "bedrift_pub['gjeldende_bdr_syss'] = bedrift_pub['b_sysselsetting_syss']\n", + "bedrift_pub['tmp_forbruk_bed'] = bedrift_pub['forbruk']\n", + "bedrift_pub['tmp_salgsint_bed'] = bedrift_pub['salgsint']\n", + "bedrift_pub['id'] = bedrift_pub['orgnr_n_1']" ] }, { @@ -616,7 +658,22 @@ }, "outputs": [], "source": [ - "bad_df.shape" + "fil_path = [\n", + " f\n", + " for f in fs.glob(\n", + " f\"gs://ssb-prod-noeku-data-produkt/statistikkfiler/g2022/statistikkfil_bedrifter_pub.parquet\"\n", + " )\n", + " if f.endswith(\".parquet\")\n", + "]\n", + "\n", + "# Use the ParquetDataset to read multiple files\n", + "dataset = pq.ParquetDataset(fil_path, filesystem=fs)\n", + "table = dataset.read()\n", + "\n", + "# Convert to Pandas DataFrame\n", + "bedrift_pub_x = table.to_pandas()\n", + "bedrift_pub_x.columns = bedrift_pub_x.columns.str.lower()\n", + "bedrift_pub_x.head()" ] }, { @@ -627,13 +684,142 @@ "tags": [] }, "outputs": [], + "source": [ + "del dataset, table\n", + "\n", + "bedrift_pub_x = bedrift_pub_x[bedrift_pub_x['reg_type_f'] == '01']\n", + "\n", + "\n", + "bedrift_pub_x = bedrift_pub_x[['orgnr_bedrift', 'sysselsetting_syss']]\n", + "\n", + "bedrift_pub_x.rename(columns={'sysselsetting_syss': 'fjor_syssel_t1', 'orgnr_bedrift': 'v_orgnr'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "bedrift_pub = pd.merge(bedrift_pub, bedrift_pub_x, how='left', on='v_orgnr')\n", + "\n", + "\n", + "# fill nan for fjor_syssel_t1 with 0\n", + "\n", + "bedrift_pub['fjor_syssel_t1'] = bedrift_pub['fjor_syssel_t1'].fillna(0)\n", + "\n", + "del bedrift_pub_x\n", + "\n", + "merged_df = pd.concat([merged_df, bedrift_pub])\n", + "\n", + "del bedrift_pub\n", + "\n", + "merged_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27", 
+ "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "merged_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# filter for when reg_type_f = 01\n", + "bedrift_pub = bedrift_pub[bedrift_pub['reg_type_f'] == '01']\n", + "bedrift_pub = bedrift_pub[bedrift_pub['type'] != 'S']\n", + "\n", + "\n", + "bedrift_pub = bedrift_pub[['enhets_id', 'ts_forbruk', 'naring_f', 'orgnr_foretak', 'ts_salgsint', 'omsetning', 'nopost_p4005', 'nopost_driftskostnader', 'kommune', 'sysselsetting_syss', 'naring', 'orgnr_bedrift']]\n", + "\n", + "bedrift_pub['lopenr'] = 1\n", + "bedrift_pub['radnr'] = 1\n", + "\n", + "bedrift_pub['driftskostnader_percentage'] = 1\n", + "bedrift_pub['omsetning_percentage'] = 1\n", + "\n", + "# rename variables\n", + "\n", + "bedrift_pub.rename(columns={'enhets_id': 'id', 'ts_forbruk': 'forbruk', 'naring_f': 'nacef_5', 'orgnr_foretak': 'orgnr_n_1', 'ts_salgsint': 'salgsint', 'omsetning': 'foretak_omsetning', 'nopost_p4005': 'tmp_no_p4005', 'nopost_driftskostnader': 'foretak_driftskostnad', 'kommune': 'b_kommunenr', 'sysselsetting_syss': 'b_sysselsetting_syss', 'reg_type_f': 'regtype', 'naring': 'tmp_sn2007_5', 'orgnr_bedrift': 'v_orgnr'}, inplace=True)\n", + "\n", + "bedrift_pub['gjeldende_omsetn_kr'] = bedrift_pub['foretak_omsetning']\n", + "bedrift_pub['omsetn_kr'] = bedrift_pub['foretak_omsetning']\n", + "bedrift_pub['tot_oms_fordelt'] = bedrift_pub['foretak_omsetning']\n", + "bedrift_pub['driftskost_kr'] = bedrift_pub['foretak_driftskostnad']\n", + "bedrift_pub['gjeldende_driftsk_kr'] = bedrift_pub['foretak_driftskostnad']\n", + "bedrift_pub['tot_driftskost_fordelt'] = bedrift_pub['foretak_driftskostnad']\n", + "bedrift_pub['gjeldende_bdr_syss'] = bedrift_pub['b_sysselsetting_syss']\n", + "bedrift_pub['tmp_forbruk_bed'] = bedrift_pub['forbruk']\n", + "bedrift_pub['tmp_salgsint_bed'] = bedrift_pub['salgsint']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "bedrift_pub.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "onlygoodoms.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "bad_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32", + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "bad_df.head()" ] }, { "cell_type": "markdown", - "id": "26", + "id": "33", "metadata": {}, "source": [ "good distribution delivered (both oms and dkost): 83.33%\n", @@ -644,7 +830,7 @@ }, { "cell_type": "markdown", - "id": "27", + "id": "34", "metadata": {}, "source": [ "# Step 1. 
\n", @@ -655,7 +841,7 @@ { "cell_type": "code", "execution_count": null, - "id": "28", + "id": "35", "metadata": { "tags": [] }, @@ -693,7 +879,7 @@ }, { "cell_type": "markdown", - "id": "29", + "id": "36", "metadata": {}, "source": [ " \n", @@ -706,7 +892,7 @@ { "cell_type": "code", "execution_count": null, - "id": "30", + "id": "37", "metadata": { "tags": [] }, @@ -720,7 +906,7 @@ { "cell_type": "code", "execution_count": null, - "id": "31", + "id": "38", "metadata": { "tags": [] }, @@ -732,7 +918,7 @@ { "cell_type": "code", "execution_count": null, - "id": "32", + "id": "39", "metadata": { "tags": [] }, @@ -775,7 +961,7 @@ }, { "cell_type": "markdown", - "id": "33", + "id": "40", "metadata": {}, "source": [ "# step 3 . \n", @@ -797,7 +983,7 @@ }, { "cell_type": "markdown", - "id": "34", + "id": "41", "metadata": { "tags": [] }, @@ -808,7 +994,7 @@ { "cell_type": "code", "execution_count": null, - "id": "35", + "id": "42", "metadata": { "tags": [] }, @@ -823,7 +1009,7 @@ }, { "cell_type": "markdown", - "id": "36", + "id": "43", "metadata": { "tags": [] }, @@ -834,7 +1020,7 @@ { "cell_type": "code", "execution_count": null, - "id": "37", + "id": "44", "metadata": { "tags": [] }, @@ -848,7 +1034,7 @@ }, { "cell_type": "markdown", - "id": "38", + "id": "45", "metadata": {}, "source": [ "## Inflasjon data" @@ -857,7 +1043,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39", + "id": "46", "metadata": { "tags": [] }, @@ -890,7 +1076,7 @@ { "cell_type": "code", "execution_count": null, - "id": "40", + "id": "47", "metadata": { "tags": [] }, @@ -911,7 +1097,7 @@ { "cell_type": "code", "execution_count": null, - "id": "41", + "id": "48", "metadata": { "tags": [] }, @@ -922,7 +1108,7 @@ }, { "cell_type": "markdown", - "id": "42", + "id": "49", "metadata": {}, "source": [ "# Merge files with different features\n", @@ -935,7 +1121,7 @@ { "cell_type": "code", "execution_count": null, - "id": "43", + "id": "50", "metadata": { "tags": [] }, @@ -1049,7 +1235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "44", + "id": "51", "metadata": { "tags": [] }, @@ -1061,7 +1247,7 @@ { "cell_type": "code", "execution_count": null, - "id": "45", + "id": "52", "metadata": { "tags": [] }, @@ -1124,7 +1310,7 @@ { "cell_type": "code", "execution_count": null, - "id": "46", + "id": "53", "metadata": { "tags": [] }, @@ -1140,7 +1326,7 @@ { "cell_type": "code", "execution_count": null, - "id": "47", + "id": "54", "metadata": { "tags": [] }, @@ -1152,7 +1338,7 @@ { "cell_type": "code", "execution_count": null, - "id": "48", + "id": "55", "metadata": { "tags": [] }, @@ -1171,7 +1357,7 @@ { "cell_type": "code", "execution_count": null, - "id": "49", + "id": "56", "metadata": { "tags": [] }, @@ -1184,7 +1370,7 @@ { "cell_type": "code", "execution_count": null, - "id": "50", + "id": "57", "metadata": { "tags": [] }, @@ -1196,7 +1382,7 @@ { "cell_type": "code", "execution_count": null, - "id": "51", + "id": "58", "metadata": { "tags": [] }, @@ -1218,7 +1404,7 @@ { "cell_type": "code", "execution_count": null, - "id": "52", + "id": "59", "metadata": { "tags": [] }, @@ -1234,7 +1420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "53", + "id": "60", "metadata": { "tags": [] }, @@ -1278,7 +1464,7 @@ { "cell_type": "code", "execution_count": null, - "id": "54", + "id": "61", "metadata": { "tags": [] }, @@ -1302,7 +1488,7 @@ { "cell_type": "code", "execution_count": null, - "id": "55", + "id": "62", "metadata": { "tags": [] }, @@ -1325,7 +1511,7 @@ { "cell_type": "code", "execution_count": 
null, - "id": "56", + "id": "63", "metadata": { "tags": [] }, @@ -1338,7 +1524,7 @@ { "cell_type": "code", "execution_count": null, - "id": "57", + "id": "64", "metadata": { "tags": [] }, @@ -1361,7 +1547,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58", + "id": "65", "metadata": { "tags": [] }, @@ -1373,7 +1559,7 @@ { "cell_type": "code", "execution_count": null, - "id": "59", + "id": "66", "metadata": { "tags": [] }, @@ -1385,7 +1571,7 @@ { "cell_type": "code", "execution_count": null, - "id": "60", + "id": "67", "metadata": { "tags": [] }, @@ -1400,7 +1586,7 @@ }, { "cell_type": "markdown", - "id": "61", + "id": "68", "metadata": { "tags": [] }, @@ -1411,7 +1597,7 @@ { "cell_type": "code", "execution_count": null, - "id": "62", + "id": "69", "metadata": { "tags": [] }, @@ -1446,7 +1632,7 @@ }, { "cell_type": "markdown", - "id": "63", + "id": "70", "metadata": { "tags": [] }, @@ -1457,7 +1643,7 @@ { "cell_type": "code", "execution_count": null, - "id": "64", + "id": "71", "metadata": { "tags": [] }, @@ -1504,7 +1690,7 @@ }, { "cell_type": "markdown", - "id": "65", + "id": "72", "metadata": { "tags": [] }, @@ -1515,7 +1701,7 @@ { "cell_type": "code", "execution_count": null, - "id": "66", + "id": "73", "metadata": { "tags": [] }, @@ -1561,7 +1747,7 @@ { "cell_type": "code", "execution_count": null, - "id": "67", + "id": "74", "metadata": { "tags": [] }, @@ -1607,7 +1793,7 @@ { "cell_type": "code", "execution_count": null, - "id": "68", + "id": "75", "metadata": { "tags": [] }, @@ -1630,7 +1816,7 @@ { "cell_type": "code", "execution_count": null, - "id": "69", + "id": "76", "metadata": { "tags": [] }, @@ -1643,7 +1829,7 @@ { "cell_type": "code", "execution_count": null, - "id": "70", + "id": "77", "metadata": { "tags": [] }, @@ -1656,7 +1842,7 @@ { "cell_type": "code", "execution_count": null, - "id": "71", + "id": "78", "metadata": { "tags": [] }, @@ -1677,7 +1863,7 @@ { "cell_type": "code", "execution_count": null, - "id": "72", + "id": "79", "metadata": { "tags": [] }, @@ -1702,7 +1888,7 @@ { "cell_type": "code", "execution_count": null, - "id": "73", + "id": "80", "metadata": { "tags": [] }, @@ -1729,7 +1915,7 @@ }, { "cell_type": "markdown", - "id": "74", + "id": "81", "metadata": { "tags": [] }, @@ -1743,7 +1929,7 @@ }, { "cell_type": "markdown", - "id": "75", + "id": "82", "metadata": { "tags": [] }, @@ -1757,7 +1943,7 @@ }, { "cell_type": "markdown", - "id": "76", + "id": "83", "metadata": { "tags": [] }, @@ -1771,7 +1957,7 @@ }, { "cell_type": "markdown", - "id": "77", + "id": "84", "metadata": { "tags": [] }, @@ -1781,7 +1967,7 @@ }, { "cell_type": "markdown", - "id": "78", + "id": "85", "metadata": { "tags": [] }, @@ -1792,7 +1978,7 @@ { "cell_type": "code", "execution_count": null, - "id": "79", + "id": "86", "metadata": { "tags": [] }, @@ -1806,7 +1992,7 @@ { "cell_type": "code", "execution_count": null, - "id": "80", + "id": "87", "metadata": { "tags": [] }, @@ -1823,7 +2009,7 @@ { "cell_type": "code", "execution_count": null, - "id": "81", + "id": "88", "metadata": { "tags": [] }, @@ -1835,7 +2021,7 @@ { "cell_type": "code", "execution_count": null, - "id": "82", + "id": "89", "metadata": { "tags": [] }, @@ -1855,7 +2041,7 @@ { "cell_type": "code", "execution_count": null, - "id": "83", + "id": "90", "metadata": { "tags": [] }, @@ -1868,7 +2054,7 @@ { "cell_type": "code", "execution_count": null, - "id": "84", + "id": "91", "metadata": { "tags": [] }, @@ -1880,7 +2066,7 @@ { "cell_type": "code", "execution_count": null, - "id": "85", + "id": "92", 
"metadata": { "tags": [] }, @@ -1905,7 +2091,7 @@ { "cell_type": "code", "execution_count": null, - "id": "86", + "id": "93", "metadata": { "tags": [] }, @@ -2000,7 +2186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "87", + "id": "94", "metadata": { "tags": [] }, @@ -2024,7 +2210,7 @@ { "cell_type": "code", "execution_count": null, - "id": "88", + "id": "95", "metadata": { "tags": [] }, @@ -2043,7 +2229,7 @@ { "cell_type": "code", "execution_count": null, - "id": "89", + "id": "96", "metadata": { "tags": [] }, @@ -2056,7 +2242,7 @@ { "cell_type": "code", "execution_count": null, - "id": "90", + "id": "97", "metadata": { "tags": [] }, @@ -2069,7 +2255,7 @@ { "cell_type": "code", "execution_count": null, - "id": "91", + "id": "98", "metadata": { "tags": [] }, @@ -2094,7 +2280,7 @@ { "cell_type": "code", "execution_count": null, - "id": "92", + "id": "99", "metadata": { "tags": [] }, @@ -2119,7 +2305,7 @@ }, { "cell_type": "markdown", - "id": "93", + "id": "100", "metadata": { "tags": [] }, @@ -2130,7 +2316,7 @@ { "cell_type": "code", "execution_count": null, - "id": "94", + "id": "101", "metadata": { "tags": [] }, @@ -2142,7 +2328,7 @@ { "cell_type": "code", "execution_count": null, - "id": "95", + "id": "102", "metadata": { "tags": [] }, @@ -2202,7 +2388,7 @@ { "cell_type": "code", "execution_count": null, - "id": "96", + "id": "103", "metadata": { "tags": [] }, @@ -2216,7 +2402,7 @@ { "cell_type": "code", "execution_count": null, - "id": "97", + "id": "104", "metadata": { "tags": [] }, @@ -2235,7 +2421,7 @@ { "cell_type": "code", "execution_count": null, - "id": "98", + "id": "105", "metadata": { "tags": [] }, @@ -2418,7 +2604,7 @@ }, { "cell_type": "markdown", - "id": "99", + "id": "106", "metadata": { "tags": [] }, @@ -2429,7 +2615,7 @@ { "cell_type": "code", "execution_count": null, - "id": "100", + "id": "107", "metadata": { "tags": [] }, @@ -2441,7 +2627,7 @@ { "cell_type": "code", "execution_count": null, - "id": "101", + "id": "108", "metadata": { "tags": [] }, @@ -2454,7 +2640,7 @@ { "cell_type": "code", "execution_count": null, - "id": "102", + "id": "109", "metadata": { "tags": [] }, @@ -2653,7 +2839,7 @@ { "cell_type": "code", "execution_count": null, - "id": "103", + "id": "110", "metadata": { "tags": [] }, @@ -2665,7 +2851,7 @@ { "cell_type": "code", "execution_count": null, - "id": "104", + "id": "111", "metadata": { "tags": [] }, @@ -2841,7 +3027,7 @@ { "cell_type": "code", "execution_count": null, - "id": "105", + "id": "112", "metadata": { "tags": [] }, @@ -2855,7 +3041,7 @@ }, { "cell_type": "markdown", - "id": "106", + "id": "113", "metadata": { "tags": [] }, @@ -2866,7 +3052,7 @@ { "cell_type": "code", "execution_count": null, - "id": "107", + "id": "114", "metadata": { "tags": [] }, @@ -2894,7 +3080,7 @@ { "cell_type": "code", "execution_count": null, - "id": "108", + "id": "115", "metadata": { "tags": [] }, @@ -2913,7 +3099,7 @@ { "cell_type": "code", "execution_count": null, - "id": "109", + "id": "116", "metadata": { "tags": [] }, @@ -2935,7 +3121,7 @@ { "cell_type": "code", "execution_count": null, - "id": "110", + "id": "117", "metadata": { "tags": [] }, @@ -2946,7 +3132,7 @@ }, { "cell_type": "markdown", - "id": "111", + "id": "118", "metadata": { "tags": [] }, @@ -3046,7 +3232,7 @@ }, { "cell_type": "markdown", - "id": "112", + "id": "119", "metadata": { "tags": [] }, @@ -3060,7 +3246,7 @@ { "cell_type": "code", "execution_count": null, - "id": "113", + "id": "120", "metadata": { "tags": [] }, @@ -3090,7 +3276,7 @@ }, { "cell_type": "markdown", - 
"id": "114", + "id": "121", "metadata": { "tags": [] }, @@ -3101,7 +3287,7 @@ }, { "cell_type": "markdown", - "id": "115", + "id": "122", "metadata": { "tags": [] }, @@ -3112,7 +3298,7 @@ { "cell_type": "code", "execution_count": null, - "id": "116", + "id": "123", "metadata": { "tags": [] }, @@ -3134,7 +3320,7 @@ }, { "cell_type": "markdown", - "id": "117", + "id": "124", "metadata": {}, "source": [ "### rette salgsint og forbruk" @@ -3142,7 +3328,7 @@ }, { "cell_type": "markdown", - "id": "118", + "id": "125", "metadata": { "tags": [] }, @@ -3181,7 +3367,7 @@ { "cell_type": "code", "execution_count": null, - "id": "119", + "id": "126", "metadata": { "tags": [] }, @@ -3217,7 +3403,7 @@ { "cell_type": "code", "execution_count": null, - "id": "120", + "id": "127", "metadata": { "tags": [] }, @@ -3245,7 +3431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "121", + "id": "128", "metadata": { "tags": [] }, @@ -3266,7 +3452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "122", + "id": "129", "metadata": { "tags": [] }, @@ -3288,7 +3474,7 @@ { "cell_type": "code", "execution_count": null, - "id": "123", + "id": "130", "metadata": { "tags": [] }, @@ -3313,7 +3499,7 @@ { "cell_type": "code", "execution_count": null, - "id": "124", + "id": "131", "metadata": { "tags": [] }, @@ -3345,7 +3531,7 @@ { "cell_type": "code", "execution_count": null, - "id": "125", + "id": "132", "metadata": { "tags": [] }, @@ -3359,7 +3545,7 @@ { "cell_type": "code", "execution_count": null, - "id": "126", + "id": "133", "metadata": { "tags": [] }, @@ -3376,7 +3562,7 @@ { "cell_type": "code", "execution_count": null, - "id": "127", + "id": "134", "metadata": { "tags": [] }, @@ -3398,7 +3584,7 @@ { "cell_type": "code", "execution_count": null, - "id": "128", + "id": "135", "metadata": { "tags": [] }, @@ -3446,7 +3632,7 @@ { "cell_type": "code", "execution_count": null, - "id": "129", + "id": "136", "metadata": { "tags": [] }, @@ -3474,7 +3660,7 @@ { "cell_type": "code", "execution_count": null, - "id": "130", + "id": "137", "metadata": { "tags": [] }, @@ -3498,7 +3684,7 @@ { "cell_type": "code", "execution_count": null, - "id": "131", + "id": "138", "metadata": { "tags": [] }, @@ -3573,7 +3759,7 @@ { "cell_type": "code", "execution_count": null, - "id": "132", + "id": "139", "metadata": { "tags": [] }, @@ -3589,7 +3775,7 @@ { "cell_type": "code", "execution_count": null, - "id": "133", + "id": "140", "metadata": { "tags": [] }, @@ -3616,7 +3802,7 @@ }, { "cell_type": "markdown", - "id": "134", + "id": "141", "metadata": {}, "source": [ "# Kontrol Drkost for forbuk og lønn. " @@ -3625,7 +3811,7 @@ { "cell_type": "code", "execution_count": null, - "id": "135", + "id": "142", "metadata": {}, "outputs": [], "source": [ @@ -3657,7 +3843,7 @@ }, { "cell_type": "markdown", - "id": "136", + "id": "143", "metadata": {}, "source": [ "### Create df with a list of orgnr_foretak that have thieves. 
" @@ -3666,7 +3852,7 @@ { "cell_type": "code", "execution_count": null, - "id": "137", + "id": "144", "metadata": { "tags": [] }, @@ -3681,7 +3867,7 @@ { "cell_type": "code", "execution_count": null, - "id": "138", + "id": "145", "metadata": { "tags": [] }, @@ -3714,7 +3900,7 @@ { "cell_type": "code", "execution_count": null, - "id": "139", + "id": "146", "metadata": { "tags": [] }, @@ -3730,7 +3916,7 @@ { "cell_type": "code", "execution_count": null, - "id": "140", + "id": "147", "metadata": { "tags": [] }, @@ -3753,7 +3939,7 @@ { "cell_type": "code", "execution_count": null, - "id": "141", + "id": "148", "metadata": { "tags": [] }, @@ -3767,7 +3953,7 @@ { "cell_type": "code", "execution_count": null, - "id": "142", + "id": "149", "metadata": { "tags": [] }, @@ -3799,7 +3985,7 @@ }, { "cell_type": "markdown", - "id": "143", + "id": "150", "metadata": { "tags": [] }, @@ -3911,7 +4097,7 @@ }, { "cell_type": "markdown", - "id": "144", + "id": "151", "metadata": {}, "source": [ "### Correct drkost for good_df. Have to make sure it passes with salary and forbruk " @@ -3919,7 +4105,7 @@ }, { "cell_type": "markdown", - "id": "145", + "id": "152", "metadata": {}, "source": [ "### Do the same for the mixed_dfs. (good oms, bad costs etc etc ) Will need to selectivly treat based on what is good and what isnt. Maybe its worth just fixing bad oms. Bad costs can be fixed based on oms" @@ -3927,7 +4113,7 @@ }, { "cell_type": "markdown", - "id": "146", + "id": "153", "metadata": {}, "source": [ "### Implement a the ml models for the bad_df. Maybe make it so you can choose which one to use. " @@ -3936,7 +4122,7 @@ { "cell_type": "code", "execution_count": null, - "id": "147", + "id": "154", "metadata": {}, "outputs": [], "source": []