diff --git a/playground/gradient-boosted-trees/README.md b/playground/gradient-boosted-trees/README.md
new file mode 100644
index 0000000..5a32129
--- /dev/null
+++ b/playground/gradient-boosted-trees/README.md
@@ -0,0 +1,28 @@
+# Gradient boosted trees
+
+## Installing Dependencies
+Install [eVE](../../pyVEs/eVE.yml) virtual environment
+```
+mamba activate eVE
+```
+
+## Data
+```
+python datasets_diabetes.py
+```
+
+## Gradient boosting regression
+```
+python gradient-boosting-regression.py
+```
+
+## Voting regression predictions
+```
+python voting-regressor.py
+```
+
+## References
+- https://github.com/benedekrozemberczki/awesome-gradient-boosting-papers
+- https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html
+- https://scikit-learn.org/stable/auto_examples/ensemble/plot_voting_regressor.html#sphx-glr-auto-examples-ensemble-plot-voting-regressor-py
+
diff --git a/playground/gradient-boosted-trees/datasets_diabetes.py b/playground/gradient-boosted-trees/datasets_diabetes.py
new file mode 100644
index 0000000..e141d23
--- /dev/null
+++ b/playground/gradient-boosted-trees/datasets_diabetes.py
@@ -0,0 +1,40 @@
+# Load the scikit-learn diabetes dataset, print a summary and plot its correlation heatmap.
+# https://colab.research.google.com/gist/DeepakNair93/573cc1d52f497c685b7a96ce37838dd5/untitled0.ipynb#scrollTo=kdHS_7x78c0M
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from sklearn.datasets import load_diabetes
+
+diabetes = load_diabetes()
+#X, y = load_diabetes(return_X_y=True)
+#Samples total 442
+#Dimensionality 10
+#Features real, -.2 < x < .2
+#Targets integer 25 - 346
+
+#print(diabetes.DESCR)
+print(diabetes.feature_names)  # checking the feature names
+print(diabetes.data.shape)  # checking the shape of data
+print(diabetes.target.shape)
+#print(diabetes.target)
+print(diabetes.target[:3])
+
+
+db_df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
+db_df['Progression'] = diabetes.target  # new column name 'Progression'
+#print(db_df.isna().sum())
+print(db_df.describe())
+# DataFrame.info() writes its report itself and returns None, so wrapping it
+# in print() would emit a stray "None" line after the report.
+db_df.info()
+
+corr = db_df.corr()
+plt.subplots(figsize=(8, 8))
+sns.heatmap(corr, cmap='RdYlGn', annot=True)
+plt.show()
+
+
+# This plot shows the linear correlation between the variables within themselves & also variables with the target 'Progression'.
+# This could be a phase where the variables which are multicollinear can be eliminated.
+# https://medium.com/@hammad.ai/3-ways-to-detect-multicollinearity-in-your-dataset-6ee1776b7aa8
diff --git a/playground/gradient-boosted-trees/gradient-boosting-regression.py b/playground/gradient-boosted-trees/gradient-boosting-regression.py
new file mode 100644
index 0000000..4aaf738
--- /dev/null
+++ b/playground/gradient-boosted-trees/gradient-boosting-regression.py
@@ -0,0 +1,83 @@
+# https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html
+# Author: Peter Prettenhofer
+#         Maria Telenczuk
+#         Katrina Ni
+#
+# License: BSD 3 clause
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn import datasets, ensemble
+from sklearn.inspection import permutation_importance
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import train_test_split
+
+diabetes = datasets.load_diabetes()
+X, y = diabetes.data, diabetes.target
+
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.1, random_state=13
+)
+
+params = {
+    "n_estimators": 500,
+    "max_depth": 4,
+    "min_samples_split": 5,
+    "learning_rate": 0.01,
+    "loss": "squared_error",
+}
+
+# Fit regression model
+reg = ensemble.GradientBoostingRegressor(**params)
+reg.fit(X_train, y_train)
+
+mse = mean_squared_error(y_test, reg.predict(X_test))
+print("The mean squared error (MSE) on test set: {:.4f}".format(mse))
+
+# Plot training deviance
+test_score = np.zeros((params["n_estimators"],), dtype=np.float64)
+for i, y_pred in enumerate(reg.staged_predict(X_test)):
+    test_score[i] = mean_squared_error(y_test, y_pred)
+
+fig = plt.figure(figsize=(6, 6))
+plt.subplot(1, 1, 1)
+plt.title("Deviance")
+plt.plot(
+    np.arange(params["n_estimators"]) + 1,
+    reg.train_score_,
+    "b-",
+    label="Training Set Deviance",
+)
+plt.plot(
+    np.arange(params["n_estimators"]) + 1, test_score, "r-", label="Test Set Deviance"
+)
+plt.legend(loc="upper right")
+plt.xlabel("Boosting Iterations")
+plt.ylabel("Deviance")
+fig.tight_layout()
+plt.show()
+
+# Plot feature importance
+feature_importance = reg.feature_importances_
+sorted_idx = np.argsort(feature_importance)
+pos = np.arange(sorted_idx.shape[0]) + 0.5
+fig = plt.figure(figsize=(12, 6))
+plt.subplot(1, 2, 1)
+plt.barh(pos, feature_importance[sorted_idx], align="center")
+plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx])
+plt.title("Feature Importance (MDI)")
+
+result = permutation_importance(
+    reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
+)
+sorted_idx = result.importances_mean.argsort()
+plt.subplot(1, 2, 2)
+plt.boxplot(
+    result.importances[sorted_idx].T,
+    vert=False,
+    labels=np.array(diabetes.feature_names)[sorted_idx],
+)
+plt.title("Permutation Importance (test set)")
+fig.tight_layout()
+plt.show()
diff --git a/playground/gradient-boosted-trees/voting-regressor.py b/playground/gradient-boosted-trees/voting-regressor.py
new file mode 100644
index 0000000..9e7d6d9
--- /dev/null
+++ b/playground/gradient-boosted-trees/voting-regressor.py
@@ -0,0 +1,48 @@
+import matplotlib.pyplot as plt
+
+from sklearn.datasets import load_diabetes
+from sklearn.ensemble import (
+    GradientBoostingRegressor,
+    RandomForestRegressor,
+    VotingRegressor,
+)
+from sklearn.linear_model import LinearRegression
+
+
+X, y = load_diabetes(return_X_y=True)
+
+# Train regressors
+reg1 = GradientBoostingRegressor(random_state=1)
+reg2 = RandomForestRegressor(random_state=1)
+reg3 = LinearRegression()
+
+reg1.fit(X, y)
+reg2.fit(X, y)
+reg3.fit(X, y)
+
+ereg = VotingRegressor([("gb", reg1), ("rf", reg2), ("lr", reg3)])
+ereg.fit(X, y)
+
+# Making predictions on the first 100 training samples
+xt = X[:100]
+
+pred1 = reg1.predict(xt)
+pred2 = reg2.predict(xt)
+pred3 = reg3.predict(xt)
+pred4 = ereg.predict(xt)
+
+
+# Plot the results
+plt.figure()
+plt.plot(pred1, "gd", label="GradientBoostingRegressor")
+plt.plot(pred2, "b^", label="RandomForestRegressor")
+plt.plot(pred3, "ys", label="LinearRegression")
+plt.plot(pred4, "r*", ms=10, label="VotingRegressor")
+
+plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
+plt.ylabel("predicted")
+plt.xlabel("training samples")
+plt.legend(loc="best")
+plt.title("Regressor predictions and their average")
+
+plt.show()
diff --git a/playground/xgboost/README.md b/playground/xgboost/README.md
index 005eccb..84fb9d0 100644
--- a/playground/xgboost/README.md
+++ b/playground/xgboost/README.md
@@ -1,8 +1,5 @@
 # xgboost.XGBClassifier
 
-## Installing Dependencies
-Install [eVE](../../../pyVEs/eVE.yml) virtual environment
-
 ## Running notebooks
 ```
 mamba activate eVE
diff --git a/pyVEs/eVE.yml b/pyVEs/eVE.yml
index 11a1398..7686fcc 100644
--- a/pyVEs/eVE.yml
+++ b/pyVEs/eVE.yml
@@ -27,6 +27,7 @@ dependencies:
   - tqdm #https://github.com/tqdm/tqdm/tags
   - pylint #https://github.com/pylint-dev/pylint/tags
   - seaborn #https://github.com/mwaskom/seaborn/tags
+  - pandas #https://github.com/pandas-dev/pandas/tags
   ### VERSIONS of pyside6: https://pypi.org/project/PySide6/#history
   #- pyside6>=6.4.2
   ### VERSIONS of VTK https://gitlab.kitware.com/vtk/vtk/-/tags
@@ -41,7 +42,6 @@ dependencies:
   #- matplotlib
   #- scikit-learn
   #- notebook
-  #- pandas
   #- seaborn
   #- araviq6
   #- civiq6