61 - adds playground/gradient-boosted-trees #62

Open · wants to merge 1 commit into base: main
22 changes: 22 additions & 0 deletions playground/gradient-boosted-trees/README.md
@@ -0,0 +1,22 @@
# Gradient boosted trees

## Installing Dependencies
Install the [eVE](../../../pyVEs/eVE.yml) virtual environment and activate it:
```
mamba env create -f pyVEs/eVE.yml
mamba activate eVE
```

## Data
Explore the diabetes dataset and its feature correlations:
```
python datasets_diabetes.py
```

## Gradient boosting and voting regression predictions
```
python gradient-boosting-regression.py
python voting-regressor.py
```

## References
* https://github.com/benedekrozemberczki/awesome-gradient-boosting-papers
* https://scikit-learn.org/stable/auto_examples/ensemble/plot_voting_regressor.html#sphx-glr-auto-examples-ensemble-plot-voting-regressor-py

38 changes: 38 additions & 0 deletions playground/gradient-boosted-trees/datasets_diabetes.py
@@ -0,0 +1,38 @@
# https://colab.research.google.com/gist/DeepakNair93/573cc1d52f497c685b7a96ce37838dd5/untitled0.ipynb#scrollTo=kdHS_7x78c0M

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
# X, y = load_diabetes(return_X_y=True)
# Dataset summary (from the scikit-learn docs):
#   samples total: 442
#   dimensionality: 10
#   features: real, -0.2 < x < 0.2
#   targets: integer, 25 - 346

# print(diabetes.DESCR)
print(diabetes.feature_names)  # feature names
print(diabetes.data.shape)     # shape of the data matrix
print(diabetes.target.shape)   # shape of the target vector
# print(diabetes.target)
print(diabetes.target[:3])     # first three target values


db_df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
db_df['Progression'] = diabetes.target  # add the target as a 'Progression' column
# print(db_df.isna().sum())
print(db_df.describe())
print(db_df.info())

corr = db_df.corr()
plt.subplots(figsize=(8, 8))
sns.heatmap(corr, cmap='RdYlGn', annot=True)
plt.show()


# This heatmap shows the pairwise linear correlations among the features and
# between each feature and the target 'Progression'. At this stage, strongly
# multicollinear features could be identified and eliminated; one detection
# method is sketched below. See also:
# https://medium.com/@hammad.ai/3-ways-to-detect-multicollinearity-in-your-dataset-6ee1776b7aa8
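
# A minimal sketch (an addition, not part of the original script) of one such
# detection method: the variance inflation factor (VIF). Assumes statsmodels is
# available in the environment.
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.Series(
    [variance_inflation_factor(db_df[diabetes.feature_names].values, i)
     for i in range(len(diabetes.feature_names))],
    index=diabetes.feature_names,
)
# Rule of thumb: VIF above ~5-10 flags a feature as problematically collinear.
print(vif.sort_values(ascending=False))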


99 changes: 99 additions & 0 deletions playground/gradient-boosted-trees/gradient-boosting-regression.py
@@ -0,0 +1,99 @@
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html
# Author: Peter Prettenhofer <[email protected]>
# Maria Telenczuk <https://github.com/maikia>
# Katrina Ni <https://github.com/nilichen>
#
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np

from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=13
)

params = {
    "n_estimators": 500,      # number of boosting stages (trees)
    "max_depth": 4,           # maximum depth of each individual tree
    "min_samples_split": 5,   # minimum samples required to split a node
    "learning_rate": 0.01,    # shrinkage applied to each tree's contribution
    "loss": "squared_error",  # least-squares regression loss
}

# Fit regression model
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

mse = mean_squared_error(y_test, reg.predict(X_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# Plot training deviance
test_score = np.zeros((params["n_estimators"],), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    test_score[i] = mean_squared_error(y_test, y_pred)

fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title("Deviance")
plt.plot(
    np.arange(params["n_estimators"]) + 1,
    reg.train_score_,
    "b-",
    label="Training Set Deviance",
)
plt.plot(
    np.arange(params["n_estimators"]) + 1, test_score, "r-", label="Test Set Deviance"
)
plt.legend(loc="upper right")
plt.xlabel("Boosting Iterations")
plt.ylabel("Deviance")
fig.tight_layout()
plt.show()
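
# A minimal sketch, also not in the upstream example: the deviance curves above
# typically flatten well before 500 iterations, so the number of boosting
# iterations can be chosen automatically with built-in early stopping.
reg_es = ensemble.GradientBoostingRegressor(
    **{**params, "n_estimators": 1000},
    n_iter_no_change=10,      # stop after 10 iterations without validation improvement
    validation_fraction=0.1,  # fraction of training data held out internally
)
reg_es.fit(X_train, y_train)
print("Early stopping selected", reg_es.n_estimators_, "boosting iterations")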

# Plot feature importance
feature_importance = reg.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx])
plt.title("Feature Importance (MDI)")

result = permutation_importance(
    reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=np.array(diabetes.feature_names)[sorted_idx],
)
plt.title("Permutation Importance (test set)")
fig.tight_layout()
plt.show()
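
# A hedged aside (not in the upstream example): MDI (left panel) is computed
# from training-time impurity decreases and can favour high-cardinality
# features, while permutation importance is measured on held-out data, so the
# two rankings need not agree. A quick check:
mdi_order = np.argsort(reg.feature_importances_)
perm_order = result.importances_mean.argsort()
print("Importance rankings agree:", np.array_equal(mdi_order, perm_order))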

50 changes: 50 additions & 0 deletions playground/gradient-boosted-trees/voting-regressor.py
@@ -0,0 +1,50 @@
import matplotlib.pyplot as plt

from sklearn.datasets import load_diabetes
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    VotingRegressor,
)
from sklearn.linear_model import LinearRegression


X, y = load_diabetes(return_X_y=True)

# Train regressors
reg1 = GradientBoostingRegressor(random_state=1)
reg2 = RandomForestRegressor(random_state=1)
reg3 = LinearRegression()

reg1.fit(X, y)
reg2.fit(X, y)
reg3.fit(X, y)

ereg = VotingRegressor([("gb", reg1), ("rf", reg2), ("lr", reg3)])
ereg.fit(X, y)

# Make predictions
xt = X[:100]

pred1 = reg1.predict(xt)
pred2 = reg2.predict(xt)
pred3 = reg3.predict(xt)
pred4 = ereg.predict(xt)


# Plot the results
plt.figure()
plt.plot(pred1, "gd", label="GradientBoostingRegressor")
plt.plot(pred2, "b^", label="RandomForestRegressor")
plt.plot(pred3, "ys", label="LinearRegression")
plt.plot(pred4, "r*", ms=10, label="VotingRegressor")

plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
plt.ylabel("predicted")
plt.xlabel("training samples")
plt.legend(loc="best")
plt.title("Regressor predictions and their average")

plt.show()
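
# A minimal sketch (an addition to the upstream example): the plot above scores
# the regressors on training samples, so a fairer comparison of the ensemble
# against its members uses cross-validated R^2.
from sklearn.model_selection import cross_val_score

for name, est in [("gb", reg1), ("rf", reg2), ("lr", reg3), ("voting", ereg)]:
    scores = cross_val_score(est, X, y, cv=5, scoring="r2")
    print("{}: R^2 = {:.3f} +/- {:.3f}".format(name, scores.mean(), scores.std()))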


3 changes: 0 additions & 3 deletions playground/xgboost/README.md
@@ -1,8 +1,5 @@
# xgboost.XGBClassifier

## Installing Dependencies
Install [eVE](../../../pyVEs/eVE.yml) virtual environment

## Running notebooks
```
mamba activate eVE
2 changes: 1 addition & 1 deletion pyVEs/eVE.yml
@@ -27,6 +27,7 @@ dependencies:
- tqdm #https://github.com/tqdm/tqdm/tags
- pylint #https://github.com/pylint-dev/pylint/tags
- seaborn #https://github.com/mwaskom/seaborn/tags
- pandas
### VERSIONS of pyside6: https://pypi.org/project/PySide6/#history
#- pyside6>=6.4.2
### VERSIONS of VTK https://gitlab.kitware.com/vtk/vtk/-/tags
@@ -41,7 +42,6 @@ dependencies:
#- matplotlib
#- scikit-learn
#- notebook
#- pandas
#- seaborn
#- araviq6
#- civiq6