It’s been a while since I last visited the UCI Machine Learning Repository. So, let’s have a look at a recently uploaded dataset, the QSAR fish toxicity dataset, which originates from work by Cassotti et al. (2015).
Contents
Dataset exploration
Let’s dive into it:
import time
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_squared_log_error, median_absolute_error
import sklearn.metrics
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from skgarden.mondrian import MondrianForestRegressor
import xgboost
# Column names per the UCI dataset description: six molecular descriptors
# plus the target LC50 (the CSV ships without a header row).
columns = ["CIC0","SM1_Dz", "GATS1i", "NdsCH", "NdssC", "MLOGP", "LC50"]
# Semicolon-delimited file; `names=` supplies the missing header.
inputData = pd.read_csv("./data/qsar_fish_toxicity.csv",delimiter=";",names=columns)
# `display` is the IPython/Jupyter rich renderer — this code is a notebook cell.
display(inputData.sample(10))
display(inputData.describe())
CIC0 | SM1_Dz | GATS1i | NdsCH | NdssC | MLOGP | LC50 | |
---|---|---|---|---|---|---|---|
556 | 4.436 | 0.764 | 1.227 | 0 | 1 | 5.258 | 5.412 |
244 | 3.191 | 0.223 | 0.847 | 0 | 0 | 2.859 | 4.967 |
828 | 3.247 | 0.874 | 1.221 | 0 | 1 | 2.659 | 4.262 |
341 | 4.037 | 0.134 | 1.563 | 0 | 0 | 3.806 | 5.180 |
455 | 3.333 | 0.405 | 0.963 | 1 | 0 | 2.974 | 5.630 |
549 | 4.443 | 0.251 | 1.287 | 4 | 3 | 4.453 | 6.483 |
609 | 2.110 | 0.580 | 0.805 | 0 | 0 | 1.610 | 3.391 |
335 | 2.479 | 0.887 | 0.954 | 0 | 0 | 1.748 | 4.458 |
247 | 2.164 | 1.437 | 1.227 | 0 | 2 | 2.573 | 6.207 |
509 | 2.728 | 0.134 | 0.923 | 0 | 0 | 2.102 | 3.420 |
CIC0 | SM1_Dz | GATS1i | NdsCH | NdssC | MLOGP | LC50 | |
---|---|---|---|---|---|---|---|
count | 908.000000 | 908.000000 | 908.000000 | 908.000000 | 908.000000 | 908.000000 | 908.000000 |
mean | 2.898129 | 0.628468 | 1.293591 | 0.229075 | 0.485683 | 2.109285 | 4.064431 |
std | 0.756088 | 0.428459 | 0.394303 | 0.605335 | 0.861279 | 1.433181 | 1.455698 |
min | 0.667000 | 0.000000 | 0.396000 | 0.000000 | 0.000000 | -2.884000 | 0.053000 |
25% | 2.347000 | 0.223000 | 0.950750 | 0.000000 | 0.000000 | 1.209000 | 3.151750 |
50% | 2.934000 | 0.570000 | 1.240500 | 0.000000 | 0.000000 | 2.127000 | 3.987500 |
75% | 3.407000 | 0.892750 | 1.562250 | 0.000000 | 1.000000 | 3.105000 | 4.907500 |
max | 5.926000 | 2.171000 | 2.920000 | 4.000000 | 6.000000 | 6.515000 | 9.612000 |
That doesn’t look too promising.
Brute force approach
Let’s try to brute force it.
# Separate the regression target (LC50) from the molecular descriptors.
y = inputData["LC50"].copy(deep=True)
X = inputData.drop(["LC50"], axis=1)
# Scale every feature into [-1, 1] by its maximum absolute value.
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X)
# BUG FIX: the original called train_test_split on the unscaled X, so the
# fitted scaler above was dead code. Split the scaled matrix instead.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)
# Keyed container so additional train/test variants could be added later.
datasets = {0: {'X_train': X_train,
                'X_test' : X_test,
                'y_train': y_train,
                'y_test' : y_test}}
def train_test_linear_regression(X_train,
                                 X_test,
                                 y_train,
                                 y_test,
                                 cv_count,
                                 scorer,
                                 dataset_id):
    """Grid-search a LinearRegression model and evaluate it on the test set.

    Parameters: train/test split of features and target, the number of CV
    folds (`cv_count`), a scoring spec for GridSearchCV (`scorer`) and a
    `dataset_id` tag echoed into the result.

    Returns a dict with the fitted search object, test predictions,
    R2/MSE/MAE/MedAE metrics and training/prediction wall-clock times.
    """
    linear_regression = LinearRegression()
    grid_parameters_linear_regression = {'fit_intercept' : [False, True]}
    start_time = time.time()
    grid_obj = GridSearchCV(linear_regression,
                            param_grid=grid_parameters_linear_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_model = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_model.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # The target is never scaled in this notebook, so the "*_true_scale"
    # values equal the plain metrics; the dead `if 1==1:` rescaling
    # scaffolding and duplicate metric computations were removed.
    return {'Regression type' : 'Linear Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}
def train_test_bRidge_regression(X_train,
                                 X_test,
                                 y_train,
                                 y_test,
                                 cv_count,
                                 scorer,
                                 dataset_id):
    """Grid-search a BayesianRidge model and evaluate it on the test set.

    Parameters: train/test split of features and target, the number of CV
    folds (`cv_count`), a scoring spec for GridSearchCV (`scorer`) and a
    `dataset_id` tag echoed into the result.

    Returns a dict with the fitted search object, test predictions,
    R2/MSE/MAE/MedAE metrics and training/prediction wall-clock times.
    """
    bRidge_regression = BayesianRidge()
    grid_parameters_BayesianRidge_regression = {'fit_intercept' : [False, True],
                                                'n_iter' : [300, 1000, 5000]}
    start_time = time.time()
    grid_obj = GridSearchCV(bRidge_regression,
                            param_grid=grid_parameters_BayesianRidge_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    # Renamed from the copy-pasted `best_linear_regression`: this is a
    # Bayesian ridge estimator, not a plain linear regression.
    best_model = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_model.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # Target is unscaled, so true-scale metrics equal the plain ones;
    # the dead `if 1==1:` block was removed.
    return {'Regression type' : 'Bayesian Ridge Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}
def train_test_decision_tree_regression(X_train,
                                        X_test,
                                        y_train,
                                        y_test,
                                        cv_count,
                                        scorer,
                                        dataset_id):
    """Grid-search a DecisionTreeRegressor and evaluate it on the test set.

    Parameters: train/test split of features and target, the number of CV
    folds (`cv_count`), a scoring spec for GridSearchCV (`scorer`) and a
    `dataset_id` tag echoed into the result.

    Returns a dict with the fitted search object, test predictions,
    R2/MSE/MAE/MedAE metrics and training/prediction wall-clock times.
    """
    decision_tree_regression = DecisionTreeRegressor(random_state=42)
    grid_parameters_decision_tree_regression = {'max_depth' : [None, 3, 5, 7, 9, 10, 11]}
    start_time = time.time()
    grid_obj = GridSearchCV(decision_tree_regression,
                            param_grid=grid_parameters_decision_tree_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    # Renamed from the copy-pasted `best_linear_regression` (it is a tree).
    best_model = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_model.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # Target is unscaled, so true-scale metrics equal the plain ones;
    # the dead `if 1==1:` block was removed.
    return {'Regression type' : 'Decision Tree Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}
def train_test_knn_regression(X_train,
                              X_test,
                              y_train,
                              y_test,
                              cv_count,
                              scorer,
                              dataset_id):
    """Grid-search a KNeighborsRegressor and evaluate it on the test set.

    Parameters: train/test split of features and target, the number of CV
    folds (`cv_count`), a scoring spec for GridSearchCV (`scorer`) and a
    `dataset_id` tag echoed into the result.

    Returns a dict with the fitted search object, test predictions,
    R2/MSE/MAE/MedAE metrics and training/prediction wall-clock times.
    """
    knn_regression = KNeighborsRegressor()
    grid_parameters_knn_regression = {'n_neighbors' : [1, 2, 3],
                                      'weights' : ['uniform', 'distance'],
                                      'algorithm' : ['ball_tree', 'kd_tree'],
                                      'leaf_size' : [30, 90, 100, 110],
                                      # p=1 Manhattan, p=2 Euclidean distance.
                                      'p' : [1, 2]}
    start_time = time.time()
    grid_obj = GridSearchCV(knn_regression,
                            param_grid=grid_parameters_knn_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    # Renamed from the copy-pasted `best_linear_regression` (it is a KNN model).
    best_model = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_model.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # Target is unscaled, so true-scale metrics equal the plain ones;
    # the dead `if 1==1:` block was removed.
    return {'Regression type' : 'KNN Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}
def train_test_SVR_regression(X_train,
                              X_test,
                              y_train,
                              y_test,
                              cv_count,
                              scorer,
                              dataset_id):
    """Grid-search a LinearSVR model and evaluate it on the test set.

    Parameters: train/test split of features and target, the number of CV
    folds (`cv_count`), a scoring spec for GridSearchCV (`scorer`) and a
    `dataset_id` tag echoed into the result.

    Returns a dict with the fitted search object, test predictions,
    R2/MSE/MAE/MedAE metrics and training/prediction wall-clock times.
    """
    # random_state added for reproducibility, consistent with the other
    # stochastic estimators in this notebook (tree/forest use 42 too).
    SVR_regression = LinearSVR(random_state=42)
    grid_parameters_SVR_regression = {'C' : [1, 10, 50],
                                      'epsilon' : [0.01, 0.1],
                                      'fit_intercept' : [False, True]}
    start_time = time.time()
    grid_obj = GridSearchCV(SVR_regression,
                            param_grid=grid_parameters_SVR_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    # Renamed from the copy-pasted `best_linear_regression` (it is an SVR).
    best_model = grid_fit.best_estimator_
    inference_start = time.time()  # original had a typo: `infStarTime`
    prediction = best_model.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # Target is unscaled, so true-scale metrics equal the plain ones;
    # the dead `if 1==1:` block was removed.
    return {'Regression type' : 'Linear SVM Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}
def train_test_random_forest_regression(X_train,
                                        X_test,
                                        y_train,
                                        y_test,
                                        cv_count,
                                        scorer,
                                        dataset_id):
    """Grid-search a RandomForestRegressor and evaluate it on the test set.

    Parameters: train/test split of features and target, the number of CV
    folds (`cv_count`), a scoring spec for GridSearchCV (`scorer`) and a
    `dataset_id` tag echoed into the result.

    Returns a dict with the fitted search object, test predictions,
    R2/MSE/MAE/MedAE metrics and training/prediction wall-clock times.
    """
    random_forest_regression = RandomForestRegressor(random_state=42)
    grid_parameters_random_forest_regression = {'n_estimators' : [3, 5, 10, 15, 18],
                                                'max_depth' : [None, 2, 3, 5, 7, 9]}
    start_time = time.time()
    grid_obj = GridSearchCV(random_forest_regression,
                            param_grid=grid_parameters_random_forest_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    # Renamed from the copy-pasted `best_linear_regression` (it is a forest).
    best_model = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_model.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # Target is unscaled, so true-scale metrics equal the plain ones;
    # the dead `if 1==1:` block was removed.
    return {'Regression type' : 'Random Forest Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}
def train_test_mondrian_forest_regression(X_train,
                                          X_test,
                                          y_train,
                                          y_test,
                                          cv_count,
                                          scorer,
                                          dataset_id):
    """Grid-search a MondrianForestRegressor (scikit-garden) and evaluate it.

    Parameters: train/test split of features and target, the number of CV
    folds (`cv_count`), a scoring spec for GridSearchCV (`scorer`) and a
    `dataset_id` tag echoed into the result.

    Returns a dict with the fitted search object, test predictions,
    R2/MSE/MAE/MedAE metrics and training/prediction wall-clock times.
    """
    mondrian_forest_regression = MondrianForestRegressor(random_state=42)
    grid_parameters_mondrian_forest_regression = {'n_estimators' : [3, 5, 10, 15, 18, 100],
                                                  'max_depth' : [None, 2, 3, 5, 7, 9, 25, 50]}
    start_time = time.time()
    grid_obj = GridSearchCV(mondrian_forest_regression,
                            param_grid=grid_parameters_mondrian_forest_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    # Renamed from the copy-pasted `best_linear_regression` (it is a forest).
    best_model = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_model.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # Target is unscaled, so true-scale metrics equal the plain ones;
    # the dead `if 1==1:` block was removed.
    return {'Regression type' : 'Mondrian Forest Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}
def xgboost_regression(X_train,
                       X_test,
                       y_train,
                       y_test,
                       cv_count,
                       scorer,
                       dataset_id):
    """Grid-search an XGBRegressor and evaluate it on the test set.

    Parameters: train/test split of features and target, the number of CV
    folds (`cv_count`), a scoring spec for GridSearchCV (`scorer`) and a
    `dataset_id` tag echoed into the result.

    Returns a dict with the fitted search object, test predictions,
    R2/MSE/MAE/MedAE metrics and training/prediction wall-clock times.
    """
    x_gradient_boosting_regression = xgboost.XGBRegressor(random_state=42)
    grid_parameters_x_gradient_boosting_regression = {
        'n_estimators' : [3, 5, 10, 15, 50, 60, 80, 100, 200, 300],
        'max_depth' : [1, 2, 3, 5, 7, 9, 10, 11, 15],
        'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 0.15, 0.2, 0.8, 1.0],
    }
    start_time = time.time()
    grid_obj = GridSearchCV(x_gradient_boosting_regression,
                            param_grid=grid_parameters_x_gradient_boosting_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    # Renamed from the copy-pasted `best_linear_regression` (it is a booster).
    best_model = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_model.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # Target is unscaled, so true-scale metrics equal the plain ones;
    # the dead `if 1==1:` block was removed.
    return {'Regression type' : 'XGBoost Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}
# make scorer — optimize for (negated) MSE during the grid searches
scorer = 'neg_mean_squared_error'
results = {}
counter = 0
cv_count = 5
# Table of (trainer function, progress message). This replaces eight
# copy-pasted call sites that differed only in the function and message.
trainers = [
    (train_test_linear_regression, "Linear Regression completed"),
    (train_test_bRidge_regression, "Bayesian Ridge Regression completed"),
    (train_test_decision_tree_regression, "Decision Trees completed"),
    (train_test_knn_regression, "KNN completed"),
    (train_test_SVR_regression, "SVR completed"),
    (train_test_random_forest_regression, "Random Forest completed"),
    (train_test_mondrian_forest_regression, "Mondrian Forest completed"),
    (xgboost_regression, "XGBoost completed"),
]
for dataset in [0]:
    split = datasets[dataset]
    X_train, X_test, y_train, y_test = (split['X_train'], split['X_test'],
                                        split['y_train'], split['y_test'])
    for trainer, message in trainers:
        results[counter] = trainer(X_train,
                                   X_test,
                                   y_train,
                                   y_test,
                                   cv_count,
                                   scorer,
                                   dataset)
        print(message)
        counter += 1
results_df = pd.DataFrame.from_dict(results, orient='index')
display(results_df)
results_df.to_csv('results_df_manual.csv')
# BUG FIX: the original `pickle.dump(results, open(...))` leaked the file
# handle; a context manager guarantees it is flushed and closed.
with open('results_manual.p', 'wb') as results_file:
    pickle.dump(results, results_file)
Regression type | model | Predictions | R2 | MSE | MAE | MSE_true_scale | RMSE_true_scale | MAE_true_scale | MedAE_true_scale | Training time | Prediction time | dataset | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Linear Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [3.6962392905063024, 5.240629218487905, 2.6232... | 0.568906 | 1.002387 | 0.710787 | 1.002387 | 1.001193 | 0.710787 | 0.481504 | 1.525417 | 0.001327 | 0 |
1 | Bayesian Ridge Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [3.6822112893645746, 5.241252665899271, 2.6327... | 0.569596 | 1.000782 | 0.710268 | 1.000782 | 1.000391 | 0.710268 | 0.488724 | 0.116765 | 0.001265 | 0 |
2 | Decision Tree Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [3.8589999999999995, 5.3370588235294125, 2.658... | 0.435099 | 1.313516 | 0.821030 | 1.313516 | 1.146087 | 0.821030 | 0.594059 | 0.141535 | 0.001082 | 0 |
3 | KNN Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [3.9755878644528235, 5.681003065265153, 3.5645... | 0.522664 | 1.109909 | 0.771125 | 1.109909 | 1.053522 | 0.771125 | 0.562378 | 0.662403 | 0.003900 | 0 |
4 | Linear SVM Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [3.4797490882683526, 5.379094823589719, 2.4501... | 0.548842 | 1.049039 | 0.714117 | 1.049039 | 1.024226 | 0.714117 | 0.444056 | 0.574919 | 0.001178 | 0 |
5 | Random Forest Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [4.322425925925926, 5.6754999999999995, 3.4766... | 0.565839 | 1.009516 | 0.722748 | 1.009516 | 1.004747 | 0.722748 | 0.533503 | 0.924254 | 0.002564 | 0 |
6 | Mondrian Forest Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [4.032608087062836, 5.612142534255981, 3.55935... | 0.545763 | 1.056198 | 0.730505 | 1.056198 | 1.027715 | 0.730505 | 0.514570 | 2.507813 | 0.031417 | 0 |
7 | XGBoost Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [5.195564, 5.492118, 2.679538, 1.8706595, 3.85... | 0.541646 | 1.065771 | 0.734338 | 1.065771 | 1.032362 | 0.734338 | 0.520626 | 40.880635 | 0.001308 | 0 |
If we only look at the metrics, especially R2, this looks like a clear failure. However, a visual assessment shows that it is not too bad, considering that this is a science dataset and nothing related to e.g. marketing.
TPOT
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error
import sklearn.metrics
from tpot import TPOTRegressor
# Same dataset as above: six descriptors plus the LC50 target, no header row.
columns = ["CIC0","SM1_Dz", "GATS1i", "NdsCH", "NdssC", "MLOGP", "LC50"]
input_data = pd.read_csv("./data/qsar_fish_toxicity.csv",delimiter=";",names=columns)
# split data into X and y
y = input_data["LC50"].copy(deep=True)
X = input_data.drop(["LC50"], axis=1)
# Same split parameters/seed as the brute-force section for comparability.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    shuffle=True,
                                                    random_state=42)
# Let TPOT search pipelines for up to one hour on all cores.
tpot = TPOTRegressor(max_time_mins=60,
                     verbosity=2,
                     n_jobs=-1)
tpot.fit(X_train, y_train)
tpot.export('QSAR_fish_toxicity.py')
y_predictions = tpot.predict(X_test)
r2 = sklearn.metrics.r2_score(y_test, y_predictions)
mae = sklearn.metrics.mean_absolute_error(y_test, y_predictions)
mse = sklearn.metrics.mean_squared_error(y_test, y_predictions)
rmse = np.sqrt(mse)
print("R2 score:", r2)
print("MAE:", mae)
print("MSE:", mse)
# BUG FIX: rmse was computed but never printed, although the captured
# output below includes an RMSE line.
print("RMSE:", rmse)
R2 score: 0.5855640659217719
MAE: 0.6920005770925111
MSE: 0.9636518895874657
RMSE: 0.981657725272646
That is even worse.
This is the pipeline TPOT came up with:
# Pipeline exported verbatim by tpot.export(); shown here as TPOT's artifact,
# so the code itself is left untouched. The placeholders ('PATH/TO/DATA/FILE',
# 'COLUMN_SEPARATOR') must be filled in before this script can run.
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
# NOTE(review): random_state=None makes this split non-reproducible — TPOT
# exports it this way; set a seed before reusing.
training_features, testing_features, training_target, testing_target = \
train_test_split(features, tpot_data['target'].values, random_state=None)
# Average CV score on the training set was: -0.6590324162633999
# Three stacked estimators feed their predictions as extra features into a
# feature selector and a final ExtraTreesRegressor.
exported_pipeline = make_pipeline(
StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.45, min_samples_leaf=1, min_samples_split=9, n_estimators=100)),
StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.1, min_samples_leaf=1, min_samples_split=6, n_estimators=100)),
StackingEstimator(estimator=AdaBoostRegressor(learning_rate=0.01, loss="exponential", n_estimators=100)),
SelectPercentile(score_func=f_regression, percentile=90),
ExtraTreesRegressor(bootstrap=False, max_features=0.15000000000000002, min_samples_leaf=1, min_samples_split=5, n_estimators=100)
)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)