It’s been a while since I had a look at the UCI Machine Learning Repository. So, let’s have a look at a recently uploaded dataset, the QSAR fish toxicity which originates from work by Cassotti et al. (2015).


Contents


Dataset exploration

Let’s dive into it:

import time
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_squared_log_error, median_absolute_error
import sklearn.metrics
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from skgarden.mondrian import MondrianForestRegressor
import xgboost

# Column names follow Cassotti et al. (2015); the raw CSV ships without a
# header row, so names= must be supplied explicitly.
columns = ["CIC0","SM1_Dz", "GATS1i", "NdsCH", "NdssC", "MLOGP", "LC50"]
inputData = pd.read_csv("./data/qsar_fish_toxicity.csv",delimiter=";",names=columns)
# display() is the IPython/Jupyter rich-output helper (notebook context).
display(inputData.sample(10))
display(inputData.describe())
CIC0 SM1_Dz GATS1i NdsCH NdssC MLOGP LC50
556 4.436 0.764 1.227 0 1 5.258 5.412
244 3.191 0.223 0.847 0 0 2.859 4.967
828 3.247 0.874 1.221 0 1 2.659 4.262
341 4.037 0.134 1.563 0 0 3.806 5.180
455 3.333 0.405 0.963 1 0 2.974 5.630
549 4.443 0.251 1.287 4 3 4.453 6.483
609 2.110 0.580 0.805 0 0 1.610 3.391
335 2.479 0.887 0.954 0 0 1.748 4.458
247 2.164 1.437 1.227 0 2 2.573 6.207
509 2.728 0.134 0.923 0 0 2.102 3.420
CIC0 SM1_Dz GATS1i NdsCH NdssC MLOGP LC50
count 908.000000 908.000000 908.000000 908.000000 908.000000 908.000000 908.000000
mean 2.898129 0.628468 1.293591 0.229075 0.485683 2.109285 4.064431
std 0.756088 0.428459 0.394303 0.605335 0.861279 1.433181 1.455698
min 0.667000 0.000000 0.396000 0.000000 0.000000 -2.884000 0.053000
25% 2.347000 0.223000 0.950750 0.000000 0.000000 1.209000 3.151750
50% 2.934000 0.570000 1.240500 0.000000 0.000000 2.127000 3.987500
75% 3.407000 0.892750 1.562250 0.000000 1.000000 3.105000 4.907500
max 5.926000 2.171000 2.920000 4.000000 6.000000 6.515000 9.612000

That doesn’t look too promising.

Brute force approach

Let’s try to brute force it.

# Separate the LC50 target from the molecular-descriptor features.
y = inputData["LC50"].copy(deep=True)
X = inputData.copy(deep=True)
X.drop(["LC50"], inplace=True, axis=1)
# Scale every feature into [-1, 1] by its maximum absolute value; rebuild a
# DataFrame so the column labels survive the transform.
scaler = MaxAbsScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
# BUG FIX: the original code fitted the scaler but then split the UNSCALED X,
# silently discarding X_scaled. Split the scaled features instead.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)
# Keyed container so additional (e.g. engineered) datasets can be added later.
datasets = {}
datasets[0] = {'X_train': X_train,
               'X_test' : X_test,
               'y_train': y_train,
               'y_test' : y_test}


def train_test_linear_regression(X_train,
                                 X_test,
                                 y_train,
                                 y_test,
                                 cv_count,
                                 scorer,
                                 dataset_id):
    """Grid-search a LinearRegression model and evaluate it on the test set.

    X_train/X_test, y_train/y_test: train/test split of features and LC50
    target; cv_count: number of CV folds; scorer: GridSearchCV scoring
    string; dataset_id: tag stored in the returned record.

    Returns a dict with the fitted GridSearchCV object, test predictions,
    error metrics and wall-clock training/prediction times.
    """
    estimator = LinearRegression()
    param_grid = {'fit_intercept' : [False, True]}
    start_time = time.time()
    grid_obj = GridSearchCV(estimator,
                            param_grid=param_grid,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_estimator = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_estimator.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # The target y is never rescaled in this workflow, so the "*_true_scale"
    # metrics equal the plain ones; the former `if 1==1:` block of no-op
    # reassignments has been removed.
    return {'Regression type' : 'Linear Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

def train_test_bRidge_regression(X_train,
                                 X_test,
                                 y_train,
                                 y_test,
                                 cv_count,
                                 scorer,
                                 dataset_id):
    """Grid-search a BayesianRidge model and evaluate it on the test set.

    X_train/X_test, y_train/y_test: train/test split of features and LC50
    target; cv_count: number of CV folds; scorer: GridSearchCV scoring
    string; dataset_id: tag stored in the returned record.

    Returns a dict with the fitted GridSearchCV object, test predictions,
    error metrics and wall-clock training/prediction times.
    """
    estimator = BayesianRidge()
    # NOTE(review): `n_iter` was renamed to `max_iter` in newer scikit-learn
    # releases; kept as-is to match the version this file was written for.
    param_grid = {'fit_intercept' : [False, True],
                  'n_iter':[300,1000,5000]}
    start_time = time.time()
    grid_obj = GridSearchCV(estimator,
                            param_grid=param_grid,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_estimator = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_estimator.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # The target y is never rescaled, so the "*_true_scale" metrics equal the
    # plain ones; the former `if 1==1:` block of no-ops has been removed.
    return {'Regression type' : 'Bayesian Ridge Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}


def train_test_decision_tree_regression(X_train,
                                        X_test,
                                        y_train,
                                        y_test,
                                        cv_count,
                                        scorer,
                                        dataset_id):
    """Grid-search a DecisionTreeRegressor and evaluate it on the test set.

    X_train/X_test, y_train/y_test: train/test split of features and LC50
    target; cv_count: number of CV folds; scorer: GridSearchCV scoring
    string; dataset_id: tag stored in the returned record.

    Returns a dict with the fitted GridSearchCV object, test predictions,
    error metrics and wall-clock training/prediction times.
    """
    estimator = DecisionTreeRegressor(random_state=42)
    param_grid = {'max_depth' : [None, 3,5,7,9,10,11]}
    start_time = time.time()
    grid_obj = GridSearchCV(estimator,
                            param_grid=param_grid,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_estimator = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_estimator.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # The target y is never rescaled, so the "*_true_scale" metrics equal the
    # plain ones; the former `if 1==1:` block of no-ops has been removed.
    return {'Regression type' : 'Decision Tree Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

def train_test_knn_regression(X_train,
                              X_test,
                              y_train,
                              y_test,
                              cv_count,
                              scorer,
                              dataset_id):
    """Grid-search a KNeighborsRegressor and evaluate it on the test set.

    X_train/X_test, y_train/y_test: train/test split of features and LC50
    target; cv_count: number of CV folds; scorer: GridSearchCV scoring
    string; dataset_id: tag stored in the returned record.

    Returns a dict with the fitted GridSearchCV object, test predictions,
    error metrics and wall-clock training/prediction times.
    """
    estimator = KNeighborsRegressor()
    # p=1 -> Manhattan distance, p=2 -> Euclidean.
    param_grid = {'n_neighbors' : [1,2,3],
                  'weights': ['uniform', 'distance'],
                  'algorithm': ['ball_tree', 'kd_tree'],
                  'leaf_size': [30,90,100,110],
                  'p': [1,2]}
    start_time = time.time()
    grid_obj = GridSearchCV(estimator,
                            param_grid=param_grid,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_estimator = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_estimator.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # The target y is never rescaled, so the "*_true_scale" metrics equal the
    # plain ones; the former `if 1==1:` block of no-ops has been removed.
    return {'Regression type' : 'KNN Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

def train_test_SVR_regression(X_train,
                              X_test,
                              y_train,
                              y_test,
                              cv_count,
                              scorer,
                              dataset_id):
    """Grid-search a LinearSVR and evaluate it on the test set.

    X_train/X_test, y_train/y_test: train/test split of features and LC50
    target; cv_count: number of CV folds; scorer: GridSearchCV scoring
    string; dataset_id: tag stored in the returned record.

    Returns a dict with the fitted GridSearchCV object, test predictions,
    error metrics and wall-clock training/prediction times.
    """
    # FIX: pin random_state for reproducibility — LinearSVR's solver is
    # stochastic, and every other stochastic estimator in this file seeds 42.
    estimator = LinearSVR(random_state=42)
    param_grid = {'C' : [1, 10, 50],
                  'epsilon' : [0.01, 0.1],
                  'fit_intercept' : [False, True]}
    start_time = time.time()
    grid_obj = GridSearchCV(estimator,
                            param_grid=param_grid,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_estimator = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_estimator.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # The target y is never rescaled, so the "*_true_scale" metrics equal the
    # plain ones; the former `if 1==1:` block of no-ops has been removed.
    return {'Regression type' : 'Linear SVM Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}


def train_test_random_forest_regression(X_train,
                                        X_test,
                                        y_train,
                                        y_test,
                                        cv_count,
                                        scorer,
                                        dataset_id):
    """Grid-search a RandomForestRegressor and evaluate it on the test set.

    X_train/X_test, y_train/y_test: train/test split of features and LC50
    target; cv_count: number of CV folds; scorer: GridSearchCV scoring
    string; dataset_id: tag stored in the returned record.

    Returns a dict with the fitted GridSearchCV object, test predictions,
    error metrics and wall-clock training/prediction times.
    """
    estimator = RandomForestRegressor(random_state=42)
    param_grid = {'n_estimators' : [3,5,10,15,18],
                  'max_depth' : [None, 2,3,5,7,9]}
    start_time = time.time()
    grid_obj = GridSearchCV(estimator,
                            param_grid=param_grid,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_estimator = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_estimator.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # The target y is never rescaled, so the "*_true_scale" metrics equal the
    # plain ones; the former `if 1==1:` block of no-ops has been removed.
    return {'Regression type' : 'Random Forest Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}


def train_test_mondrian_forest_regression(X_train,
                                          X_test,
                                          y_train,
                                          y_test,
                                          cv_count,
                                          scorer,
                                          dataset_id):
    """Grid-search a MondrianForestRegressor and evaluate it on the test set.

    X_train/X_test, y_train/y_test: train/test split of features and LC50
    target; cv_count: number of CV folds; scorer: GridSearchCV scoring
    string; dataset_id: tag stored in the returned record.

    Returns a dict with the fitted GridSearchCV object, test predictions,
    error metrics and wall-clock training/prediction times.
    """
    estimator = MondrianForestRegressor(random_state=42)
    param_grid = {'n_estimators' : [3,5,10,15,18,100],
                  'max_depth' : [None, 2,3,5,7,9,25,50]}
    start_time = time.time()
    grid_obj = GridSearchCV(estimator,
                            param_grid=param_grid,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_estimator = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_estimator.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # The target y is never rescaled, so the "*_true_scale" metrics equal the
    # plain ones; the former `if 1==1:` block of no-ops has been removed.
    return {'Regression type' : 'Mondrian Forest Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

def xgboost_regression(X_train,
                       X_test,
                       y_train,
                       y_test,
                       cv_count,
                       scorer,
                       dataset_id):
    """Grid-search an xgboost.XGBRegressor and evaluate it on the test set.

    X_train/X_test, y_train/y_test: train/test split of features and LC50
    target; cv_count: number of CV folds; scorer: GridSearchCV scoring
    string; dataset_id: tag stored in the returned record.

    Returns a dict with the fitted GridSearchCV object, test predictions,
    error metrics and wall-clock training/prediction times.
    """
    estimator = xgboost.XGBRegressor(random_state=42)
    # 10 x 9 x 8 = 720 candidates x cv folds — by far the slowest search here.
    param_grid = {'n_estimators' : [3,5,10,15,50,60,80,100,200,300],
                  'max_depth' : [1,2, 3,5,7,9,10,11,15],
                  'learning_rate' :[ 0.0001, 0.001, 0.01, 0.1, 0.15, 0.2, 0.8, 1.0],
                  }
    start_time = time.time()
    grid_obj = GridSearchCV(estimator,
                            param_grid=param_grid,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_estimator = grid_fit.best_estimator_
    inference_start = time.time()
    prediction = best_estimator.predict(X_test)
    prediction_time = time.time() - inference_start
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    # The target y is never rescaled, so the "*_true_scale" metrics equal the
    # plain ones; the former `if 1==1:` block of no-ops has been removed.
    return {'Regression type' : 'XGBoost Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse,
            'RMSE_true_scale' : np.sqrt(mse),
            'MAE_true_scale' : mae,
            'MedAE_true_scale' : medae,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

# GridSearchCV scoring metric: optimise for (negated) mean squared error.
scorer = 'neg_mean_squared_error'
results = {}
counter = 0
cv_count = 5

# (progress-log label, trainer function) pairs — replaces eight copy-pasted
# call sites with one data-driven loop. Labels match the original output.
trainers = [("Linear Regression", train_test_linear_regression),
            ("Bayesian Ridge Regression", train_test_bRidge_regression),
            ("Decision Trees", train_test_decision_tree_regression),
            ("KNN", train_test_knn_regression),
            ("SVR", train_test_SVR_regression),
            ("Random Forest", train_test_random_forest_regression),
            ("Mondrian Forest", train_test_mondrian_forest_regression),
            ("XGBoost", xgboost_regression)]

for dataset in [0]:
    split = datasets[dataset]
    X_train = split['X_train']
    X_test = split['X_test']
    y_train = split['y_train']
    y_test = split['y_test']
    for label, trainer in trainers:
        results[counter] = trainer(X_train,
                                   X_test,
                                   y_train,
                                   y_test,
                                   cv_count,
                                   scorer,
                                   dataset)
        print(label + " completed")
        counter += 1

# Persist the results both as CSV (human-readable) and as a pickle
# (keeps the fitted model objects).
results_df = pd.DataFrame.from_dict(results, orient='index')
display(results_df)
results_df.to_csv('results_df_manual.csv')
# FIX: the original `pickle.dump(results, open(...))` never closed the file
# handle; use a context manager instead.
with open('results_manual.p', 'wb') as results_file:
    pickle.dump(results, results_file)
Regression type model Predictions R2 MSE MAE MSE_true_scale RMSE_true_scale MAE_true_scale MedAE_true_scale Training time Prediction time dataset
0 Linear Regression GridSearchCV(cv=5, error_score='raise-deprecat... [3.6962392905063024, 5.240629218487905, 2.6232... 0.568906 1.002387 0.710787 1.002387 1.001193 0.710787 0.481504 1.525417 0.001327 0
1 Bayesian Ridge Regression GridSearchCV(cv=5, error_score='raise-deprecat... [3.6822112893645746, 5.241252665899271, 2.6327... 0.569596 1.000782 0.710268 1.000782 1.000391 0.710268 0.488724 0.116765 0.001265 0
2 Decision Tree Regression GridSearchCV(cv=5, error_score='raise-deprecat... [3.8589999999999995, 5.3370588235294125, 2.658... 0.435099 1.313516 0.821030 1.313516 1.146087 0.821030 0.594059 0.141535 0.001082 0
3 KNN Regression GridSearchCV(cv=5, error_score='raise-deprecat... [3.9755878644528235, 5.681003065265153, 3.5645... 0.522664 1.109909 0.771125 1.109909 1.053522 0.771125 0.562378 0.662403 0.003900 0
4 Linear SVM Regression GridSearchCV(cv=5, error_score='raise-deprecat... [3.4797490882683526, 5.379094823589719, 2.4501... 0.548842 1.049039 0.714117 1.049039 1.024226 0.714117 0.444056 0.574919 0.001178 0
5 Random Forest Regression GridSearchCV(cv=5, error_score='raise-deprecat... [4.322425925925926, 5.6754999999999995, 3.4766... 0.565839 1.009516 0.722748 1.009516 1.004747 0.722748 0.533503 0.924254 0.002564 0
6 Mondrian Forest Regression GridSearchCV(cv=5, error_score='raise-deprecat... [4.032608087062836, 5.612142534255981, 3.55935... 0.545763 1.056198 0.730505 1.056198 1.027715 0.730505 0.514570 2.507813 0.031417 0
7 XGBoost Regression GridSearchCV(cv=5, error_score='raise-deprecat... [5.195564, 5.492118, 2.679538, 1.8706595, 3.85... 0.541646 1.065771 0.734338 1.065771 1.032362 0.734338 0.520626 40.880635 0.001308 0

If we only look at the metrics, especially R2, this looks like a clear failure. However, a visual assessment shows that it is not too bad considering that this is a science dataset and nothing related to e.g. marketing.

TPOT

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error
import sklearn.metrics
from tpot import TPOTRegressor

# Same dataset and split as the brute-force section above (random_state=42),
# so TPOT's numbers are directly comparable.
columns = ["CIC0","SM1_Dz", "GATS1i", "NdsCH", "NdssC", "MLOGP", "LC50"]
input_data = pd.read_csv("./data/qsar_fish_toxicity.csv",delimiter=";",names=columns)

# split data into X and y
y = input_data["LC50"].copy(deep=True)
X = input_data.copy(deep=True)
X.drop(["LC50"], inplace=True, axis=1)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                     y,
                                                     test_size=0.25,
                                                     shuffle=True,
                                                     random_state=42)

# Let TPOT search for a pipeline for up to an hour, then export the winner.
tpot = TPOTRegressor(max_time_mins=60,
                     verbosity=2,
                     n_jobs=-1)
tpot.fit(X_train,y_train)
tpot.export('QSAR_fish_toxicity.py')
y_predictions = tpot.predict(X_test)

r2 = sklearn.metrics.r2_score(y_test, y_predictions)
mae = sklearn.metrics.mean_absolute_error(y_test, y_predictions)
mse = sklearn.metrics.mean_squared_error(y_test, y_predictions)
rmse = np.sqrt(mse)
print("R2 score:", r2)
print("MAE:", mae)
print("MSE:", mse)
# BUG FIX: rmse was computed but never printed, even though the recorded
# output below contains an "RMSE:" line.
print("RMSE:", rmse)
R2 score: 0.5855640659217719
MAE: 0.6920005770925111
MSE: 0.9636518895874657
RMSE: 0.981657725272646

That is actually slightly better than the brute-force results (R² 0.586 vs. 0.570, MSE 0.964 vs. 1.001), though still nothing to write home about.

This is the pipeline TPOT came up with:

# Pipeline auto-generated by tpot.export(); kept verbatim apart from comments.
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
# The placeholders below ('PATH/TO/DATA/FILE', 'COLUMN_SEPARATOR') must be
# filled in before this exported script can run.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
# NOTE(review): random_state=None means this split is NOT reproducible and
# differs from the seeded split used for the scores above — confirm intent.
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:-0.6590324162633999
# Three stacked estimators append their predictions as extra features,
# then a percentile filter and a final ExtraTreesRegressor fit on top.
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.45, min_samples_leaf=1, min_samples_split=9, n_estimators=100)),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.1, min_samples_leaf=1, min_samples_split=6, n_estimators=100)),
    StackingEstimator(estimator=AdaBoostRegressor(learning_rate=0.01, loss="exponential", n_estimators=100)),
    SelectPercentile(score_func=f_regression, percentile=90),
    ExtraTreesRegressor(bootstrap=False, max_features=0.15000000000000002, min_samples_leaf=1, min_samples_split=5, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)