import numpy as np

from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

import tests2 as t

# load the Boston housing data and split features from the target
boston = load_boston()
y = boston.target
X = boston.data
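
# load_boston was removed in scikit-learn 1.2; on newer releases, one
# possible substitute (an assumption, not part of the original exercise)
# is the OpenML copy of the same dataset:
#
#     from sklearn.datasets import fetch_openml
#     housing = fetch_openml(name='boston', version=1, as_frame=False)
#     X, y = housing.data, housing.target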

# hold out a third of the data for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# instantiate each model with default hyperparameters
dec_tree = DecisionTreeRegressor()
ran_for = RandomForestRegressor()
ada = AdaBoostRegressor()
lin_reg = LinearRegression()
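
# the tree-based models above are stochastic, so their scores vary from run
# to run; one way to make results reproducible (an assumption, not part of
# the original exercise) is to seed them, e.g.:
#
#     ran_for = RandomForestRegressor(random_state=42)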

# fit each model to the training data
dec_tree.fit(X_train, y_train)
ran_for.fit(X_train, y_train)
ada.fit(X_train, y_train)
lin_reg.fit(X_train, y_train)

# generate test-set predictions from each fitted model
dec_pred = dec_tree.predict(X_test)
ran_pred = ran_for.predict(X_test)
ada_pred = ada.predict(X_test)
lin_pred = lin_reg.predict(X_test)

# potential model options
a = 'regression'
b = 'classification'
c = 'both regression and classification'

# match each metric to the type of problem it measures
metrics_dict = {
    'precision': b,
    'recall': b,
    'accuracy': b,
    'r2_score': a,
    'mean_squared_error': a,
    'area_under_curve': b,
    'mean_absolute_area': a
}

# checks your answer, no need to change this code
t.q6_check(metrics_dict)
print()

# collect the predictions and metric functions used in the comparisons below
models = {'dec_pred': dec_pred, 'ran_pred': ran_pred, 'ada_pred': ada_pred,
          'lin_pred': lin_pred}
metrics = [r2_score, mean_squared_error, mean_absolute_error]


# Check r2
def r2(actual, preds):
    '''
    INPUT:
    actual - numpy array or pd series of actual y values
    preds - numpy array or pd series of predicted y values
    OUTPUT:
    returns the r-squared score as a float
    '''
    sse = np.sum((actual - preds)**2)
    sst = np.sum((actual - np.mean(actual))**2)
    return 1 - sse / sst
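
# quick sanity check on hand-computable values (illustrative numbers, not
# part of the original exercise): a perfect fit should score exactly 1.0
assert r2(np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 3.0])) == 1.0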

# Check solution matches sklearn
for name, preds in models.items():
    print(f'r2 manual for {name} is {r2(y_test, preds):.4f}')
    print(f'r2 sklearn for {name} is {r2_score(y_test, preds):.4f}')
    print()


def mse(actual, preds):
    '''
    INPUT:
    actual - numpy array or pd series of actual y values
    preds - numpy array or pd series of predicted y values
    OUTPUT:
    returns the mean squared error as a float
    '''
    return np.sum((actual - preds)**2) / len(actual)
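
# same style of sanity check (illustrative numbers): the squared errors are
# (1, 0, 1), so the mean squared error should be 2/3
assert np.isclose(mse(np.array([1.0, 2.0, 3.0]), np.array([2.0, 2.0, 2.0])), 2 / 3)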

# Check your solution matches sklearn
for name, preds in models.items():
    print(f'mse manual for {name} is {mse(y_test, preds):.4f}')
    print(f'mse sklearn for {name} is'
          f' {mean_squared_error(y_test, preds):.4f}')
    print()


def mae(actual, preds):
    '''
    INPUT:
    actual - numpy array or pd series of actual y values
    preds - numpy array or pd series of predicted y values
    OUTPUT:
    returns the mean absolute error as a float
    '''
    return np.sum(np.abs(actual - preds)) / len(actual)
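
# equivalently np.mean(np.abs(actual - preds)); the explicit sum / len form
# mirrors the formula. Sanity check (illustrative numbers): the absolute
# errors are (1, 0, 1), so the mean absolute error should be 2/3
assert np.isclose(mae(np.array([1.0, 2.0, 3.0]), np.array([2.0, 2.0, 2.0])), 2 / 3)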

# Check your solution matches sklearn
for name, preds in models.items():
    print(f'mae manual for {name} is {mae(y_test, preds):.4f}')
    print(f'mae sklearn for {name} is'
          f' {mean_absolute_error(y_test, preds):.4f}')
    print()


print('=================')
print('Comparison of all models:\n')
for name, preds in models.items():
    for metric in metrics:
        print(f'{metric.__name__} for '
              f'{name} is {metric(y_test, preds):.4f}')
    print()


# match each metric to the model that performed best on it
a = 'decision tree'
b = 'random forest'
c = 'adaptive boosting'
d = 'linear regression'

best_fit = {
    'mse': b,
    'r2': b,
    'mae': b
}

# Tests your answer - don't change this code
t.check_ten(best_fit)