# Regression metrics exercise (Boston housing): fit four regressors and
# compare hand-written r2 / mse / mae implementations against sklearn's.
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np
import tests2 as t
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# Load the Boston housing data and make a train/test split.
# NOTE(review): load_boston was removed from scikit-learn 1.2+ — this
# script requires an older sklearn version to run as-is.
boston = load_boston()
X = boston.data
y = boston.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Instantiate the four regressors, fit each on the training split, then
# predict on the held-out test split. Fitting order matches the original
# run so any global-RNG draws are identical.
dec_tree = DecisionTreeRegressor()
ran_for = RandomForestRegressor()
ada = AdaBoostRegressor()
lin_reg = LinearRegression()

for _model in (dec_tree, ran_for, ada, lin_reg):
    _model.fit(X_train, y_train)

dec_pred = dec_tree.predict(X_test)
ran_pred = ran_for.predict(X_test)
ada_pred = ada.predict(X_test)
lin_pred = lin_reg.predict(X_test)
# potential model options
a = 'regression'
b = 'classification'
c = 'both regression and classification'

# Quiz answer: map each metric to the kind of model it evaluates.
metrics_dict = dict(
    precision=b,
    recall=b,
    accuracy=b,
    r2_score=a,
    mean_squared_error=a,
    area_under_curve=b,
    mean_absolute_area=a,
)
# checks your answer, no need to change this code
t.q6_check(metrics_dict)
print()

# Collect each model's test-set predictions under a short label, plus the
# sklearn metric functions they will be scored with.
models = dict(
    dec_pred=dec_pred,
    ran_pred=ran_pred,
    ada_pred=ada_pred,
    lin_pred=lin_pred,
)
metrics = [r2_score, mean_squared_error, mean_absolute_error]
# Check r2
def r2(actual, preds):
    '''
    Manually compute the coefficient of determination (R^2).

    INPUT:
    actual - numpy array or pd series of actual y values
    preds - numpy array or pd series of predicted y values
    OUTPUT:
    the r-squared score as a float
    '''
    # R^2 = 1 - SS_res / SS_tot
    ss_res = ((actual - preds) ** 2).sum()
    ss_tot = ((actual - actual.mean()) ** 2).sum()
    return 1 - ss_res / ss_tot
# Print the hand-rolled r2 next to sklearn's r2_score for every model's
# predictions — the two numbers should agree to 4 decimal places.
for i in models:
    print(f'r2 manual for {i} is {r2(y_test, models[i]):.4f}')
    print(f'r2 sklearn for {i} is {r2_score(y_test, models[i]):.4f}')
    print()
# Check solution matches sklearn
def mse(actual, preds):
    '''
    Manually compute the mean squared error.

    INPUT:
    actual - numpy array or pd series of actual y values
    preds - numpy array or pd series of predicted y values
    OUTPUT:
    the mean squared error as a float
    '''
    # Average of squared residuals; divide by len() like the original so
    # pd-series NaN handling stays identical to sum/len.
    squared_errors = (actual - preds) ** 2
    return squared_errors.sum() / len(actual)
# Check your solution matches sklearn
# Print the hand-rolled mse next to sklearn's mean_squared_error for
# every model's predictions — both values should match.
for i in models:
    print(f'mse manual for {i} is {mse(y_test, models[i]):.4f}')
    print(f'mse sklearn for {i} is'
          f' {mean_squared_error(y_test, models[i]):.4f}')
    print()
def mae(actual, preds):
    '''
    Manually compute the mean absolute error.

    INPUT:
    actual - numpy array or pd series of actual y values
    preds - numpy array or pd series of predicted y values
    OUTPUT:
    the mean absolute error as a float
    '''
    # abs() on an ndarray/series dispatches to elementwise __abs__,
    # equivalent to np.abs.
    return abs(actual - preds).sum() / len(actual)
# Check your solution matches sklearn
# Print the hand-rolled mae next to sklearn's mean_absolute_error for
# every model's predictions — both values should match.
for i in models:
    print(f'mae manual for {i} is {mae(y_test, models[i]):.4f}')
    print(f'mae sklearn for {i} is'
          f' {mean_absolute_error(y_test, models[i]):.4f}')
    print()
print('=================')
print('Comparison of all models:\n')

# Print every sklearn metric for every model's test-set predictions.
# Idiom fix: iterate the metric callables directly instead of indexing
# with range(len(metrics)); printed output is unchanged.
for i in models:
    for metric in metrics:
        print(f'{metric.__name__} for '
              f'{i} {metric(y_test, models[i]):.4f}')
    print()
# match each metric to the model that performed best on it
a = 'decision tree'
b = 'random forest'
c = 'adaptive boosting'
d = 'linear regression'

# Quiz answer: the random forest is picked as best on all three metrics.
best_fit = dict(
    mse=b,
    r2=b,
    mae=b,
)
# Tests your answer - don't change this code
t.check_ten(best_fit)