import numpy as np

from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

import tests2 as t

# load the Boston housing data and split features from the target
boston = load_boston()
y = boston.target
X = boston.data
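
# load_boston was removed in scikit-learn 1.2; on newer releases, one
# possible substitute (an assumption, not part of the original exercise)
# is the OpenML copy of the same dataset:
#
#     from sklearn.datasets import fetch_openml
#     housing = fetch_openml(name='boston', version=1, as_frame=False)
#     X, y = housing.data, housing.target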

# hold out a third of the data for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# instantiate each model with default hyperparameters
dec_tree = DecisionTreeRegressor()
ran_for = RandomForestRegressor()
ada = AdaBoostRegressor()
lin_reg = LinearRegression()
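
# the tree-based models above are stochastic, so their scores vary from run
# to run; one way to make results reproducible (an assumption, not part of
# the original exercise) is to seed them, e.g.:
#
#     ran_for = RandomForestRegressor(random_state=42)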

# fit each model to the training data
dec_tree.fit(X_train, y_train)
ran_for.fit(X_train, y_train)
ada.fit(X_train, y_train)
lin_reg.fit(X_train, y_train)

# generate test-set predictions from each fitted model
dec_pred = dec_tree.predict(X_test)
ran_pred = ran_for.predict(X_test)
ada_pred = ada.predict(X_test)
lin_pred = lin_reg.predict(X_test)

# potential model options
a = 'regression'
b = 'classification'
c = 'both regression and classification'

# match each metric to the type of problem it measures
metrics_dict = {
    'precision': b,
    'recall': b,
    'accuracy': b,
    'r2_score': a,
    'mean_squared_error': a,
    'area_under_curve': b,
    'mean_absolute_area': a
}

# checks your answer, no need to change this code
t.q6_check(metrics_dict)
print()

# collect the predictions and metric functions used in the comparisons below
models = {'dec_pred': dec_pred, 'ran_pred': ran_pred, 'ada_pred': ada_pred,
          'lin_pred': lin_pred}
metrics = [r2_score, mean_squared_error, mean_absolute_error]


# Check r2
def r2(actual, preds):
    '''
    INPUT:
    actual - numpy array or pd series of actual y values
    preds - numpy array or pd series of predicted y values
    OUTPUT:
    returns the r-squared score as a float
    '''
    sse = np.sum((actual - preds)**2)
    sst = np.sum((actual - np.mean(actual))**2)
    return 1 - sse / sst
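
# quick sanity check on hand-computable values (illustrative numbers, not
# part of the original exercise): a perfect fit should score exactly 1.0
assert r2(np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 3.0])) == 1.0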

# Check solution matches sklearn
for name, preds in models.items():
    print(f'r2 manual for {name} is {r2(y_test, preds):.4f}')
    print(f'r2 sklearn for {name} is {r2_score(y_test, preds):.4f}')
    print()


def mse(actual, preds):
    '''
    INPUT:
    actual - numpy array or pd series of actual y values
    preds - numpy array or pd series of predicted y values
    OUTPUT:
    returns the mean squared error as a float
    '''
    return np.sum((actual - preds)**2) / len(actual)
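
# same style of sanity check (illustrative numbers): the squared errors are
# (1, 0, 1), so the mean squared error should be 2/3
assert np.isclose(mse(np.array([1.0, 2.0, 3.0]), np.array([2.0, 2.0, 2.0])), 2 / 3)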

# Check your solution matches sklearn
for name, preds in models.items():
    print(f'mse manual for {name} is {mse(y_test, preds):.4f}')
    print(f'mse sklearn for {name} is'
          f' {mean_squared_error(y_test, preds):.4f}')
    print()


def mae(actual, preds):
    '''
    INPUT:
    actual - numpy array or pd series of actual y values
    preds - numpy array or pd series of predicted y values
    OUTPUT:
    returns the mean absolute error as a float
    '''
    return np.sum(np.abs(actual - preds)) / len(actual)
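
# equivalently np.mean(np.abs(actual - preds)); the explicit sum / len form
# mirrors the formula. Sanity check (illustrative numbers): the absolute
# errors are (1, 0, 1), so the mean absolute error should be 2/3
assert np.isclose(mae(np.array([1.0, 2.0, 3.0]), np.array([2.0, 2.0, 2.0])), 2 / 3)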

# Check your solution matches sklearn
for name, preds in models.items():
    print(f'mae manual for {name} is {mae(y_test, preds):.4f}')
    print(f'mae sklearn for {name} is'
          f' {mean_absolute_error(y_test, preds):.4f}')
    print()


print('=================')
print('Comparison of all models:\n')
for name, preds in models.items():
    for metric in metrics:
        print(f'{metric.__name__} for '
              f'{name} is {metric(y_test, preds):.4f}')
    print()


# match each metric to the model that performed best on it
a = 'decision tree'
b = 'random forest'
c = 'adaptive boosting'
d = 'linear regression'

best_fit = {
    'mse': b,
    'r2': b,
    'mae': b
}

# Tests your answer - don't change this code
t.check_ten(best_fit)