# udacity/python/Supervised Learning/Training and Tuning/diabetes.py

# Import our libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import seaborn as sns
import sys
import os
sys.path.append(os.getcwd())
import check_file as ch
import warnings
warnings.filterwarnings("ignore")
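# check_file (ch) is a course-provided helper that isn't shown here. Assuming
# it prints the four standard classification metrics, a minimal stand-in with
# the same call shape would look like this (a sketch, not the course's code):
def print_metrics_fallback(y_true, preds, model_name=''):
    print('{} accuracy:  {:.4f}'.format(model_name, accuracy_score(y_true, preds)))
    print('{} precision: {:.4f}'.format(model_name, precision_score(y_true, preds)))
    print('{} recall:    {:.4f}'.format(model_name, recall_score(y_true, preds)))
    print('{} f1-score:  {:.4f}'.format(model_name, f1_score(y_true, preds)))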
sns.set(style="ticks")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Read in our dataset
diabetes = pd.read_csv('diabetes.csv')
# Take a look at the first few rows of the dataset
print(diabetes.head())
# Summary on diabetes df
print(diabetes.describe())
sns.pairplot(diabetes, hue='Outcome', diag_kind='hist')
plt.show()
sns.heatmap(diabetes.corr(), annot=True, square=True,
            annot_kws={"size": 12})
# plt.tight_layout()
plt.show()
diabetes.hist()
plt.show()
# Percentage split of outcomes within each pregnancy count
preg = diabetes.groupby(['Pregnancies', 'Outcome'])\
               .agg({'Pregnancies': 'count'})
preg = preg.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))
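# Print the table so the percentages are actually visible when the script runs
print(preg)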
# Set our testing and training data
y = diabetes['Outcome']
X = diabetes.drop(['Outcome'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)
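# The Pima data is imbalanced (roughly a third of rows are positive), so
# passing stratify=y would keep the class ratio identical across the splits.
# Illustrative alternative, not part of the original exercise:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y)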
# build a classifier
clf_rf = RandomForestClassifier()
# Set up the hyperparameter search
param_dist = {"max_depth": [3, None],
"n_estimators": list(range(10, 200)),
"max_features": list(range(1, X_test.shape[1] + 1)),
"min_samples_split": list(range(2, 11)),
"min_samples_leaf": list(range(1, 11)),
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]}
# Run a randomized search over the hyperparameters
random_search = RandomizedSearchCV(clf_rf, param_distributions=param_dist)
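# By default RandomizedSearchCV samples only n_iter=10 parameter settings;
# passing n_iter, cv, and random_state gives a broader, reproducible search:
# random_search = RandomizedSearchCV(clf_rf, param_distributions=param_dist,
#                                    n_iter=50, cv=5, random_state=42)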
# Fit the model on the training data
random_search.fit(X_train, y_train)
# Make predictions on the test data
rf_preds = random_search.best_estimator_.predict(X_test)
ch.print_metrics(y_test, rf_preds, 'random forest')
# Print the parameters used in the model
print(random_search.best_estimator_)
# build a classifier for ada boost
clf_ada = AdaBoostClassifier()
# Set up the hyperparameter search
# look at setting up your search for n_estimators, learning_rate
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
param_dist = {"n_estimators": [10, 100, 200, 400],
"learning_rate": [0.001, 0.005, .01, 0.05, 0.1, 0.2, 0.3, 0.4,
0.5, 1, 2, 10, 20]}
# Run a randomized search over the hyperparameters
ada_search = RandomizedSearchCV(clf_ada, param_distributions=param_dist)
# Fit the model on the training data
ada_search.fit(X_train, y_train)
# Make predictions on the test data
ada_preds = ada_search.best_estimator_.predict(X_test)
# Return your metrics on test data
ch.print_metrics(y_test, ada_preds, 'adaboost')
# Print the hyperparams used
print(ada_search.best_estimator_)
# Same search as above, but exhaustive: GridSearchCV evaluates every one of
# the 4 * 13 = 52 parameter combinations instead of a random sample of 10
ada_grid_search = GridSearchCV(clf_ada, param_dist)
ada_grid_search.fit(X_train, y_train)
ada_grid_preds = ada_grid_search.best_estimator_.predict(X_test)
ch.print_metrics(y_test, ada_grid_preds, 'adaboost')
print(ada_grid_search.best_estimator_)
# build a classifier for support vector machines
clf_svc = SVC()
# Set up the hyperparameter search
# look at setting up your search for C (recommend 0-10 range),
# kernel, and degree
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
param_dist = {"C": [0.1, 0.5, 1, 3, 5],
"kernel": ['linear', 'rbf']
}
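# The hint above also mentions degree, which only matters for the 'poly'
# kernel; an expanded (illustrative, unused here) search space would be:
# param_dist = {"C": [0.1, 0.5, 1, 3, 5, 10],
#               "kernel": ['linear', 'poly', 'rbf'],
#               "degree": [2, 3, 4]}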
# Run a randomized search over the hyperparameters
svc_search = RandomizedSearchCV(clf_svc, param_distributions=param_dist)
# Fit the model on the training data
svc_search.fit(X_train, y_train)
# Make predictions on the test data
svc_preds = svc_search.best_estimator_.predict(X_test)
ch.print_metrics(y_test, svc_preds, 'svc')
print(svc_search.best_estimator_)
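# Side-by-side of the searches: best_score_ is the mean cross-validated
# score of the best parameter setting each search found on the training data
for name, search in [('random forest', random_search),
                     ('adaboost (random)', ada_search),
                     ('adaboost (grid)', ada_grid_search),
                     ('svc', svc_search)]:
    print('{}: {:.4f}'.format(name, search.best_score_))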
# Get information about the best model
# Get the feature column names (importances align with the columns of X,
# which excludes the Outcome label)
features = X.columns
print(features)
# Get the feature importances of the best AdaBoost estimator found above
importances = ada_search.best_estimator_.feature_importances_
print(importances)
# Sort the indices in increasing order of importance
indices = np.argsort(importances)
print(indices)
# Plot them
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()