completed all sections of 1
This commit is contained in:
167
python/Supervised Learning/Training and Tuning/diabetes.py
Normal file
167
python/Supervised Learning/Training and Tuning/diabetes.py
Normal file
@@ -0,0 +1,167 @@
|
||||
# Import our libraries
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.datasets import load_diabetes
|
||||
from sklearn.model_selection import train_test_split, RandomizedSearchCV
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.metrics import accuracy_score, precision_score, recall_score
|
||||
from sklearn.metrics import f1_score
|
||||
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.svm import SVC
|
||||
import seaborn as sns
|
||||
import sys
|
||||
import os
|
||||
sys.path.append(os.getcwd())
|
||||
import check_file as ch
|
||||
import warnings
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
|
||||
sns.set(style="ticks")
|
||||
pd.set_option('display.max_columns', None)
|
||||
pd.set_option('display.max_colwidth', -1)
|
||||
|
||||
# Read in our dataset
|
||||
diabetes = pd.read_csv('diabetes.csv')
|
||||
|
||||
# Take a look at the first few rows of the dataset
|
||||
print(diabetes.head())
|
||||
|
||||
# Summary on diabetes df
|
||||
print(diabetes.describe())
|
||||
# sns.pairplot(diabetes, hue='Outcome', diag_kind='hist')
|
||||
# plt.show()
|
||||
sns.heatmap(diabetes.corr(), annot=True, square=True,
|
||||
annot_kws={"size": 12})
|
||||
# plt.tight_layout()
|
||||
plt.show()
|
||||
|
||||
diabetes.hist()
|
||||
plt.show()
|
||||
|
||||
preg = diabetes.groupby(['Pregnancies', 'Outcome'])\
|
||||
.agg({'Pregnancies': 'count'})
|
||||
preg = preg.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))
|
||||
|
||||
|
||||
# Set our testing and training data
|
||||
y = diabetes['Outcome']
|
||||
X = diabetes.drop(['Outcome'], axis=1)
|
||||
|
||||
|
||||
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
|
||||
learning_rate=0.1, n_estimators=200, random_state=None)
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y,
|
||||
test_size=0.2,
|
||||
random_state=42)
|
||||
# build a classifier
|
||||
clf_rf = RandomForestClassifier()
|
||||
|
||||
# Set up the hyperparameter search
|
||||
param_dist = {"max_depth": [3, None],
|
||||
"n_estimators": list(range(10, 200)),
|
||||
"max_features": list(range(1, X_test.shape[1] + 1)),
|
||||
"min_samples_split": list(range(2, 11)),
|
||||
"min_samples_leaf": list(range(1, 11)),
|
||||
"bootstrap": [True, False],
|
||||
"criterion": ["gini", "entropy"]}
|
||||
|
||||
# Run a randomized search over the hyperparameters
|
||||
random_search = RandomizedSearchCV(clf_rf, param_distributions=param_dist)
|
||||
|
||||
# Fit the model on the training data
|
||||
random_search.fit(X_train, y_train)
|
||||
|
||||
# Make predictions on the test data
|
||||
rf_preds = random_search.best_estimator_.predict(X_test)
|
||||
|
||||
ch.print_metrics(y_test, rf_preds, 'random forest')
|
||||
|
||||
# Print the parameters used in the model
|
||||
print(random_search.best_estimator_)
|
||||
|
||||
|
||||
# build a classifier for ada boost
|
||||
clf_ada = AdaBoostClassifier()
|
||||
|
||||
# Set up the hyperparameter search
|
||||
# look at setting up your search for n_estimators, learning_rate
|
||||
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
|
||||
param_dist = {"n_estimators": [10, 100, 200, 400],
|
||||
"learning_rate": [0.001, 0.005, .01, 0.05, 0.1, 0.2, 0.3, 0.4,
|
||||
0.5, 1, 2, 10, 20]}
|
||||
|
||||
# Run a randomized search over the hyperparameters
|
||||
ada_search = RandomizedSearchCV(clf_ada, param_distributions=param_dist)
|
||||
|
||||
# Fit the model on the training data
|
||||
ada_search.fit(X_train, y_train)
|
||||
|
||||
# Make predictions on the test data
|
||||
ada_preds = ada_search.best_estimator_.predict(X_test)
|
||||
|
||||
# Return your metrics on test data
|
||||
ch.print_metrics(y_test, ada_preds, 'adaboost')
|
||||
|
||||
# Print the hyperparams used
|
||||
print(ada_search.best_estimator_)
|
||||
|
||||
# Doing the same as above except using a full GridSearch
|
||||
|
||||
ada_grid_search = GridSearchCV(clf_ada, param_dist)
|
||||
|
||||
ada_grid_search.fit(X_train, y_train)
|
||||
|
||||
ada_grid_preds = ada_grid_search.best_estimator_.predict(X_test)
|
||||
|
||||
ch.print_metrics(y_test, ada_grid_preds, 'adaboost')
|
||||
|
||||
print(ada_grid_search.best_estimator_)
|
||||
|
||||
|
||||
# build a classifier for support vector machines
|
||||
clf_svc = SVC()
|
||||
|
||||
# Set up the hyperparameter search
|
||||
# look at setting up your search for C (recommend 0-10 range),
|
||||
# kernel, and degree
|
||||
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
|
||||
param_dist = {"C": [0.1, 0.5, 1, 3, 5],
|
||||
"kernel": ['linear', 'rbf']
|
||||
}
|
||||
|
||||
|
||||
# Run a randomized search over the hyperparameters
|
||||
svc_search = RandomizedSearchCV(clf_svc, param_distributions=param_dist)
|
||||
|
||||
# Fit the model on the training data
|
||||
svc_search.fit(X_train, y_train)
|
||||
|
||||
# Make predictions on the test data
|
||||
svc_preds = svc_search.best_estimator_.predict(X_test)
|
||||
|
||||
ch.print_metrics(y_test, svc_preds, 'svc')
|
||||
|
||||
print(svc_search.best_estimator_)
|
||||
|
||||
|
||||
# Get information about the best model
|
||||
# Get column names
|
||||
features = diabetes.columns[:diabetes.shape[1]]
|
||||
print(features)
|
||||
|
||||
# Get the best features of the best estimator (random forest)
|
||||
importances = ada_search.best_estimator_.feature_importances_
|
||||
print(importances)
|
||||
|
||||
# Get these in increasing number of importance
|
||||
indices = np.argsort(importances)
|
||||
print(indices)
|
||||
|
||||
# Plot them
|
||||
plt.title('Feature Importances')
|
||||
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
|
||||
plt.yticks(range(len(indices)), features[indices])
|
||||
plt.xlabel('Relative Importance')
|
||||
plt.show()
|
||||
Reference in New Issue
Block a user