# Import our libraries
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.svm import SVC

# check_file lives alongside this script
sys.path.append(os.getcwd())
import check_file as ch

warnings.filterwarnings("ignore")

sns.set(style="ticks")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Read in our dataset
diabetes = pd.read_csv('diabetes.csv')

# Take a look at the first few rows of the dataset
print(diabetes.head())

# Summary statistics for the diabetes dataset
print(diabetes.describe())
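
# Quick sanity checks before modeling (illustrative additions): the class
# balance of the label and whether any column has missing values.
print(diabetes['Outcome'].value_counts())
print(diabetes.isnull().sum())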

sns.pairplot(diabetes, hue='Outcome', diag_kind='hist')
plt.show()

sns.heatmap(diabetes.corr(), annot=True, square=True, annot_kws={"size": 12})
# plt.tight_layout()
plt.show()

diabetes.hist()
plt.show()

# Percentage of each outcome within each pregnancy count
preg = diabetes.groupby(['Pregnancies', 'Outcome']).agg({'Pregnancies': 'count'})
preg = preg.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))
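
# The table above holds, for each pregnancy count, the percentage of rows
# with each outcome; printing a few rows makes that visible (illustrative):
print(preg.head(10))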


# Set our testing and training data
y = diabetes['Outcome']
X = diabetes.drop(['Outcome'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)
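
# Sanity-check the split sizes (illustrative): 80% train / 20% test.
print(X_train.shape, X_test.shape)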

# Build a classifier
clf_rf = RandomForestClassifier()

# Set up the hyperparameter search
param_dist = {"max_depth": [3, None],
              "n_estimators": list(range(10, 200)),
              "max_features": list(range(1, X_test.shape[1] + 1)),
              "min_samples_split": list(range(2, 11)),
              "min_samples_leaf": list(range(1, 11)),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Run a randomized search over the hyperparameters
random_search = RandomizedSearchCV(clf_rf, param_distributions=param_dist)

# Fit the model on the training data
random_search.fit(X_train, y_train)

# Make predictions on the test data
rf_preds = random_search.best_estimator_.predict(X_test)

# Return your metrics on test data
ch.print_metrics(y_test, rf_preds, 'random forest')
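
# In case check_file isn't available, a minimal equivalent using the sklearn
# metrics imported above (assumes ch.print_metrics reports these same scores):
print('Accuracy:', accuracy_score(y_test, rf_preds))
print('Precision:', precision_score(y_test, rf_preds))
print('Recall:', recall_score(y_test, rf_preds))
print('F1:', f1_score(y_test, rf_preds))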

# Print the parameters used in the model
print(random_search.best_estimator_)
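
# The winning hyperparameters and mean cross-validated score are also exposed
# directly on the fitted search object (standard RandomizedSearchCV attributes):
print(random_search.best_params_)
print(random_search.best_score_)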


# Build a classifier for AdaBoost
clf_ada = AdaBoostClassifier()

# Set up the hyperparameter search
# look at setting up your search for n_estimators, learning_rate
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
param_dist = {"n_estimators": [10, 100, 200, 400],
              "learning_rate": [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4,
                                0.5, 1, 2, 10, 20]}

# Run a randomized search over the hyperparameters
ada_search = RandomizedSearchCV(clf_ada, param_distributions=param_dist)

# Fit the model on the training data
ada_search.fit(X_train, y_train)

# Make predictions on the test data
ada_preds = ada_search.best_estimator_.predict(X_test)

# Return your metrics on test data
ch.print_metrics(y_test, ada_preds, 'adaboost')

# Print the hyperparams used
print(ada_search.best_estimator_)

# Doing the same as above, except with a full GridSearchCV over the same grid
ada_grid_search = GridSearchCV(clf_ada, param_dist)

ada_grid_search.fit(X_train, y_train)

ada_grid_preds = ada_grid_search.best_estimator_.predict(X_test)

ch.print_metrics(y_test, ada_grid_preds, 'adaboost grid')

print(ada_grid_search.best_estimator_)
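
# Unlike the randomized search above (10 sampled candidates by default),
# GridSearchCV fit every combination in the grid; cv_results_ records them all:
print(len(ada_grid_search.cv_results_['params']), 'candidates evaluated')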


# Build a classifier for support vector machines
clf_svc = SVC()

# Set up the hyperparameter search
# look at setting up your search for C (recommend 0-10 range),
# kernel, and degree
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
param_dist = {"C": [0.1, 0.5, 1, 3, 5],
              "kernel": ['linear', 'rbf']}
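
# The hint above also mentions 'degree'; it only applies to the polynomial
# kernel, so an extended (illustrative, unused) search space might look like:
# param_dist = {"C": [0.1, 0.5, 1, 3, 5],
#               "kernel": ['linear', 'poly', 'rbf'],
#               "degree": [2, 3, 4]}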

# Run a randomized search over the hyperparameters
svc_search = RandomizedSearchCV(clf_svc, param_distributions=param_dist)

# Fit the model on the training data
svc_search.fit(X_train, y_train)

# Make predictions on the test data
svc_preds = svc_search.best_estimator_.predict(X_test)

# Return your metrics on test data
ch.print_metrics(y_test, svc_preds, 'svc')

print(svc_search.best_estimator_)


# Get information about the best model
# Get the feature (column) names; use X so the Outcome label is excluded
features = X.columns
print(features)

# Get the feature importances of the best AdaBoost estimator
importances = ada_search.best_estimator_.feature_importances_
print(importances)

# Sort them in increasing order of importance
indices = np.argsort(importances)
print(indices)

# Plot them
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()