# Import our libraries
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

sys.path.append(os.getcwd())
import check_file as ch

warnings.filterwarnings("ignore")
sns.set(style="ticks")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Read in our dataset
diabetes = pd.read_csv('diabetes.csv')

# Take a look at the first few rows of the dataset
print(diabetes.head())

# Summary of the diabetes df
print(diabetes.describe())

# sns.pairplot(diabetes, hue='Outcome', diag_kind='hist')
# plt.show()

# Correlation heatmap of the features
sns.heatmap(diabetes.corr(), annot=True, square=True, annot_kws={"size": 12})
# plt.tight_layout()
plt.show()

# Histogram of each column
diabetes.hist()
plt.show()

# Percentage of each outcome within each pregnancy count
preg = diabetes.groupby(['Pregnancies', 'Outcome'])\
               .agg({'Pregnancies': 'count'})
preg = preg.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))
print(preg)

# Set up our features and labels, then split into training and testing data
y = diabetes['Outcome']
X = diabetes.drop(['Outcome'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Build a random forest classifier
clf_rf = RandomForestClassifier()

# Set up the hyperparameter search
param_dist = {"max_depth": [3, None],
              "n_estimators": list(range(10, 200)),
              "max_features": list(range(1, X_train.shape[1] + 1)),
              "min_samples_split": list(range(2, 11)),
              "min_samples_leaf": list(range(1, 11)),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Run a randomized search over the hyperparameters
random_search = RandomizedSearchCV(clf_rf, param_distributions=param_dist)

# Fit the model on the training data
random_search.fit(X_train, y_train)

# Make predictions on the test data
rf_preds = random_search.best_estimator_.predict(X_test)
ch.print_metrics(y_test, rf_preds, 'random forest')

# Print the parameters used in the model
print(random_search.best_estimator_)

# Build a classifier for AdaBoost
clf_ada = AdaBoostClassifier()

# Set up the hyperparameter search
# Look at setting up your search for n_estimators and learning_rate:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
param_dist = {"n_estimators": [10, 100, 200, 400],
              "learning_rate": [0.001, 0.005, 0.01, 0.05, 0.1, 0.2,
                                0.3, 0.4, 0.5, 1, 2, 10, 20]}

# Run a randomized search over the hyperparameters
ada_search = RandomizedSearchCV(clf_ada, param_distributions=param_dist)

# Fit the model on the training data
ada_search.fit(X_train, y_train)

# Make predictions on the test data
ada_preds = ada_search.best_estimator_.predict(X_test)

# Return your metrics on test data
ch.print_metrics(y_test, ada_preds, 'adaboost')

# Print the hyperparameters used
print(ada_search.best_estimator_)

# Do the same as above, except with an exhaustive grid search
ada_grid_search = GridSearchCV(clf_ada, param_dist)
ada_grid_search.fit(X_train, y_train)
ada_grid_preds = ada_grid_search.best_estimator_.predict(X_test)
ch.print_metrics(y_test, ada_grid_preds, 'adaboost')
print(ada_grid_search.best_estimator_)
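# Beyond best_estimator_, both search objects expose best_params_ and
# best_score_ (the mean cross-validated score of the best candidate),
# which gives a quick way to compare the randomized and exhaustive
# searches side by side. These extra prints are an addition to the
# original walkthrough, but the attributes are standard scikit-learn API:
print('Randomized search best params:', ada_search.best_params_)
print('Randomized search best CV score:', ada_search.best_score_)
print('Grid search best params:', ada_grid_search.best_params_)
print('Grid search best CV score:', ada_grid_search.best_score_)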
# Build a classifier for support vector machines
clf_svc = SVC()

# Set up the hyperparameter search
# Look at setting up your search for C (the 0-10 range is recommended),
# kernel, and degree:
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
param_dist = {"C": [0.1, 0.5, 1, 3, 5],
              "kernel": ['linear', 'rbf']}

# Run a randomized search over the hyperparameters
svc_search = RandomizedSearchCV(clf_svc, param_distributions=param_dist)

# Fit the model on the training data
svc_search.fit(X_train, y_train)

# Make predictions on the test data
svc_preds = svc_search.best_estimator_.predict(X_test)
ch.print_metrics(y_test, svc_preds, 'svc')
print(svc_search.best_estimator_)

# Get information about the best model

# Get the feature names (every column except the Outcome label)
features = X.columns
print(features)

# Get the feature importances from the best AdaBoost estimator
importances = ada_search.best_estimator_.feature_importances_
print(importances)

# Sort them in increasing order of importance
indices = np.argsort(importances)
print(indices)

# Plot them
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()
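# Note: ch.print_metrics above comes from the local check_file.py helper
# bundled with this exercise, so it isn't available on PyPI. If you don't
# have that file, a minimal stand-in is sketched below; this is an
# assumption about what the helper prints (the four metrics imported at
# the top of the script), not the original implementation.
def print_metrics(y_true, preds, model_name='model'):
    """Print accuracy, precision, recall, and F1 for one model's predictions."""
    print('Accuracy score for {}: {:.4f}'.format(
        model_name, accuracy_score(y_true, preds)))
    print('Precision score for {}: {:.4f}'.format(
        model_name, precision_score(y_true, preds)))
    print('Recall score for {}: {:.4f}'.format(
        model_name, recall_score(y_true, preds)))
    print('F1 score for {}: {:.4f}'.format(
        model_name, f1_score(y_true, preds)))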