# Import our libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, fbeta_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
# import tests as t

# Read in our dataset (the SMS Spam Collection file is tab-separated)
df = pd.read_csv('smsspamcollection_SMSSpamCollection',
                 sep='\t',
                 header=None,
                 names=['label', 'sms_message'])

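# Illustrative sanity check (not part of the original exercise): the file
# should parse into exactly two columns, 'label' and 'sms_message'.
print(df.shape)
print(df.head())
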
# Fix our response value
df['label'] = df.label.map({'ham': 0, 'spam': 1})

# Split our dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(df['sms_message'],
                                                    df['label'],
                                                    random_state=1)

# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)

# Transform testing data and return the matrix. Note we do not fit the
# CountVectorizer() on the testing data
testing_data = count_vector.transform(X_test)

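# Quick illustrative check (not part of the original exercise): both splits
# become sparse document-term matrices with one column per token learned
# from the training data only.
print(training_data.shape, testing_data.shape)
print(len(count_vector.vocabulary_), 'tokens in the vocabulary')
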
# Instantiate a number of our models
naive_bayes = MultinomialNB()
bag_mod = BaggingClassifier(n_estimators=200)
rf_mod = RandomForestClassifier(n_estimators=200)
ada_mod = AdaBoostClassifier(n_estimators=300, learning_rate=0.2)
svm_mod = SVC()

# Fit each of the five models
# This might take some time to run
naive_bayes.fit(training_data, y_train)
bag_mod.fit(training_data, y_train)
rf_mod.fit(training_data, y_train)
ada_mod.fit(training_data, y_train)
svm_mod.fit(training_data, y_train)

# Make predictions using each of your models
nb = naive_bayes.predict(testing_data)
bag_pred = bag_mod.predict(testing_data)
rf_pred = rf_mod.predict(testing_data)
ada_pred = ada_mod.predict(testing_data)
svm_pred = svm_mod.predict(testing_data)

# accuracy is the total correct divided by the total to predict
def accuracy(actual, preds):
    '''
    INPUT
    preds - predictions as a numpy array or pandas series
    actual - actual values as a numpy array or pandas series

    OUTPUT:
    returns the accuracy as a float
    '''
    return np.sum(preds == actual) / len(actual)


print(accuracy(y_test, nb))
print(accuracy_score(y_test, nb))
print("Since these match, we correctly calculated our metric!")

# precision is the true positives over the predicted positive values
def precision(actual, preds):
    '''
    INPUT
    (assumes positive = 1 and negative = 0)
    preds - predictions as a numpy array or pandas series
    actual - actual values as a numpy array or pandas series

    OUTPUT:
    returns the precision as a float
    '''
    TP = np.sum((preds == actual) & (preds > 0))
    FP = np.sum((preds == 1) & (actual == 0))
    return TP / (TP + FP)


print(precision(y_test, nb))
print(precision_score(y_test, nb))
print("If the above match, you got it!")

# recall is true positives over all actual positive values
def recall(actual, preds):
    '''
    INPUT
    preds - predictions as a numpy array or pandas series
    actual - actual values as a numpy array or pandas series

    OUTPUT:
    returns the recall as a float
    '''
    TP = np.sum((preds == actual) & (preds > 0))
    FN = np.sum((preds == 0) & (actual == 1))
    return TP / (TP + FN)


print(recall(y_test, nb))
print(recall_score(y_test, nb))
print("If the above match, you got it!")

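# Illustrative aside (assumes the encoding above: ham = 0, spam = 1): the
# TP/FP/FN counts used in precision() and recall() can also be read straight
# off sklearn's confusion matrix.
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, nb).ravel()
print('TN:', tn, 'FP:', fp, 'FN:', fn, 'TP:', tp)
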
# f1_score is 2*(precision*recall)/(precision+recall)
def f1(actual, preds):
    '''
    INPUT
    preds - predictions as a numpy array or pandas series
    actual - actual values as a numpy array or pandas series

    OUTPUT:
    returns the f1 score as a float
    '''
    prec = precision(actual, preds)
    rec = recall(actual, preds)
    return 2 * ((prec * rec) / (prec + rec))


print(f1(y_test, nb))
print(f1_score(y_test, nb))
print("If the above match, you got it!")

# add the letter of the most appropriate metric to each statement
# in the dictionary
a = "recall"
b = "precision"
c = "accuracy"
d = 'f1-score'


seven_sol = {
    'We have imbalanced classes, which metric do we definitely not want to'
    ' use?': c,
    'We really want to make sure the positive cases are all caught even if'
    ' that means we identify some negatives as positives': a,
    'When we identify something as positive, we want to be sure it is truly'
    ' positive': b,
    'We care equally about identifying positive and negative cases': d
}

# This gives: That's right! It isn't really necessary to memorize these in
# practice, but it is important to know they exist and to know why you might
# use one metric over another for a particular situation.

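# Illustrative check (not part of the original exercise): the class imbalance
# the first question refers to is visible directly in the label distribution,
# where spam is a small minority of the messages.
print(df['label'].value_counts(normalize=True))
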
models = {'nb': nb,
          'bag_pred': bag_pred,
          'rf_pred': rf_pred,
          'ada_pred': ada_pred,
          'svm_pred': svm_pred}

metrics = [accuracy_score, precision_score, recall_score, f1_score]

# Print each metric for each model's predictions
for model_name, preds in models.items():
    for metric in metrics:
        print(f'{metric.__name__} for {model_name}: '
              f'{metric(y_test, preds):.4f}')
    print()

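# Optional aside (not part of the original exercise): sklearn can summarise
# precision, recall, and f1 per class in a single call, which is a handy
# cross-check for the loop above.
from sklearn.metrics import classification_report
print(classification_report(y_test, nb, target_names=['ham', 'spam']))
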
# With beta = 1, fbeta_score reduces to f1_score
beta = 1

print(f1_score(y_test, nb))
print(fbeta_score(y_test, nb, beta=beta))

for model_name, preds in models.items():
    print(f'fbeta_score for {model_name}: '
          f'{fbeta_score(y_test, preds, beta=beta)}')
    print(f'f1_score for {model_name}: {f1_score(y_test, preds)}')
    print()

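# Illustrative sketch (not part of the original exercise): beta controls the
# precision/recall trade-off in fbeta_score. Values below 1 weight precision
# more heavily, values above 1 weight recall more heavily.
for b in [0.5, 1, 2]:
    print(f'beta={b}: fbeta_score for nb = '
          f'{fbeta_score(y_test, nb, beta=b):.4f}')
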
# Function for calculating auc and roc

def build_roc_auc(model, X_train, X_test, y_train, y_test):
    '''
    INPUT:
    model - an instantiated sklearn model (must support predict_proba)
    X_train - the training data
    X_test - the test data
    y_train - the training response values (must be categorical)
    y_test - the test response values (must be categorical)

    OUTPUT:
    auc - returns the auc as a float
    prints the roc curve
    '''
    y_preds = model.fit(X_train, y_train).predict_proba(X_test)

    # Compute the ROC curve and area using the predicted probabilities
    # for the positive class
    fpr, tpr, _ = roc_curve(y_test, y_preds[:, 1])
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, color='darkorange',
             lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc='lower right')
    plt.show()

    return roc_auc_score(y_test, y_preds[:, 1])

# Note: SVC only provides predict_proba when created with probability=True,
# so svm_mod cannot be used with build_roc_auc as instantiated above.
instantiated_models = [naive_bayes, bag_mod, rf_mod]

for model in instantiated_models:
    print(build_roc_auc(model, training_data, testing_data, y_train, y_test))