"""Gaussian naive Bayes on the Social Network Ads data set.

A hand-written Gaussian naive Bayes classifier is fitted on two features
(columns 2 and 3 of the CSV) to predict the label in column 4, and its
confusion matrix is printed next to the one produced by scikit-learn's
``GaussianNB`` for comparison.
"""
import os

import numpy as np
import pandas as pd


def generate_data(class_data_dic, X_train, y_train):
    """Group the training samples by binary class label.

    Args:
        class_data_dic: dict to fill; it is updated in place and returned.
        X_train: array of shape (n_samples, n_features).
        y_train: array of shape (n_samples,) holding the labels 0 and 1.

    Returns:
        ``class_data_dic`` with key 0 and/or 1 mapped to an array of shape
        (n_features, n_class_samples), samples kept in their original order.
        A label without any sample gets no key; other labels are ignored.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    for label in (0, 1):
        mask = y_train == label
        if mask.any():
            # One boolean selection per class instead of growing the array
            # with np.append for every row.
            class_data_dic[label] = X_train[mask].T
    return class_data_dic


def likelyhood(x, mean, sigma):
    """Evaluate the normal density N(mean, sigma**2) at ``x``.

    Args:
        x: value(s) at which to evaluate the density; broadcasts against
            ``mean`` and ``sigma``.
        mean: mean of the distribution.
        sigma: standard deviation of the distribution.

    Returns:
        The probability density, with the broadcast shape of the inputs.
    """
    # The normalising constant is 1 / (sqrt(2*pi) * sigma), not sigma ** 2.
    return np.exp(-(x - mean) ** 2 / (2 * sigma ** 2)) / (
        np.sqrt(2 * np.pi) * sigma)


def posterior(X, X_train_class, mean_, std_, n_train=None):
    """Return the unnormalised posterior of one class for every row of ``X``.

    The per-feature likelihoods of each row are multiplied together (the
    naive independence assumption) and then scaled by the class prior.

    Args:
        X: array of shape (n_samples, n_features) to score.
        X_train_class: training samples of the class, laid out as
            (n_features, n_class_samples) as produced by ``generate_data``.
        mean_: per-feature mean of the class, shape (n_features,).
        std_: per-feature standard deviation of the class,
            shape (n_features,).
        n_train: total number of training samples over all classes. When
            given, the prior ``n_class_samples / n_train`` is applied; when
            omitted, only the likelihood product is returned, which amounts
            to assuming equal priors.

    Returns:
        Array of shape (n_samples,).
    """
    product = np.prod(likelyhood(X, mean_, std_), axis=1)
    if n_train is not None:
        # Samples lie along axis 1 of X_train_class, so shape[1] is the
        # class count.
        product = product * (X_train_class.shape[1] / n_train)
    return product


def main():
    """Load the data, fit both classifiers and print their results."""
    # Plotting and scikit-learn are imported here so that the numeric helpers
    # above can be imported without those packages being installed.
    import matplotlib
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.naive_bayes import GaussianNB
    from sklearn.preprocessing import StandardScaler

    matplotlib.rcParams['backend'] = 'TkAgg'
    # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
    # use whichever name this installation provides.
    for style in ('seaborn-v0_8-dark-palette', 'seaborn-dark-palette'):
        if style in plt.style.available:
            plt.style.use(style)
            break

    path = '/home/dtomlinson/projects/bayes-learning'
    df = pd.read_csv(os.path.join(path, 'data', 'Social_Network_Ads.csv'),
                     engine='python')
    print('{0} rows, {1} columns'.format(df.shape[0], df.shape[1]))

    train_size = int(0.75 * df.shape[0])
    # The test set is "everything after the training rows", so derive its
    # size from the remainder rather than rounding 25% separately.
    test_size = df.shape[0] - train_size
    print('Training set size {}, Testing set size {}'.format(
        train_size, test_size))

    # Shuffle the rows before taking the train/test split.
    df = df.sample(frac=1).reset_index(drop=True)
    print(df[0:5])

    X = df.iloc[:, [2, 3]].values
    y = df.iloc[:, 4].values

    y_train, y_test = y[:train_size], y[train_size:]
    # Fit the scaler on the training rows only, so that no statistics of the
    # test rows leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X[:train_size, :])
    X_test = scaler.transform(X[train_size:, :])

    # Scatter plot of the (standardised) training set, one colour per class.
    colours = ListedColormap(('red', 'blue'))
    for i, label in enumerate(np.unique(y_train)):
        members = X_train[y_train == label]
        plt.scatter(members[:, 0], members[:, 1],
                    color=colours(i), label=label, marker='.')
    plt.title('Training Set')
    plt.xlabel('Age')
    plt.ylabel('Estimated Salary')
    plt.legend()

    class_data_dic = generate_data(
        class_data_dic={}, X_train=X_train, y_train=y_train)

    # Per-feature mean and standard deviation (length-2 vectors) for the
    # samples of class 0 and of class 1 separately.
    mean_0 = np.mean(class_data_dic[0], axis=1)
    mean_1 = np.mean(class_data_dic[1], axis=1)
    std_0 = np.std(class_data_dic[0], axis=1)
    std_1 = np.std(class_data_dic[1], axis=1)
    print('mean_0={}, std_0={}, mean_1={}, std_1={}'.format(
        mean_0, std_0, mean_1, std_1))

    # The figure is built but not displayed; uncomment to view it.
    # plt.show()

    # Score the test rows under each class and pick the larger posterior.
    n_train = y_train.shape[0]
    p_1 = posterior(X_test, class_data_dic[1], mean_1, std_1,
                    n_train=n_train)
    p_0 = posterior(X_test, class_data_dic[0], mean_0, std_0,
                    n_train=n_train)
    y_pred = 1 * (p_1 > p_0)

    print(X_test.shape)
    print(class_data_dic[0].shape)
    print(p_1.shape)

    # Confusion matrix with class 1 as the positive class; rows are the true
    # label and columns the predicted label, the same layout that
    # sklearn.metrics.confusion_matrix uses.
    tn = int(np.sum((y_test == 0) & (y_pred == 0)))
    fp = int(np.sum((y_test == 0) & (y_pred == 1)))
    fn = int(np.sum((y_test == 1) & (y_pred == 0)))
    tp = int(np.sum((y_test == 1) & (y_pred == 1)))
    confusion_matrix_alg = np.array([[tn, fp], [fn, tp]])
    print(confusion_matrix_alg)

    # Reference result from scikit-learn on the same split.
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    y_pred_sklearn = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred_sklearn)
    report = classification_report(y_test, y_pred_sklearn)
    print(cm)
    print(report)


if __name__ == '__main__':
    main()