# python-VM/bayes-learning/lesson1.py
import os

import matplotlib
matplotlib.use('TkAgg')  # select the backend before pyplot is imported
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import ListedColormap
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

plt.style.use('seaborn-dark-palette')
# path = os.getcwd()
path = '/home/dtomlinson/projects/bayes-learning'
# read_csv already returns a DataFrame, so no pd.DataFrame wrapper is needed
df = pd.read_csv(os.path.join(path, 'data', 'Social_Network_Ads.csv'),
                 engine='python')
print('{0} rows, {1} columns'.format(df.shape[0], df.shape[1]))
# print(df[1:7])
train_size = int(0.75 * df.shape[0])
test_size = df.shape[0] - train_size  # remainder, so the two always sum to the total
print('Training set size {}, Testing set size {}'.format(train_size,
                                                          test_size))
df = df.sample(frac=1).reset_index(drop=True)  # shuffle the rows before splitting
print(df[0:5])
X = df.iloc[:, [2, 3]].values  # Age, EstimatedSalary
y = df.iloc[:, 4].values       # Purchased (0/1)
scaler = StandardScaler(copy=False)
X = scaler.fit_transform(X)  # standardise features to zero mean, unit variance
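# Quick sanity check on the scaling: each column should now have mean ~0
# and standard deviation ~1.
print('scaled means: {}, scaled stds: {}'.format(
    X.mean(axis=0).round(6), X.std(axis=0).round(6)))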
X_train = X[0:train_size, :]
y_train = y[0:train_size]
X_test = X[train_size:, :]
y_test = y[train_size:]
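# For reference, sklearn's train_test_split shuffles and splits in one
# call; a minimal equivalent of the manual split above (left commented so
# the manual version stays in effect):
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.25, shuffle=True)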
X_set, y_set = X_train, y_train
# ind = np.argsort(X_set[:, 0])
# X_set = X_set[ind]
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red', 'blue'))(i),
                label=j, marker='.')
plt.title('Training Set')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
def generate_data(class_data_dic, X_train, y_train):
    """Group the training samples by class label.

    Each entry class_data_dic[c] ends up as a (n_features, n_samples_c)
    array with one training sample per column.
    """
    first_one = True
    first_zero = True
    for i in range(y_train.shape[0]):
        # reshape the i-th sample into a column vector
        X_temp = X_train[i, :].reshape(X_train[i, :].shape[0], 1)
        if y_train[i] == 1:
            if first_one is True:
                class_data_dic[1] = X_temp
                first_one = False
            else:
                class_data_dic[1] = np.append(class_data_dic[1], X_temp,
                                              axis=1)
        elif y_train[i] == 0:
            if first_zero is True:
                class_data_dic[0] = X_temp
                first_zero = False
            else:
                class_data_dic[0] = np.append(class_data_dic[0], X_temp,
                                              axis=1)
    return class_data_dic
class_data_dic = generate_data(class_data_dic={}, X_train=X_train,
                               y_train=y_train)
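# Equivalent one-liner per class, using boolean masking instead of the
# loop above (produces the same (n_features, n_samples) column layout):
# class_data_dic = {c: X_train[y_train == c].T for c in np.unique(y_train)}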
"""find the mean (2x1) for each column. 0 and 1 are the values for having 0
and 1 seperately"""
mean_0 = np.mean(class_data_dic[0], axis=1)
mean_1 = np.mean(class_data_dic[1], axis=1)
std_0 = np.std(class_data_dic[0], axis=1)
std_1 = np.std(class_data_dic[1], axis=1)
print('mean_0={}, std_0={}, mean_1={}, std_1={}'.format(
mean_0, mean_1, std_0, std_1))
# plt.show()
"""define the likelyhood function (the pdf of the norm dist) """
def likelyhood(x, mean, sigma):
return np.exp(-(x - mean)**2 / (2 * sigma ** 2)) * (1 / (np.sqrt(2 * np.pi) * sigma ** 2))
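# Optional sanity check: the hand-rolled pdf should agree with scipy's
# norm.pdf (scipy is a dependency of sklearn, so it is assumed available).
from scipy.stats import norm
assert np.allclose(likelihood(0.5, 0.0, 1.0), norm.pdf(0.5, loc=0.0, scale=1.0))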
""" the posterior function times together all the likelihoods for each row
of X_test here we are working out the likelihood func for each row of X_test
with their corresponding mean and stdev """
"""we then times this by the prior-prob-func to find the posterior func"""
def posterior(X, X_train_class, mean_, std_):
product = np.prod(likelyhood(X, mean_, std_), axis=1)
product = product * (X_train_class.shape[0] / X.shape[0])
return product
""" we test the posterior fun with the test data to find the probs"""
p_1 = posterior(X_test, class_data_dic[1], mean_1, std_1)
p_0 = posterior(X_test, class_data_dic[0], mean_0, std_0)
y_pred = 1 * (p_1 > p_0)
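# The two posteriors are unnormalised (the shared evidence term cancels in
# the comparison above); dividing by their sum recovers actual class
# probabilities if they are wanted:
prob_1 = p_1 / (p_0 + p_1)
print('first 5 P(class=1) on the test set: {}'.format(prob_1[:5].round(3)))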
# Shape diagnostics.
print(X_test.shape)
print(class_data_dic[0].shape)
print(p_1.shape)
# Build the confusion matrix by hand, in the same layout sklearn uses:
# rows are true labels, columns are predictions, class 1 is positive.
tn = len([i for i in range(0, y_test.shape[0])
          if y_test[i] == 0 and y_pred[i] == 0])
fp = len([i for i in range(0, y_test.shape[0])
          if y_test[i] == 0 and y_pred[i] == 1])
fn = len([i for i in range(0, y_test.shape[0])
          if y_test[i] == 1 and y_pred[i] == 0])
tp = len([i for i in range(0, y_test.shape[0])
          if y_test[i] == 1 and y_pred[i] == 1])
confusion_matrix_alg = np.array([[tn, fp], [fn, tp]])
print(confusion_matrix_alg)
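# Accuracy straight off the diagonal of the matrix; this should closely
# match the accuracy sklearn reports below.
accuracy = (tn + tp) / y_test.shape[0]
print('accuracy (manual): {:.3f}'.format(accuracy))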
# Compare against sklearn's GaussianNB implementation.
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(cm)
print(report)
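# Optional: a decision-region plot for the fitted sklearn classifier, in
# the same ListedColormap styling as the scatter plot above. This is a
# sketch only; uncomment it (together with plt.show()) to display.
# x1, x2 = np.meshgrid(
#     np.arange(X_test[:, 0].min() - 1, X_test[:, 0].max() + 1, 0.01),
#     np.arange(X_test[:, 1].min() - 1, X_test[:, 1].max() + 1, 0.01))
# grid_pred = classifier.predict(
#     np.array([x1.ravel(), x2.ravel()]).T).reshape(x1.shape)
# plt.figure()
# plt.contourf(x1, x2, grid_pred, alpha=0.3,
#              cmap=ListedColormap(('red', 'blue')))
# plt.title('GaussianNB decision regions')
# plt.xlabel('Age (scaled)')
# plt.ylabel('Estimated Salary (scaled)')
# plt.show()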