adding all files done so far
This commit is contained in:
152
bayes-learning/lesson1.py
Normal file
152
bayes-learning/lesson1.py
Normal file
@@ -0,0 +1,152 @@
|
||||
import pandas as pd
|
||||
import os
|
||||
from sklearn.preprocessing import Normalizer, StandardScaler
|
||||
from matplotlib.colors import ListedColormap
|
||||
import numpy as np
|
||||
import matplotlib
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.metrics import confusion_matrix, classification_report
|
||||
|
||||
# Plot configuration: select the TkAgg backend and the seaborn dark-palette
# style sheet for every figure drawn below.
matplotlib.rcParams['backend'] = 'TkAgg'
plt.style.use('seaborn-dark-palette')

# Hard-coded project root that contains the data/ directory
# (os.getcwd() was the earlier, commented-out alternative).
path = '/home/dtomlinson/projects/bayes-learning'

# Read the CSV with pandas' pure-Python parser and wrap it in a DataFrame.
data = pd.read_csv(path + '/data/Social_Network_Ads.csv', engine='python')
df = pd.DataFrame(data)

# Report the dataset dimensions (rows first, then columns).
print('{0} rows, {1} columns'.format(*df.shape))
|
||||
|
||||
# 75 % of the rows are used for training; the test set is everything that is
# left.  Deriving test_size from train_size (instead of int(0.25 * n)) keeps
# the reported size equal to the real length of the X[train_size:] slice even
# when the row count is not divisible by 4.
n_rows = df.shape[0]
train_size = int(0.75 * n_rows)
test_size = n_rows - train_size

print('Training set size {}, Testing set size {}'.format(train_size,
                                                         test_size))

# Shuffle the rows (unseeded, so the split differs between runs) and rebuild a
# clean 0..n-1 index so positional slicing below yields a random split.
df = df.sample(frac=1).reset_index(drop=True)

# Show the first five shuffled rows as a sanity check.
print(df[0:5])
|
||||
|
||||
# Features are columns 2 and 3, the target is column 4 (presumably Age,
# EstimatedSalary and Purchased -- confirm against the CSV header).
X = df.iloc[:, [2, 3]].values
y = df.iloc[:, 4].values

# Learn the scaling statistics from the training rows only, so that no
# information from the test rows leaks into the model, and fit exactly once
# (the original fitted twice and on the whole data set).  The same
# transformation is then applied to every row.
normalizer = StandardScaler().fit(X[0:train_size, :])
X = normalizer.transform(X)

# Positional split: the first train_size (already shuffled) rows train the
# model, the remainder is held out for testing.
X_train = X[0:train_size, :]
y_train = y[0:train_size]

X_test = X[train_size:, :]
y_test = y[train_size:]
|
||||
|
||||
# Scatter plot of the training set, one colour per class label.
X_set, y_set = X_train, y_train

# Index i picks the i-th colour of the map: red for the first label, blue for
# the second.
class_colors = ListedColormap(('red', 'blue'))

for i, j in enumerate(np.unique(y_set)):
    in_class = y_set == j
    # Pass the single RGBA tuple through ``color=`` rather than ``c=``: with
    # ``c=`` matplotlib may interpret a length-4 tuple as four values to be
    # colour-mapped when a class happens to contain exactly four points.
    plt.scatter(X_set[in_class, 0], X_set[in_class, 1],
                color=class_colors(i),
                label=j, marker='.')

plt.title('Training Set')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
|
||||
|
||||
|
||||
def generate_data(class_data_dic, X_train, y_train):
    """Group the training samples by binary class label.

    For each label in (0, 1) that occurs in ``y_train``, the matching rows of
    ``X_train`` are stored under that label, transposed so the array has
    shape ``(n_features, n_samples_in_class)`` with samples kept in their
    original row order.

    Args:
        class_data_dic: Dictionary to fill; it is updated in place.  Labels
            that never occur in ``y_train`` leave their entry untouched.
        X_train: 2-D array of shape ``(n_samples, n_features)``.
        y_train: 1-D array of labels, one per row of ``X_train``.  Labels
            other than 0 and 1 are ignored.

    Returns:
        The same ``class_data_dic`` object that was passed in.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    # A boolean mask selects all rows of a class at once; this replaces the
    # per-row np.append, which re-copied the growing array on every step.
    for label in (0, 1):
        members = y_train == label
        if members.any():
            class_data_dic[label] = X_train[members].T

    return class_data_dic
|
||||
|
||||
|
||||
# Split the training rows by label; class_data_dic[k] holds the samples of
# class k as an array of shape (n_features, n_samples_in_class).
class_data_dic = generate_data(class_data_dic={}, X_train=X_train,
                               y_train=y_train)

# Per-feature mean and standard deviation for each class, computed
# separately for label 0 and label 1.  axis=1 averages over the samples, so
# each result has one entry per feature.
mean_0 = np.mean(class_data_dic[0], axis=1)
mean_1 = np.mean(class_data_dic[1], axis=1)
std_0 = np.std(class_data_dic[0], axis=1)
std_1 = np.std(class_data_dic[1], axis=1)

# Arguments are listed in the same order as the labels in the format string
# (the original printed mean_1 under "std_0" and std_0 under "mean_1").
print('mean_0={}, std_0={}, mean_1={}, std_1={}'.format(
    mean_0, std_0, mean_1, std_1))
# plt.show()  # uncomment to display the training-set scatter plot
|
||||
|
||||
"""define the likelyhood function (the pdf of the norm dist) """
|
||||
|
||||
|
||||
def likelyhood(x, mean, sigma):
    """Evaluate the normal (Gaussian) probability density at ``x``.

    Computes ``exp(-(x - mean)^2 / (2 sigma^2)) / (sqrt(2 pi) * sigma)``.
    All arguments may be scalars or NumPy arrays; arrays are combined with
    the usual broadcasting rules.

    Args:
        x: Point(s) at which to evaluate the density.
        mean: Mean of the distribution.
        sigma: Standard deviation of the distribution (not the variance).

    Returns:
        The density value(s), with the broadcast shape of the inputs.
    """
    # The normalising constant divides by sigma, not sigma ** 2; using the
    # variance here mis-scales classes whose spreads differ.
    normaliser = 1 / (np.sqrt(2 * np.pi) * sigma)
    return np.exp(-(x - mean) ** 2 / (2 * sigma ** 2)) * normaliser
|
||||
|
||||
|
||||
""" the posterior function times together all the likelihoods for each row
|
||||
of X_test here we are working out the likelihood func for each row of X_test
|
||||
with their corresponding mean and stdev """
|
||||
"""we then times this by the prior-prob-func to find the posterior func"""
|
||||
|
||||
|
||||
def posterior(X, X_train_class, mean_, std_, n_train=None):
    """Compute the unnormalised naive-Bayes posterior of one class.

    The per-feature likelihoods of every row of ``X`` are multiplied
    together and the product is weighted by the class prior.

    Args:
        X: 2-D array of samples to score, shape ``(n_samples, n_features)``.
        X_train_class: Training samples of the class, stored as
            ``(n_features, n_samples_in_class)``; only its sample count is
            used, to form the prior.
        mean_: Per-feature means of the class.
        std_: Per-feature standard deviations of the class.
        n_train: Total number of training samples over all classes.  When
            given, the prior is the true class frequency
            ``n_samples_in_class / n_train``.  When omitted, the count is
            divided by ``X.shape[0]`` as before; that factor is the same for
            every class, so comparing posteriors between classes is
            unaffected.

    Returns:
        1-D array with one posterior score per row of ``X``.
    """
    product = np.prod(likelyhood(X, mean_, std_), axis=1)

    # Samples lie along axis 1 of X_train_class; shape[0] is the number of
    # features and would give every class the same prior.
    n_class = X_train_class.shape[1]
    denominator = X.shape[0] if n_train is None else n_train

    return product * (n_class / denominator)
|
||||
|
||||
|
||||
""" we test the posterior fun with the test data to find the probs"""
|
||||
p_1 = posterior(X_test, class_data_dic[1], mean_1, std_1)
|
||||
p_0 = posterior(X_test, class_data_dic[0], mean_0, std_0)
|
||||
y_pred = 1 * (p_1 > p_0)
|
||||
|
||||
print(X_test.shape)
|
||||
print(class_data_dic[0].shape)
|
||||
print(p_1.shape)
|
||||
tp = len([i for i in range(0, y_test.shape[0])
|
||||
if y_test[i] == 0 and y_pred[i] == 0])
|
||||
tn = len([i for i in range(0, y_test.shape[0])
|
||||
if y_test[i] == 0 and y_pred[i] == 1])
|
||||
fp = len([i for i in range(0, y_test.shape[0])
|
||||
if y_test[i] == 1 and y_pred[i] == 0])
|
||||
fn = len([i for i in range(0, y_test.shape[0])
|
||||
if y_test[i] == 1 and y_pred[i] == 1])
|
||||
confusion_matrix_alg = np.array([[tp, tn], [fp, fn]])
|
||||
print(confusion_matrix_alg)
|
||||
|
||||
|
||||
# Cross-check: train scikit-learn's Gaussian naive Bayes on the same split and
# print its confusion matrix and classification report.
classifer = GaussianNB().fit(X_train, y_train)
y_pred = classifer.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

report = classification_report(y_test, y_pred)
print(report)
|
||||
Reference in New Issue
Block a user