# Importing pandas, numpy, and matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading the csv file into a pandas DataFrame
data = pd.read_csv('student_data.csv')

# Printing out the first 10 rows of our data
print(data[:10])
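
# A quick structural check (optional): the rest of this script assumes the
# CSV provides the 'admit', 'gre', 'gpa', and 'rank' columns, so it is worth
# failing fast if any of them are missing.
expected_cols = {"admit", "gre", "gpa", "rank"}
assert expected_cols.issubset(data.columns), \
    "student_data.csv is missing one of: admit, gre, gpa, rank"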

# Function to help us plot
def plot_points(data):
    X = np.array(data[["gre", "gpa"]])
    y = np.array(data["admit"])
    admitted = X[np.argwhere(y == 1)]
    rejected = X[np.argwhere(y == 0)]
    plt.scatter([s[0][0] for s in rejected], [s[0][1] for s in rejected],
                s=25, color='red', edgecolor='k')
    plt.scatter([s[0][0] for s in admitted], [s[0][1] for s in admitted],
                s=25, color='cyan', edgecolor='k')
    plt.xlabel('Test (GRE)')
    plt.ylabel('Grades (GPA)')

# Plotting the points
plot_points(data)
plt.show()

# Separating the ranks
data_rank1 = data[data["rank"] == 1]
data_rank2 = data[data["rank"] == 2]
data_rank3 = data[data["rank"] == 3]
data_rank4 = data[data["rank"] == 4]

# Plotting the graphs
plot_points(data_rank1)
plt.title("Rank 1")
plt.show()
plot_points(data_rank2)
plt.title("Rank 2")
plt.show()
plot_points(data_rank3)
plt.title("Rank 3")
plt.show()
plot_points(data_rank4)
plt.title("Rank 4")
plt.show()

# TODO: Make dummy variables for rank
# dtype=int keeps the dummy columns numeric; newer pandas versions default
# to bool, which would break the matrix math (np.dot / np.exp) further down
one_hot_data = pd.concat(
    [data, pd.get_dummies(data['rank'], prefix='rank', dtype=int)], axis=1)

# TODO: Drop the previous rank column
one_hot_data = one_hot_data.drop('rank', axis=1)

# Print the first 10 rows of our data
print(one_hot_data[:10])
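
# A quick check (optional): with ranks 1-4, get_dummies should produce the
# columns rank_1 .. rank_4, and every row should have exactly one of them set.
rank_cols = ['rank_1', 'rank_2', 'rank_3', 'rank_4']
assert (one_hot_data[rank_cols].sum(axis=1) == 1).all()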

# Making a copy of our data
processed_data = one_hot_data[:]

# TODO: Scale the columns
# gre has a maximum of 800 and gpa a maximum of 4.0, so dividing by those
# maxima maps both features into the range [0, 1]
processed_data['gre'] = processed_data['gre'] / 800
processed_data['gpa'] = processed_data['gpa'] / 4.0

# Printing the first 10 rows of our processed data
print(processed_data[:10])
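
# Optional check: after scaling, both columns should lie in [0, 1].
assert processed_data['gre'].between(0, 1).all()
assert processed_data['gpa'].between(0, 1).all()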

# Split the data randomly: 90% of the rows for training, 10% for testing
sample = np.random.choice(processed_data.index,
                          size=int(len(processed_data) * 0.9), replace=False)
train_data, test_data = processed_data.iloc[sample], processed_data.drop(sample)

print("Number of training samples is", len(train_data))
print("Number of testing samples is", len(test_data))
print(train_data[:10])
print(test_data[:10])
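
# Optional sanity check: the two splits are disjoint and cover all the rows.
assert len(train_data) + len(test_data) == len(processed_data)
assert set(train_data.index).isdisjoint(test_data.index)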

# Split the data into features (inputs) and targets (labels)
features = train_data.drop('admit', axis=1)
targets = train_data['admit']
features_test = test_data.drop('admit', axis=1)
targets_test = test_data['admit']

print(features[:10])
print(targets[:10])

# Activation (sigmoid) function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


# Derivative of the sigmoid: sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))
def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))


# Cross-entropy error for a single prediction
def error_formula(y, output):
    return - y * np.log(output) - (1 - y) * np.log(1 - output)
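
# Spot checks (optional): sigmoid(0) is exactly 0.5, its derivative peaks
# at 0 with value 0.25, and the error vanishes for a perfect prediction.
assert sigmoid(0) == 0.5
assert sigmoid_prime(0) == 0.25
assert np.isclose(error_formula(1, sigmoid(50)), 0, atol=1e-6)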

# TODO: Write the error term formula
# For the cross-entropy error above, the sigmoid derivative cancels during
# differentiation, so the error term is simply the difference between the
# target and the prediction (see the derivation below). The x argument is
# kept in the signature to match the calling code.
def error_term_formula(x, y, output):
    return y - output
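
# Derivation sketch for the error term, with output = sigmoid(h), h = x . w:
#   dE/dh = d/dh [-y*log(sigmoid(h)) - (1-y)*log(1 - sigmoid(h))]
#         = -y*(1 - output) + (1-y)*output      (using sigmoid' = s*(1-s))
#         = output - y
# The weight gradient is then (output - y) * x; since del_w is *added* to
# the weights below, the ascent-direction error term is (y - output).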

# Neural Network hyperparameters
epochs = 1000
learnrate = 0.5


# Training function
def train_nn(features, targets, epochs, learnrate):

    # Use the same seed to make debugging easier
    np.random.seed(42)

    n_records, n_features = features.shape
    last_loss = None

    # Initialize weights
    weights = np.random.normal(scale=1 / n_features**.5, size=n_features)

    for e in range(epochs):
        del_w = np.zeros(weights.shape)
        for x, y in zip(features.values, targets):
            # Loop through all records, x is the input, y is the target

            # Activation of the output unit
            # Notice we multiply the inputs and the weights here
            # rather than storing h as a separate variable
            output = sigmoid(np.dot(x, weights))

            # The cross-entropy error for this record (for reference;
            # the update below uses the error term, not this value)
            error = error_formula(y, output)

            # The error term
            error_term = error_term_formula(x, y, output)

            # The gradient descent step, the error term times the inputs
            del_w += error_term * x

        # Update the weights here. The learning rate times the
        # change in weights, divided by the number of records to average
        weights += learnrate * del_w / n_records

        # Printing out the mean square error on the training set
        if e % (epochs // 10) == 0:
            out = sigmoid(np.dot(features, weights))
            loss = np.mean((out - targets) ** 2)
            print("Epoch:", e)
            if last_loss and last_loss < loss:
                print("Train loss: ", loss, " WARNING - Loss Increasing")
            else:
                print("Train loss: ", loss)
            last_loss = loss
            print("=========")
    print("Finished training!")
    return weights


weights = train_nn(features, targets, epochs, learnrate)
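
# A quick look at the learned model (optional): pairing each weight with its
# feature name shows which inputs push the prediction toward admission.
for name, w in zip(features.columns, weights):
    print("{:>8}: {:+.4f}".format(name, w))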

# Calculate accuracy on test data
test_out = sigmoid(np.dot(features_test, weights))
predictions = test_out > 0.5
accuracy = np.mean(predictions == targets_test)
print("Prediction accuracy: {:.3f}".format(accuracy))