diff --git a/python/Deep Learning/Introduction to Neural Networks/Gradient Descent/GradientDescent.ipynb b/python/Deep Learning/Introduction to Neural Networks/Gradient Descent/GradientDescent.ipynb new file mode 100644 index 0000000..080cd7a --- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Gradient Descent/GradientDescent.ipynb @@ -0,0 +1,244 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Implementing the Gradient Descent Algorithm\n", + "\n", + "In this lab, we'll implement the basic functions of the Gradient Descent algorithm to find the boundary in a small dataset. First, we'll start with some functions that will help us plot and visualize the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "#Some helper functions for plotting and drawing lines\n", + "\n", + "def plot_points(X, y):\n", + " admitted = X[np.argwhere(y==1)]\n", + " rejected = X[np.argwhere(y==0)]\n", + " plt.scatter([s[0][0] for s in rejected], [s[0][1] for s in rejected], s = 25, color = 'blue', edgecolor = 'k')\n", + " plt.scatter([s[0][0] for s in admitted], [s[0][1] for s in admitted], s = 25, color = 'red', edgecolor = 'k')\n", + "\n", + "def display(m, b, color='g--'):\n", + " plt.xlim(-0.05,1.05)\n", + " plt.ylim(-0.05,1.05)\n", + " x = np.arange(-10, 10, 0.1)\n", + " plt.plot(x, m*x+b, color)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reading and plotting the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "data = pd.read_csv('data.csv', header=None)\n", + "X = np.array(data[[0,1]])\n", + "y = np.array(data[2])\n", + "plot_points(X,y)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "## TODO: Implementing the basic functions\n", + "Here is your turn to shine. Implement the following formulas, as explained in the text.\n", + "- Sigmoid activation function\n", + "\n", + "$$\\sigma(x) = \\frac{1}{1+e^{-x}}$$\n", + "\n", + "- Output (prediction) formula\n", + "\n", + "$$\\hat{y} = \\sigma(w_1 x_1 + w_2 x_2 + b)$$\n", + "\n", + "- Error function\n", + "\n", + "$$Error(y, \\hat{y}) = - y \\log(\\hat{y}) - (1-y) \\log(1-\\hat{y})$$\n", + "\n", + "- The function that updates the weights\n", + "\n", + "$$ w_i \\longrightarrow w_i + \\alpha (y - \\hat{y}) x_i$$\n", + "\n", + "$$ b \\longrightarrow b + \\alpha (y - \\hat{y})$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Implement the following functions\n", + "\n", + "# Activation (sigmoid) function\n", + "def sigmoid(x):\n", + " pass\n", + "\n", + "# Output (prediction) formula\n", + "def output_formula(features, weights, bias):\n", + " pass\n", + "\n", + "# Error (log-loss) formula\n", + "def error_formula(y, output):\n", + " pass\n", + "\n", + "# Gradient descent step\n", + "def update_weights(x, y, weights, bias, learnrate):\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training function\n", + "This function will help us iterate the gradient descent algorithm through all the data, for a number of epochs. It will also plot the data, and some of the boundary lines obtained as we run the algorithm." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "np.random.seed(44)\n", + "\n", + "epochs = 100\n", + "learnrate = 0.01\n", + "\n", + "def train(features, targets, epochs, learnrate, graph_lines=False):\n", + " \n", + " errors = []\n", + " n_records, n_features = features.shape\n", + " last_loss = None\n", + " weights = np.random.normal(scale=1 / n_features**.5, size=n_features)\n", + " bias = 0\n", + " for e in range(epochs):\n", + " del_w = np.zeros(weights.shape)\n", + " for x, y in zip(features, targets):\n", + " output = output_formula(x, weights, bias)\n", + " error = error_formula(y, output)\n", + " weights, bias = update_weights(x, y, weights, bias, learnrate)\n", + " \n", + " # Printing out the log-loss error on the training set\n", + " out = output_formula(features, weights, bias)\n", + " loss = np.mean(error_formula(targets, out))\n", + " errors.append(loss)\n", + " if e % (epochs / 10) == 0:\n", + " print(\"\\n========== Epoch\", e,\"==========\")\n", + " if last_loss and last_loss < loss:\n", + " print(\"Train loss: \", loss, \" WARNING - Loss Increasing\")\n", + " else:\n", + " print(\"Train loss: \", loss)\n", + " last_loss = loss\n", + " predictions = out > 0.5\n", + " accuracy = np.mean(predictions == targets)\n", + " print(\"Accuracy: \", accuracy)\n", + " if graph_lines and e % (epochs / 100) == 0:\n", + " display(-weights[0]/weights[1], -bias/weights[1])\n", + " \n", + "\n", + " # Plotting the solution boundary\n", + " plt.title(\"Solution boundary\")\n", + " display(-weights[0]/weights[1], -bias/weights[1], 'black')\n", + "\n", + " # Plotting the data\n", + " plot_points(features, targets)\n", + " plt.show()\n", + "\n", + " # Plotting the error\n", + " plt.title(\"Error Plot\")\n", + " plt.xlabel('Number of epochs')\n", + " plt.ylabel('Error')\n", + " plt.plot(errors)\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": 
[ + "## Time to train the algorithm!\n", + "When we run the function, we'll obtain the following:\n", + "- 10 updates with the current training loss and accuracy\n", + "- A plot of the data and some of the boundary lines obtained. The final one is in black. Notice how the lines get closer and closer to the best fit, as we go through more epochs.\n", + "- A plot of the error function. Notice how it decreases as we go through more epochs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "train(X, y, epochs, learnrate, True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/Deep Learning/Introduction to Neural Networks/Gradient Descent/GradientDescentSolutions.ipynb b/python/Deep Learning/Introduction to Neural Networks/Gradient Descent/GradientDescentSolutions.ipynb new file mode 100644 index 0000000..ca2b2be --- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Gradient Descent/GradientDescentSolutions.ipynb @@ -0,0 +1,59 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Solutions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + ":\n", + "# Activation (sigmoid) function\n", + "def 
sigmoid(x):\n", + " return 1 / (1 + np.exp(-x))\n", + "\n", + "def output_formula(features, weights, bias):\n", + " return sigmoid(np.dot(features, weights) + bias)\n", + "\n", + "def error_formula(y, output):\n", + " return - y*np.log(output) - (1 - y) * np.log(1-output)\n", + "\n", + "def update_weights(x, y, weights, bias, learnrate):\n", + " output = output_formula(x, weights, bias)\n", + " d_error = y - output\n", + " weights += learnrate * d_error * x\n", + " bias += learnrate * d_error\n", + " return weights, bias" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/Deep Learning/Introduction to Neural Networks/Gradient Descent/data.csv b/python/Deep Learning/Introduction to Neural Networks/Gradient Descent/data.csv new file mode 100644 index 0000000..c4324e9 --- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Gradient Descent/data.csv @@ -0,0 +1,100 @@ +0.78051,-0.063669,1 +0.28774,0.29139,1 +0.40714,0.17878,1 +0.2923,0.4217,1 +0.50922,0.35256,1 +0.27785,0.10802,1 +0.27527,0.33223,1 +0.43999,0.31245,1 +0.33557,0.42984,1 +0.23448,0.24986,1 +0.0084492,0.13658,1 +0.12419,0.33595,1 +0.25644,0.42624,1 +0.4591,0.40426,1 +0.44547,0.45117,1 +0.42218,0.20118,1 +0.49563,0.21445,1 +0.30848,0.24306,1 +0.39707,0.44438,1 +0.32945,0.39217,1 +0.40739,0.40271,1 +0.3106,0.50702,1 +0.49638,0.45384,1 +0.10073,0.32053,1 +0.69907,0.37307,1 +0.29767,0.69648,1 +0.15099,0.57341,1 +0.16427,0.27759,1 +0.33259,0.055964,1 +0.53741,0.28637,1 +0.19503,0.36879,1 +0.40278,0.035148,1 +0.21296,0.55169,1 +0.48447,0.56991,1 +0.25476,0.34596,1 +0.21726,0.28641,1 +0.67078,0.46538,1 
+0.3815,0.4622,1 +0.53838,0.32774,1 +0.4849,0.26071,1 +0.37095,0.38809,1 +0.54527,0.63911,1 +0.32149,0.12007,1 +0.42216,0.61666,1 +0.10194,0.060408,1 +0.15254,0.2168,1 +0.45558,0.43769,1 +0.28488,0.52142,1 +0.27633,0.21264,1 +0.39748,0.31902,1 +0.5533,1,0 +0.44274,0.59205,0 +0.85176,0.6612,0 +0.60436,0.86605,0 +0.68243,0.48301,0 +1,0.76815,0 +0.72989,0.8107,0 +0.67377,0.77975,0 +0.78761,0.58177,0 +0.71442,0.7668,0 +0.49379,0.54226,0 +0.78974,0.74233,0 +0.67905,0.60921,0 +0.6642,0.72519,0 +0.79396,0.56789,0 +0.70758,0.76022,0 +0.59421,0.61857,0 +0.49364,0.56224,0 +0.77707,0.35025,0 +0.79785,0.76921,0 +0.70876,0.96764,0 +0.69176,0.60865,0 +0.66408,0.92075,0 +0.65973,0.66666,0 +0.64574,0.56845,0 +0.89639,0.7085,0 +0.85476,0.63167,0 +0.62091,0.80424,0 +0.79057,0.56108,0 +0.58935,0.71582,0 +0.56846,0.7406,0 +0.65912,0.71548,0 +0.70938,0.74041,0 +0.59154,0.62927,0 +0.45829,0.4641,0 +0.79982,0.74847,0 +0.60974,0.54757,0 +0.68127,0.86985,0 +0.76694,0.64736,0 +0.69048,0.83058,0 +0.68122,0.96541,0 +0.73229,0.64245,0 +0.76145,0.60138,0 +0.58985,0.86955,0 +0.73145,0.74516,0 +0.77029,0.7014,0 +0.73156,0.71782,0 +0.44556,0.57991,0 +0.85275,0.85987,0 +0.51912,0.62359,0 diff --git a/python/Deep Learning/Introduction to Neural Networks/Gradient Descent/gradient_descent.py b/python/Deep Learning/Introduction to Neural Networks/Gradient Descent/gradient_descent.py new file mode 100644 index 0000000..882ba3e --- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Gradient Descent/gradient_descent.py @@ -0,0 +1,122 @@ +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + + +# Some helper functions for plotting and drawing lines + +def plot_points(X, y): + admitted = X[np.argwhere(y == 1)] + rejected = X[np.argwhere(y == 0)] + plt.scatter([s[0][0] for s in rejected], + [s[0][1] for s in rejected], s=25, + color='blue', edgecolor='k') + plt.scatter([s[0][0] for s in admitted], + [s[0][1] for s in admitted], + s=25, color='red', edgecolor='k') + + 
+def display(m, b, color='g--'): + plt.xlim(-0.05, 1.05) + plt.ylim(-0.05, 1.05) + x = np.arange(-10, 10, 0.1) + plt.plot(x, m * x + b, color) + + +data = pd.read_csv('data.csv', header=None) +X = np.array(data[[0, 1]]) +y = np.array(data[2]) +plot_points(X, y) +plt.show() + + +# Implement the following functions + +# Activation (sigmoid) function +def sigmoid(x): + return 1 / (1 + np.exp(-x)) + +# Output (prediction) formula + + +def output_formula(features, weights, bias): + return sigmoid(np.dot(features, weights) + bias) + +# Error (log-loss) formula + + +def error_formula(y, output): + return -y * np.log(output) - (1 - y) * np.log(1 - output) + +# Gradient descent step + + +def update_weights(x, y, weights, bias, learnrate): + output = output_formula(x, weights, bias) + d_error = y - output + weights += learnrate * d_error * x + bias += learnrate * d_error + return weights, bias + + +""" +Training function +This function will help us iterate the gradient descent algorithm through all +the data, for a number of epochs. It will also plot the data, and some of the +boundary lines obtained as we run the algorithm. 
+""" + +np.random.seed(44) + +epochs = 100 +learnrate = 0.01 + + +def train(features, targets, epochs, learnrate, graph_lines=False): + + errors = [] + n_records, n_features = features.shape + last_loss = None + weights = np.random.normal(scale=1 / n_features**.5, size=n_features) + bias = 0 + for e in range(epochs): + del_w = np.zeros(weights.shape) + for x, y in zip(features, targets): + output = output_formula(x, weights, bias) + error = error_formula(y, output) + weights, bias = update_weights(x, y, weights, bias, learnrate) + + # Printing out the log-loss error on the training set + out = output_formula(features, weights, bias) + loss = np.mean(error_formula(targets, out)) + errors.append(loss) + if e % (epochs / 10) == 0: + print("\n========== Epoch", e, "==========") + if last_loss and last_loss < loss: + print("Train loss: ", loss, " WARNING - Loss Increasing") + else: + print("Train loss: ", loss) + last_loss = loss + predictions = out > 0.5 + accuracy = np.mean(predictions == targets) + print("Accuracy: ", accuracy) + if graph_lines and e % (epochs / 100) == 0: + display(-weights[0] / weights[1], -bias / weights[1]) + + # Plotting the solution boundary + plt.title("Solution boundary") + display(-weights[0] / weights[1], -bias / weights[1], 'black') + + # Plotting the data + plot_points(features, targets) + plt.show() + + # Plotting the error + plt.title("Error Plot") + plt.xlabel('Number of epochs') + plt.ylabel('Error') + plt.plot(errors) + plt.show() + + +train(X, y, epochs, learnrate, True) diff --git a/python/Deep Learning/Introduction to Neural Networks/Perceptron Algorithm/data.csv b/python/Deep Learning/Introduction to Neural Networks/Perceptron Algorithm/data.csv new file mode 100644 index 0000000..c4324e9 --- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Perceptron Algorithm/data.csv @@ -0,0 +1,100 @@ +0.78051,-0.063669,1 +0.28774,0.29139,1 +0.40714,0.17878,1 +0.2923,0.4217,1 +0.50922,0.35256,1 +0.27785,0.10802,1 
+0.27527,0.33223,1 +0.43999,0.31245,1 +0.33557,0.42984,1 +0.23448,0.24986,1 +0.0084492,0.13658,1 +0.12419,0.33595,1 +0.25644,0.42624,1 +0.4591,0.40426,1 +0.44547,0.45117,1 +0.42218,0.20118,1 +0.49563,0.21445,1 +0.30848,0.24306,1 +0.39707,0.44438,1 +0.32945,0.39217,1 +0.40739,0.40271,1 +0.3106,0.50702,1 +0.49638,0.45384,1 +0.10073,0.32053,1 +0.69907,0.37307,1 +0.29767,0.69648,1 +0.15099,0.57341,1 +0.16427,0.27759,1 +0.33259,0.055964,1 +0.53741,0.28637,1 +0.19503,0.36879,1 +0.40278,0.035148,1 +0.21296,0.55169,1 +0.48447,0.56991,1 +0.25476,0.34596,1 +0.21726,0.28641,1 +0.67078,0.46538,1 +0.3815,0.4622,1 +0.53838,0.32774,1 +0.4849,0.26071,1 +0.37095,0.38809,1 +0.54527,0.63911,1 +0.32149,0.12007,1 +0.42216,0.61666,1 +0.10194,0.060408,1 +0.15254,0.2168,1 +0.45558,0.43769,1 +0.28488,0.52142,1 +0.27633,0.21264,1 +0.39748,0.31902,1 +0.5533,1,0 +0.44274,0.59205,0 +0.85176,0.6612,0 +0.60436,0.86605,0 +0.68243,0.48301,0 +1,0.76815,0 +0.72989,0.8107,0 +0.67377,0.77975,0 +0.78761,0.58177,0 +0.71442,0.7668,0 +0.49379,0.54226,0 +0.78974,0.74233,0 +0.67905,0.60921,0 +0.6642,0.72519,0 +0.79396,0.56789,0 +0.70758,0.76022,0 +0.59421,0.61857,0 +0.49364,0.56224,0 +0.77707,0.35025,0 +0.79785,0.76921,0 +0.70876,0.96764,0 +0.69176,0.60865,0 +0.66408,0.92075,0 +0.65973,0.66666,0 +0.64574,0.56845,0 +0.89639,0.7085,0 +0.85476,0.63167,0 +0.62091,0.80424,0 +0.79057,0.56108,0 +0.58935,0.71582,0 +0.56846,0.7406,0 +0.65912,0.71548,0 +0.70938,0.74041,0 +0.59154,0.62927,0 +0.45829,0.4641,0 +0.79982,0.74847,0 +0.60974,0.54757,0 +0.68127,0.86985,0 +0.76694,0.64736,0 +0.69048,0.83058,0 +0.68122,0.96541,0 +0.73229,0.64245,0 +0.76145,0.60138,0 +0.58985,0.86955,0 +0.73145,0.74516,0 +0.77029,0.7014,0 +0.73156,0.71782,0 +0.44556,0.57991,0 +0.85275,0.85987,0 +0.51912,0.62359,0 diff --git a/python/Deep Learning/Introduction to Neural Networks/Perceptron Algorithm/perceptron.py b/python/Deep Learning/Introduction to Neural Networks/Perceptron Algorithm/perceptron.py new file mode 100644 index 0000000..b5c50f2 
--- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Perceptron Algorithm/perceptron.py @@ -0,0 +1,54 @@ +import numpy as np +# Setting the random seed, feel free to change it and see different solutions. +np.random.seed(42) + + +def stepFunction(t): + if t >= 0: + return 1 + return 0 + + +def prediction(X, W, b): + return stepFunction((np.matmul(X, W) + b)[0]) + +# TODO: Fill in the code below to implement the perceptron trick. +# The function should receive as inputs the data X, the labels y, +# the weights W (as an array), and the bias b, +# update the weights and bias W, b, according to the perceptron algorithm, +# and return W and b. + + +def perceptronStep(X, y, W, b, learn_rate=0.01): + for i in range(len(X)): + y_hat = prediction(X[i], W, b) + if y[i] - y_hat == 1: + W[0] += X[i][0] * learn_rate + W[1] += X[i][1] * learn_rate + b += learn_rate + elif y[i] - y_hat == -1: + W[0] -= X[i][0] * learn_rate + W[1] -= X[i][1] * learn_rate + b -= learn_rate + return W, b + + +# This function runs the perceptron algorithm repeatedly on the dataset, +# and returns a few of the boundary lines obtained in the iterations, +# for plotting purposes. +# Feel free to play with the learning rate and the num_epochs, +# and see your results plotted below. + + +def trainPerceptronAlgorithm(X, y, learn_rate=0.01, num_epochs=25): + x_min, x_max = min(X.T[0]), max(X.T[0]) + y_min, y_max = min(X.T[1]), max(X.T[1]) + W = np.array(np.random.rand(2, 1)) + b = np.random.rand(1)[0] + x_max + # These are the solution lines that get plotted below. + boundary_lines = [] + for i in range(num_epochs): + # In each epoch, we apply the perceptron step. 
+ W, b = perceptronStep(X, y, W, b, learn_rate) + boundary_lines.append((-W[0] / W[1], -b / W[1])) + return boundary_lines diff --git a/python/Deep Learning/Introduction to Neural Networks/Perceptrons as Logical Operators/perceptron_and.py b/python/Deep Learning/Introduction to Neural Networks/Perceptrons as Logical Operators/perceptron_and.py new file mode 100644 index 0000000..e76247f --- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Perceptrons as Logical Operators/perceptron_and.py @@ -0,0 +1,32 @@ +import pandas as pd + +# TODO: Set weight1, weight2, and bias +weight1 = 1.0 +weight2 = 1.0 +bias = -1.25 + + +# DON'T CHANGE ANYTHING BELOW +# Inputs and outputs +test_inputs = [(0, 0), (0, 1), (1, 0), (1, 1)] +correct_outputs = [False, False, False, True] +outputs = [] + +# Generate and check output +for test_input, correct_output in zip(test_inputs, correct_outputs): + linear_combination = weight1 * \ + test_input[0] + weight2 * test_input[1] + bias + output = int(linear_combination >= 0) + is_correct_string = 'Yes' if output == correct_output else 'No' + outputs.append([test_input[0], test_input[1], + linear_combination, output, is_correct_string]) + +# Print output +num_wrong = len([output[4] for output in outputs if output[4] == 'No']) +output_frame = pd.DataFrame(outputs, columns=[ + 'Input 1', ' Input 2', ' Linear Combination', ' Activation Output', ' Is Correct']) +if not num_wrong: + print('Nice! You got it all correct.\n') +else: + print('You got {} wrong. 
Keep trying!\n'.format(num_wrong)) +print(output_frame.to_string(index=False)) diff --git a/python/Deep Learning/Introduction to Neural Networks/Perceptrons as Logical Operators/perceptron_not.py b/python/Deep Learning/Introduction to Neural Networks/Perceptrons as Logical Operators/perceptron_not.py new file mode 100644 index 0000000..4f4ea6e --- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Perceptrons as Logical Operators/perceptron_not.py @@ -0,0 +1,29 @@ +import pandas as pd + +# TODO: Set weight1, weight2, and bias +weight1 = 0 +weight2 = -1 +bias = 0.5 + + +# DON'T CHANGE ANYTHING BELOW +# Inputs and outputs +test_inputs = [(0, 0), (0, 1), (1, 0), (1, 1)] +correct_outputs = [True, False, True, False] +outputs = [] + +# Generate and check output +for test_input, correct_output in zip(test_inputs, correct_outputs): + linear_combination = weight1 * test_input[0] + weight2 * test_input[1] + bias + output = int(linear_combination >= 0) + is_correct_string = 'Yes' if output == correct_output else 'No' + outputs.append([test_input[0], test_input[1], linear_combination, output, is_correct_string]) + +# Print output +num_wrong = len([output[4] for output in outputs if output[4] == 'No']) +output_frame = pd.DataFrame(outputs, columns=['Input 1', ' Input 2', ' Linear Combination', ' Activation Output', ' Is Correct']) +if not num_wrong: + print('Nice! You got it all correct.\n') +else: + print('You got {} wrong. 
Keep trying!\n'.format(num_wrong)) +print(output_frame.to_string(index=False)) diff --git a/python/Deep Learning/Introduction to Neural Networks/Softmax/cross_entropy.py b/python/Deep Learning/Introduction to Neural Networks/Softmax/cross_entropy.py new file mode 100644 index 0000000..916f517 --- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Softmax/cross_entropy.py @@ -0,0 +1,10 @@ +import numpy as np + +# Write a function that takes as input two lists Y, P, +# and returns the float corresponding to their cross-entropy. + + +def cross_entropy(Y, P): + Y = np.float_(Y) + P = np.float_(P) + return -np.sum(Y * np.log(P) + (1 - Y) * np.log(1 - P)) diff --git a/python/Deep Learning/Introduction to Neural Networks/Softmax/softmax.py b/python/Deep Learning/Introduction to Neural Networks/Softmax/softmax.py new file mode 100644 index 0000000..979abf7 --- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Softmax/softmax.py @@ -0,0 +1,18 @@ +import numpy as np + +# Write a function that takes as input a list of numbers, and returns +# the list of values given by the softmax function. 
+ + +def softmax(L): + expL = np.exp(L) + sumExpL = sum(expL) + result = [] + for i in expL: + result.append(i * 1.0 / sumExpL) + return result + + # Note: The function np.divide can also be used here, as follows: + # def softmax(L): + # expL = np.exp(L) + # return np.divide (expL, expL.sum()) diff --git a/python/Deep Learning/Introduction to Neural Networks/Student Admissions(Neural Network)/StudentAdmissions.ipynb b/python/Deep Learning/Introduction to Neural Networks/Student Admissions(Neural Network)/StudentAdmissions.ipynb new file mode 100644 index 0000000..bbb7fb3 --- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Student Admissions(Neural Network)/StudentAdmissions.ipynb @@ -0,0 +1,947 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Predicting Student Admissions with Neural Networks\n", + "In this notebook, we predict student admissions to graduate school at UCLA based on three pieces of data:\n", + "- GRE Scores (Test)\n", + "- GPA Scores (Grades)\n", + "- Class rank (1-4)\n", + "\n", + "The dataset originally came from here: http://www.ats.ucla.edu/\n", + "\n", + "## Loading the data\n", + "To load the data and format it nicely, we will use two very useful packages called Pandas and Numpy. You can read the documentation here:\n", + "- https://pandas.pydata.org/pandas-docs/stable/\n", + "- https://docs.scipy.org/" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
admitgregparank
003803.613
116603.673
218004.001
316403.194
405202.934
517603.002
615602.981
704003.082
815403.393
907003.922
\n", + "
" + ], + "text/plain": [ + " admit gre gpa rank\n", + "0 0 380 3.61 3\n", + "1 1 660 3.67 3\n", + "2 1 800 4.00 1\n", + "3 1 640 3.19 4\n", + "4 0 520 2.93 4\n", + "5 1 760 3.00 2\n", + "6 1 560 2.98 1\n", + "7 0 400 3.08 2\n", + "8 1 540 3.39 3\n", + "9 0 700 3.92 2" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Importing pandas and numpy\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Reading the csv file into a pandas DataFrame\n", + "data = pd.read_csv('student_data.csv')\n", + "\n", + "# Printing out the first 10 rows of our data\n", + "data[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plotting the data\n", + "\n", + "First let's make a plot of our data to see how it looks. In order to have a 2D plot, let's ignore the rank." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Importing matplotlib\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Function to help us plot\n", + "def plot_points(data):\n", + " X = np.array(data[[\"gre\",\"gpa\"]])\n", + " y = np.array(data[\"admit\"])\n", + " admitted = X[np.argwhere(y==1)]\n", + " rejected = X[np.argwhere(y==0)]\n", + " plt.scatter([s[0][0] for s in rejected], [s[0][1] for s in rejected], s = 25, color = 'red', edgecolor = 'k')\n", + " plt.scatter([s[0][0] for s in admitted], [s[0][1] for s in admitted], s = 25, color = 'cyan', edgecolor = 'k')\n", + " plt.xlabel('Test (GRE)')\n", + " plt.ylabel('Grades (GPA)')\n", + " \n", + "# Plotting the points\n", + "plot_points(data)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Roughly, it looks like the students with high scores in the grades and test passed, while the ones with low scores didn't, but the data is not as nicely 
separable as we hoped it would. Maybe it would help to take the rank into account? Let's make 4 plots, each one for each rank." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAAEWCAYAAAB1xKBvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzt3X+cXHV97/HXx2VDZBYhkiiYEAMFfyLshDSDcK9VQrPsbYLmUUS21dJaL9VLW38XoiErG9tgb6u26q2mWq1Wo4IGs7HrLsoPizQbN+zyGxQVJQEk/BJ2A2x2/dw/zndhGPbHmd05M+fMvJ+Pxzx2zjnfmfmc3Z35zPn+NHdHRERkJs+rdQAiIpINShgiIhKLEoaIiMSihCEiIrEoYYiISCxKGCIiEosShkiVmdlHzOw/ah2HSLmUMEQmYWZ3m9kTZjZsZveb2ZfMrKWG8cwzszvMbE+tYhBRwhCZ2lp3bwFagTywvoaxfBB4oIavL6KEITITd78f6CVKHACY2R+Y2aCZPWZm95jZR4qOLTMzN7PzzOxXZvagmX14suc2s2Yz22pm3zKzeVOUOQZ4K7C5oicmUiYlDJEZmNkSoB24q2j3CPAnwOHAHwDvMrM3lTz0fwAvB1YBG83slSXP+3zgCuAp4Bx3H50ihE8BHwKemOOpiMyJEobI1K4ws8eBe4iqgzonDrj7Ne5+s7v/1t1vArYCv1fy+Evc/Ql3vxG4ETip6NgLgO8BPwP+zN3HJwvAzNYBB7n7toqdlcgsKWGITO1N7n4o8HrgFcDCiQNmVjCzq81sn5n9Bnhn8fHg/qL7+4HiRvNTgBOBS32KGUDNLAf8PfBXcz0RkUpQwhCZgbtfC3wJ+Iei3V8DtgNHu/thwGcBK+Np+4jaJH5gZi+eoszxwDLgv8zsfuDbwFGh19ayMl5LpCKUMETi+STw+2Y20fB9KPCwuz9pZiuBPyr3Cd3974kSzw/MrPTqBOAW4GiixvZW4B3Ar8P9e8o/BZG5UcIQicHd9wFfBi4Ou/4P0BXaODYC35zl824iavj+vpm9sOTYmLvfP3EDHgZ+G7YnbfMQSZJpASUREYlDVxgiIhKLEoaIiMSihCEiIrEoYYiISCwH1TqASlq4cKEvW7as1mGIiGTG7t27H3T3RXHK1lXCWLZsGQMDA7UOQ0QkM8zsl3HLqkpKRERiUcIQEZFYlDBERCQWJQwREYlFCUNERGJRwhARkVgSTxhm1hTWPt4xybGDzewb
ZnaXmfUXz/FvZuvD/jvNrC3pOEVEZHrVGIfxbuB2oiUpS/058Ii7H2dm5wIfA95iZq8CzgVeDbyEaOrnl2lKZxGRZ4yPj9PT08Pg4CD5fJ729naampoSe71EE4aZLQH+APhb4H2TFHkj8JFw/3Lg02ZmYf/X3f0p4BdmdhewEvjvJOMVEcmK8fFx1rW1sbe/n9UjI3TmcmwpFNjW25tY0ki6SuqTwN8Av53i+GLCymHuPgb8BjiieH+wJ+wTERGgp6eHvf397BweZrM7O4eH2dPfT09PT2KvmVjCMLM1wAPuvnu6YpPs82n2T/Y655vZgJkN7Nu3bxaRiohkz+DgIKtHRmgO281A28gIQ0NDib1mklcYpwFnmdndwNeB083sP0rK7CFasxgzOwg4jGgZyqf3B0uAeyd7EXff4u4r3H3FokWx5s8SEcm8fD5PXy7HgbB9AOjN5WhtbZ3uYXOSWMJw9/XuvsTdlxE1YF/l7m8tKbYdOC/cPzuU8bD/3NCL6hjgeGBXUrFK/RsfH2fHjh1s2rSJHTt2MD6u/hOSbe3t7SwuFCi0tLDejEJLC0sKBdrb2xN7zarPVmtmXcCAu28HvgB8JTRqP0yUWHD3W83sm8BtwBhwgXpIyWzVonEwi6rd40bmpqmpiW29vfT09DA0NERXa2vifzOLvtDXhxUrVrimN5dSO3bsoLOjg53DwzQTXboXWlro2rqVNWvW1Dq8VChNqn25HIuVVBuCme129xVxymqkt9S9WjQOZk0tetxI9ihhyIyyXv9fi8bBrFFSlTiUMGRaE1UVnR0d7O/spLOjg3VtbZlKGrVoHMwaJVWJQwlDplUPVRUTjYNdW7eS6+qia+tW1c2XUFKVOOpqTW+pvOmqKrLUYNzU1MSaNWsyFXM11aLHjWSPEoZMK5/P05nL0VXUw6g3l6MrY1UV6jI6MyVVmYkShkyrvb2dLYUChf5+2kZG6M3lMldVoXEYkhVp/2KjhCHTqoeqiuJ2mGaga3iYQmiH0bdpSYssfLFRo7fMaKKqYsOGDaxZsyY1/7xxqcuoZEEWOpgoYUjdU5dRyYIsfLFRwpC6py6jkgVZ+GKjuaSkIUw0Jg4NDdGawXYYqX8TbRh7SjqYJN2GUc5cUkoYIiIpUYsvNkoYIiISi2arFRGRilPCEBGRWDRwT6QC0j5CV6QSlDBE5qgaI3SVkCQNlDBE5ijpqUeyMGWENIbE2jDMbL6Z7TKzG83sVjO7ZJIynzCzoXD7iZk9WnRsvOjY9qTiFJmrpEfoZmHKCGkMSTZ6PwWc7u4nAa3AmWZ2SnEBd3+vu7e6eyvwKeDbRYefmDjm7mclGKfInCQ9QjcLU0ZIY0gsYXhkOGw2h9t0gz46gK1JxSNSjnLWMU966pEsTBnRKLK+vv1cJTpwz8yagN3AccBn3P3CKcq9FNgJLHH38bBvDBgCxoBL3f2KmV5PA/ekEsbHx2lbt47+vXsZWb2aXF8fhcWL6d22bco2gyRH6NZqygh5ttK2pL5cjsV18HcoZ+Ae7p74DTgcuBo4YYrjFwKfKtn3kvDzWOBu4HemeOz5wAAwsHTpUheZq+7ubm9ZvtwZHY3+yUZHvSWf9+7u7prFNDY25t3d3b5p0ybv7u72sbGxmsXSqLq7u315S4uPgjv4KHi+paWm/xeVAAx4zM/yqgzcc/dHgWuAM6coci4l1VHufm/4+fPw2PwUz73F3Ve4+4pFixZVKmRJsaSrBQYHBxlZvRqaQ6tBczMjbW01bTPI+pok9UBtScn2klpkZoeH+88HzgDumKTcy4EFwH8X7VtgZgeH+wuB04DbkopVsmOiuqijs5PO/fvp6Oykbd26iiaNfD5Prq8PDoRWgwMHyPX2qs1gjrJe/6+2pGR7SR0FXG1mNwE/Bq509x1m1mVmxb2eOoCvh0ujCa8EBszsRqKqrEvdXQlD6OnpoX/vXoZ37sQ3b2Z450769+ypaBfT9vZ2CosX01IoYOvX01Io
UFiyROtnzEE1En3StK5KggP33P0mJqlGcveNJdsfmaTM9cBrkopNsmu66qJKrc/d1NRE77ZtzzRid3VpZPUcFSd6mpsZ7uqiv1DI1Lrq9bC+/Vxp8kHJlGpVF6nNoLLS2C40G43+f6GEIZmi6qJsUrtQfdACSpI5Wm41e54e27JnDyNtbeR6eyksWTLt2BapDq24JzJHmh228pTo00kJQ2QOZjPSWySrtESryBxUo+uuyGTSPlZFCUOkRL306JHKS/IDPQtjVZQwREqoR49MZmLywc6ODvZ3dtLZ0cG6traKfaBn4cpWCUOkhLruymSSXsgqC1e2ShgiJSZGem/t6qIrl2NrV5cavCXxyQezcGWrNb0zTt0/kzExojcr01ZI8vL5PJ25HF1h7faJyQe7KvSB3t7eTmHLFvoLhWeNVUnTla261WZYvS7oIpJG1VjIqhZjVTQOo0Hs2LGDzo4OdhZ94ym0tNC1dau+GYskoB4HH5aTMFQllWHT1akqYYhUXqNXVarRO8O0oEt6pH3AlUglKGFkmBZ0SYcsDLgSqQRVSWWYFnSJL8neZPWwOJBIHEoYGdfodapxlPYm68zl2FLB3i3VWAUwjdLWpbsa8aTtnKvO3evmdvLJJ7tIqe7ubl/e0uKj4A4+Cp5vafHu7u6KPX/L8uXO6Gj0jzg66i35fMWeP43GxsZ87apVvrylxS8y8+UtLb521SofGxurWTyr1q71luXL3S66yFuWL/dVa9dWNJ5qvEYtAAMe8zM2sTYMM5tvZrvM7EYzu9XMLpmkzJ+a2T4zGwq3dxQdO8/Mfhpu5yUVp9S/pEfoNuJUIklPkzGbeJKehykLcz0lLclG76eA0939JKAVONPMTpmk3DfcvTXcPg9gZi8EOoECsBLoNLMFCcYqdSzp3mSNOJVI0kl4NvEkPQ9TFuZ6SlpiCSNc7QyHzeZwiztKsA240t0fdvdHgCuBMxMIUxpANXqTTbQlbdiwgTVr1syYLLLeDTdtXbqrMQ9TFuZ6SlqiI73NrAnYDRwHfMbdLyw5/qfAZmAf8BPgve5+j5l9AJjv7h8N5S4GnnD3f5ju9RptpLfEl6YRuvUwpUs1pskoN56k1wyv13XJUzc1iJkdDmwD/srdbynafwQw7O5Pmdk7gXPc/XQz+yBwcEnC2O/u/zjJc58PnA+wdOnSk3/5y18mfj4ic1EvU7qkKQlXK560nXMlpC5hAJhZJzAy1VVCuBp52N0PM7MO4PXu/hfh2OeAa9x963SvoSsMqZVyultu2rSJ/Z2dbC567603I9fVxYYNG+b8/CLlSMVcUma2CDjg7o+a2fOBM4CPlZQ5yt3vC5tnAbeH+73A3xU1dK8G1icVq8hclDvOo9xpspMeR1ItaUt6aYsnE+L2vy33BpwIDAI3AbcAG8P+LuCscH8zcCtwI3A18Iqix78duCvc/izOa2ochtRCueM8JsYw5MMYhvwMYxiSHkdSDWkct5GmeGqJNIzDcPeb3D3v7ie6+wnu3hX2b3T37eH+end/tbuf5O5vcPc7ih7/b+5+XLh9Mak4Reaq3C6mE1O6dG3dSq6ri66tW6e9WkhbF9bZSOO4jTTFkxWafFBkjmbTxbScbrhp68I6G2lLemmLJyuUMETmKOlxHvUwK3Hakl7a4skKrbgnM1Lj4MyS7m6Z9e6caRy3kaZ4aimV3WqrQQmj8uphkFk9qIeknbakl7Z4akUJQyomjYPM6uHDsxxK2pKkio/DMLMXAacBLwGeIOomO+Duv511lJIJaVs3PK1jEpJeoGmiR08z0DU8TCH06MnSyHDJvmkbvc3sDWbWC3wXaAeOAl4FbABuNrNLzOwFyYcptZK2xsE0doecSGKdHR3s7+yks6ODdW1tFZtQUD160iPrk0bO1Uy9pP4X8L/d/Xfd/Xx33+DuH3D3s4CTiAbm/X7iUUrNpK2HTho/PJNOYmlL2o0q6S8GWTBtwnD3D7r7r6Y4fIS7X+Hu30og
LkmJcgeZJS2NH57VWKApTUm7UaXx6rbayhqHYWaHmdnbzez7wA0JxSQpU+5aD0lK44dnNRZoSlPSblRpvLqtthkbvcPEgWcBfwQsBw4F3gT8MNnQRJ5r4sNzojtkVwq6Q7a3t7OlUKBQ0qc/iQWa1MhdO/l8no2HHEJhZISbgdcA3zvkEDY1UNXgtN1qzeyrwOuAPuDrwFXAXe5+THXCK08jdqtttC6maVVun3793bJndHSU4488kkMeeYSzgO3A/gUL+On99zNv3rxahzdrlexWewLwCNG043e4+7iZ1c/AjYxLaxfTRlTOFcD4+DhvWr2aX11/PWc++SQfnj+fz516Klf09envlmJ9fX0sPHCAnUTVUR8FCqOj9PX1NcyV30yN3icB5wAvAL5vZv8FHGpmR1YjOJmeGuGyaceOHfzs2msZePJJPgYMPPkkP7v2Wnbs2FGx10hj9880xlSOSdsw9u9vqDaMGRu93f2OMCX5y4H3Al8GdpnZ9YlHJ9NSI1w2XXbZZawdH3/W323N+DiXX355RZ4/jd0/0xhTudLYQ6/aYvWSMrOFAO4+4O7vB16KVsCrOf0DZ9d34Vl/t/+coXw5387TeOWZxpjKlcYeetU200jvtWa2j2hU9x4zOxUgLNR0bVUilCnpHzib3vzmN/NgUxMFom9dBeDBpibOPvvsScuX++08jVeeaYypXOrezPRLtBItr/qKcL8AXBt3Kb9a3BpxidaxsTHv7u72TZs2eXd3d0MuMZk1Y2Njvub00/3Y+fP9f4IfO3++rzn99Iot0ZrGJV3TGJNEKGOJ1pl6SY15WDbV3fvN7NAkk5eUT/3zs6epqYkr+vpid8MtdwLIaowLKVcaY5LyzTQOYw/w8aJd7yvedvePP+dBzzx2PtHgvoOJuu9e7u6dJWXeB7wDGAP2AW9391+GY+PAzaHorzyav2pajTgOQ+rfbKaYT+NaD2mMSSq4HoaZdU55EHD3S6Z5rAE5dx82s2bgOuDd7r6zqMwbgH53329m7wJe7+5vCceG3b0lzklMUMKQeqTV4SRJFRu4N11CmEmoGxsOm83h5iVlri7a3Am8dbavJ+mhUcwzK+d3lMbpUKQxzXSF8Wrgd9x9e9j+BHBYOPxpd592AkIzawJ2A8cBn3H3C6cp+2ngfnf/aNgeA4aIqqsudfcrZjoZXWHEk+QHelpXh0tTEkvr70gaUzlXGDP1kuoGTi3avg34Q+BtwBVxW9aBw4GrgROmOP5WoiuMg4v2vST8PBa4myhxTfbY84EBYGDp0qVz6i3QCMbGxnzV2rXesny520UXecvy5b5q7dqK9a5KY2+YsbExX7tqlS9vafGLzHx5S4uvXbWqZj3K0vg7ksZFGb2kZhq4d5S7F4/ofszdv+XuXwEWxspIUVJ6FLgGOLP0mJmdAXwYOMvdnyp6zL3h58/DY/NTPPcWd1/h7isWLVoUN6SG1dPTQ//evQzv3Ilv3szwzp3079lTsQFUaexvn7ZBY2n8HUH2p+6Q5M2UMJ7VjdbdTynafNF0DzSzRWZ2eLj/fOAM4I6SMnngc0TJ4oGi/QvM7OBwfyHReuK3zRCrxDA4OMjI6tXQHD6umpsZaWur2IdVGkefp+0DOo2/o3qYukOSN1PCuNfMCqU7zewU4N4ZHnsUcLWZ3QT8GLjS3XeYWZeZTXSR/b9AC3CZmQ2Z2faw/5XAgJndSFSVdam7K2FUQD6fJ9fXBwfCx9WBA+R6eyv2YZXG0edp+4BO4+9oNldhSV+R6IonfWZq9F4JfAP4Es+ssHcycB7wFnfflXSA5VCj98zGx8dpW7eO/j17GGlrI9fbS2HJEnq3batow3ea+tunsVtq2n5HmzZtYn9nJ5uLPg/Wm5Hr6mLDhg3PKf/0/9HevYysXk2ur4/C4sUV+z9Sx4DqqVijd0gmLwK6gG+FWxfw4riNJNW8NeLUILPRiNOJNOI5l2M204+0LF/ujI5Gb77RUW/J5yvWcK+O
AdVDBacGwaO2hY2zzV6SPo04nUgjnnM5yp26Y7q2sEr8jsudDkWqY6bZarvDjLXNkxw7NrRHvD258ESkGsqdiTXptrC0tTtJZKY2jCOJ5o/6Q+Bhovme5gPLgJ8RDd77TvJhxqM2jGxK06A6iSfptrA0tjulUSXeOxWbS6rkSZcR9Xx6AviJu+8vK6oqUMLIHjVuZlfSDfdp6xiQNpV67ySSMLJACSMdyvnWM5uZWEWkcu+dchJGrCVaReKqh9XhRLKgFu8dJQypqHIHgKlxU2R2avHeKTthhGk7TkwiGMm+cr/1pHHUM2iUsaRfLd47M47DADCza4CzQvkhYJ+ZXevu70ssMsmkfD5PZy5HV1G9am8uR9cU33rSuNZDaWNiZy7HFjXES8rU4r0Tq9HbzAbdPW9m7wCOdvdOM7vJ3VN1paFG79qrh+6QjdoQr+7NjaliK+4VlzOzo4BziKYiF5lUGq8YytWIo4zTeFWlBJY+cRNGF9AL/Mjdf2xmxwI/TS4sybKsT8NRbrVaPSjurNAMdA0PUwidFWrxd0xjApOYjd7ufpm7n+ju7wrbP3f3P0w2NJHaSGtDfJLS1r05bYteSSRWwjCzl5nZD8zslrB9opk9d85jkTpQ7rxK9SBt3ZvTlsAkErdb7b8C64n+j3D3m4BzkwpKpNYmqtU2bNjAmjVr6jpZQPquqtKWwCQStw3jEHffZWbF+8YSiEdEaiBtnRXKnW5dqiNuwnjQzH4HcAAzOxu4L7GoRDKmHnr0pKmzQtoSmETijsM4FtgCnAo8AvwCeKu7351odGXSOAypBc24K1lW8ckHQ6+oM4BFwCvc/X/MlCzMbL6Z7TKzG83sVjO7ZJIyB5vZN8zsLjPrD1OoTxxbH/bfaWZtceIUmUqSU32oR480immrpMxs0qk/Jtoy3P3j0zz8KeB0dx8OK/ZdZ2Y97r6zqMyfA4+4+3Fmdi7wMeAtZvYqokb1VwMvAb5vZi9zd03oI2VLuk9/Iw70q4Z6qOarNzNdYRwabiuAdwGLw+2dwKume2BYX3w4bDaHW2n91xuBfw/3LwdWWZSN3gh83d2fcvdfAHcBK2OdkUiJpK8A1KOn8iZW9Ovo7KRz/346OjtpW7dOk0DW2LQJw90vcfdLgIXAcnd/v7u/HzgZWDLTk5tZk5kNAQ8AV7p7f0mRxcA94bXGgN8ARxTvD/aEfSJlS7pPf9q6pNaDnp4e+vfuZXjnTnzzZoZ37qR/zx5V89VY3HEYS4HRou1RonW9p+Xu4+7eSpRcVprZCSVFbLKHTbP/OczsfDMbMLOBffv2zRSSNKBJrwAOOaRiVwCNONAvaYODg4ysXg3NIc03NzPS1qaBezUWN2F8BdhlZh8xs06gH/hy3Bdx90eBa4AzSw7tAY4GMLODgMOAh4v3B0uAe6d47i3uvsLdVyxatChuSNJAVq9ezYPNzbwGuBB4DfDQvHmsXr26Yq/RaAP9kpbP58n19cGBkOYPHCDX26tqvhqL20vqb4G3E3WpfRT4M3f/u+keY2aLzOzwcP/5wBnAHSXFtgPnhftnA1d51M93O3Bu6EV1DHA8sCveKYk8W19fH0eMjnIpUYPcpcALR0fp6+urcWQylfb2dgqLF9NSKGDr19NSKFBYskTVfDUWd+Ae7r7bzO4B5gOY2VJ3/9U0DzkK+HczayJKTN909x1m1gUMuPt24AvAV8zsLqIri3PDa91qZt8EbiMaUX6BekjJbA0ODtK2fz9vAt4U9vXv369eTCnW1NRE77ZtTw/ca+3qUi+pFIg7cO8s4B+Jurg+QNSmcYe7vzrZ8MqjgXsymUZdEEkkjooP3AM2AacAP3H3Y4iql340y/hEqqq9vZ2XrFzJK+bP53XAK+bPZ/HKldNWb2hNb5HnilsldcDdHzKz55nZ89z9ajP7WKKRVYkGBzWOFuC1wPdmKKfFe0QmFzdhPGpmLcAPga+a2QPUwWy19fDBoIQ3s56eHu7dtYuBJ5+k
Gfjok09S2LVrytXk0rb6nEhaxK2SeiOwH3gv0Re0nwFrkwqqWrI+B9BEwuvs6GB/ZyedHR2sa2tT9UmJcgfuafGebFI1YvJmTBihl9N33P237j7m7v/u7v/s7g9VIb5EZf2DIesJr1rKnbpDU31kj748VceMCSN0Z91vZodVIZ6qyvoHQ9YTXrWUO3WHpvrIHn15qo64bRhPAjeb2ZXAyMROd//rRKKqkqyv6pXP5+nM5egq6i7am8vRlZGEVy3lLsajxXuyRzMGV0fchPHdcKsrWf9gyHrCq4U4444gXavPgTo3zERfnqoj1sC9rGjEgXsTHyRDQ0O0ZizhVUvWV8SbmOq7f+9eRlavJtfXR2HxYnq3bctE/NUw8TfeU/LlKSt/41oqZ+Ae7j7ljah31AVF2/3Az8Pt7OkeW4vbySef7CKluru7fXlLi4+CO/goeL6lxbu7u2sdWizd3d3esny5Mzoa/aOPjnpLPp+Z+KtlbGzMu7u7fdOmTd7d3e1jY2O1DikTiKZqivUZO1Oj998QTQQ44WDgd4HXEy2oJJJ6We8coKm+49GMwcmbKWHMc/fihYyuc/eHPJp0MJdgXCIVk/XecJrqW9JipoSxoHjD3f+yaFOLT0gmZL2b7Gym+h4dHWXjxo2sWrWKjRs3Mjo6OmVZkbimbfQ2s68C17j7v5bs/wvg9e7ekXB8ZWnERm+JJ+udA8qJf3R0lOOPPJJDHnmEs4jqlPcvWMBP77+fefPmVTVuSb9yGr1nShgvAq4AngJuCLtPJmrLeJO7/3qOsVaUEoYIbNy4kcs2beImeLqL6WuAcy6+mK6urtoGJ6lTTsKYdhyGuz8AnGpmpwMTa198192vmmOMIlXVSOMYfvSjH3EWPKuR/43A9ddfX7ugpC7EGrgXEoSShGRSPcxKXI7TTjuNy666io/yzBXGd4BzTj21toFJ5sWdrVYksxptnqENGzawf8ECXgNcSFQd9cSCBWzYsKHGkUnWKWFI3cv6OIxyzZs3j5/efz/nXHwxu1et4pyLL1aDt1RE3LmkRDKrEecZmjdvnhq4peISu8Iws6PN7Gozu93MbjWzd09S5oNmNhRut5jZuJm9MBy728xuDsfU9UlmLevjMETSIrHJB83sKOAod7/BzA4FdhN1xb1tivJrgfe6++lh+25ghbs/GPc11a1WppL1cRgiSalYt9q5cPf7gPvC/cfN7HZgMTBpwgA6gK1JxSONLW3TlYtkUVUavc1sGZAnmu12suOHAGcC3yra7UCfme02s/Onee7zzWzAzAb27dtXuaBFRORZEk8YZtZClAje4+6PTVFsLfAjd3+4aN9p7r4caAcuMLPXTfZAd9/i7ivcfcWiRZreSkQkKYkmDDNrJkoWX3X3b09T9FxKqqPc/d7w8wFgG7AyqThFRGRmSfaSMuALwO3u/vFpyh0G/B7RYNSJfbnQUI6Z5YDVwC1JxSoiIjNLchzGacDbgJvNbGKE1IeApQDu/tmwbx3Q5+4jRY99MbAtyjkcBHzN3b+XYKwiIjKDJHtJXQdYjHJfAr5Usu/nwEmJBCYiIrOiqUFERCQWJQwREYlFCUNERGJRwhARkViUMEREJBYlDBERiUUJQ0REYlHCEBGRWJQwREQkFiUMERGJRQlDRERiUcIQEZFYlDBERCQWJQwREYlFCUNERGJRwhARkViUMEREJBYlDBERiUUJQ0REYkksYZjZ0WZ2tZndbma3mtm7JynzejP7jZkNhdvGomNnmtmdZnaXmV2UVJwiIhLPQQk+9xjwfne/wcwOBXab2ZXufltJuf9y9zXFO8ysCfgM8PvAHuDHZrZ9kseKiEjAh4A7AAALbUlEQVSVJJYw3P0+4L5w/3Ezux1YDMT50F8J3OXuPwcws68Db4z5WKmx8fFxenp6GBwcJJ/P097eTlNTU63DEpE5SvIK42lmtgzIA/2THH6tmd0I3At8wN1vJUos9xSV2QMUpnju84HzAZYuXVq5
oGVWxsfHWdfWxt7+flaPjNCZy7GlUGBbb6+ShkjGJd7obWYtwLeA97j7YyWHbwBe6u4nAZ8Crph42CRP5ZM9v7tvcfcV7r5i0aJFlQpbZqmnp4e9/f3sHB5mszs7h4fZ099PT09PrUMTkTlKNGGYWTNRsviqu3+79Li7P+buw+H+fwLNZraQ6Iri6KKiS4iuQCTlBgcHWT0yQnPYbgbaRkYYGhqqZVgiUgFJ9pIy4AvA7e7+8SnKHBnKYWYrQzwPAT8GjjezY8xsHnAusD2pWKVy8vk8fbkcB8L2AaA3l6O1tbWWYYlIBSTZhnEa8DbgZjOb+Hr5IWApgLt/FjgbeJeZjQFPAOe6uwNjZvaXQC/QBPxbaNuQlGtvb2dLoUChv5+2kRF6czmWFAq0t7fXOjQRmSOLPp/rw4oVK3xgYKDWYTS8iV5SQ0NDtLa2qpeUSIqZ2W53XxGrrBKGiEjjKidhaGoQERGJRQlDRERiUcIQEZFYlDBERCQWJQwREYlFCUNERGJRwhARkViUMEREJBYlDBERiUUJQ0REYlHCEBGRWJQwREQkFiUMERGJRQlDRERiUcIQEZFYlDBERCQWJQwREYklyTW9pU5MLLk6ODhIPp/XkqsiDSqxhGFmRwNfBo4Efgtscfd/Kinzx8CFYXMYeJe73xiO3Q08DowDY3GXEJTKGh8fZ11bG3v7+1k9MkJnLseWQoFtvb1KGiINJskqqTHg/e7+SuAU4AIze1VJmV8Av+fuJwKbgC0lx9/g7q1KFrXT09PD3v5+dg4Ps9mdncPD7Onvp6enp9ahiUiVJZYw3P0+d78h3H8cuB1YXFLmend/JGzuBJYkFY/MzuDgIKtHRmgO281A28gIQ0NDtQxLRGqgKo3eZrYMyAP90xT7c6D4a6sDfWa228zOn+a5zzezATMb2LdvXyXClSL5fJ6+XI4DYfsA0JvL0draWsuwRKQGEk8YZtYCfAt4j7s/NkWZNxAljAuLdp/m7suBdqLqrNdN9lh33+LuK9x9xaJFiyocvbS3t7O4UKDQ0sJ6MwotLSwpFGhvb691aCJSZYn2kjKzZqJk8VV3//YUZU4EPg+0u/tDE/vd/d7w8wEz2wasBH6YZLzyXE1NTWzr7aWnp4ehoSG6WlvVS0qkQSXZS8qALwC3u/vHpyizFPg28DZ3/0nR/hzwPHd/PNxfDXQlFatMr6mpiTVr1rBmzZpahyIiNZTkFcZpwNuAm81sooX0Q8BSAHf/LLAROAL4f1F+ebr77IuBbWHfQcDX3P17CcYqIiIzSCxhuPt1gM1Q5h3AOybZ/3PgpIRCExGRWdDUICIiEosShoiIxKKEISIisShhiIhILObutY6hYsxsH/DLWT58IfBgBcPJAp1z/Wu08wWdc7le6u6xRj3XVcKYCzMbaLRJDnXO9a/Rzhd0zklSlZSIiMSihCEiIrEoYTyjdC2ORqBzrn+Ndr6gc06M2jBERCQWXWGIiEgsShgiIhJLQyQMM5tvZrvM7EYzu9XMLgn7jzGzfjP7qZl9w8zmhf0Hh+27wvFltYx/LsysycwGzWxH2K7rczazu83sZjMbMrOBsO+FZnZlOOcrzWxB2G9m9s/hnG8ys+W1jX52zOxwM7vczO4ws9vN7LX1fM5m9vLw9524PWZm76nzc35v+Oy6xcy2hs+0qr+XGyJhAE8Bp7v7SUArcKaZnQJ8DPiEux8PPEK06h/h5yPufhzwiVAuq95NtJ76hEY45ze4e2tRv/SLgB+Ec/5B2IZoNcfjw+184F+qHmll/BPwPXd/BdEsz7dTx+fs7neGv28rcDKwH9hGnZ6zmS0G/hpY4e4nAE3AudTivezuDXUDDgFuAApEIyMPCvtfC/SG+73Aa8P9g0I5q3XsszjXJURvnNOBHUTTzdf7Od8NLCzZdydwVLh/FHBnuP85oGOyclm5AS8AflH6t6rncy45z9XAj+r5nIHFwD3AC8N7cwfQVov3cqNcYUxUzQwB
DwBXAj8DHnX3sVBkD9EfBp75AxGO/4Zooaes+STwN8Bvw/YR1P85O9BnZrvN7Pyw78Xufh9A+PmisP/pcw6Kfx9ZcSywD/hiqHr8fFilsp7Pudi5wNZwvy7P2d33Av8A/Aq4j+i9uZsavJcbJmG4+7hHl7BLiNYHf+VkxcLPyRZ+ylT/YzNbAzzg7ruLd09StG7OOTjN3ZcTVUNcYGavm6ZsPZzzQcBy4F/cPQ+M8ExVzGTq4ZwBCHX2ZwGXzVR0kn2ZOefQFvNG4BjgJUCO6P+7VOLv5YZJGBPc/VHgGuAU4HAzm1h1cAlwb7i/BzgaIBw/DHi4upHO2WnAWWZ2N/B1omqpT1Lf54y73xt+PkBUr70S+LWZHQUQfj4Qij99zkHx7yMr9gB73L0/bF9OlEDq+ZwntAM3uPuvw3a9nvMZwC/cfZ+7HwC+DZxKDd7LDZEwzGyRmR0e7j+f6A9wO3A1cHYodh7wnXB/e9gmHL/KQ4VgVrj7endf4u7LiC7br3L3P6aOz9nMcmZ26MR9ovrtW3j2uZWe85+EXjSnAL+ZqNLICne/H7jHzF4edq0CbqOOz7lIB89UR0H9nvOvgFPM7BAzM575G1f/vVzrBp0qNRqdCAwCNxF9gGwM+48FdgF3EV3WHhz2zw/bd4Xjx9b6HOZ4/q8HdtT7OYdzuzHcbgU+HPYfQdT4/9Pw84VhvwGfIWrPupmoF0rNz2MW590KDIT/7yuABQ1wzocADwGHFe2r23MGLgHuCJ9fXwEOrsV7WVODiIhILA1RJSUiInOnhCEiIrEoYYiISCxKGCIiEosShoiIxKKEIQ3LzI4omvH0fjPbW7Q9r4znebuZHTnN8U+b2anhfrOZXRpmEh0K03lcFI4dZGbjYf8tZvYdM3tBOHacmT1RMkvrH4djPzCzw+b22xCZmRKGNCx3f8ifmfX0s0Qzf7aG22gZT/V2YNKEYWaLgLy7Xx92bQYWAa8Or/s6oj71Ex4Pr38CMAy8q+jYnUXxtbr7V8P+rwHvLCNekVk5aOYiIo3HzM4DLgDmAdcDf0n0BeuLRAPljGgd5V+H7W+Y2RPAypJk82agJzznoUQjcJe5+1MA7v440aCsyfw38LIY4X6HaKBalqeklwzQFYZICTM7AVgHnBquAg4iml7lZKKp018TrgC+7O7fAIaAt0xxZXIa0cyiEK3HcLe7j8SIoYlo/q/tRbtLFw46FcDdHwQOnZj+RiQpusIQea4zgN8FBqKpe3g+0XTRvUQf2v8E/CfQF+O5jiKafvw5zOwdRFcuC8Pr7SP64B8ClgH9RPMFTbgzJLDJ7Auv9WiMmERmRVcYIs9lwL8VtRW83N03uftDRPOSXUe0AtrnYjzXE0Rz+0A0x9ExYWJE3P3zIQEME62iBqENgyhhHAr8RcyY54fXEkmMEobIc30fOMfMFsLTvamWhgZsc/fLgE6iacQBHif6cJ/M7cBx8HR7xZeBfzazg8NzHwQ0lz7Io2n43w18MFRPTcnMnkd0lXLPdOVE5koJQ6SEu99M1BD9fTO7iajq6cVEawz8MFQZ/SvwofCQLwKfn6I77neJZguecBHRLKu3mdkgcC3weaLG89I4fkw0Q+k5YVdpG8YFYf9K4Dp3H5/LeYvMRLPViiQorF9wHdDu7o8l9BqfAb7p7tcm8fwiE3SFIZIgj76RfQBYmuDLDCpZSDXoCkNERGLRFYaIiMSihCEiIrEoYYiISCxKGCIiEosShoiIxPL/AVcBjfMh4u+/AAAAAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Separating the ranks\n", + "data_rank1 = data[data[\"rank\"]==1]\n", + 
"data_rank2 = data[data[\"rank\"]==2]\n", + "data_rank3 = data[data[\"rank\"]==3]\n", + "data_rank4 = data[data[\"rank\"]==4]\n", + "\n", + "# Plotting the graphs\n", + "plot_points(data_rank1)\n", + "plt.title(\"Rank 1\")\n", + "plt.show()\n", + "plot_points(data_rank2)\n", + "plt.title(\"Rank 2\")\n", + "plt.show()\n", + "plot_points(data_rank3)\n", + "plt.title(\"Rank 3\")\n", + "plt.show()\n", + "plot_points(data_rank4)\n", + "plt.title(\"Rank 4\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This looks more promising, as it seems that the lower the rank, the higher the acceptance rate. Let's use the rank as one of our inputs. In order to do this, we should one-hot encode it.\n", + "\n", + "## TODO: One-hot encoding the rank\n", + "Use the `get_dummies` function in Pandas in order to one-hot encode the data." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
admitgregparank_1rank_2rank_3rank_4
003803.610010
116603.670010
218004.001000
316403.190001
405202.930001
517603.000100
615602.981000
704003.080100
815403.390010
907003.920100
\n", + "
" + ], + "text/plain": [ + " admit gre gpa rank_1 rank_2 rank_3 rank_4\n", + "0 0 380 3.61 0 0 1 0\n", + "1 1 660 3.67 0 0 1 0\n", + "2 1 800 4.00 1 0 0 0\n", + "3 1 640 3.19 0 0 0 1\n", + "4 0 520 2.93 0 0 0 1\n", + "5 1 760 3.00 0 1 0 0\n", + "6 1 560 2.98 1 0 0 0\n", + "7 0 400 3.08 0 1 0 0\n", + "8 1 540 3.39 0 0 1 0\n", + "9 0 700 3.92 0 1 0 0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# TODO: Make dummy variables for rank\n", + "one_hot_data = pd.concat([data, pd.get_dummies(data['rank'], prefix='rank')], axis=1)\n", + "\n", + "# TODO: Drop the previous rank column\n", + "one_hot_data = one_hot_data.drop('rank', axis=1)\n", + "\n", + "# Print the first 10 rows of our data\n", + "one_hot_data[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TODO: Scaling the data\n", + "The next step is to scale the data. We notice that the range for grades is 1.0-4.0, whereas the range for test scores is roughly 200-800, which is much larger. This means our data is skewed, and that makes it hard for a neural network to handle. Let's fit our two features into a range of 0-1, by dividing the grades by 4.0, and the test score by 800." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
admitgregparank_1rank_2rank_3rank_4
000.4750.90250010
110.8250.91750010
211.0001.00001000
310.8000.79750001
400.6500.73250001
510.9500.75000100
610.7000.74501000
700.5000.77000100
810.6750.84750010
900.8750.98000100
\n", + "
" + ], + "text/plain": [ + " admit gre gpa rank_1 rank_2 rank_3 rank_4\n", + "0 0 0.475 0.9025 0 0 1 0\n", + "1 1 0.825 0.9175 0 0 1 0\n", + "2 1 1.000 1.0000 1 0 0 0\n", + "3 1 0.800 0.7975 0 0 0 1\n", + "4 0 0.650 0.7325 0 0 0 1\n", + "5 1 0.950 0.7500 0 1 0 0\n", + "6 1 0.700 0.7450 1 0 0 0\n", + "7 0 0.500 0.7700 0 1 0 0\n", + "8 1 0.675 0.8475 0 0 1 0\n", + "9 0 0.875 0.9800 0 1 0 0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Making a copy of our data\n", + "processed_data = one_hot_data[:]\n", + "\n", + "# TODO: Scale the columns\n", + "processed_data['gre'] = processed_data['gre']/800\n", + "processed_data['gpa'] = processed_data['gpa']/4.0\n", + "processed_data[:10]\n", + "\n", + "# Printing the first 10 rows of our procesed data\n", + "processed_data[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Splitting the data into Training and Testing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to test our algorithm, we'll split the data into a Training and a Testing set. The size of the testing set will be 10% of the total data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of training samples is 360\n", + "Number of testing samples is 40\n", + " admit gre gpa rank_1 rank_2 rank_3 rank_4\n", + "16 0 0.975 0.9675 0 0 0 1\n", + "97 0 0.600 0.8925 0 1 0 0\n", + "361 1 0.675 0.8725 1 0 0 0\n", + "315 1 0.375 0.7100 0 1 0 0\n", + "215 1 0.825 0.7275 0 0 1 0\n", + "109 0 0.600 0.8625 0 1 0 0\n", + "199 0 0.725 0.9425 0 0 0 1\n", + "197 1 0.500 0.8075 0 0 0 1\n", + "354 1 0.675 0.9450 0 1 0 0\n", + "84 1 0.625 0.9000 0 0 1 0\n", + " admit gre gpa rank_1 rank_2 rank_3 rank_4\n", + "10 0 1.000 1.0000 0 0 0 1\n", + "11 0 0.550 0.8050 1 0 0 0\n", + "13 0 0.875 0.7700 0 1 0 0\n", + "28 1 0.975 0.8050 0 1 0 0\n", + "34 0 0.450 0.7850 1 0 0 0\n", + "50 0 0.800 0.9650 0 0 1 0\n", + "54 0 0.825 0.8350 0 0 1 0\n", + "63 1 0.850 0.9625 0 0 1 0\n", + "65 0 0.750 0.8975 0 1 0 0\n", + "66 0 0.925 0.9050 0 0 0 1\n" + ] + } + ], + "source": [ + "sample = np.random.choice(processed_data.index, size=int(len(processed_data)*0.9), replace=False)\n", + "train_data, test_data = processed_data.iloc[sample], processed_data.drop(sample)\n", + "\n", + "print(\"Number of training samples is\", len(train_data))\n", + "print(\"Number of testing samples is\", len(test_data))\n", + "print(train_data[:10])\n", + "print(test_data[:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Splitting the data into features and targets (labels)\n", + "Now, as a final step before the training, we'll split the data into features (X) and targets (y)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " gre gpa rank_1 rank_2 rank_3 rank_4\n", + "16 0.975 0.9675 0 0 0 1\n", + "97 0.600 0.8925 0 1 0 0\n", + "361 0.675 0.8725 1 0 0 0\n", + "315 0.375 0.7100 0 1 0 0\n", + "215 0.825 0.7275 0 0 1 0\n", + "109 0.600 0.8625 0 1 0 0\n", + "199 0.725 0.9425 0 0 0 1\n", + "197 0.500 0.8075 0 0 0 1\n", + "354 0.675 0.9450 0 1 0 0\n", + "84 0.625 0.9000 0 0 1 0\n", + "16 0\n", + "97 0\n", + "361 1\n", + "315 1\n", + "215 1\n", + "109 0\n", + "199 0\n", + "197 1\n", + "354 1\n", + "84 1\n", + "Name: admit, dtype: int64\n" + ] + } + ], + "source": [ + "features = train_data.drop('admit', axis=1)\n", + "targets = train_data['admit']\n", + "features_test = test_data.drop('admit', axis=1)\n", + "targets_test = test_data['admit']\n", + "\n", + "print(features[:10])\n", + "print(targets[:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training the 2-layer Neural Network\n", + "The following function trains the 2-layer neural network. First, we'll write some helper functions." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Activation (sigmoid) function\n", + "def sigmoid(x):\n", + " return 1 / (1 + np.exp(-x))\n", + "def sigmoid_prime(x):\n", + " return sigmoid(x) * (1-sigmoid(x))\n", + "def error_formula(y, output):\n", + " return - y*np.log(output) - (1 - y) * np.log(1-output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TODO: Backpropagate the error\n", + "Now it's your turn to shine. Write the error term. 
Remember that this is given by the equation $$ (y-\\hat{y}) \\sigma'(x) $$" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Write the error term formula\n", + "def error_term_formula(x, y, output):\n", + " return (y - output) * sigmoid_prime(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch: 0\n", + "Train loss: 0.274675763534\n", + "=========\n", + "Epoch: 100\n", + "Train loss: 0.211541227151\n", + "=========\n", + "Epoch: 200\n", + "Train loss: 0.209108941338\n", + "=========\n", + "Epoch: 300\n", + "Train loss: 0.207892062032\n", + "=========\n", + "Epoch: 400\n", + "Train loss: 0.207253836282\n", + "=========\n", + "Epoch: 500\n", + "Train loss: 0.206889105737\n", + "=========\n", + "Epoch: 600\n", + "Train loss: 0.206656034976\n", + "=========\n", + "Epoch: 700\n", + "Train loss: 0.206488123278\n", + "=========\n", + "Epoch: 800\n", + "Train loss: 0.206353568323\n", + "=========\n", + "Epoch: 900\n", + "Train loss: 0.206236831468\n", + "=========\n", + "Finished training!\n" + ] + } + ], + "source": [ + "# Neural Network hyperparameters\n", + "epochs = 1000\n", + "learnrate = 0.5\n", + "\n", + "# Training function\n", + "def train_nn(features, targets, epochs, learnrate):\n", + " \n", + " # Use to same seed to make debugging easier\n", + " np.random.seed(42)\n", + "\n", + " n_records, n_features = features.shape\n", + " last_loss = None\n", + "\n", + " # Initialize weights\n", + " weights = np.random.normal(scale=1 / n_features**.5, size=n_features)\n", + "\n", + " for e in range(epochs):\n", + " del_w = np.zeros(weights.shape)\n", + " for x, y in zip(features.values, targets):\n", + " # Loop through all records, x is the input, y is the target\n", + "\n", + " # Activation of the output unit\n", + " # Notice we multiply the inputs and the weights here \n", + " # rather 
than storing h as a separate variable \n", + " output = sigmoid(np.dot(x, weights))\n", + "\n", + " # The error, the target minus the network output\n", + " error = error_formula(y, output)\n", + "\n", + " # The error term\n", + " error_term = error_term_formula(x, y, output)\n", + "\n", + " # The gradient descent step, the error times the gradient times the inputs\n", + " del_w += error_term * x\n", + "\n", + " # Update the weights here. The learning rate times the \n", + " # change in weights, divided by the number of records to average\n", + " weights += learnrate * del_w / n_records\n", + "\n", + " # Printing out the mean square error on the training set\n", + " if e % (epochs / 10) == 0:\n", + " out = sigmoid(np.dot(features, weights))\n", + " loss = np.mean((out - targets) ** 2)\n", + " print(\"Epoch:\", e)\n", + " if last_loss and last_loss < loss:\n", + " print(\"Train loss: \", loss, \" WARNING - Loss Increasing\")\n", + " else:\n", + " print(\"Train loss: \", loss)\n", + " last_loss = loss\n", + " print(\"=========\")\n", + " print(\"Finished training!\")\n", + " return weights\n", + " \n", + "weights = train_nn(features, targets, epochs, learnrate)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Calculating the Accuracy on the Test Data" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction accuracy: 0.700\n" + ] + } + ], + "source": [ + "# Calculate accuracy on test data\n", + "test_out = sigmoid(np.dot(features_test, weights))\n", + "predictions = test_out > 0.5\n", + "accuracy = np.mean(predictions == targets_test)\n", + "print(\"Prediction accuracy: {:.3f}\".format(accuracy))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", 
+ "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/Deep Learning/Introduction to Neural Networks/Student Admissions(Neural Network)/StudentAdmissionsSolutions.ipynb b/python/Deep Learning/Introduction to Neural Networks/Student Admissions(Neural Network)/StudentAdmissionsSolutions.ipynb new file mode 100644 index 0000000..3cfbfbf --- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Student Admissions(Neural Network)/StudentAdmissionsSolutions.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Solutions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### One-hot encoding the rank" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Make dummy variables for rank\n", + "one_hot_data = pd.concat([data, pd.get_dummies(data['rank'], prefix='rank')], axis=1)\n", + "\n", + "# Drop the previous rank column\n", + "one_hot_data = one_hot_data.drop('rank', axis=1)\n", + "\n", + "# Print the first 10 rows of our data\n", + "one_hot_data[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scaling the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Copying our data\n", + "processed_data = one_hot_data[:]\n", + "\n", + "# Scaling the columns\n", + "processed_data['gre'] = processed_data['gre']/800\n", + "processed_data['gpa'] = processed_data['gpa']/4.0\n", + "processed_data[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Backpropagating the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def error_term_formula(x, y, output):\n", + " return (y - output)*sigmoid_prime(x)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## alternative solution ##\n", + "# you could also *only* use y and the output \n", + "# and calculate sigmoid_prime directly from the activated output!\n", + "\n", + "# below is an equally valid solution (it doesn't utilize x)\n", + "def error_term_formula(x, y, output):\n", + " return (y-output) * output * (1 - output)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/Deep Learning/Introduction to Neural Networks/Student Admissions(Neural Network)/student_admissions.py b/python/Deep Learning/Introduction to Neural Networks/Student Admissions(Neural Network)/student_admissions.py new file mode 100644 index 0000000..dd50305 --- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Student Admissions(Neural Network)/student_admissions.py @@ -0,0 +1,181 @@ +# Importing pandas and numpy +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + +# Reading the csv file into a pandas DataFrame +data = pd.read_csv('student_data.csv') + +# Printing out the first 10 rows of our data +print(data[:10]) + + +# Importing matplotlib + +# Function to help us plot +def plot_points(data): + X = np.array(data[["gre", "gpa"]]) + y = np.array(data["admit"]) + admitted = X[np.argwhere(y == 1)] + rejected = X[np.argwhere(y == 0)] + plt.scatter([s[0][0] for s in rejected], [s[0][1] + for s in rejected], + s=25, color='red', edgecolor='k') + plt.scatter([s[0][0] for s in admitted], [s[0][1] + for s in admitted], + s=25, color='cyan', edgecolor='k') + plt.xlabel('Test 
(GRE)') + plt.ylabel('Grades (GPA)') + + +# Plotting the points +plot_points(data) +plt.show() + + +# Separating the ranks +data_rank1 = data[data["rank"] == 1] +data_rank2 = data[data["rank"] == 2] +data_rank3 = data[data["rank"] == 3] +data_rank4 = data[data["rank"] == 4] + +# Plotting the graphs +plot_points(data_rank1) +plt.title("Rank 1") +plt.show() +plot_points(data_rank2) +plt.title("Rank 2") +plt.show() +plot_points(data_rank3) +plt.title("Rank 3") +plt.show() +plot_points(data_rank4) +plt.title("Rank 4") +plt.show() + + +# One-hot encode the rank column into rank_<value> indicator columns +one_hot_data = pd.concat([data, pd.get_dummies(data['rank'], prefix='rank')], + axis=1) + +# Drop the original rank column now that it is one-hot encoded +one_hot_data = one_hot_data.drop('rank', axis=1) + +# Print the first 10 rows of our data +print(one_hot_data[:10]) + + +# Making a real copy of our data, so the scaling below never touches one_hot_data +processed_data = one_hot_data.copy() + +# Scale the columns: GRE is divided by 800 and GPA by 4.0 +processed_data['gre'] = processed_data['gre'] / 800 +processed_data['gpa'] = processed_data['gpa'] / 4.0 + + +# Printing the first 10 rows of our processed data +print(processed_data[:10]) + +# `sample` holds index labels, so select with .loc to match the label-based .drop +sample = np.random.choice(processed_data.index, size=int( + len(processed_data) * 0.9), replace=False) +train_data, test_data = processed_data.loc[sample], processed_data.drop( + sample) + +print("Number of training samples is", len(train_data)) +print("Number of testing samples is", len(test_data)) +print(train_data[:10]) +print(test_data[:10]) + + +features = train_data.drop('admit', axis=1) +targets = train_data['admit'] +features_test = test_data.drop('admit', axis=1) +targets_test = test_data['admit'] + +print(features[:10]) +print(targets[:10]) + + +# Activation (sigmoid) function +def sigmoid(x): + return 1 / (1 + np.exp(-x)) + + +def sigmoid_prime(x):  # derivative of the sigmoid, evaluated at x + return sigmoid(x) * (1 - sigmoid(x)) + + +def error_formula(y, output):  # cross-entropy error for target y and prediction output + return - y * np.log(output) - (1 - y) * np.log(1 - output) + + +# Error term for the gradient descent update +def error_term_formula(x, y, 
output): + return (y - output) * output * (1 - output)  # sigmoid'(h) == output * (1 - output), h = dot(x, weights) + + +# Neural Network hyperparameters +epochs = 1000 +learnrate = 0.5 + +# Training function + + +def train_nn(features, targets, epochs, learnrate): + """Fit one weight per feature column by batch gradient descent and return the weights.""" + # Use the same seed to make debugging easier + np.random.seed(42) + + n_records, n_features = features.shape + last_loss = None + + # Initialize weights + weights = np.random.normal(scale=1 / n_features**.5, size=n_features) + + for e in range(epochs): + del_w = np.zeros(weights.shape) + for x, y in zip(features.values, targets): + # Loop through all records, x is the input, y is the target + + # Activation of the output unit + # Notice we multiply the inputs and the weights here + # rather than storing h as a separate variable + output = sigmoid(np.dot(x, weights)) + + # Cross-entropy error for this record (informational; the update below does not use it) + error = error_formula(y, output) + + # The error term: (y - output) scaled by the sigmoid gradient at this unit's input + error_term = error_term_formula(x, y, output) + + # The gradient descent step, the error times the gradient times the inputs + del_w += error_term * x + + # Update the weights here. 
The learning rate times the + # change in weights, divided by the number of records to average + weights += learnrate * del_w / n_records + + # Printing out the mean square error on the training set + if e % (epochs / 10) == 0: + out = sigmoid(np.dot(features, weights)) + loss = np.mean((out - targets) ** 2) + print("Epoch:", e) + if last_loss and last_loss < loss: + print("Train loss: ", loss, " WARNING - Loss Increasing") + else: + print("Train loss: ", loss) + last_loss = loss + print("=========") + print("Finished training!") + return weights + + +weights = train_nn(features, targets, epochs, learnrate) + + +# Calculate accuracy on test data +test_out = sigmoid(np.dot(features_test, weights)) +predictions = test_out > 0.5 +accuracy = np.mean(predictions == targets_test) +print("Prediction accuracy: {:.3f}".format(accuracy)) diff --git a/python/Deep Learning/Introduction to Neural Networks/Student Admissions(Neural Network)/student_data.csv b/python/Deep Learning/Introduction to Neural Networks/Student Admissions(Neural Network)/student_data.csv new file mode 100644 index 0000000..5f2cf4e --- /dev/null +++ b/python/Deep Learning/Introduction to Neural Networks/Student Admissions(Neural Network)/student_data.csv @@ -0,0 +1,401 @@ +admit,gre,gpa,rank +0,380,3.61,3 +1,660,3.67,3 +1,800,4,1 +1,640,3.19,4 +0,520,2.93,4 +1,760,3,2 +1,560,2.98,1 +0,400,3.08,2 +1,540,3.39,3 +0,700,3.92,2 +0,800,4,4 +0,440,3.22,1 +1,760,4,1 +0,700,3.08,2 +1,700,4,1 +0,480,3.44,3 +0,780,3.87,4 +0,360,2.56,3 +0,800,3.75,2 +1,540,3.81,1 +0,500,3.17,3 +1,660,3.63,2 +0,600,2.82,4 +0,680,3.19,4 +1,760,3.35,2 +1,800,3.66,1 +1,620,3.61,1 +1,520,3.74,4 +1,780,3.22,2 +0,520,3.29,1 +0,540,3.78,4 +0,760,3.35,3 +0,600,3.4,3 +1,800,4,3 +0,360,3.14,1 +0,400,3.05,2 +0,580,3.25,1 +0,520,2.9,3 +1,500,3.13,2 +1,520,2.68,3 +0,560,2.42,2 +1,580,3.32,2 +1,600,3.15,2 +0,500,3.31,3 +0,700,2.94,2 +1,460,3.45,3 +1,580,3.46,2 +0,500,2.97,4 +0,440,2.48,4 +0,400,3.35,3 +0,640,3.86,3 +0,440,3.13,4 +0,740,3.37,4 
+1,680,3.27,2 +0,660,3.34,3 +1,740,4,3 +0,560,3.19,3 +0,380,2.94,3 +0,400,3.65,2 +0,600,2.82,4 +1,620,3.18,2 +0,560,3.32,4 +0,640,3.67,3 +1,680,3.85,3 +0,580,4,3 +0,600,3.59,2 +0,740,3.62,4 +0,620,3.3,1 +0,580,3.69,1 +0,800,3.73,1 +0,640,4,3 +0,300,2.92,4 +0,480,3.39,4 +0,580,4,2 +0,720,3.45,4 +0,720,4,3 +0,560,3.36,3 +1,800,4,3 +0,540,3.12,1 +1,620,4,1 +0,700,2.9,4 +0,620,3.07,2 +0,500,2.71,2 +0,380,2.91,4 +1,500,3.6,3 +0,520,2.98,2 +0,600,3.32,2 +0,600,3.48,2 +0,700,3.28,1 +1,660,4,2 +0,700,3.83,2 +1,720,3.64,1 +0,800,3.9,2 +0,580,2.93,2 +1,660,3.44,2 +0,660,3.33,2 +0,640,3.52,4 +0,480,3.57,2 +0,700,2.88,2 +0,400,3.31,3 +0,340,3.15,3 +0,580,3.57,3 +0,380,3.33,4 +0,540,3.94,3 +1,660,3.95,2 +1,740,2.97,2 +1,700,3.56,1 +0,480,3.13,2 +0,400,2.93,3 +0,480,3.45,2 +0,680,3.08,4 +0,420,3.41,4 +0,360,3,3 +0,600,3.22,1 +0,720,3.84,3 +0,620,3.99,3 +1,440,3.45,2 +0,700,3.72,2 +1,800,3.7,1 +0,340,2.92,3 +1,520,3.74,2 +1,480,2.67,2 +0,520,2.85,3 +0,500,2.98,3 +0,720,3.88,3 +0,540,3.38,4 +1,600,3.54,1 +0,740,3.74,4 +0,540,3.19,2 +0,460,3.15,4 +1,620,3.17,2 +0,640,2.79,2 +0,580,3.4,2 +0,500,3.08,3 +0,560,2.95,2 +0,500,3.57,3 +0,560,3.33,4 +0,700,4,3 +0,620,3.4,2 +1,600,3.58,1 +0,640,3.93,2 +1,700,3.52,4 +0,620,3.94,4 +0,580,3.4,3 +0,580,3.4,4 +0,380,3.43,3 +0,480,3.4,2 +0,560,2.71,3 +1,480,2.91,1 +0,740,3.31,1 +1,800,3.74,1 +0,400,3.38,2 +1,640,3.94,2 +0,580,3.46,3 +0,620,3.69,3 +1,580,2.86,4 +0,560,2.52,2 +1,480,3.58,1 +0,660,3.49,2 +0,700,3.82,3 +0,600,3.13,2 +0,640,3.5,2 +1,700,3.56,2 +0,520,2.73,2 +0,580,3.3,2 +0,700,4,1 +0,440,3.24,4 +0,720,3.77,3 +0,500,4,3 +0,600,3.62,3 +0,400,3.51,3 +0,540,2.81,3 +0,680,3.48,3 +1,800,3.43,2 +0,500,3.53,4 +1,620,3.37,2 +0,520,2.62,2 +1,620,3.23,3 +0,620,3.33,3 +0,300,3.01,3 +0,620,3.78,3 +0,500,3.88,4 +0,700,4,2 +1,540,3.84,2 +0,500,2.79,4 +0,800,3.6,2 +0,560,3.61,3 +0,580,2.88,2 +0,560,3.07,2 +0,500,3.35,2 +1,640,2.94,2 +0,800,3.54,3 +0,640,3.76,3 +0,380,3.59,4 +1,600,3.47,2 +0,560,3.59,2 +0,660,3.07,3 +1,400,3.23,4 +0,600,3.63,3 
+0,580,3.77,4 +0,800,3.31,3 +1,580,3.2,2 +1,700,4,1 +0,420,3.92,4 +1,600,3.89,1 +1,780,3.8,3 +0,740,3.54,1 +1,640,3.63,1 +0,540,3.16,3 +0,580,3.5,2 +0,740,3.34,4 +0,580,3.02,2 +0,460,2.87,2 +0,640,3.38,3 +1,600,3.56,2 +1,660,2.91,3 +0,340,2.9,1 +1,460,3.64,1 +0,460,2.98,1 +1,560,3.59,2 +0,540,3.28,3 +0,680,3.99,3 +1,480,3.02,1 +0,800,3.47,3 +0,800,2.9,2 +1,720,3.5,3 +0,620,3.58,2 +0,540,3.02,4 +0,480,3.43,2 +1,720,3.42,2 +0,580,3.29,4 +0,600,3.28,3 +0,380,3.38,2 +0,420,2.67,3 +1,800,3.53,1 +0,620,3.05,2 +1,660,3.49,2 +0,480,4,2 +0,500,2.86,4 +0,700,3.45,3 +0,440,2.76,2 +1,520,3.81,1 +1,680,2.96,3 +0,620,3.22,2 +0,540,3.04,1 +0,800,3.91,3 +0,680,3.34,2 +0,440,3.17,2 +0,680,3.64,3 +0,640,3.73,3 +0,660,3.31,4 +0,620,3.21,4 +1,520,4,2 +1,540,3.55,4 +1,740,3.52,4 +0,640,3.35,3 +1,520,3.3,2 +1,620,3.95,3 +0,520,3.51,2 +0,640,3.81,2 +0,680,3.11,2 +0,440,3.15,2 +1,520,3.19,3 +1,620,3.95,3 +1,520,3.9,3 +0,380,3.34,3 +0,560,3.24,4 +1,600,3.64,3 +1,680,3.46,2 +0,500,2.81,3 +1,640,3.95,2 +0,540,3.33,3 +1,680,3.67,2 +0,660,3.32,1 +0,520,3.12,2 +1,600,2.98,2 +0,460,3.77,3 +1,580,3.58,1 +1,680,3,4 +1,660,3.14,2 +0,660,3.94,2 +0,360,3.27,3 +0,660,3.45,4 +0,520,3.1,4 +1,440,3.39,2 +0,600,3.31,4 +1,800,3.22,1 +1,660,3.7,4 +0,800,3.15,4 +0,420,2.26,4 +1,620,3.45,2 +0,800,2.78,2 +0,680,3.7,2 +0,800,3.97,1 +0,480,2.55,1 +0,520,3.25,3 +0,560,3.16,1 +0,460,3.07,2 +0,540,3.5,2 +0,720,3.4,3 +0,640,3.3,2 +1,660,3.6,3 +1,400,3.15,2 +1,680,3.98,2 +0,220,2.83,3 +0,580,3.46,4 +1,540,3.17,1 +0,580,3.51,2 +0,540,3.13,2 +0,440,2.98,3 +0,560,4,3 +0,660,3.67,2 +0,660,3.77,3 +1,520,3.65,4 +0,540,3.46,4 +1,300,2.84,2 +1,340,3,2 +1,780,3.63,4 +1,480,3.71,4 +0,540,3.28,1 +0,460,3.14,3 +0,460,3.58,2 +0,500,3.01,4 +0,420,2.69,2 +0,520,2.7,3 +0,680,3.9,1 +0,680,3.31,2 +1,560,3.48,2 +0,580,3.34,2 +0,500,2.93,4 +0,740,4,3 +0,660,3.59,3 +0,420,2.96,1 +0,560,3.43,3 +1,460,3.64,3 +1,620,3.71,1 +0,520,3.15,3 +0,620,3.09,4 +0,540,3.2,1 +1,660,3.47,3 +0,500,3.23,4 +1,560,2.65,3 +0,500,3.95,4 +0,580,3.06,2 
+0,520,3.35,3 +0,500,3.03,3 +0,600,3.35,2 +0,580,3.8,2 +0,400,3.36,2 +0,620,2.85,2 +1,780,4,2 +0,620,3.43,3 +1,580,3.12,3 +0,700,3.52,2 +1,540,3.78,2 +1,760,2.81,1 +0,700,3.27,2 +0,720,3.31,1 +1,560,3.69,3 +0,720,3.94,3 +1,520,4,1 +1,540,3.49,1 +0,680,3.14,2 +0,460,3.44,2 +1,560,3.36,1 +0,480,2.78,3 +0,460,2.93,3 +0,620,3.63,3 +0,580,4,1 +0,800,3.89,2 +1,540,3.77,2 +1,680,3.76,3 +1,680,2.42,1 +1,620,3.37,1 +0,560,3.78,2 +0,560,3.49,4 +0,620,3.63,2 +1,800,4,2 +0,640,3.12,3 +0,540,2.7,2 +0,700,3.65,2 +1,540,3.49,2 +0,540,3.51,2 +0,660,4,1 +1,480,2.62,2 +0,420,3.02,1 +1,740,3.86,2 +0,580,3.36,2 +0,640,3.17,2 +0,640,3.51,2 +1,800,3.05,2 +1,660,3.88,2 +1,600,3.38,3 +1,620,3.75,2 +1,460,3.99,3 +0,620,4,2 +0,560,3.04,3 +0,460,2.63,2 +0,700,3.65,2 +0,600,3.89,3 diff --git a/python/Supervised Learning/Project/.ipynb_checkpoints/finding_donors-checkpoint.ipynb b/python/Supervised Learning/Project/.ipynb_checkpoints/finding_donors-checkpoint.ipynb index 4909e69..3c24309 100644 --- a/python/Supervised Learning/Project/.ipynb_checkpoints/finding_donors-checkpoint.ipynb +++ b/python/Supervised Learning/Project/.ipynb_checkpoints/finding_donors-checkpoint.ipynb @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -165,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -196,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -217,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -293,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -303,16 +303,16 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 44, 
+ "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, @@ -342,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -351,7 +351,7 @@ "(0, 1500)" ] }, - "execution_count": 45, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, @@ -376,7 +376,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -385,7 +385,7 @@ "(0, 1000)" ] }, - "execution_count": 46, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" }, @@ -410,7 +410,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -446,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -474,7 +474,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -483,7 +483,7 @@ "(0, 1500)" ] }, - "execution_count": 49, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" }, @@ -508,7 +508,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -517,7 +517,7 @@ "(0, 1500)" ] }, - "execution_count": 50, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" }, @@ -552,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -740,14 +740,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 60, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -771,70 +764,100 @@ " \n", " \n", " \n", + " age\n", " workclass\n", " education_level\n", + " education-num\n", " marital-status\n", " occupation\n", " relationship\n", " race\n", " sex\n", + " capital-gain\n", + " capital-loss\n", + " 
hours-per-week\n", " native-country\n", " \n", " \n", " \n", " \n", - " 14475\n", + " 38017\n", + " 0.260274\n", " Private\n", - " Bachelors\n", - " Married-civ-spouse\n", - " Sales\n", - " Husband\n", - " White\n", - " Male\n", - " United-States\n", - " \n", - " \n", - " 14051\n", - " State-gov\n", - " Bachelors\n", - " Never-married\n", - " Prof-specialty\n", - " Not-in-family\n", - " White\n", - " Male\n", - " United-States\n", - " \n", - " \n", - " 40954\n", - " Private\n", - " Some-college\n", + " HS-grad\n", + " 0.533333\n", " Never-married\n", " Adm-clerical\n", - " Own-child\n", - " Black\n", - " Male\n", - " United-States\n", - " \n", - " \n", - " 29769\n", - " State-gov\n", - " Masters\n", - " Never-married\n", - " Exec-managerial\n", - " Not-in-family\n", + " Unmarried\n", " White\n", " Female\n", + " 0.000000\n", + " 0.0\n", + " 0.397959\n", " United-States\n", " \n", " \n", - " 22474\n", + " 13752\n", + " 0.219178\n", " Private\n", - " Bachelors\n", + " HS-grad\n", + " 0.533333\n", " Married-civ-spouse\n", - " Exec-managerial\n", + " Transport-moving\n", " Husband\n", " White\n", " Male\n", + " 0.000000\n", + " 0.0\n", + " 0.397959\n", + " United-States\n", + " \n", + " \n", + " 31365\n", + " 0.054795\n", + " Private\n", + " HS-grad\n", + " 0.533333\n", + " Never-married\n", + " Handlers-cleaners\n", + " Not-in-family\n", + " Asian-Pac-Islander\n", + " Female\n", + " 0.000000\n", + " 0.0\n", + " 0.336735\n", + " South\n", + " \n", + " \n", + " 8526\n", + " 0.219178\n", + " Private\n", + " 7th-8th\n", + " 0.200000\n", + " Married-civ-spouse\n", + " Craft-repair\n", + " Husband\n", + " White\n", + " Male\n", + " 0.000000\n", + " 0.0\n", + " 0.397959\n", + " United-States\n", + " \n", + " \n", + " 32263\n", + " 0.534247\n", + " Private\n", + " Doctorate\n", + " 1.000000\n", + " Married-civ-spouse\n", + " Prof-specialty\n", + " Husband\n", + " White\n", + " Male\n", + " 0.777174\n", + " 0.0\n", + " 0.653061\n", " United-States\n", " \n", " \n", @@ -842,34 
+865,41 @@ "" ], "text/plain": [ - " workclass education_level marital-status occupation \\\n", - "14475 Private Bachelors Married-civ-spouse Sales \n", - "14051 State-gov Bachelors Never-married Prof-specialty \n", - "40954 Private Some-college Never-married Adm-clerical \n", - "29769 State-gov Masters Never-married Exec-managerial \n", - "22474 Private Bachelors Married-civ-spouse Exec-managerial \n", + " age workclass education_level education-num marital-status \\\n", + "38017 0.260274 Private HS-grad 0.533333 Never-married \n", + "13752 0.219178 Private HS-grad 0.533333 Married-civ-spouse \n", + "31365 0.054795 Private HS-grad 0.533333 Never-married \n", + "8526 0.219178 Private 7th-8th 0.200000 Married-civ-spouse \n", + "32263 0.534247 Private Doctorate 1.000000 Married-civ-spouse \n", "\n", - " relationship race sex native-country \n", - "14475 Husband White Male United-States \n", - "14051 Not-in-family White Male United-States \n", - "40954 Own-child Black Male United-States \n", - "29769 Not-in-family White Female United-States \n", - "22474 Husband White Male United-States " + " occupation relationship race sex \\\n", + "38017 Adm-clerical Unmarried White Female \n", + "13752 Transport-moving Husband White Male \n", + "31365 Handlers-cleaners Not-in-family Asian-Pac-Islander Female \n", + "8526 Craft-repair Husband White Male \n", + "32263 Prof-specialty Husband White Male \n", + "\n", + " capital-gain capital-loss hours-per-week native-country \n", + "38017 0.000000 0.0 0.397959 United-States \n", + "13752 0.000000 0.0 0.397959 United-States \n", + "31365 0.000000 0.0 0.336735 South \n", + "8526 0.000000 0.0 0.397959 United-States \n", + "32263 0.777174 0.0 0.653061 United-States " ] }, - "execution_count": 60, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "non_numeric_features = features_log_minmax_transform.drop(numerical, axis=1)\n", - "non_numeric_features.sample(frac=1).head(5)" + "# 
non_numeric_features = features_log_minmax_transform.drop(numerical, axis=1)\n", + "features_log_minmax_transform.sample(frac=1).head(5)" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 16, "metadata": { "scrolled": true }, @@ -878,24 +908,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "98 total features after one-hot encoding.\n", + "103 total features after one-hot encoding.\n", "\n", "Encoded feature names are:\n", - "['workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_level_ 10th', 'education_level_ 11th', 'education_level_ 12th', 'education_level_ 1st-4th', 'education_level_ 5th-6th', 'education_level_ 7th-8th', 'education_level_ 9th', 'education_level_ Assoc-acdm', 'education_level_ Assoc-voc', 'education_level_ Bachelors', 'education_level_ Doctorate', 'education_level_ HS-grad', 'education_level_ Masters', 'education_level_ Preschool', 'education_level_ Prof-school', 'education_level_ Some-college', 'marital-status_ Divorced', 'marital-status_ Married-AF-spouse', 'marital-status_ Married-civ-spouse', 'marital-status_ Married-spouse-absent', 'marital-status_ Never-married', 'marital-status_ Separated', 'marital-status_ Widowed', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-serv', 'occupation_ Sales', 'occupation_ Tech-support', 'occupation_ Transport-moving', 'relationship_ Husband', 'relationship_ Not-in-family', 'relationship_ Other-relative', 'relationship_ Own-child', 'relationship_ Unmarried', 'relationship_ Wife', 'race_ Amer-Indian-Eskimo', 'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 
'race_ White', 'sex_ Female', 'sex_ Male', 'native-country_ Cambodia', 'native-country_ Canada', 'native-country_ China', 'native-country_ Columbia', 'native-country_ Cuba', 'native-country_ Dominican-Republic', 'native-country_ Ecuador', 'native-country_ El-Salvador', 'native-country_ England', 'native-country_ France', 'native-country_ Germany', 'native-country_ Greece', 'native-country_ Guatemala', 'native-country_ Haiti', 'native-country_ Holand-Netherlands', 'native-country_ Honduras', 'native-country_ Hong', 'native-country_ Hungary', 'native-country_ India', 'native-country_ Iran', 'native-country_ Ireland', 'native-country_ Italy', 'native-country_ Jamaica', 'native-country_ Japan', 'native-country_ Laos', 'native-country_ Mexico', 'native-country_ Nicaragua', 'native-country_ Outlying-US(Guam-USVI-etc)', 'native-country_ Peru', 'native-country_ Philippines', 'native-country_ Poland', 'native-country_ Portugal', 'native-country_ Puerto-Rico', 'native-country_ Scotland', 'native-country_ South', 'native-country_ Taiwan', 'native-country_ Thailand', 'native-country_ Trinadad&Tobago', 'native-country_ United-States', 'native-country_ Vietnam', 'native-country_ Yugoslavia']\n", + "['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_level_ 10th', 'education_level_ 11th', 'education_level_ 12th', 'education_level_ 1st-4th', 'education_level_ 5th-6th', 'education_level_ 7th-8th', 'education_level_ 9th', 'education_level_ Assoc-acdm', 'education_level_ Assoc-voc', 'education_level_ Bachelors', 'education_level_ Doctorate', 'education_level_ HS-grad', 'education_level_ Masters', 'education_level_ Preschool', 'education_level_ Prof-school', 'education_level_ Some-college', 'marital-status_ Divorced', 'marital-status_ Married-AF-spouse', 'marital-status_ 
Married-civ-spouse', 'marital-status_ Married-spouse-absent', 'marital-status_ Never-married', 'marital-status_ Separated', 'marital-status_ Widowed', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-serv', 'occupation_ Sales', 'occupation_ Tech-support', 'occupation_ Transport-moving', 'relationship_ Husband', 'relationship_ Not-in-family', 'relationship_ Other-relative', 'relationship_ Own-child', 'relationship_ Unmarried', 'relationship_ Wife', 'race_ Amer-Indian-Eskimo', 'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 'race_ White', 'sex_ Female', 'sex_ Male', 'native-country_ Cambodia', 'native-country_ Canada', 'native-country_ China', 'native-country_ Columbia', 'native-country_ Cuba', 'native-country_ Dominican-Republic', 'native-country_ Ecuador', 'native-country_ El-Salvador', 'native-country_ England', 'native-country_ France', 'native-country_ Germany', 'native-country_ Greece', 'native-country_ Guatemala', 'native-country_ Haiti', 'native-country_ Holand-Netherlands', 'native-country_ Honduras', 'native-country_ Hong', 'native-country_ Hungary', 'native-country_ India', 'native-country_ Iran', 'native-country_ Ireland', 'native-country_ Italy', 'native-country_ Jamaica', 'native-country_ Japan', 'native-country_ Laos', 'native-country_ Mexico', 'native-country_ Nicaragua', 'native-country_ Outlying-US(Guam-USVI-etc)', 'native-country_ Peru', 'native-country_ Philippines', 'native-country_ Poland', 'native-country_ Portugal', 'native-country_ Puerto-Rico', 'native-country_ Scotland', 'native-country_ South', 'native-country_ Taiwan', 'native-country_ Thailand', 'native-country_ Trinadad&Tobago', 'native-country_ United-States', 'native-country_ Vietnam', 'native-country_ 
Yugoslavia']\n", "\n", "The income col now looks like:\n", - "40240 1\n", - "19808 1\n", - "26118 0\n", - "590 0\n", - "42229 1\n", + "19232 0\n", + "45209 0\n", + "26283 1\n", + "41688 0\n", + "9039 0\n", "Name: income, dtype: object\n" ] } ], "source": [ "# TODO: One-hot encode the 'features_log_minmax_transform' data using pandas.get_dummies()\n", - "features_final = pd.get_dummies(non_numeric_features)\n", + "features_final = pd.get_dummies(features_log_minmax_transform)\n", "\n", "# TODO: Encode the 'income_raw' data to numerical values\n", "income_raw.iloc[::-1][income_raw.iloc[::-1] == '<=50K'] = 0\n", @@ -910,6 +940,37 @@ "print(f'The income col now looks like:\\n{income_raw.sample(frac=1).head(5)}')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Convert income raw to dtype of int32" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0\n", + "1 0\n", + "2 0\n", + "Name: income, dtype: int32" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "income_raw = pd.Series(income_raw, dtype='int32')\n", + "income_raw.head(3)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -932,7 +993,251 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageeducation-numcapital-gaincapital-losshours-per-weekworkclass_ Federal-govworkclass_ Local-govworkclass_ Privateworkclass_ Self-emp-incworkclass_ Self-emp-not-inc...native-country_ Portugalnative-country_ Puerto-Riconative-country_ Scotlandnative-country_ Southnative-country_ Taiwannative-country_ Thailandnative-country_ Trinadad&Tobagonative-country_ United-Statesnative-country_ Vietnamnative-country_ Yugoslavia
132490.1095890.8666670.0000000.00.39795900100...0000000100
225420.4109590.5333330.7870510.00.43877600100...0000000100
12360.2465750.8000000.0000000.00.39795900100...0000000100
237020.3698630.8000000.0000000.00.50000000100...0000000100
83390.3561640.5333330.0000000.00.39795900100...0000000100
\n", + "

5 rows × 103 columns

\n", + "
" + ], + "text/plain": [ + " age education-num capital-gain capital-loss hours-per-week \\\n", + "13249 0.109589 0.866667 0.000000 0.0 0.397959 \n", + "22542 0.410959 0.533333 0.787051 0.0 0.438776 \n", + "1236 0.246575 0.800000 0.000000 0.0 0.397959 \n", + "23702 0.369863 0.800000 0.000000 0.0 0.500000 \n", + "8339 0.356164 0.533333 0.000000 0.0 0.397959 \n", + "\n", + " workclass_ Federal-gov workclass_ Local-gov workclass_ Private \\\n", + "13249 0 0 1 \n", + "22542 0 0 1 \n", + "1236 0 0 1 \n", + "23702 0 0 1 \n", + "8339 0 0 1 \n", + "\n", + " workclass_ Self-emp-inc workclass_ Self-emp-not-inc ... \\\n", + "13249 0 0 ... \n", + "22542 0 0 ... \n", + "1236 0 0 ... \n", + "23702 0 0 ... \n", + "8339 0 0 ... \n", + "\n", + " native-country_ Portugal native-country_ Puerto-Rico \\\n", + "13249 0 0 \n", + "22542 0 0 \n", + "1236 0 0 \n", + "23702 0 0 \n", + "8339 0 0 \n", + "\n", + " native-country_ Scotland native-country_ South \\\n", + "13249 0 0 \n", + "22542 0 0 \n", + "1236 0 0 \n", + "23702 0 0 \n", + "8339 0 0 \n", + "\n", + " native-country_ Taiwan native-country_ Thailand \\\n", + "13249 0 0 \n", + "22542 0 0 \n", + "1236 0 0 \n", + "23702 0 0 \n", + "8339 0 0 \n", + "\n", + " native-country_ Trinadad&Tobago native-country_ United-States \\\n", + "13249 0 1 \n", + "22542 0 1 \n", + "1236 0 1 \n", + "23702 0 1 \n", + "8339 0 1 \n", + "\n", + " native-country_ Vietnam native-country_ Yugoslavia \n", + "13249 0 0 \n", + "22542 0 0 \n", + "1236 0 0 \n", + "23702 0 0 \n", + "8339 0 0 \n", + "\n", + "[5 rows x 103 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features_final.sample(frac=1).head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -1016,7 +1321,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -1028,7 +1333,7 @@ }, { "cell_type": "code", - 
"execution_count": 68, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1163,7 +1468,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1273,11 +1578,11 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ - "# TODO: Import two metrics from sklearn - fbeta_score and accuracy_score\n", + "from sklearn.metrics import fbeta_score, accuracy_score\n", "\n", "def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): \n", " '''\n", @@ -1289,12 +1594,12 @@ " - X_test: features testing set\n", " - y_test: income testing set\n", " '''\n", - " \n", + " beta = 0.5\n", " results = {}\n", " \n", " # TODO: Fit the learner to the training data using slicing with 'sample_size' using .fit(training_features[:], training_labels[:])\n", " start = time() # Get start time\n", - " learner = learner.fit(X_train, y_train)\n", + " learner = learner.fit(X_train[:sample_size], y_train[:sample_size])\n", " end = time() # Get end time\n", " \n", " # TODO: Calculate the training time\n", @@ -1303,24 +1608,24 @@ " # TODO: Get the predictions on the test set(X_test),\n", " # then get predictions on the first 300 training samples(X_train) using .predict()\n", " start = time() # Get start time\n", - " predictions_test = None\n", - " predictions_train = None\n", + " predictions_test = learner.predict(X_test)\n", + " predictions_train = learner.predict(X_train[:300])\n", " end = time() # Get end time\n", " \n", " # TODO: Calculate the total prediction time\n", - " results['pred_time'] = None\n", + " results['pred_time'] = end - start\n", " \n", " # TODO: Compute accuracy on the first 300 training samples which is y_train[:300]\n", - " results['acc_train'] = None\n", + " results['acc_train'] = accuracy_score(y_train[:300], predictions_train)\n", " \n", " # TODO: Compute accuracy on test set using accuracy_score()\n", - " results['acc_test'] = 
None\n", + " results['acc_test'] = accuracy_score(y_test, predictions_test)\n", " \n", " # TODO: Compute F-score on the the first 300 training samples using fbeta_score()\n", - " results['f_train'] = None\n", + " results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta)\n", " \n", " # TODO: Compute F-score on the test set which is y_test\n", - " results['f_test'] = None\n", + " results['f_test'] = fbeta_score(y_test, predictions_test, beta)\n", " \n", " # Success\n", " print(\"{} trained on {} samples.\".format(learner.__class__.__name__, sample_size))\n", @@ -1347,42 +1652,63 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "13181 0\n", + "10342 0\n", + "20881 0\n", + "24972 1\n", + "43867 0\n", + "Name: income, dtype: int32" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n" - ] - }, - { - "ename": "TypeError", - "evalue": "unsupported operand type(s) for +: 'int' and 'NoneType'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;31m# Run metrics 
visualization for the three supervised learning models chosen\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 26\u001b[0;31m \u001b[0mvs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccuracy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfscore\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/courses/Udacity Machine Learning - Introduction Nanodegree Program/python/Supervised Learning/Project/visuals.py\u001b[0m in \u001b[0;36mevaluate\u001b[0;34m(results, accuracy, f1)\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;31m# Creative plot code\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 75\u001b[0;31m \u001b[0max\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m//\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m%\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mbar_width\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mlearner\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmetric\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwidth\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbar_width\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolor\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcolors\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 76\u001b[0m 
\u001b[0max\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m//\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m%\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_xticks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0.45\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1.45\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2.45\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0max\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m//\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m%\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_xticklabels\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"1%\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"10%\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"100%\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/matplotlib/__init__.py\u001b[0m in \u001b[0;36minner\u001b[0;34m(ax, data, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1599\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0minner\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0max\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1600\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1601\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0max\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msanitize_sequence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1602\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1603\u001b[0m \u001b[0mbound\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_sig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0max\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/matplotlib/axes/_axes.py\u001b[0m in \u001b[0;36mbar\u001b[0;34m(self, x, height, width, bottom, align, **kwargs)\u001b[0m\n\u001b[1;32m 2428\u001b[0m \u001b[0medgecolor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2429\u001b[0m \u001b[0mlinewidth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlw\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2430\u001b[0;31m \u001b[0mlabel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'_nolegend_'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2431\u001b[0m )\n\u001b[1;32m 2432\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/matplotlib/patches.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, xy, 
width, height, angle, **kwargs)\u001b[0m\n\u001b[1;32m 714\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 715\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_x1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_x0\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_width\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 716\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_y1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_y0\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_height\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 717\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 718\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mangle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mangle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for +: 'int' and 'NoneType'" + "36177 3617 361\n", + "\n", + "0 361\n", + "GaussianNB trained on 361 samples.\n", + "1 3617\n", + "GaussianNB trained on 3617 samples.\n", + "2 36177\n", + "GaussianNB trained on 36177 samples.\n", + "0 361\n", + "DecisionTreeClassifier trained on 361 samples.\n", + "1 3617\n", + "DecisionTreeClassifier trained on 3617 samples.\n", + "2 36177\n", + "DecisionTreeClassifier trained on 36177 samples.\n", + "0 361\n", + "SVC trained on 361 samples.\n", + "1 3617\n", + "SVC trained on 3617 samples.\n", + "2 36177\n", + "SVC trained on 36177 samples.\n" ] }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -1395,29 +1721,33 @@ ], "source": [ "# TODO: Import the three supervised learning models from sklearn\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.svm import SVC\n", "\n", "# TODO: Initialize the three models\n", - "clf_A = None\n", - "clf_B = None\n", - "clf_C = None\n", + "clf_A = GaussianNB()\n", + "clf_B = DecisionTreeClassifier()\n", + "clf_C = SVC()\n", "\n", "# TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data\n", "# HINT: samples_100 is the entire training set i.e. len(y_train)\n", "# HINT: samples_10 is 10% of samples_100 (ensure to set the count of the values to be `int` and not `float`)\n", "# HINT: samples_1 is 1% of samples_100 (ensure to set the count of the values to be `int` and not `float`)\n", - "samples_100 = None\n", - "samples_10 = None\n", - "samples_1 = None\n", - "\n", + "samples_100 = int(X_train.shape[0])\n", + "samples_10 = int(samples_100 * 0.1)\n", + "samples_1 = int(samples_100 * 0.01)\n", + "print(samples_100, samples_10, samples_1)\n", + "print()\n", "# Collect results on the learners\n", "results = {}\n", "for clf in [clf_A, clf_B, clf_C]:\n", " clf_name = clf.__class__.__name__\n", " results[clf_name] = {}\n", " for i, samples in enumerate([samples_1, samples_10, samples_100]):\n", + " print(i, samples)\n", " results[clf_name][i] = \\\n", " train_predict(clf, samples, X_train, y_train, X_test, y_test)\n", - "\n", "# Run metrics visualization for the three supervised learning models chosen\n", "vs.evaluate(results, accuracy, fscore)" ] @@ -1494,27 +1824,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m 
Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;31m# TODO: Fit the grid search object to the training data and find the optimal parameters using fit()\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0mgrid_fit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgrid_obj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;31m# Get the estimator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[1;32m 685\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 686\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 687\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_run_search\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 688\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 689\u001b[0m \u001b[0;31m# For multi-metric evaluation, store the best_index_, best_params_ and\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36m_run_search\u001b[0;34m(self, evaluate_candidates)\u001b[0m\n\u001b[1;32m 
1146\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_search\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1147\u001b[0m \u001b[0;34m\"\"\"Search all candidates in param_grid\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1148\u001b[0;31m \u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mParameterGrid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1149\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mevaluate_candidates\u001b[0;34m(candidate_params)\u001b[0m\n\u001b[1;32m 664\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 665\u001b[0m in product(candidate_params,\n\u001b[0;32m--> 666\u001b[0;31m cv.split(X, y, groups)))\n\u001b[0m\u001b[1;32m 667\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 932\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
933\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieval_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 934\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 935\u001b[0m \u001b[0;31m# Make sure that we get a last message telling us we are done\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 936\u001b[0m \u001b[0melapsed_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_start_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36mretrieve\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 831\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 832\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'supports_timeout'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 833\u001b[0;31m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 834\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 835\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36mwrap_future_result\u001b[0;34m(future, timeout)\u001b[0m\n\u001b[1;32m 519\u001b[0m AsyncResults.get from multiprocessing.\"\"\"\n\u001b[1;32m 520\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 521\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 522\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mLokyTimeoutError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTimeoutError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/usr/lib64/python3.7/concurrent/futures/_base.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__get_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 426\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 427\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_condition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 428\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_state\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mCANCELLED\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCANCELLED_AND_NOTIFIED\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib64/python3.7/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 294\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 296\u001b[0;31m \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 297\u001b[0m \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
298\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], "source": [ "# TODO: Import 'GridSearchCV', 'make_scorer', and any other necessary libraries\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import make_scorer\n", "\n", "# TODO: Initialize the classifier\n", - "clf = None\n", + "clf = SVC()\n", "\n", "# TODO: Create the parameters list you wish to tune, using a dictionary if needed.\n", "# HINT: parameters = {'parameter_1': [value1, value2], 'parameter_2': [value1, value2]}\n", - "parameters = None\n", + "parameters = {'C': np.logspace(-4, 10, 12, base=10),\n", + " 'gamma': np.logspace(-11, 3, 12, base=10),\n", + " 'kernel': ['linear', 'rbf']}\n", "\n", "# TODO: Make an fbeta_score scoring object using make_scorer()\n", - "scorer = None\n", + "scorer = make_scorer(fbeta_score, beta=0.5)\n", "\n", "# TODO: Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()\n", - "grid_obj = None\n", + "grid_obj = GridSearchCV(clf, param_grid=parameters, scoring=scorer, n_jobs=-1)\n", "\n", "# TODO: Fit the grid search object to the training data and find the optimal parameters using fit()\n", - "grid_fit = None\n", + "grid_fit = grid_obj.fit(X_train, y_train)\n", "\n", "# Get the estimator\n", "best_clf = grid_fit.best_estimator_\n", diff --git a/python/Supervised Learning/Project/finding_donors.ipynb b/python/Supervised Learning/Project/finding_donors.ipynb index 4909e69..8b4944e 100644 --- a/python/Supervised Learning/Project/finding_donors.ipynb +++ b/python/Supervised Learning/Project/finding_donors.ipynb @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -165,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -196,7 +196,7 @@ }, { 
"cell_type": "code", - "execution_count": 41, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -217,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -293,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -303,16 +303,16 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 44, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" }, @@ -342,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -351,7 +351,7 @@ "(0, 1500)" ] }, - "execution_count": 45, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" }, @@ -376,7 +376,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -385,7 +385,7 @@ "(0, 1000)" ] }, - "execution_count": 46, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" }, @@ -410,7 +410,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -446,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -474,7 +474,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -483,7 +483,7 @@ "(0, 1500)" ] }, - "execution_count": 49, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" }, @@ -508,7 +508,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -517,7 +517,7 @@ "(0, 1500)" ] }, - "execution_count": 50, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" }, @@ -552,7 +552,7 @@ }, { "cell_type": "code", 
- "execution_count": 59, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -740,14 +740,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 60, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -771,70 +764,100 @@ " \n", " \n", " \n", + " age\n", " workclass\n", " education_level\n", + " education-num\n", " marital-status\n", " occupation\n", " relationship\n", " race\n", " sex\n", + " capital-gain\n", + " capital-loss\n", + " hours-per-week\n", " native-country\n", " \n", " \n", " \n", " \n", - " 14475\n", + " 14204\n", + " 0.027397\n", " Private\n", - " Bachelors\n", - " Married-civ-spouse\n", - " Sales\n", - " Husband\n", - " White\n", - " Male\n", + " HS-grad\n", + " 0.533333\n", + " Never-married\n", + " Other-service\n", + " Own-child\n", + " Black\n", + " Female\n", + " 0.0\n", + " 0.0\n", + " 0.397959\n", " United-States\n", " \n", " \n", - " 14051\n", - " State-gov\n", - " Bachelors\n", + " 4740\n", + " 0.068493\n", + " Private\n", + " Assoc-voc\n", + " 0.666667\n", " Never-married\n", " Prof-specialty\n", " Not-in-family\n", " White\n", " Male\n", + " 0.0\n", + " 0.0\n", + " 0.295918\n", " United-States\n", " \n", " \n", - " 40954\n", + " 19821\n", + " 0.027397\n", " Private\n", " Some-college\n", + " 0.600000\n", " Never-married\n", - " Adm-clerical\n", + " Other-service\n", " Own-child\n", - " Black\n", - " Male\n", - " United-States\n", - " \n", - " \n", - " 29769\n", - " State-gov\n", - " Masters\n", - " Never-married\n", - " Exec-managerial\n", - " Not-in-family\n", " White\n", - " Female\n", + " Male\n", + " 0.0\n", + " 0.0\n", + " 0.244898\n", " United-States\n", " \n", " \n", - " 22474\n", - " Private\n", - " Bachelors\n", + " 15539\n", + " 0.219178\n", + " Self-emp-not-inc\n", + " 11th\n", + " 0.400000\n", " Married-civ-spouse\n", - " Exec-managerial\n", + " Craft-repair\n", " Husband\n", " White\n", " Male\n", + " 
0.0\n", + " 0.0\n", + " 0.500000\n", + " United-States\n", + " \n", + " \n", + " 416\n", + " 0.041096\n", + " Private\n", + " HS-grad\n", + " 0.533333\n", + " Married-civ-spouse\n", + " Machine-op-inspct\n", + " Husband\n", + " White\n", + " Male\n", + " 0.0\n", + " 0.0\n", + " 0.397959\n", " United-States\n", " \n", " \n", @@ -842,34 +865,41 @@ "" ], "text/plain": [ - " workclass education_level marital-status occupation \\\n", - "14475 Private Bachelors Married-civ-spouse Sales \n", - "14051 State-gov Bachelors Never-married Prof-specialty \n", - "40954 Private Some-college Never-married Adm-clerical \n", - "29769 State-gov Masters Never-married Exec-managerial \n", - "22474 Private Bachelors Married-civ-spouse Exec-managerial \n", + " age workclass education_level education-num \\\n", + "14204 0.027397 Private HS-grad 0.533333 \n", + "4740 0.068493 Private Assoc-voc 0.666667 \n", + "19821 0.027397 Private Some-college 0.600000 \n", + "15539 0.219178 Self-emp-not-inc 11th 0.400000 \n", + "416 0.041096 Private HS-grad 0.533333 \n", "\n", - " relationship race sex native-country \n", - "14475 Husband White Male United-States \n", - "14051 Not-in-family White Male United-States \n", - "40954 Own-child Black Male United-States \n", - "29769 Not-in-family White Female United-States \n", - "22474 Husband White Male United-States " + " marital-status occupation relationship race \\\n", + "14204 Never-married Other-service Own-child Black \n", + "4740 Never-married Prof-specialty Not-in-family White \n", + "19821 Never-married Other-service Own-child White \n", + "15539 Married-civ-spouse Craft-repair Husband White \n", + "416 Married-civ-spouse Machine-op-inspct Husband White \n", + "\n", + " sex capital-gain capital-loss hours-per-week native-country \n", + "14204 Female 0.0 0.0 0.397959 United-States \n", + "4740 Male 0.0 0.0 0.295918 United-States \n", + "19821 Male 0.0 0.0 0.244898 United-States \n", + "15539 Male 0.0 0.0 0.500000 United-States \n", + "416 Male 0.0 
0.0 0.397959 United-States " ] }, - "execution_count": 60, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "non_numeric_features = features_log_minmax_transform.drop(numerical, axis=1)\n", - "non_numeric_features.sample(frac=1).head(5)" + "# non_numeric_features = features_log_minmax_transform.drop(numerical, axis=1)\n", + "features_log_minmax_transform.sample(frac=1).head(5)" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 39, "metadata": { "scrolled": true }, @@ -878,24 +908,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "98 total features after one-hot encoding.\n", + "103 total features after one-hot encoding.\n", "\n", "Encoded feature names are:\n", - "['workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_level_ 10th', 'education_level_ 11th', 'education_level_ 12th', 'education_level_ 1st-4th', 'education_level_ 5th-6th', 'education_level_ 7th-8th', 'education_level_ 9th', 'education_level_ Assoc-acdm', 'education_level_ Assoc-voc', 'education_level_ Bachelors', 'education_level_ Doctorate', 'education_level_ HS-grad', 'education_level_ Masters', 'education_level_ Preschool', 'education_level_ Prof-school', 'education_level_ Some-college', 'marital-status_ Divorced', 'marital-status_ Married-AF-spouse', 'marital-status_ Married-civ-spouse', 'marital-status_ Married-spouse-absent', 'marital-status_ Never-married', 'marital-status_ Separated', 'marital-status_ Widowed', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-serv', 'occupation_ Sales', 'occupation_ 
Tech-support', 'occupation_ Transport-moving', 'relationship_ Husband', 'relationship_ Not-in-family', 'relationship_ Other-relative', 'relationship_ Own-child', 'relationship_ Unmarried', 'relationship_ Wife', 'race_ Amer-Indian-Eskimo', 'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 'race_ White', 'sex_ Female', 'sex_ Male', 'native-country_ Cambodia', 'native-country_ Canada', 'native-country_ China', 'native-country_ Columbia', 'native-country_ Cuba', 'native-country_ Dominican-Republic', 'native-country_ Ecuador', 'native-country_ El-Salvador', 'native-country_ England', 'native-country_ France', 'native-country_ Germany', 'native-country_ Greece', 'native-country_ Guatemala', 'native-country_ Haiti', 'native-country_ Holand-Netherlands', 'native-country_ Honduras', 'native-country_ Hong', 'native-country_ Hungary', 'native-country_ India', 'native-country_ Iran', 'native-country_ Ireland', 'native-country_ Italy', 'native-country_ Jamaica', 'native-country_ Japan', 'native-country_ Laos', 'native-country_ Mexico', 'native-country_ Nicaragua', 'native-country_ Outlying-US(Guam-USVI-etc)', 'native-country_ Peru', 'native-country_ Philippines', 'native-country_ Poland', 'native-country_ Portugal', 'native-country_ Puerto-Rico', 'native-country_ Scotland', 'native-country_ South', 'native-country_ Taiwan', 'native-country_ Thailand', 'native-country_ Trinadad&Tobago', 'native-country_ United-States', 'native-country_ Vietnam', 'native-country_ Yugoslavia']\n", + "['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_level_ 10th', 'education_level_ 11th', 'education_level_ 12th', 'education_level_ 1st-4th', 'education_level_ 5th-6th', 'education_level_ 7th-8th', 'education_level_ 9th', 'education_level_ Assoc-acdm', 'education_level_ Assoc-voc', 
'education_level_ Bachelors', 'education_level_ Doctorate', 'education_level_ HS-grad', 'education_level_ Masters', 'education_level_ Preschool', 'education_level_ Prof-school', 'education_level_ Some-college', 'marital-status_ Divorced', 'marital-status_ Married-AF-spouse', 'marital-status_ Married-civ-spouse', 'marital-status_ Married-spouse-absent', 'marital-status_ Never-married', 'marital-status_ Separated', 'marital-status_ Widowed', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-serv', 'occupation_ Sales', 'occupation_ Tech-support', 'occupation_ Transport-moving', 'relationship_ Husband', 'relationship_ Not-in-family', 'relationship_ Other-relative', 'relationship_ Own-child', 'relationship_ Unmarried', 'relationship_ Wife', 'race_ Amer-Indian-Eskimo', 'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 'race_ White', 'sex_ Female', 'sex_ Male', 'native-country_ Cambodia', 'native-country_ Canada', 'native-country_ China', 'native-country_ Columbia', 'native-country_ Cuba', 'native-country_ Dominican-Republic', 'native-country_ Ecuador', 'native-country_ El-Salvador', 'native-country_ England', 'native-country_ France', 'native-country_ Germany', 'native-country_ Greece', 'native-country_ Guatemala', 'native-country_ Haiti', 'native-country_ Holand-Netherlands', 'native-country_ Honduras', 'native-country_ Hong', 'native-country_ Hungary', 'native-country_ India', 'native-country_ Iran', 'native-country_ Ireland', 'native-country_ Italy', 'native-country_ Jamaica', 'native-country_ Japan', 'native-country_ Laos', 'native-country_ Mexico', 'native-country_ Nicaragua', 'native-country_ Outlying-US(Guam-USVI-etc)', 'native-country_ Peru', 'native-country_ Philippines', 
'native-country_ Poland', 'native-country_ Portugal', 'native-country_ Puerto-Rico', 'native-country_ Scotland', 'native-country_ South', 'native-country_ Taiwan', 'native-country_ Thailand', 'native-country_ Trinadad&Tobago', 'native-country_ United-States', 'native-country_ Vietnam', 'native-country_ Yugoslavia']\n", "\n", "The income col now looks like:\n", - "40240 1\n", - "19808 1\n", - "26118 0\n", - "590 0\n", - "42229 1\n", + "16481 0\n", + "37818 0\n", + "20804 0\n", + "3242 1\n", + "23475 0\n", "Name: income, dtype: object\n" ] } ], "source": [ "# TODO: One-hot encode the 'features_log_minmax_transform' data using pandas.get_dummies()\n", - "features_final = pd.get_dummies(non_numeric_features)\n", + "features_final = pd.get_dummies(features_log_minmax_transform)\n", "\n", "# TODO: Encode the 'income_raw' data to numerical values\n", "income_raw.iloc[::-1][income_raw.iloc[::-1] == '<=50K'] = 0\n", @@ -910,6 +940,37 @@ "print(f'The income col now looks like:\\n{income_raw.sample(frac=1).head(5)}')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Convert income raw to dtype of int32" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0\n", + "1 0\n", + "2 0\n", + "Name: income, dtype: int32" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "income_raw = pd.Series(income_raw, dtype='int32')\n", + "income_raw.head(3)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -932,7 +993,251 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageeducation-numcapital-gaincapital-losshours-per-weekworkclass_ Federal-govworkclass_ Local-govworkclass_ Privateworkclass_ Self-emp-incworkclass_ Self-emp-not-inc...native-country_ Portugalnative-country_ Puerto-Riconative-country_ Scotlandnative-country_ Southnative-country_ Taiwannative-country_ Thailandnative-country_ Trinadad&Tobagonative-country_ United-Statesnative-country_ Vietnamnative-country_ Yugoslavia
167020.3835620.8666670.00.00.34693901000...0000000100
88790.5205480.3333330.00.00.28571400100...0000000100
149280.1506850.6000000.00.00.39795900100...0000000100
280870.2739730.5333330.00.00.39795900100...0000000100
339700.2328770.5333330.00.00.39795900100...0000000100
\n", + "

5 rows × 103 columns

\n", + "
" + ], + "text/plain": [ + " age education-num capital-gain capital-loss hours-per-week \\\n", + "16702 0.383562 0.866667 0.0 0.0 0.346939 \n", + "8879 0.520548 0.333333 0.0 0.0 0.285714 \n", + "14928 0.150685 0.600000 0.0 0.0 0.397959 \n", + "28087 0.273973 0.533333 0.0 0.0 0.397959 \n", + "33970 0.232877 0.533333 0.0 0.0 0.397959 \n", + "\n", + " workclass_ Federal-gov workclass_ Local-gov workclass_ Private \\\n", + "16702 0 1 0 \n", + "8879 0 0 1 \n", + "14928 0 0 1 \n", + "28087 0 0 1 \n", + "33970 0 0 1 \n", + "\n", + " workclass_ Self-emp-inc workclass_ Self-emp-not-inc ... \\\n", + "16702 0 0 ... \n", + "8879 0 0 ... \n", + "14928 0 0 ... \n", + "28087 0 0 ... \n", + "33970 0 0 ... \n", + "\n", + " native-country_ Portugal native-country_ Puerto-Rico \\\n", + "16702 0 0 \n", + "8879 0 0 \n", + "14928 0 0 \n", + "28087 0 0 \n", + "33970 0 0 \n", + "\n", + " native-country_ Scotland native-country_ South \\\n", + "16702 0 0 \n", + "8879 0 0 \n", + "14928 0 0 \n", + "28087 0 0 \n", + "33970 0 0 \n", + "\n", + " native-country_ Taiwan native-country_ Thailand \\\n", + "16702 0 0 \n", + "8879 0 0 \n", + "14928 0 0 \n", + "28087 0 0 \n", + "33970 0 0 \n", + "\n", + " native-country_ Trinadad&Tobago native-country_ United-States \\\n", + "16702 0 1 \n", + "8879 0 1 \n", + "14928 0 1 \n", + "28087 0 1 \n", + "33970 0 1 \n", + "\n", + " native-country_ Vietnam native-country_ Yugoslavia \n", + "16702 0 0 \n", + "8879 0 0 \n", + "14928 0 0 \n", + "28087 0 0 \n", + "33970 0 0 \n", + "\n", + "[5 rows x 103 columns]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features_final.sample(frac=1).head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -1016,7 +1321,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -1028,7 +1333,7 @@ }, { "cell_type": "code", - "execution_count": 68, + 
"execution_count": 44, "metadata": {}, "outputs": [ { @@ -1163,7 +1468,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -1273,11 +1578,11 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ - "# TODO: Import two metrics from sklearn - fbeta_score and accuracy_score\n", + "from sklearn.metrics import fbeta_score, accuracy_score\n", "\n", "def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): \n", " '''\n", @@ -1289,12 +1594,12 @@ " - X_test: features testing set\n", " - y_test: income testing set\n", " '''\n", - " \n", + " beta = 0.5\n", " results = {}\n", " \n", " # TODO: Fit the learner to the training data using slicing with 'sample_size' using .fit(training_features[:], training_labels[:])\n", " start = time() # Get start time\n", - " learner = learner.fit(X_train, y_train)\n", + " learner = learner.fit(X_train[:sample_size], y_train[:sample_size])\n", " end = time() # Get end time\n", " \n", " # TODO: Calculate the training time\n", @@ -1303,24 +1608,24 @@ " # TODO: Get the predictions on the test set(X_test),\n", " # then get predictions on the first 300 training samples(X_train) using .predict()\n", " start = time() # Get start time\n", - " predictions_test = None\n", - " predictions_train = None\n", + " predictions_test = learner.predict(X_test)\n", + " predictions_train = learner.predict(X_train[:300])\n", " end = time() # Get end time\n", " \n", " # TODO: Calculate the total prediction time\n", - " results['pred_time'] = None\n", + " results['pred_time'] = end - start\n", " \n", " # TODO: Compute accuracy on the first 300 training samples which is y_train[:300]\n", - " results['acc_train'] = None\n", + " results['acc_train'] = accuracy_score(y_train[:300], predictions_train)\n", " \n", " # TODO: Compute accuracy on test set using accuracy_score()\n", - " results['acc_test'] = None\n", + " 
results['acc_test'] = accuracy_score(y_test, predictions_test)\n", " \n", " # TODO: Compute F-score on the the first 300 training samples using fbeta_score()\n", - " results['f_train'] = None\n", + " results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta)\n", " \n", " # TODO: Compute F-score on the test set which is y_test\n", - " results['f_test'] = None\n", + " results['f_test'] = fbeta_score(y_test, predictions_test, beta)\n", " \n", " # Success\n", " print(\"{} trained on {} samples.\".format(learner.__class__.__name__, sample_size))\n", @@ -1347,42 +1652,63 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "13181 0\n", + "10342 0\n", + "20881 0\n", + "24972 1\n", + "43867 0\n", + "Name: income, dtype: int32" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n", - "NoneType trained on None samples.\n" - ] - }, - { - "ename": "TypeError", - "evalue": "unsupported operand type(s) for +: 'int' and 'NoneType'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;31m# Run metrics visualization for the 
three supervised learning models chosen\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 26\u001b[0;31m \u001b[0mvs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccuracy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfscore\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/courses/Udacity Machine Learning - Introduction Nanodegree Program/python/Supervised Learning/Project/visuals.py\u001b[0m in \u001b[0;36mevaluate\u001b[0;34m(results, accuracy, f1)\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;31m# Creative plot code\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 75\u001b[0;31m \u001b[0max\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m//\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m%\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mbar_width\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mlearner\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmetric\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwidth\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbar_width\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolor\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcolors\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 76\u001b[0m 
\u001b[0max\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m//\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m%\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_xticks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0.45\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1.45\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2.45\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0max\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m//\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m%\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_xticklabels\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"1%\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"10%\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"100%\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/matplotlib/__init__.py\u001b[0m in \u001b[0;36minner\u001b[0;34m(ax, data, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1599\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0minner\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0max\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1600\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1601\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0max\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msanitize_sequence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1602\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1603\u001b[0m \u001b[0mbound\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_sig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0max\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/matplotlib/axes/_axes.py\u001b[0m in \u001b[0;36mbar\u001b[0;34m(self, x, height, width, bottom, align, **kwargs)\u001b[0m\n\u001b[1;32m 2428\u001b[0m \u001b[0medgecolor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2429\u001b[0m \u001b[0mlinewidth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlw\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2430\u001b[0;31m \u001b[0mlabel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'_nolegend_'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2431\u001b[0m )\n\u001b[1;32m 2432\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/matplotlib/patches.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, xy, 
width, height, angle, **kwargs)\u001b[0m\n\u001b[1;32m 714\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 715\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_x1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_x0\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_width\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 716\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_y1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_y0\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_height\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 717\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 718\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mangle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mangle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for +: 'int' and 'NoneType'" + "36177 3617 361\n", + "\n", + "0 361\n", + "GaussianNB trained on 361 samples.\n", + "1 3617\n", + "GaussianNB trained on 3617 samples.\n", + "2 36177\n", + "GaussianNB trained on 36177 samples.\n", + "0 361\n", + "DecisionTreeClassifier trained on 361 samples.\n", + "1 3617\n", + "DecisionTreeClassifier trained on 3617 samples.\n", + "2 36177\n", + "DecisionTreeClassifier trained on 36177 samples.\n", + "0 361\n", + "SVC trained on 361 samples.\n", + "1 3617\n", + "SVC trained on 3617 samples.\n", + "2 36177\n", + "SVC trained on 36177 samples.\n" ] }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -1395,29 +1721,33 @@ ], "source": [ "# TODO: Import the three supervised learning models from sklearn\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.svm import SVC\n", "\n", "# TODO: Initialize the three models\n", - "clf_A = None\n", - "clf_B = None\n", - "clf_C = None\n", + "clf_A = GaussianNB()\n", + "clf_B = DecisionTreeClassifier()\n", + "clf_C = SVC()\n", "\n", "# TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data\n", "# HINT: samples_100 is the entire training set i.e. len(y_train)\n", "# HINT: samples_10 is 10% of samples_100 (ensure to set the count of the values to be `int` and not `float`)\n", "# HINT: samples_1 is 1% of samples_100 (ensure to set the count of the values to be `int` and not `float`)\n", - "samples_100 = None\n", - "samples_10 = None\n", - "samples_1 = None\n", - "\n", + "samples_100 = int(X_train.shape[0])\n", + "samples_10 = int(samples_100 * 0.1)\n", + "samples_1 = int(samples_100 * 0.01)\n", + "print(samples_100, samples_10, samples_1)\n", + "print()\n", "# Collect results on the learners\n", "results = {}\n", "for clf in [clf_A, clf_B, clf_C]:\n", " clf_name = clf.__class__.__name__\n", " results[clf_name] = {}\n", " for i, samples in enumerate([samples_1, samples_10, samples_100]):\n", + " print(i, samples)\n", " results[clf_name][i] = \\\n", " train_predict(clf, samples, X_train, y_train, X_test, y_test)\n", - "\n", "# Run metrics visualization for the three supervised learning models chosen\n", "vs.evaluate(results, accuracy, fscore)" ] @@ -1494,27 +1824,109 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,\n", + " decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',\n", + " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", + " 
tol=0.001, verbose=False)" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import make_scorer\n", + "\n", + "clf = SVC(C=0.01, gamma=0.1, kernel='rbf')\n", + "\n", + "clf.fit(X_train, y_train)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, "metadata": {}, "outputs": [], + "source": [ + "predictions_test = clf.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8245439469320066" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "accuracy_score(y_test, predictions_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;31m# TODO: Fit the grid search object to the training data and find the optimal parameters using fit()\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0mgrid_fit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgrid_obj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;31m# Get the 
estimator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[1;32m 685\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 686\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 687\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_run_search\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 688\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 689\u001b[0m \u001b[0;31m# For multi-metric evaluation, store the best_index_, best_params_ and\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36m_run_search\u001b[0;34m(self, evaluate_candidates)\u001b[0m\n\u001b[1;32m 1146\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_search\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1147\u001b[0m \u001b[0;34m\"\"\"Search all candidates in param_grid\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1148\u001b[0;31m \u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mParameterGrid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1149\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
1150\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mevaluate_candidates\u001b[0;34m(candidate_params)\u001b[0m\n\u001b[1;32m 664\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 665\u001b[0m in product(candidate_params,\n\u001b[0;32m--> 666\u001b[0;31m cv.split(X, y, groups)))\n\u001b[0m\u001b[1;32m 667\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 932\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 933\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieval_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 934\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 935\u001b[0m \u001b[0;31m# Make sure that we get a last message telling us we are done\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 936\u001b[0m \u001b[0melapsed_time\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_start_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36mretrieve\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 831\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 832\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'supports_timeout'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 833\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 834\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 835\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/udacity-ML-3.7.3/lib/python3.7/site-packages/joblib/_parallel_backends.py\u001b[0m in 
\u001b[0;36mwrap_future_result\u001b[0;34m(future, timeout)\u001b[0m\n\u001b[1;32m 519\u001b[0m AsyncResults.get from multiprocessing.\"\"\"\n\u001b[1;32m 520\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 521\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 522\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mLokyTimeoutError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTimeoutError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib64/python3.7/concurrent/futures/_base.py\u001b[0m in \u001b[0;36mresult\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__get_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 426\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 427\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_condition\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 428\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_state\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mCANCELLED\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mCANCELLED_AND_NOTIFIED\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib64/python3.7/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 294\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 296\u001b[0;31m \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 297\u001b[0m \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], "source": [ "# TODO: Import 'GridSearchCV', 'make_scorer', and any other necessary libraries\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import make_scorer\n", "\n", "# TODO: Initialize the classifier\n", - "clf = None\n", + "clf = SVC()\n", "\n", "# TODO: Create the parameters list you wish to tune, using a dictionary if needed.\n", "# HINT: parameters = {'parameter_1': [value1, value2], 'parameter_2': [value1, value2]}\n", - "parameters = None\n", + "parameters = {'C': np.logspace(-4, 10, 12, base=10),\n", + " 'gamma': np.logspace(-11, 3, 12, base=10),\n", + " 'kernel': ['linear', 'rbf']}\n", "\n", "# TODO: Make an fbeta_score scoring object using make_scorer()\n", - "scorer = None\n", + "scorer = make_scorer(fbeta_score, beta=0.5)\n", "\n", 
"# TODO: Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()\n", - "grid_obj = None\n", + "grid_obj = GridSearchCV(clf, param_grid=parameters, scoring=scorer, n_jobs=-1)\n", "\n", "# TODO: Fit the grid search object to the training data and find the optimal parameters using fit()\n", - "grid_fit = None\n", + "grid_fit = grid_obj.fit(X_train, y_train)\n", "\n", "# Get the estimator\n", "best_clf = grid_fit.best_estimator_\n",