adding all work done so far (lessons 1 - 5)
@@ -0,0 +1,96 @@
0.24539,0.81725,0
0.21774,0.76462,0
0.20161,0.69737,0
0.20161,0.58041,0
0.2477,0.49561,0
0.32834,0.44883,0
0.39516,0.48099,0
0.39286,0.57164,0
0.33525,0.62135,0
0.33986,0.71199,0
0.34447,0.81433,0
0.28226,0.82602,0
0.26613,0.75,0
0.26613,0.63596,0
0.32604,0.54825,0
0.28917,0.65643,0
0.80069,0.71491,0
0.80069,0.64181,0
0.80069,0.50146,0
0.79839,0.36988,0
0.73157,0.25,0
0.63249,0.18275,0
0.60023,0.27047,0
0.66014,0.34649,0
0.70161,0.42251,0
0.70853,0.53947,0
0.71544,0.63304,0
0.74309,0.72076,0
0.75,0.63596,0
0.75,0.46345,0
0.72235,0.35526,0
0.66935,0.28509,0
0.20622,0.94298,1
0.26613,0.8962,1
0.38134,0.8962,1
0.42051,0.94591,1
0.49885,0.86404,1
0.31452,0.93421,1
0.53111,0.72076,1
0.45276,0.74415,1
0.53571,0.6038,1
0.60484,0.71491,1
0.60945,0.58333,1
0.51267,0.47807,1
0.50806,0.59211,1
0.46198,0.30556,1
0.5288,0.41082,1
0.38594,0.35819,1
0.31682,0.31433,1
0.29608,0.20906,1
0.36982,0.27632,1
0.42972,0.18275,1
0.51498,0.10965,1
0.53111,0.20906,1
0.59793,0.095029,1
0.73848,0.086257,1
0.83065,0.18275,1
0.8629,0.10965,1
0.88364,0.27924,1
0.93433,0.30848,1
0.93433,0.19444,1
0.92512,0.43421,1
0.87903,0.43421,1
0.87903,0.58626,1
0.9182,0.71491,1
0.85138,0.8348,1
0.85599,0.94006,1
0.70853,0.94298,1
0.70853,0.87281,1
0.59793,0.93129,1
0.61175,0.83187,1
0.78226,0.82895,1
0.78917,0.8962,1
0.90668,0.89912,1
0.14862,0.92251,1
0.15092,0.85819,1
0.097926,0.85819,1
0.079493,0.91374,1
0.079493,0.77632,1
0.10945,0.79678,1
0.12327,0.67982,1
0.077189,0.6886,1
0.081797,0.58626,1
0.14862,0.58041,1
0.14862,0.5307,1
0.14171,0.41959,1
0.08871,0.49269,1
0.095622,0.36696,1
0.24539,0.3962,1
0.1947,0.29678,1
0.16935,0.22368,1
0.15553,0.13596,1
0.23848,0.12427,1
0.33065,0.12427,1
0.095622,0.2617,1
0.091014,0.20322,1
@@ -0,0 +1,29 @@
# Import statements
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Read the data.
data = np.asarray(pd.read_csv('data.csv', header=None))
# Assign the features to the variable X, and the labels to the variable y.
X = data[:, 0:2]
y = data[:, 2]


# TODO: Create the decision tree model and assign it to the variable model.
# You won't need to, but if you'd like, play with hyperparameters such
# as max_depth and min_samples_leaf and see what they do to the decision
# boundary.
model = DecisionTreeClassifier(max_depth=7, min_samples_leaf=10)

# TODO: Fit the model.
model.fit(X, y)

# TODO: Make predictions. Store them in the variable y_pred.
y_pred = model.predict(X)
print(y_pred)

# TODO: Calculate the accuracy and assign it to the variable acc.
acc = accuracy_score(y, y_pred)
print(acc)
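
# Optional visualization sketch (not part of the quiz; it assumes matplotlib is
# available and uses the fact that both features in data.csv lie in [0, 1]):
# plot the decision boundary so you can see how changing max_depth and
# min_samples_leaf reshapes it.
import matplotlib.pyplot as plt

xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))
# Predict a class for every grid point and shade the two resulting regions.
grid_pred = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, grid_pred, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()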
@@ -0,0 +1,160 @@
#!/usr/bin/env python
# coding: utf-8

# # Lab: Titanic Survival Exploration with Decision Trees

# ## Getting Started
# In this lab, you will see how decision trees work by implementing a decision tree in sklearn.
#
# We'll start by loading the dataset and displaying some of its rows.

# In[6]:


# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display  # Allows the use of display() for DataFrames

# Pretty display for notebooks
# get_ipython().run_line_magic('matplotlib', 'inline')

# Set a random seed
import random
random.seed(42)

# Load the dataset
in_file = 'titanic_data.csv'
full_data = pd.read_csv(in_file)

# Print the first few entries of the RMS Titanic data
display(full_data.head())

# Recall that these are the various features present for each passenger on the ship:
# - **Survived**: Outcome of survival (0 = No; 1 = Yes)
# - **Pclass**: Socio-economic class (1 = Upper class; 2 = Middle class; 3 = Lower class)
# - **Name**: Name of the passenger
# - **Sex**: Sex of the passenger
# - **Age**: Age of the passenger (some entries contain `NaN`)
# - **SibSp**: Number of siblings and spouses of the passenger aboard
# - **Parch**: Number of parents and children of the passenger aboard
# - **Ticket**: Ticket number of the passenger
# - **Fare**: Fare paid by the passenger
# - **Cabin**: Cabin number of the passenger (some entries contain `NaN`)
# - **Embarked**: Port of embarkation of the passenger (C = Cherbourg; Q = Queenstown; S = Southampton)
#
# Since we're interested in the outcome of survival for each passenger or crew member, we can remove the **Survived** feature from this dataset and store it as its own separate variable `outcomes`. We will use these outcomes as our prediction targets.
# Run the code cell below to remove **Survived** as a feature of the dataset and store it in `outcomes`.

# In[7]:


# Store the 'Survived' feature in a new variable and remove it from the dataset
outcomes = full_data['Survived']
features_raw = full_data.drop('Survived', axis=1)

# Show the new dataset with 'Survived' removed
display(features_raw.head())


# The very same sample of the RMS Titanic data now shows the **Survived** feature removed from the DataFrame. Note that `features_raw` (the passenger data) and `outcomes` (the outcomes of survival) are now *paired*: for any passenger `features_raw.loc[i]`, the survival outcome is `outcomes[i]`.
#
# ## Preprocessing the data
#
# Now, let's do some data preprocessing. First, we'll remove the names of the passengers, and then one-hot encode the features.
#
# **Question:** Why would it be a terrible idea to one-hot encode the data without removing the names?
# (Answer: almost every passenger's name is unique, so one-hot encoding the `Name` column would create roughly one indicator column per passenger. The feature space would explode, and the model could simply memorize individual passengers instead of learning general patterns.)
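
# A quick check of that claim (illustration only, not part of the original lab):
# one-hot encoding just the Name column yields about one dummy column per row.
print(pd.get_dummies(features_raw['Name']).shape)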

# In[8]:


# Removing the names
features_no_names = features_raw.drop(['Name'], axis=1)

# One-hot encoding
features = pd.get_dummies(features_no_names)


# And now we'll fill in any blanks with zeroes.

# In[9]:


features = features.fillna(0.0)
display(features.head())


# ## (TODO) Training the model
#
# Now we're ready to train a model in sklearn. First, let's split the data into training and testing sets. Then we'll train the model on the training set.

# In[15]:


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, outcomes, test_size=0.2, random_state=42)


# In[17]:


# Import the classifier from sklearn
from sklearn.tree import DecisionTreeClassifier

# TODO: Define the classifier, and fit it to the data
model = DecisionTreeClassifier()
model.fit(X_train, y_train)


# ## Testing the model
# Now let's see how our model does: we'll calculate the accuracy over both the training and the testing sets.

# In[18]:


# Making predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate the accuracy
from sklearn.metrics import accuracy_score
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)


# # Exercise: Improving the model
#
# OK: high training accuracy and a lower testing accuracy, so we may be overfitting a bit.
#
# So now it's your turn to shine! Train a new model, and try to specify some parameters in order to improve the testing accuracy, such as:
# - `max_depth`
# - `min_samples_leaf`
# - `min_samples_split`
#
# You can use your intuition, trial and error, or even better, feel free to use Grid Search (a minimal sketch follows below)!
#
# **Challenge:** Try to get to 85% accuracy on the testing set. If you'd like a hint, take a look at the solutions notebook next.

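# A minimal Grid Search sketch (one possible approach, not the official solution):
# GridSearchCV tries every combination in param_grid with cross-validation and
# keeps the best-scoring estimator. The grid values below are just reasonable guesses.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [4, 6, 8, 10],
    'min_samples_leaf': [2, 4, 6, 8],
    'min_samples_split': [2, 4, 8, 16],
}
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5)
grid.fit(X_train, y_train)
print('Best parameters:', grid.best_params_)
print('Grid search test accuracy:', accuracy_score(y_test, grid.predict(X_test)))
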

# In[23]:


# TODO: Train the model
new_model = DecisionTreeClassifier(max_depth=10, min_samples_leaf=6, min_samples_split=8)
new_model.fit(X_train, y_train)

# TODO: Make predictions
new_y_train_pred = new_model.predict(X_train)
new_y_test_pred = new_model.predict(X_test)

# TODO: Calculate the accuracy
new_train_accuracy = accuracy_score(y_train, new_y_train_pred)
new_test_accuracy = accuracy_score(y_test, new_y_test_pred)

print(f'The training accuracy on the new model is {new_train_accuracy:.4f}')
print(f'The test accuracy on the new model is {new_test_accuracy:.4f}')