adding all work done so far (lessons 1 - 5)

2019-07-10 19:58:53 +01:00
parent 8085149a49
commit b982957daf
37 changed files with 19407 additions and 0 deletions

@@ -0,0 +1,160 @@
#!/usr/bin/env python
# coding: utf-8
# # Lab: Titanic Survival Exploration with Decision Trees
# ## Getting Started
# In this lab, you will see how decision trees work by training a decision tree classifier in sklearn.
#
# We'll start by loading the dataset and displaying some of its rows.
# In[6]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
# from IPython.display import display # Allows the use of display() for DataFrames
# Pretty display for notebooks
# get_ipython().run_line_magic('matplotlib', 'inline')
# Set a random seed
import random
random.seed(42)
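# Note: `random.seed` only seeds Python's built-in RNG. scikit-learn draws its randomness from
# NumPy, so where reproducibility matters we also pass `random_state=42` explicitly
# (e.g. to `train_test_split` further down).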
# Load the dataset
in_file = 'titanic_data.csv'
full_data = pd.read_csv(in_file)
# Print the first few entries of the RMS Titanic data
# display(full_data.head())
# Recall that these are the various features present for each passenger on the ship:
# - **Survived**: Outcome of survival (0 = No; 1 = Yes)
# - **Pclass**: Socio-economic class (1 = Upper class; 2 = Middle class; 3 = Lower class)
# - **Name**: Name of passenger
# - **Sex**: Sex of the passenger
# - **Age**: Age of the passenger (Some entries contain `NaN`)
# - **SibSp**: Number of siblings and spouses of the passenger aboard
# - **Parch**: Number of parents and children of the passenger
# - **Ticket**: Ticket number of the passenger
# - **Fare**: Fare paid by the passenger
# - **Cabin**: Cabin number of the passenger (Some entries contain `NaN`)
# - **Embarked**: Port of embarkation of the passenger (C = Cherbourg; Q = Queenstown; S = Southampton)
#
# Since we're interested in the outcome of survival for each passenger or crew member, we can remove the **Survived** feature from this dataset and store it as its own separate variable `outcomes`. We will use these outcomes as our prediction targets.
# Run the code cell below to remove **Survived** as a feature of the dataset and store it in `outcomes`.
# In[7]:
# Store the 'Survived' feature in a new variable and remove it from the dataset
outcomes = full_data['Survived']
features_raw = full_data.drop('Survived', axis=1)
# Show the new dataset with 'Survived' removed
# display(features_raw.head())
# The very same sample of the RMS Titanic data now shows the **Survived** feature removed from the DataFrame. Note that `features_raw` (the passenger data) and `outcomes` (the survival outcomes) are now *paired*: for any passenger `features_raw.loc[i]`, the corresponding survival outcome is `outcomes[i]`.
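# For example (a quick check, assuming the variables defined in the cell above):
# print(features_raw.loc[0])   # data for the first passenger
# print(outcomes[0])           # that passenger's survival outcome (0 or 1)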
#
# ## Preprocessing the data
#
# Now, let's do some data preprocessing. First, we'll remove the names of the passengers, and then one-hot encode the features.
#
# **Question:** Why would it be a terrible idea to one-hot encode the data without removing the names?
# (Answer: passenger names are essentially unique, so one-hot encoding them would create a separate column for nearly every passenger, adding hundreds of features that the model cannot generalize from.)
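# A quick illustration (a sketch, assuming `features_raw` from the cell above): since names are
# essentially unique, one-hot encoding them would add roughly one column per passenger.
# print(features_raw['Name'].nunique(), 'unique names in', len(features_raw), 'rows')
# print(pd.get_dummies(features_raw[['Name']]).shape)  # roughly one dummy column per row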
# In[8]:
# Removing the names
features_no_names = features_raw.drop(['Name'], axis=1)
# One-hot encoding
features = pd.get_dummies(features_no_names)
# And now we'll fill in any blanks with zeroes.
# In[9]:
features = features.fillna(0.0)
# display(features.head())
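# Optional sanity check (a sketch): confirm that no missing values remain after the fill.
# assert not features.isnull().values.any()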
# ## (TODO) Training the model
#
# Now we're ready to train a model in sklearn. First, let's split the data into training and testing sets. Then we'll train the model on the training set.
# In[15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, outcomes, test_size=0.2, random_state=42)
# In[17]:
# Import the classifier from sklearn
from sklearn.tree import DecisionTreeClassifier
# TODO: Define the classifier, and fit it to the data
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
# ## Testing the model
# Now let's see how our model does: we'll calculate its accuracy on both the training and the testing sets.
# In[18]:
# Making predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
# Calculate the accuracy
from sklearn.metrics import accuracy_score
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)
# # Exercise: Improving the model
#
# OK: the training accuracy is high (an unconstrained tree often fits the training set almost perfectly) while the testing accuracy is noticeably lower, so the model is probably overfitting.
#
# So now it's your turn to shine! Train a new model, and try to specify some parameters in order to improve the testing accuracy, such as:
# - `max_depth`
# - `min_samples_leaf`
# - `min_samples_split`
#
# You can use your intuition, trial and error, or, even better, feel free to use Grid Search (see the commented-out sketch below)!
#
# **Challenge:** Try to get to 85% accuracy on the testing set. If you'd like a hint, take a look at the solutions notebook next.
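# A minimal Grid Search sketch (optional; the parameter ranges below are illustrative guesses,
# not the lab's reference solution):
# from sklearn.model_selection import GridSearchCV
# param_grid = {'max_depth': [4, 6, 8, 10],
#               'min_samples_leaf': [2, 4, 6, 8],
#               'min_samples_split': [2, 6, 10]}
# grid = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
# grid.fit(X_train, y_train)
# print(grid.best_params_, grid.best_score_)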
# In[23]:
# TODO: Train the model
new_model = DecisionTreeClassifier(max_depth=10, min_samples_leaf=6, min_samples_split=8)
new_model.fit(X_train, y_train)
# TODO: Make predictions
new_y_train_pred = new_model.predict(X_train)
new_y_test_pred = new_model.predict(X_test)
# TODO: Calculate the accuracy
new_train_accuracy = accuracy_score(y_train, new_y_train_pred)
new_test_accuracy = accuracy_score(y_test, new_y_test_pred)
print(f'The training accuracy on the new model is {new_train_accuracy:.4f}')
print(f'The test accuracy on the new model is {new_test_accuracy:.4f}')