project 1 updated
143 python/Supervised Learning/Project/visuals.py Normal file
@@ -0,0 +1,143 @@
###########################################
# Suppress matplotlib user warnings
# Necessary for newer versions of matplotlib
import warnings
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
#
# Display inline matplotlib plots with IPython
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
###########################################

import matplotlib.pyplot as pl
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import f1_score, accuracy_score


def distribution(data, transformed = False):
    """
    Visualization code for displaying skewed distributions of features
    """

    # Create figure
    fig = pl.figure(figsize = (11, 5))

    # Skewed feature plotting
    for i, feature in enumerate(['capital-gain', 'capital-loss']):
        ax = fig.add_subplot(1, 2, i + 1)
        ax.hist(data[feature], bins = 25, color = '#00A0A0')
        ax.set_title("'%s' Feature Distribution" % (feature), fontsize = 14)
        ax.set_xlabel("Value")
        ax.set_ylabel("Number of Records")
        ax.set_ylim((0, 2000))
        ax.set_yticks([0, 500, 1000, 1500, 2000])
        ax.set_yticklabels([0, 500, 1000, 1500, ">2000"])

    # Plot aesthetics
    if transformed:
        fig.suptitle("Log-transformed Distributions of Continuous Census Data Features",
                     fontsize = 16, y = 1.03)
    else:
        fig.suptitle("Skewed Distributions of Continuous Census Data Features",
                     fontsize = 16, y = 1.03)

    fig.tight_layout()
    # pl.show() works on all backends; fig.show() requires a GUI event loop
    pl.show()

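# Example usage (a hypothetical sketch, not part of the project files):
# assuming the census data has been loaded into a pandas DataFrame `data`
# containing 'capital-gain' and 'capital-loss' columns.
#
#   data = pd.read_csv("census.csv")            # assumed filename
#   distribution(data)                          # raw, skewed distributions
#   data_log = data.copy()
#   data_log[['capital-gain', 'capital-loss']] = \
#       data[['capital-gain', 'capital-loss']].apply(lambda x: np.log(x + 1))
#   distribution(data_log, transformed = True)  # log-transformed view
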
def evaluate(results, accuracy, f1):
    """
    Visualization code to display results of various learners.

    inputs:
      - results: a dictionary of the statistic results from 'train_predict()',
        keyed by learner name
      - accuracy: the accuracy score for the naive predictor
      - f1: the F-score for the naive predictor
    """

    # Create figure
    fig, ax = pl.subplots(2, 3, figsize = (11, 7))

    # Constants
    bar_width = 0.3
    colors = ['#A00000', '#00A0A0', '#00A000']

    # Super loop to plot six panels of data
    for k, learner in enumerate(results.keys()):
        for j, metric in enumerate(['train_time', 'acc_train', 'f_train', 'pred_time', 'acc_test', 'f_test']):
            for i in np.arange(3):

                # Creative plot code
                ax[j//3, j%3].bar(i + k*bar_width, results[learner][i][metric], width = bar_width, color = colors[k])
                ax[j//3, j%3].set_xticks([0.45, 1.45, 2.45])
                ax[j//3, j%3].set_xticklabels(["1%", "10%", "100%"])
                ax[j//3, j%3].set_xlabel("Training Set Size")
                ax[j//3, j%3].set_xlim((-0.1, 3.0))

    # Add unique y-labels
    ax[0, 0].set_ylabel("Time (in seconds)")
    ax[0, 1].set_ylabel("Accuracy Score")
    ax[0, 2].set_ylabel("F-score")
    ax[1, 0].set_ylabel("Time (in seconds)")
    ax[1, 1].set_ylabel("Accuracy Score")
    ax[1, 2].set_ylabel("F-score")

    # Add titles
    ax[0, 0].set_title("Model Training")
    ax[0, 1].set_title("Accuracy Score on Training Subset")
    ax[0, 2].set_title("F-score on Training Subset")
    ax[1, 0].set_title("Model Predicting")
    ax[1, 1].set_title("Accuracy Score on Testing Set")
    ax[1, 2].set_title("F-score on Testing Set")

    # Add horizontal lines for naive predictors; axhline spans the full axis
    # width by default (its xmin/xmax are axes fractions in [0, 1], not data
    # coordinates)
    ax[0, 1].axhline(y = accuracy, linewidth = 1, color = 'k', linestyle = 'dashed')
    ax[1, 1].axhline(y = accuracy, linewidth = 1, color = 'k', linestyle = 'dashed')
    ax[0, 2].axhline(y = f1, linewidth = 1, color = 'k', linestyle = 'dashed')
    ax[1, 2].axhline(y = f1, linewidth = 1, color = 'k', linestyle = 'dashed')

    # Set y-limits for score panels
    ax[0, 1].set_ylim((0, 1))
    ax[0, 2].set_ylim((0, 1))
    ax[1, 1].set_ylim((0, 1))
    ax[1, 2].set_ylim((0, 1))

    # Create patches for the legend
    patches = []
    for i, learner in enumerate(results.keys()):
        patches.append(mpatches.Patch(color = colors[i], label = learner))
    pl.legend(handles = patches, bbox_to_anchor = (-.80, 2.53),
              loc = 'upper center', borderaxespad = 0., ncol = 3, fontsize = 'x-large')

    # Aesthetics
    pl.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize = 16, y = 1.10)
    pl.tight_layout()
    pl.show()

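# Shape of the `results` argument that evaluate() expects, inferred from the
# indexing above (an illustrative sketch; the values below are made up):
# a dict keyed by learner name, each mapping to three dicts of metrics from
# 'train_predict()', one per training-set size.
#
#   results = {
#       'DecisionTreeClassifier': [
#           {'train_time': 0.01, 'pred_time': 0.001,   # 1% of training data
#            'acc_train': 0.97, 'acc_test': 0.80,
#            'f_train': 0.95, 'f_test': 0.62},
#           {...},                                     # 10% of training data
#           {...},                                     # 100% of training data
#       ],
#       # ... two more learners, one per color above
#   }
#   evaluate(results, accuracy, f1)  # accuracy/f1: naive-predictor baselines
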
def feature_plot(importances, X_train, y_train):
    """
    Visualization code to display the five most important features of a model.
    """

    # Display the five most important features
    indices = np.argsort(importances)[::-1]
    columns = X_train.columns.values[indices[:5]]
    values = importances[indices][:5]

    # Create the plot
    fig = pl.figure(figsize = (9, 5))
    pl.title("Normalized Weights for First Five Most Predictive Features", fontsize = 16)
    pl.bar(np.arange(5), values, width = 0.6, align = "center", color = '#00A000',
           label = "Feature Weight")
    pl.bar(np.arange(5) - 0.3, np.cumsum(values), width = 0.2, align = "center", color = '#00A0A0',
           label = "Cumulative Feature Weight")
    pl.xticks(np.arange(5), columns)
    pl.xlim((-0.5, 4.5))
    pl.ylabel("Weight", fontsize = 12)
    pl.xlabel("Feature", fontsize = 12)

    pl.legend(loc = 'upper center')
    pl.tight_layout()
    pl.show()

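# Example usage (a hypothetical sketch): feature_plot() works with any fitted
# scikit-learn estimator that exposes a `feature_importances_` attribute,
# e.g. tree ensembles. X_train must be a pandas DataFrame so that
# `X_train.columns` is available.
#
#   from sklearn.ensemble import RandomForestClassifier
#   model = RandomForestClassifier(random_state = 0).fit(X_train, y_train)
#   feature_plot(model.feature_importances_, X_train, y_train)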