# Import, read, and split data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

data = pd.read_csv('data.csv')
X = np.array(data[['x1', 'x2']])
y = np.array(data['y'])

# Fix random seed
np.random.seed(55)

# TODO: Uncomment one of the three classifiers, and hit "Test Run"
# to see the learning curve. Use these to answer the quiz below.

# Logistic Regression
estimator0 = LogisticRegression(solver='lbfgs')

# Gradient Boosting (the original comment said "Decision Tree", but the
# code actually builds a GradientBoostingClassifier)
estimator1 = GradientBoostingClassifier()

# Support Vector Machine
estimator2 = SVC(kernel='rbf', gamma=1000)


def randomize(X, Y):
    """Shuffle X and Y with a shared random permutation."""
    permutation = np.random.permutation(Y.shape[0])
    X2 = X[permutation, :]
    Y2 = Y[permutation]
    return X2, Y2


X2, y2 = randomize(X, y)


def draw_learning_curves(X, y, estimator, num_trainings):
    """Plot training and cross-validation scores against training-set size."""
    cv = ShuffleSplit(n_splits=5, test_size=0.3)
    # Note: the original version ignored the X, y parameters and used the
    # global X2, y2 here; the parameters are used now.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=1,
        train_sizes=np.linspace(.1, 1.0, num_trainings))

    # Raw cross-validation scores, one row per training-set size
    print(test_scores)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()
    plt.title("Learning Curves")
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # Shade one standard deviation around each curve
    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1, color='r')
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1, color='g')

    plt.plot(train_sizes, train_scores_mean, 'o-', color='r',
             label='Training Score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g',
             label='Cross-Validation Score')

    plt.legend(loc="best")
    plt.show()


draw_learning_curves(X2, y2, estimator2, 10)

# Inspect the cross-validation strategy: ShuffleSplit yields a pair of
# train/test index arrays for each of its 5 random splits.
cv = ShuffleSplit(n_splits=5, test_size=0.3)
print(cv)
for train_index, test_index in cv.split(X2):
    print(train_index, test_index)
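
# Optional sketch (not part of the original quiz): draw the learning curve for
# all three classifiers in turn so their curves can be compared directly.
# This assumes the same data.csv file and reuses the draw_learning_curves
# helper defined above; each call opens its own figure window.
for name, estimator in [('Logistic Regression', estimator0),
                        ('Gradient Boosting', estimator1),
                        ('SVM (rbf kernel, gamma=1000)', estimator2)]:
    print('Drawing learning curves for:', name)
    draw_learning_curves(X2, y2, estimator, 10)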