import numpy as np
import matplotlib.pyplot as plt
# Imported for its side effect: older matplotlib releases only register the
# '3d' projection once mpl_toolkits.mplot3d has been imported.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs


def simulate_data(n=500, features=10, centroids=3):
    '''
    Simulates n data points, each with a number of features equal to
    features, drawn around a number of centers equal to centroids.

    INPUT (defaults)
        n = number of rows (500)
        features = number of columns (10)
        centroids = number of centers (3)

    OUTPUT
        dataset = a dataset with the specified characteristics
                  (the blob labels returned by make_blobs are discarded)
    '''
    # random_state is fixed at 42, so the same arguments always produce the
    # same dataset.
    dataset, _ = make_blobs(n_samples=n,
                            n_features=features,
                            centers=centroids,
                            random_state=42)
    return dataset


def plot_data(data, labels):
    '''
    Plot the first three columns of data as a 3D scatter, with colors
    associated with labels.

    INPUT:
        data - 2D array; columns 0, 1 and 2 are used as x, y and z, so it
               must have at least three columns
        labels - one value per row of data, mapped to colors with the
                 'tab10' colormap
    '''
    fig = plt.figure()
    # Create the 3D axes through the figure so it is actually attached to it.
    # Constructing Axes3D(fig) directly is not reliably added to the figure on
    # recent matplotlib versions, which leaves the plot empty.
    ax = fig.add_subplot(projection='3d')
    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=labels, cmap='tab10')


# Module-level dataset (200 rows, 5 features, 4 centers) read by fit_mods().
data = simulate_data(200, 5, 4)


def get_kmeans_score(data, center):
    '''
    Returns the kmeans score regarding SSE for points to centers.

    INPUT:
        data - the dataset you want to fit kmeans to
        center - the number of centers you want (the k value)
    OUTPUT:
        score - the SSE score for the kmeans model fit to the data
    '''
    # Instantiate kmeans with the requested number of clusters.
    kmeans = KMeans(n_clusters=center)

    # Fit the model to the data.
    model = kmeans.fit(data)

    # KMeans.score returns the negative of the k-means objective, so the
    # absolute value is taken to report it as a positive SSE.
    score = np.abs(model.score(data))

    return score


def fit_mods():
    '''
    Fit kmeans to the module-level data for k = 1 through 10.

    OUTPUT:
        centers - the list of k values tried, [1, 2, ..., 10]
        scores - the SSE score from get_kmeans_score for each k, in the
                 same order as centers
    '''
    centers = list(range(1, 11))
    scores = [get_kmeans_score(data, center) for center in centers]

    return centers, scores