59 lines
1.7 KiB
Python
59 lines
1.7 KiB
Python
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
from mpl_toolkits.mplot3d import Axes3D
|
|
from sklearn.cluster import KMeans
|
|
from sklearn.datasets import make_blobs
|
|
|
|
def simulate_data(n = 500, features = 10, centroids = 3):
    '''
    Generate a reproducible blob dataset with make_blobs (random_state fixed at 42)

    INPUT (defaults)
        n = number of rows to generate (500)
        features = number of columns per row (10)
        centroids = number of blob centers (3)

    OUTPUT
        dataset = the generated samples; the labels make_blobs also returns are discarded
    '''
    # make_blobs returns (samples, labels); only the samples are handed back.
    points, _ = make_blobs(
        n_samples=n,
        n_features=features,
        centers=centroids,
        random_state=42,
    )
    return points
|
|
|
|
def plot_data(data, labels):
    '''
    Plot the first three columns of data as a 3D scatter, colored by labels

    INPUT:
        data - 2-D array; columns 0, 1 and 2 are used as the x, y and z coordinates,
               so it must have at least three columns
        labels - per-point values passed to scatter as c and mapped through the 'tab10' colormap

    OUTPUT:
        None - the scatter is drawn on a newly created figure
    '''
    fig = plt.figure()

    # Create the 3D axes through the figure so they are registered with it.
    # Constructing Axes3D(fig) directly no longer attaches the axes to the
    # figure on current matplotlib releases, which leaves the figure empty.
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=labels, cmap='tab10')
|
|
|
|
# Module-level dataset: 200 rows, 5 features, 4 centers. fit_mods() reads this global.
data = simulate_data(n=200, features=5, centroids=4)
|
|
|
|
def get_kmeans_score(data, center):
    '''
    Fit a KMeans model with the requested number of clusters and return its score on the same data

    INPUT:
        data - the dataset to fit kmeans to
        center - the number of clusters to use (the k value)

    OUTPUT:
        score - absolute value of the fitted model's score on data
                (the SSE of points to their centers)
    '''
    # Build the estimator and fit it in one step.
    fitted = KMeans(n_clusters=center).fit(data)

    # The model's score is passed through np.abs so the result is non-negative.
    return np.abs(fitted.score(data))
|
|
|
|
def fit_mods():
    '''
    Compute the kmeans score of the module-level data for every k from 1 through 10

    OUTPUT:
        centers - the list of k values tried, 1 through 10
        scores - the get_kmeans_score result for each k, in the same order as centers
    '''
    centers = list(range(1, 11))

    # One score per candidate k, evaluated in ascending order of k.
    scores = [get_kmeans_score(data, k) for k in centers]

    return centers, scores
|