K-means clustering and normalization
kmtest.csv:
2.000000 4.000000 | |
3.000000 3.000000 | |
3.000000 4.000000 | |
3.000000 5.000000 | |
4.000000 3.000000 | |
4.000000 5.000000 | |
9.000000 4.000000 | |
9.000000 5.000000 | |
9.000000 9.000000 | |
9.000000 10.000000 | |
10.000000 4.000000 | |
10.000000 5.000000 | |
10.000000 9.000000 | |
10.000000 10.000000 | |
11.000000 10.000000 | |
15.000000 4.000000 | |
15.000000 5.000000 | |
15.000000 6.000000 | |
16.000000 4.000000 | |
16.000000 5.000000 | |
16.000000 6.000000 |
# (a.) Without Normalization
# coding: utf-8

# In[1]:
import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:
    # The clustering helpers in this section only need NumPy, so keep them
    # importable without matplotlib; plt is None in that case.
    plt = None

# In[2]:
import random


# initialize centroids
def kMeansInitCentroids(X, k):
    """Choose k distinct rows of X at random as the initial centroids.

    Parameters
    ----------
    X : array-like, 2-D
        Data set, one sample per row.
    k : int
        Number of centroids to draw.

    Returns
    -------
    numpy.ndarray
        Array with k rows, each a copy of a different row of X.

    Raises
    ------
    ValueError
        If k is larger than the number of rows in X, since k distinct
        rows cannot be drawn.
    """
    X = np.asarray(X)
    num_samples = X.shape[0]
    if k > num_samples:
        raise ValueError(
            f"cannot pick {k} distinct centroids from {num_samples} samples"
        )
    # replace must be the boolean False: any non-empty string (such as
    # 'false') is truthy and would allow the same row to be drawn twice.
    chosen_rows = np.random.choice(num_samples, k, replace=False)
    return X[chosen_rows, :]


# In[3]:
# find closest centroid
def findClosestCentroids(X, centroids):
    """Assign every sample to its nearest centroid.

    Distances are squared Euclidean distances. When several centroids are
    equally close, the one with the lowest index wins. A centroid whose
    distance evaluates to NaN is never selected unless every distance for
    that sample is NaN, in which case index 0 is returned.

    Parameters
    ----------
    X : array-like, 2-D
        Data set, one sample per row.
    centroids : array-like, 2-D
        Current centroids, one per row, with as many columns as X.

    Returns
    -------
    numpy.ndarray
        Float array with one row per sample and a single column holding
        the index of the closest centroid.
    """
    X = np.asarray(X)
    centroids = np.asarray(centroids)

    # Broadcast (m, 1, n) against (1, k, n) so that one subtraction yields
    # every sample/centroid difference; summing the squares over the
    # feature axis leaves an (m, k) matrix of squared distances.
    diffs = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    sq_dists = np.sum(diffs ** 2, axis=2)

    # argmin would treat NaN as the minimum; push NaN distances to +inf so
    # an undefined centroid cannot attract every sample.
    sq_dists = np.where(np.isnan(sq_dists), np.inf, sq_dists)

    nearest = np.argmin(sq_dists, axis=1)
    return nearest.reshape(-1, 1).astype(float)


# In[4]:
# computes centroid
def computeCentroids(X, idx, k):
    """Recompute each centroid as the mean of the samples assigned to it.

    Parameters
    ----------
    X : array-like, 2-D
        Data set, one sample per row.
    idx : array-like
        Cluster index of every sample, either flat or as a single column
        (the layout returned by findClosestCentroids).
    k : int
        Number of clusters.

    Returns
    -------
    numpy.ndarray
        Float array with k rows and as many columns as X. The row of a
        cluster without any assigned sample is left at zero and
        "divided by 0" is printed for it.
    """
    X = np.asarray(X)
    _, num_features = X.shape
    labels = np.asarray(idx).reshape(-1)

    centroids = np.zeros((k, num_features))
    for cluster in range(k):
        members = X[labels == cluster]
        count = members.shape[0]
        if count == 0:
            # Dividing a NumPy row by zero yields NaN instead of raising,
            # so an empty cluster has to be detected explicitly.
            print("divided by 0")
            continue
        centroids[cluster, :] = members.sum(axis=0) / count
    return centroids

# ... See the full answer