Skip to content

Commit 5edee54

Browse files
add kmeans
1 parent 45157ab commit 5edee54

6 files changed

Lines changed: 448 additions & 0 deletions

File tree

kmeans/.DS_Store

6 KB
Binary file not shown.

kmeans/kmeans.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
from collections import defaultdict
2+
from random import uniform
3+
from sklearn import datasets
4+
from utils.data_generater import *
5+
import time
6+
7+
8+
9+
def point_avg(points):
    """Return the centroid of the points belonging to one cluster.

    Args:
        points: Sequence of points that all have the same number of
            dimensions.

    Returns:
        The mean of ``points`` taken along axis 0, i.e. one averaged
        value per dimension.
    """
    centroid = np.mean(points, 0)
    return centroid
13+
14+
15+
def update_centers(data_set, assignments):
    """Recompute every cluster center from the points assigned to it.

    Args:
        data_set: Iterable of points, all with the same dimensionality.
        assignments: Cluster label of each point, parallel to ``data_set``.

    Returns:
        A list of centers (per-dimension means) ordered by ascending
        cluster label. A label that has no points assigned to it produces
        no entry in the list.
    """
    members = defaultdict(list)
    for label, point in zip(assignments, data_set):
        members[label].append(point)

    # Emit centers in label order rather than in the order labels first
    # appear in ``assignments``; otherwise the position of a center in the
    # returned list would not correspond to the label it was computed for.
    return [np.mean(members[label], axis=0) for label in sorted(members)]
28+
29+
def update_error(data_set, assignments):
    """Compute the clustering error for the current assignments.

    Points are grouped by label; for each group the deviations from the
    group's centroid are squared, summed over all points and dimensions,
    and square-rooted. The per-group values are then added together.

    Args:
        data_set: Iterable of points, all with the same dimensionality.
        assignments: Cluster label of each point, parallel to ``data_set``.

    Returns:
        The summed per-cluster error (the int ``0`` when there are no
        points at all).
    """
    clusters = defaultdict(list)
    for label, point in zip(assignments, data_set):
        clusters[label].append(point)

    # Accumulate from the int 0 in cluster insertion order.
    return sum(
        (
            np.sqrt(np.sum(np.square(members - point_avg(members))))
            for members in clusters.values()
        ),
        0,
    )
42+
43+
def assign_points(data_set, centers):
    """Assign every point to its nearest center.

    Args:
        data_set: Iterable of points from the original data set.
        centers: Sequence of all current cluster centers.

    Returns:
        A list with one entry per point: the index into ``centers`` of
        the center with the smallest ``distance`` to that point.
    """
    labels = []
    for point in data_set:
        # Index 0 is the fallback when no center beats +infinity.
        best_index, best_distance = 0, float("inf")
        for index, center in enumerate(centers):
            candidate = distance(point, center)
            # Strict "<" keeps the lowest-index center on ties.
            if candidate < best_distance:
                best_index, best_distance = index, candidate
        labels.append(best_index)
    return labels
57+
58+
59+
def distance(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Args:
        a: First point; must support ``a - b``.
        b: Second point, same dimensionality as ``a``.

    Returns:
        The square root of the sum of squared coordinate differences.
    """
    offset = a - b
    squared_length = np.sum(np.square(offset))
    return np.sqrt(squared_length)
61+
62+
63+
def generate_k(data_set, k):
    """Draw ``k`` random initial centers inside the data's bounding box.

    For every dimension the minimum and maximum over all points are
    determined, and each coordinate of each center is sampled uniformly
    between those two bounds.

    Args:
        data_set: Non-empty sequence of points that all have the same
            number of dimensions.
        k: Number of centers to generate.

    Returns:
        A list of ``k`` centers, each a list with one coordinate per
        dimension.

    Raises:
        IndexError: If ``data_set`` contains no points.
    """
    if len(data_set) == 0:
        # Same exception type that indexing the first point would raise.
        raise IndexError("generate_k() requires a non-empty data_set")

    points = np.asarray(data_set)
    # Vectorized per-dimension bounds; this avoids a Python-level loop over
    # every coordinate of every point.
    lows = points.min(axis=0).tolist()
    highs = points.max(axis=0).tolist()

    # Draw order is center by center, then dimension by dimension.
    return [
        [uniform(low, high) for low, high in zip(lows, highs)]
        for _ in range(k)
    ]
90+
91+
92+
def k_means(dataset, k):
    """Cluster ``dataset`` into ``k`` groups.

    Starts from centers produced by ``generate_k`` and then alternates
    between recomputing the centers and reassigning the points until an
    assignment pass returns the same labels as the previous one. The
    current error is printed once per iteration.

    Args:
        dataset: Sequence of points to cluster.
        k: Number of initial centers to generate.

    Returns:
        A ``zip`` iterator of ``(label, point)`` pairs.
    """
    labels = assign_points(dataset, generate_k(dataset, k))
    while True:
        print("error", update_error(dataset, labels))
        previous = labels
        labels = assign_points(dataset, update_centers(dataset, previous))
        if labels == previous:
            break
    return zip(labels, dataset)
103+
104+
105+
# NOTE(review): everything below runs at import time, exactly as before.
# Consider moving it under an ``if __name__ == "__main__":`` guard if this
# module is ever imported for its functions only.

# Demo 1: cluster the iris data set into 3 groups and print each
# (label, point) pair.
iris = datasets.load_iris()
for labelled_point in k_means(iris.data, 3):
    print(labelled_point)


# Demo 2: time k-means on synthetic data. Each of the numClusters groups
# is sampled by makeRandomPoint with its mean shifted by 5 per group.
numPoints = 10000
dim = 1000
numClusters = 10
num = numPoints // numClusters  # points per group; constant across groups
pointList = [makeRandomPoint(num, dim, 5 * i) for i in range(numClusters)]

# Stack the per-group arrays directly instead of converting every value to
# a Python float and back again.
start = time.perf_counter()
config = k_means(np.concatenate(pointList), numClusters)
print("Time taken:", time.perf_counter() - start)

kmeans/notebook/plot_cluster_iris.ipynb

Lines changed: 192 additions & 0 deletions
Large diffs are not rendered by default.

kmeans/notebook/plot_kmeans_assumptions.ipynb

Lines changed: 129 additions & 0 deletions
Large diffs are not rendered by default.
323 Bytes
Binary file not shown.

utils/data_generater.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
import numpy as np
2+
3+
def makeRandomPoint(num, dim, upper):
    """Sample ``num`` random points with ``dim`` dimensions each.

    Args:
        num: Number of points (rows) to generate.
        dim: Number of dimensions (columns) per point.
        upper: Passed as ``loc`` to ``np.random.normal``, i.e. it is the
            mean of the distribution, not an upper bound.

    Returns:
        An array of shape ``(num, dim)`` drawn from a normal distribution
        centered at ``upper`` with NumPy's default scale.
    """
    shape = (num, dim)
    return np.random.normal(loc=upper, size=shape)

0 commit comments

Comments
 (0)