1+ from collections import defaultdict
2+ from random import uniform
3+ from sklearn import datasets
4+ from utils .data_generater import *
5+ import time
6+
7+
8+
def point_avg(points):
    """Return the mean of ``points`` taken along axis 0.

    Args:
        points: The points of a single cluster, all with the same number of
            dimensions (one point per entry along axis 0).

    Returns:
        The element-wise average of the points, i.e. the cluster's center.
    """
    center = np.mean(points, axis=0)
    return center
13+
14+
def update_centers(data_set, assignments):
    """Compute the new center of every cluster that currently has points.

    Args:
        data_set: The points of the data set.
        assignments: The cluster label currently assigned to each point, in
            the same order as ``data_set``.

    Returns:
        A list with one center (mean along axis 0 of the cluster's points)
        per label that occurs in ``assignments``, ordered by ascending label.
        When the labels in use are exactly ``0..n-1``, ``centers[i]`` is
        therefore the center of cluster ``i``. A label that has no points
        produces no center, so the list can be shorter than the number of
        clusters originally requested.
    """
    grouped = defaultdict(list)
    for label, point in zip(assignments, data_set):
        grouped[label].append(point)

    # Emit the centers in label order rather than in the order the labels
    # happen to appear first in the data; otherwise the position of a center
    # in the list would not match the cluster label it belongs to.
    return [np.mean(grouped[label], axis=0) for label in sorted(grouped)]
28+
def update_error(data_set, assignments):
    """Compute the clustering error for the current assignment.

    Args:
        data_set: The points of the data set.
        assignments: The cluster label currently assigned to each point, in
            the same order as ``data_set``.

    Returns:
        The sum, over all clusters that have points, of the square root of
        the total squared deviation of the cluster's points from their mean.
        Note that the square root is applied once per cluster, not per point.
        Returns ``0`` when there are no points.
    """
    clusters = {}
    for label, sample in zip(assignments, data_set):
        clusters.setdefault(label, []).append(sample)

    total = 0
    for members in clusters.values():
        center = point_avg(members)
        # Subtracting the center from the list of member points broadcasts
        # over every point of the cluster.
        squared_deviation = np.sum(np.square(members - center))
        total += np.sqrt(squared_deviation)
    return total
42+
def assign_points(data_set, centers):
    """Assign every point to its nearest center.

    Args:
        data_set: The original data set (an iterable of points).
        centers: All current center points.

    Returns:
        A list holding, for each point in ``data_set`` order, the index into
        ``centers`` of the closest center as measured by ``distance``. On
        ties the lowest index wins; when no center is strictly closer than
        infinity (for example when ``centers`` is empty) the index is ``0``.
    """
    labels = []
    for sample in data_set:
        best_index = 0
        best_dist = float("inf")  # positive infinity
        for index, center in enumerate(centers):
            dist = distance(sample, center)
            if dist < best_dist:
                best_index, best_dist = index, dist
        labels.append(best_index)
    return labels
57+
58+
def distance(a, b):
    """Return the Euclidean distance between ``a`` and ``b``.

    Computed as the square root of the sum of the squared element-wise
    differences ``a - b``.
    """
    delta = a - b
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)
61+
62+
def generate_k(data_set, k):
    """Generate ``k`` random starting centers inside the data's bounding box.

    Args:
        data_set: The points of the data set; the number of dimensions is
            taken from the first point.
        k: The number of centers to generate.

    Returns:
        A list of ``k`` points, each a list with one coordinate per
        dimension, drawn with ``uniform`` between the minimum and maximum
        value observed in the data set for that dimension.
    """
    n_dims = len(data_set[0])

    # Per-dimension (min, max) over all points.
    bounds = []
    for axis in range(n_dims):
        column = [row[axis] for row in data_set]
        bounds.append((min(column), max(column)))

    # Coordinates are drawn center by center, dimension by dimension.
    return [[uniform(low, high) for low, high in bounds] for _ in range(k)]
90+
91+
def k_means(dataset, k):
    """Cluster ``dataset`` by repeated assign/update steps until stable.

    Starts from ``k`` centers produced by ``generate_k``, then repeatedly
    recomputes the centers with ``update_centers`` and reassigns the points
    with ``assign_points`` until two consecutive assignments are equal. The
    value of ``update_error`` is printed once per iteration.

    Args:
        dataset: The points to cluster.
        k: The number of initial centers to generate.

    Returns:
        A ``zip`` iterator of ``(cluster index, point)`` pairs in ``dataset``
        order.
    """
    labels = assign_points(dataset, generate_k(dataset, k))
    while True:
        print("error", update_error(dataset, labels))
        previous = labels
        labels = assign_points(dataset, update_centers(dataset, previous))
        if labels == previous:
            # The assignment no longer changes: converged.
            break
    return zip(labels, dataset)
103+
104+
# Demo 1: cluster the iris samples into three groups and print every
# (cluster index, sample) pair yielded by k_means.
iris = datasets.load_iris()
for k in k_means(iris.data, 3):
    print(k)


# Demo 2: time k_means on synthetic data built from numClusters batches.
# makeRandomPoint comes from the star import of utils.data_generater; its
# exact semantics are not visible here -- presumably it returns `num` points
# of dimension `dim`, positioned according to the third argument (TODO confirm).
numPoints = 10000
dim = 1000
numClusters = 10
pointList = []
k = 0
for i in range(numClusters):
    num = int(numPoints / numClusters)
    p = makeRandomPoint(num, dim, k)
    pointList.extend(p.tolist())
    k += 5  # the third argument grows by 5 for each successive batch

start = time.time()
config = k_means(np.array(pointList), numClusters)
print("Time taken:", time.time() - start)
0 commit comments