Skip to content

Commit ae69fb3

Browse files
committed
🚩新增文本聚类模块(k-means和repeated bisection)
1 parent d1a1b95 commit ae69fb3

8 files changed

Lines changed: 1169 additions & 0 deletions

File tree

Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
1+
/*
2+
* <author>Han He</author>
3+
* <email>me@hankcs.com</email>
4+
* <create-date>2018-08-12 7:11 PM</create-date>
5+
*
6+
* <copyright file="Cluster.java">
7+
* Copyright (c) 2018, Han He. All Rights Reserved, http://www.hankcs.com/
8+
* This source is subject to Han He. Please contact Han He for more information.
9+
* </copyright>
10+
*/
11+
package com.hankcs.hanlp.mining.cluster;
12+
13+
import java.util.*;
14+
15+
/**
16+
* @author hankcs
17+
*/
18+
public class Cluster<K> implements Comparable<Cluster<K>>
19+
{
20+
List<Document<K>> documents_; ///< documents
21+
SparseVector composite_; ///< a composite SparseVector
22+
SparseVector centroid_; ///< a centroid SparseVector
23+
List<Cluster<K>> sectioned_clusters_; ///< sectioned clusters
24+
double sectioned_gain_; ///< a sectioned gain
25+
Random random;
26+
27+
public Cluster()
28+
{
29+
this(new ArrayList<Document<K>>());
30+
}
31+
32+
public Cluster(List<Document<K>> documents)
33+
{
34+
this.documents_ = documents;
35+
composite_ = new SparseVector();
36+
random = new Random();
37+
}
38+
39+
/**
40+
* Add the vectors of all documents to a composite vector.
41+
*/
42+
void set_composite_vector()
43+
{
44+
composite_.clear();
45+
for (Document<K> document : documents_)
46+
{
47+
composite_.add_vector(document.feature());
48+
}
49+
}
50+
51+
/**
52+
* Clear status.
53+
*/
54+
void clear()
55+
{
56+
documents_.clear();
57+
composite_.clear();
58+
if (centroid_ != null)
59+
centroid_.clear();
60+
if (sectioned_clusters_ != null)
61+
sectioned_clusters_.clear();
62+
sectioned_gain_ = 0.0;
63+
}
64+
65+
66+
/**
67+
* Get the size.
68+
*
69+
* @return the size of this cluster
70+
*/
71+
int size()
72+
{
73+
return documents_.size();
74+
}
75+
76+
/**
77+
* Get the pointer of a centroid vector.
78+
*
79+
* @return the pointer of a centroid vector
80+
*/
81+
SparseVector centroid_vector()
82+
{
83+
if (documents_.size() > 0 && composite_.size() == 0)
84+
set_composite_vector();
85+
centroid_ = (SparseVector) composite_vector().clone();
86+
centroid_.normalize();
87+
return centroid_;
88+
}
89+
90+
/**
91+
* Get the pointer of a composite vector.
92+
*
93+
* @return the pointer of a composite vector
94+
*/
95+
SparseVector composite_vector()
96+
{
97+
return composite_;
98+
}
99+
100+
/**
101+
* Get documents in this cluster.
102+
*
103+
* @return documents in this cluster
104+
*/
105+
List<Document<K>> documents()
106+
{
107+
return documents_;
108+
}
109+
110+
/**
111+
* Add a document.
112+
*
113+
* @param doc the pointer of a document object
114+
*/
115+
void add_document(Document doc)
116+
{
117+
doc.feature().normalize();
118+
documents_.add(doc);
119+
composite_.add_vector(doc.feature());
120+
}
121+
122+
/**
123+
* Remove a document from this cluster.
124+
*
125+
* @param index the index of vector container of documents
126+
*/
127+
void remove_document(int index)
128+
{
129+
ListIterator<Document<K>> listIterator = documents_.listIterator(index);
130+
Document<K> document = listIterator.next();
131+
listIterator.set(null);
132+
composite_.sub_vector(document.feature());
133+
}
134+
135+
/**
136+
* Remove a document from this cluster.
137+
*
138+
* @param doc the pointer of a document object
139+
*/
140+
void remove_document(Document doc)
141+
{
142+
for (Document<K> document : documents_)
143+
{
144+
if (document.equals(doc))
145+
{
146+
remove_document(doc);
147+
return;
148+
}
149+
}
150+
}
151+
152+
153+
/**
154+
* Delete removed documents from the internal container.
155+
*/
156+
void refresh()
157+
{
158+
ListIterator<Document<K>> listIterator = documents_.listIterator();
159+
while (listIterator.hasNext())
160+
{
161+
if (listIterator.next() == null)
162+
listIterator.remove();
163+
}
164+
}
165+
166+
/**
167+
* Get a gain when this cluster sectioned.
168+
*
169+
* @return a gain
170+
*/
171+
double sectioned_gain()
172+
{
173+
return sectioned_gain_;
174+
}
175+
176+
/**
177+
* Set a gain when the cluster sectioned.
178+
*/
179+
void set_sectioned_gain()
180+
{
181+
double gain = 0.0f;
182+
if (sectioned_gain_ == 0 && sectioned_clusters_.size() > 1)
183+
{
184+
for (Cluster<K> cluster : sectioned_clusters_)
185+
{
186+
gain += cluster.composite_vector().norm();
187+
}
188+
gain -= composite_.norm();
189+
}
190+
sectioned_gain_ = gain;
191+
}
192+
193+
/**
194+
* Get sectioned clusters.
195+
*
196+
* @return sectioned clusters
197+
*/
198+
List<Cluster<K>> sectioned_clusters()
199+
{
200+
return sectioned_clusters_;
201+
}
202+
203+
// /**
204+
// * Choose documents randomly.
205+
// */
206+
// void choose_randomly(int ndocs, List<Document > docs)
207+
//{
208+
// HashMap<int, bool>.type choosed;
209+
// int siz = size();
210+
// init_hash_map(siz, choosed, ndocs);
211+
// if (siz < ndocs)
212+
// ndocs = siz;
213+
// int count = 0;
214+
// while (count < ndocs)
215+
// {
216+
// int index = myrand(seed_) % siz;
217+
// if (choosed.find(index) == choosed.end())
218+
// {
219+
// choosed.insert(std.pair<int, bool>(index, true));
220+
// docs.push_back(documents_[index]);
221+
// ++count;
222+
// }
223+
// }
224+
//}
225+
226+
/**
227+
* 选取初始质心
228+
*
229+
* @param ndocs 质心数量
230+
* @param docs 输出到该列表中
231+
*/
232+
void choose_smartly(int ndocs, List<Document> docs)
233+
{
234+
int siz = size();
235+
double[] closest = new double[siz];
236+
if (siz < ndocs)
237+
ndocs = siz;
238+
int index, count = 0;
239+
240+
index = random.nextInt(siz); // initial center
241+
docs.add(documents_.get(index));
242+
++count;
243+
double potential = 0.0;
244+
for (int i = 0; i < documents_.size(); i++)
245+
{
246+
double dist = 1.0 - SparseVector.inner_product(documents_.get(i).feature(), documents_.get(index).feature());
247+
potential += dist;
248+
closest[i] = dist;
249+
}
250+
251+
// choose each center
252+
while (count < ndocs)
253+
{
254+
double randval = random.nextDouble() * potential;
255+
256+
for (index = 0; index < documents_.size(); index++)
257+
{
258+
double dist = closest[index];
259+
if (randval <= dist)
260+
break;
261+
randval -= dist;
262+
}
263+
if (index == documents_.size())
264+
index--;
265+
docs.add(documents_.get(index));
266+
++count;
267+
268+
double new_potential = 0.0;
269+
for (int i = 0; i < documents_.size(); i++)
270+
{
271+
double dist = 1.0 - SparseVector.inner_product(documents_.get(i).feature(), documents_.get(index).feature());
272+
double min = closest[i];
273+
if (dist < min)
274+
{
275+
closest[i] = dist;
276+
min = dist;
277+
}
278+
new_potential += min;
279+
}
280+
potential = new_potential;
281+
}
282+
}
283+
284+
/**
285+
* 将本簇划分为nclusters个簇
286+
*
287+
* @param nclusters
288+
*/
289+
void section(int nclusters)
290+
{
291+
if (size() < nclusters)
292+
return;
293+
294+
sectioned_clusters_ = new ArrayList<Cluster<K>>(nclusters);
295+
List<Document> centroids = new ArrayList<Document>(nclusters);
296+
// choose_randomly(nclusters, centroids);
297+
choose_smartly(nclusters, centroids);
298+
for (int i = 0; i < centroids.size(); i++)
299+
{
300+
Cluster<K> cluster = new Cluster<K>();
301+
sectioned_clusters_.add(cluster);
302+
}
303+
304+
for (Document<K> d : documents_)
305+
{
306+
double max_similarity = -1.0;
307+
int max_index = 0;
308+
for (int j = 0; j < centroids.size(); j++)
309+
{
310+
double similarity = SparseVector.inner_product(d.feature(), centroids.get(j).feature());
311+
if (max_similarity < similarity)
312+
{
313+
max_similarity = similarity;
314+
max_index = j;
315+
}
316+
}
317+
sectioned_clusters_.get(max_index).add_document(d);
318+
}
319+
}
320+
321+
@Override
322+
public int compareTo(Cluster<K> o)
323+
{
324+
return Double.compare(o.sectioned_gain(), sectioned_gain());
325+
}
326+
}

0 commit comments

Comments
 (0)