forked from hankcs/HanLP
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDemoStopWord.java
More file actions
58 lines (54 loc) · 1.79 KB
/
DemoStopWord.java
File metadata and controls
58 lines (54 loc) · 1.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
/*
* <summary></summary>
* <author>hankcs</author>
* <email>me@hankcs.com</email>
* <create-date>2015/5/6 11:11</create-date>
*
* <copyright file="DemoStopWord.java">
* Copyright (c) 2003-2015, hankcs. All Rights Reserved, http://www.hankcs.com/
* </copyright>
*/
package com.hankcs.demo;
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.dictionary.stopword.Filter;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.BasicTokenizer;
import com.hankcs.hanlp.tokenizer.NotionalTokenizer;
import java.util.List;
/**
 * Demonstrates how to remove stop words from segmentation results.
 *
 * @author hankcs
 */
public class DemoStopWord
{
    public static void main(String[] args)
    {
        final String sentence = "小区居民有的反对喂养流浪猫,而有的居民却赞成喂养这些小宝贝";

        // The stop-word dictionary can be modified dynamically at runtime
        CoreStopWordDictionary.add("居民");
        printNotionalSegmentation(sentence);
        CoreStopWordDictionary.remove("居民");
        printNotionalSegmentation(sentence);

        // Filtering can be applied to the result of any tokenizer
        List<Term> terms = BasicTokenizer.segment(sentence);
        System.out.println(terms);
        CoreStopWordDictionary.apply(terms);
        System.out.println(terms);

        // The filtering logic itself can also be customized
        CoreStopWordDictionary.FILTER = new NzFilter();
        printNotionalSegmentation(sentence);
    }

    /**
     * Segments the sentence with {@link NotionalTokenizer} and prints the resulting list.
     *
     * @param sentence the text to segment
     */
    private static void printNotionalSegmentation(String sentence)
    {
        System.out.println(NotionalTokenizer.segment(sentence));
    }

    /**
     * Custom filter that keeps a term only when its nature is {@code nz} and its word
     * is not contained in the stop-word dictionary; every other term is rejected.
     */
    private static final class NzFilter implements Filter
    {
        @Override
        public boolean shouldInclude(Term term)
        {
            switch (term.nature)
            {
                case nz:
                    return !CoreStopWordDictionary.contains(term.word);
                default:
                    return false;
            }
        }
    }
}