-
Notifications
You must be signed in to change notification settings - Fork 10.9k
Expand file tree
/
Copy pathDemoCustomDictionary.java
More file actions
76 lines (68 loc) · 3.24 KB
/
DemoCustomDictionary.java
File metadata and controls
76 lines (68 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
/*
* <summary></summary>
* <author>He Han</author>
* <email>[email protected]</email>
* <create-date>2014/12/9 13:04</create-date>
*
* <copyright file="DemoCustomDictionary.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.demo;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
import com.hankcs.hanlp.dictionary.BaseSearcher;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;
import java.util.Map;
/**
 * Demonstrates adding and removing user-dictionary words at runtime.
 *
 * <p>Notes carried over from the original author:
 * <ul>
 * <li>Dynamic additions and removals do not affect the dictionary file.</li>
 * <li>At present CustomDictionary keeps the words loaded from the dictionary file in a DAT
 * and the dynamically added words in a BinTrie; the former is fast, the latter is slow.</li>
 * <li>Dynamic add/remove is kept partly as a legacy feature and partly for debugging;
 * it may be removed in the future.</li>
 * </ul>
 *
 * @author hankcs
 */
public class DemoCustomDictionary
{
    /**
     * Runs the demo: registers a few words, prints what two dictionary look-ups and the
     * standard segmenter report for a sample sentence, and finally gives the standard
     * tokenizer a dictionary of its own.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args)
    {
        registerDemoWords();

        // The sample sentence (as if that could ever happen, haha!).
        String sentence = "攻城狮逆袭单身狗,迎娶白富美,走上人生巅峰";

        printTrieHits(sentence);
        printSearcherEntries(sentence);

        // Standard segmentation.
        System.out.println(HanLP.segment(sentence));

        useTokenizerSpecificDictionary();
    }

    /**
     * Adds the demo words to the custom dictionary and prints the result of the last
     * {@code add} call followed by the entry then stored for that word.
     */
    private static void registerDemoWords()
    {
        // Add dynamically.
        CustomDictionary.add("攻城狮");
        // Force an insertion.
        CustomDictionary.insert("白富美", "nz 1024");
        // Remove a word (enable the next line to try it):
        // CustomDictionary.remove("攻城狮");
        System.out.println(CustomDictionary.add("单身狗", "nz 1024 n 1"));
        System.out.println(CustomDictionary.get("单身狗"));
    }

    /**
     * DoubleArrayTrie segmentation: hands the sentence's characters to
     * {@code CustomDictionary.parseText} and prints every hit it reports.
     *
     * @param sentence the text to scan
     */
    private static void printTrieHits(String sentence)
    {
        final char[] chars = sentence.toCharArray();
        CustomDictionary.parseText(chars, new HitPrinter(chars));
    }

    /**
     * Segmentation with the trie that hashes the first character and then binary-searches:
     * prints each entry the searcher returns until it returns {@code null}.
     *
     * @param sentence the text to scan
     */
    private static void printSearcherEntries(String sentence)
    {
        BaseSearcher searcher = CustomDictionary.getSearcher(sentence);
        for (Map.Entry entry = searcher.next(); entry != null; entry = searcher.next())
        {
            System.out.println(entry);
        }
    }

    /**
     * Shows how to give one tokenizer its own dictionary. Per the original author's note,
     * every tokenizer has a dictionary and they share {@code CustomDictionary.DEFAULT}
     * unless a different one is assigned.
     */
    private static void useTokenizerSpecificDictionary()
    {
        // The system default dictionary (kept for illustration only; not used below).
        DynamicCustomDictionary defaultDictionary = CustomDictionary.DEFAULT;

        // A separate dictionary built from two dictionary files.
        DynamicCustomDictionary tokenizerDictionary = new DynamicCustomDictionary(
            "data/dictionary/custom/CustomDictionary.txt",
            "data/dictionary/custom/机构名词典.txt");
        StandardTokenizer.SEGMENT.enableCustomDictionary(tokenizerDictionary);
        StandardTokenizer.SEGMENT.customDictionary.insert("插入到该分词器专用的词典中");
    }

    /**
     * Hit callback that prints each match as {@code [begin:end]=word attribute}.
     */
    private static final class HitPrinter implements AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>
    {
        /** Characters of the scanned text; hit offsets index into this array. */
        private final char[] chars;

        HitPrinter(char[] chars)
        {
            this.chars = chars;
        }

        @Override
        public void hit(int begin, int end, CoreDictionary.Attribute value)
        {
            int length = end - begin;
            String word = new String(chars, begin, length);
            System.out.printf("[%d:%d]=%s %s\n", begin, end, word, value);
        }
    }
}