-
Notifications
You must be signed in to change notification settings - Fork 10.9k
Expand file tree
/
Copy pathDemoWord2Vec.java
More file actions
109 lines (95 loc) · 3.88 KB
/
DemoWord2Vec.java
File metadata and controls
109 lines (95 loc) · 3.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/*
* <author>Hankcs</author>
* <email>[email protected]</email>
* <create-date>2017-11-02 12:09</create-date>
*
* <copyright file="Demo.java" company="码农场">
* Copyright (c) 2017, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.demo;
import com.hankcs.hanlp.corpus.MSR;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.mining.word2vec.DocVectorModel;
import com.hankcs.hanlp.mining.word2vec.Word2VecTrainer;
import com.hankcs.hanlp.mining.word2vec.WordVectorModel;
import com.hankcs.hanlp.utility.TestUtility;
import java.io.IOException;
import java.util.Map;
/**
 * Demonstrates training (or loading) a word-vector model and querying it,
 * first at the word level and then at the document level.
 *
 * @author hankcs
 */
public class DemoWord2Vec
{
    private static final String TRAIN_FILE_NAME = MSR.TRAIN_PATH;
    private static final String MODEL_FILE_NAME = "data/test/word2vec.txt";
    /** Row layout shared by both result tables: a right-aligned label followed by its score. */
    private static final String ROW_FORMAT = "%50s\t\t%f\n";

    /**
     * Runs the demo: obtains a word-vector model, prints nearest neighbours,
     * a similarity and an analogy, then wraps the model in a document-vector
     * model and queries a handful of sample documents.
     *
     * @param args unused
     * @throws IOException propagated from {@link #trainOrLoadModel()}
     */
    public static void main(String[] args) throws IOException
    {
        final WordVectorModel wordModel = trainOrLoadModel();

        final String[] wordQueries = {"上海", "美丽", "购买"};
        for (String wordQuery : wordQueries)
        {
            printNearest(wordQuery, wordModel);
        }
        System.out.println(wordModel.similarity("上海", "广州"));
        System.out.println(wordModel.analogy("日本", "自民党", "共和党"));

        // Document vectors
        final DocVectorModel docModel = new DocVectorModel(wordModel);
        final String[] documents = {
            "山东苹果丰收",
            "农民在江苏种水稻",
            "奥运会女排夺冠",
            "世界锦标赛胜出",
            "中国足球失败",
        };
        System.out.println(docModel.similarity(documents[0], documents[1]));
        System.out.println(docModel.similarity(documents[0], documents[4]));

        // Register each document under its array index so that the ids returned
        // by nearest() can be mapped back to the text in printNearestDocument.
        int nextId = 0;
        for (String text : documents)
        {
            docModel.addDocument(nextId, text);
            nextId++;
        }

        final String[] documentQueries = {"体育", "农业", "我要看比赛", "要不做饭吧"};
        for (String documentQuery : documentQueries)
        {
            printNearestDocument(documentQuery, documents, docModel);
        }
    }

    /**
     * Prints a two-column table of the entries returned by
     * {@code model.nearest(word)}: the entry key and its float value.
     *
     * @param word  the query word
     * @param model the word-vector model to query
     */
    static void printNearest(String word, WordVectorModel model)
    {
        System.out.printf("\n Word Cosine\n------------------------------------------------------------------------\n");
        for (Map.Entry<String, Float> neighbour : model.nearest(word))
        {
            final String label = neighbour.getKey();
            final Float score = neighbour.getValue();
            System.out.printf(ROW_FORMAT, label, score);
        }
    }

    /**
     * Prints a two-column table of the entries returned by
     * {@code model.nearest(document)}, resolving each integer key to the
     * corresponding element of {@code documents}.
     *
     * @param document  the query text
     * @param documents the texts indexed by the ids used as entry keys
     * @param model     the document-vector model to query
     */
    static void printNearestDocument(String document, String[] documents, DocVectorModel model)
    {
        printHeader(document);
        for (Map.Entry<Integer, Float> hit : model.nearest(document))
        {
            final String text = documents[hit.getKey()];
            final Float score = hit.getValue();
            System.out.printf(ROW_FORMAT, text, score);
        }
    }

    /**
     * Prints the table header for a document query: the query right-aligned
     * in a 50-character column, the "Cosine" caption, and a rule line.
     *
     * @param query the text shown in the first header column
     */
    private static void printHeader(String query)
    {
        System.out.printf("\n%50s Cosine\n------------------------------------------------------------------------\n", query);
    }

    /**
     * Returns a word-vector model, loading it from {@code MODEL_FILE_NAME}
     * when that file exists and otherwise training one from
     * {@code TRAIN_FILE_NAME}. If neither file exists, an error message is
     * written to standard error and the JVM exits with status 1.
     *
     * @return the loaded or freshly trained model
     * @throws IOException propagated from {@link #loadModel()}
     */
    static WordVectorModel trainOrLoadModel() throws IOException
    {
        // Fast path: a model file is already present, so no training is needed.
        if (IOUtil.isFileExisted(MODEL_FILE_NAME))
        {
            return loadModel();
        }

        // Training is impossible without the corpus; point the user to the docs and stop.
        if (!IOUtil.isFileExisted(TRAIN_FILE_NAME))
        {
            System.err.println("语料不存在,请阅读文档了解语料获取与格式:https://github.com/hankcs/HanLP/wiki/word2vec");
            System.exit(1);
        }

        // The trainer is given both the corpus path and the model path
        // (presumably it writes the trained model to the latter — confirm in Word2VecTrainer).
        final Word2VecTrainer trainer = new Word2VecTrainer();
        return trainer.train(TRAIN_FILE_NAME, MODEL_FILE_NAME);
    }

    /**
     * Constructs a {@link WordVectorModel} from {@code MODEL_FILE_NAME}.
     *
     * @return the model built from the model file
     * @throws IOException declared by this method; presumably raised when the file cannot be read
     */
    static WordVectorModel loadModel() throws IOException
    {
        return new WordVectorModel(MODEL_FILE_NAME);
    }
}