forked from coderLCJ/NlpPractice
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreateTokenizer.py
More file actions
30 lines (27 loc) · 1.09 KB
/
createTokenizer.py
File metadata and controls
30 lines (27 loc) · 1.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# -*- coding: utf-8 -*-#
# ---------------------------------------------
# Name: demo0
# Description:
# Author: Laity
# Date: 2021/9/22
# ---------------------------------------------
# 导入用于对象保存与加载的joblib
import joblib
# 导入keras中的词汇映射器Tokenizer
from keras.preprocessing.text import Tokenizer
# 假定vocab为语料集所有不同词汇集合
vocab = {"周杰伦", "陈奕迅", "王力宏", "李宗盛", "吴亦凡", "鹿晗"}
# 实例化一个词汇映射器对象
t = Tokenizer(num_words=None, char_level=False)
# 使用映射器拟合现有文本数据
t.fit_on_texts(vocab)
for token in vocab:
zero_list = [0]*len(vocab)
# 使用映射器转化现有文本数据, 每个词汇对应从1开始的自然数
# 返回样式如: [[2]], 取出其中的数字需要使用[0][0]
token_index = t.texts_to_sequences([token])[0][0] - 1
zero_list[token_index] = 1
print(token, "的one-hot编码为:", zero_list)
# 使用joblib工具保存映射器, 以便之后使用
tokenizer_path = "./Tokenizer"
joblib.dump(t, tokenizer_path)