Skip to content

Commit 39468b7

Browse files
author
lichuang
committed
add word segment
1 parent 2cc659d commit 39468b7

2 files changed

Lines changed: 93 additions & 0 deletions

File tree

word_segment.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# coding:utf-8
2+
3+
import sys
4+
reload(sys)
5+
sys.setdefaultencoding( "utf-8" )
6+
7+
import jieba
8+
from jieba import analyse
9+
10+
def segment(input, output):
    """Segment every line of a text file into words with jieba.

    Each line of ``input`` is stripped of surrounding whitespace and cut
    with ``jieba.cut``. Every resulting token is written to ``output``
    preceded by a single space. No newline is ever written, so the whole
    input ends up as one continuous, space-separated stream of tokens.

    Args:
        input: Path of the text file to read.
        output: Path of the file to write; it is created or truncated.
    """
    # ``with`` guarantees both files are closed even if jieba or a write
    # raises part-way through the input.
    with open(input, "r") as input_file:
        with open(output, "w") as output_file:
            for line in input_file:
                tokens = jieba.cut(line.strip())
                # One join per line instead of repeated string concatenation;
                # a line without tokens writes the empty string.
                output_file.write("".join(" " + token for token in tokens))
26+
27+
if __name__ == '__main__':
28+
if 3 != len(sys.argv):
29+
print "Usage: ", sys.argv[0], "input output"
30+
sys.exit(-1)
31+
segment(sys.argv[1], sys.argv[2]);

word_vectors_loader.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# coding:utf-8
2+
3+
import sys
4+
import struct
5+
import math
6+
import numpy as np
7+
8+
reload(sys)
9+
sys.setdefaultencoding( "utf-8" )
10+
11+
# Upper bound on word length carried over from the original loader. It is not
# enforced by load_vectors (words are stored at full length); the name is kept
# so that code importing this constant keeps working.
max_w = 50
# Number of bytes occupied by one stored vector component.
float_size = 4


def _read_word(input_file):
    """Read bytes up to the next space and return them stripped of whitespace.

    Stripping removes the newline that may precede the word when the previous
    entry's vector was followed by one.

    Raises:
        EOFError: If the file ends before a space is found.
    """
    pieces = []
    while True:
        c = input_file.read(1)
        if not c:
            # read() returns an empty string at end of file; without this
            # check a truncated file would spin here forever.
            raise EOFError("unexpected end of file while reading a word")
        if c == b' ':
            break
        pieces.append(c)
    return b''.join(pieces).strip()


def load_vectors(input):
    """Load word vectors from a binary file into a dict.

    The file starts with a text header line ``"<words> <size>"``. It is
    followed by ``words`` entries, each made of the word's bytes, one space,
    and ``size`` native-order floats of ``float_size`` bytes each.

    Args:
        input: Path of the binary vectors file.

    Returns:
        A dict mapping each word (decoded from UTF-8) to a one-dimensional
        float64 ``numpy`` array of length ``size``.

    Raises:
        EOFError: If the file ends in the middle of a word.
        struct.error: If the file ends in the middle of a vector.
    """
    print("begin load vectors")

    word_vector = {}
    with open(input, "rb") as input_file:
        # Header: vocabulary size and vector dimension.
        header = input_file.readline().split()
        words = int(header[0])
        size = int(header[1])
        print("words = %d" % words)
        print("size = %d" % size)

        vector_format = '%df' % size
        vector_bytes = float_size * size

        for _ in range(words):
            # Read one word.
            word = _read_word(input_file)

            # Read the whole vector at once; the array length follows the
            # header's dimension instead of a fixed 200.
            raw = input_file.read(vector_bytes)
            vector = np.array(struct.unpack(vector_format, raw),
                              dtype=np.float64)

            # Store the word together with its vector.
            word_vector[word.decode('utf-8')] = vector

    print("load vectors finish")
    return word_vector
56+
57+
if __name__ == '__main__':
58+
if 2 != len(sys.argv):
59+
print "Usage: ", sys.argv[0], "vectors.bin"
60+
sys.exit(-1)
61+
d = load_vectors(sys.argv[1])
62+
print d[u'真的']

0 commit comments

Comments
 (0)