File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1+ # coding:utf-8
2+
3+ import sys
4+ reload (sys )
5+ sys .setdefaultencoding ( "utf-8" )
6+
7+ import jieba
8+ from jieba import analyse
9+
def segment(input, output):
    """Tokenize a text file with jieba and write the tokens to another file.

    Every line of ``input`` is stripped and handed to ``jieba.cut``. Each
    resulting token is written to ``output`` preceded by one space. No
    newline is ever written, so the output is a single line of
    space-separated tokens.

    Args:
        input: Path of the text file to read.
        output: Path of the file to write (opened with mode "w", so an
            existing file is overwritten).
    """
    # The with-statement closes both files even if segmentation or writing
    # raises part-way through.
    with open(input, "r") as input_file, open(output, "w") as output_file:
        for line in input_file:
            tokens = jieba.cut(line.strip())
            # Prefix every token with a space; a line that yields no tokens
            # contributes an empty string.
            output_file.write("".join(" " + token for token in tokens))
26+
27+ if __name__ == '__main__' :
28+ if 3 != len (sys .argv ):
29+ print "Usage: " , sys .argv [0 ], "input output"
30+ sys .exit (- 1 )
31+ segment (sys .argv [1 ], sys .argv [2 ]);
Original file line number Diff line number Diff line change 1+ # coding:utf-8
2+
3+ import sys
4+ import struct
5+ import math
6+ import numpy as np
7+
8+ reload (sys )
9+ sys .setdefaultencoding ( "utf-8" )
10+
# Upper bound on word length, in characters (presumably mirrors the limit
# used by the tool that produced the vector file -- TODO confirm).
max_w = 50
# Size in bytes of one stored vector component.
float_size = 4
13+
def _read_word(input_file):
    """Read bytes up to the next space (or end of file) and return them stripped."""
    chars = []
    while True:
        c = input_file.read(1)
        # read() returns an empty string at end of file; stop there instead
        # of looping forever on a truncated file.
        if not c or c == b' ':
            break
        chars.append(c)
    # Drop surrounding whitespace, e.g. a newline that separates the previous
    # vector from this word.
    return b''.join(chars).strip()


def load_vectors(input):
    """Load word vectors from a binary file into a dict.

    The file starts with a text header line holding two integers: the number
    of words and the vector dimension. Each entry that follows is a word
    terminated by a single space, then ``size`` floats stored in the native
    ``struct`` 'f' format.

    Args:
        input: Path of the binary vector file.

    Returns:
        A dict mapping each word (decoded from UTF-8) to a one-dimensional
        numpy float64 array whose length is the dimension given in the
        header.

    Raises:
        struct.error: If the file ends before every announced vector has
            been read.
    """
    print("begin load vectors")

    with open(input, "rb") as input_file:
        # Read the vocabulary size and the vector dimension.
        header = input_file.readline().split()
        words = int(header[0])
        size = int(header[1])
        print("words = %d" % words)
        print("size = %d" % size)

        # One pre-compiled format covering a whole vector, so each vector is
        # read and unpacked in a single step.
        vector_struct = struct.Struct("%df" % size)

        word_vector = {}
        for index in range(words):
            # Read one word.
            word = _read_word(input_file)

            # Read the vector that belongs to the word.
            raw = input_file.read(vector_struct.size)
            if len(raw) != vector_struct.size:
                raise struct.error(
                    "truncated vector file: header announces %d words, "
                    "data ended at entry %d" % (words, index))

            # Store the word and its vector in the dict.
            word_vector[word.decode('utf-8')] = np.array(vector_struct.unpack(raw))

    print("load vectors finish")
    return word_vector
56+
57+ if __name__ == '__main__' :
58+ if 2 != len (sys .argv ):
59+ print "Usage: " , sys .argv [0 ], "vectors.bin"
60+ sys .exit (- 1 )
61+ d = load_vectors (sys .argv [1 ])
62+ print d [u'真的' ]
You can’t perform that action at this time.
0 commit comments