|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | + |
| 3 | +import sys |
| 4 | +import math |
| 5 | +import tflearn |
| 6 | +import chardet |
| 7 | +import numpy as np |
| 8 | +import struct |
| 9 | + |
seq = []  # corpus as a flat list of word vectors, filled by init_seq()

max_w = 50             # nominal max word length (kept for compatibility; the original counted but never enforced it)
float_size = 4         # size in bytes of one float32 stored in vectors.bin
word_vector_dict = {}  # word -> embedding vector (list of floats)


def load_vectors(input):
    """Load word2vec-style binary embeddings from *input* into
    ``word_vector_dict`` (word -> list of floats).

    Expected file layout is the classic word2vec ``vectors.bin``: a text
    header line ``"<word_count> <dimensions>\n"`` followed, for each word,
    by the UTF-8 word terminated by a single space and then ``dimensions``
    float32 values.

    Returns the populated ``word_vector_dict`` for convenience (the global
    is still updated in place, as before).
    """
    print("begin load vectors")

    with open(input, "rb") as input_file:
        # Header: vocabulary size and vector dimensionality.
        header = input_file.readline().split()
        words = int(header[0])
        size = int(header[1])
        print("words =", words)
        print("size =", size)

        for _ in range(words):
            # Read one word: bytes up to the separating space. A stray
            # newline (terminator of the previous vector) is skipped, and
            # EOF (empty read — read() never returns False) ends the word.
            chars = []
            while True:
                c = input_file.read(1)
                if not c or c == b' ':
                    break
                if c != b'\n':
                    chars.append(c)
            word = b''.join(chars).decode('utf-8').strip()

            # Read the whole vector in one struct call instead of one
            # 4-byte unpack per component.
            raw = input_file.read(float_size * size)
            vector = list(struct.unpack('%df' % size, raw))

            word_vector_dict[word] = vector

    print("load vectors finish")
    return word_vector_dict
| 56 | + |
def init_seq():
    """Read the pre-segmented corpus file and append, in order, the vector
    of each known word to the module-level ``seq`` list.

    Fixes an infinite loop in the original: ``while True`` around
    ``readline()`` never broke out once EOF returned an empty string.

    NOTE(review): as in the original, processing of a line stops at its
    FIRST word missing from ``word_vector_dict`` (the ``break`` skips the
    rest of the line) — confirm this early-break is intended and not a
    mistyped ``continue``.
    """
    with open('zhenhuanzhuan.segment', 'r', encoding='utf-8') as corpus:
        for line in corpus:
            for word in line.split(' '):
                if word in word_vector_dict:
                    seq.append(word_vector_dict[word])
                else:
                    break
| 71 | + |
def vector_sqrtlen(vector):
    """Return the Euclidean (L2) norm of *vector*.

    Replaces the manual accumulator loop, which shadowed the builtin
    ``len`` as its accumulator variable.
    """
    return math.sqrt(sum(component * component for component in vector))
| 78 | + |
def vector_cosine(v1, v2):
    """Return the cosine similarity of equal-length vectors *v1* and *v2*.

    Raises:
        ValueError: if the vectors differ in length. The original called
            ``sys.exit(1)`` here, killing the whole process on a data error.

    NOTE(review): a zero vector still produces a division by zero, exactly
    as in the original.
    """
    if len(v1) != len(v2):
        raise ValueError(
            "vector_cosine: length mismatch (%d != %d)" % (len(v1), len(v2)))
    dot = sum(a * b for a, b in zip(v1, v2))
    norm1 = math.sqrt(sum(a * a for a in v1))
    norm2 = math.sqrt(sum(b * b for b in v2))
    return dot / (norm1 * norm2)
| 88 | + |
| 89 | + |
def vector2word(vector):
    """Return ``(word, cosine)`` for the vocabulary entry in
    ``word_vector_dict`` whose embedding is most cosine-similar to *vector*.
    """
    best_word = ''
    best_cosine = -10000
    for candidate, candidate_vec in word_vector_dict.items():
        similarity = vector_cosine(vector, candidate_vec)
        if similarity > best_cosine:
            best_cosine = similarity
            best_word = candidate
    return (best_word, best_cosine)
| 100 | + |
| 101 | + |
def main():
    """Train an LSTM on sliding windows of word vectors and predict the
    word that follows the first window.

    Loads embeddings from ``./vectors.bin`` and the segmented corpus, then
    builds (window -> next-word-vector) training pairs.
    """
    load_vectors("./vectors.bin")
    init_seq()

    window = 20  # number of consecutive word vectors per training input
    dim = 200    # embedding dimensionality; must match vectors.bin

    xlist = []
    ylist = []
    test_X = None
    # NOTE(review): only the first 10 windows are trained on; the
    # commented-out range(len(seq)-100) in the original suggests the full
    # corpus was intended eventually.
    for i in range(10):
        sequence = seq[i:i + window]
        xlist.append(sequence)
        ylist.append(seq[i + window])
        if test_X is None:
            # Keep the first window around as the prediction probe, and
            # show which word the target vector corresponds to.
            test_X = np.array(sequence)
            match_word, max_cos = vector2word(seq[i + window])
            print("right answer=", match_word, max_cos)

    X = np.array(xlist)
    Y = np.array(ylist)
    net = tflearn.input_data([None, window, dim])
    net = tflearn.lstm(net, dim)
    net = tflearn.fully_connected(net, dim, activation='linear')
    net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1,
                             loss='mean_square')
    model = tflearn.DNN(net)
    model.fit(X, Y, n_epoch=500, batch_size=10, snapshot_epoch=False,
              show_metric=True)
    model.save("model")

    predict = model.predict([test_X])
    match_word, max_cos = vector2word(predict[0])
    print("predict=", match_word, max_cos)
| 134 | + |
# Guard the script entry point so importing this module does not trigger
# training. (A large body of commented-out scratch experiments that lived
# here has been removed; the kept test helpers follow below.)
if __name__ == '__main__':
    main()

# The functions below are ad-hoc test cases, not used by main().
| 163 | + |
def test_case1():
    """Fit a trivial one-sample dense network and print its prediction."""
    sample_x = [1, 2, 3]
    sample_y = [0.01, 0.99]
    # Replicate the single sample into (1, 3) / (1, 2) training arrays.
    train_X = np.array(np.repeat([sample_x], 1, axis=0))
    train_Y = np.array(np.repeat([sample_y], 1, axis=0))

    # Second dimension = dimensionality of each input vector x.
    net = tflearn.input_data(shape=[None, 3])
    # Layer width here = dimensionality of the output vector y.
    net = tflearn.fully_connected(net, 2)
    net = tflearn.regression(net)

    model = tflearn.DNN(net)
    model.fit(train_X, train_Y, n_epoch=1000, batch_size=1,
              show_metric=True, snapshot_epoch=False)
    pred = model.predict([sample_x])
    print(pred)
| 188 | + |
def case_linear_regression():
    """Fit y = 2x with a single linear unit and print predictions and the
    learned weight/bias.

    Converted the Python-2-only ``print`` statements to ``print()`` calls,
    matching the rest of the file's test helpers.
    """
    x = [1, 2, 3, 4, 5]
    y = [2, 4, 6, 8, 10]
    net = tflearn.input_data([None])
    linear = tflearn.single_unit(net)
    net = tflearn.regression(linear, optimizer='sgd', loss='mean_square',
                             metric='R2', learning_rate=0.01)
    model = tflearn.DNN(net)
    model.fit(x, y, n_epoch=200, snapshot_epoch=False, show_metric=True,
              batch_size=1)
    print(model.predict([8, 9]))        # e.g. [15.99, 17.99]
    print(model.get_weights(linear.W))  # e.g. [1.998]
    print(model.get_weights(linear.b))  # e.g. [0.0067]
| 201 | + |
| 202 | +#case_linear_regression() |
| 203 | + |
| 204 | + |
| 205 | +#X = [[0., 0.], [0., 1.], [1., 0.], [1., 1.]] |
| 206 | +#Y_xor = [[0.], [1.], [1.], [0.]] |
| 207 | + |
| 208 | +# 如何输出每一步的输出值 |
| 209 | +# You can re-use a new model that share a same session (to use same weights): . Note that you could also save your 'm' model and load it with 'm2', that gives similar results. |
| 210 | +## Graph definition |
| 211 | +#with tf.Graph().as_default(): |
| 212 | +# tnorm = tflearn.initializations.uniform(minval=-1.0, maxval=1.0) |
| 213 | +# net = tflearn.input_data(shape=[None, 2], name='inputLayer') |
| 214 | +# layer1 = tflearn.fully_connected(net, 2, activation='sigmoid', weights_init=tnorm, name='layer1') |
| 215 | +# layer2 = tflearn.fully_connected(layer1, 1, activation='softmax', weights_init=tnorm, name='layer2') |
| 216 | +# regressor = tflearn.regression(layer2, optimizer='sgd', learning_rate=2., loss='mean_square', name='layer3') |
| 217 | +# |
| 218 | +# # Training |
| 219 | +# m = tflearn.DNN(regressor) |
| 220 | +# m.fit(X, Y_xor, n_epoch=100, snapshot_epoch=False) |
| 221 | +# |
| 222 | +# # Testing |
| 223 | +# print("Testing XOR operator") |
| 224 | +# print("0 xor 0:", m.predict([[0., 0.]])) |
| 225 | +# print("0 xor 1:", m.predict([[0., 1.]])) |
| 226 | +# print("1 xor 0:", m.predict([[1., 0.]])) |
| 227 | +# print("1 xor 1:", m.predict([[1., 1.]])) |
| 228 | +# |
| 229 | +# # You can create a new model, that share the same session (to get same weights) |
| 230 | +# # Or you can also simply save and load a model |
| 231 | +# m2 = tflearn.DNN(layer1, session=m.session) |
| 232 | +# print(m2.predict([[0., 0.]])) |
0 commit comments