Skip to content

Commit f8c7b46

Browse files
author
lichuang
committed
add my_seq2seq.py
1 parent e56cc38 commit f8c7b46

File tree

2 files changed

+239
-2
lines changed

2 files changed

+239
-2
lines changed

chatbotv2/my_seq2seq.py

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import sys
4+
import math
5+
import tflearn
6+
import tensorflow as tf
7+
from tensorflow.python.ops import rnn_cell
8+
from tensorflow.python.ops import rnn
9+
import chardet
10+
import numpy as np
11+
import struct
12+
13+
# Flat list of word vectors (one embedding per corpus word, in corpus
# order); filled by init_seq() and sliced into training samples by
# MySeq2Seq.generate_trainig_data().
seq = []

# Intended cap on word length (in bytes) when reading vectors.bin.
# NOTE(review): load_vectors counts up to max_w but never truncates —
# appears vestigial, kept for fidelity with the word2vec C reader.
max_w = 50
# Size in bytes of one packed IEEE-754 float32 in vectors.bin.
float_size = 4
# Maps word (unicode) -> its embedding as a list of floats;
# populated by load_vectors().
word_vector_dict = {}
18+
19+
def load_vectors(input):
20+
"""从vectors.bin加载词向量,返回一个word_vector_dict的词典,key是词,value是200维的向量
21+
"""
22+
print "begin load vectors"
23+
24+
input_file = open(input, "rb")
25+
26+
# 获取词表数目及向量维度
27+
words_and_size = input_file.readline()
28+
words_and_size = words_and_size.strip()
29+
words = long(words_and_size.split(' ')[0])
30+
size = long(words_and_size.split(' ')[1])
31+
print "words =", words
32+
print "size =", size
33+
34+
for b in range(0, words):
35+
a = 0
36+
word = ''
37+
# 读取一个词
38+
while True:
39+
c = input_file.read(1)
40+
word = word + c
41+
if False == c or c == ' ':
42+
break
43+
if a < max_w and c != '\n':
44+
a = a + 1
45+
word = word.strip()
46+
47+
vector = []
48+
for index in range(0, size):
49+
m = input_file.read(float_size)
50+
(weight,) = struct.unpack('f', m)
51+
vector.append(float(weight))
52+
53+
# 将词及其对应的向量存到dict中
54+
word_vector_dict[word.decode('utf-8')] = vector
55+
#word_vector_dict[word.decode('utf-8')] = vector[0:4]
56+
57+
input_file.close()
58+
59+
print "load vectors finish"
60+
61+
def init_seq():
    """Read the pre-segmented corpus 'zhenhuanzhuan.segment' and append
    the embedding of every known word to the module-level `seq` list.

    Words that have no entry in word_vector_dict are silently skipped,
    so `seq` contains only in-vocabulary vectors, in corpus order.
    Requires load_vectors() to have been called first.
    """
    # `with` guarantees the file is closed even on error (the original
    # leaked the handle on an exception); iterating the file object
    # replaces the manual readline()/break loop.
    with open('zhenhuanzhuan.segment', 'r') as file_object:
        for line in file_object:
            for word in line.decode('utf-8').split(' '):
                # `in` supersedes the deprecated dict.has_key().
                if word in word_vector_dict:
                    seq.append(word_vector_dict[word])
75+
76+
def vector_sqrtlen(vector):
    """Return the Euclidean (L2) norm of *vector*.

    Accepts any iterable of numbers; returns 0.0 for an empty vector.
    """
    # One-pass sum of squares; the original accumulated into a local
    # named `len`, shadowing the builtin.
    return math.sqrt(sum(item * item for item in vector))
82+
83+
def vector_cosine(v1, v2):
    """Return the cosine similarity of two equal-length vectors.

    Raises:
        ValueError: if the vectors differ in length.  (The original
        called sys.exit(1) here, which terminated the whole process
        from inside a library helper and was uncatchable as an error.)
    """
    if len(v1) != len(v2):
        raise ValueError("vector_cosine: length mismatch (%d vs %d)"
                         % (len(v1), len(v2)))
    # Dot product via zip instead of a manual accumulator loop.
    dot = sum(item1 * item2 for item1, item2 in zip(v1, v2))
    return dot / (vector_sqrtlen(v1) * vector_sqrtlen(v2))
92+
93+
94+
def vector2word(vector):
    """Map an arbitrary embedding back to the closest vocabulary word.

    Scans word_vector_dict and returns a (word, cosine) pair for the
    entry with the highest cosine similarity to *vector*; returns
    ('', -10000) when the dictionary is empty.
    """
    best_word = ''
    best_cos = -10000
    for candidate, candidate_vec in word_vector_dict.items():
        score = vector_cosine(vector, candidate_vec)
        # Strict '>' keeps the first-seen word on ties.
        if score > best_cos:
            best_cos = score
            best_word = candidate
    return (best_word, best_cos)
104+
105+
106+
class MySeq2Seq(object):
    """Encoder/decoder sequence-to-sequence model on tflearn / TF 0.x.

    Approach (translated from the original notes): the input and target
    sequences are concatenated into a single XY tensor and split apart
    with tf.slice; encoder and decoder follow the classic seq2seq paper.
    The decoder outputs are stacked back into a (?, seq_len, 200)
    sequence so a plain tflearn regression with mean-square loss works;
    the target y is shaped the same way.
    """
    def __init__(self, max_seq_len = 16):
        # Length of ONE side (question or answer); the combined XY
        # input is 2 * max_seq_len timesteps long.
        self.max_seq_len = max_seq_len
        # Embedding dimensionality; must match the vectors.bin size
        # loaded by load_vectors() — TODO confirm (hard-coded 200 here).
        self.word_vec_dim = 200

    def generate_trainig_data(self):
        """Build (XY, Y) training arrays from the segmented corpus.

        NOTE: method name keeps its original spelling ("trainig") so
        existing callers don't break.

        Returns a pair of numpy arrays: XY with shape
        (samples, 2*max_seq_len, word_vec_dim) and Y with shape
        (samples, max_seq_len+1, word_vec_dim); Y is the answer half
        prefixed with an all-ones vector (apparently the GO token —
        see the ones_like GO input in model(); verify).
        """
        load_vectors("./vectors.bin")
        init_seq()
        xy_data = []
        y_data = []
        # Only samples at i = 10, 20 are taken (range(10,30,10)) —
        # looks like a deliberately tiny training set for debugging.
        for i in range(10,30,10):
            # Question and answer are each max_seq_len (16) words, so
            # one sample spans 32 consecutive corpus words.
            start = i*self.max_seq_len*2
            middle = i*self.max_seq_len*2 + self.max_seq_len
            end = (i+1)*self.max_seq_len*2
            sequence_xy = seq[start:end]
            sequence_y = seq[middle:end]
            # Prefix the target with an all-ones "GO" vector.
            sequence_y = [np.ones(self.word_vec_dim)] + sequence_y
            xy_data.append(sequence_xy)
            y_data.append(sequence_y)

        return np.array(xy_data), np.array(y_data)

    # NOTE(review): this helper only computes the encoder state and
    # returns None — it appears incomplete/unused; nothing in this file
    # calls it.
    def embedding_rnn_seq2seq(self, encoder_inputs,
                              decoder_inputs,
                              cell,
                              output_projection=None,
                              feed_previous=False,
                              dtype=None,
                              scope=None):
        _, encoder_state = rnn.rnn(cell, encoder_inputs, dtype=dtype, scope=scope)

    # NOTE(review): superseded by model() below (kept as "_bak");
    # differs in using a zeros GO input and in only feeding previous
    # outputs on the feed_previous branch.
    def model_bak(self, feed_previous=False):
        """Build the older graph variant; returns a tflearn.DNN model."""
        # Split the combined XY input into encoder_inputs and a
        # GO-prefixed decoder_inputs.
        input_data = tflearn.input_data(shape=[None, self.max_seq_len*2, self.word_vec_dim], dtype=tf.float32, name = "XY")
        encoder_inputs = tf.slice(input_data, [0, 0, 0], [-1, self.max_seq_len, self.word_vec_dim], name="enc_in")
        decoder_inputs_tmp = tf.slice(input_data, [0, self.max_seq_len, 0], [-1, self.max_seq_len-1, self.word_vec_dim], name="dec_in_tmp")
        # GO token here is all zeros (model() uses all ones — TODO
        # confirm which matches the np.ones GO in the training data).
        go_inputs = tf.zeros_like(decoder_inputs_tmp)
        go_inputs = tf.slice(go_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
        # TF 0.x concat signature: axis comes first.
        decoder_inputs = tf.concat(1, [go_inputs, decoder_inputs_tmp], name="dec_in")

        # Encoder: feed encoder_inputs to an LSTM; get its last output
        # (used as the first predicted step) and its state (handed to
        # the decoder).
        (encoder_output_tensor, states) = tflearn.lstm(encoder_inputs, 200, return_state=True, scope='encoder_lstm')
        encoder_output_sequence = tf.pack([encoder_output_tensor], axis=1)

        # Decoder.
        if feed_previous:
            # Inference: each step's output is the next step's input;
            # the encoder's last output seeds the first step.
            decoder_output_tensor = tflearn.lstm(encoder_output_sequence, 200, initial_state=states, return_seq=False, reuse=False, scope='decoder_lstm')
            decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
            decoder_output_sequence_list = [decoder_output_tensor]
            # Then keep feeding the decoder its own previous output.
            for i in range(self.max_seq_len-1):
                decoder_output_tensor = tflearn.lstm(decoder_output_sequence_single, 200, return_seq=False, reuse=True, scope='decoder_lstm')
                decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
                decoder_output_sequence_list.append(decoder_output_tensor)
        else:
            # Training: feed the ground-truth decoder_inputs and get the
            # whole output sequence at once.
            decoder_output_sequence_list = tflearn.lstm(decoder_inputs, 200, initial_state=states, return_seq=True, reuse=False, scope='decoder_lstm')

        decoder_output_sequence = tf.pack(decoder_output_sequence_list, axis=1)
        real_output_sequence = tf.concat(1, [encoder_output_sequence, decoder_output_sequence])

        net = tflearn.regression(real_output_sequence, optimizer='sgd', learning_rate=0.1, loss='mean_square')
        model = tflearn.DNN(net)
        return model

    def model(self, feed_previous=False):
        """Build the seq2seq graph and return a tflearn.DNN model.

        feed_previous=False (training): the decoder is fed the
        ground-truth previous timestep from decoder_inputs.
        feed_previous=True (inference): the decoder is fed its own
        previous output, starting from the GO input.
        """
        # Split the combined XY input into encoder_inputs and a
        # GO-prefixed decoder_inputs.
        input_data = tflearn.input_data(shape=[None, self.max_seq_len*2, self.word_vec_dim], dtype=tf.float32, name = "XY")
        encoder_inputs = tf.slice(input_data, [0, 0, 0], [-1, self.max_seq_len, self.word_vec_dim], name="enc_in")
        decoder_inputs_tmp = tf.slice(input_data, [0, self.max_seq_len, 0], [-1, self.max_seq_len-1, self.word_vec_dim], name="dec_in_tmp")
        # All-ones GO token, matching the np.ones prefix that
        # generate_trainig_data puts on the targets.
        go_inputs = tf.ones_like(decoder_inputs_tmp)
        go_inputs = tf.slice(go_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
        # TF 0.x concat signature: axis comes first.
        decoder_inputs = tf.concat(1, [go_inputs, decoder_inputs_tmp], name="dec_in")

        # Encoder: last output (first predicted step) plus final state
        # (handed to the decoder as its initial state).
        (encoder_output_tensor, states) = tflearn.lstm(encoder_inputs, 200, return_state=True, scope='encoder_lstm')
        encoder_output_sequence = tf.pack([encoder_output_tensor], axis=1)

        # Decoder: one LSTM cell unrolled step by step, sharing weights
        # via scope reuse.  First input is the GO token (inference) or
        # the first ground-truth decoder input (training).
        if feed_previous:
            first_dec_input = go_inputs
        else:
            first_dec_input = tf.slice(decoder_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
        decoder_output_tensor = tflearn.lstm(first_dec_input, 200, initial_state=states, return_seq=False, reuse=False, scope='decoder_lstm')
        decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
        decoder_output_sequence_list = [decoder_output_tensor]
        # Remaining steps: feed either the previous output (inference)
        # or the next ground-truth input (training).
        for i in range(self.max_seq_len-1):
            if feed_previous:
                next_dec_input = decoder_output_sequence_single
            else:
                next_dec_input = tf.slice(decoder_inputs, [0, i+1, 0], [-1, 1, self.word_vec_dim])
            decoder_output_tensor = tflearn.lstm(next_dec_input, 200, return_seq=False, reuse=True, scope='decoder_lstm')
            decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
            decoder_output_sequence_list.append(decoder_output_tensor)

        # Stack per-step outputs back into a sequence and prepend the
        # encoder's output so the full prediction matches Y's
        # (max_seq_len + 1) timesteps.
        decoder_output_sequence = tf.pack(decoder_output_sequence_list, axis=1)
        real_output_sequence = tf.concat(1, [encoder_output_sequence, decoder_output_sequence])

        net = tflearn.regression(real_output_sequence, optimizer='sgd', learning_rate=0.1, loss='mean_square')
        model = tflearn.DNN(net)
        return model

    def train(self):
        """Train on the generated data and save weights to ./model/model.

        Returns the fitted tflearn.DNN model.
        """
        trainXY, trainY = self.generate_trainig_data()
        model = self.model(feed_previous=False)
        model.fit(trainXY, trainY, n_epoch=100, snapshot_epoch=False)
        model.save('./model/model')
        return model

    def load(self):
        """Rebuild the graph in inference mode (feed_previous=True) and
        restore weights from ./model/model.  Returns the model.
        """
        model = self.model(feed_previous=True)
        model.load('./model/model')
        return model
233+
234+
if __name__ == '__main__':
    # Train from scratch and save weights under ./model/.
    my_seq2seq = MySeq2Seq()
    my_seq2seq.train()
    # To restore a previously trained model instead:
    #model = my_seq2seq.load()

chatbotv2/one_lstm_sequence_generate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def main():
106106
ylist = []
107107
test_X = None
108108
#for i in range(len(seq)-100):
109-
for i in range(10):
109+
for i in range(1000):
110110
sequence = seq[i:i+20]
111111
xlist.append(sequence)
112112
ylist.append(seq[i+20])
@@ -123,7 +123,7 @@ def main():
123123
net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1,
124124
loss='mean_square')
125125
model = tflearn.DNN(net)
126-
model.fit(X, Y, n_epoch=500, batch_size=10,snapshot_epoch=False,show_metric=True)
126+
model.fit(X, Y, n_epoch=500, batch_size=100,snapshot_epoch=False,show_metric=True)
127127
model.save("model")
128128
predict = model.predict([test_X])
129129
#print predict

0 commit comments

Comments
 (0)