
Commit 3f74881

Author: 李闯 (committed)
Commit message: encoder_decoder_seq2seq better
1 parent 0ffb01d commit 3f74881

1 file changed: chatbotv3/encoder_decoder_seq2seq.py
Lines changed: 186 additions & 67 deletions
@@ -14,13 +14,57 @@ def __init__(self):
         self.max_abs_weight = 32  # maximum absolute weight, used to normalize the word vectors
         self.max_seq_len = 8  # maximum sentence length (in words)
         self.word_vec_dim = 0  # word vector dimension, determined dynamically when reading the vectors.bin binary
-        self.epoch = 10000
+        self.epoch = 1000
         self.word_vector_dict = {}  # word vector dictionary, loaded from vectors.bin
+        self.one_hot_word_vector_dict = {}  # one-hot word vectors for the softmax, generated from the sample vocabulary
+        self.word_id_word_dict = {}
+        self.one_hot_word_vectors_dim = 1  # dimension of the one-hot softmax vectors; starts at 1, reserving 0 as the word_id of EOS
+        self.eos_word_id = 0
+        self.eos_word = 'EOS'
         self.vectors_bin_file = './vectors.bin'  # word vector binary file
         self.model_dir = './model/model'  # model file path
         self.n_hidden = 1000  # number of LSTM hidden-state units
         self.learning_rate = 0.01  # learning rate

+    def load_one_hot_word_vectors(self):
+
+        word_id_dict = {}
+        sample_file_object = open('./samples/1', 'r')
+        lines = sample_file_object.readlines()
+        for line in lines:
+            line = line.strip()
+            split = line.split('|')
+            if len(split) == 2:
+                answer = split[1]
+                segments = jieba.cut(answer)
+                for word in segments:
+                    if word not in word_id_dict:
+                        word_id_dict[word] = self.one_hot_word_vectors_dim
+                        self.word_id_word_dict[self.one_hot_word_vectors_dim] = word
+                        self.one_hot_word_vectors_dim = self.one_hot_word_vectors_dim + 1
+
+        # add an end-of-sequence marker
+        vector = np.zeros(self.one_hot_word_vectors_dim)
+        vector[self.eos_word_id] = 1
+        self.one_hot_word_vector_dict[self.eos_word] = vector
+        self.word_id_word_dict[self.eos_word_id] = self.eos_word
+
+        for line in lines:
+            line = line.strip()
+            split = line.split('|')
+            if len(split) == 2:
+                answer = split[1]
+                segments = jieba.cut(answer)
+                for word in segments:
+                    if word not in self.one_hot_word_vector_dict:
+                        word_id = word_id_dict[word]
+                        print word, word_id
+                        vector = np.zeros(self.one_hot_word_vectors_dim)
+                        vector[word_id] = 1
+                        self.one_hot_word_vector_dict[word] = vector
+
+        sample_file_object.close()
+
     def load_word_vectors(self):
         """Load the word vector binary file into memory"""
         float_size = 4  # one float is 4 bytes
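
The hunk above adds load_one_hot_word_vectors(), which builds a softmax vocabulary from the answer side of the samples: id 0 is reserved for EOS, every other answer word gets the next free id, and each word maps to a one-hot vector of the final vocabulary size. A minimal standalone sketch of the same scheme, assuming the './samples/1' file of question|answer lines and the jieba tokenizer used elsewhere in this file (build_one_hot_vocab is a hypothetical helper, not part of the commit):

import numpy as np
import jieba

EOS_WORD, EOS_ID = 'EOS', 0

def build_one_hot_vocab(sample_path='./samples/1'):
    # seed the vocabulary with EOS so that id 0 stays reserved for it
    word_to_id = {EOS_WORD: EOS_ID}
    with open(sample_path, 'r') as f:
        for line in f:
            parts = line.strip().split('|')
            if len(parts) != 2:
                continue
            # the vocabulary is built from answers only, as in the commit
            for word in jieba.cut(parts[1]):
                word_to_id.setdefault(word, len(word_to_id))
    dim = len(word_to_id)
    # materialize the one-hot vectors once the final dimension is known
    one_hot = {w: np.eye(dim)[i] for w, i in word_to_id.items()}
    id_to_word = {i: w for w, i in word_to_id.items()}
    return one_hot, id_to_word, dim

Collecting the ids first and materializing the vectors once the vocabulary size is known makes the second pass over the sample file in the committed version unnecessary.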
@@ -80,98 +124,133 @@ def next_batch(self):
                 answer = split[1]
                 print('question:[%s] answer:[%s]' % (question, answer))

+                good_sample = True
                 question_seq = [np.zeros(self.word_vec_dim)] * self.max_seq_len
                 answer_seq = [np.zeros(self.word_vec_dim)] * self.max_seq_len
+                answer_seq_one_hot = [np.zeros(self.one_hot_word_vectors_dim)] * self.max_seq_len
                 segments = jieba.cut(question)
                 for index, word in enumerate(segments):
                     if word in self.word_vector_dict:
                         vec = np.array(self.word_vector_dict[word]) / self.max_abs_weight
                         # guard against running past the maximum sequence length
                         if self.max_seq_len - index - 1 < 0:
+                            good_sample = False
                             break
                         # questions shorter than max_seq_len are zero-padded at the front and stored in reverse order
                         question_seq[self.max_seq_len - index - 1] = vec
+                    else:
+                        good_sample = False

                 segments = jieba.cut(answer)
+                last_index = 0
                 for index, word in enumerate(segments):
                     if word in self.word_vector_dict:
                         vec = np.array(self.word_vector_dict[word]) / self.max_abs_weight
                         # guard against running past the maximum sequence length
-                        if index >= self.max_seq_len:
+                        if index >= self.max_seq_len - 1:
+                            good_sample = False
                             break
                         answer_seq[index] = vec
-
-                xy = question_seq + EOS + answer_seq[0:-1]
-                y = answer_seq
-                XY.append(xy)
-                Y.append(y)
+                    else:
+                        good_sample = False
+
+                    if word in self.one_hot_word_vector_dict:
+                        vec = self.one_hot_word_vector_dict[word]
+                        answer_seq_one_hot[index] = vec
+                    else:
+                        good_sample = False
+                    last_index = index
+                # append EOS at the end of the sentence
+                answer_seq_one_hot[last_index + 1] = self.one_hot_word_vector_dict[self.eos_word]  # EOS
+
+                if good_sample:
+                    xy = question_seq + EOS + answer_seq[0:-1]
+                    y = answer_seq_one_hot
+                    XY.append(xy)
+                    Y.append(y)

         sample_file_object.close()

         return XY, Y

-    def model(self, x, y, weights, biases, predict=False):
-        encoder_inputs = tf.slice(x, [0, 0, 0], [1, self.max_seq_len, self.word_vec_dim])
-        encoder_inputs = tf.unstack(encoder_inputs, self.max_seq_len, 1)
-
-        if predict:
-            decoder_inputs = tf.slice(x, [0, self.max_seq_len, 0], [1, 1, self.word_vec_dim])
-            decoder_inputs = tf.unstack(decoder_inputs, 1, 1)
-        else:
-            decoder_inputs = tf.slice(x, [0, self.max_seq_len, 0], [1, self.max_seq_len, self.word_vec_dim])
-            decoder_inputs = tf.unstack(decoder_inputs, self.max_seq_len, 1)
-
-        target_outputs = tf.slice(y, [0, 0, 0], [1, self.max_seq_len, self.word_vec_dim])
-        target_outputs = tf.unstack(target_outputs, self.max_seq_len, 1)
-
-        encoder = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)
-        decoder = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)
-
-        encoder_outputs, states = rnn.static_rnn(encoder, encoder_inputs, dtype=tf.float32, scope='encoder')
-        if predict:
-            decoder_output, states = rnn.static_rnn(decoder, decoder_inputs, initial_state=states, dtype=tf.float32, scope='decoder')
-        else:
-            decoder_outputs, states = rnn.static_rnn(decoder, decoder_inputs, initial_state=states, dtype=tf.float32, scope='decoder')
-
-        optimizer = None
-        cost = None
-
-        if predict:
-            decoder_outputs = []
-            decoder_outputs.append(decoder_output)
-
-            for i in range(self.max_seq_len - 1):
-                decoder_output = tf.unstack(decoder_output, axis=1)[0]
-                decoder_output = tf.matmul(decoder_output, weights['out']) + tf.slice(biases['out'], [i, 0], [1, self.word_vec_dim])
-                decoder_output, states = rnn.static_rnn(decoder, [decoder_output], initial_state=states, dtype=tf.float32, scope='decoder')
-                decoder_outputs.append(decoder_output)
-            decoder_outputs = tf.unstack(decoder_outputs, axis=1)[0]
-            decoder_outputs = tf.unstack(decoder_outputs, axis=1)[0]
-            decoder_outputs = tf.matmul(decoder_outputs, weights['out']) + biases['out']
-        else:
-            decoder_outputs = tf.unstack(decoder_outputs, axis=1)[0]
-            decoder_outputs = tf.matmul(decoder_outputs, weights['out']) + biases['out']
-        target_outputs = tf.unstack(target_outputs, axis=1)[0]
-
-        cost = tf.losses.mean_squared_error(decoder_outputs, target_outputs)
-        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost)
-        return optimizer, cost, decoder_outputs, target_outputs, encoder_inputs, decoder_inputs
+    def model(self, x, y, weights, biases, training=True):
+        # note: the 6 in the shape comments below is one_hot_word_vectors_dim
+        # take the ABC (question) part of the first sample
+        encoder_inputs = tf.slice(x, [0, 0, 0], [1, self.max_seq_len, self.word_vec_dim])  # shape=(1, 8, 128)
+        # unstack into 2-D Tensors
+        encoder_inputs = tf.unstack(encoder_inputs, self.max_seq_len, 1)  # [<tf.Tensor shape=(1, 128)>, ...], 8 Tensors in total
+
+        # take the <EOS>WXYZ (decoder input) part of the first sample
+        decoder_inputs = tf.slice(x, [0, self.max_seq_len, 0], [1, self.max_seq_len, self.word_vec_dim])  # shape=(1, 8, 128)
+        decoder_inputs = decoder_inputs[0]  # shape=(8, 128)
+        # project to the decoder's input/output shape
+        decoder_inputs = tf.matmul(decoder_inputs, weights['enc2dec']) + biases['enc2dec']
+        # unstack into 2-D Tensors
+        decoder_inputs = tf.unstack([decoder_inputs], axis=1)  # [<tf.Tensor shape=(1, 6)>, ...], 8 Tensors in total
+
+        # take the WXYZ (target) part of the first sample
+        target_outputs = tf.slice(y, [0, 0, 0], [1, self.max_seq_len, self.one_hot_word_vectors_dim])  # shape=(1, 8, 6)
+        target_outputs = target_outputs[0]  # shape=(8, 6)
+
+        # build the network: a two-layer structure
+        encoder_layer1 = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)
+        encoder_layer2 = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)
+        decoder_layer1 = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)
+        decoder_layer2 = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)
+
+        # input: 8 Tensors of shape=(1, 128); output: 8 Tensors of shape=(1, 1000)
+        encoder_layer1_outputs, encoder_layer1_states = rnn.static_rnn(encoder_layer1, encoder_inputs, dtype=tf.float32, scope='encoder_layer1')
+        # input: 8 Tensors of shape=(1, 1000); output: 8 Tensors of shape=(1, 1000)
+        encoder_layer2_outputs, encoder_layer2_states = rnn.static_rnn(encoder_layer2, encoder_layer1_outputs, dtype=tf.float32, scope='encoder_layer2')
+        # feed the <EOS> from the decoder inputs
+        # input: one Tensor of shape=(1, 6) (the <EOS>); output: one Tensor of shape=(1, 1000)
+        decoder_layer1_outputs, decoder_layer1_states = rnn.static_rnn(decoder_layer1, decoder_inputs[:1], initial_state=encoder_layer1_states, dtype=tf.float32, scope='decoder_layer1')
+        # input: one Tensor of shape=(1, 1000); output: one Tensor of shape=(1, 1000)
+        decoder_layer2_outputs, decoder_layer2_states = rnn.static_rnn(decoder_layer2, decoder_layer1_outputs, initial_state=encoder_layer2_states, dtype=tf.float32, scope='decoder_layer2')
+
+        decoder_layer2_outputs_combine = []
+        decoder_layer2_outputs_combine.append(decoder_layer2_outputs)
+        for i in range(self.max_seq_len - 1):
+            decoder_layer2_outputs = tf.unstack(decoder_layer2_outputs, axis=1)[0]
+            decoder_layer2_outputs = tf.matmul(decoder_layer2_outputs, weights['hid2tar']) + biases['hid2tar'][i]
+            # input: one Tensor of shape=(1, 6); output: one Tensor of shape=(1, 1000)
+            if training:
+                decoder_layer1_outputs, decoder_layer1_states = rnn.static_rnn(decoder_layer1, decoder_inputs[i+1:i+2], initial_state=decoder_layer1_states, dtype=tf.float32, scope='decoder_layer1')
+            else:
+                decoder_layer1_outputs, decoder_layer1_states = rnn.static_rnn(decoder_layer1, [decoder_layer2_outputs], initial_state=decoder_layer1_states, dtype=tf.float32, scope='decoder_layer1')
+            # input: one Tensor of shape=(1, 1000); output: one Tensor of shape=(1, 1000)
+            decoder_layer2_outputs, decoder_layer2_states = rnn.static_rnn(decoder_layer2, decoder_layer1_outputs, initial_state=decoder_layer2_states, dtype=tf.float32, scope='decoder_layer2')
+            decoder_layer2_outputs_combine.append(decoder_layer2_outputs)
+
+        # the steps below turn the list of 8 shape=(1, 1000) arrays into 8 shape=(1, 1000) Tensors
+        decoder_layer2_outputs_combine = tf.unstack(decoder_layer2_outputs_combine, axis=1)[0]
+        decoder_layer2_outputs_combine = tf.unstack(decoder_layer2_outputs_combine, axis=1)[0]
+        decoder_layer2_outputs_combine = tf.unstack([decoder_layer2_outputs_combine], axis=1)
+        # reassign decoder_layer2_outputs
+        decoder_layer2_outputs = decoder_layer2_outputs_combine
+
+        decoder_layer2_outputs = tf.unstack(decoder_layer2_outputs, axis=1)[0]  # shape=(8, 1000)
+        decoder_layer2_outputs = tf.matmul(decoder_layer2_outputs, weights['hid2tar']) + biases['hid2tar']  # shape=(8, 6)
+
+        cost = tf.losses.mean_squared_error(decoder_layer2_outputs, target_outputs)
+        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost)
+
+        return optimizer, cost, decoder_layer2_outputs

     def train(self):
         x = tf.placeholder("float", [None, self.max_seq_len * 2, self.word_vec_dim])
-        y = tf.placeholder("float", [None, self.max_seq_len, self.word_vec_dim])
+        y = tf.placeholder("float", [None, self.max_seq_len, self.one_hot_word_vectors_dim])

         weights = {
-            'out': tf.Variable(tf.random_normal([self.n_hidden, self.word_vec_dim]))
+            'enc2dec': tf.Variable(tf.random_normal([self.word_vec_dim, self.one_hot_word_vectors_dim])),
+            'hid2tar': tf.Variable(tf.random_normal([self.n_hidden, self.one_hot_word_vectors_dim])),
         }
         biases = {
-            'out': tf.Variable(tf.random_normal([self.max_seq_len, self.word_vec_dim]))
+            'enc2dec': tf.Variable(tf.random_normal([self.max_seq_len, self.one_hot_word_vectors_dim])),
+            'hid2tar': tf.Variable(tf.random_normal([self.max_seq_len, self.one_hot_word_vectors_dim])),
         }

-        optimizer, cost, decoder_outputs, target_outputs, encoder_inputs, decoder_inputs = self.model(x, y, weights, biases)
+        optimizer, cost, decoder_layer2_outputs = self.model(x, y, weights, biases)

         init = tf.global_variables_initializer()
         sess = tf.Session()
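
The rewritten model() unrolls the decoder one step at a time so the two regimes differ only in what feeds step i+1: with training=True it is the projected gold decoder input (teacher forcing), while with training=False it is the decoder's own previous output projected through weights['hid2tar']. A framework-agnostic sketch of that loop; decoder_step and project are hypothetical stand-ins for the stacked decoder LSTM layers and the hid2tar projection, not the commit's code:

def decode(decoder_step, project, first_input, gold_inputs, max_seq_len, training):
    # first_input stands in for the <EOS> vector that starts decoding;
    # state stands in for the LSTM states handed over from the encoder
    outputs = []
    out, state = decoder_step(first_input, None)
    outputs.append(out)
    for i in range(max_seq_len - 1):
        prev = project(out)  # hidden state -> vocabulary-sized vector
        # teacher forcing vs. feeding back the model's own prediction
        nxt = gold_inputs[i + 1] if training else prev
        out, state = decoder_step(nxt, state)
        outputs.append(out)
    return outputs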
@@ -186,24 +265,56 @@ def train(self):
             train_Y = Y[step:]
             sess.run(optimizer, feed_dict={x: train_XY, y: train_Y})
             loss = sess.run(cost, feed_dict={x: train_XY, y: train_Y})
-            if i % 100 == 0 and step == 0:
+            if i % 1 == 0 and step == 0:
                 print 'i=%d, loss=%f' % (i, loss)

         saver = tf.train.Saver()
         saver.save(sess, self.model_dir)

+    def test(self):
+        x = tf.placeholder("float", [None, self.max_seq_len * 2, self.word_vec_dim])
+        y = tf.placeholder("float", [None, self.max_seq_len, self.one_hot_word_vectors_dim])
+
+        weights = {
+            'enc2dec': tf.Variable(tf.random_normal([self.word_vec_dim, self.one_hot_word_vectors_dim])),
+            'hid2tar': tf.Variable(tf.random_normal([self.n_hidden, self.one_hot_word_vectors_dim])),
+        }
+        biases = {
+            'enc2dec': tf.Variable(tf.random_normal([self.max_seq_len, self.one_hot_word_vectors_dim])),
+            'hid2tar': tf.Variable(tf.random_normal([self.max_seq_len, self.one_hot_word_vectors_dim])),
+        }
+
+        optimizer, cost, decoder_layer2_outputs = self.model(x, y, weights, biases, training=False)
+
+        init = tf.global_variables_initializer()
+        sess = tf.Session()
+        sess.run(init)
+        saver = tf.train.Saver()
+        saver.restore(sess, self.model_dir)
+
+        XY, Y = self.next_batch()
+        n_steps = len(XY)
+        for step in range(n_steps):
+            train_XY = XY[step:]
+            train_Y = Y[step:]
+            loss = sess.run(cost, feed_dict={x: train_XY, y: train_Y})
+            print sess.run(decoder_layer2_outputs, feed_dict={x: train_XY, y: train_Y})
+            print 'loss=%f' % loss

     def predict(self):
         x = tf.placeholder("float", [None, self.max_seq_len * 2, self.word_vec_dim])
-        y = tf.placeholder("float", [None, self.max_seq_len, self.word_vec_dim])
+        y = tf.placeholder("float", [None, self.max_seq_len, self.one_hot_word_vectors_dim])

         weights = {
-            'out': tf.Variable(tf.random_normal([self.n_hidden, self.word_vec_dim]))
+            'enc2dec': tf.Variable(tf.random_normal([self.word_vec_dim, self.one_hot_word_vectors_dim])),
+            'hid2tar': tf.Variable(tf.random_normal([self.n_hidden, self.one_hot_word_vectors_dim])),
         }
         biases = {
-            'out': tf.Variable(tf.random_normal([self.max_seq_len, self.word_vec_dim]))
+            'enc2dec': tf.Variable(tf.random_normal([self.max_seq_len, self.one_hot_word_vectors_dim])),
+            'hid2tar': tf.Variable(tf.random_normal([self.max_seq_len, self.one_hot_word_vectors_dim])),
         }

-        optimizer, cost, decoder_outputs, target_outputs, encoder_inputs, decoder_inputs = self.model(x, y, weights, biases, predict=True)
+        optimizer, cost, decoder_layer2_outputs = self.model(x, y, weights, biases, training=False)

         init = tf.global_variables_initializer()
         sess = tf.Session()
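
train() writes a checkpoint with tf.train.Saver that the new test() restores before evaluating. A minimal sketch of that save/restore split against the TensorFlow 1.x API this file uses; the run() wrapper and the single-variable graph are illustrative assumptions:

import tensorflow as tf

def run(model_dir='./model/model', training=True):
    v = tf.Variable(tf.zeros([1]))  # stands in for the seq2seq graph's variables
    with tf.Session() as sess:
        saver = tf.train.Saver()
        if training:
            sess.run(tf.global_variables_initializer())
            # ... optimizer steps would go here ...
            saver.save(sess, model_dir)
        else:
            # restore assigns every saved variable, so re-running the
            # initializer first (as test() does) is not strictly needed
            saver.restore(sess, model_dir)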
@@ -227,17 +338,25 @@ def predict(self):

         xy = question_seq + EOS + [np.zeros(self.word_vec_dim)] * (self.max_seq_len-1)
         XY.append(xy)
-        Y.append([np.zeros(self.word_vec_dim)] * self.max_seq_len)
-        print sess.run(decoder_outputs, feed_dict={x: XY, y: Y})
+        Y.append([np.zeros(self.one_hot_word_vectors_dim)] * self.max_seq_len)
+        output_seq = sess.run(decoder_layer2_outputs, feed_dict={x: XY, y: Y})
+        print output_seq
+        for vector in output_seq:
+            word_id = np.argmax(vector, axis=0)
+            print self.word_id_word_dict[word_id]


 def main(op):
+    np.set_printoptions(threshold='nan')
     lstm = MyLSTM()
     lstm.load_word_vectors()
+    lstm.load_one_hot_word_vectors()
     if op == 'train':
         lstm.train()
     elif op == 'predict':
         lstm.predict()
+    elif op == 'test':
+        lstm.test()
     else:
         print 'Usage:'

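predict() now reads the answer off the network by taking the argmax of each vocabulary-sized output vector and looking the id up in word_id_word_dict. A minimal sketch of that decoding step; stopping at the reserved EOS id 0 is an assumption consistent with how the vocabulary is built, while the committed code prints every position:

import numpy as np

def vectors_to_words(output_seq, id_to_word, eos_id=0):
    words = []
    for vector in output_seq:
        word_id = int(np.argmax(vector))
        if word_id == eos_id:
            break  # the reserved end-of-sequence id terminates the answer
        words.append(id_to_word[word_id])
    return words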