Skip to content

Commit 9990a21

Browse files
committed
Merge branch 'master' of github.com:warmheartli/ChatBotCourse
2 parents ead720e + a18d332 commit 9990a21

File tree

10 files changed

+2158
-4
lines changed

10 files changed

+2158
-4
lines changed

chatbotv2/lstm_train.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def create_model(max_word_id, is_test=False):
7676
decoder_inputs = tf.slice(network, [0, max_seq_len], [-1, max_seq_len], name="dec_in")
7777
decoder_inputs = tf.unpack(decoder_inputs, axis=1)
7878
go_input = tf.mul( tf.ones_like(decoder_inputs[0], dtype=tf.int32), GO_VALUE )
79-
decoder_inputs = [go_input] + decoder_inputs[: max_seq_len-1]
79+
decoder_inputs = [go_input] + decoder_inputs[: max_max_seq_len-1]
8080
num_encoder_symbols = max_word_id + 1 # 从0起始
8181
num_decoder_symbols = max_word_id + 2 # 包括GO
8282

@@ -96,7 +96,7 @@ def create_model(max_word_id, is_test=False):
9696

9797

9898

99-
targetY = tf.placeholder(shape=[None, max_seq_len], dtype=tf.int32, name="Y")
99+
targetY = tf.placeholder(shape=[None, max_seq_len], dtype=tf.float32, name="Y")
100100

101101
network = tflearn.regression(
102102
network,

chatbotv2/my_seq2seq.py

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import sys
4+
import math
5+
import tflearn
6+
import tensorflow as tf
7+
from tensorflow.python.ops import rnn_cell
8+
from tensorflow.python.ops import rnn
9+
import chardet
10+
import numpy as np
11+
import struct
12+
13+
# ---- Mutable module state -------------------------------------------------
# Flat, ordered list of word vectors for the corpus.
seq = []
# Maps a word to its embedding vector (a list of floats).
word_vector_dict = {}

# ---- Constants describing the vectors file --------------------------------
# Nominal cap on the length of a word stored in the vectors file.
max_w = 50
# Number of bytes occupied by one packed float component.
float_size = 4

# ---- Model dimensions -----------------------------------------------------
# Number of components kept from every word vector.
word_vec_dim = 200
# Number of words in a question and, separately, in an answer.
max_seq_len = 16
20+
21+
def load_vectors(input):
    """Load word vectors from a binary vectors file into ``word_vector_dict``.

    The file is expected to start with a text header line holding two
    integers, "<number of words> <vector size>".  Each entry that follows is
    the word's bytes terminated by a single space and then ``size`` packed
    floats (``struct`` format ``'f'``, ``float_size`` bytes each).

    Every word is UTF-8 decoded and stored as a key of the module-level
    ``word_vector_dict``; the value is a list of floats truncated to the
    first ``word_vec_dim`` components.  Progress messages are printed to
    stdout.

    Args:
        input: Path of the vectors file.  (The name shadows the builtin but
            is kept so existing keyword callers keep working.)

    Raises:
        EOFError: If the file ends while a word is being read.  The previous
            implementation looped forever in that situation.
        struct.error: If the file ends in the middle of a vector.
    """
    print("begin load vectors")

    # The context manager guarantees the handle is released even when the
    # file turns out to be malformed.
    with open(input, "rb") as input_file:
        # Header: number of words and the dimension of every vector.
        header = input_file.readline().split()
        words = int(header[0])
        size = int(header[1])
        print("words = %s" % words)
        print("size = %s" % size)

        # Unpack a whole vector with one call instead of one call per float.
        vector_format = '%df' % size
        vector_bytes = float_size * size

        for _ in range(words):
            # Read one word, byte by byte, up to the separating space.
            chars = []
            while True:
                c = input_file.read(1)
                if not c:
                    # read() returns an empty string at EOF (never False).
                    raise EOFError(
                        "unexpected end of vectors file while reading a word")
                if c == b' ':
                    break
                chars.append(c)
            # strip() drops the newline that follows the previous vector.
            word = b''.join(chars).strip()

            vector = list(
                struct.unpack(vector_format, input_file.read(vector_bytes)))

            # Store the word together with its (possibly truncated) vector.
            word_vector_dict[word.decode('utf-8')] = vector[0:word_vec_dim]

    print("load vectors finish")
62+
63+
def init_seq():
    """Read the segmented corpus and append every known word's vector to ``seq``.

    The file ``zhenhuanzhuan.segment`` is read line by line; each line is
    UTF-8 decoded and split on single spaces.  For every token found in the
    module-level ``word_vector_dict`` the corresponding vector is appended to
    the module-level ``seq`` list; unknown tokens are skipped.

    Surrounding ASCII whitespace (including the line terminator) is removed
    before splitting.  Previously the last token of a line kept its trailing
    newline and therefore could never match a dictionary key.
    """
    # Binary mode plus an explicit decode behaves the same on Python 2 and 3;
    # the context manager closes the file even if decoding fails.
    with open('zhenhuanzhuan.segment', 'rb') as file_object:
        for raw_line in file_object:
            for word in raw_line.strip().decode('utf-8').split(' '):
                if word in word_vector_dict:
                    seq.append(word_vector_dict[word])
77+
78+
def vector_sqrtlen(vector):
    """Return the Euclidean length (square root of the sum of squares) of ``vector``."""
    squared_total = sum(component * component for component in vector)
    return math.sqrt(squared_total)
84+
85+
def vector_cosine(v1, v2):
    """Return the cosine similarity of two equally long vectors.

    Args:
        v1: First sequence of numbers.
        v2: Second sequence of numbers; must have the same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``.  When either vector has zero length
        (all components zero, or no components at all) the similarity is
        undefined; ``0.0`` is returned instead of dividing by zero.

    Raises:
        SystemExit: With status 1 when the lengths differ (unchanged
            behavior), now preceded by a diagnostic on stderr.
    """
    if len(v1) != len(v2):
        sys.stderr.write(
            "vector_cosine: length mismatch (%d != %d)\n" % (len(v1), len(v2)))
        sys.exit(1)

    # Accumulate the dot product and both squared norms in a single pass.
    dot = 0
    squared1 = 0
    squared2 = 0
    for item1, item2 in zip(v1, v2):
        dot += item1 * item2
        squared1 += item1 * item1
        squared2 += item2 * item2

    denominator = math.sqrt(squared1) * math.sqrt(squared2)
    if denominator == 0:
        # Zero-length vector: avoid ZeroDivisionError / NaN.
        return 0.0
    return dot / denominator
94+
95+
96+
def vector2word(vector):
    """Find the dictionary word whose vector is most similar to ``vector``.

    Every entry of the module-level ``word_vector_dict`` is compared with
    ``vector`` using ``vector_cosine``; the first entry with the strictly
    highest similarity wins.

    Returns:
        A ``(word, cosine)`` tuple.  When no entry scores above the initial
        threshold of -10000 (for example, the dictionary is empty) the result
        is ``('', -10000)``.
    """
    best_word = ''
    best_cosine = -10000
    for candidate, candidate_vector in word_vector_dict.items():
        similarity = vector_cosine(vector, candidate_vector)
        if similarity > best_cosine:
            best_word, best_cosine = candidate, similarity
    return (best_word, best_cosine)
106+
107+
108+
class MySeq2Seq(object):
    """Encoder/decoder (seq2seq) model that works directly on word vectors.

    The question and the answer are fed together as one input tensor named
    "XY" of shape ``[None, max_seq_len * 2, word_vec_dim]`` and are split
    inside the graph with ``tf.slice``.  The network output is the encoder
    output followed by the ``max_seq_len`` decoder outputs, stacked along
    axis 1, and is trained as a mean-square regression against the target
    word vectors.
    """

    def __init__(self, max_seq_len = 16, word_vec_dim = 200):
        """Store the sequence length and the word-vector dimension."""
        self.word_vec_dim = word_vec_dim
        self.max_seq_len = max_seq_len

    def generate_trainig_data(self):
        """Load the vectors and the corpus, then build the sample arrays.

        Calls ``load_vectors("./vectors.bin")`` and ``init_seq()`` and cuts
        windows of ``2 * max_seq_len`` vectors out of the module-level
        ``seq``.  ``range(30, 40, 10)`` yields the single window index 30.
        The expected answer words are printed to stdout.

        Returns:
            ``(xy, y)`` as numpy arrays: ``xy`` holds question + answer
            vectors, ``y`` holds an all-ones vector followed by the answer
            vectors.
        """
        load_vectors("./vectors.bin")
        init_seq()

        # Question and answer are max_seq_len words each, so one window
        # spans twice that many vectors.
        window = self.max_seq_len * 2
        xy_samples = []
        y_samples = []
        for i in range(30, 40, 10):
            start = i * window
            middle = start + self.max_seq_len
            end = start + window
            question_and_answer = seq[start:end]
            answer = seq[middle:end]

            print("right answer")
            for answer_vector in answer:
                print(vector2word(answer_vector)[0])

            # The target carries one extra leading step made of ones.
            target = [np.ones(self.word_vec_dim)] + answer
            xy_samples.append(question_and_answer)
            y_samples.append(target)

        return np.array(xy_samples), np.array(y_samples)

    def model(self, feed_previous=False):
        """Build the tflearn graph and return it wrapped in ``tflearn.DNN``.

        Args:
            feed_previous: When False, each decoder step reads its input from
                the "XY" tensor.  When True, the first step reads the
                all-ones GO input and each later step reads the previous
                decoder output.
        """
        steps = self.max_seq_len
        dim = self.word_vec_dim

        # Derive encoder_inputs and the GO-prefixed decoder_inputs from XY.
        xy_input = tflearn.input_data(shape=[None, steps*2, dim], dtype=tf.float32, name = "XY")
        encoder_inputs = tf.slice(xy_input, [0, 0, 0], [-1, steps, dim], name="enc_in")
        answer_head = tf.slice(xy_input, [0, steps, 0], [-1, steps-1, dim], name="dec_in_tmp")
        # GO marker: a single time step filled with ones.
        go_inputs = tf.ones_like(answer_head)
        go_inputs = tf.slice(go_inputs, [0, 0, 0], [-1, 1, dim])
        decoder_inputs = tf.concat(1, [go_inputs, answer_head], name="dec_in")

        # Encoder: yields one output tensor and the state handed to the
        # first decoder step.
        (encoder_output, encoder_state) = tflearn.lstm(encoder_inputs, dim, return_state=True, scope='encoder_lstm')
        encoder_output_sequence = tf.pack([encoder_output], axis=1)

        # Decoder, first step: creates the 'decoder_lstm' variables and
        # starts from the encoder state.
        if feed_previous:
            step_input = go_inputs
        else:
            step_input = tf.slice(decoder_inputs, [0, 0, 0], [-1, 1, dim])
        step_output = tflearn.lstm(step_input, dim, initial_state=encoder_state, return_seq=False, reuse=False, scope='decoder_lstm')
        previous_output = tf.pack([step_output], axis=1)
        decoder_outputs = [step_output]

        # Decoder, remaining steps: reuse the 'decoder_lstm' variables.
        # NOTE(review): no initial_state is passed here, so whether the
        # recurrent state carries over between steps depends on
        # tflearn.lstm -- confirm.
        for step in range(1, steps):
            if feed_previous:
                step_input = previous_output
            else:
                step_input = tf.slice(decoder_inputs, [0, step, 0], [-1, 1, dim])
            step_output = tflearn.lstm(step_input, dim, return_seq=False, reuse=True, scope='decoder_lstm')
            previous_output = tf.pack([step_output], axis=1)
            decoder_outputs.append(step_output)

        decoder_output_sequence = tf.pack(decoder_outputs, axis=1)
        real_output_sequence = tf.concat(1, [encoder_output_sequence, decoder_output_sequence])

        net = tflearn.regression(real_output_sequence, optimizer='sgd', learning_rate=0.1, loss='mean_square')
        return tflearn.DNN(net)

    def train(self):
        """Fit the model for 1000 epochs, save it to './model/model', return it."""
        trainXY, trainY = self.generate_trainig_data()
        trained = self.model(feed_previous=False)
        trained.fit(trainXY, trainY, n_epoch=1000, snapshot_epoch=False)
        trained.save('./model/model')
        return trained

    def load(self):
        """Build the model with feed_previous=True and load './model/model' into it."""
        restored = self.model(feed_previous=True)
        restored.load('./model/model')
        return restored
194+
195+
if __name__ == '__main__':
    # Usage:
    #   python my_seq2seq.py train      train the model and save it
    #   python my_seq2seq.py <other>    load the saved model and print the
    #                                   predicted answer words
    if len(sys.argv) < 2:
        # Previously a missing argument surfaced as a bare IndexError.
        sys.exit("usage: %s train|predict" % sys.argv[0])

    phrase = sys.argv[1]
    my_seq2seq = MySeq2Seq(word_vec_dim=word_vec_dim, max_seq_len=max_seq_len)
    if phrase == 'train':
        my_seq2seq.train()
    else:
        # Any argument other than 'train' runs prediction.
        model = my_seq2seq.load()
        trainXY, trainY = my_seq2seq.generate_trainig_data()
        predict = model.predict(trainXY)
        for sample in predict:
            print("predict answer")
            # The first output step is the encoder output; skip it.
            for w in sample[1:]:
                (match_word, max_cos) = vector2word(w)
                print("%s %s" % (match_word, max_cos))

0 commit comments

Comments
 (0)