Skip to content

Commit a18d332

Browse files
author
lichuang
committed
add my_seq2seq_v2.py
1 parent b24b8a5 commit a18d332

File tree

2 files changed

+259
-0
lines changed

2 files changed

+259
-0
lines changed

chatbotv2/my_seq2seq_v2.py

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import sys
4+
import math
5+
import tflearn
6+
import tensorflow as tf
7+
from tensorflow.python.ops import rnn_cell
8+
from tensorflow.python.ops import rnn
9+
import chardet
10+
import numpy as np
11+
import struct
12+
13+
# Parallel lists of question / answer sequences (lists of word vectors),
# filled by init_seq().
question_seqs = []
answer_seqs = []

max_w = 50                     # max bytes accepted per word when reading vectors.bin
float_size = 4                 # bytes per float in the word2vec binary format
word_vector_dict = {}          # unicode word -> embedding vector (loaded by load_vectors)
trained_word_vector_dict = {}  # subset of word_vector_dict actually seen in the pair corpus
word_vec_dim = 200             # number of embedding dimensions kept per word
max_seq_len = 8                # question/answer pairs longer than this are skipped
word_set = {}                  # corpus vocabulary, filled by load_word_set()
24+
def load_word_set():
    """Collect the corpus vocabulary into the module-level ``word_set`` dict.

    Reads the '|'-separated question|answer pair file and marks every
    space-separated token (decoded as UTF-8) with value 1.  A line without
    a '|' raises IndexError, exactly as the original code did.
    """
    # 'with' guarantees the handle is closed even if a line is malformed
    # (the original leaked the handle on any exception).
    with open('./segment_result_lined.3000000.pair.less', 'r') as file_object:
        # Iterating the file is equivalent to the readline()/break loop:
        # readline() only returns '' at EOF, never for blank lines.
        for line in file_object:
            line_pair = line.split('|')
            line_question = line_pair[0]
            line_answer = line_pair[1]
            for word in line_question.decode('utf-8').split(' '):
                word_set[word] = 1
            for word in line_answer.decode('utf-8').split(' '):
                word_set[word] = 1
40+
def load_vectors(input):
41+
"""从vectors.bin加载词向量,返回一个word_vector_dict的词典,key是词,value是200维的向量
42+
"""
43+
print "begin load vectors"
44+
45+
input_file = open(input, "rb")
46+
47+
# 获取词表数目及向量维度
48+
words_and_size = input_file.readline()
49+
words_and_size = words_and_size.strip()
50+
words = long(words_and_size.split(' ')[0])
51+
size = long(words_and_size.split(' ')[1])
52+
print "words =", words
53+
print "size =", size
54+
55+
for b in range(0, words):
56+
a = 0
57+
word = ''
58+
# 读取一个词
59+
while True:
60+
c = input_file.read(1)
61+
word = word + c
62+
if False == c or c == ' ':
63+
break
64+
if a < max_w and c != '\n':
65+
a = a + 1
66+
word = word.strip()
67+
68+
vector = []
69+
for index in range(0, size):
70+
m = input_file.read(float_size)
71+
(weight,) = struct.unpack('f', m)
72+
vector.append(float(weight))
73+
74+
# 将词及其对应的向量存到dict中
75+
76+
if word_set.has_key(word.decode('utf-8')):
77+
word_vector_dict[word.decode('utf-8')] = vector[0:word_vec_dim]
78+
79+
input_file.close()
80+
81+
print "load vectors finish"
82+
83+
def init_seq(input_file):
    """Load every question/answer pair from *input_file* as vector sequences.

    For each '|'-separated line, appends one list of word vectors to the
    module-level ``question_seqs`` and ``answer_seqs`` (words missing from
    ``word_vector_dict`` are silently dropped, as before).  Every word
    actually used is recorded in ``trained_word_vector_dict``.

    Improvements: 'with' closes the file on any exception, the unused
    ``vocab_dict`` local is gone, and ``in`` replaces the Python-2-only
    ``dict.has_key``.
    """
    with open(input_file, 'r') as file_object:
        for line in file_object:
            question_seq = []
            answer_seq = []
            line_pair = line.split('|')
            line_question = line_pair[0]
            line_answer = line_pair[1]
            for word in line_question.decode('utf-8').split(' '):
                if word in word_vector_dict:
                    question_seq.append(word_vector_dict[word])
                    trained_word_vector_dict[word] = word_vector_dict[word]
            for word in line_answer.decode('utf-8').split(' '):
                if word in word_vector_dict:
                    answer_seq.append(word_vector_dict[word])
                    trained_word_vector_dict[word] = word_vector_dict[word]
            question_seqs.append(question_seq)
            answer_seqs.append(answer_seq)
109+
110+
def vector_sqrtlen(vector):
    """Return the Euclidean (L2) norm of *vector*.

    The original accumulated into a variable named ``len``, shadowing the
    builtin; this version uses a generator sum, which adds the squares in
    the same order and therefore produces an identical float result.
    """
    return math.sqrt(sum(item * item for item in vector))
116+
117+
def vector_cosine(v1, v2):
    """Return the cosine similarity of two equal-length vectors.

    Keeps the original contract: a length mismatch terminates the process
    with exit status 1 rather than raising a catchable error.
    """
    if len(v1) != len(v2):
        sys.exit(1)
    dot_product = sum(a * b for a, b in zip(v1, v2))
    return dot_product / (vector_sqrtlen(v1) * vector_sqrtlen(v2))
126+
127+
128+
def vector2word(vector):
    """Map *vector* back to the closest known word by cosine similarity.

    Scans the whole ``word_vector_dict`` vocabulary and returns a
    ``(word, cosine)`` tuple for the best match; ties keep the first word
    encountered, and an empty vocabulary yields ``('', -10000)``.
    """
    best_word, best_cos = '', -10000
    for candidate, candidate_vec in word_vector_dict.items():
        score = vector_cosine(vector, candidate_vec)
        if score > best_cos:
            best_word, best_cos = candidate, score
    return (best_word, best_cos)
138+
139+
140+
class MySeq2Seq(object):
    """Encoder-decoder (seq2seq) chatbot model built on tflearn.

    Approach (translated from the original Chinese notes): the question
    and answer sequences are concatenated into one input tensor, then
    split apart again with tf.slice inside model().  The network follows
    the classic encoder-decoder paper.  Decoder outputs stay as
    word_vec_dim-sized vectors per time step, so the target y (sequences
    of word vectors, flattened the same way) can be fitted directly with
    a mean-square regression loss.
    """
    def __init__(self, max_seq_len = 16, word_vec_dim = 200, input_file='./segment_result_lined.3000000.pair.less'):
        # max_seq_len: maximum question/answer length (in words) kept for training
        # word_vec_dim: dimensionality of the word embeddings
        # input_file: '|'-separated question|answer pair corpus
        self.max_seq_len = max_seq_len
        self.word_vec_dim = word_vec_dim
        self.input_file = input_file

    def generate_trainig_data(self):
        """Build the (XY, Y) training arrays from the pair corpus.

        Loads the vocabulary, embeddings and sequences via the module-level
        helpers, then for every pair short enough to fit:
        XY = zero-padded, reversed question followed by the padded answer;
        Y  = the padded answer prefixed with an all-ones "GO" vector.
        Returns two numpy arrays (XY data, Y data).
        """
        load_word_set()
        load_vectors("./vectors.bin")
        init_seq(self.input_file)
        xy_data = []
        y_data = []
        for i in range(len(question_seqs)):
            question_seq = question_seqs[i]
            answer_seq = answer_seqs[i]
            # Pairs that do not fit in max_seq_len are skipped entirely.
            if len(question_seq) < self.max_seq_len and len(answer_seq) < self.max_seq_len:
                # Question: left-pad with zero vectors and reverse the word order.
                sequence_xy = [np.zeros(self.word_vec_dim)] * (self.max_seq_len-len(question_seq)) + list(reversed(question_seq))
                sequence_y = answer_seq + [np.zeros(self.word_vec_dim)] * (self.max_seq_len-len(answer_seq))
                sequence_xy = sequence_xy + sequence_y
                # Prepend an all-ones GO vector to the regression target.
                sequence_y = [np.ones(self.word_vec_dim)] + sequence_y
                xy_data.append(sequence_xy)
                y_data.append(sequence_y)

        return np.array(xy_data), np.array(y_data)

    def model(self, feed_previous=False):
        """Build and return the tflearn DNN for this seq2seq graph.

        feed_previous=False (training): the decoder consumes the ground-truth
        previous word at each step.  feed_previous=True (inference): each
        step consumes the decoder's own previous output.
        """
        # Split the combined XY input into encoder_inputs and decoder_inputs
        # with a GO head prepended (translated from the original comment).
        input_data = tflearn.input_data(shape=[None, self.max_seq_len*2, self.word_vec_dim], dtype=tf.float32, name = "XY")
        encoder_inputs = tf.slice(input_data, [0, 0, 0], [-1, self.max_seq_len, self.word_vec_dim], name="enc_in")
        decoder_inputs_tmp = tf.slice(input_data, [0, self.max_seq_len, 0], [-1, self.max_seq_len-1, self.word_vec_dim], name="dec_in_tmp")
        go_inputs = tf.ones_like(decoder_inputs_tmp)
        go_inputs = tf.slice(go_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
        decoder_inputs = tf.concat(1, [go_inputs, decoder_inputs_tmp], name="dec_in")

        # Encoder: feed encoder_inputs to an LSTM; it returns one output
        # (the first value of the predicted sequence) plus the state that
        # is handed to the decoder.
        (encoder_output_tensor, states) = tflearn.lstm(encoder_inputs, self.word_vec_dim, return_state=True, scope='encoder_lstm')
        encoder_output_sequence = tf.pack([encoder_output_tensor], axis=1)

        # Decoder: at inference time the previous step's output becomes the
        # next step's input, seeded with the GO slice; at training time the
        # ground-truth decoder_inputs are used instead.
        if feed_previous:
            first_dec_input = go_inputs
        else:
            first_dec_input = tf.slice(decoder_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
        decoder_output_tensor = tflearn.lstm(first_dec_input, self.word_vec_dim, initial_state=states, return_seq=False, reuse=False, scope='decoder_lstm')
        decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
        decoder_output_sequence_list = [decoder_output_tensor]
        # Remaining time steps share the decoder LSTM weights (reuse=True).
        for i in range(self.max_seq_len-1):
            if feed_previous:
                next_dec_input = decoder_output_sequence_single
            else:
                next_dec_input = tf.slice(decoder_inputs, [0, i+1, 0], [-1, 1, self.word_vec_dim])
            decoder_output_tensor = tflearn.lstm(next_dec_input, self.word_vec_dim, return_seq=False, reuse=True, scope='decoder_lstm')
            decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
            decoder_output_sequence_list.append(decoder_output_tensor)

        # Full predicted sequence = encoder's first output + decoder outputs.
        decoder_output_sequence = tf.pack(decoder_output_sequence_list, axis=1)
        real_output_sequence = tf.concat(1, [encoder_output_sequence, decoder_output_sequence])

        net = tflearn.regression(real_output_sequence, optimizer='sgd', learning_rate=0.1, loss='mean_square')
        model = tflearn.DNN(net)
        return model

    def train(self):
        """Train on the generated data and save weights to ./model/model."""
        trainXY, trainY = self.generate_trainig_data()
        model = self.model(feed_previous=False)
        model.fit(trainXY, trainY, n_epoch=1000, snapshot_epoch=False, batch_size=1)
        model.save('./model/model')
        return model

    def load(self):
        """Rebuild the inference graph (feed_previous=True) and load saved weights."""
        model = self.model(feed_previous=True)
        model.load('./model/model')
        return model
231+
232+
if __name__ == '__main__':
233+
phrase = sys.argv[1]
234+
if 3 == len(sys.argv):
235+
my_seq2seq = MySeq2Seq(word_vec_dim=word_vec_dim, max_seq_len=max_seq_len, input_file=sys.argv[2])
236+
else:
237+
my_seq2seq = MySeq2Seq(word_vec_dim=word_vec_dim, max_seq_len=max_seq_len)
238+
if phrase == 'train':
239+
my_seq2seq.train()
240+
else:
241+
model = my_seq2seq.load()
242+
trainXY, trainY = my_seq2seq.generate_trainig_data()
243+
predict = model.predict(trainXY)
244+
for sample in predict:
245+
print "predict answer"
246+
for w in sample[1:]:
247+
(match_word, max_cos) = vector2word(w)
248+
#if vector_sqrtlen(w) < 1:
249+
# break
250+
print match_word, max_cos, vector_sqrtlen(w)

chatbotv2/readme.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
python ../word_segment.py zhenhuanzhuan.txt zhenhuanzhuan.segment
2+
../word2vec/word2vec -train ./zhenhuanzhuan.segment -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
3+
4+
5+
6+
head -10000 ../subtitle/raw_subtitles/subtitle.corpus > subtitle.corpus.10000
7+
python ../word_segment.py subtitle.corpus.10000 subtitle.corpus.10000.segment
8+
../word2vec/word2vec -train ./subtitle.corpus.10000.segment -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
9+
cat subtitle.corpus.10000.segment | awk '{if(last!="")print last"|"$0;last=$0}' | sed 's/| /|/g' > subtitle.corpus.10000.segment.pair

0 commit comments

Comments
 (0)