@@ -14,13 +14,57 @@ def __init__(self):
1414 self .max_abs_weight = 32 # 最大权重绝对值,用来对词向量做正规化
1515 self .max_seq_len = 8 # 最大句子长度(词)
1616 self .word_vec_dim = 0 # 词向量维度,读vectors.bin二进制时动态确定
17- self .epoch = 10000
17+ self .epoch = 1000
1818 self .word_vector_dict = {} # 词向量词典,加载vectors.bin读入
19+ self .one_hot_word_vector_dict = {} # 根据样本词汇生成的softmax用的词向量
20+ self .word_id_word_dict = {}
21+ self .one_hot_word_vectors_dim = 1 # softmax用的词向量维度,从1开始,保留0作为EOS的word_id
22+ self .eos_word_id = 0
23+ self .eos_word = 'EOS'
1924 self .vectors_bin_file = './vectors.bin' # 词向量二进制
2025 self .model_dir = './model/model' # 模型文件路径
2126 self .n_hidden = 1000 # lstm隐藏状态单元数目
2227 self .learning_rate = 0.01 # 学习率
2328
def load_one_hot_word_vectors(self):
    """Build one-hot vectors (for the softmax output layer) from the answer
    vocabulary of the sample file.

    Reads './samples/1' once; each line has the form 'question|answer'.
    Every distinct word segmented (via jieba) out of the answers gets a
    unique id starting at 1; id 0 is reserved for the EOS marker.
    Populates:
      - self.one_hot_word_vector_dict: word -> one-hot numpy vector
      - self.word_id_word_dict:        word id -> word
      - self.one_hot_word_vectors_dim: final vocabulary size (incl. EOS)
    """
    word_id_dict = {}
    # First pass: assign a dense id to every distinct answer word.
    # 'with' guarantees the file handle is closed (the original relied on
    # an explicit close() that a mid-loop exception would have skipped).
    with open('./samples/1', 'r') as sample_file_object:
        for line in sample_file_object:
            split = line.strip().split('|')
            if len(split) == 2:
                answer = split[1]
                for word in jieba.cut(answer):
                    if word not in word_id_dict:
                        word_id_dict[word] = self.one_hot_word_vectors_dim
                        self.word_id_word_dict[self.one_hot_word_vectors_dim] = word
                        self.one_hot_word_vectors_dim = self.one_hot_word_vectors_dim + 1

    # Append the end-of-sequence marker (reserved id 0).
    vector = np.zeros(self.one_hot_word_vectors_dim)
    vector[self.eos_word_id] = 1
    self.one_hot_word_vector_dict[self.eos_word] = vector
    self.word_id_word_dict[self.eos_word_id] = self.eos_word

    # The dimension is now final, so the one-hot vectors can be built
    # straight from word_id_dict.  The original code re-read and
    # re-segmented the whole sample file for this (and printed each
    # word/id pair as debug output) — both are unnecessary.
    for word, word_id in word_id_dict.items():
        vector = np.zeros(self.one_hot_word_vectors_dim)
        vector[word_id] = 1
        self.one_hot_word_vector_dict[word] = vector
67+
2468 def load_word_vectors (self ):
2569 """加载词向量二进制到内存"""
2670 float_size = 4 # 一个浮点数4字节
@@ -80,98 +124,133 @@ def next_batch(self):
80124 answer = split [1 ]
81125 print ('question:[%s] answer:[%s]' % (question , answer ))
82126
127+ good_sample = True
83128 question_seq = [np .zeros (self .word_vec_dim )] * self .max_seq_len
84129 answer_seq = [np .zeros (self .word_vec_dim )] * self .max_seq_len
130+ answer_seq_one_hot = [np .zeros (self .one_hot_word_vectors_dim )] * self .max_seq_len
85131 segments = jieba .cut (question )
86132 for index , word in enumerate (segments ):
87133 if word in self .word_vector_dict :
88134 vec = np .array (self .word_vector_dict [word ]) / self .max_abs_weight
89135 # 防止词过多越界
90136 if self .max_seq_len - index - 1 < 0 :
137+ good_sample = False
91138 break
92139 # 问题不足max_seq_len在前面补零,存储时倒序存储
93140 question_seq [self .max_seq_len - index - 1 ] = vec
141+ else :
142+ good_sample = False
94143
95144 segments = jieba .cut (answer )
145+ last_index = 0
96146 for index , word in enumerate (segments ):
97147 if word in self .word_vector_dict :
98148 vec = np .array (self .word_vector_dict [word ]) / self .max_abs_weight
99149 # 防止词过多越界
100- if index >= self .max_seq_len :
150+ if index >= self .max_seq_len - 1 :
151+ good_sample = False
101152 break
102153 answer_seq [index ] = vec
103-
104- xy = question_seq + EOS + answer_seq [0 :- 1 ]
105- y = answer_seq
106- XY .append (xy )
107- Y .append (y )
154+ else :
155+ good_sample = False
156+
157+ if word in self .one_hot_word_vector_dict :
158+ vec = self .one_hot_word_vector_dict [word ]
159+ answer_seq_one_hot [index ] = vec
160+ else :
161+ good_sample = False
162+ last_index = index
163+ # 句子末尾加上EOS
164+ answer_seq_one_hot [last_index + 1 ] = self .one_hot_word_vector_dict [self .eos_word ] # EOS
165+
166+ if good_sample :
167+ xy = question_seq + EOS + answer_seq [0 :- 1 ]
168+ y = answer_seq_one_hot
169+ XY .append (xy )
170+ Y .append (y )
108171
109172 sample_file_object .close ()
110173
111174 return XY , Y
112175
113- def model (self , x , y , weights , biases , predict = False ):
114- encoder_inputs = tf .slice (x , [0 , 0 , 0 ], [1 , self .max_seq_len , self .word_vec_dim ])
115- encoder_inputs = tf .unstack (encoder_inputs , self .max_seq_len , 1 )
116-
117- if predict :
118- decoder_inputs = tf .slice (x , [0 , self .max_seq_len , 0 ], [1 , 1 , self .word_vec_dim ])
119- decoder_inputs = tf .unstack (decoder_inputs , 1 , 1 )
120- else :
121- decoder_inputs = tf .slice (x , [0 , self .max_seq_len , 0 ], [1 , self .max_seq_len , self .word_vec_dim ])
122- decoder_inputs = tf .unstack (decoder_inputs , self .max_seq_len , 1 )
123-
124- target_outputs = tf .slice (y , [0 , 0 , 0 ], [1 , self .max_seq_len , self .word_vec_dim ])
125- target_outputs = tf .unstack (target_outputs , self .max_seq_len , 1 )
126-
127- encoder = rnn .BasicLSTMCell (self .n_hidden , forget_bias = 1.0 )
128- decoder = rnn .BasicLSTMCell (self .n_hidden , forget_bias = 1.0 )
129-
130- encoder_outputs , states = rnn .static_rnn (encoder , encoder_inputs , dtype = tf .float32 , scope = 'encoder' )
131- if predict :
132- decoder_output , states = rnn .static_rnn (decoder , decoder_inputs , initial_state = states , dtype = tf .float32 , scope = 'decoder' )
133- else :
134- decoder_outputs , states = rnn .static_rnn (decoder , decoder_inputs , initial_state = states , dtype = tf .float32 , scope = 'decoder' )
135-
136- optimizer = None
137- cost = None
138-
139- if predict :
140- decoder_outputs = []
141- decoder_outputs .append (decoder_output )
142-
143- for i in range (self .max_seq_len - 1 ):
144- decoder_output = tf .unstack (decoder_output , axis = 1 )[0 ]
145- decoder_output = tf .matmul (decoder_output , weights ['out' ]) + tf .slice (biases ['out' ], [i , 0 ],
146- [1 , self .word_vec_dim ])
147- decoder_output , states = rnn .static_rnn (decoder , [decoder_output ], initial_state = states ,
148- dtype = tf .float32 ,
149- scope = 'decoder' )
150- decoder_outputs .append (decoder_output )
151- decoder_outputs = tf .unstack (decoder_outputs , axis = 1 )[0 ]
152- decoder_outputs = tf .unstack (decoder_outputs , axis = 1 )[0 ]
153- decoder_outputs = tf .matmul (decoder_outputs , weights ['out' ]) + biases ['out' ]
154- else :
155- decoder_outputs = tf .unstack (decoder_outputs , axis = 1 )[0 ]
156- decoder_outputs = tf .matmul (decoder_outputs , weights ['out' ]) + biases ['out' ]
157- target_outputs = tf .unstack (target_outputs , axis = 1 )[0 ]
158-
159- cost = tf .losses .mean_squared_error (decoder_outputs , target_outputs )
160- optimizer = tf .train .AdamOptimizer (learning_rate = self .learning_rate ).minimize (cost )
161- return optimizer , cost , decoder_outputs , target_outputs , encoder_inputs , decoder_inputs
def model(self, x, y, weights, biases, training=True):
    """Build the two-layer LSTM encoder-decoder graph.

    x: placeholder (batch, max_seq_len*2, word_vec_dim) -- the question
       sequence followed by <EOS> + answer word vectors.
    y: placeholder (batch, max_seq_len, one_hot_word_vectors_dim) -- one-hot
       answer targets.
    training: True  -> the decoder is fed the ground-truth previous word
                       at each step (teacher forcing);
              False -> the decoder is fed its own previous output.
    Returns (optimizer, cost, decoder_layer2_outputs).

    NOTE(review): every tf.slice below takes size 1 along the batch axis,
    so only the FIRST sample of the fed batch participates in the graph --
    confirm this is intended.
    """
    # In the shape comments below, '6' stands for one_hot_word_vectors_dim
    # and 8/128/1000 are max_seq_len / word_vec_dim / n_hidden examples.
    # Take the first sample's question ("ABC").
    encoder_inputs = tf.slice(x, [0, 0, 0], [1, self.max_seq_len, self.word_vec_dim])  # shape=(1, 8, 128)
    # Unstack into a list of 2-D tensors.
    encoder_inputs = tf.unstack(encoder_inputs, self.max_seq_len, 1)  # 8 tensors, each shape=(1, 128)

    # Take the first sample's "<EOS>WXYZ" decoder input.
    decoder_inputs = tf.slice(x, [0, self.max_seq_len, 0], [1, self.max_seq_len, self.word_vec_dim])  # shape=(1, 8, 128)
    decoder_inputs = decoder_inputs[0]  # shape=(8, 128)
    # Project word vectors into the decoder's (one-hot sized) input space.
    decoder_inputs = tf.matmul(decoder_inputs, weights['enc2dec']) + biases['enc2dec']
    # Unstack into a list of 2-D tensors.
    decoder_inputs = tf.unstack([decoder_inputs], axis=1)  # 8 tensors, each shape=(1, 6)

    # Take the first sample's target "WXYZ".
    target_outputs = tf.slice(y, [0, 0, 0], [1, self.max_seq_len, self.one_hot_word_vectors_dim])  # shape=(1, 8, 6)
    target_outputs = target_outputs[0]  # shape=(8, 6)

    # Network structure: two stacked LSTM layers each for encoder and decoder.
    encoder_layer1 = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)
    encoder_layer2 = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)
    decoder_layer1 = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)
    decoder_layer2 = rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0)

    # Input: 8 tensors shape=(1, 128); output: 8 tensors shape=(1, 1000).
    encoder_layer1_outputs, encoder_layer1_states = rnn.static_rnn(encoder_layer1, encoder_inputs, dtype=tf.float32, scope='encoder_layer1')
    # Input: 8 tensors shape=(1, 1000); output: 8 tensors shape=(1, 1000).
    encoder_layer2_outputs, encoder_layer2_states = rnn.static_rnn(encoder_layer2, encoder_layer1_outputs, dtype=tf.float32, scope='encoder_layer2')
    # Feed the decoder's <EOS> token, seeding each decoder layer with the
    # final state of the corresponding encoder layer.
    # Input: 1 tensor shape=(1, 6) (<EOS>); output: 1 tensor shape=(1, 1000).
    decoder_layer1_outputs, decoder_layer1_states = rnn.static_rnn(decoder_layer1, decoder_inputs[:1], initial_state=encoder_layer1_states, dtype=tf.float32, scope='decoder_layer1')
    # Input: 1 tensor shape=(1, 1000); output: 1 tensor shape=(1, 1000).
    decoder_layer2_outputs, decoder_layer2_states = rnn.static_rnn(decoder_layer2, decoder_layer1_outputs, initial_state=encoder_layer2_states, dtype=tf.float32, scope='decoder_layer2')

    # Unroll the remaining max_seq_len-1 decoder steps, collecting the
    # layer-2 output of every step.
    decoder_layer2_outputs_combine = []
    decoder_layer2_outputs_combine.append(decoder_layer2_outputs)
    for i in range(self.max_seq_len - 1):
        decoder_layer2_outputs = tf.unstack(decoder_layer2_outputs, axis=1)[0]
        # Project the hidden state to target (one-hot) space.
        # NOTE(review): this uses bias row i per step, while the final
        # projection below adds the full bias matrix -- confirm intended.
        decoder_layer2_outputs = tf.matmul(decoder_layer2_outputs, weights['hid2tar']) + biases['hid2tar'][i]
        # Input: 1 tensor shape=(1, 6); output: 1 tensor shape=(1, 1000).
        if training:
            # Teacher forcing: feed the ground-truth next decoder input.
            decoder_layer1_outputs, decoder_layer1_states = rnn.static_rnn(decoder_layer1, decoder_inputs[i + 1:i + 2], initial_state=decoder_layer1_states, dtype=tf.float32, scope='decoder_layer1')
        else:
            # Inference: feed the decoder's own previous (projected) output.
            decoder_layer1_outputs, decoder_layer1_states = rnn.static_rnn(decoder_layer1, [decoder_layer2_outputs], initial_state=decoder_layer1_states, dtype=tf.float32, scope='decoder_layer1')
        # Input: 1 tensor shape=(1, 1000); output: 1 tensor shape=(1, 1000).
        decoder_layer2_outputs, decoder_layer2_states = rnn.static_rnn(decoder_layer2, decoder_layer1_outputs, initial_state=decoder_layer2_states, dtype=tf.float32, scope='decoder_layer2')
        decoder_layer2_outputs_combine.append(decoder_layer2_outputs)

    # Collapse the list of 8 arrays of shape (1, 1000) into one stacked tensor.
    decoder_layer2_outputs_combine = tf.unstack(decoder_layer2_outputs_combine, axis=1)[0]
    decoder_layer2_outputs_combine = tf.unstack(decoder_layer2_outputs_combine, axis=1)[0]
    decoder_layer2_outputs_combine = tf.unstack([decoder_layer2_outputs_combine], axis=1)
    # Reassign decoder_layer2_outputs to the combined result.
    decoder_layer2_outputs = decoder_layer2_outputs_combine

    decoder_layer2_outputs = tf.unstack(decoder_layer2_outputs, axis=1)[0]  # shape=(8, 1000)
    decoder_layer2_outputs = tf.matmul(decoder_layer2_outputs, weights['hid2tar']) + biases['hid2tar']  # shape=(8, 6)

    # MSE between projected decoder outputs and the one-hot targets.
    cost = tf.losses.mean_squared_error(decoder_layer2_outputs, target_outputs)
    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost)

    return optimizer, cost, decoder_layer2_outputs
162239
163240 def train (self ):
164241 x = tf .placeholder ("float" , [None , self .max_seq_len * 2 , self .word_vec_dim ])
165- y = tf .placeholder ("float" , [None , self .max_seq_len , self .word_vec_dim ])
242+ y = tf .placeholder ("float" , [None , self .max_seq_len , self .one_hot_word_vectors_dim ])
166243
167244 weights = {
168- 'out' : tf .Variable (tf .random_normal ([self .n_hidden , self .word_vec_dim ]))
245+ 'enc2dec' : tf .Variable (tf .random_normal ([self .word_vec_dim , self .one_hot_word_vectors_dim ])),
246+ 'hid2tar' : tf .Variable (tf .random_normal ([self .n_hidden , self .one_hot_word_vectors_dim ])),
169247 }
170248 biases = {
171- 'out' : tf .Variable (tf .random_normal ([self .max_seq_len , self .word_vec_dim ]))
249+ 'enc2dec' : tf .Variable (tf .random_normal ([self .max_seq_len , self .one_hot_word_vectors_dim ])),
250+ 'hid2tar' : tf .Variable (tf .random_normal ([self .max_seq_len , self .one_hot_word_vectors_dim ])),
172251 }
173252
174- optimizer , cost , decoder_outputs , target_outputs , encoder_inputs , decoder_inputs = self .model (x , y , weights , biases )
253+ optimizer , cost , decoder_layer2_outputs = self .model (x , y , weights , biases )
175254
176255 init = tf .global_variables_initializer ()
177256 sess = tf .Session ()
@@ -186,24 +265,56 @@ def train(self):
186265 train_Y = Y [step :]
187266 sess .run (optimizer , feed_dict = {x : train_XY , y : train_Y })
188267 loss = sess .run (cost , feed_dict = {x : train_XY , y : train_Y })
189- if i % 100 == 0 and step == 0 :
268+ if i % 1 == 0 and step == 0 :
190269 print 'i=%d, loss=%f' % (i , loss )
191270
192271 saver = tf .train .Saver ()
193272 saver .save (sess , self .model_dir )
194273
def test(self):
    """Restore the saved model and print the decoder output and loss for
    every sample produced by next_batch() (i.e. evaluation over the
    training data, not a held-out set).

    Builds the same graph shape as train() with training=False so the
    decoder consumes its own previous outputs.
    """
    # Placeholders must match the shapes used at training time.
    x = tf.placeholder("float", [None, self.max_seq_len * 2, self.word_vec_dim])
    y = tf.placeholder("float", [None, self.max_seq_len, self.one_hot_word_vectors_dim])

    # Variable shapes must match train() exactly or restore() will fail.
    weights = {
        'enc2dec': tf.Variable(tf.random_normal([self.word_vec_dim, self.one_hot_word_vectors_dim])),
        'hid2tar': tf.Variable(tf.random_normal([self.n_hidden, self.one_hot_word_vectors_dim])),
    }
    biases = {
        'enc2dec': tf.Variable(tf.random_normal([self.max_seq_len, self.one_hot_word_vectors_dim])),
        'hid2tar': tf.Variable(tf.random_normal([self.max_seq_len, self.one_hot_word_vectors_dim])),
    }

    # NOTE(review): the returned optimizer is never run here; only cost and
    # the decoder outputs are evaluated.
    optimizer, cost, decoder_layer2_outputs = self.model(x, y, weights, biases, training=False)

    init = tf.global_variables_initializer()
    sess = tf.Session()
    # NOTE(review): running init before restore is redundant -- restore()
    # overwrites every variable with the checkpoint values.
    sess.run(init)
    saver = tf.train.Saver()
    saver.restore(sess, self.model_dir)

    XY, Y = self.next_batch()
    n_steps = len(XY)
    # Slide through the batch; the graph only consumes the first sample of
    # each fed slice (see model()), so this evaluates one sample per step.
    for step in range(n_steps):
        train_XY = XY[step:]
        train_Y = Y[step:]
        loss = sess.run(cost, feed_dict={x: train_XY, y: train_Y})
        print sess.run(decoder_layer2_outputs, feed_dict={x: train_XY, y: train_Y})
        print 'loss=%f' % loss
303+
195304 def predict (self ):
196305 x = tf .placeholder ("float" , [None , self .max_seq_len * 2 , self .word_vec_dim ])
197- y = tf .placeholder ("float" , [None , self .max_seq_len , self .word_vec_dim ])
306+ y = tf .placeholder ("float" , [None , self .max_seq_len , self .one_hot_word_vectors_dim ])
198307
199308 weights = {
200- 'out' : tf .Variable (tf .random_normal ([self .n_hidden , self .word_vec_dim ]))
309+ 'enc2dec' : tf .Variable (tf .random_normal ([self .word_vec_dim , self .one_hot_word_vectors_dim ])),
310+ 'hid2tar' : tf .Variable (tf .random_normal ([self .n_hidden , self .one_hot_word_vectors_dim ])),
201311 }
202312 biases = {
203- 'out' : tf .Variable (tf .random_normal ([self .max_seq_len , self .word_vec_dim ]))
313+ 'enc2dec' : tf .Variable (tf .random_normal ([self .max_seq_len , self .one_hot_word_vectors_dim ])),
314+ 'hid2tar' : tf .Variable (tf .random_normal ([self .max_seq_len , self .one_hot_word_vectors_dim ])),
204315 }
205316
206- optimizer , cost , decoder_outputs , target_outputs , encoder_inputs , decoder_inputs = self .model (x , y , weights , biases , predict = True )
317+ optimizer , cost , decoder_layer2_outputs = self .model (x , y , weights , biases , training = False )
207318
208319 init = tf .global_variables_initializer ()
209320 sess = tf .Session ()
@@ -227,17 +338,25 @@ def predict(self):
227338
228339 xy = question_seq + EOS + [np .zeros (self .word_vec_dim )] * (self .max_seq_len - 1 )
229340 XY .append (xy )
230- Y .append ([np .zeros (self .word_vec_dim )] * self .max_seq_len )
231- print sess .run (decoder_outputs , feed_dict = {x : XY , y : Y })
341+ Y .append ([np .zeros (self .one_hot_word_vectors_dim )] * self .max_seq_len )
342+ output_seq = sess .run (decoder_layer2_outputs , feed_dict = {x : XY , y : Y })
343+ print output_seq
344+ for vector in output_seq :
345+ word_id = np .argmax (vector , axis = 0 )
346+ print self .word_id_word_dict [word_id ]
232347
233348
234349def main (op ):
350+ np .set_printoptions (threshold = 'nan' )
235351 lstm = MyLSTM ()
236352 lstm .load_word_vectors ()
353+ lstm .load_one_hot_word_vectors ()
237354 if op == 'train' :
238355 lstm .train ()
239356 elif op == 'predict' :
240357 lstm .predict ()
358+ elif op == 'test' :
359+ lstm .test ()
241360 else :
242361 print 'Usage:'
243362
0 commit comments