Skip to content

Commit f8c7b46

Browse files
author
lichuang
committed
add my_seq2seq.py
1 parent e56cc38 commit f8c7b46

File tree

2 files changed

+239
-2
lines changed

2 files changed

+239
-2
lines changed

chatbotv2/my_seq2seq.py

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import sys
4+
import math
5+
import tflearn
6+
import tensorflow as tf
7+
from tensorflow.python.ops import rnn_cell
8+
from tensorflow.python.ops import rnn
9+
import chardet
10+
import numpy as np
11+
import struct
12+
13+
# Flat list of word vectors (one embedding per corpus word, in corpus
# order); filled by init_seq() and sliced into training samples by
# MySeq2Seq.generate_trainig_data().
seq = []

# Intended cap on word length (in bytes) when reading vectors.bin.
# NOTE(review): load_vectors counts up to max_w but never truncates —
# appears vestigial, kept for fidelity with the word2vec C reader.
max_w = 50
# Size in bytes of one packed IEEE-754 float32 in vectors.bin.
float_size = 4
# Maps word (unicode) -> its embedding as a list of floats;
# populated by load_vectors().
word_vector_dict = {}
18+
19+
def load_vectors(input):
20+
"""从vectors.bin加载词向量,返回一个word_vector_dict的词典,key是词,value是200维的向量
21+
"""
22+
print "begin load vectors"
23+
24+
input_file = open(input, "rb")
25+
26+
# 获取词表数目及向量维度
27+
words_and_size = input_file.readline()
28+
words_and_size = words_and_size.strip()
29+
words = long(words_and_size.split(' ')[0])
30+
size = long(words_and_size.split(' ')[1])
31+
print "words =", words
32+
print "size =", size
33+
34+
for b in range(0, words):
35+
a = 0
36+
word = ''
37+
# 读取一个词
38+
while True:
39+
c = input_file.read(1)
40+
word = word + c
41+
if False == c or c == ' ':
42+
break
43+
if a < max_w and c != '\n':
44+
a = a + 1
45+
word = word.strip()
46+
47+
vector = []
48+
for index in range(0, size):
49+
m = input_file.read(float_size)
50+
(weight,) = struct.unpack('f', m)
51+
vector.append(float(weight))
52+
53+
# 将词及其对应的向量存到dict中
54+
word_vector_dict[word.decode('utf-8')] = vector
55+
#word_vector_dict[word.decode('utf-8')] = vector[0:4]
56+
57+
input_file.close()
58+
59+
print "load vectors finish"
60+
61+
def init_seq():
    """Read the pre-segmented corpus 'zhenhuanzhuan.segment' and append
    the embedding of every known word to the module-level `seq` list.

    Words that have no entry in word_vector_dict are silently skipped,
    so `seq` contains only in-vocabulary vectors, in corpus order.
    Requires load_vectors() to have been called first.
    """
    # `with` guarantees the file is closed even on error (the original
    # leaked the handle on an exception); iterating the file object
    # replaces the manual readline()/break loop.
    with open('zhenhuanzhuan.segment', 'r') as file_object:
        for line in file_object:
            for word in line.decode('utf-8').split(' '):
                # `in` supersedes the deprecated dict.has_key().
                if word in word_vector_dict:
                    seq.append(word_vector_dict[word])
75+
76+
def vector_sqrtlen(vector):
    """Return the Euclidean (L2) norm of *vector*.

    Accepts any iterable of numbers; returns 0.0 for an empty vector.
    """
    # One-pass sum of squares; the original accumulated into a local
    # named `len`, shadowing the builtin.
    return math.sqrt(sum(item * item for item in vector))
82+
83+
def vector_cosine(v1, v2):
    """Return the cosine similarity of two equal-length vectors.

    Raises:
        ValueError: if the vectors differ in length.  (The original
        called sys.exit(1) here, which terminated the whole process
        from inside a library helper and was uncatchable as an error.)
    """
    if len(v1) != len(v2):
        raise ValueError("vector_cosine: length mismatch (%d vs %d)"
                         % (len(v1), len(v2)))
    # Dot product via zip instead of a manual accumulator loop.
    dot = sum(item1 * item2 for item1, item2 in zip(v1, v2))
    return dot / (vector_sqrtlen(v1) * vector_sqrtlen(v2))
92+
93+
94+
def vector2word(vector):
    """Map an arbitrary embedding back to the closest vocabulary word.

    Scans word_vector_dict and returns a (word, cosine) pair for the
    entry with the highest cosine similarity to *vector*; returns
    ('', -10000) when the dictionary is empty.
    """
    best_word = ''
    best_cos = -10000
    for candidate, candidate_vec in word_vector_dict.items():
        score = vector_cosine(vector, candidate_vec)
        # Strict '>' keeps the first-seen word on ties.
        if score > best_cos:
            best_cos = score
            best_word = candidate
    return (best_word, best_cos)
104+
105+
106+
class MySeq2Seq(object):
    """Encoder/decoder sequence-to-sequence model on tflearn / TF 0.x.

    Approach (translated from the original notes): the input and target
    sequences are concatenated into a single XY tensor and split apart
    with tf.slice; encoder and decoder follow the classic seq2seq paper.
    The decoder outputs are stacked back into a (?, seq_len, 200)
    sequence so a plain tflearn regression with mean-square loss works;
    the target y is shaped the same way.
    """
    def __init__(self, max_seq_len = 16):
        # Length of ONE side (question or answer); the combined XY
        # input is 2 * max_seq_len timesteps long.
        self.max_seq_len = max_seq_len
        # Embedding dimensionality; must match the vectors.bin size
        # loaded by load_vectors() — TODO confirm (hard-coded 200 here).
        self.word_vec_dim = 200

    def generate_trainig_data(self):
        """Build (XY, Y) training arrays from the segmented corpus.

        NOTE: method name keeps its original spelling ("trainig") so
        existing callers don't break.

        Returns a pair of numpy arrays: XY with shape
        (samples, 2*max_seq_len, word_vec_dim) and Y with shape
        (samples, max_seq_len+1, word_vec_dim); Y is the answer half
        prefixed with an all-ones vector (apparently the GO token —
        see the ones_like GO input in model(); verify).
        """
        load_vectors("./vectors.bin")
        init_seq()
        xy_data = []
        y_data = []
        # Only samples at i = 10, 20 are taken (range(10,30,10)) —
        # looks like a deliberately tiny training set for debugging.
        for i in range(10,30,10):
            # Question and answer are each max_seq_len (16) words, so
            # one sample spans 32 consecutive corpus words.
            start = i*self.max_seq_len*2
            middle = i*self.max_seq_len*2 + self.max_seq_len
            end = (i+1)*self.max_seq_len*2
            sequence_xy = seq[start:end]
            sequence_y = seq[middle:end]
            # Prefix the target with an all-ones "GO" vector.
            sequence_y = [np.ones(self.word_vec_dim)] + sequence_y
            xy_data.append(sequence_xy)
            y_data.append(sequence_y)

        return np.array(xy_data), np.array(y_data)

    # NOTE(review): this helper only computes the encoder state and
    # returns None — it appears incomplete/unused; nothing in this file
    # calls it.
    def embedding_rnn_seq2seq(self, encoder_inputs,
                              decoder_inputs,
                              cell,
                              output_projection=None,
                              feed_previous=False,
                              dtype=None,
                              scope=None):
        _, encoder_state = rnn.rnn(cell, encoder_inputs, dtype=dtype, scope=scope)

    # NOTE(review): superseded by model() below (kept as "_bak");
    # differs in using a zeros GO input and in only feeding previous
    # outputs on the feed_previous branch.
    def model_bak(self, feed_previous=False):
        """Build the older graph variant; returns a tflearn.DNN model."""
        # Split the combined XY input into encoder_inputs and a
        # GO-prefixed decoder_inputs.
        input_data = tflearn.input_data(shape=[None, self.max_seq_len*2, self.word_vec_dim], dtype=tf.float32, name = "XY")
        encoder_inputs = tf.slice(input_data, [0, 0, 0], [-1, self.max_seq_len, self.word_vec_dim], name="enc_in")
        decoder_inputs_tmp = tf.slice(input_data, [0, self.max_seq_len, 0], [-1, self.max_seq_len-1, self.word_vec_dim], name="dec_in_tmp")
        # GO token here is all zeros (model() uses all ones — TODO
        # confirm which matches the np.ones GO in the training data).
        go_inputs = tf.zeros_like(decoder_inputs_tmp)
        go_inputs = tf.slice(go_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
        # TF 0.x concat signature: axis comes first.
        decoder_inputs = tf.concat(1, [go_inputs, decoder_inputs_tmp], name="dec_in")

        # Encoder: feed encoder_inputs to an LSTM; get its last output
        # (used as the first predicted step) and its state (handed to
        # the decoder).
        (encoder_output_tensor, states) = tflearn.lstm(encoder_inputs, 200, return_state=True, scope='encoder_lstm')
        encoder_output_sequence = tf.pack([encoder_output_tensor], axis=1)

        # Decoder.
        if feed_previous:
            # Inference: each step's output is the next step's input;
            # the encoder's last output seeds the first step.
            decoder_output_tensor = tflearn.lstm(encoder_output_sequence, 200, initial_state=states, return_seq=False, reuse=False, scope='decoder_lstm')
            decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
            decoder_output_sequence_list = [decoder_output_tensor]
            # Then keep feeding the decoder its own previous output.
            for i in range(self.max_seq_len-1):
                decoder_output_tensor = tflearn.lstm(decoder_output_sequence_single, 200, return_seq=False, reuse=True, scope='decoder_lstm')
                decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
                decoder_output_sequence_list.append(decoder_output_tensor)
        else:
            # Training: feed the ground-truth decoder_inputs and get the
            # whole output sequence at once.
            decoder_output_sequence_list = tflearn.lstm(decoder_inputs, 200, initial_state=states, return_seq=True, reuse=False, scope='decoder_lstm')

        decoder_output_sequence = tf.pack(decoder_output_sequence_list, axis=1)
        real_output_sequence = tf.concat(1, [encoder_output_sequence, decoder_output_sequence])

        net = tflearn.regression(real_output_sequence, optimizer='sgd', learning_rate=0.1, loss='mean_square')
        model = tflearn.DNN(net)
        return model

    def model(self, feed_previous=False):
        """Build the seq2seq graph and return a tflearn.DNN model.

        feed_previous=False (training): the decoder is fed the
        ground-truth previous timestep from decoder_inputs.
        feed_previous=True (inference): the decoder is fed its own
        previous output, starting from the GO input.
        """
        # Split the combined XY input into encoder_inputs and a
        # GO-prefixed decoder_inputs.
        input_data = tflearn.input_data(shape=[None, self.max_seq_len*2, self.word_vec_dim], dtype=tf.float32, name = "XY")
        encoder_inputs = tf.slice(input_data, [0, 0, 0], [-1, self.max_seq_len, self.word_vec_dim], name="enc_in")
        decoder_inputs_tmp = tf.slice(input_data, [0, self.max_seq_len, 0], [-1, self.max_seq_len-1, self.word_vec_dim], name="dec_in_tmp")
        # All-ones GO token, matching the np.ones prefix that
        # generate_trainig_data puts on the targets.
        go_inputs = tf.ones_like(decoder_inputs_tmp)
        go_inputs = tf.slice(go_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
        # TF 0.x concat signature: axis comes first.
        decoder_inputs = tf.concat(1, [go_inputs, decoder_inputs_tmp], name="dec_in")

        # Encoder: last output (first predicted step) plus final state
        # (handed to the decoder as its initial state).
        (encoder_output_tensor, states) = tflearn.lstm(encoder_inputs, 200, return_state=True, scope='encoder_lstm')
        encoder_output_sequence = tf.pack([encoder_output_tensor], axis=1)

        # Decoder: one LSTM cell unrolled step by step, sharing weights
        # via scope reuse.  First input is the GO token (inference) or
        # the first ground-truth decoder input (training).
        if feed_previous:
            first_dec_input = go_inputs
        else:
            first_dec_input = tf.slice(decoder_inputs, [0, 0, 0], [-1, 1, self.word_vec_dim])
        decoder_output_tensor = tflearn.lstm(first_dec_input, 200, initial_state=states, return_seq=False, reuse=False, scope='decoder_lstm')
        decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
        decoder_output_sequence_list = [decoder_output_tensor]
        # Remaining steps: feed either the previous output (inference)
        # or the next ground-truth input (training).
        for i in range(self.max_seq_len-1):
            if feed_previous:
                next_dec_input = decoder_output_sequence_single
            else:
                next_dec_input = tf.slice(decoder_inputs, [0, i+1, 0], [-1, 1, self.word_vec_dim])
            decoder_output_tensor = tflearn.lstm(next_dec_input, 200, return_seq=False, reuse=True, scope='decoder_lstm')
            decoder_output_sequence_single = tf.pack([decoder_output_tensor], axis=1)
            decoder_output_sequence_list.append(decoder_output_tensor)

        # Stack per-step outputs back into a sequence and prepend the
        # encoder's output so the full prediction matches Y's
        # (max_seq_len + 1) timesteps.
        decoder_output_sequence = tf.pack(decoder_output_sequence_list, axis=1)
        real_output_sequence = tf.concat(1, [encoder_output_sequence, decoder_output_sequence])

        net = tflearn.regression(real_output_sequence, optimizer='sgd', learning_rate=0.1, loss='mean_square')
        model = tflearn.DNN(net)
        return model

    def train(self):
        """Train on the generated data and save weights to ./model/model.

        Returns the fitted tflearn.DNN model.
        """
        trainXY, trainY = self.generate_trainig_data()
        model = self.model(feed_previous=False)
        model.fit(trainXY, trainY, n_epoch=100, snapshot_epoch=False)
        model.save('./model/model')
        return model

    def load(self):
        """Rebuild the graph in inference mode (feed_previous=True) and
        restore weights from ./model/model.  Returns the model.
        """
        model = self.model(feed_previous=True)
        model.load('./model/model')
        return model
233+
234+
if __name__ == '__main__':
    # Train from scratch and save weights under ./model/.
    my_seq2seq = MySeq2Seq()
    my_seq2seq.train()
    # To restore a previously trained model instead:
    #model = my_seq2seq.load()

chatbotv2/one_lstm_sequence_generate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def main():
106106
ylist = []
107107
test_X = None
108108
#for i in range(len(seq)-100):
109-
for i in range(10):
109+
for i in range(1000):
110110
sequence = seq[i:i+20]
111111
xlist.append(sequence)
112112
ylist.append(seq[i+20])
@@ -123,7 +123,7 @@ def main():
123123
net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1,
124124
loss='mean_square')
125125
model = tflearn.DNN(net)
126-
model.fit(X, Y, n_epoch=500, batch_size=10,snapshot_epoch=False,show_metric=True)
126+
model.fit(X, Y, n_epoch=500, batch_size=100,snapshot_epoch=False,show_metric=True)
127127
model.save("model")
128128
predict = model.predict([test_X])
129129
#print predict

0 commit comments

Comments
 (0)