add files

lichuang · lichuang · commit ee25720525e1 · 2016-10-28T09:06:14.000+08:00
diff --git a/chatbotv2/lstm_train.py b/chatbotv2/lstm_train.py
@@ -8,7 +8,8 @@
 import tflearn
 
 max_seq_len = 8
-learning_rate = 0.001
+learning_rate = 0.01
+id_word_dict = {}
 
 # 得到了单词转id的词典是word_id_dict, 最大单词id是max_word_id
 def init_word_id_dict():
@@ -43,9 +44,13 @@ def init_word_id_dict():
 
     uuid = 1
 
-    max_word_id=1500
+    max_word_id=2000
     for (word, freq) in vocab_dict:
         word_id_dict[word] = uuid
+        id_word_dict[uuid] = word
+        #if freq > 20:
+        #    print word, uuid, freq
+        print word, uuid, freq
         uuid = uuid + 1
         if uuid > max_word_id:
             break
@@ -63,7 +68,7 @@ def accuracy(y_pred, y_true, x_in):
     pred_idx = tf.to_int32(tf.argmax(y_pred, 2))
     return tf.reduce_mean(tf.cast(tf.equal(pred_idx, y_true), tf.float32), name='acc')
 
-def create_model(max_word_id):
+def create_model(max_word_id, is_test=False):
     GO_VALUE = max_word_id + 1
     network = tflearn.input_data(shape=[None, max_seq_len + max_seq_len], dtype=tf.int32, name="XY")
     encoder_inputs = tf.slice(network, [0, 0], [-1, max_seq_len], name="enc_in")
@@ -75,7 +80,7 @@ def create_model(max_word_id):
     num_encoder_symbols = max_word_id + 1 # 从0起始
     num_decoder_symbols = max_word_id + 2 # 包括GO
 
-    cell = rnn_cell.BasicLSTMCell(max_seq_len+max_seq_len, state_is_tuple=True)
+    cell = rnn_cell.BasicLSTMCell(16*max_seq_len, state_is_tuple=True)
 
     model_outputs, states = seq2seq.embedding_rnn_seq2seq(
             encoder_inputs,
@@ -84,7 +89,7 @@ def create_model(max_word_id):
             num_encoder_symbols=num_encoder_symbols,
             num_decoder_symbols=num_decoder_symbols,
             embedding_size=max_word_id,
-            feed_previous=False)
+            feed_previous=is_test)
 
     network = tf.pack(model_outputs, axis=1)
 
@@ -107,12 +112,22 @@ def create_model(max_word_id):
     print "create DNN model finish"
     return model
 
+def print_sentence(list, msg):
+    sentence = msg
+    for item in list:
+        if item != 0:
+            sentence = sentence + id_word_dict[item]
+    print sentence
 
 if __name__ == '__main__':
+    if len(sys.argv) > 1 and sys.argv[1] == 'test':
+        is_test = True
+    else:
+        is_test = False
     (word_id_dict, max_word_id) = init_word_id_dict()
     print "max_word_id =", max_word_id
 
-    model = create_model(max_word_id)
+    model = create_model(max_word_id, is_test)
 
     threshold = max_seq_len
     file_object = open("chat_dev.data", "r")
@@ -138,14 +153,20 @@ def create_model(max_word_id):
             # 保证连续的话才参与训练
             if last_line_no != 0 and last_line_no == cur_line_no - 1:
                 question_id_list = []
+                question = ""
+                answer = ""
                 question_array = np.zeros(max_seq_len + max_seq_len)
                 answer_array = np.zeros(max_seq_len)
                 idx = 0
+                question_has_word = False
+                answer_has_word = False
                 for word in last_words:
                     if len(word)>0 and word_id_dict.has_key(word):
                         word_id = word_id_dict[word]
                         question_id_list.append(word_id)
+                        question = question + word
                         question_array[idx] = word_id
+                        question_has_word = True
                         idx = idx + 1
                 for i in range(max_seq_len - len(question_id_list)):
                     question_id_list.append(0)
@@ -157,20 +178,21 @@ def create_model(max_word_id):
                     if len(word)>0 and word_id_dict.has_key(word):
                         word_id = word_id_dict[word]
                         answer_id_list.append(word_id)
+                        answer = answer + word
                         question_array[max_seq_len + idx] = word_id
                         answer_array[idx] = word_id
+                        answer_has_word = True
                         idx = idx + 1
                 for i in range(2*max_seq_len - len(question_id_list)):
                     answer_id_list.append(0)
                 question_id_list.extend(answer_id_list)
 
-                XY.append(question_array)
-                Y.append(answer_array)
-                sample_count = sample_count + 1
-
-                #if sample_count > 0:
-                #    break
-
+                if question_has_word and answer_has_word:
+                    #print "question =", question
+                    #print "answer =", answer
+                    XY.append(question_array)
+                    Y.append(answer_array)
+                    sample_count = sample_count + 1
 
             last_words = words
             last_line = line
@@ -180,29 +202,39 @@ def create_model(max_word_id):
             break
     file_object.close()
 
-    model.fit(
-            XY,
-            Y,
-            n_epoch=100,
-            validation_set=0.01,
-            batch_size=1,
-            shuffle=True,
-            show_metric=True,
-            snapshot_step=5000,
-            snapshot_epoch=False,
-            run_id="my_lstm_test")
+    if not is_test:
+        model.fit(
+                XY,
+                Y,
+                n_epoch=3000,
+                validation_set=0.01,
+                batch_size=64,
+                shuffle=True,
+                show_metric=True,
+                snapshot_step=5000,
+                snapshot_epoch=False,
+                run_id="my_lstm_test")
 
-    model.save("./weights")
-    #model.load("./weights")
+        model.save("./weights")
+    else:
+        model.load("./weights")
 
 
     # predict
-    TEST_XY = [XY[0]]
-    res = model.predict(TEST_XY)
-    res = np.array(res)
-    num_decoder_symbols = max_word_id + 2
-    y = res.reshape(max_seq_len, num_decoder_symbols)
-    prediction = np.argmax(y, axis=1)
-    print TEST_XY
-    print "desire =", Y[0]
-    print "prediction =", prediction
+    for i in range(100):
+        TEST_XY = [XY[i]]
+        TEST_XY[0][max_seq_len:2*max_seq_len] = 0
+        #TEST_XY[0][0:2*max_seq_len] = 0
+        #TEST_XY[0][0] = 5
+        #TEST_XY[0][1] = 4
+        #TEST_XY[0][2] = 109
+
+        res = model.predict(TEST_XY)
+        res = np.array(res)
+        num_decoder_symbols = max_word_id + 2
+        y = res.reshape(max_seq_len, num_decoder_symbols)
+        prediction = np.argmax(y, axis=1)
+        if 0 != np.sum(prediction):
+            print_sentence(TEST_XY[0], "input ")
+            print_sentence(Y[i], "desire ")
+            print_sentence(prediction, "prediction ")
diff --git a/pattern_recognition.lua b/pattern_recognition.lua
@@ -0,0 +1,85 @@
+require 'nn' -- neural-network package; the nn.* modules are used further down
+require 'paths' -- provides paths.filep, used for the existence check below
+if (not paths.filep("cifar10torchsmall.zip")) then -- fetch and unpack only when the archive is not already present
+    os.execute('wget -c https://s3.amazonaws.com/torch7/data/cifar10torchsmall.zip')
+    os.execute('unzip cifar10torchsmall.zip')
+end
+trainset = torch.load('cifar10-train.t7') -- NOTE(review): presumably extracted from the archive above; the check only tests for the zip - confirm
+testset = torch.load('cifar10-test.t7')
+classes = {'airplane', 'automobile', 'bird', 'cat',
+'deer', 'dog', 'frog', 'horse', 'ship', 'truck'}
+setmetatable(trainset, -- make trainset[i] return the pair {data[i], label[i]}
+{__index = function(t, i) 
+    return {t.data[i], t.label[i]} 
+end}
+);
+trainset.data = trainset.data:double() -- convert the data from a ByteTensor to a DoubleTensor.
+
+function trainset:size() -- number of training examples (size of the first dimension of data)
+    return self.data:size(1) 
+end
+mean = {} -- store the mean, to normalize the test set in the future
+stdv  = {} -- store the standard-deviation for the future
+for i=1,3 do -- over each image channel
+    mean[i] = trainset.data[{ {}, {i}, {}, {}  }]:mean() -- mean estimation
+    print('Channel ' .. i .. ', Mean: ' .. mean[i])
+    trainset.data[{ {}, {i}, {}, {}  }]:add(-mean[i]) -- mean subtraction
+
+    stdv[i] = trainset.data[{ {}, {i}, {}, {}  }]:std() -- std estimation
+    print('Channel ' .. i .. ', Standard Deviation: ' .. stdv[i])
+    trainset.data[{ {}, {i}, {}, {}  }]:div(stdv[i]) -- std scaling
+end
+net = nn.Sequential() -- container; the layers added below are applied in order
+net:add(nn.SpatialConvolution(3, 6, 5, 5)) -- 3 input image channels, 6 output channels, 5x5 convolution kernel
+net:add(nn.ReLU())                       -- non-linearity
+net:add(nn.SpatialMaxPooling(2,2,2,2))     -- A max-pooling operation that looks at 2x2 windows and finds the max.
+net:add(nn.SpatialConvolution(6, 16, 5, 5)) -- 6 input channels, 16 output channels, 5x5 convolution kernel
+net:add(nn.ReLU())                       -- non-linearity
+net:add(nn.SpatialMaxPooling(2,2,2,2))     -- second 2x2 max-pooling, same settings as above
+net:add(nn.View(16*5*5))                    -- reshapes from a 3D tensor of 16x5x5 into 1D tensor of 16*5*5
+net:add(nn.Linear(16*5*5, 120))             -- fully connected layer (matrix multiplication between input and weights)
+net:add(nn.ReLU())                       -- non-linearity
+net:add(nn.Linear(120, 84))                 -- second fully connected layer
+net:add(nn.ReLU())                       -- non-linearity
+net:add(nn.Linear(84, 10))                   -- 10 is the number of outputs of the network (one per entry of classes)
+net:add(nn.LogSoftMax())                     -- converts the output to a log-probability. Useful for classification problems
+criterion = nn.ClassNLLCriterion() -- negative log-likelihood loss, applied to the log-probabilities from LogSoftMax
+trainer = nn.StochasticGradient(net, criterion) -- stochastic gradient descent trainer for net under criterion
+trainer.learningRate = 0.001
+trainer.maxIteration = 5 -- upper bound on training iterations; NOTE(review): each iteration is presumably one pass over the dataset - confirm
+trainer:train(trainset) -- trainset supplies size() and trainset[i] = {input, label}
+-- Evaluate on the test set, normalised with the training-set statistics.
+testset.data = testset.data:double()   -- convert from Byte tensor to Double tensor
+for i=1,3 do -- over each image channel
+    testset.data[{ {}, {i}, {}, {}  }]:add(-mean[i]) -- mean subtraction
+    testset.data[{ {}, {i}, {}, {}  }]:div(stdv[i]) -- std scaling
+end
+-- Show the class probabilities predicted for one example (test image 100).
+predicted = net:forward(testset.data[100])
+print(classes[testset.label[100]])
+print(predicted:exp()) -- the net ends in LogSoftMax, so exp() yields probabilities
+for i=1,predicted:size(1) do
+    print(classes[i], predicted[i])
+end
+-- Overall and per-class accuracy, gathered in a single pass over the test set.
+local num_test = testset.data:size(1) -- derived from the data (was hard-coded as 10000)
+correct = 0
+class_performance = {} -- correct predictions per class
+local class_total = {} -- test images per class (was hard-coded as 1000)
+for i=1,#classes do
+    class_performance[i], class_total[i] = 0, 0
+end
+for i=1,num_test do
+    local groundtruth = testset.label[i]
+    local prediction = net:forward(testset.data[i])
+    local confidences, indices = torch.sort(prediction, true)  -- true means sort in descending order
+    class_total[groundtruth] = class_total[groundtruth] + 1
+    if groundtruth == indices[1] then -- the top-ranked class matches the label
+        correct = correct + 1
+        class_performance[groundtruth] = class_performance[groundtruth] + 1
+    end
+end
+print(correct, 100*correct/num_test .. ' % ')
+for i=1,#classes do -- math.max guards against a class with no test images
+    print(classes[i], 100*class_performance[i]/math.max(class_total[i], 1) .. ' %')
+end
diff --git a/seq2seq/hello_sequence.py b/seq2seq/hello_sequence.py
@@ -0,0 +1,98 @@
+# coding:utf-8
+
+from __future__ import print_function
+import numpy as np
+import tensorflow as tf
+import sys
+
+vocab_size=256  # symbols are character codes produced by ord() below
+learning_rate=0.1
+# Only a single bucket is tried for now
+buckets=[(10, 10)]
+bucket_id=0
+# Padding value is 0
+PAD=[0]
+sample_size=20
+# Number of memory units in each LSTM cell
+num_units=100
+# Number of stacked LSTM layers
+num_layers=2
+
+# sample_size samples, each with a question, an answer and weights; question and answer are 10-dimensional vectors
+# (author's note) these sample_size samples depend on each other in time order
+question_sample_list = [map(ord, "hello?") + PAD * 4] * sample_size  # relies on map() returning a list
+answer_sample_list = [map(ord, "world!") + PAD * 4] * sample_size  # 6 character codes followed by 4 PAD zeros
+init_weights_list = [[1.0]*7 + [0.0]*3] *sample_size # mask padding. todo: redundant -- NOTE(review): 7 ones cover the 6 characters plus the first PAD; confirm intended
+
+with tf.Session() as session:
+
+    # Create the recurrent cell(s)
+    cell = single_cell = tf.nn.rnn_cell.LSTMCell(num_units)
+    if num_layers > 1:
+        cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)
+
+    # Wrapper that builds the embedding seq2seq model; do_decode is passed through as feed_previous
+    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
+        return tf.nn.seq2seq.embedding_rnn_seq2seq(
+             encoder_inputs, decoder_inputs, cell,
+             num_encoder_symbols=vocab_size,
+             num_decoder_symbols=vocab_size,
+             embedding_size=num_units,
+             feed_previous=do_decode)
+
+    # Create the placeholders used for training (author's note: with multiple layers the weights are shared)
+    encoder_inputs = []
+    decoder_inputs = []
+    weights = []
+    for i in xrange(sample_size):
+        encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
+    for i in xrange(sample_size):
+        decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
+        weights.append(tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))
+    targets = [decoder_inputs[i] for i in xrange(len(decoder_inputs))]  # targets are the decoder inputs themselves (no shift)
+
+    # Build the model and its loss
+    buckets_outputs, losses = tf.nn.seq2seq.model_with_buckets(
+         encoder_inputs, decoder_inputs, targets,
+         weights, buckets,
+         lambda x, y: seq2seq_f(x, y, False))
+
+
+    # Gradient update op, one per bucket
+    updates=[]
+    for b in xrange(len(buckets)):
+        updates.append(tf.train.AdamOptimizer(learning_rate).minimize(losses[b]))
+
+    # Saver for persisting the model (not referenced again in this script)
+    saver = tf.train.Saver(tf.all_variables())
+
+    # Initialise all variables
+    session.run(tf.initialize_all_variables())
+
+    while True:  # no exit condition: the loop runs until the process is stopped
+        encoder_size = len(encoder_inputs)
+        decoder_size = len(decoder_inputs)
+
+        # Fill feed_dict with the sample data
+        feed_dict = {}
+        for i in xrange(encoder_size):
+            feed_dict[encoder_inputs[i].name] = question_sample_list[i]
+        for i in xrange(decoder_size):
+            feed_dict[decoder_inputs[i].name] = answer_sample_list[i]
+            feed_dict[weights[i].name] = init_weights_list[i]
+
+        # Choose what to fetch: fetches selects whatever should come back from the run, e.g. updates applies the update, losses returns the loss, buckets_outputs returns the outputs
+        fetches = [updates[bucket_id], losses[bucket_id]]
+        fetches.append(buckets_outputs[bucket_id][0])
+        # This is only needed to obtain the outputs; training itself does not require it
+        for i in xrange(len(buckets_outputs[bucket_id])):
+            fetches.append(buckets_outputs[bucket_id][i])
+
+        # What is passed in is the data and the computation to run; the actual execution can be dispatched to various devices
+        fetches_outputs = session.run(fetches, feed_dict)
+        perplexity = fetches_outputs[1]  # NOTE(review): this is the fetched value of losses[bucket_id], printed as-is
+        outputs = fetches_outputs[2:]  # output 0 (appended separately above) followed by every output, so output 0 appears twice
+        print ("perplexity =", perplexity)
+        words = np.argmax(outputs, axis=2)
+        word = "".join(map(chr, words[0])).replace('\x00', '').replace('\n', '')
+        print("output: %s" % word)