Skip to content

Commit 0cc7b1a

Browse files
author
lichuang
committed
add lstm code
1 parent 39468b7 commit 0cc7b1a

File tree

5 files changed

+446
-0
lines changed

5 files changed

+446
-0
lines changed

lstm_code/iamtrask/lstm.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# coding:utf-8
2+
import copy, numpy as np
3+
np.random.seed(0)
4+
# Logistic sigmoid nonlinearity: squashes any real input into (0, 1).
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
8+
9+
# Given y = sigmoid(x), the derivative dy/dx equals y * (1 - y); this lets
# backprop reuse the forward activation instead of recomputing exp().
def sigmoid_output_to_derivative(output):
    y = output
    return y * (1 - y)
12+
13+
14+
# ---------------------------------------------------------------------------
# Toy RNN demo: learn 8-bit binary addition (a + b = c) with a vanilla
# recurrent network trained by backpropagation through time (BPTT).
# NOTE(review): converted Python-2-only `print` statements to print() calls
# and made the operand-cap division explicitly integer so the script runs
# under Python 3 as well.
# ---------------------------------------------------------------------------

# training dataset generation: map every int in [0, 2**binary_dim) to its
# binary_dim-bit binary row
int2binary = {}
binary_dim = 8

largest_number = pow(2, binary_dim)
binary = np.unpackbits(
    np.array([range(largest_number)], dtype=np.uint8).T, axis=1)
for i in range(largest_number):
    int2binary[i] = binary[i]

# hyperparameters
alpha = 0.1        # learning rate
input_dim = 2      # one bit from a and one bit from b per time step
hidden_dim = 16
output_dim = 1     # one predicted sum bit per time step

# initialize neural network weights uniformly in [-1, 1)
synapse_0 = 2 * np.random.random((input_dim, hidden_dim)) - 1   # input  -> hidden
synapse_1 = 2 * np.random.random((hidden_dim, output_dim)) - 1  # hidden -> output
synapse_h = 2 * np.random.random((hidden_dim, hidden_dim)) - 1  # hidden -> hidden (recurrent)

synapse_0_update = np.zeros_like(synapse_0)
synapse_1_update = np.zeros_like(synapse_1)
synapse_h_update = np.zeros_like(synapse_h)

# training logic
for j in range(10000):

    # generate a simple addition problem (a + b = c); operands are capped at
    # largest_number // 2 so the sum always fits in binary_dim bits
    a_int = np.random.randint(largest_number // 2)  # int version
    a = int2binary[a_int]                           # binary encoding

    b_int = np.random.randint(largest_number // 2)  # int version
    b = int2binary[b_int]                           # binary encoding

    # true answer
    c_int = a_int + b_int
    c = int2binary[c_int]

    # where we'll store our best guess (binary encoded)
    d = np.zeros_like(c)

    overallError = 0

    layer_2_deltas = list()
    layer_1_values = list()
    # hidden state before the first time step is all zeros
    layer_1_values.append(np.zeros(hidden_dim))

    # forward pass: walk the bit positions from least to most significant
    for position in range(binary_dim):

        # generate input and output for this time step
        X = np.array([[a[binary_dim - position - 1], b[binary_dim - position - 1]]])
        y = np.array([[c[binary_dim - position - 1]]]).T

        # hidden layer (input + prev_hidden)
        layer_1 = sigmoid(np.dot(X, synapse_0) + np.dot(layer_1_values[-1], synapse_h))

        # output layer (new binary representation)
        layer_2 = sigmoid(np.dot(layer_1, synapse_1))

        # did we miss?... if so, by how much?
        layer_2_error = y - layer_2
        layer_2_deltas.append((layer_2_error) * sigmoid_output_to_derivative(layer_2))
        overallError += np.abs(layer_2_error[0])

        # decode estimate so we can print it out
        d[binary_dim - position - 1] = np.round(layer_2[0][0])

        # store hidden layer so we can use it in the next timestep
        layer_1_values.append(copy.deepcopy(layer_1))

    future_layer_1_delta = np.zeros(hidden_dim)

    # backward pass (BPTT): walk the time steps in reverse
    for position in range(binary_dim):

        X = np.array([[a[position], b[position]]])
        layer_1 = layer_1_values[-position - 1]
        prev_layer_1 = layer_1_values[-position - 2]

        # error at output layer
        layer_2_delta = layer_2_deltas[-position - 1]
        # error at hidden layer: output error plus error flowing back
        # from the following time step through the recurrent weights
        layer_1_delta = (future_layer_1_delta.dot(synapse_h.T) +
                         layer_2_delta.dot(synapse_1.T)) * sigmoid_output_to_derivative(layer_1)

        # accumulate weight updates (applied once the whole sequence is done)
        synapse_1_update += np.atleast_2d(layer_1).T.dot(layer_2_delta)
        synapse_h_update += np.atleast_2d(prev_layer_1).T.dot(layer_1_delta)
        synapse_0_update += X.T.dot(layer_1_delta)

        future_layer_1_delta = layer_1_delta

    # apply the accumulated gradients, then reset the accumulators
    synapse_0 += synapse_0_update * alpha
    synapse_1 += synapse_1_update * alpha
    synapse_h += synapse_h_update * alpha

    synapse_0_update *= 0
    synapse_1_update *= 0
    synapse_h_update *= 0

    # print out progress
    if (j % 1000 == 0):
        print("Error:" + str(overallError))
        print("Pred:" + str(d))
        print("True:" + str(c))
        out = 0
        for index, x in enumerate(reversed(d)):
            out += x * pow(2, index)
        print(str(a_int) + " + " + str(b_int) + " = " + str(out))
        print("------------")

lstm_code/nicodjimenez/README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# lstm
2+
A basic lstm network can be written from scratch in a few hundred lines of python, yet most of us have a hard time figuring out how lstm's actually work. The original Neural Computation [paper](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=3&cad=rja&uact=8&ved=0CDAQFjACahUKEwj1iZLX5efGAhVMpIgKHbv3DiI&url=http%3A%2F%2Fdeeplearning.cs.cmu.edu%2Fpdfs%2FHochreiter97_lstm.pdf&ei=ZuirVfW-GMzIogS777uQAg&usg=AFQjCNGoFvqrva4rDCNIcqNe_SiPL_VPxg&sig2=ZYnsGpdfHjRbK8xdr1thBg&bvm=bv.98197061,d.cGU) is too technical for non experts. Most blogs online on the topic seem to be written by people
3+
who have never implemented LSTMs, for people who will not implement them either. Other blogs are written by experts (like this [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)) and lack simplified, illustrative source code that actually does something. The [Apollo](https://github.com/Russell91/apollo) library built on top of Caffe is terrific and features a fast LSTM implementation. However, the downside of efficient implementations is that the source code is hard to follow.
4+
5+
This repo features a minimal lstm implementation for people that are curious about lstms to the point of wanting to know how lstm's might be implemented. The code here follows notational conventions set forth in [this](http://arxiv.org/abs/1506.00019)
6+
well written tutorial introduction. This article should be read before trying to understand this code (at least the part about lstm's). By running `python test.py` you will have a minimal example of an lstm network learning to predict an output sequence of numbers in [-1,1] by using a Euclidean loss on the first element of each node's hidden layer.
7+
8+
Play with the code, add functionality, and try it on different datasets. Pull requests are welcome.
9+
10+
Please read [my blog article](http://nicodjimenez.github.io/2014/08/08/lstm.html) if you want details on the backprop part of the code.
11+
12+
Also, check out a version of this code written in the D programming language by Mathias Baumann: https://github.com/Marenz/lstm

lstm_code/nicodjimenez/lstm.py

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
import random
2+
3+
import numpy as np
4+
import math
5+
6+
def sigmoid(x):
    """Elementwise logistic sigmoid: 1 / (1 + e^-x)."""
    z = np.exp(-x)
    return 1. / (1 + z)
8+
9+
# creates a uniform random array w/ values in [a, b) and shape args
def rand_arr(a, b, *args):
    """Return a uniform random array with values in [a, b) and shape *args.

    BUG FIX(review): the original called np.random.seed(0) on every
    invocation, so every same-shape call returned *identical* values —
    e.g. all four LSTM gate weight matrices (wg, wi, wf, wo) came out
    equal, defeating random symmetry breaking.  Seed once at the call
    site (as test.py already does) when reproducibility is needed.
    """
    return np.random.rand(*args) * (b - a) + a
13+
14+
class LstmParam:
    """Weights, biases, and gradient accumulators for one LSTM cell.

    Gate naming follows the tutorial convention: g = cell input (tanh),
    i = input gate, f = forget gate, o = output gate.  Every gate sees the
    concatenation [x(t), h(t-1)], hence concat_len columns per weight matrix.
    """

    def __init__(self, mem_cell_ct, x_dim):
        self.mem_cell_ct = mem_cell_ct
        self.x_dim = x_dim
        concat_len = x_dim + mem_cell_ct
        # weight matrices, one per gate, in the fixed order g, i, f, o
        for gate in "gifo":
            setattr(self, "w" + gate, rand_arr(-0.1, 0.1, mem_cell_ct, concat_len))
        # bias terms, same order
        for gate in "gifo":
            setattr(self, "b" + gate, rand_arr(-0.1, 0.1, mem_cell_ct))
        # diffs: derivative of the loss w.r.t. each parameter, zeroed until
        # backprop accumulates into them
        for gate in "gifo":
            setattr(self, "w" + gate + "_diff", np.zeros((mem_cell_ct, concat_len)))
            setattr(self, "b" + gate + "_diff", np.zeros(mem_cell_ct))

    def apply_diff(self, lr=1):
        """Take one SGD step (param -= lr * diff), then zero every diff."""
        for name in ("wg", "wi", "wf", "wo", "bg", "bi", "bf", "bo"):
            param = getattr(self, name)
            param -= lr * getattr(self, name + "_diff")  # in-place update
            # reset the accumulator for the next sequence
            setattr(self, name + "_diff", np.zeros_like(param))
57+
58+
class LstmState:
    """Per-timestep activations and input-side gradients of one LSTM node.

    g/i/f/o hold the gate activations, s the cell state, h the hidden
    output.  The bottom_diff_* arrays receive, during backprop, the
    gradients passed down to the previous timestep (h, s) and to the
    raw input (x).
    """

    def __init__(self, mem_cell_ct, x_dim):
        for name in ("g", "i", "f", "o", "s", "h"):
            setattr(self, name, np.zeros(mem_cell_ct))
        self.bottom_diff_h = np.zeros_like(self.h)
        self.bottom_diff_s = np.zeros_like(self.s)
        self.bottom_diff_x = np.zeros(x_dim)
70+
class LstmNode:
    """One unrolled timestep of the LSTM: forward activations and backprop."""

    def __init__(self, lstm_param, lstm_state):
        # store reference to parameters and to activations
        self.state = lstm_state
        self.param = lstm_param
        # non-recurrent input to node
        self.x = None
        # non-recurrent input concatenated with recurrent input
        self.xc = None

    def bottom_data_is(self, x, s_prev=None, h_prev=None):
        """Forward pass for one timestep.

        x is the input vector; s_prev / h_prev are the previous timestep's
        cell state and hidden output (None for the first node in the chain).
        """
        # if this is the first lstm node in the network
        # BUG FIX(review): was `s_prev == None` / `h_prev == None`, which on
        # ndarrays performs an *elementwise* comparison; `if` on that result
        # raises "truth value of an array is ambiguous" on modern numpy for
        # every timestep after the first.  Identity check is the correct test.
        if s_prev is None: s_prev = np.zeros_like(self.state.s)
        if h_prev is None: h_prev = np.zeros_like(self.state.h)
        # save data for use in backprop
        self.s_prev = s_prev
        self.h_prev = h_prev

        # concatenate x(t) and h(t-1)
        xc = np.hstack((x, h_prev))
        self.state.g = np.tanh(np.dot(self.param.wg, xc) + self.param.bg)
        self.state.i = sigmoid(np.dot(self.param.wi, xc) + self.param.bi)
        self.state.f = sigmoid(np.dot(self.param.wf, xc) + self.param.bf)
        self.state.o = sigmoid(np.dot(self.param.wo, xc) + self.param.bo)
        self.state.s = self.state.g * self.state.i + s_prev * self.state.f
        # NOTE(review): h = s * o with no tanh on s — a simplified LSTM
        # variant; matches the accompanying tutorial, not the classic cell.
        self.state.h = self.state.s * self.state.o
        self.x = x
        self.xc = xc

    def top_diff_is(self, top_diff_h, top_diff_s):
        """Backward pass: accumulate parameter diffs and compute bottom diffs.

        top_diff_h / top_diff_s are dL/dh(t) and dL/ds(t) arriving from the
        loss layer and from the following timestep.
        """
        # notice that top_diff_s is carried along the constant error carousel
        ds = self.state.o * top_diff_h + top_diff_s
        do = self.state.s * top_diff_h
        di = self.state.g * ds
        dg = self.state.i * ds
        df = self.s_prev * ds

        # diffs w.r.t. vector inside sigma / tanh function
        di_input = (1. - self.state.i) * self.state.i * di
        df_input = (1. - self.state.f) * self.state.f * df
        do_input = (1. - self.state.o) * self.state.o * do
        dg_input = (1. - self.state.g ** 2) * dg

        # accumulate diffs w.r.t. parameters
        self.param.wi_diff += np.outer(di_input, self.xc)
        self.param.wf_diff += np.outer(df_input, self.xc)
        self.param.wo_diff += np.outer(do_input, self.xc)
        self.param.wg_diff += np.outer(dg_input, self.xc)
        self.param.bi_diff += di_input
        self.param.bf_diff += df_input
        self.param.bo_diff += do_input
        self.param.bg_diff += dg_input

        # compute bottom diff w.r.t. the concatenated input [x, h_prev]
        dxc = np.zeros_like(self.xc)
        dxc += np.dot(self.param.wi.T, di_input)
        dxc += np.dot(self.param.wf.T, df_input)
        dxc += np.dot(self.param.wo.T, do_input)
        dxc += np.dot(self.param.wg.T, dg_input)

        # save bottom diffs, splitting dxc back into x and h components
        self.state.bottom_diff_s = ds * self.state.f
        self.state.bottom_diff_x = dxc[:self.param.x_dim]
        self.state.bottom_diff_h = dxc[self.param.x_dim:]
135+
class LstmNetwork():
    """A chain of LstmNode instances, one per input in the current sequence."""

    def __init__(self, lstm_param):
        self.lstm_param = lstm_param
        self.lstm_node_list = []
        # input sequence
        self.x_list = []

    def y_list_is(self, y_list, loss_layer):
        """
        Updates diffs by setting target sequence
        with corresponding loss layer.
        Will *NOT* update parameters. To update parameters,
        call self.lstm_param.apply_diff()
        """
        assert len(y_list) == len(self.x_list)
        idx = len(self.x_list) - 1

        # Last node: its cell state cannot affect the loss through any
        # h(t+1), so diff_s is zero and diff_h comes from the label alone.
        node = self.lstm_node_list[idx]
        loss = loss_layer.loss(node.state.h, y_list[idx])
        diff_h = loss_layer.bottom_diff(node.state.h, y_list[idx])
        node.top_diff_is(diff_h, np.zeros(self.lstm_param.mem_cell_ct))
        idx -= 1

        # Earlier nodes: add the diff flowing back from the successor's
        # bottom_diff_h, and carry diff_s along the constant error carousel.
        while idx >= 0:
            node = self.lstm_node_list[idx]
            succ = self.lstm_node_list[idx + 1]
            loss += loss_layer.loss(node.state.h, y_list[idx])
            diff_h = loss_layer.bottom_diff(node.state.h, y_list[idx]) + succ.state.bottom_diff_h
            node.top_diff_is(diff_h, succ.state.bottom_diff_s)
            idx -= 1

        return loss

    def x_list_clear(self):
        """Start a new sequence: keep the nodes for reuse, drop the inputs."""
        self.x_list = []

    def x_list_add(self, x):
        """Append input x and run the forward pass for that timestep."""
        self.x_list.append(x)
        if len(self.x_list) > len(self.lstm_node_list):
            # need another lstm node: allocate fresh state memory for it
            fresh_state = LstmState(self.lstm_param.mem_cell_ct, self.lstm_param.x_dim)
            self.lstm_node_list.append(LstmNode(self.lstm_param, fresh_state))

        # index of the timestep we just added
        idx = len(self.x_list) - 1
        if idx == 0:
            # no recurrent inputs yet
            self.lstm_node_list[idx].bottom_data_is(x)
        else:
            prev = self.lstm_node_list[idx - 1].state
            self.lstm_node_list[idx].bottom_data_is(x, prev.s, prev.h)
190+

lstm_code/nicodjimenez/test.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import numpy as np
2+
import sys
3+
4+
from lstm import LstmParam, LstmNetwork
5+
6+
class ToyLossLayer:
    """
    Computes square loss with first element of hidden layer array.
    """
    # NOTE(review): the classmethods' first parameter was named `self`;
    # renamed to the conventional `cls` (no behavior change).

    @classmethod
    def loss(cls, pred, label):
        """Squared error between pred[0] and the scalar label."""
        return (pred[0] - label) ** 2

    @classmethod
    def bottom_diff(cls, pred, label):
        """Gradient of loss w.r.t. pred: nonzero only at index 0."""
        diff = np.zeros_like(pred)
        diff[0] = 2 * (pred[0] - label)
        return diff
19+
20+
def example_0():
    """Train a small LSTM to emit a fixed 4-value target sequence.

    Each iteration feeds 4 fixed random 50-dim input vectors and trains
    (squared loss on h[0] of each timestep, see ToyLossLayer) toward
    y_list.  NOTE(review): converted Python-2 `print` statements to
    print() calls and dropped the unused `concat_len` local.
    """
    # learns to repeat simple sequence from random inputs
    np.random.seed(0)

    # parameters for input data dimension and lstm cell count
    mem_cell_ct = 100
    x_dim = 50
    lstm_param = LstmParam(mem_cell_ct, x_dim)
    lstm_net = LstmNetwork(lstm_param)
    y_list = [-0.5, 0.2, 0.1, -0.5]
    input_val_arr = [np.random.random(x_dim) for _ in y_list]

    for cur_iter in range(100):
        print("cur iter: ", cur_iter)
        print("input_val_arr=", input_val_arr)
        print("y_list=", y_list)
        # forward pass over the whole sequence
        for ind in range(len(y_list)):
            lstm_net.x_list_add(input_val_arr[ind])
            print("y_pred[%d] : %f" % (ind, lstm_net.lstm_node_list[ind].state.h[0]))

        # backward pass, then apply the accumulated diffs and reset inputs
        loss = lstm_net.y_list_is(y_list, ToyLossLayer)
        print("loss: ", loss)
        lstm_param.apply_diff(lr=0.1)
        lstm_net.x_list_clear()
45+
46+
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    example_0()
48+

0 commit comments

Comments
 (0)