feifeibear
diff --git a/‎FasterTransformer/README.md‎
Lines changed: 10 additions & 0 deletions b/‎FasterTransformer/README.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎FasterTransformer/v2/README.md‎
Lines changed: 58 additions & 20 deletions b/‎FasterTransformer/v2/README.md‎
Lines changed: 58 additions & 20 deletions
diff --git a/‎FasterTransformer/v2/fastertransformer/allocator.h‎
Lines changed: 1 addition & 1 deletion b/‎FasterTransformer/v2/fastertransformer/allocator.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎FasterTransformer/v2/fastertransformer/beamsearch_opennmt.h‎
Lines changed: 18 additions & 8 deletions b/‎FasterTransformer/v2/fastertransformer/beamsearch_opennmt.h‎
Lines changed: 18 additions & 8 deletions
diff --git a/‎FasterTransformer/v2/fastertransformer/cuda/cuda_kernels.cu‎
Lines changed: 33 additions & 18 deletions b/‎FasterTransformer/v2/fastertransformer/cuda/cuda_kernels.cu‎
Lines changed: 33 additions & 18 deletions
@@ -37,6 +37,16 @@ FasterTransformer V1 will be deprecated on July 2020.
 
 ### Changelog
 
+March 2020
+- Add feature in FasterTransformer 2.0
+  - Fix the bug that the maximum sequence length of the decoder cannot be larger than 128.
+  - Add `translate_sample.py` to demonstrate how to translate a sentence by restoring the pretrained model of OpenNMT-tf.
+  - Fix the bug that decoding does not check whether it has finished after each step.
+  - Fix the bug of decoder about max_seq_len.
+  - Modify the decoding model structure to fit the OpenNMT-tf decoding model. 
+    - Add a layer normalization layer after decoder.
+    - Add a normalization for inputs of decoder
+    
 February 2020
  * Release the FasterTransformer 2.0
  * Provide a highly optimized OpenNMT-tf based decoder and decoding, including C++ API and TensorFlow OP.
 
@@ -23,6 +23,7 @@ This repository provides a script and recipe to run the highly optimized transfo
   * [Inference process](#inference-process)
     * [Encoder process](#encoder-process)
     * [Decoder and Decoding process](#decoder-and-decoding-process)
+    * [Translation process](#translation-process)
 - [Performance](#performance)
   * [Encoder performance](#encoder-performance)
   * [Decoder performance on T4](#decoder-performance-on-t4)
@@ -51,7 +52,7 @@ FasterTransformer is built on top of CUDA and cuBLAS, providing the C++ API and
 
 The following configurations are supported in the FasterTransformer encoder. 
 - Batch size (B<sub>1</sub>): smaller or equal to 512
-- Sequence length (S): smaller or equal to 128 
+- Sequence length (S): larger than 3 and smaller or equal to 1024 
 - Head number (H) and size per head (N): 
   - 12 heads * 64 per heads
   - 4 heads * 32 per heads
@@ -60,7 +61,7 @@ The following configurations are supported in the FasterTransformer encoder.
 
 The following configurations are supported in the FasterTransformer decoder and decoding.
 - Batch size (B<sub>1</sub>) * beam width (B<sub>2</sub>): smaller than 1024
-- Sequence length (S): smaller or equal to 128
+- Sequence length (S): smaller than 1024
 - Head number (H): 8 and 12
 - Size per head (N): 64
 - Vocabulary size (V): from 64 to 30000
@@ -154,10 +155,9 @@ nvidia-docker run -ti nvcr.io/nvidia/tensorflow:19.07-py2 bash
 
 ```bash
 git clone https://github.com/NVIDIA/DeepLearningExamples
-cd DeepLearningExamples
+cd DeepLearningExamples/FasterTransformer/v2
 git submodule init
 git submodule update
-cd FasterTransformer/v2
 ```
 
 3. Build the project.
@@ -356,6 +356,7 @@ The `sample/` folder contains useful sample codes for FasterTransformer:
 * `sample/tensorflow/decoding_sample.py` - TensorFlow decoding sample codes 
 * `sample/tensorflow/encoder_decoder_sample.py` - TensorFlow `encoder_decoder` sample codes 
 * `sample/tensorflow/encoder_decoding_sample.py` - TensorFlow `encoder_decoding` sample codes 
+* `sample/tensorflow/translate_sample.py` - TensorFlow translation sample codes
 
 ### Command-line options
 
@@ -367,6 +368,7 @@ python decoder_sample.py --help
 python decoding_sample.py --help
 python encoder_decoder_sample.py --help
 python encoder_decoding_sample.py --help
+python translate_sample.py --help
 ```
 
 ### Inference process
@@ -540,14 +542,16 @@ python decoder_sample.py \
 The outputs should be similar to the following:
 
 ```bash 
-[[INFO][PYTHON] step:][1][max diff: ][9.77516174e-06][True]
-[[INFO][PYTHON] step:][2][max diff: ][1.04904175e-05][True]
+[[INFO][PYTHON] step:][0][max diff: ][5.00679e-06][ op val: ][2.3735888][ tf val: ][2.37359381][True]
+[[INFO][PYTHON] step:][1][max diff: ][4.64916229e-06][ op val: ][-0.588810563][ tf val: ][-0.588815212][True]
+[[INFO][PYTHON] step:][2][max diff: ][5.36441803e-06][ op val: ][-1.46514082][ tf val: ][-1.46514618][True]
 ...
-[[INFO][PYTHON] step:][31][max diff: ][1.21593475e-05][True]
-[[INFO][PYTHON] step:][32][max diff: ][1.04382634e-05][True]
+[[INFO][PYTHON] step:][29][max diff: ][4.529953e-06][ op val: ][2.88768935][ tf val: ][2.88769388][True]
+[[INFO][PYTHON] step:][30][max diff: ][4.17232513e-06][ op val: ][-1.28717053][ tf val: ][-1.2871747][True]
+[[INFO][PYTHON] step:][31][max diff: ][4.05311584e-06][ op val: ][-1.01830876][ tf val: ][-1.01831281][True]
 ```
 
-The results show that the differences between the decoder of TensorFlow and decoder are smaller than threshold.
+The results show that the differences between the decoder of TensorFlow and the decoder of FasterTransformer are smaller than the threshold. Note that the differences are absolute differences, so they may be large when the op val is large. In that case, the differences exceed the threshold and the check returns "False", but this may not affect the final results.
 
 The option `decoder_type` decides to use the decoder of TensorFlow or decoder of FasterTransformer. `decoder_type 2` uses both decoders and compares their results. 
 
@@ -606,15 +610,13 @@ python decoding_sample.py \
 The outputs should be similar to the following:
 
 ```bash
-[INFO] Before finalize: 
-       result before finalize cross-check: True
+       Output ids cross-check: True
 
        Parent ids cross-check: True
 
-       sequence lengths cross-check: True
+       Sequence lengths cross-check: True
 
-[INFO] After finalize: 
-       result after cross-check: True
+       Finalized output ids cross-check: True
 ```
 
 Note that the results of OP and the results of TensorFlow are often different in the random inputs and weights. 
@@ -635,6 +637,34 @@ python encoder_decoding_sample.py \
         --data_type fp32
 ```
 
+#### Translation process
+
+For translation, we need to use some tools and libraries of OpenNMT-tf to preprocess the source sentence and build the encoder.
+Because the encoder of FasterTransformer is based on BERT, it cannot restore the pretrained model of OpenNMT-tf. Therefore, translation requires the encoder of OpenNMT-tf.
+
+1. Prepare the pretrained model and the data for translation.
+
+```bash
+bash utils/translation/download_model_data.sh
+```
+
+`download_model_data.sh` will prepare the `opennmt` folder, which contains the input embedding and the encoder, download the pretrained model, and download the test data into the `translation` folder. This is because the encoder of FasterTransformer is based on BERT, but not OpenNMT-tf, so we cannot restore the pretrained model of OpenNMT-tf for encoder. Therefore, translation requires the encoder of OpenNMT-tf.
+
+Another problem is that the implementations of our tf_decoding and the OpenNMT-tf decoding are slightly different. For example, OpenNMT-tf uses one gemm to compute the query, key and values at once, whereas tf_decoding splits them into three gemms. So, the tool `utils/dump_model.py` converts the pretrained model to fit the model structure of the decoder of FasterTransformer.  
+
+```bash
+./bin/decoding_gemm 1 4 8 64 32001 100 512 0
+python translate_sample.py
+```
+
+The outputs should be similar to the following:
+
+```bash
+[INFO] opennmt: ▁28 - jährige r ▁Chef koch ▁to t ▁in ▁San ▁Francisco </s>
+[INFO] tf     : ▁28 - jährige r ▁Chef koch ▁to t ▁in ▁San ▁Francisco </s>
+[INFO] op     : ▁28 - jährige r ▁Chef koch ▁to t ▁in ▁San ▁Francisco </s>
+```
+
 ## Performance
 
 Hardware settings: 
@@ -752,6 +782,16 @@ bash scripts/profile_decoding_op_performance.sh
 
 ### Changelog
 
+March 2020
+- Add feature in FasterTransformer 2.0
+  - Fix the bug that the maximum sequence length of the decoder cannot be larger than 128.
+  - Add `translate_sample.py` to demonstrate how to translate a sentence by restoring the pretrained model of OpenNMT-tf.
+  - Fix the bug that decoding does not check whether it has finished after each step.
+  - Fix the bug of decoder about max_seq_len.
+  - Modify the decoding model structure to fit the OpenNMT-tf decoding model. 
+    - Add a layer normalization layer after decoder.
+    - Add a normalization for inputs of decoder
+
 February 2020
 - Release the FasterTransformer 2.0
   - Provide a highly optimized OpenNMT-tf based decoder and decoding, including C++ API and TensorFlow op. 
@@ -764,10 +804,8 @@ July 2019
 
 ### Known issues
 
-- sequence length of Decoder and Decoding should be smaller than 128.
-- batch_size should be smaller than 1024 in Decoder.
-- batch_size x beam_width should be smaller than 1024 in Decoding.
-- Results of TensorFlow  and OP would be different in decoding. This problem is caused by the accumulated log probability, and we do not avoid this problem. 
+- batch_size should be smaller or equal to 1024 in Decoder.
+- batch_size x beam_width should be smaller or equal to 1024 in Decoding.
+- Results of TensorFlow and OP would be different in decoding. This problem is caused by the accumulated log probability, and we do not avoid this problem. 
 - Cmake 15 or Cmake 16 fail to build this project. Cmake 14 is no problem. 
-- Max sequence length of encoder and decoder should be the same.  
-
+- Max sequence length of encoder and decoder should be the same. 
@@ -104,7 +104,7 @@ class Allocator<AllocatorType::TF> : public IAllocator
 
     auto flat = buf.flat<uint8>();
     void *ptr = (void *)flat.data();
-    cudaMemset(ptr, 0, size);
+    cudaMemset(ptr, 0, buf_size);
     return ptr;
   }
 
 
@@ -40,7 +40,9 @@ void BeamSearch_OpenNMT(
     int *output_ids,
     const int batch_size, const int beam_width,
     const int vocab_size, const int hidden_dim, const int step,
-    const int cache_size, const int decoder_layers, cudaStream_t stream)
+    const int cache_size, const int decoder_layers, cudaStream_t stream,
+    const int end_id, 
+    int *finished_count)
 {
 #ifdef NDEBUG
   /* adding cum_log_probs to log_probs */
@@ -75,11 +77,15 @@ void BeamSearch_OpenNMT(
 #endif
 
 #ifdef NDEBUG
-  update(log_probs, cum_log_probs, ids, finished, parent_ids, sequence_length, word_ids, output_ids,
-         batch_size, beam_width, vocab_size, stream);
+  update(log_probs, cum_log_probs, ids, finished, 
+        parent_ids, sequence_length, word_ids, output_ids,
+        batch_size, beam_width, vocab_size, stream, 
+        end_id, finished_count);
 #else
-  update(log_probs, cum_log_probs, ids, finished, parent_ids, sequence_length, word_ids, output_ids,
-         batch_size, beam_width, vocab_size, stream);
+  update(log_probs, cum_log_probs, ids, finished, 
+        parent_ids, sequence_length, word_ids, output_ids,
+        batch_size, beam_width, vocab_size, stream, 
+        end_id, finished_count);
   cudaDeviceSynchronize();
   check_cuda_error(cudaGetLastError());
 
@@ -89,13 +95,17 @@ void BeamSearch_OpenNMT(
     Note that update_kernel_check contains update and uses do not need to call it again. 
   */
   // update_kernel_check(log_probs, cum_log_probs, ids, finished, parent_ids, sequence_length, word_ids, output_ids,
-  //                     batch_size, beam_width, vocab_size, stream);
+  //                     batch_size, beam_width, vocab_size, stream, end_id, finished_count);
 #endif
 
 #ifdef NDEBUG
-  update_KV_cache<T>(key_cache, value_cache, parent_ids, batch_size, beam_width, hidden_dim, step, cache_size, decoder_layers, stream);
+  update_KV_cache<T>(key_cache, value_cache, parent_ids, batch_size, 
+                    beam_width, hidden_dim, step, cache_size, 
+                    decoder_layers, stream);
 #else
-  update_KV_cache<T>(key_cache, value_cache, parent_ids, batch_size, beam_width, hidden_dim, step, cache_size, decoder_layers, stream);
+  update_KV_cache<T>(key_cache, value_cache, parent_ids, batch_size, 
+                    beam_width, hidden_dim, step, cache_size, 
+                    decoder_layers, stream);
   cudaDeviceSynchronize();
   check_cuda_error(cudaGetLastError());
 
 
@@ -69,13 +69,12 @@ T blockReduceSum(T val)
   if(lane == 0)
     shared[wid] = val;
   __syncthreads();
-
+  
   val = (threadIdx.x < (blockDim.x >> 5 )) ? shared[lane] : (T)0.0f;
   val = warpReduceSum(val);
   return val;
 }
 
-
 template <typename T>
   __inline__ __device__
 T warpReduceMax(T val)
@@ -386,12 +385,15 @@ void topK(const float* log_probs, int* ids, const int batch_size, const int beam
 
 template <typename T>
 __global__
-void update_kernel(T* log_probs, T* cum_log_probs, int* ids, bool* finished, int* parent_ids, int* sequence_length,
-  int* word_ids, int* output_ids,
-  const int batch_size, const int beam_width, const int vocab_size)
+void update_kernel(T* log_probs, T* cum_log_probs, 
+                  int* ids, bool* finished, 
+                  int* parent_ids, int* sequence_length, 
+                  int* word_ids, int* output_ids, 
+                  const int batch_size, const int beam_width, 
+                  const int vocab_size, const int end_id, 
+                  int* finished_count)
 {
   int tid = threadIdx.x;
-
   sequence_length[tid] = finished[tid] ? sequence_length[tid] : sequence_length[tid] + 1;
 
   int beam_id = ids[tid];
@@ -401,10 +403,14 @@ void update_kernel(T* log_probs, T* cum_log_probs, int* ids, bool* finished, int
 
   cum_log_probs[tid] = log_probs[ids[tid]];
   sequence_length[tid] = sequence_length[beam_id];
-  finished[tid] = finished[beam_id];
+  finished[tid] = word_id == end_id ? 1 : 0;
   parent_ids[tid] = beam_id;
   word_ids[tid] = word_id;
   output_ids[tid] = word_id;
+
+  // TODO use reduce sum to compute how many sentence are finished
+  // int fi = finished[tid]
+  // int total_finish = reduceSum(fi);
 }
 
 template <typename T>
@@ -415,19 +421,25 @@ __global__ void embedding_lookup_kernel(const T* embedding_table, const int* wor
   from_tensor[write_pos] = embedding_table[word_ids[blockIdx.x] * hidden_units + threadIdx.x];
 }
 
-void update(float* log_probs, float* cum_log_probs, int* ids, bool* finished, int* parent_ids, int* sequence_length,
-  int* word_ids, int* output_ids,
-  const int batch_size, const int beam_width, const int vocab_size, cudaStream_t stream)
+void update(float* log_probs, float* cum_log_probs, 
+            int* ids, bool* finished, 
+            int* parent_ids, int* sequence_length,
+            int* word_ids, int* output_ids, 
+            const int batch_size, const int beam_width, 
+            const int vocab_size, cudaStream_t stream, 
+            const int end_id, int* finished_count)
 { 
 
   dim3 grid(1);
   dim3 block(batch_size * beam_width);
 
   assert(block.x <= 1024);
 
-  update_kernel<float><<<grid, block, 0, stream>>>(log_probs, cum_log_probs, ids, finished, parent_ids, sequence_length,
-    word_ids, output_ids,
-    batch_size, beam_width, vocab_size);
+  update_kernel<float><<<grid, block, 0, stream>>>(log_probs, cum_log_probs, ids, 
+                                                  finished, parent_ids, sequence_length,
+                                                  word_ids, output_ids, batch_size, 
+                                                  beam_width, vocab_size, end_id, 
+                                                  finished_count);
 }
 
 template <typename T>
@@ -565,14 +577,17 @@ __global__
 void sine_position_encoder_kernel(T* output, int step, int n){
   int tid = threadIdx.x;
   int bid = blockIdx.x;
-  int half_n = n / 2;
+  float half_n = (float)n / 2.;
+
+  // input = input * hidden_dim**0.5
+  output[bid * n + tid] = output[bid * n + tid] * (T)sqrtf(float(n));
 
-  float log_timescale_increment = __logf(10000) / (( half_n - 1) * 1.f);
-  float inv_timescales = __expf( (tid % half_n) * -1 * log_timescale_increment );
+  float log_timescale_increment = __logf(10000) / (half_n - 1.f);
+  float inv_timescales = __expf( (tid % (int)half_n) * -1 * log_timescale_increment );
   float scaled_time = inv_timescales * step;
 
   T encoding_val = (tid < half_n) ? (T) __sinf(scaled_time) : (T) __cosf(scaled_time);
-  output[bid * n + tid] = output[bid * n + tid] + encoding_val;
+  output[bid * n + tid] = output[bid * n + tid]  + encoding_val;
 }
 
 template<typename T>
@@ -584,7 +599,7 @@ void sine_position_encoder(
   dim3 grid(m);
   dim3 block(n);
   assert(n <= 1024);
-  sine_position_encoder_kernel<T><<<grid, block, 0, stream>>>(output, step + 1, n);
+  sine_position_encoder_kernel<T><<<grid, block, 0, stream>>>(output, step, n);
 }
 
 template void add_bias_act_kernelLauncher<float>(
Original file line number	Diff line number	Diff line change
`@@ -104,7 +104,7 @@ class Allocator<AllocatorType::TF> : public IAllocator`
`104`	`104`
`105`	`105`	`auto flat = buf.flat<uint8>();`
`106`	`106`	`void *ptr = (void *)flat.data();`
`107`		`- cudaMemset(ptr, 0, size);`
	`107`	`+ cudaMemset(ptr, 0, buf_size);`
`108`	`108`	`return ptr;`
`109`	`109`	`}`
`110`	`110`