From f7ed7fc3cda8025d6623938fa615833cbdba68e2 Mon Sep 17 00:00:00 2001
From: Tongxuan Liu
Date: Fri, 19 May 2023 23:07:04 +0800
Subject: [PATCH 01/91] [Docs] Update deeprec2304 release images and notes in
 README.md & RELEASE.md. (#865)

Signed-off-by: Tongxuan Liu
---
 README.md | 6 +-
 RELEASE.md | 104 ++++++++++++++++++
 docs/docs_en/DeepRec-Compile-And-Install.md | 4 +-
 docs/docs_en/Estimator-Compile-And-Install.md | 2 +-
 docs/docs_en/TFServing-Compile-And-Install.md | 2 +-
 docs/docs_zh/DeepRec-Compile-And-Install.md | 4 +-
 docs/docs_zh/Estimator-Compile-And-Install.md | 2 +-
 docs/docs_zh/TFServing-Compile-And-Install.md | 2 +-
 8 files changed, 115 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index c43fd4359e4..927afe31480 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 
 --------------------------------------------------------------------------------
 ## **Introduction**
-DeepRec is a high-performance recommendation deep learning framework based on [TensorFlow 1.15](https://www.tensorflow.org/), [Intel-TensorFlow](https://github.com/Intel-tensorflow/tensorflow) and [NVIDIA-TensorFlow](https://github.com/NVIDIA/tensorflow).
+DeepRec is a high-performance recommendation deep learning framework based on [TensorFlow 1.15](https://www.tensorflow.org/), [Intel-TensorFlow](https://github.com/Intel-tensorflow/tensorflow) and [NVIDIA-TensorFlow](https://github.com/NVIDIA/tensorflow). It is hosted in incubation in LF AI & Data Foundation.
 
 ### **Background**
@@ -95,13 +95,13 @@ $ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux
 
 #### Image for CPU
 
 ```
-alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu20.04
+alideeprec/deeprec-release:deeprec2304-cpu-py38-ubuntu20.04
 ```
 
 #### Image for GPU CUDA11.6
 
 ```
-alideeprec/deeprec-release:deeprec2302-gpu-py38-cu116-ubuntu20.04
+alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04
 ```
 
 ***
diff --git a/RELEASE.md b/RELEASE.md
index cdeb8fa4258..d41d9e569ad 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,107 @@
+# Release r1.15.5-deeprec2304
+
+## **Major Features and Improvements**
+
+### **Embedding**
+
+- Support tf.int32 dtype in the feature_column API `tf.feature_column.categorical_column_with_embedding`.
+- Make the export rules for frequencies and versions consistent with the export rule for keys.
+- Optimize CUDA kernel implementation in GroupEmbedding.
+- Support reading embedding files with mmap/madvise and direct I/O.
+- Add double checking in find_wait_free of the lockless dense hashmap.
+- Change the initial value of version in EV from 0 to -1.
+- Keep the 'GetSnapshot()' interface backward compatible.
+- Implement CPU GroupEmbedding lookup sparse Op.
+- Make GroupEmbedding compatible with the sequence feature_column interface.
+- Fix sp_weights indices calculation error in GroupEmbedding.
+- Add group_strategy to control the parallelism of group_embedding.
+
+### **Graph & Grappler Optimization**
+
+- Support SparseTensor as a placeholder in Sample-aware Graph Compression.
+- Add Dice fusion grappler and ops.
+- Enable MKL Matmul + Bias + LeakyRelu fusion.
+
+### **Runtime Optimization**
+
+- Avoid unnecessary polling in EventMgr.
+- Reduce lock cost and memory usage in EventMgr when using multi-stream.
+
+### **Ops & Hardware Acceleration**
+
+- Register GPU implementation of int64 type for Prod.
+- Register GPU implementation of string type for Shape, ShapeN and ExpandDims.
+- Optimize the set of GPU SegmentReductionOps.
+- Optimize zeros_like_impl by reducing calls to convert_to_tensor.
+- Implement GPU version of SparseSlice Op.
+- Delay Reshape when rank > 2 in keras.layers.Dense so that the post op can be fused with MatMul.
+- Implement setting the max_num_threads hint for oneDNN at compile time.
+- Implement TensorPackTransH2DOp to improve SmartStage performance on GPU.
+
+### **IO**
+
+- Add tensor shape metadata support for ParquetDataset.
+- Add Arrow BINARY type support for ParquetDataset.
+
+### **Serving**
+
+- Add Dice fusion to inference mode.
+- Enable INFERENCE_MODE in the processor.
+- Support TensorRT 8.x in inference.
+- Add a configuration field to control whether TensorRT is enabled.
+- Add a flag for device_placement_optimization.
+- Avoid clustering feature-column-related nodes when TensorRT is enabled.
+- Optimize inference latency when loading incremental checkpoints.
+- Optimize performance by placing only TensorRT ops on the GPU device.
+
+### **Environment & Build**
+
+- Support CUDA 12.
+- Update DEFAULT_CUDA_VERSION and DEFAULT_CUDNN_VERSION in configure.py.
+- Move third-party dependencies from WORKSPACE to workspace.bzl.
+- Update URLs for colm, ragel, aliyun-oss-sdk and uuid.
+
+### **BugFix**
+
+- Fix constant op placement bug in device placement optimization.
+- Fix NaN issue in the group_embedding API.
+- Fix SOK incompatibility with Variable.
+- Fix memory leak when updating the full model in serving.
+- Fix 'cols_to_output_tensors' not being set in GroupEmbedding.
+- Fix core dump when saving GPU EmbeddingVariable.
+- Fix CUDA resource issue in the KvResourceImportV3 kernel.
+- Fix bug in loading signature_def with coo_sparse and add a unit test.
+- Fix the bug that training ends early when the workqueue is enabled.
+- Fix the control edge connection issue in device placement optimization.
+
+### **ModelZoo**
+
+- Modify GroupEmbedding-related function usage.
+- Update the masknet example with layernorm.
+
+### **Tool & Documents**
+
+- Add a tool for removing filtered features from checkpoints.
+- Add Arm Compute Library (ACL) user documents.
+- Update Embedding Variable document to fix the initializer config example.
+- Update GroupEmbedding document.
+- Update processor documents.
+- Add user documents for Intel AMX.
+- Add TensorRT usage documents.
+- Update documents for ParquetDataset.
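+
+As a quick illustration of the `tf.feature_column.categorical_column_with_embedding` int32 support noted above, a minimal sketch (assuming a DeepRec build; the column name and dimension are illustrative placeholders, not part of this release):
+
+```python
+import tensorflow as tf
+
+# tf.int32 ids can now be fed to an embedding-backed categorical column.
+column = tf.feature_column.categorical_column_with_embedding(
+    "item_id", dtype=tf.int32)
+deep_column = tf.feature_column.embedding_column(column, dimension=16)
+```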
+ +More details of features: [https://deeprec.readthedocs.io/zh/latest/](url) + +## **Release Images** + +### **CPU Image** + +`alideeprec/deeprec-release:deeprec2304-cpu-py38-ubuntu20.04` + +### **GPU Image** + +`alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04` + # Release r1.15.5-deeprec2302 ## **Major Features and Improvements** diff --git a/docs/docs_en/DeepRec-Compile-And-Install.md b/docs/docs_en/DeepRec-Compile-And-Install.md index df06bb4ee10..cf48987ac3b 100644 --- a/docs/docs_en/DeepRec-Compile-And-Install.md +++ b/docs/docs_en/DeepRec-Compile-And-Install.md @@ -112,7 +112,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2304-cpu-py38-ubuntu20.04 ``` arm64: @@ -123,5 +123,5 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU Image with CUDA 11.6** ``` -alideeprec/deeprec-release:deeprec2302-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04 ``` diff --git a/docs/docs_en/Estimator-Compile-And-Install.md b/docs/docs_en/Estimator-Compile-And-Install.md index 5f53950af81..51522979444 100644 --- a/docs/docs_en/Estimator-Compile-And-Install.md +++ b/docs/docs_en/Estimator-Compile-And-Install.md @@ -44,7 +44,7 @@ DeepRec provide new distributed protocols such as grpc++ and star_server, which Source Code: [https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -Develop Branch:master, Latest Release Branch: deeprec2302 +Develop Branch:master, Latest Release Branch: deeprec2304 ## Estimator Build diff --git a/docs/docs_en/TFServing-Compile-And-Install.md b/docs/docs_en/TFServing-Compile-And-Install.md index bee928fbcb3..91fde221864 100644 --- a/docs/docs_en/TFServing-Compile-And-Install.md +++ b/docs/docs_en/TFServing-Compile-And-Install.md @@ -43,7 +43,7 @@ We provide optimized TFServing which could highly improve performance in inferen Source Code: [https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -Develop Branch: master, Latest Release Branch: deeprec2302 +Develop Branch: master, Latest Release Branch: deeprec2304 ## TFServing Build diff --git a/docs/docs_zh/DeepRec-Compile-And-Install.md b/docs/docs_zh/DeepRec-Compile-And-Install.md index 7980935ad28..e18fd6d5a75 100644 --- a/docs/docs_zh/DeepRec-Compile-And-Install.md +++ b/docs/docs_zh/DeepRec-Compile-And-Install.md @@ -111,7 +111,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2304-cpu-py38-ubuntu20.04 ``` arm64: @@ -122,7 +122,7 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU CUDA11.6镜像** ``` -alideeprec/deeprec-release:deeprec2302-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04 ``` ## DeepRec Processor编译打包 diff --git a/docs/docs_zh/Estimator-Compile-And-Install.md b/docs/docs_zh/Estimator-Compile-And-Install.md index a0c500d12e1..dfeeb3717de 100644 --- a/docs/docs_zh/Estimator-Compile-And-Install.md +++ b/docs/docs_zh/Estimator-Compile-And-Install.md @@ -44,7 +44,7 @@ 代码库:[https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -开发分支:master,最新Release分支:deeprec2302 +开发分支:master,最新Release分支:deeprec2304 ## Estimator编译 diff --git a/docs/docs_zh/TFServing-Compile-And-Install.md 
b/docs/docs_zh/TFServing-Compile-And-Install.md index cf83107308a..c9cc85ad82c 100644 --- a/docs/docs_zh/TFServing-Compile-And-Install.md +++ b/docs/docs_zh/TFServing-Compile-And-Install.md @@ -43,7 +43,7 @@ 代码库:[https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -开发分支:master,最新Release分支:deeprec2302 +开发分支:master,最新Release分支:deeprec2304 ## TFServing编译&打包 From 15f3b7aa735977f90ad8ae532c348aae566584ff Mon Sep 17 00:00:00 2001 From: shijieliu Date: Tue, 23 May 2023 13:50:54 +0800 Subject: [PATCH 02/91] [Distributed] Update SparseOperationKit to v23.5.01 and docker file. (#866) Signed-off-by: aleliu --- .../benchmark/benchmark_sok.py | 32 +- .../core/adapter/lookup_adapter.cpp | 59 ++-- .../core/adapter/lookup_adapter.hpp | 42 +-- .../dockerfiles/Dockerfile | 37 +++ addons/sparse_operation_kit/example/demo.py | 15 + .../python/embedding_var_lookup_utest.py | 18 +- tensorflow/python/ops/embedding_ops.py | 2 +- tensorflow/tools/pip_package/build_sok.sh | 4 +- tensorflow/workspace.bzl | 6 +- third_party/HugeCTR.patch | 285 ------------------ 10 files changed, 144 insertions(+), 356 deletions(-) create mode 100644 addons/sparse_operation_kit/dockerfiles/Dockerfile delete mode 100644 third_party/HugeCTR.patch diff --git a/addons/sparse_operation_kit/benchmark/benchmark_sok.py b/addons/sparse_operation_kit/benchmark/benchmark_sok.py index b75ff113b80..cb403594dfc 100644 --- a/addons/sparse_operation_kit/benchmark/benchmark_sok.py +++ b/addons/sparse_operation_kit/benchmark/benchmark_sok.py @@ -1,18 +1,18 @@ -# Copyright (c) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - +""" + Copyright (c) 2023, NVIDIA CORPORATION. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" from tensorflow.python.framework import dtypes import numpy as np from ast import arg @@ -176,7 +176,7 @@ def transform_categorical(feature): if args.sok: hotness = [1 for _ in range(len(CATEGORICAL_COLUMNS))] combiners = ['sum' for _ in range(len(CATEGORICAL_COLUMNS))] - deep_features = sok.lookup_sparse(variables, indices, combiners) + deep_features = sok.lookup_sparse(variables, indices, combiners=combiners) else: deep_features = [] for ind, var in zip(indices, variables): diff --git a/addons/sparse_operation_kit/core/adapter/lookup_adapter.cpp b/addons/sparse_operation_kit/core/adapter/lookup_adapter.cpp index d0ed65d8862..9fdc38f1404 100644 --- a/addons/sparse_operation_kit/core/adapter/lookup_adapter.cpp +++ b/addons/sparse_operation_kit/core/adapter/lookup_adapter.cpp @@ -1,17 +1,18 @@ -/* Copyright 2022 The DeepRec Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -======================================================================*/ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include "lookup_adapter.hpp" @@ -21,8 +22,8 @@ limitations under the License. 
namespace tensorflow { -template -void EmbeddingVarGPUAdapter::set( +template +void EmbeddingVarGPUAdapter::set( OpKernelContext* ctx, std::vector>>& vars, const std::vector& ev_size_per_lookup, cudaStream_t stream) { @@ -35,24 +36,24 @@ void EmbeddingVarGPUAdapter::set( stream_ = stream; } -template -void EmbeddingVarGPUAdapter::lookup( - const ::core::Tensor& keys, size_t num_keys, - const ::core::Tensor& id_space_offset, size_t num_id_space_offset, - const ::core::Tensor& id_space, ::core::TensorList& embedding_vec) { +template +void EmbeddingVarGPUAdapter::lookup( + const core23::Tensor& keys, size_t num_keys, + const core23::Tensor& id_space_offset, size_t num_id_space_offset, + const core23::Tensor& id_space, core23::Tensor& embedding_vec) { id_space_offset_.resize(num_id_space_offset); CUDACHECK(cudaMemcpyAsync(id_space_offset_.data(), - id_space_offset.get(), - sizeof(uint32_t) * (num_id_space_offset), + id_space_offset.data(), + sizeof(OffsetType) * (num_id_space_offset), cudaMemcpyDeviceToHost, stream_)); id_space_.resize(num_id_space_offset - 1); - CUDACHECK(cudaMemcpyAsync(id_space_.data(), id_space.get(), + CUDACHECK(cudaMemcpyAsync(id_space_.data(), id_space.data(), sizeof(int) * (num_id_space_offset - 1), cudaMemcpyDeviceToHost, stream_)); CUDACHECK(cudaStreamSynchronize(stream_)); assert(tmp_ev_list_.size() == 0); - const KeyType* input = keys.get(); + const KeyType* input = keys.data(); std::vector lookup_res; for (int i = 0; i < num_id_space_offset - 1; ++i) { size_t num = id_space_offset_[i + 1] - id_space_offset_[i]; @@ -74,16 +75,18 @@ void EmbeddingVarGPUAdapter::lookup( lookup_res.push_back(evs.flat().data() + i_ev * ev_size); } } - DType** output = embedding_vec.get(); + DType** output = static_cast(embedding_vec.data()); CUDACHECK(cudaMemcpyAsync(output, lookup_res.data(), sizeof(DType*) * lookup_res.size(), cudaMemcpyHostToDevice, stream_)); CUDACHECK(cudaStreamSynchronize(stream_)); } -template class EmbeddingVarGPUAdapter; +template class EmbeddingVarGPUAdapter; +template class EmbeddingVarGPUAdapter; // template class EmbeddingVarGPUAdapter; -template class EmbeddingVarGPUAdapter; +template class EmbeddingVarGPUAdapter; +template class EmbeddingVarGPUAdapter; // template class EmbeddingVarGPUAdapter; } // namespace tensorflow #endif diff --git a/addons/sparse_operation_kit/core/adapter/lookup_adapter.hpp b/addons/sparse_operation_kit/core/adapter/lookup_adapter.hpp index 9d06013dca5..517b93e01bd 100644 --- a/addons/sparse_operation_kit/core/adapter/lookup_adapter.hpp +++ b/addons/sparse_operation_kit/core/adapter/lookup_adapter.hpp @@ -1,17 +1,18 @@ -/* Copyright 2022 The DeepRec Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -======================================================================*/ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once @@ -34,8 +35,9 @@ limitations under the License. #ifdef GOOGLE_CUDA namespace tensorflow { +namespace core23 = HugeCTR::core23; -template +template class EmbeddingVarGPUAdapter : public ::embedding::ILookup { public: virtual ~EmbeddingVarGPUAdapter() = default; @@ -45,16 +47,16 @@ class EmbeddingVarGPUAdapter : public ::embedding::ILookup { std::vector>>& vars, const std::vector& ev_size_per_lookup, cudaStream_t stream); - void lookup(const ::core::Tensor& keys, size_t num_keys, - const ::core::Tensor& id_space_offset, size_t num_id_space_offset, - const ::core::Tensor& id_space, - ::core::TensorList& embedding_vec) override; + void lookup(const core23::Tensor& keys, size_t num_keys, + const core23::Tensor& id_space_offset, size_t num_id_space_offset, + const core23::Tensor& id_space, + core23::Tensor& embedding_vec) override; void clear_tmp_ev_list() { tmp_ev_list_.clear(); } private: std::vector*> vars_; - std::vector id_space_offset_; + std::vector id_space_offset_; std::vector id_space_; cudaStream_t stream_; OpKernelContext* ctx_; diff --git a/addons/sparse_operation_kit/dockerfiles/Dockerfile b/addons/sparse_operation_kit/dockerfiles/Dockerfile new file mode 100644 index 00000000000..623bafe3d8b --- /dev/null +++ b/addons/sparse_operation_kit/dockerfiles/Dockerfile @@ -0,0 +1,37 @@ +FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04 + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC +ENV HOROVOD_GPU_OPERATIONS=NCCL + +RUN apt-get update && \ + apt-get install -y --allow-unauthenticated \ + wget \ + cmake \ + git \ + unzip \ + curl \ + libssl-dev \ + libcurl4-openssl-dev \ + zlib1g-dev \ + python3 \ + python3-dev \ + python3-pip \ + libopenmpi-dev \ + libboost-serialization-dev \ + && apt-get clean && \ + ln -sf python3 /usr/bin/python && \ + ln -sf pip3 /usr/bin/pip + +RUN pip install \ + astor==0.8.1 \ + numpy==1.16.6 \ + scikit-build \ + twine \ + cmake==3.21.1 \ + protobuf==3.17.3 && \ + pip install --no-deps \ + keras-preprocessing==1.0.5 + +RUN wget https://github.com/bazelbuild/bazel/releases/download/0.26.1/bazel-0.26.1-installer-linux-x86_64.sh && \ + bash bazel-0.26.1-installer-linux-x86_64.sh diff --git a/addons/sparse_operation_kit/example/demo.py b/addons/sparse_operation_kit/example/demo.py index aa16aec23d1..c06a66ff65f 100644 --- a/addons/sparse_operation_kit/example/demo.py +++ b/addons/sparse_operation_kit/example/demo.py @@ -1,3 +1,18 @@ +""" + Copyright (c) 2023, NVIDIA CORPORATION. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" import time import numpy as np import tensorflow as tf diff --git a/addons/sparse_operation_kit/python/embedding_var_lookup_utest.py b/addons/sparse_operation_kit/python/embedding_var_lookup_utest.py index e504ddeb8c2..5f39933e470 100644 --- a/addons/sparse_operation_kit/python/embedding_var_lookup_utest.py +++ b/addons/sparse_operation_kit/python/embedding_var_lookup_utest.py @@ -1,3 +1,18 @@ +""" + Copyright (c) 2023, NVIDIA CORPORATION. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" import time import numpy as np import tensorflow as tf @@ -11,6 +26,7 @@ hvd.init() config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True + config.gpu_options.visible_device_list = str(hvd.local_rank()) sess = tf.compat.v1.Session(config=config) sok.init() @@ -81,7 +97,7 @@ optimizer = tf.train.AdagradOptimizer(0.1) def step(params): - embeddings = sok.lookup_sparse(params, indices, combiners) + embeddings = sok.lookup_sparse(params, indices, combiners=combiners) loss = 0 for i in range(len(embeddings)): loss = loss + tf.reduce_sum(embeddings[i]) diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py index 0f49198e4cf..4782dcdd6d1 100644 --- a/tensorflow/python/ops/embedding_ops.py +++ b/tensorflow/python/ops/embedding_ops.py @@ -1665,7 +1665,7 @@ def group_embedding_lookup_sparse(params, ) with ops.name_scope(name, 'group_embedding_lookup', params + sp_ids) as name_scope: - emb_vec = sok.lookup_sparse(params, sp_ids, combiners) + emb_vec = sok.lookup_sparse(params, sp_ids, combiners=combiners) elif strategy == DistStrategy.LOCALIZED: emb_vec = [None for _ in range(len(params))] diff --git a/tensorflow/tools/pip_package/build_sok.sh b/tensorflow/tools/pip_package/build_sok.sh index cdf46fec15e..caad68de90e 100755 --- a/tensorflow/tools/pip_package/build_sok.sh +++ b/tensorflow/tools/pip_package/build_sok.sh @@ -13,6 +13,8 @@ export ENABLE_DEEPREC=ON export DeepRecWorkdir=`pwd` export DeepRecBuild=`pwd`/bazel-DeepRec export MAKEFLAGS=-j$(nproc) +export SOK_COMPILE_GPU_SM="70;75;80" cd ./bazel-DeepRec/external/hugectr/sparse_operation_kit -"${PYTHON_BIN_PATH:-python}" setup.py install +"${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel +pip install ./dist/merlin_sok-1.1.4-cp38-cp38-linux_x86_64.whl \ No newline at end of file diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index a04b7b6f908..5bd123ce00f 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -1361,11 +1361,9 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): http_archive( name = "hugectr", # Apache License 2.0 build_file = "//third_party:hugectr.BUILD", - strip_prefix = "HugeCTR-4.3", - patch_args = ["-p1"], - patches = ["//third_party:HugeCTR.patch"], + strip_prefix = "HugeCTR-23.05.01", urls = [ - "https://github.com/NVIDIA-Merlin/HugeCTR/archive/refs/tags/v4.3.tar.gz", + "https://github.com/NVIDIA-Merlin/HugeCTR/archive/refs/tags/v23.05.01.tar.gz", ], ) diff --git a/third_party/HugeCTR.patch b/third_party/HugeCTR.patch deleted file mode 100644 
index f26df81929e..00000000000 --- a/third_party/HugeCTR.patch +++ /dev/null @@ -1,285 +0,0 @@ -From 40ec11a5defe65999749f0fbf481def58331628f Mon Sep 17 00:00:00 2001 -From: Mesilenceki -Date: Fri, 7 Apr 2023 10:20:46 +0800 -Subject: [PATCH] Modify ForwardOp - ---- - HugeCTR/core/core.hpp | 1 + - .../impl/embedding_collection_adapter.cu | 9 +- - .../impl/embedding_collection_adapter.h | 3 +- - .../lookup/kernels/embedding_collection.cc | 111 ++++++++++++++++-- - .../lookup/ops/embedding_collection.cc | 2 +- - .../src/optimizer/prepare_functions.cu | 1 + - .../sparse_operation_kit/experiment/lookup.py | 7 +- - 7 files changed, 113 insertions(+), 21 deletions(-) - -diff --git a/HugeCTR/core/core.hpp b/HugeCTR/core/core.hpp -index 2cc02640..e389dbdf 100644 ---- a/HugeCTR/core/core.hpp -+++ b/HugeCTR/core/core.hpp -@@ -19,6 +19,7 @@ - #include - - #include -+#include - - #include "macro.hpp" - -diff --git a/sparse_operation_kit/experiment/lookup/impl/embedding_collection_adapter.cu b/sparse_operation_kit/experiment/lookup/impl/embedding_collection_adapter.cu -index a739fdf2..2a78c535 100644 ---- a/sparse_operation_kit/experiment/lookup/impl/embedding_collection_adapter.cu -+++ b/sparse_operation_kit/experiment/lookup/impl/embedding_collection_adapter.cu -@@ -64,13 +64,12 @@ TFAdapter::TFAdapter() - - template - void TFAdapter::set( -- std::vector>& vars, -- std::vector& locks, std::vector& dimensions, -+ std::vector& vars, std::vector& dimensions, - std::vector& scale, cudaStream_t stream) { - std::vector data; - std::vector id_space; - for (int i = 0; i < vars.size(); ++i) { -- float* input = vars[i]->tensor()->flat().data(); -+ float* input = vars[i]; - bool is_unique = true; - for (int j = 0; j < i; ++j) { - if (input == data[j]) { -@@ -78,10 +77,6 @@ void TFAdapter::set( - break; - } - } -- if (is_unique) { -- tensorflow::tf_shared_lock lock(*vars[i]->mu()); -- locks.push_back(std::move(lock)); -- } - data.push_back(input); - id_space.push_back(i); - } -diff --git a/sparse_operation_kit/experiment/lookup/impl/embedding_collection_adapter.h b/sparse_operation_kit/experiment/lookup/impl/embedding_collection_adapter.h -index d1faff9c..d1d90d9f 100644 ---- a/sparse_operation_kit/experiment/lookup/impl/embedding_collection_adapter.h -+++ b/sparse_operation_kit/experiment/lookup/impl/embedding_collection_adapter.h -@@ -41,8 +41,7 @@ class TFAdapter : public ::embedding::ILookup { - TFAdapter(); - virtual ~TFAdapter(); - -- void set(std::vector>& vars, -- std::vector& locks, std::vector& dimensions, -+ void set(std::vector& vars, std::vector& dimensions, - std::vector& scale, cudaStream_t stream = 0); - - void lookup(const ::core::Tensor& keys, size_t num_keys, const ::core::Tensor& id_space_offset, -diff --git a/sparse_operation_kit/experiment/lookup/kernels/embedding_collection.cc b/sparse_operation_kit/experiment/lookup/kernels/embedding_collection.cc -index 1e956ac6..a01518e6 100644 ---- a/sparse_operation_kit/experiment/lookup/kernels/embedding_collection.cc -+++ b/sparse_operation_kit/experiment/lookup/kernels/embedding_collection.cc -@@ -274,15 +274,113 @@ REGISTER_GPU_KERNELS(int32_t, int32_t, int32_t, int32_t); - // ----------------------------------------------------------------------------------------------- - // LookupForward - // ----------------------------------------------------------------------------------------------- --template -+template - class LookupForwardOp : public EmbeddingCollectionBase { - private: -- Adapter adapter_; -+ sok::TFAdapter adapter_; - - public: - 
explicit LookupForwardOp(OpKernelConstruction* ctx) - : EmbeddingCollectionBase(ctx) {} - -+ void Compute(OpKernelContext* ctx) override { -+ std::vector vars; -+ std::vector scale; -+ for (int i = 0; i < this->num_lookups_; ++i) { -+ auto embedding_weights = ctx->input(i); -+ auto embedding_data = const_cast(embedding_weights.flat().data()); -+ int64 dimension = embedding_weights.shape().dim_size(1); -+ OP_REQUIRES(ctx, this->dimensions_[i] == dimension, -+ errors::InvalidArgument("Invalid dimension")); -+ -+ vars.emplace_back(embedding_data); -+ -+ if (this->shard_[i] < 0) { -+ scale.push_back(this->num_gpus_); -+ } else { -+ scale.push_back(1); -+ } -+ } -+ -+ // stream -+ auto device_ctx = ctx->op_device_context(); -+ OP_REQUIRES(ctx, device_ctx != nullptr, errors::Aborted("No valid device context.")); -+ cudaStream_t stream = stream_executor::gpu::AsGpuStreamValue(device_ctx->stream()); -+ -+ // Prepare inputs (except handles) -+ const Tensor* key_recv_buffer = nullptr; -+ OP_REQUIRES_OK(ctx, ctx->input("key_recv_buffer", &key_recv_buffer)); -+ sok::Tensor key_recv_buffer_tensor(sok::convert_tensor(key_recv_buffer)); -+ -+ const Tensor* row_length_recv_buffer = nullptr; -+ OP_REQUIRES_OK(ctx, ctx->input("row_length_recv_buffer", &row_length_recv_buffer)); -+ sok::Tensor row_length_recv_buffer_tensor( -+ sok::convert_tensor(row_length_recv_buffer)); -+ -+ int global_batch_size = row_length_recv_buffer->NumElements() / this->num_lookups_; -+ -+ const Tensor* hotness = nullptr; -+ OP_REQUIRES_OK(ctx, ctx->input("hotness", &hotness)); -+ std::vector hotness_vector; -+ int* t_hotness = (int*)hotness->data(); -+ int64_t hotness_num = hotness->NumElements(); -+ for (int64_t i =0;imake_core_resource(ctx); -+ this->update_meta(tf_backend, global_batch_size,hotness_vector); -+ -+ // Prepare ILookup (i.e. 
embedding table) -+ std::vector ev_size_per_lookup; -+ for (auto& p : this->ebc_param_->lookup_params) { -+ ev_size_per_lookup.push_back(p.ev_size); -+ } -+ adapter_.set(vars, this->dimensions_, scale, stream); -+ -+ // Prepare outputs -+ auto buffer_size_list = ::embedding::tf::model_forward::get_model_comm_buffer_size(*this->meta_, tf_backend->get_global_gpu_count(), global_batch_size); -+ std::vector emb_vec_model_buffer; -+ for (size_t i = 0; i < buffer_size_list.size(); ++i) { -+ Tensor* output = nullptr; -+ OP_REQUIRES_OK(ctx, -+ ctx->allocate_output(i, {static_cast(buffer_size_list[i])}, &output)); -+ emb_vec_model_buffer.push_back(sok::convert_tensor(output)); -+ } -+ -+ // Do forward -+ int64_t num_model_key, num_model_offsets; -+ sok::Tensor ret_model_key, ret_model_offset; -+ ::embedding::tf::model_forward::sparse_forward_per_gpu(tf_backend, *this->meta_, key_recv_buffer_tensor, row_length_recv_buffer_tensor, &adapter_, -+ emb_vec_model_buffer, &num_model_key, &num_model_offsets, &ret_model_key, &ret_model_offset); -+ -+ // Prepare model_key & model_offsets -+ // Note the type of model_offsets is always uint32_t -+ Tensor* model_key = nullptr; -+ OP_REQUIRES_OK(ctx, ctx->allocate_output(this->num_gpus_, {num_model_key}, &model_key)); -+ sok::Tensor model_key_tensor(sok::convert_tensor(model_key)); -+ Tensor* model_offsets = nullptr; -+ OP_REQUIRES_OK(ctx, -+ ctx->allocate_output(this->num_gpus_ + 1, {num_model_offsets}, &model_offsets)); -+ sok::Tensor model_offsets_tensor(sok::convert_tensor(model_offsets)); -+ -+ // Copy tensors that will be used in backward -+ ::embedding::tf::model_forward::copy_model_keys_and_offsets(tf_backend, ret_model_key, ret_model_offset, model_key_tensor, model_offsets_tensor); -+ } -+}; -+ -+template -+class LookupForwardDynamicOp : public EmbeddingCollectionBase { -+ using VarType = DummyVar; -+ -+ private: -+ sok::DummyVarAdapter adapter_; -+ -+ public: -+ explicit LookupForwardDynamicOp(OpKernelConstruction* ctx) -+ : EmbeddingCollectionBase(ctx) {} -+ - void Compute(OpKernelContext* ctx) override { - std::vector locks; - std::vector> vars; -@@ -379,13 +477,11 @@ class LookupForwardOp : public EmbeddingCollectionBase("Tindices") \ - .TypeConstraint("Toffsets") \ - .TypeConstraint("dtype"), \ -- LookupForwardOp>) \ -+ LookupForwardOp) \ - REGISTER_KERNEL_BUILDER(Name("LookupForwardDynamic") \ - .Device(DEVICE_GPU) \ - .HostMemory("handles") \ -@@ -393,8 +489,7 @@ class LookupForwardOp : public EmbeddingCollectionBase("Tindices") \ - .TypeConstraint("Toffsets") \ - .TypeConstraint("dtype"), \ -- LookupForwardOp, \ -- sok::DummyVarAdapter>) -+ LookupForwardDynamicOp) - // clang-format on - - #if TF_VERSION_MAJOR == 1 -@@ -752,7 +847,7 @@ namespace tensorflow { - template - class LookupForwardEmbeddingVarGPUOp : public EmbeddingCollectionBase { - private: -- using VarType = EmbeddingVarGPU; -+ using VarType = EmbeddingVar; - EmbeddingVarGPUAdapter adapter_; - - public: -diff --git a/sparse_operation_kit/experiment/lookup/ops/embedding_collection.cc b/sparse_operation_kit/experiment/lookup/ops/embedding_collection.cc -index dbea8811..d3a71730 100644 ---- a/sparse_operation_kit/experiment/lookup/ops/embedding_collection.cc -+++ b/sparse_operation_kit/experiment/lookup/ops/embedding_collection.cc -@@ -63,7 +63,7 @@ REGISTER_OP("PreprocessingForward") - - // There may be duplicates in the `handles` - REGISTER_OP("LookupForward") -- .Input("handles: num_lookups * resource") -+ .Input("embeddings: num_lookups * dtype") - .Input("key_recv_buffer: Tindices") 
- .Input("row_length_recv_buffer: Toffsets") - .Input("hotness: int32") -diff --git a/sparse_operation_kit/kit_cc/kit_cc_infra/src/optimizer/prepare_functions.cu b/sparse_operation_kit/kit_cc/kit_cc_infra/src/optimizer/prepare_functions.cu -index afe4881b..67f3e1b4 100644 ---- a/sparse_operation_kit/kit_cc/kit_cc_infra/src/optimizer/prepare_functions.cu -+++ b/sparse_operation_kit/kit_cc/kit_cc_infra/src/optimizer/prepare_functions.cu -@@ -15,6 +15,7 @@ - */ - - #include -+#include - - #include "optimizer/prepare_functions.h" - -diff --git a/sparse_operation_kit/sparse_operation_kit/experiment/lookup.py b/sparse_operation_kit/sparse_operation_kit/experiment/lookup.py -index d567c857..b4b0027c 100644 ---- a/sparse_operation_kit/sparse_operation_kit/experiment/lookup.py -+++ b/sparse_operation_kit/sparse_operation_kit/experiment/lookup.py -@@ -129,15 +129,16 @@ def _lookup_forward(params, *args, **kwargs): - for param in params: - # For tf.GradientTape - variable_accessed(param) -- handles = [param.handle for param in params] - if isinstance(params[0], DynamicVariable): -+ handles = [param.handle for param in params] - return raw_ops.lookup_forward_dynamic(handles, *args, **kwargs) - elif importlib.find_loader("tensorflow.python.ops.kv_variable_ops") and isinstance( - params[0], kv_variable_ops.EmbeddingVariable - ): -+ handles = [param.handle for param in params] - return raw_ops.lookup_forward_embedding_var_gpu(handles, *args, **kwargs) - else: -- return raw_ops.lookup_forward(handles, *args, **kwargs) -+ return raw_ops.lookup_forward(params, *args, **kwargs) - - - @tf.RegisterGradient("LookupForward") -@@ -165,7 +166,7 @@ def _LookupBackward(op, *top_grads): - grads = [] - for i in range(len(indices)): - handle = op.inputs[i] -- params_shape = variable_shape(handle) -+ params_shape = handle.shape - size = array_ops.expand_dims(array_ops.size(indices[i]), 0) - values_shape = array_ops.concat([size, params_shape[1:]], 0) - values[i] = tf.reshape(values[i], values_shape) --- -2.37.1 (Apple Git-137.1) \ No newline at end of file From b124938dc7c72b84a5b1d012d3a86725b4d7c8c2 Mon Sep 17 00:00:00 2001 From: Tongxuan Liu Date: Tue, 23 May 2023 17:37:20 +0800 Subject: [PATCH 03/91] [Dockerfile] Update docker images in user documents. 
(#867) Signed-off-by: Tongxuan Liu --- docs/docs_en/DeepRec-Compile-And-Install.md | 4 ---- docs/docs_en/Estimator-Compile-And-Install.md | 4 ---- docs/docs_en/TFServing-Compile-And-Install.md | 4 ---- docs/docs_zh/DeepRec-Compile-And-Install.md | 4 ---- docs/docs_zh/Estimator-Compile-And-Install.md | 4 ---- docs/docs_zh/TFServing-Compile-And-Install.md | 4 ---- 6 files changed, 24 deletions(-) diff --git a/docs/docs_en/DeepRec-Compile-And-Install.md b/docs/docs_en/DeepRec-Compile-And-Install.md index cf48987ac3b..1fbe923b30d 100644 --- a/docs/docs_en/DeepRec-Compile-And-Install.md +++ b/docs/docs_en/DeepRec-Compile-And-Install.md @@ -15,11 +15,7 @@ | GCC Version | Python Version | CUDA VERSION | IMAGE | | ----------- | -------------- | ------------ | --------------------------------------------------------------- | -| 7.5.0 | 3.6.9 | CUDA 11.0.3 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu110-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.2.2 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu112-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.4.2 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu114-ubuntu18.04 | | 7.5.0 | 3.6.9 | CUDA 11.6.1 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu116-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.7.1 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu117-ubuntu18.04 | | 9.4.0 | 3.8.10 | CUDA 11.6.2 | alideeprec/deeprec-base:deeprec-base-gpu-py38-cu116-ubuntu20.04 | | 11.2.0 | 3.8.6 | CUDA 11.7.1 | alideeprec/deeprec-base:deeprec-base-gpu-py38-cu117-ubuntu22.04 | diff --git a/docs/docs_en/Estimator-Compile-And-Install.md b/docs/docs_en/Estimator-Compile-And-Install.md index 51522979444..cdc04044875 100644 --- a/docs/docs_en/Estimator-Compile-And-Install.md +++ b/docs/docs_en/Estimator-Compile-And-Install.md @@ -15,11 +15,7 @@ | GCC Version | Python Version | CUDA VERSION | IMAGE | | ----------- | -------------- | ------------ | --------------------------------------------------------------- | -| 7.5.0 | 3.6.9 | CUDA 11.0.3 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu110-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.2.2 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu112-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.4.2 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu114-ubuntu18.04 | | 7.5.0 | 3.6.9 | CUDA 11.6.1 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu116-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.7.1 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu117-ubuntu18.04 | | 9.4.0 | 3.8.10 | CUDA 11.6.2 | alideeprec/deeprec-base:deeprec-base-gpu-py38-cu116-ubuntu20.04 | | 11.2.0 | 3.8.6 | CUDA 11.7.1 | alideeprec/deeprec-base:deeprec-base-gpu-py38-cu117-ubuntu22.04 | diff --git a/docs/docs_en/TFServing-Compile-And-Install.md b/docs/docs_en/TFServing-Compile-And-Install.md index 91fde221864..8ced3628673 100644 --- a/docs/docs_en/TFServing-Compile-And-Install.md +++ b/docs/docs_en/TFServing-Compile-And-Install.md @@ -14,11 +14,7 @@ | GCC Version | Python Version | CUDA VERSION | IMAGE | | ----------- | -------------- | ------------ | --------------------------------------------------------------- | -| 7.5.0 | 3.6.9 | CUDA 11.0.3 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu110-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.2.2 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu112-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.4.2 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu114-ubuntu18.04 | | 7.5.0 | 3.6.9 | CUDA 11.6.1 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu116-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.7.1 | 
alideeprec/deeprec-base:deeprec-base-gpu-py36-cu117-ubuntu18.04 | | 9.4.0 | 3.8.10 | CUDA 11.6.2 | alideeprec/deeprec-base:deeprec-base-gpu-py38-cu116-ubuntu20.04 | | 11.2.0 | 3.8.6 | CUDA 11.7.1 | alideeprec/deeprec-base:deeprec-base-gpu-py38-cu117-ubuntu22.04 | diff --git a/docs/docs_zh/DeepRec-Compile-And-Install.md b/docs/docs_zh/DeepRec-Compile-And-Install.md index e18fd6d5a75..b526ce9f2d8 100644 --- a/docs/docs_zh/DeepRec-Compile-And-Install.md +++ b/docs/docs_zh/DeepRec-Compile-And-Install.md @@ -15,11 +15,7 @@ | GCC Version | Python Version | CUDA VERSION | IMAGE | | ----------- | -------------- | ------------ | --------------------------------------------------------------- | -| 7.5.0 | 3.6.9 | CUDA 11.0.3 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu110-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.2.2 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu112-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.4.2 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu114-ubuntu18.04 | | 7.5.0 | 3.6.9 | CUDA 11.6.1 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu116-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.7.1 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu117-ubuntu18.04 | | 9.4.0 | 3.8.10 | CUDA 11.6.2 | alideeprec/deeprec-base:deeprec-base-gpu-py38-cu116-ubuntu20.04 | | 11.2.0 | 3.8.6 | CUDA 11.7.1 | alideeprec/deeprec-base:deeprec-base-gpu-py38-cu117-ubuntu22.04 | diff --git a/docs/docs_zh/Estimator-Compile-And-Install.md b/docs/docs_zh/Estimator-Compile-And-Install.md index dfeeb3717de..332b96e6086 100644 --- a/docs/docs_zh/Estimator-Compile-And-Install.md +++ b/docs/docs_zh/Estimator-Compile-And-Install.md @@ -15,11 +15,7 @@ | GCC Version | Python Version | CUDA VERSION | IMAGE | | ----------- | -------------- | ------------ | --------------------------------------------------------------- | -| 7.5.0 | 3.6.9 | CUDA 11.0.3 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu110-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.2.2 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu112-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.4.2 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu114-ubuntu18.04 | | 7.5.0 | 3.6.9 | CUDA 11.6.1 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu116-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.7.1 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu117-ubuntu18.04 | | 9.4.0 | 3.8.10 | CUDA 11.6.2 | alideeprec/deeprec-base:deeprec-base-gpu-py38-cu116-ubuntu20.04 | | 11.2.0 | 3.8.6 | CUDA 11.7.1 | alideeprec/deeprec-base:deeprec-base-gpu-py38-cu117-ubuntu22.04 | diff --git a/docs/docs_zh/TFServing-Compile-And-Install.md b/docs/docs_zh/TFServing-Compile-And-Install.md index c9cc85ad82c..27bfc864e4e 100644 --- a/docs/docs_zh/TFServing-Compile-And-Install.md +++ b/docs/docs_zh/TFServing-Compile-And-Install.md @@ -14,11 +14,7 @@ | GCC Version | Python Version | CUDA VERSION | IMAGE | | ----------- | -------------- | ------------ | --------------------------------------------------------------- | -| 7.5.0 | 3.6.9 | CUDA 11.0.3 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu110-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.2.2 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu112-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.4.2 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu114-ubuntu18.04 | | 7.5.0 | 3.6.9 | CUDA 11.6.1 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu116-ubuntu18.04 | -| 7.5.0 | 3.6.9 | CUDA 11.7.1 | alideeprec/deeprec-base:deeprec-base-gpu-py36-cu117-ubuntu18.04 | | 9.4.0 | 3.8.10 | CUDA 11.6.2 | alideeprec/deeprec-base:deeprec-base-gpu-py38-cu116-ubuntu20.04 | | 
11.2.0 | 3.8.6 | CUDA 11.7.1 | alideeprec/deeprec-base:deeprec-base-gpu-py38-cu117-ubuntu22.04 | From 2649edd79da3996cca8f9655cc320a36be97de97 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Wed, 24 May 2023 18:00:23 +0800 Subject: [PATCH 04/91] [Runtime] Support for loading saved_model with device information when use session_group and multi_stream. (#868) Signed-off-by: Tao Peng --- .../core/common_runtime/direct_session.cc | 5 +++ tensorflow/core/common_runtime/placer.cc | 42 +++++++++++++++---- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 004262291fa..20171503972 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -386,6 +386,11 @@ class DirectSessionFactory : public SessionFactory { Status NewSessionGroup(const SessionOptions& options, SessionGroup** out_session_group, const SessionGroupMetadata& metadata) { + // Set SessionGroup env + if (setenv("USE_SESSION_GROUP", "1", 1) != 0) { + LOG(WARNING) << "Set env USE_SESSION_GROUP failed."; + } + int session_num = metadata.session_count; if (session_num < 1) { return errors::InvalidArgument( diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc index 9b1d196c30b..1887804d41d 100644 --- a/tensorflow/core/common_runtime/placer.cc +++ b/tensorflow/core/common_runtime/placer.cc @@ -112,6 +112,39 @@ Status Placer::Run() { } } + std::string cpu_name, gpu_name, all_names; + for (auto d : devices_->devices()) { + all_names += d->name(); + all_names += ";\n"; + if (d->name().find("device:CPU:") != std::string::npos) { + cpu_name = d->name(); + } else if (d->name().find("device:GPU:") != std::string::npos) { + gpu_name = d->name(); + } + } + + bool use_session_group = false; + TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar( + "USE_SESSION_GROUP", false, &use_session_group)); + if (use_session_group) { + for (Node* node : graph_->op_nodes()) { + const std::string dname = node->requested_device(); + if (!dname.empty()) { + if (dname.find("device:CPU:") != std::string::npos) { + node->set_requested_device(cpu_name); + } else if (dname.find("device:GPU:") != std::string::npos) { + node->set_requested_device(gpu_name); + } else { + LOG(ERROR) << "Can not find requested device in current devices set" + << ", node requested device: " << dname + << ", current devices set: " << all_names; + } + } + // we don't continue here, cause there are + // nodes should be assigned device below. 
+ } + } + FunctionStack stack(function_name_); ColocationGraph colocation_graph(graph_, stack, flib_def_, devices_, default_local_device_, allow_soft_placement_, @@ -236,15 +269,6 @@ Status Placer::Run() { "PLACE_TRT_OP_ON_GPU_ONLY", false, &place_trtop_on_gpu_only)); // Keep TRTEngineOp On GPU Only if (place_trtop_on_gpu_only) { - std::string cpu_name, gpu_name; - for (auto d : devices_->devices()) { - if (d->name().find("device:CPU:") != std::string::npos) { - cpu_name = d->name(); - } else if (d->name().find("device:GPU:") != std::string::npos) { - gpu_name = d->name(); - } - } - for (Node* n : graph_->op_nodes()) { if (n->type_string() == "TRTEngineOp") { n->set_assigned_device_name(gpu_name); From 20f292daff7da044c01c763929150147068008d7 Mon Sep 17 00:00:00 2001 From: xiaowan0322 <35219371+xiaowan0322@users.noreply.github.com> Date: Fri, 26 May 2023 17:22:53 +0800 Subject: [PATCH 05/91] [Quantization] Fix bug: init global_step before saving variables (#870) --- tools/low_precision_optimize/low_precision_optimize.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/low_precision_optimize/low_precision_optimize.py b/tools/low_precision_optimize/low_precision_optimize.py index 6d9114028bc..cf34d44c5b7 100644 --- a/tools/low_precision_optimize/low_precision_optimize.py +++ b/tools/low_precision_optimize/low_precision_optimize.py @@ -565,7 +565,10 @@ def optimize(model_path, save_path, opt_config=None, data_type=BF16, calib_file= ) ev_dict = update_embedding_vars(sess) if len(ev_dict) > 0: - model_outputs.append(_nd(tf.train.get_global_step().name)) + global_step = tf.train.get_global_step() + model_outputs.append(_nd(global_step.name)) + if isinstance(global_step, tf.Variable): + sess.run(tf.variables_initializer([global_step])) def _extract_sub_graph(outputs): graph_def = sess.graph.as_graph_def(add_shapes=True) From f576e01942830ca9d36965a169e0efbf6023092e Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Sat, 27 May 2023 00:00:19 +0800 Subject: [PATCH 06/91] [Serving] Clear virtual_device configurations before load new checkpoint. (#871) Signed-off-by: Tao Peng --- serving/processor/serving/model_session.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/serving/processor/serving/model_session.cc b/serving/processor/serving/model_session.cc index f342b07b712..99df3946a22 100644 --- a/serving/processor/serving/model_session.cc +++ b/serving/processor/serving/model_session.cc @@ -173,6 +173,10 @@ Status ModelSessionMgr::CreateSessionGroup( metadata.model_id = 0; metadata.gpu_ids = config->gpu_ids; metadata.cpusets = config->cpusets; + + ConfigProto* opt_config = const_cast(&(session_options_->config)); + GPUOptions* gpu_opt = opt_config->mutable_gpu_options(); + gpu_opt->mutable_experimental()->clear_virtual_devices(); TF_RETURN_IF_ERROR(NewSessionGroup(*session_options_, session_group, metadata)); TF_RETURN_IF_ERROR((*session_group)->Create(meta_graph_def_.graph_def())); From e07f25cf48ff216688fcf9376f23a4f42573c2f9 Mon Sep 17 00:00:00 2001 From: xiaowan0322 <35219371+xiaowan0322@users.noreply.github.com> Date: Mon, 29 May 2023 19:27:01 +0800 Subject: [PATCH 07/91] [Quantization] Fix bug: reserve input nodes, clear saver devices on demand. 
(#873) Signed-off-by: wanchen.swc --- .../low_precision_optimize.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tools/low_precision_optimize/low_precision_optimize.py b/tools/low_precision_optimize/low_precision_optimize.py index cf34d44c5b7..065e8efac8d 100644 --- a/tools/low_precision_optimize/low_precision_optimize.py +++ b/tools/low_precision_optimize/low_precision_optimize.py @@ -1,4 +1,5 @@ import os +import re import sys import tempfile @@ -542,6 +543,18 @@ def _get_gather_pattern(): return update_dict, variable_path +def clear_saver_devices(new_meta_graph, meta_graph): + def is_saver_node(name): + return re.search('^save_[1-9][0-9]*/', name) or re.search('^save/', name) + + saver_name = _nd(meta_graph.saver_def.save_tensor_name) + saver_node = [nd for nd in meta_graph.graph_def.node if nd.name == saver_name][0] + if saver_node.device == '': + for node in new_meta_graph.graph_def.node: + if is_saver_node(node.name): + node.device = '' + + def optimize(model_path, save_path, opt_config=None, data_type=BF16, calib_file=None): saved_model = loader_impl._parse_saved_model(model_path) tags = saved_model.meta_graphs[0].meta_info_def.tags @@ -549,6 +562,7 @@ def optimize(model_path, save_path, opt_config=None, data_type=BF16, calib_file= meta_graph_def = tf.saved_model.loader.load(sess, tags, model_path) signature_keys = list(meta_graph_def.signature_def.keys()) signature_def = meta_graph_def.signature_def[signature_keys[0]] + model_inputs = [_nd(v.name) for v in signature_def.inputs.values()] model_outputs = [_nd(v.name) for v in signature_def.outputs.values()] init_op = loader_impl.get_init_op(meta_graph_def) if init_op is not None: @@ -576,7 +590,7 @@ def _extract_sub_graph(outputs): return tf.graph_util.extract_sub_graph(graph_def, outputs) def _save(save_path): - sub_graph_def = _extract_sub_graph(model_outputs) + sub_graph_def = _extract_sub_graph(model_inputs + model_outputs) node_names = [node.name for node in sub_graph_def.node] variables = [v for v in get_all_variables() if _nd(v.name) in node_names] init_name = tf.variables_initializer(variables).name @@ -601,7 +615,8 @@ def _save(save_path): _nd(saver.saver_def.filename_tensor_name), _nd(saver.saver_def.save_tensor_name), ] - graph_def = _extract_sub_graph(model_outputs + saver_nodes + [init_name]) + outputs = model_inputs + model_outputs + saver_nodes + [init_name] + graph_def = _extract_sub_graph(outputs) graph = sess.graph # Create new meta graph def @@ -645,6 +660,7 @@ def _save(save_path): assets_collection=assets_collection, main_op=main_op, ) + clear_saver_devices(builder._saved_model.meta_graphs[0], meta_graph_def) builder.save() if len(ev_opt_dict) > 0: target_path = f'{save_path}/variables' From cedaeefc45f1350b95fe814e2097bce74955ecf7 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Thu, 1 Jun 2023 10:53:35 +0800 Subject: [PATCH 08/91] [Docs] Update device placement documents. 
(#874)

Signed-off-by: Tao Peng
---
 docs/docs_en/Device-Placement.md | 18 ++++++++++++++++++
 docs/docs_zh/Device-Placement.md | 18 ++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/docs/docs_en/Device-Placement.md b/docs/docs_en/Device-Placement.md
index 4689aa9d728..ee33f62db42 100644
--- a/docs/docs_en/Device-Placement.md
+++ b/docs/docs_en/Device-Placement.md
@@ -18,3 +18,21 @@ In user C++ code:
 tensorflow::SessionOptions session_options;
 session_options.config.mutable_graph_options()->mutable_optimizer_options()->set_device_placement_optimization(true);
 ```
+
+If we use the [tensorflow-serving](https://github.com/DeepRec-AI/serving) offered by DeepRec, we can enable the optimization via `--enable_device_placement_optimization=true`.
+
+And when we use the DeepRec [processor](https://deeprec.readthedocs.io/en/latest/Processor.html), we should add the `enable_device_placement_optimization` field to the JSON configuration file, as follows:
+```
+{
+  "model_entry": "",
+  "processor_path": "...",
+  "processor_entry": "libserving_processor.so",
+  "processor_type": "cpp",
+  "model_config": {
+    ...
+    "enable_device_placement_optimization": true,
+    ...
+  },
+  ...
+}
+```
diff --git a/docs/docs_zh/Device-Placement.md b/docs/docs_zh/Device-Placement.md
index 318d131a942..914060a03a2 100644
--- a/docs/docs_zh/Device-Placement.md
+++ b/docs/docs_zh/Device-Placement.md
@@ -19,3 +19,21 @@
 tensorflow::SessionOptions session_options;
 session_options.config.mutable_graph_options()->mutable_optimizer_options()->set_device_placement_optimization(true);
 ```
+如果用户使用DeepRec提供的[tensorflow-serving](https://github.com/DeepRec-AI/serving),可以通过`--enable_device_placement_optimization=true`来开启device placement功能。
+
+如果用户使用DeepRec [processor](https://deeprec.readthedocs.io/zh/latest/Processor.html),可以通过在json配置文件中增加enable_device_placement_optimization来开启device placement功能,如下所示:
+```
+{
+  "model_entry": "",
+  "processor_path": "...",
+  "processor_entry": "libserving_processor.so",
+  "processor_type": "cpp",
+  "model_config": {
+    ...
+    "enable_device_placement_optimization": true,
+    ...
+  },
+  ...
+}
+```
+

From 6a1317681f880d665c35520e5358e9ff8858bd4d Mon Sep 17 00:00:00 2001
From: Tao Peng
Date: Thu, 1 Jun 2023 18:03:21 +0800
Subject: [PATCH 09/91] [Docs] Update session_group documents. (#876)

Signed-off-by: Tao Peng
---
 docs/docs_en/SessionGroup.md | 8 +++-----
 docs/docs_zh/SessionGroup.md | 8 +++-----
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/docs/docs_en/SessionGroup.md b/docs/docs_en/SessionGroup.md
index f4112b8c105..e68d499d1dd 100644
--- a/docs/docs_en/SessionGroup.md
+++ b/docs/docs_en/SessionGroup.md
@@ -26,7 +26,6 @@ For GPU tasks, the following configurations are required:
 "session_num": 2,
 "use_per_session_threads": true,
 "gpu_ids_list": "0,2",
-"use_multi_stream": true,
 ...
 }
@@ -72,7 +71,7 @@ DeepRec will detect which CPUs can be allocated, and then allocate CPU cores to
 These options can be used in GPU task.
 
 #### GPU Task
-In Inference scenarios, users often use GPUs for online services to improve computing efficiency and reduce latency.
One problem that may be encountered here is that the online GPU utilization rate is low, resulting in a waste of resources. Then, to make good use of GPU resources, we use Multi-streams to process requests, which greatly improves QPS while ensuring latency. In the GPU scenario, using session group will use multi-stream by default, that is, each session uses an independent stream. At present inference scenarios, the multi-streams function is bound to the SessionGroup function. For the usage of SessionGroup, see the previous link. In the future, we will directly support the multi-streams function on DirectSession. @@ -88,7 +87,7 @@ nvidia-cuda-mps-control -d Currently taking Tensorflow_serving as an example (it will be necessary to add other framework usage methods later), the following parameters need to be added when starting the server. ```c++ -CUDA_VISIBLE_DEVICES=0 ENABLE_MPS=1 CONTEXTS_COUNT_PER_GPU=4 MERGE_COMPUTE_COPY_STREAM=1 PER_SESSION_HOSTALLOC=1 bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --tensorflow_intra_op_parallelism=8 --tensorflow_inter_op_parallelism=8 --use_per_session_threads=true --session_num_per_group=4 --use_multi_stream=true --allow_gpu_mem_growth=true --model_base_path=/xx/xx/pb/ +CUDA_VISIBLE_DEVICES=0 ENABLE_MPS=1 CONTEXTS_COUNT_PER_GPU=4 MERGE_COMPUTE_COPY_STREAM=1 PER_SESSION_HOSTALLOC=1 bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --tensorflow_intra_op_parallelism=8 --tensorflow_inter_op_parallelism=8 --use_per_session_threads=true --session_num_per_group=4 --allow_gpu_mem_growth=true --model_base_path=/xx/xx/pb/ ENABLE_MPS=1: Turn on MPS (it is generally recommended to turn on). CONTEXTS_COUNT_PER_GPU=4: Configure cuda contexts count for each physical GPU, the default is 4. @@ -98,7 +97,6 @@ PER_SESSION_HOSTALLOC=1: Each session uses an independent gpu host allocator. use_per_session_threads=true: Each session configures the thread pool separately. session_num_per_group=4: Indicates the number of sessions configured by the session group. -use_multi_stream=true: Enable the multi-stream function. ``` ##### 3.Multi-GPU @@ -118,7 +116,7 @@ The option --gpu_ids_list=0,2 means that the user can use GPUs 0 and 2. If the n Startup command: ``` -ENABLE_MPS=1 CONTEXTS_COUNT_PER_GPU=4 bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --tensorflow_intra_op_parallelism=8 --tensorflow_inter_op_parallelism=8 --use_per_session_threads=true --session_num_per_group=4 --use_multi_stream=true --allow_gpu_mem_growth=true --gpu_ids_list=0,2 --model_base_path=/xx/xx/pb/ +ENABLE_MPS=1 CONTEXTS_COUNT_PER_GPU=4 bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --tensorflow_intra_op_parallelism=8 --tensorflow_inter_op_parallelism=8 --use_per_session_threads=true --session_num_per_group=4 --allow_gpu_mem_growth=true --gpu_ids_list=0,2 --model_base_path=/xx/xx/pb/ ``` For a detailed explanation of the above environment variables, see: [Startup parameters](https://deeprec.readthedocs.io/en/latest/SessionGroup.html#startup-command) diff --git a/docs/docs_zh/SessionGroup.md b/docs/docs_zh/SessionGroup.md index d27ac2035da..318106dba72 100644 --- a/docs/docs_zh/SessionGroup.md +++ b/docs/docs_zh/SessionGroup.md @@ -30,7 +30,6 @@ SessionGroup功能提供了可以配置一组Session,并且将Request通过Rou "session_num": 2, "use_per_session_threads": true, "gpu_ids_list": "0,2", - "use_multi_stream": true, ... 
diff --git a/docs/docs_zh/SessionGroup.md b/docs/docs_zh/SessionGroup.md
index d27ac2035da..318106dba72 100644
--- a/docs/docs_zh/SessionGroup.md
+++ b/docs/docs_zh/SessionGroup.md
@@ -30,7 +30,6 @@ The SessionGroup feature makes it possible to configure a group of Sessions and route Requests through a Rou
   "session_num": 2,
   "use_per_session_threads": true,
   "gpu_ids_list": "0,2",
-  "use_multi_stream": true,
   ...
 }
 ```
@@ -74,7 +73,7 @@ session3: 11 12 13
 The options above are also available for GPU tasks.

 #### GPU Task
-In Inference scenarios, users often use GPUs for online serving to improve computing efficiency and reduce latency. One problem that may be encountered is low online GPU utilization, which wastes resources. To make good use of GPU resources, we recommend that users use Multi-streams to process requests, which greatly improves QPS while keeping latency low.
+In Inference scenarios, users often use GPUs for online serving to improve computing efficiency and reduce latency. One problem that may be encountered is low online GPU utilization, which wastes resources. To make good use of GPU resources, we use Multi-streams to process requests, which greatly improves QPS while keeping latency low. In the GPU scenario, a session group uses multi-stream by default, i.e. each session uses an independent stream.

 At present the multi-stream function is bound to the SessionGroup function; see the link above for SessionGroup usage. Later we will support multi-stream directly on DirectSession.
@@ -91,7 +90,7 @@ nvidia-cuda-mps-control -d
 Taking TensorFlow Serving as an example (usage for other frameworks will be added later), the following parameters need to be added when starting the server:
 ```c++
-CUDA_VISIBLE_DEVICES=0 ENABLE_MPS=1 CONTEXTS_COUNT_PER_GPU=4 MERGE_COMPUTE_COPY_STREAM=1 PER_SESSION_HOSTALLOC=1 bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --tensorflow_intra_op_parallelism=8 --tensorflow_inter_op_parallelism=8 --use_per_session_threads=true --session_num_per_group=4 --use_multi_stream=true --allow_gpu_mem_growth=true --model_base_path=/xx/xx/pb/
+CUDA_VISIBLE_DEVICES=0 ENABLE_MPS=1 CONTEXTS_COUNT_PER_GPU=4 MERGE_COMPUTE_COPY_STREAM=1 PER_SESSION_HOSTALLOC=1 bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --tensorflow_intra_op_parallelism=8 --tensorflow_inter_op_parallelism=8 --use_per_session_threads=true --session_num_per_group=4 --allow_gpu_mem_growth=true --model_base_path=/xx/xx/pb/

 ENABLE_MPS=1: Turn on MPS (generally recommended).
 CONTEXTS_COUNT_PER_GPU=4: The number of CUDA contexts configured for each physical GPU; the default is 4.
@@ -100,7 +99,6 @@ PER_SESSION_HOSTALLOC=1: Each session uses an independent GPU host allocator.

 use_per_session_threads=true: Each session configures its own thread pool.
 session_num_per_group=4: The number of sessions configured in the session group.
-use_multi_stream=true: Enable the multi-stream function.
 ```

 ##### 3.Multi-GPU
@@ -120,7 +118,7 @@ use_multi_stream=true: Enable the multi-stream function.
 The overall command is as follows:
 ```
-ENABLE_MPS=1 CONTEXTS_COUNT_PER_GPU=4 bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --tensorflow_intra_op_parallelism=8 --tensorflow_inter_op_parallelism=8 --use_per_session_threads=true --session_num_per_group=4 --use_multi_stream=true --allow_gpu_mem_growth=true --gpu_ids_list=0,2 --model_base_path=/xx/xx/pb/
+ENABLE_MPS=1 CONTEXTS_COUNT_PER_GPU=4 bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --tensorflow_intra_op_parallelism=8 --tensorflow_inter_op_parallelism=8 --use_per_session_threads=true --session_num_per_group=4 --allow_gpu_mem_growth=true --gpu_ids_list=0,2 --model_base_path=/xx/xx/pb/
 ```
 For a detailed explanation of the environment variables above, see: [Startup parameters](https://deeprec.readthedocs.io/zh/latest/SessionGroup.html#id5)
 Use the TF serving code provided by DeepRec: [TF serving](https://github.com/DeepRec-AI/serving/commits/deeprec)

From 27362e23278865d15edccab36d5bb89c65dff63b Mon Sep 17 00:00:00 2001
From: Tao Peng
Date: Fri, 2 Jun 2023 08:57:21 +0800
Subject: [PATCH 10/91] [Runtime] Enable multi-stream in session_group by default.
(#875) Signed-off-by: Tao Peng --- .../core/common_runtime/direct_session.cc | 58 ++++++------------- .../core/common_runtime/direct_session.h | 2 - 2 files changed, 18 insertions(+), 42 deletions(-) diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 20171503972..9670e838f88 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -364,7 +364,7 @@ class DirectSessionFactory : public SessionFactory { std::vector visible_cpus; DirectSession* session = new DirectSession(options, new DeviceMgr(std::move(devices)), - true, this, visible_cpus); + this, visible_cpus); #if GOOGLE_CUDA // owned gpu_shared_rmgr @@ -422,13 +422,15 @@ class DirectSessionFactory : public SessionFactory { #if GOOGLE_CUDA // Each virtual gpu device will be assigned to one session, // and every virtual device has a independent stream. - bool use_multi_stream = options.config.use_per_session_stream(); + + // NOTE: we use multi_stream always in gpu session_group. + // bool use_multi_stream = options.config.use_per_session_stream(); int base_index = 0; std::vector sorted_gpu_ids; // Visiable gpu ids: 0,1,2... int visible_gpu_count = VisibleDeviceCount(); int stream_num_per_device = session_num; - if (use_multi_stream) { + { // Current model id in multi-models int model_id = metadata.model_id; // If user not set gpu id list, session group will @@ -510,11 +512,6 @@ class DirectSessionFactory : public SessionFactory { // We set allow_growth in multi-stream mode. gpu_options->set_allow_growth(true); - } else { - // NOTE: Use single stream in session group mode. - // This can't get good performance. - LOG(WARNING) << "Use single stream in session group mode," - << "this can't get good performance."; } #endif // GOOGLE_CUDA @@ -614,7 +611,6 @@ class DirectSessionFactory : public SessionFactory { } #endif - #if GOOGLE_CUDA SessionGroup* session_group = new DirectSessionGroup(shared_rmgr, gpu_shared_rmgrs, @@ -634,44 +630,30 @@ class DirectSessionFactory : public SessionFactory { options, "/job:localhost/replica:0/task:0", &dev, &dev_rmgr_map, dev_global_tp_opt)); DeviceMgr* dev_mgr = nullptr; - bool owned_device_mgr = true; #if GOOGLE_CUDA - if (use_multi_stream) { - RemoveUselessDevice(dev, session_to_device_id[i]); - dev_mgr = new DeviceMgr(std::move(dev)); - owned_device_mgr = true; - } else { - // Use the same deivce as leader session, this can't get - // good performance, so user should set use_multi_stream true - // in session group mode. 
- static DeviceMgr* device_mgr = new DeviceMgr(std::move(dev)); - dev_mgr = device_mgr; - owned_device_mgr = false; - } + RemoveUselessDevice(dev, session_to_device_id[i]); + dev_mgr = new DeviceMgr(std::move(dev)); #else dev_mgr = new DeviceMgr(std::move(dev)); - owned_device_mgr = true; #endif // GOOGLE_CUDA SessionOptions curr_options = options; #if GOOGLE_CUDA - if (use_multi_stream) { + curr_options.config.add_per_session_devices( + "/job:localhost/replica:0/task:0/device:GPU:" + + std::to_string(session_to_device_id[i])); + if (use_per_session_host_allocator) { curr_options.config.add_per_session_devices( - "/job:localhost/replica:0/task:0/device:GPU:" + - std::to_string(session_to_device_id[i])); - if (use_per_session_host_allocator) { - curr_options.config.add_per_session_devices( - "/job:localhost/replica:0/task:0/device:CPU:"+std::to_string(i)); - } else { - curr_options.config.add_per_session_devices( - "/job:localhost/replica:0/task:0/device:CPU:0"); - } + "/job:localhost/replica:0/task:0/device:CPU:"+std::to_string(i)); + } else { + curr_options.config.add_per_session_devices( + "/job:localhost/replica:0/task:0/device:CPU:0"); } #endif // GOOGLE_CUDA DirectSession* sess = - new DirectSession(curr_options, dev_mgr, owned_device_mgr, - this, visible_cpus_per_session[i]); + new DirectSession(curr_options, dev_mgr, this, + visible_cpus_per_session[i]); session_group->CreateSession(sess); { mutex_lock l(sessions_lock_); @@ -812,11 +794,9 @@ bool DirectSession::ShouldUseRunHandlerPool( DirectSession::DirectSession(const SessionOptions& options, const DeviceMgr* device_mgr, - bool owd_device_mgr, DirectSessionFactory* factory, const std::vector& visible_cpus) : options_(options), - own_device_mgr_(owd_device_mgr), device_mgr_(device_mgr), factory_(factory), cancellation_manager_(new CancellationManager()), @@ -966,9 +946,7 @@ DirectSession::~DirectSession() { execution_state_.reset(nullptr); flib_def_.reset(nullptr); - if (own_device_mgr_) { - delete device_mgr_; - } + delete device_mgr_; if (multi_stream_shared_rmgr_) { delete multi_stream_shared_rmgr_; diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h index 5a3b3ae1335..142c1b25a30 100644 --- a/tensorflow/core/common_runtime/direct_session.h +++ b/tensorflow/core/common_runtime/direct_session.h @@ -62,7 +62,6 @@ class DirectSession : public Session { // closed. This ensures that Reset requests from the 'factory' don't get sent // to sessions that are already closed. DirectSession(const SessionOptions& options, const DeviceMgr* device_mgr, - bool own_device_mgr, DirectSessionFactory* factory, const std::vector& visible_cpus); @@ -337,7 +336,6 @@ class DirectSession : public Session { const SessionOptions options_; // Device structures. - bool own_device_mgr_; const DeviceMgr* device_mgr_; std::vector devices_; // not owned DeviceSet device_set_; From d678a50b31015bcb9519eee841a4d297083915d3 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Mon, 5 Jun 2023 11:52:11 +0800 Subject: [PATCH 11/91] [ModelZoo] Update documents and config files for modelzoo benchmark. 
(#879)

Signed-off-by: chenbangduo.cbd
---
 modelzoo/benchmark/cpu/README.md   | 12 ++++++------
 modelzoo/benchmark/cpu/config.yaml |  3 ++-
 modelzoo/benchmark/gpu/README.md   | 12 ++++++------
 modelzoo/benchmark/gpu/config.yaml |  3 ++-
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/modelzoo/benchmark/cpu/README.md b/modelzoo/benchmark/cpu/README.md
index 70f0f2a35e4..67dd02069fb 100644
--- a/modelzoo/benchmark/cpu/README.md
+++ b/modelzoo/benchmark/cpu/README.md
@@ -1,4 +1,4 @@
-# Deeprec Benchmark
+# DeepRec Benchmark
 The following is a brief directory structure and description for this example:
 ```
@@ -17,7 +17,7 @@ The following is a brief directory structure and description for this example:
 ## Content

-- [Deeprec Benchmark](#deeprec-benchmark)
+- [DeepRec Benchmark](#deeprec-benchmark)
   - [Content](#content)
   - [Requirement](#requirement)
   - [Usage](#usage)
@@ -44,11 +44,11 @@ The following test environment should be installed:
 - `tf_test_image` : the images used to benchmark stock tf, the default value is `alideeprec/deeprec-weekly-modelzoo:tf`

-- `test model` : the models to benchmark, e.g. `- wide_and_deep`. Models can be added or deleted as the benchmark requires. The default value is all models in the modelzoo that need to be benchmarked

-- `model_batchsize` : the batch size of each model, e.g. `wide_and_deep: 512`. Please keep it the same as the batch size used in model training; it is only used to calculate throughput and does not set training parameters

-- `modelArgs` : extra parameters for the DeepRec benchmark, e.g. `--emb_fusion true`; the default is `--steps 12000`. Please do not modify the `--steps` parameter

 - `stocktf` : whether to benchmark stock TF; the value can be `on` or `off`. The default value is `on`, which means stock TF is benchmarked. If there is no need to benchmark stock TF, change this value to `off`
@@ -61,4 +61,4 @@ The following test environment should be installed:
 ### log
 The log files of each benchmark run are stored in `benchmark/benchmark_result/log/$CurrentTime`
-The end of log files are ACC and AUC value and the thoughtout values are multiply the average value of `global_step/sec` between 2000~3000 steps by model batchsize
+Each log file ends with the ACC and AUC values, and the throughput is obtained by multiplying the average `global_step/sec` between steps 2000 and 12000 by the model batch size

diff --git a/modelzoo/benchmark/cpu/config.yaml b/modelzoo/benchmark/cpu/config.yaml
index ad31c533d57..0fe8a30791b 100644
--- a/modelzoo/benchmark/cpu/config.yaml
+++ b/modelzoo/benchmark/cpu/config.yaml
@@ -2,7 +2,7 @@ deeprec_test_image: alideeprec/deeprec-release-modelzoo:latest
 tf_test_image: alideeprec/deeprec-weekly-modelzoo:tf

-#test model default: [WDL, DLRM ,DeepFM, DSSM, DIEN, DIN]
+#test model
 test_model:
   - dlrm
   - wide_and_deep
@@ -41,6 +41,7 @@ model_batchsize:
 #args for deeprec
 #like --emb_fusion true
 modelArgs:
+  --steps 12000

 #tf on/off
 stocktf: on
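Putting the README fields together, a trimmed `config.yaml` that benchmarks only two models could look like the sketch below (field names follow the file above; the batch sizes are illustrative):

```yaml
deeprec_test_image: alideeprec/deeprec-release-modelzoo:latest
tf_test_image: alideeprec/deeprec-weekly-modelzoo:tf

# Benchmark only two models instead of the full modelzoo.
test_model:
  - dlrm
  - wide_and_deep

# Must match the batch sizes used during training (illustrative values).
model_batchsize:
  dlrm: 512
  wide_and_deep: 512

modelArgs:
  --steps 12000

stocktf: on
```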
diff --git a/modelzoo/benchmark/gpu/README.md b/modelzoo/benchmark/gpu/README.md
index 6ac61af2d27..4b225a1cdde 100644
--- a/modelzoo/benchmark/gpu/README.md
+++ b/modelzoo/benchmark/gpu/README.md
@@ -1,4 +1,4 @@
-# Deeprec Benchmark
+# DeepRec Benchmark
 The following is a brief directory structure and description for this example:
 ```
@@ -17,7 +17,7 @@ The following is a brief directory structure and description for this example:
 ## Content

-- [Deeprec Benchmark](#deeprec-benchmark)
+- [DeepRec Benchmark](#deeprec-benchmark)
   - [Content](#content)
   - [Requirement](#requirement)
   - [Usage](#usage)
@@ -44,11 +44,11 @@ The following test environment should be installed:
 - `tf_test_image` : the images used to benchmark stock tf, the default value is `alideeprec/deeprec-release-modelzoo:gpu-latest`

-- `test model` : the models to benchmark, e.g. `- wide_and_deep`. Models can be added or deleted as the benchmark requires. The default value is all models in the modelzoo that need to be benchmarked

-- `model_batchsize` : the batch size of each model, e.g. `wide_and_deep: 512`. Please keep it the same as the batch size used in model training; it is only used to calculate throughput and does not set training parameters

-- `modelArgs` : extra parameters for the DeepRec benchmark, e.g. `--emb_fusion true`; the default is `--steps 12000`. Please do not modify the `--steps` parameter

 - `stocktf` : whether to benchmark stock TF; the value can be `on` or `off`. The default value is `on`, which means stock TF is benchmarked. If there is no need to benchmark stock TF, change this value to `off`
@@ -61,4 +61,4 @@ The following test environment should be installed:
 ### log
 The log files of each benchmark run are stored in `benchmark/benchmark_result/log/$CurrentTime`
-The end of log files are ACC and AUC value and the thoughtout values are multiply the average value of `global_step/sec` between 2000~3000 steps by model batchsize
+Each log file ends with the ACC and AUC values, and the throughput is obtained by multiplying the average `global_step/sec` between steps 2000 and 12000 by the model batch size
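The throughput rule above can be scripted directly; a sketch, assuming the standard `INFO:tensorflow:global_step/sec: <value>` log lines and a logging frequency of one sample per 100 steps (the log path and batch size below are illustrative):

```python
import re

# Sketch: compute benchmark throughput from a training log, following the
# README rule: average `global_step/sec` over the 2000~12000-step window,
# multiplied by the model batch size.
BATCH_SIZE = 512  # illustrative; use the value from config.yaml
LOG_PATH = "benchmark_result/log/run.log"  # illustrative path

pattern = re.compile(r"global_step/sec: ([0-9.]+)")
rates = []
with open(LOG_PATH) as f:
    for line in f:
        match = pattern.search(line)
        if match:
            rates.append(float(match.group(1)))

# With one sample per ~100 steps, the 2000~12000-step window corresponds
# to samples 20..120 (an assumption; adjust to the actual log cadence).
window = rates[20:120]
if not window:
    raise SystemExit("no global_step/sec samples in the expected window")

throughput = sum(window) / len(window) * BATCH_SIZE
print("throughput: %.1f samples/sec" % throughput)
```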
diff --git a/modelzoo/benchmark/gpu/config.yaml b/modelzoo/benchmark/gpu/config.yaml
index 00ff77f4781..31951ee30a1 100644
--- a/modelzoo/benchmark/gpu/config.yaml
+++ b/modelzoo/benchmark/gpu/config.yaml
@@ -2,7 +2,7 @@ deeprec_test_image: alideeprec/deeprec-release-modelzoo:gpu-latest
 tf_test_image: alideeprec/deeprec-release-modelzoo:gpu-latest

-#test model default: [WDL, DLRM ,DeepFM, DSSM, DIEN, DIN]
+#test model
 test_model:
   - dlrm
   - wide_and_deep
@@ -41,6 +41,7 @@ model_batchsize:
 #args for deeprec
 #like --emb_fusion true
 modelArgs:
+  --steps 12000

 #tf on/off
 stocktf: on

From 5dba495480f74ecb339a8e4cb287293e4ee0782a Mon Sep 17 00:00:00 2001
From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com>
Date: Mon, 5 Jun 2023 18:05:25 +0800
Subject: [PATCH 12/91] [Embedding] Support StaticGPUHashMap to optimize
 EmbeddingVariable in inference. (#878)

Performance Comparison:

| Benchmark          | batch_size | Global_steps |
| ------------------ | ---------- | ------------ |
| DCNv2 train EV     | 8192       | 11.5         |
| DCNv2 inference EV | 8192       | 12.6         |
| DCNv2 train EV     | 2048       | 25.80        |
| DCNv2 inference EV | 2048       | 29.20        |

Signed-off-by: JunqiHu
---
 .../core/framework/embedding/embedding_var.h  |  10 +-
 .../framework/embedding/gpu_hash_map_kv.h     | 300 ++++++++++--------
 .../framework/embedding/gpu_hash_table.cu.cc  | 193 ++++++++++-
 .../core/framework/embedding/gpu_hash_table.h | 125 ++++----
 .../core/framework/embedding/kv_interface.h   |  11 +-
 .../framework/embedding/single_tier_storage.h |   7 +
 tensorflow/core/framework/embedding/storage.h |   3 +
 ..._embedding_lookup_sparse_forward_ops.cu.cc |  36 ++-
 .../core/kernels/kv_variable_lookup_ops.cc    |  27 +-
 .../ops/embedding_variable_ops_gpu_test.py    |  22 +-
 third_party/cucollection.patch                | 103 ++++--
 11 files changed, 594 insertions(+), 243 deletions(-)

diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h
index 9d224fb0d6f..1cdbcfcbd89 100644
--- a/tensorflow/core/framework/embedding/embedding_var.h
+++ b/tensorflow/core/framework/embedding/embedding_var.h
@@ -43,9 +43,6 @@ namespace tensorflow {
     EventMgr* event_mgr);
 #endif //GOOGLE_CUDA

-namespace {
-const char* kInferenceMode = "INFERENCE_MODE";
-}

 template <class K, class V>
 class GPUHashTable;
@@ -632,6 +629,13 @@ class EmbeddingVar : public ResourceBase {
     storage_->BatchLookupOrCreateKeys(key, item_idxs, n, device);
   }

+  void Lookup(const K* key, V* val, V* default_v,
+              int32 default_v_num, bool is_use_default_value_tensor,
+              size_t n, const Eigen::GpuDevice& device) {
+    storage_->BatchLookup(key, val, default_v, default_v_num,
+                          is_use_default_value_tensor, n, device);
+  }
+
   int32 SlotNum() {
     return (emb_config_.block_num * (1 + emb_config_.slot_num));
   }

diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h
index 1b4ca32f689..82edf045f60 100644
--- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h
+++ 
b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h @@ -16,50 +16,60 @@ limitations under the License. #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_MAP_KV_H_ #if GOOGLE_CUDA -#include "tensorflow/core/framework/embedding/kv_interface.h" + #include "tensorflow/core/framework/embedding/gpu_hash_table.h" +#include "tensorflow/core/framework/embedding/kv_interface.h" +#include "tensorflow/core/util/env_var.h" namespace tensorflow { + namespace embedding { -template +template class GPUHashMapKV : public KVInterface { public: GPUHashMapKV(const EmbeddingConfig& config, Allocator* alloc) - : config_(config), alloc_(alloc) { - hash_table_ = new GPUHashTable(-1, alloc); + : config_(config), alloc_(alloc), static_hash_table_(nullptr) { + TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference_)); + if (!is_inference_) { + hash_table_ = new GPUHashTable(-1, alloc); + } } ~GPUHashMapKV() override { - for (int i = 0; i < hash_table_->bank_ptrs.size(); ++i) { - TypedAllocator::Deallocate( - alloc_, hash_table_->bank_ptrs[i], - value_len_ * hash_table_->initial_bank_size); - TypedAllocator::Deallocate( - alloc_, hash_table_->existence_flag_ptrs[i], - hash_table_->initial_bank_size); - } - if (hash_table_->mem_bank_num != 0) { - auto num_elements = hash_table_->mem_bank_num * - (config_.block_num * (1 + config_.slot_num)); + if (is_inference_) { TypedAllocator::Deallocate( - alloc_, hash_table_->d_bank_ptrs, num_elements); - TypedAllocator::Deallocate( - alloc_, hash_table_->d_existence_flag_ptrs, num_elements); + alloc_, static_hash_table_->values_d, + static_hash_table_->capacity_ * static_hash_table_->dimension_); + delete static_hash_table_; + } else { + for (int i = 0; i < hash_table_->bank_ptrs.size(); ++i) { + TypedAllocator::Deallocate(alloc_, hash_table_->bank_ptrs[i], + value_len_ * hash_table_->initial_bank_size); + TypedAllocator::Deallocate(alloc_, hash_table_->existence_flag_ptrs[i], + hash_table_->initial_bank_size); + } + if (hash_table_->mem_bank_num != 0) { + auto num_elements = hash_table_->mem_bank_num * + (config_.block_num * (1 + config_.slot_num)); + TypedAllocator::Deallocate(alloc_, hash_table_->d_bank_ptrs, + num_elements); + TypedAllocator::Deallocate(alloc_, hash_table_->d_existence_flag_ptrs, + num_elements); + } + delete hash_table_; } - delete hash_table_; } TF_DISALLOW_COPY_AND_ASSIGN(GPUHashMapKV); - void SetValueLen(int64 value_len) { - value_len_ = value_len; - } + void SetValueLen(int64 value_len) { value_len_ = value_len; } - Status BatchLookupOrCreateKeys(const K* keys, size_t n, - int32* item_idxs, const Eigen::GpuDevice& device) { + Status BatchLookupOrCreateKeys(const K* keys, size_t n, int32* item_idxs, + const Eigen::GpuDevice& device) { mutex_lock lock(lock_); - int remaining_size = n + *(hash_table_->start_idx) - + int remaining_size = + n + *(hash_table_->start_idx) - hash_table_->mem_bank_num * hash_table_->initial_bank_size; if (remaining_size > 0) { Resize(remaining_size); @@ -71,99 +81,126 @@ class GPUHashMapKV : public KVInterface { } Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, - int32 default_v_num, bool is_use_default_value_tensor, - size_t n, const Eigen::GpuDevice& device) { - int32* item_idxs = TypedAllocator::Allocate(alloc_, n, - AllocationAttributes()); + int32 default_v_num, + bool is_use_default_value_tensor, size_t n, + const Eigen::GpuDevice& device) { + int32* item_idxs = + TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); BatchLookupOrCreateKeys(keys, n, item_idxs, device); 
functor::KvLookupCreateEmb()( - keys, val, default_v, value_len_, item_idxs, n, - config_.emb_index, default_v_num, is_use_default_value_tensor, - hash_table_->d_bank_ptrs, hash_table_->d_existence_flag_ptrs, + keys, val, default_v, value_len_, item_idxs, n, config_.emb_index, + default_v_num, is_use_default_value_tensor, hash_table_->d_bank_ptrs, + hash_table_->d_existence_flag_ptrs, (config_.block_num * (1 + config_.slot_num)), hash_table_->initial_bank_size, device.stream()); TypedAllocator::Deallocate(alloc_, item_idxs, n); return Status::OK(); } - void GetSnapshot(std::vector* key_list, - std::vector* value_list, - const EmbeddingConfig& emb_config) { + void GetSnapshot(std::vector* key_list, std::vector* value_list, + const EmbeddingConfig& emb_config) { + if (is_inference_) return; // Special case for testing in training mode; auto size = hash_table_->Size(); - if (size > 0) { - int32* item_idxs = TypedAllocator::Allocate( - alloc_, size, AllocationAttributes()); - K* keys_gpu = TypedAllocator::Allocate( - alloc_, size, AllocationAttributes()); - V* values_gpu = TypedAllocator::Allocate( - alloc_, size * value_len_, AllocationAttributes()); - V* values = TypedAllocator::Allocate( - cpu_allocator(), size * value_len_, AllocationAttributes()); - key_list->resize(size); - - auto slot_num = config_.block_num * (1 + config_.slot_num); - functor::KvKeyGetSnapshot()( - keys_gpu, item_idxs, emb_config.emb_index, - emb_config.primary_emb_index, hash_table_->d_existence_flag_ptrs, - hash_table_->mem_bank_num, slot_num, - hash_table_->initial_bank_size, hash_table_, size, NULL); - functor::KvEmbGetSnapshot()( - keys_gpu, values_gpu, -1, value_len_, item_idxs,size, - emb_config.emb_index, hash_table_->d_bank_ptrs, - hash_table_->mem_bank_num, slot_num, - hash_table_->initial_bank_size, NULL); - - for (int64 i = 0; i < size; i++) { - value_list->emplace_back(values + i * value_len_); - } - - cudaMemcpyAsync(const_cast(key_list->data()), - keys_gpu, size * sizeof(K), cudaMemcpyDeviceToHost); - cudaMemcpyAsync(values, values_gpu, size * value_len_ * sizeof(V), - cudaMemcpyDeviceToHost); - EventSynchronize(NULL); - - TypedAllocator::Deallocate(alloc_, item_idxs, size); - TypedAllocator::Deallocate(alloc_, keys_gpu, size); - TypedAllocator::Deallocate(alloc_, values_gpu, size * value_len_); + if (size <= 0) return; + + int32* item_idxs = + TypedAllocator::Allocate(alloc_, size, AllocationAttributes()); + K* keys_gpu = + TypedAllocator::Allocate(alloc_, size, AllocationAttributes()); + V* values_gpu = TypedAllocator::Allocate(alloc_, size * value_len_, + AllocationAttributes()); + V* values = TypedAllocator::Allocate(cpu_allocator(), size * value_len_, + AllocationAttributes()); + key_list->resize(size); + for (int64 i = 0; i < size; i++) { + value_list->emplace_back(values + i * value_len_); } + + auto slot_num = emb_config.block_num * (1 + emb_config.slot_num); + functor::KvKeyGetSnapshot()( + keys_gpu, item_idxs, emb_config.emb_index, emb_config.primary_emb_index, + hash_table_->d_existence_flag_ptrs, hash_table_->mem_bank_num, slot_num, + hash_table_->initial_bank_size, hash_table_, size, NULL); + + functor::KvEmbGetSnapshot()( + keys_gpu, values_gpu, -1, value_len_, item_idxs, size, + emb_config.emb_index, hash_table_->d_bank_ptrs, + hash_table_->mem_bank_num, slot_num, hash_table_->initial_bank_size, + NULL); + + cudaMemcpyAsync(const_cast(key_list->data()), keys_gpu, + size * sizeof(K), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(values, values_gpu, size * value_len_ * sizeof(V), + 
cudaMemcpyDeviceToHost); + EventSynchronize(NULL); + TypedAllocator::Deallocate(alloc_, item_idxs, size); + TypedAllocator::Deallocate(alloc_, keys_gpu, size); + TypedAllocator::Deallocate(alloc_, values_gpu, size * value_len_); } Status Import(const std::vector& key_import, - const std::vector& value_import, - const Eigen::GpuDevice* device, - const EmbeddingConfig& emb_config) { + const std::vector& value_import, + const Eigen::GpuDevice* device, + const EmbeddingConfig& emb_config) { int n = key_import.size(); auto stream = device->stream(); - if (n > 0) { - int32* item_idxs = TypedAllocator::Allocate( - alloc_, n, AllocationAttributes()); - K* key_gpu = TypedAllocator::Allocate( - alloc_, n, AllocationAttributes()); - cudaMemcpyAsync(key_gpu, key_import.data(), - key_import.size() * sizeof(K), cudaMemcpyHostToDevice, stream); - BatchLookupOrCreateKeys(key_gpu, n, item_idxs, *device); - V* value_gpu = TypedAllocator::Allocate( + + if (is_inference_) { + if (n == 0) { + LOG(INFO) << "Size of keys in EmbeddingVar: " << emb_config.name + << " is 0 while loading in inference mode!"; + return Status::OK(); + } + static_hash_table_ = + new GPUStaticHashTable(n, value_len_, -1, -1, alloc_, stream); + K* keys_d = + TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); + cudaMemcpyAsync(keys_d, key_import.data(), n * sizeof(K), + cudaMemcpyHostToDevice, stream); + static_hash_table_->values_d = TypedAllocator::Allocate( alloc_, value_import.size(), AllocationAttributes()); - cudaMemcpyAsync(value_gpu, value_import.data(), - value_import.size() * sizeof(V), cudaMemcpyHostToDevice, stream); - - functor::KvUpdateEmb()( - key_import.data(), value_gpu, value_len_, item_idxs, n, - emb_config.emb_index, key_import.size(), - hash_table_->d_bank_ptrs, hash_table_->d_existence_flag_ptrs, - (emb_config.block_num * (1 + emb_config.slot_num)), - hash_table_->initial_bank_size, stream); + cudaMemcpyAsync(static_hash_table_->values_d, value_import.data(), + value_import.size() * sizeof(V), cudaMemcpyHostToDevice, + stream); + functor::KvInitStaticMap()( + keys_d, static_hash_table_, n, value_len_, stream); EventSynchronize(stream); - TypedAllocator::Deallocate(alloc_, item_idxs, n); - TypedAllocator::Deallocate(alloc_, value_gpu, value_import.size()); - TypedAllocator::Deallocate(alloc_, key_gpu, n); + + TypedAllocator::Deallocate(alloc_, keys_d, n); + } else { + if (n > 0) { + int32* item_idxs = + TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); + K* key_gpu = + TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); + cudaMemcpyAsync(key_gpu, key_import.data(), + key_import.size() * sizeof(K), cudaMemcpyHostToDevice, + stream); + BatchLookupOrCreateKeys(key_gpu, n, item_idxs, *device); + V* value_gpu = TypedAllocator::Allocate(alloc_, value_import.size(), + AllocationAttributes()); + cudaMemcpyAsync(value_gpu, value_import.data(), + value_import.size() * sizeof(V), cudaMemcpyHostToDevice, + stream); + + functor::KvUpdateEmb()( + key_import.data(), value_gpu, value_len_, item_idxs, n, + emb_config.emb_index, key_import.size(), hash_table_->d_bank_ptrs, + hash_table_->d_existence_flag_ptrs, + (emb_config.block_num * (1 + emb_config.slot_num)), + hash_table_->initial_bank_size, stream); + EventSynchronize(stream); + TypedAllocator::Deallocate(alloc_, item_idxs, n); + TypedAllocator::Deallocate(alloc_, value_gpu, value_import.size()); + TypedAllocator::Deallocate(alloc_, key_gpu, n); + } } + return Status::OK(); } Status BatchLookupOrCreate(const K* keys, size_t n, - ValuePtr** 
value_ptrs) override { + ValuePtr** value_ptrs) override { return Status::OK(); } @@ -171,25 +208,21 @@ class GPUHashMapKV : public KVInterface { return Status::OK(); } - Status Contains(K key) override { - return Status::OK(); - } + Status Contains(K key) override { return Status::OK(); } Status Insert(K key, const ValuePtr* value_ptr) override { return Status::OK(); } - Status Remove(K key) override { - return Status::OK(); - } + Status Remove(K key) override { return Status::OK(); } Status BatchLookup(const K* keys, size_t size, - ValuePtr** value_ptrs) override { + ValuePtr** value_ptrs) override { return Status::OK(); } Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector*>& value_ptrs) override { return Status::OK(); } @@ -198,46 +231,43 @@ class GPUHashMapKV : public KVInterface { } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector*>& value_ptrs) override { return Status::OK(); } - int64 Size() const override { - return 0; - } + int64 Size() const override { return 0; } - void SetTotalDims(int total_dims) override { - } + void SetTotalDims(int total_dims) override {} - void FreeValuePtr(ValuePtr* value_ptr) override { - } + void FreeValuePtr(ValuePtr* value_ptr) override {} Status Commit(K key, const ValuePtr* value_ptr) override { return Status::OK(); } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector*>* value_ptr_list) override { return Status::OK(); } - std::string DebugString() const override { - return std::string(); - } + std::string DebugString() const override { return std::string(); } - Iterator* GetIterator() override { - return nullptr; - } + Iterator* GetIterator() override { return nullptr; } - GPUHashTable* HashTable() override { - return hash_table_; + GPUHashTable* HashTable() override { return hash_table_; } + + Status BatchLookup(const K* keys, V* val, V* default_v, int32 default_v_num, + bool is_use_default_value_tensor, size_t n, + const Eigen::GpuDevice& device) override { + functor::KvLookupKey()( + keys, val, n, value_len_, static_hash_table_, device.stream()); + return Status::OK(); } private: void Resize(int hint) { while (hint > 0) { - for (int i = 0; i < (config_.block_num * - (1 + config_.slot_num)); ++i) { + for (int i = 0; i < (config_.block_num * (1 + config_.slot_num)); ++i) { V* ptr = TypedAllocator::Allocate( alloc_, value_len_ * hash_table_->initial_bank_size, AllocationAttributes()); @@ -251,23 +281,23 @@ class GPUHashMapKV : public KVInterface { ++hash_table_->mem_bank_num; } - auto num_elements = hash_table_->mem_bank_num * ( - config_.block_num * (1 + config_.slot_num)); + auto num_elements = hash_table_->mem_bank_num * + (config_.block_num * (1 + config_.slot_num)); if (hash_table_->d_bank_ptrs) { TypedAllocator::Deallocate(alloc_, hash_table_->d_bank_ptrs, - num_elements); + num_elements); TypedAllocator::Deallocate(alloc_, hash_table_->d_existence_flag_ptrs, - num_elements); + num_elements); } hash_table_->d_bank_ptrs = TypedAllocator::Allocate( alloc_, num_elements, AllocationAttributes()); cudaMemcpy(hash_table_->d_bank_ptrs, hash_table_->bank_ptrs.data(), - num_elements * sizeof(V*), cudaMemcpyHostToDevice); + num_elements * sizeof(V*), cudaMemcpyHostToDevice); hash_table_->d_existence_flag_ptrs = TypedAllocator::Allocate( alloc_, num_elements, AllocationAttributes()); cudaMemcpy(hash_table_->d_existence_flag_ptrs, - hash_table_->existence_flag_ptrs.data(), - num_elements * 
sizeof(bool*), cudaMemcpyHostToDevice); + hash_table_->existence_flag_ptrs.data(), + num_elements * sizeof(bool*), cudaMemcpyHostToDevice); } void EventSynchronize(const cudaStream_t& stream) { @@ -280,14 +310,16 @@ class GPUHashMapKV : public KVInterface { private: EmbeddingConfig config_; + bool is_inference_; + GPUStaticHashTable* static_hash_table_; GPUHashTable* hash_table_; Allocator* alloc_; int64 value_len_; mutex lock_; }; -} // namespace embedding -} // namespace tensorflow +} // namespace embedding +} // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_MAP_KV_H_ +#endif // GOOGLE_CUDA +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_MAP_KV_H_ diff --git a/tensorflow/core/framework/embedding/gpu_hash_table.cu.cc b/tensorflow/core/framework/embedding/gpu_hash_table.cu.cc index 1a2465bf0b7..b56bd5b7210 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_table.cu.cc +++ b/tensorflow/core/framework/embedding/gpu_hash_table.cu.cc @@ -22,9 +22,8 @@ #include #include -#include - #include "cuco/dynamic_map.cuh" +#include "cuco/static_map.cuh" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/embedding/gpu_hash_table.h" #include "tensorflow/core/framework/register_types.h" @@ -125,15 +124,189 @@ int32 GPUHashTable::Size() { return hash_table->map_.get_size(); } -#define REGISTER_ALL_TYPE(type) \ - template class GPUHashTable; \ - template class GPUHashTable; +template > +class StaticHashTable { + public: + cuco::static_map map_; + + StaticHashTable(size_t initial_capacity, K empty_key_sentinel, + int32 empty_value_sentinel, CUCOAllocator alloc) + : map_(initial_capacity, empty_key_sentinel, empty_value_sentinel, + alloc) {} +}; + +template +GPUStaticHashTable::GPUStaticHashTable(size_t capacity, int dimension, + K empty_key_sentinel, + int32 empty_value_sentinel, + Allocator* alloc, + cudaStream_t stream) { + capacity_ = capacity; + dimension_ = dimension; + // cudaMallocAsync(&values_d, sizeof(V) * dimension * capacity, stream); + // cudaMallocManaged(&values_d, sizeof(V) * dimension * capacity); + + hash_table = new StaticHashTable( + capacity / 0.8 /*load_factor*/, empty_key_sentinel, empty_value_sentinel, + gpu_hash_map_tf_allocator(alloc)); +} + +template +GPUStaticHashTable::~GPUStaticHashTable() { + delete hash_table; + delete default_values; + cudaFree(values_d); +} + +template +std::size_t GPUStaticHashTable::Size() { + return hash_table->map_.get_size(); +} + +#define REGISTER_ALL_TYPE(type) \ + template class GPUHashTable; \ + template class GPUHashTable; \ + template class GPUStaticHashTable; \ + template class GPUStaticHashTable; TF_CALL_REAL_NUMBER_TYPES(REGISTER_ALL_TYPE) #undef REGISTER_ALL_TYPE namespace functor { using atomicT = cuda::atomic; +template , + typename KeyEqual = thrust::equal_to> +__global__ void kv_initialize_static_map(const Key* key_first, int32 num_items, + int32 dimension, + mutableViewT map_mutable_view, + atomicT* num_successes, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + std::size_t thread_num_successes = 0; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + + while (key_idx < num_items) { + auto key = *(key_first + key_idx); + int32 value = key_idx * dimension; + + auto const insert_pair = cuco::pair_type{key, value}; + if 
(map_mutable_view.insert(tile, insert_pair, hash, key_equal) && + tile.thread_rank() == 0) { + thread_num_successes++; + } + + key_idx += (gridDim.x * blockDim.x) / tile_size; + } + std::size_t block_num_successes = + BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + *num_successes += block_num_successes; + } +} + +template +struct KvInitStaticMap { + void operator()(const Key* keys, GPUStaticHashTable* hash_table, + int32 num_items, int32 dimension, cudaStream_t stream) { + using MutableViewT = typename cuco::static_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::device_mutable_view; + + auto& map = hash_table->hash_table->map_; + size_t num_to_insert = num_items; + while (num_to_insert > 0) { + static_assert(sizeof(std::size_t) == sizeof(atomicT)); + CUCO_CUDA_TRY( + cudaMemsetAsync(map.get_num_success(), 0, sizeof(atomicT), stream)); + + auto n = std::min((size_t)65535, num_to_insert); + auto const block_size = 128; + auto stride = 1; + auto const tile_size = 4; + auto const grid_size = + (tile_size * n + stride * block_size - 1) / (stride * block_size); + TF_CHECK_OK(GpuLaunchKernel( + kv_initialize_static_map, + thrust::equal_to>, + grid_size, block_size, 0, stream, keys, n, dimension, + map.get_device_mutable_view(), map.get_num_success(), + cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); + + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + + std::size_t h_num_successes = + map.get_num_success()->load(cuda::std::memory_order_relaxed); + map.update_size(h_num_successes); + keys += n; + num_to_insert -= n; + } + } +}; + +template , + typename KeyEqual = thrust::equal_to> +__global__ void kv_lookup_key_kernel(const Key* key_first, const V* value_srcs, + V* value_first, size_t num_items, + int32 dimension, ViewT map_views, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) { + auto grid = cooperative_groups::this_grid(); + auto block = cooperative_groups::this_thread_block(); + auto tile = cooperative_groups::tiled_partition(block); + + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; // actual thread idx + auto empty_value_sentinel = map_views.get_empty_value_sentinel(); + + while (key_idx < num_items) { + auto key = *(key_first + key_idx); + int32 found_value = empty_value_sentinel; + auto found = map_views.find(tile, key, hash, key_equal); + if (found != map_views.end()) { + found_value = found->second; + } + + if (tile.thread_rank() == 0) { + for (auto id = threadIdx.x; id < dimension; id += blockDim.x) { + value_first[key_idx * dimension + id] = value_srcs[found_value + id]; + } + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } +} + +template +struct KvLookupKey { + void operator()(const Key* keys, V* vals, int32 num_items, int32 dimension, + GPUStaticHashTable* hash_table, cudaStream_t stream) { + using ViewT = typename cuco::static_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::device_view; + auto& map = hash_table->hash_table->map_; + + auto const block_size = 128; + auto const stride = 1; + auto const tile_size = 4; + auto const grid_size = (tile_size * num_items + stride * block_size - 1) / + (stride * block_size); + TF_CHECK_OK(GpuLaunchKernel( + kv_lookup_key_kernel, grid_size, + block_size, 0, stream, keys, hash_table->values_d, vals, num_items, + dimension, map.get_device_view(), cuco::detail::MurmurHash3_32{}, + thrust::equal_to{})); + } +}; + template , @@ -220,6 +393,7 @@ struct KvLookupInsertKey { sizeof(atomicT), device_id)); 
auto n = std::min(capacity_remaining, num_to_insert); + auto const block_size = 128; auto const stride = 1; auto const tile_size = 4; @@ -274,7 +448,8 @@ __global__ void kv_lookup_or_create_emb_kernel( } } for (auto id = threadIdx.x; id < dim; id += blockDim.x) { - val[item_idx * dim + id] = d_banks[slot_offset][offset_in_bank * dim + id]; + val[item_idx * dim + id] = + d_banks[slot_offset][offset_in_bank * dim + id]; } } @@ -434,6 +609,10 @@ struct KvEmbGetSnapshot { } // namespace functor #define REGISTER_ALL_TYPE(type) \ + template struct functor::KvInitStaticMap; \ + template struct functor::KvInitStaticMap; \ + template struct functor::KvLookupKey; \ + template struct functor::KvLookupKey; \ template struct functor::KvLookupInsertKey; \ template struct functor::KvLookupInsertKey; \ template struct functor::KvLookupCreateEmb; \ @@ -449,4 +628,4 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_ALL_TYPE) } // namespace tensorflow -#endif // GOOGLE_CUDA \ No newline at end of file +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/framework/embedding/gpu_hash_table.h b/tensorflow/core/framework/embedding/gpu_hash_table.h index d57970aecd6..076f3e767c7 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_table.h +++ b/tensorflow/core/framework/embedding/gpu_hash_table.h @@ -17,6 +17,7 @@ limitations under the License. #if GOOGLE_CUDA #include + #include "tensorflow/core/framework/typed_allocator.h" #include "tensorflow/core/lib/core/status.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" @@ -28,10 +29,32 @@ class gpu_hash_map_tf_allocator; template class DynamicHashTable; +template +class StaticHashTable; + +template +class GPUStaticHashTable { + public: + GPUStaticHashTable(size_t capacity, int dimension, K empty_key_sentinel, + int32 empty_value_sentinel, Allocator* alloc, + cudaStream_t stream); + + ~GPUStaticHashTable(); + + std::size_t Size(); + + StaticHashTable>* hash_table; + V* values_d{nullptr}; + int dimension_; + V* default_values{nullptr}; + int capacity_; +}; + template class GPUHashTable { -public: - GPUHashTable(K empty_key_sentinel, Allocator* alloc, size_t initial_capacity=50000); + public: + GPUHashTable(K empty_key_sentinel, Allocator* alloc, + size_t initial_capacity = 50000); ~GPUHashTable(); @@ -49,83 +72,65 @@ class GPUHashTable { }; namespace functor { + template -struct KvLookupInsertKey { - void operator()(const Key* key_first, - int32* value_first, - int32 num_items, - GPUHashTable* hash_table, - cuda::atomic* start_idx, +struct KvLookupKey { + void operator()(const Key* key_first, V* value_first, int32 num_items, + int32 dimension, GPUStaticHashTable* hash_table, cudaStream_t stream); }; +template +struct KvInitStaticMap { + void operator()(const Key* key_first, GPUStaticHashTable* hash_table, + int32 num_items, int32 dimension, cudaStream_t stream); +}; + +template +struct KvLookupInsertKey { + void operator()( + const Key* key_first, int32* value_first, int32 num_items, + GPUHashTable* hash_table, + cuda::atomic* start_idx, + cudaStream_t stream); +}; + template struct KvLookupCreateEmb { - void operator()(const Key* key_first, - Value* val, - Value* default_v, - int64 dim, - int32* item_idxs, - int32 num_items, - int32 slot_idx, - int32 default_v_num, - bool is_use_default_value_tensor, - Value** d_banks, - bool** d_flags, - int32 slot_num, - int32 bank_size, - cudaStream_t stream); + void operator()(const Key* key_first, Value* val, Value* default_v, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + int32 default_v_num, bool 
is_use_default_value_tensor, + Value** d_banks, bool** d_flags, int32 slot_num, + int32 bank_size, cudaStream_t stream); }; template struct KvUpdateEmb { - void operator()(const Key* key_first, - Value* default_v, - int64 dim, - int32* item_idxs, - int32 num_items, - int32 slot_idx, - int32 default_v_num, - Value** d_banks, - bool** d_flags, - int32 slot_num, - int32 bank_size, - cudaStream_t stream); + void operator()(const Key* key_first, Value* default_v, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + int32 default_v_num, Value** d_banks, bool** d_flags, + int32 slot_num, int32 bank_size, cudaStream_t stream); }; template struct KvKeyGetSnapshot { - void operator()(Key* key_first, - int32* value_first, - int32 slot_idx, - int32 primary_slot_idx, - bool** d_flags, - int32 bank_num, - int32 slot_num, - int32 bank_size, - GPUHashTable* hash_table, - int32 ev_size, + void operator()(Key* key_first, int32* value_first, int32 slot_idx, + int32 primary_slot_idx, bool** d_flags, int32 bank_num, + int32 slot_num, int32 bank_size, + GPUHashTable* hash_table, int32 ev_size, cudaStream_t stream); }; template struct KvEmbGetSnapshot { - void operator()(Key* key, - Value* val, - Key empty_key_sentinel, - int64 dim, - int32* item_idxs, - int32 num_items, - int32 slot_idx, - Value** d_banks, - int32 bank_num, - int32 slot_num, - int32 bank_size, - cudaStream_t stream); + void operator()(Key* key, Value* val, Key empty_key_sentinel, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + Value** d_banks, int32 bank_num, int32 slot_num, + int32 bank_size, cudaStream_t stream); }; -} // namespace functor -} // namespace tensorflow +} // namespace functor +} // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_TABLE_H_ +#endif // GOOGLE_CUDA +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_TABLE_H_ \ No newline at end of file diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index 7e4436a7845..64e0c4685f0 100644 --- a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -19,6 +19,9 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" namespace tensorflow { +namespace { +const char* kInferenceMode = "INFERENCE_MODE"; +} template class ValuePtr; @@ -107,12 +110,18 @@ class KVInterface { return Status::OK(); } + virtual Status BatchLookup(const K* keys, V* val, V* default_v, + int32 default_v_num, bool is_use_default_value_tensor, + size_t n, const Eigen::GpuDevice& device) { + return Status(error::Code::UNIMPLEMENTED, + "Unimplemented for BatchLookup in KVInterface."); + } + virtual GPUHashTable* HashTable() { return nullptr; } virtual void SetValueLen(int64 value_len) {} - }; } // namespace embedding diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h index df1ed5e7bfb..ad9dc4e15b6 100644 --- a/tensorflow/core/framework/embedding/single_tier_storage.h +++ b/tensorflow/core/framework/embedding/single_tier_storage.h @@ -455,6 +455,13 @@ class HbmStorage : public SingleTierStorage { SingleTierStorage::kv_->BatchLookupOrCreateKeys(key, n, item_idxs, device); } + void BatchLookup(const K* key, V* val, V* default_v, + int32 default_v_num, bool is_use_default_value_tensor, + size_t n, const Eigen::GpuDevice& device) override { + SingleTierStorage::kv_->BatchLookup(key, val, default_v, default_v_num, + is_use_default_value_tensor, n, device); + } + int64 GetSnapshot(std::vector* key_list, std::vector* value_list, std::vector* version_list, diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index fa87b574f79..cc22bb4712a 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -114,6 +114,9 @@ class Storage { size_t n, const Eigen::GpuDevice& device) {} virtual void BatchLookupOrCreateKeys(const K* key, int32* item_idxs, size_t n, const Eigen::GpuDevice& device) {} + virtual void BatchLookup(const K* keys, V* val, V* default_v, + int32 default_v_num, bool is_use_default_value_tensor, + size_t n, const Eigen::GpuDevice& device) {} virtual void ImportToHbm(const std::vector& keys, const std::vector& values, const Eigen::GpuDevice* device, const EmbeddingConfig& emb_config) {}; diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc index 1ad0c352b4d..97f44eb8ae6 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc @@ -52,6 +52,26 @@ class GroupEmbeddingVarLookupOp return default_v + len * (id % total_dim); }; } + bool is_inference; + TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference)); + if (!is_inference) { + lookup_fn_ = [](EmbeddingVar* ev, const TFKey* key, + TValue* val, TValue* default_v, int32 default_v_num, + bool is_use_default_value_tensor, + size_t n, const Eigen::GpuDevice& device) { + ev->LookupOrCreate(key, val, default_v, default_v_num, + is_use_default_value_tensor, n, device); + }; + } else { + lookup_fn_ = [](EmbeddingVar* ev, const TFKey* key, + TValue* val, TValue* default_v, int32 default_v_num, + bool is_use_default_value_tensor, + size_t n, const Eigen::GpuDevice& device) { + ev->Lookup(key, val, default_v, default_v_num, + is_use_default_value_tensor, n, device); + }; + } + } ~GroupEmbeddingVarLookupOp() { delete[] occupy_flag_; } @@ -67,6 +87,7 @@ class GroupEmbeddingVarLookupOp 
this->num_lookups_, this->dimension_, this->max_norm_, gpu_allocator); std::vector tensor_list; + tensor_list.reserve(this->num_lookups_); for (int i = 0; i < this->num_lookups_; ++i) { EmbeddingVar* ev = nullptr; @@ -107,12 +128,13 @@ class GroupEmbeddingVarLookupOp auto default_values_matrix = default_values.shaped({default_value_num, dimension}); TValue* default_v_base = &default_values_matrix(0, 0); - ev->LookupOrCreate(key_base, out_base, default_v_base, - default_value_num, is_use_default_value_tensor_, N, - device); + lookup_fn_(ev, key_base, out_base, default_v_base, + default_value_num, is_use_default_value_tensor_, N, + device); + } else { - ev->LookupOrCreate(key_base, out_base, ev->GetDefaultValuePtr(), - ev->GetDefaultValueDim(), true, N, device); + lookup_fn_(ev, key_base, out_base, ev->GetDefaultValuePtr(), + ev->GetDefaultValueDim(), true, N, device); } } else { auto out_flat = @@ -287,6 +309,10 @@ class GroupEmbeddingVarLookupOp private: std::map hash_map_; std::function get_default_v_fn_; + std::function* ev, const TFKey* key, + TValue* val, TValue* default_v, int32 default_v_num, + bool is_use_default_value_tensor, + size_t n, const Eigen::GpuDevice& device)> lookup_fn_; mutable easy_spinrwlock_t mu_ = EASY_SPINRWLOCK_INITIALIZER; bool* occupy_flag_{nullptr}; mutex m_init_occupy_flag_; diff --git a/tensorflow/core/kernels/kv_variable_lookup_ops.cc b/tensorflow/core/kernels/kv_variable_lookup_ops.cc index c5c5cc22c33..6b3139645c0 100644 --- a/tensorflow/core/kernels/kv_variable_lookup_ops.cc +++ b/tensorflow/core/kernels/kv_variable_lookup_ops.cc @@ -783,6 +783,25 @@ class KvResourceGatherGPUOp : public OpKernel { return 1; }; } + bool is_inference; + TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference)); + if (!is_inference) { + lookup_fn_ = [](EmbeddingVar* ev, const TKey* key, + TValue* val, TValue* default_v, int32 default_v_num, + bool is_use_default_value_tensor, + size_t n, const Eigen::GpuDevice& device) { + ev->LookupOrCreate(key, val, default_v, default_v_num, + is_use_default_value_tensor, n, device); + }; + } else { + lookup_fn_ = [](EmbeddingVar* ev, const TKey* key, + TValue* val, TValue* default_v, int32 default_v_num, + bool is_use_default_value_tensor, + size_t n, const Eigen::GpuDevice& device) { + ev->Lookup(key, val, default_v, default_v_num, + is_use_default_value_tensor, n, device); + }; + } } ~KvResourceGatherGPUOp() { @@ -851,11 +870,11 @@ class KvResourceGatherGPUOp : public OpKernel { auto default_values_matrix = default_values.shaped( {default_value_num, ev->ValueLen()}); TValue* default_v_base = &default_values_matrix(0, 0); - ev->LookupOrCreate(key_base, out_base, default_v_base, + lookup_fn_(ev, key_base, out_base, default_v_base, default_value_num, is_use_default_value_tensor_, indices_size, device); } else { - ev->LookupOrCreate(key_base, out_base, ev->GetDefaultValuePtr(), + lookup_fn_(ev, key_base, out_base, ev->GetDefaultValuePtr(), ev->GetDefaultValueDim(), is_use_default_value_tensor_, indices_size, device); } @@ -967,6 +986,10 @@ class KvResourceGatherGPUOp : public OpKernel { std::function< TValue*(TValue*, TKey, int64, int64, int64)> get_default_v_fn_; std::function get_count_fn_; + std::function* ev, const TKey* key, + TValue* val, TValue* default_v, int32 default_v_num, + bool is_use_default_value_tensor, + size_t n, const Eigen::GpuDevice& device)> lookup_fn_; std::map hash_map_; mutable easy_spinrwlock_t mu_ = EASY_SPINRWLOCK_INITIALIZER; bool* occupy_flag_ = nullptr; diff --git 
a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py index 445118a3926..26ae99126b9 100644 --- a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py @@ -146,7 +146,7 @@ def testEmbeddingVariableForLookupInt64(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() - with self.test_session(force_gpu=True) as sess: + with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) @@ -198,7 +198,7 @@ def testEmbeddingVariableForGetShape(self): emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) shape = var.total_count() init = variables.global_variables_initializer() - with self.test_session(force_gpu=True) as sess: + with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) @@ -226,7 +226,7 @@ def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() - with self.test_session(force_gpu=True) as sess: + with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run(init) @@ -255,7 +255,7 @@ def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() - with self.test_session(force_gpu=True) as sess: + with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) @@ -291,7 +291,7 @@ def testEmbeddingVariableForSparseColumnEmbeddingCol(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() - with self.test_session(force_gpu=True) as sess: + with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run(init) @@ -311,7 +311,7 @@ def runTestAdagrad(self, var): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() - with self.test_session(force_gpu=True) as sess: + with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) @@ -346,7 +346,7 @@ def runTestFtrl(self, var, g): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() - with self.test_session(graph=g, force_gpu=True) as sess: + with self.test_session(graph=g) as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) @@ -383,7 +383,7 @@ def testEmbeddingVariableForGeneralConstInitializer(self): partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,6], dtypes.int64)) init = variables.global_variables_initializer() - with self.test_session(force_gpu=True) as sess: + with self.test_session() as sess: 
sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) @@ -402,7 +402,7 @@ def testEmbeddingVariableForGeneralRandomInitializer(self): partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,6], dtypes.int64)) init = variables.global_variables_initializer() - with self.test_session(force_gpu=True) as sess: + with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) @@ -428,7 +428,7 @@ def testEVInitializerWithKeyFetch(self): var_emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,3,4,5,6,7], dtypes.int64)) emb_emb = embedding_ops.embedding_lookup(emb_var, math_ops.cast([0,1,2,5,6,7,8,9,10], dtypes.int64)) init = variables.global_variables_initializer() - with self.test_session(graph=g, force_gpu=True) as sess: + with self.test_session(graph=g) as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) @@ -454,7 +454,7 @@ def runTest(self, var, g): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() - with self.test_session(graph=g, force_gpu=True) as sess: + with self.test_session(graph=g) as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) diff --git a/third_party/cucollection.patch b/third_party/cucollection.patch index e5ca14fbcd0..fc3d06603b4 100644 --- a/third_party/cucollection.patch +++ b/third_party/cucollection.patch @@ -1,17 +1,17 @@ -From 874376f7e6b597bc288d3b945e706fd83a7033bf Mon Sep 17 00:00:00 2001 -From: Hongxiao Bai -Date: Thu, 20 Jan 2022 21:33:03 +0800 -Subject: [PATCH] cuco_modification_for_deeprec +From b47364f0bf2c1e630c600e4e2e09e54020bac7fa Mon Sep 17 00:00:00 2001 +From: Mesilenceki +Date: Tue, 18 Apr 2023 11:56:47 +0800 +Subject: [PATCH] cuco patch --- - include/cuco/detail/dynamic_map.inl | 47 +++++++- - include/cuco/detail/dynamic_map_kernels.cuh | 71 +++++++++++- - include/cuco/detail/pair.cuh | 14 +++ - include/cuco/detail/static_map.inl | 115 ++++++++++++++++---- - include/cuco/dynamic_map.cuh | 49 ++++++++- - include/cuco/static_map.cuh | 51 ++++++++- + include/cuco/detail/dynamic_map.inl | 47 ++++++- + include/cuco/detail/dynamic_map_kernels.cuh | 71 +++++++++- + include/cuco/detail/pair.cuh | 14 ++ + include/cuco/detail/static_map.inl | 138 ++++++++++++++++---- + include/cuco/dynamic_map.cuh | 49 ++++++- + include/cuco/static_map.cuh | 57 +++++++- include/cuco/traits.hpp | 1 + - 7 files changed, 317 insertions(+), 31 deletions(-) + 7 files changed, 340 insertions(+), 37 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 57950ea..78543c5 100644 @@ -190,10 +190,61 @@ index 0d8a85e..4aa8481 100644 namespace detail { diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl -index 1719970..febc1fb 100644 +index 1719970..23482f8 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl -@@ -271,18 +271,18 @@ __device__ bool static_map::device_mutable_view::i +@@ -31,7 +31,10 @@ static_map::static_map(std::size_t capacity, + counter_allocator_{alloc} + { + slots_ = std::allocator_traits::allocate(slot_allocator_, capacity_); +- 
num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); ++ // num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); ++ CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); ++ // static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); ++ // CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); + + auto constexpr block_size = 256; + auto constexpr stride = 4; +@@ -45,7 +48,8 @@ template ::~static_map() + { + std::allocator_traits::deallocate(slot_allocator_, slots_, capacity_); +- std::allocator_traits::deallocate(counter_allocator_, num_successes_, 1); ++ // std::allocator_traits::deallocate(counter_allocator_, num_successes_, 1); ++ CUCO_ASSERT_CUDA_SUCCESS(cudaFree(num_successes_)); + } + + template +@@ -63,8 +67,12 @@ void static_map::insert( + auto view = get_device_mutable_view(); + + // TODO: memset an atomic variable is unsafe +- static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); +- CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); ++ // static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); ++ int device_id; ++ CUCO_CUDA_TRY(cudaGetDevice(&device_id)); ++ CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id)); ++ ++ // CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); + std::size_t h_num_successes; + + detail::insert<<>>( +@@ -101,8 +109,11 @@ void static_map::insert_if(InputIt first, + auto view = get_device_mutable_view(); + + // TODO: memset an atomic variable is unsafe +- static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); +- CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); ++ // static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); ++ // CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); ++ int device_id; ++ CUCO_CUDA_TRY(cudaGetDevice(&device_id)); ++ CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id)); + std::size_t h_num_successes; + + detail::insert_if_n<<>>( +@@ -271,18 +282,18 @@ __device__ bool static_map::device_mutable_view::i if (slot_is_empty) { auto const status = [&]() { @@ -222,7 +273,7 @@ index 1719970..febc1fb 100644 }(); // successful insert -@@ -325,18 +325,18 @@ __device__ bool static_map::device_mutable_view::i +@@ -325,18 +336,18 @@ __device__ bool static_map::device_mutable_view::i uint32_t src_lane = __ffs(window_contains_empty) - 1; if (g.thread_rank() == src_lane) { @@ -252,7 +303,7 @@ index 1719970..febc1fb 100644 } uint32_t res_status = g.shfl(static_cast(status), src_lane); -@@ -358,6 +358,43 @@ __device__ bool static_map::device_mutable_view::i +@@ -358,6 +369,43 @@ __device__ bool static_map::device_mutable_view::i } } @@ -296,7 +347,7 @@ index 1719970..febc1fb 100644 template template __device__ typename static_map::device_view::iterator -@@ -482,6 +519,42 @@ static_map::device_view::find(CG g, +@@ -482,6 +530,42 @@ static_map::device_view::find(CG g, } } @@ -340,7 +391,7 @@ index 1719970..febc1fb 100644 template __device__ bool static_map::device_view::contains( diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh -index 2e57ac6..64f4d3f 100644 +index 2e57ac6..b85759d 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -96,8 +96,8 @@ class dynamic_map { @@ -420,7 +471,7 @@ index 2e57ac6..64f4d3f 100644 thrust::device_vector submap_views_; ///< vector of device 
views for each submap thrust::device_vector diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh -index 321b1f3..cc7601b 100644 +index 321b1f3..fa810e4 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -123,10 +123,10 @@ class static_map { @@ -502,6 +553,19 @@ index 321b1f3..cc7601b 100644 /** * @brief Indicates whether the key `k` was inserted into the map. * +@@ -1053,6 +1096,12 @@ class static_map { + * @return The number of elements in the map + */ + std::size_t get_size() const noexcept { return size_; } ++ ++ void update_size(std::size_t n) noexcept { size_ += n; } ++ ++ atomic_ctr_type* get_num_success() noexcept { ++ return num_successes_; ++ } + + /** + * @brief Gets the load factor of the hash map. diff --git a/include/cuco/traits.hpp b/include/cuco/traits.hpp index 445a40d..07fe954 100644 --- a/include/cuco/traits.hpp @@ -515,5 +579,4 @@ index 445a40d..07fe954 100644 namespace cuco { -- -2.33.0 - +2.37.1 (Apple Git-137.1) \ No newline at end of file From 84cff67ac521a3e5ae9d74d95b385004fb125a78 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Tue, 6 Jun 2023 10:05:56 +0800 Subject: [PATCH 13/91] [Embedding] Move insertion of new features into the backward ops. (#869) Signed-off-by: lixy9474 --- .../framework/embedding/bloom_filter_policy.h | 15 +- tensorflow/core/framework/embedding/cache.h | 54 +- .../core/framework/embedding/config.proto | 5 + .../embedding/counter_filter_policy.h | 15 +- .../embedding/dram_leveldb_storage.h | 13 +- .../framework/embedding/dram_pmem_storage.h | 21 +- .../framework/embedding/dram_ssd_storage.h | 14 +- .../framework/embedding/embedding_config.h | 7 +- .../core/framework/embedding/embedding_var.h | 105 ++- .../embedding/embedding_var_context.h | 61 ++ .../core/framework/embedding/filter_policy.h | 2 +- .../embedding/hbm_dram_ssd_storage.h | 2 +- .../framework/embedding/multi_tier_storage.h | 13 + .../embedding/nullable_filter_policy.h | 8 +- tensorflow/core/framework/embedding/storage.h | 5 + .../core/framework/embedding/value_ptr.h | 12 +- tensorflow/core/graph/graph.h | 19 +- .../kernels/embedding_variable_ops_test.cc | 20 +- .../group_embedding_lookup_ops.cc | 2 +- ...oup_embedding_lookup_sparse_forward_ops.cc | 2 +- ..._embedding_lookup_sparse_forward_ops.cu.cc | 2 +- .../core/kernels/incr_save_restore_ops.h | 2 +- .../core/kernels/kv_variable_lookup_ops.cc | 133 +-- tensorflow/core/kernels/kv_variable_ops.cc | 48 +- tensorflow/core/kernels/kv_variable_ops.h | 16 +- tensorflow/core/kernels/training_ali_ops.cc | 456 +++++++-- tensorflow/core/kernels/unique_ali_op.cc | 25 + tensorflow/core/ops/kv_variable_ops.cc | 47 + tensorflow/core/ops/training_ali_ops.cc | 206 +++++ tensorflow/python/ops/embedding_ops.py | 13 +- .../ops/embedding_variable_ops_gpu_test.py | 2 +- .../python/ops/embedding_variable_ops_test.py | 873 +++++++++--------- tensorflow/python/ops/kv_variable_ops.py | 22 +- tensorflow/python/training/adagrad.py | 29 +- tensorflow/python/training/adagrad_decay.py | 41 +- .../python/training/adagrad_decay_v2.py | 41 +- tensorflow/python/training/adam.py | 33 +- tensorflow/python/training/adam_async.py | 29 +- tensorflow/python/training/ftrl.py | 53 +- .../python/training/gradient_descent.py | 15 +- tensorflow/python/training/optimizer.py | 38 +- tensorflow/python/training/slot_creator.py | 4 + .../training/weight_decay_optimizers.py | 37 +- 43 files changed, 1787 insertions(+), 773 deletions(-) create mode 100644 tensorflow/core/framework/embedding/embedding_var_context.h diff --git 
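
The cucollection.patch above switches cuco's `num_successes_` counter from an allocator-owned device allocation, cleared with `cudaMemsetAsync` before every insert, to a `cudaMallocManaged` allocation that is only prefetched to the device with `cudaMemPrefetchAsync`. A minimal sketch of that managed-counter pattern, assuming a CUDA toolchain (`bump_counter` and the `CUDA_OK` macro are illustrative, not part of cuco):

```
// managed_counter.cu -- sketch of a host-readable device success counter.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_OK(expr)                                                     \
  do {                                                                    \
    cudaError_t _e = (expr);                                              \
    if (_e != cudaSuccess) {                                              \
      std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(_e));   \
      std::exit(1);                                                       \
    }                                                                     \
  } while (0)

// Stand-in for cuco's insert kernel: every thread that "succeeds" bumps
// the shared counter with an atomic add.
__global__ void bump_counter(unsigned long long* num_successes, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(num_successes, 1ULL);
}

int main() {
  unsigned long long* num_successes = nullptr;
  // Managed memory replaces allocator_traits<...>::allocate in the patch.
  CUDA_OK(cudaMallocManaged(&num_successes, sizeof(*num_successes)));
  *num_successes = 0;  // the host may write managed memory directly

  int device_id = 0;
  CUDA_OK(cudaGetDevice(&device_id));
  // Hint the migration before the launch, as the patched insert() does.
  CUDA_OK(cudaMemPrefetchAsync(num_successes, sizeof(*num_successes),
                               device_id));

  bump_counter<<<4, 256>>>(num_successes, 1000);
  CUDA_OK(cudaDeviceSynchronize());

  // No cudaMemcpy: after synchronization the host reads the counter in place.
  std::printf("successes = %llu\n", *num_successes);
  CUDA_OK(cudaFree(num_successes));
  return 0;
}
```

The prefetch is only a placement hint; correctness comes from managed memory's coherence, which is what lets the patched code read the counter on the host and expose it through the new `get_num_success()` accessor.
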
a/tensorflow/core/framework/embedding/bloom_filter_policy.h b/tensorflow/core/framework/embedding/bloom_filter_policy.h index 7ee94e5dd27..22fb45e78d5 100644 --- a/tensorflow/core/framework/embedding/bloom_filter_policy.h +++ b/tensorflow/core/framework/embedding/bloom_filter_policy.h @@ -59,7 +59,15 @@ class BloomFilterPolicy : public FilterPolicy { Status Lookup(EV* ev, K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - return errors::Unimplemented("Can't use CBF filter in EV for inference."); + ValuePtr* value_ptr = nullptr; + Status s = ev_->LookupKey(key, &value_ptr); + if (s.ok()) { + V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } else { + memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); + } + return Status::OK(); } void LookupOrCreate(K key, V* val, const V* default_value_ptr, @@ -76,12 +84,13 @@ class BloomFilterPolicy : public FilterPolicy { } Status LookupOrCreateKey(K key, ValuePtr** val, - bool* is_filter) override { - if (GetFreq(key, *val) >= config_.filter_freq) { + bool* is_filter, int64 count) override { + if ((GetFreq(key, *val) + count) >= config_.filter_freq) { *is_filter = true; return ev_->LookupOrCreateKey(key, val); } *is_filter = false; + AddFreq(key, count); return Status::OK(); } diff --git a/tensorflow/core/framework/embedding/cache.h b/tensorflow/core/framework/embedding/cache.h index 44a8895c1a4..df5870f71a7 100644 --- a/tensorflow/core/framework/embedding/cache.h +++ b/tensorflow/core/framework/embedding/cache.h @@ -20,8 +20,8 @@ class BatchCache { public: BatchCache() {} virtual ~BatchCache() {} - void add_to_rank(const Tensor& t) { - add_to_rank((K*)t.data(), t.NumElements()); + void update(const Tensor& t) { + update((K*)t.data(), t.NumElements()); } void add_to_prefetch_list(const Tensor& t) { add_to_prefetch_list((K*)t.data(), t.NumElements()); @@ -30,16 +30,20 @@ class BatchCache { add_to_cache((K*)t.data(), t.NumElements()); } + void update(const Tensor& t, const Tensor& counts_tensor) { + update((K*)t.data(), t.NumElements(), + nullptr, (int64*)counts_tensor.data()); + } virtual size_t get_evic_ids(K* evic_ids, size_t k_size) = 0; virtual size_t get_cached_ids(K* cached_ids, size_t k_size, int64* cached_versions, int64* cached_freqs) = 0; - virtual void add_to_rank(const K* batch_ids, size_t batch_size, - bool use_locking=true) = 0; - virtual void add_to_rank(const K* batch_ids, size_t batch_size, - const int64* batch_versions, - const int64* batch_freqs, - bool use_locking=true) = 0; + virtual void update(const K* batch_ids, size_t batch_size, + bool use_locking=true) = 0; + virtual void update(const K* batch_ids, size_t batch_size, + const int64* batch_versions, + const int64* batch_freqs, + bool use_locking=true) = 0; virtual void add_to_prefetch_list( const K* batch_ids, size_t batch_size) = 0; virtual void add_to_cache( @@ -172,8 +176,8 @@ class LRUCache : public BatchCache { return i; } - void add_to_rank(const K* batch_ids, size_t batch_size, - bool use_locking=true) { + void update(const K* batch_ids, size_t batch_size, + bool use_locking=true) { mutex temp_mu; auto lock = BatchCache::maybe_lock_cache(mu_, temp_mu, use_locking); for (size_t i = 0; i < batch_size; ++i) { @@ -200,12 +204,12 @@ class LRUCache : public BatchCache { } } - void add_to_rank(const K* batch_ids, size_t batch_size, - const int64* batch_version, - const int64* batch_freqs, - bool use_locking = true) { + void update(const K* 
batch_ids, size_t batch_size, + const int64* batch_version, + const int64* batch_freqs, + bool use_locking = true) override { //TODO: add to rank accroding to the version of ids - add_to_rank(batch_ids, batch_size); + update(batch_ids, batch_size); } void add_to_prefetch_list(const K* batch_ids, const size_t batch_size) { @@ -247,7 +251,7 @@ class LRUCache : public BatchCache { nums_to_cache++; } } - add_to_rank(ids_to_cache.data(), nums_to_cache, false); + update(ids_to_cache.data(), nums_to_cache, false); } private: @@ -334,8 +338,8 @@ class LFUCache : public BatchCache { return true_size; } - void add_to_rank(const K *batch_ids, size_t batch_size, - bool use_locking=true) { + void update(const K *batch_ids, size_t batch_size, + bool use_locking=true) { mutex temp_mu; auto lock = BatchCache::maybe_lock_cache(mu_, temp_mu, use_locking); for (size_t i = 0; i < batch_size; ++i) { @@ -370,10 +374,10 @@ class LFUCache : public BatchCache { } } - void add_to_rank(const K *batch_ids, const size_t batch_size, - const int64* batch_versions, - const int64* batch_freqs, - bool use_locking = true) { + void update(const K *batch_ids, const size_t batch_size, + const int64* batch_versions, + const int64* batch_freqs, + bool use_locking = true) override { mutex temp_mu; auto lock = BatchCache::maybe_lock_cache(mu_, temp_mu, use_locking); for (size_t i = 0; i < batch_size; ++i) { @@ -480,9 +484,9 @@ class LFUCache : public BatchCache { } } const int64* versions_to_cache = nullptr; - add_to_rank(ids_to_cache.data(), nums_to_cache, - versions_to_cache, freqs_to_cache.data(), - false); + update(ids_to_cache.data(), nums_to_cache, + versions_to_cache, freqs_to_cache.data(), + false); } diff --git a/tensorflow/core/framework/embedding/config.proto b/tensorflow/core/framework/embedding/config.proto index 12354c83ec9..1eef3edccc2 100644 --- a/tensorflow/core/framework/embedding/config.proto +++ b/tensorflow/core/framework/embedding/config.proto @@ -41,3 +41,8 @@ enum CacheStrategy { LRU = 0; LFU = 1; } + +enum EmbeddingVariableType { + IMMUTABLE = 0; + MUTABLE = 1; +} diff --git a/tensorflow/core/framework/embedding/counter_filter_policy.h b/tensorflow/core/framework/embedding/counter_filter_policy.h index 28dd5941fc8..14d18f65605 100644 --- a/tensorflow/core/framework/embedding/counter_filter_policy.h +++ b/tensorflow/core/framework/embedding/counter_filter_policy.h @@ -30,8 +30,15 @@ class CounterFilterPolicy : public FilterPolicy { Status Lookup(EV* ev, K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - return errors::Unimplemented( - "Can't use counter filter in EV for inference."); + ValuePtr* value_ptr = nullptr; + Status s = ev_->LookupKey(key, &value_ptr); + if (s.ok() && GetFreq(key, value_ptr) >= config_.filter_freq) { + V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } else { + memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); + } + return Status::OK(); } void LookupOrCreate(K key, V* val, const V* default_value_ptr, @@ -47,9 +54,9 @@ class CounterFilterPolicy : public FilterPolicy { } Status LookupOrCreateKey(K key, ValuePtr** val, - bool* is_filter) override { + bool* is_filter, int64 count) override { Status s = ev_->LookupOrCreateKey(key, val); - *is_filter = GetFreq(key, *val) >= config_.filter_freq; + *is_filter = (GetFreq(key, *val) + count) >= config_.filter_freq; return s; } diff --git a/tensorflow/core/framework/embedding/dram_leveldb_storage.h 
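
The two filter changes above share one rule: a key is admitted once its stored frequency plus the count it carries in the current batch reaches `filter_freq`; otherwise only the frequency is accumulated. A reduced sketch of that admission rule, assuming a plain hash map in place of the EV storage (the real policies thread `ValuePtr` and the owning `EmbeddingVar` through):

```
// Sketch of count-aware counter-filter admission.
#include <cstdint>
#include <unordered_map>

class CounterFilterSketch {
 public:
  explicit CounterFilterSketch(int64_t filter_freq)
      : filter_freq_(filter_freq) {}

  // Mirrors FilterPolicy::LookupOrCreateKey(key, val, is_filter, count):
  // returns true ("is_filter") when the caller may create/update the
  // embedding; otherwise only the frequency is bumped by `count`.
  bool Admit(int64_t key, int64_t count) {
    int64_t& freq = freq_[key];
    bool admitted = (freq + count) >= filter_freq_;
    freq += count;  // the patch folds this into AddFreq/add_freq_fn_
    return admitted;
  }

 private:
  int64_t filter_freq_;
  std::unordered_map<int64_t, int64_t> freq_;
};
```

Here `count` is the number of occurrences of the id in the deduplicated batch, which is what the new `*WithCounts` training ops supply; the legacy ops keep the old behaviour through the default `count = 1`.
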
b/tensorflow/core/framework/embedding/dram_leveldb_storage.h
index 9a2539c749d..7a7f4986f62 100644
--- a/tensorflow/core/framework/embedding/dram_leveldb_storage.h
+++ b/tensorflow/core/framework/embedding/dram_leveldb_storage.h
@@ -48,8 +48,17 @@ class DramLevelDBStore : public MultiTierStorage<K, V> {
 
   Status Get(K key, ValuePtr<V>** value_ptr) override {
     Status s = dram_->Get(key, value_ptr);
-    if (!s.ok()) {
-      s = leveldb_->Get(key, value_ptr);
+    if (s.ok()) {
+      return s;
+    }
+    s = leveldb_->Get(key, value_ptr);
+    if (s.ok()) {
+      s = dram_->TryInsert(key, *value_ptr);
+      if (s.ok()) {
+        return s;
+      }
+      leveldb_->DestroyValuePtr(*value_ptr);
+      return dram_->Get(key, value_ptr);
     }
     return s;
   }
diff --git a/tensorflow/core/framework/embedding/dram_pmem_storage.h b/tensorflow/core/framework/embedding/dram_pmem_storage.h
index 97abe33e8a8..c6ff328f6fc 100644
--- a/tensorflow/core/framework/embedding/dram_pmem_storage.h
+++ b/tensorflow/core/framework/embedding/dram_pmem_storage.h
@@ -37,6 +37,9 @@ class DramPmemStorage : public MultiTierStorage<K, V> {
     : MultiTierStorage<K, V>(sc, name) {
     dram_ = new DramStorage<K, V>(sc, dram_alloc, lc, new LocklessHashMap<K, V>());
     pmem_ = new PmemLibpmemStorage<K, V>(sc, pmem_alloc, lc);
+    value_ptr_size_ =
+        const_cast<EmbeddingConfig&>(sc.embedding_config).total_num(
+            Storage<K, V>::GetAllocLen());
   }
 
   ~DramPmemStorage() override {
@@ -49,8 +52,21 @@ class DramPmemStorage : public MultiTierStorage<K, V> {
 
   Status Get(K key, ValuePtr<V>** value_ptr) override {
     Status s = dram_->Get(key, value_ptr);
-    if (!s.ok()) {
-      s = pmem_->Get(key, value_ptr);
+    if (s.ok()) {
+      return s;
+    }
+    s = pmem_->Get(key, value_ptr);
+    if (s.ok()) {
+      ValuePtr<V>* new_value_ptr = dram_->CreateValuePtr(value_ptr_size_);
+      memcpy(new_value_ptr->GetPtr(), (*value_ptr)->GetPtr(),
+             sizeof(FixedLengthHeader) + sizeof(V) * value_ptr_size_);
+      *value_ptr = new_value_ptr;
+      s = dram_->TryInsert(key, *value_ptr);
+      if (s.ok()) {
+        return s;
+      }
+      dram_->DestroyValuePtr(*value_ptr);
+      return dram_->Get(key, value_ptr);
     }
     return s;
   }
@@ -229,6 +245,7 @@ class DramPmemStorage : public MultiTierStorage<K, V> {
  private:
   DramStorage<K, V>* dram_;
   PmemLibpmemStorage<K, V>* pmem_;
+  int64 value_ptr_size_;
 };
 } // embedding
 } // tensorflow
diff --git a/tensorflow/core/framework/embedding/dram_ssd_storage.h b/tensorflow/core/framework/embedding/dram_ssd_storage.h
index 91d90cb6ca2..9ba794edf7e 100644
--- a/tensorflow/core/framework/embedding/dram_ssd_storage.h
+++ b/tensorflow/core/framework/embedding/dram_ssd_storage.h
@@ -48,8 +48,18 @@ class DramSsdHashStorage : public MultiTierStorage<K, V> {
 
   Status Get(K key, ValuePtr<V>** value_ptr) override {
     Status s = dram_->Get(key, value_ptr);
-    if (!s.ok()) {
-      s = ssd_hash_->Get(key, value_ptr);
+    if (s.ok()) {
+      return s;
+    }
+    s = ssd_hash_->Get(key, value_ptr);
+    if (s.ok()) {
+      s = dram_->TryInsert(key, *value_ptr);
+      if (s.ok()) {
+        return s;
+      }
+      // Insert failed: the key is already in DRAM.
+      ssd_hash_->DestroyValuePtr(*value_ptr);
+      return dram_->Get(key, value_ptr);
     }
     return s;
   }
diff --git a/tensorflow/core/framework/embedding/embedding_config.h b/tensorflow/core/framework/embedding/embedding_config.h
index 78280100e14..3aaa259c3f2 100644
--- a/tensorflow/core/framework/embedding/embedding_config.h
+++ b/tensorflow/core/framework/embedding/embedding_config.h
@@ -23,6 +23,7 @@ struct EmbeddingConfig {
   int normal_fix_flag;
   bool record_freq;
   bool record_version;
+  bool is_inference;
 
   EmbeddingConfig(int64 emb_index = 0,
                   int64 primary_emb_index = 0,
@@ -40,7 +41,8 @@ struct EmbeddingConfig {
                   int64 default_value_dim = 4096,
                   float
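
All three `Get()` rewrites above follow the same shape: return fast on a DRAM hit, and on a slow-tier hit try to promote the value into DRAM, falling back to the DRAM winner's copy when `TryInsert` loses a race. A compressed sketch of that control flow with a toy tier interface (the PMEM variant additionally deep-copies the payload into a freshly created value pointer before inserting):

```
// Toy two-tier lookup with promotion; Tier and Value stand in for the
// DeepRec storage interfaces.
#include <cstdint>

struct Value;  // opaque payload

struct Tier {
  virtual ~Tier() = default;
  virtual bool Get(int64_t key, Value** out) = 0;     // true on hit
  virtual bool TryInsert(int64_t key, Value* v) = 0;  // false if key exists
  virtual void DestroyValuePtr(Value* v) = 0;
};

// Mirrors DramSsdHashStorage::Get: promote slow-tier hits into DRAM, and on
// a lost insert race drop our copy and re-read the winner's DRAM entry.
bool TieredGet(Tier& dram, Tier& slow, int64_t key, Value** out) {
  if (dram.Get(key, out)) return true;
  if (!slow.Get(key, out)) return false;       // miss in every tier
  if (dram.TryInsert(key, *out)) return true;  // promoted successfully
  slow.DestroyValuePtr(*out);                  // another thread won the race
  return dram.Get(key, out);                   // serve the winner's copy
}
```
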
default_value_no_permission = .0, bool record_freq =false, - bool record_version=false): + bool record_version=false, + bool is_inference=false): emb_index(emb_index), primary_emb_index(primary_emb_index), block_num(block_num), @@ -55,7 +57,8 @@ struct EmbeddingConfig { default_value_no_permission(default_value_no_permission), normal_fix_flag(0), record_freq(record_freq), - record_version(record_version) { + record_version(record_version), + is_inference(is_inference) { if (max_element_size != 0 && false_positive_probability != -1.0){ kHashFunc = calc_num_hash_func(false_positive_probability); num_counter = calc_num_counter(max_element_size, diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 1cdbcfcbd89..e3c0c2a0b24 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/framework/embedding/cache.h" +#include "tensorflow/core/framework/embedding/embedding_var_context.h" #include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/framework/embedding/filter_factory.h" #include "tensorflow/core/framework/embedding/gpu_hash_map_kv.h" @@ -37,6 +38,8 @@ limitations under the License. #include "tensorflow/core/framework/typed_allocator.h" namespace tensorflow { +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; #if GOOGLE_CUDA void SyncWithEventMgr(se::Stream* stream, @@ -63,16 +66,16 @@ class EmbeddingVar : public ResourceBase { default_value_alloc_(alloc), emb_config_(emb_cfg) { if (IsMultiLevel() || emb_config_.record_freq) { - add_freq_fn_ = [](ValuePtr* value_ptr, int freq, int64 filter_freq) { + add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) { value_ptr->AddFreq(freq); }; } else if (emb_config_.is_counter_filter()) { - add_freq_fn_ = [](ValuePtr* value_ptr, int freq, int64 filter_freq) { + add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) { if (value_ptr->GetFreq() < filter_freq) value_ptr->AddFreq(freq); }; } else { - add_freq_fn_ = [](ValuePtr* value_ptr, int freq, int64 filter_freq) {}; + add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) {}; } if (emb_config_.steps_to_live != 0 || emb_config_.record_version) { update_version_fn_ = [](ValuePtr* value_ptr, int64 gs) { @@ -160,13 +163,16 @@ class EmbeddingVar : public ResourceBase { } Status LookupOrCreateKey(K key, ValuePtr** value_ptr, - bool* is_filter, bool indices_as_pointer) { + bool* is_filter, bool indices_as_pointer, + int64 count = 1) { if (indices_as_pointer) { *value_ptr = (ValuePtr*)key; *is_filter = (*value_ptr != nullptr); return Status::OK(); } else { - return filter_->LookupOrCreateKey(key, value_ptr, is_filter); + Status s = filter_->LookupOrCreateKey(key, value_ptr, is_filter, count); + add_freq_fn_(*value_ptr, count, emb_config_.filter_freq); + return s; } } @@ -230,6 +236,46 @@ class EmbeddingVar : public ResourceBase { default_value_no_permission_); } + void GetEmbeddings(const EmbeddingVarContext& context, + const K* keys, V* output, + int64 num_of_keys) { + auto do_work = [this, keys, output] (int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + V* default_v = + default_value_ + + (keys[i] % emb_config_.default_value_dim) * value_len_; + filter_->Lookup(this, keys[i], + output + i * value_len_, default_v, + 
default_value_no_permission_); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + } + +//Used for CPU Adaptive Embedding + void GetEmbeddings(const EmbeddingVarContext& context, + const K* keys, V* output, + int64 num_of_keys, V* default_value) { + auto do_work = [this, keys, output, default_value] + (int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + V* default_v = default_value + i * value_len_; + ValuePtr* value_ptr = nullptr; + filter_->LookupOrCreate( + keys[i], output + i * value_len_, default_v, &value_ptr, 1, + default_value_no_permission_); + add_freq_fn_(value_ptr, 1, emb_config_.filter_freq); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + } + void LookupOrCreate(K key, V* val, V* default_v, int count = 1) { const V* default_value_ptr = (default_v == nullptr) ? default_value_ : default_v; @@ -248,7 +294,6 @@ class EmbeddingVar : public ResourceBase { embedding::CopyBackFlag copyback_flag = embedding::CopyBackFlag::NOT_COPYBACK; TF_CHECK_OK(LookupOrCreateKey(keys[i], &value_ptr, -1, copyback_flag)); - value_ptr->AddFreq(); memcpy_address[i] = GetAddressOfGpuValuePtr(value_ptr, i, copyback_flag, init_cursor, copyback_cursor); } @@ -344,8 +389,10 @@ class EmbeddingVar : public ResourceBase { return primary_val; } - typename TTypes::Flat flat(ValuePtr* value_ptr) { - V* val = LookupOrCreateEmb(value_ptr, default_value_); + typename TTypes::Flat flat(ValuePtr* value_ptr, int64 index) { + V* default_v = + default_value_ + (index % emb_config_.default_value_dim) * value_len_; + V* val = LookupOrCreateEmb(value_ptr, default_v); Eigen::array dims({value_len_}); return typename TTypes::Flat(val, dims); } @@ -599,11 +646,28 @@ class EmbeddingVar : public ResourceBase { } } + void UpdateCache(const Tensor& indices, + const Tensor& indices_counts, + bool is_called_by_gather = false) { + if (!is_called_by_gather || + (is_called_by_gather && emb_config_.is_inference)) { + storage_->UpdateCache(indices, indices_counts); + } + } + + void UpdateCache(const Tensor& indices, + bool is_called_by_gather = false) { + if (!is_called_by_gather || + (is_called_by_gather && emb_config_.is_inference)) { + storage_->UpdateCache(indices); + } + } + void UpdateCache(const K* key_buff, int64 key_num, const int64* version_buff, const int64* freq_buff) { auto cache = Cache(); if (cache) { - cache->add_to_rank(key_buff, key_num, version_buff, freq_buff); + cache->update(key_buff, key_num, version_buff, freq_buff); auto cache_size = CacheSize(); if (cache->size() > cache_size) { int64 evict_size = cache->size() - cache_size; @@ -671,6 +735,27 @@ class EmbeddingVar : public ResourceBase { } private: + void LookupThroughFilter( + const EmbeddingVarContext& context, + const Tensor& indices, V* output, + int64 num_of_keys) { + const K* keys = (K*)indices.data(); + auto do_work = [this, keys, output] (int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + V* default_v = + default_value_ + + (keys[i] % emb_config_.default_value_dim) * value_len_; + filter_->Lookup(this, keys[i], + output + i * value_len_, default_v, + default_value_no_permission_); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + } + V* 
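
`GetEmbeddings` above shards the per-key lookups across the CPU worker pool with `Shard`, costing each unit at `value_len_ * sizeof(V)` bytes. A self-contained approximation of that fan-out (`parallel_for` is a stand-in for `tensorflow::Shard`, and the body copies a per-key default row where the real code consults the filter and storage):

```
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <functional>
#include <thread>
#include <vector>

// Splits [0, n) into contiguous chunks and runs fn(start, limit) per chunk.
void parallel_for(int64_t n, const std::function<void(int64_t, int64_t)>& fn) {
  unsigned workers = std::thread::hardware_concurrency();
  if (workers == 0) workers = 1;
  int64_t chunk = (n + workers - 1) / workers;
  std::vector<std::thread> pool;
  for (int64_t start = 0; start < n; start += chunk) {
    int64_t limit = std::min(n, start + chunk);
    pool.emplace_back(fn, start, limit);
  }
  for (auto& t : pool) t.join();
}

// Sharded lookup loop in the shape of GetEmbeddings: one embedding row per
// key, with the default row chosen as default_values[key % default_dim].
void GetEmbeddingsSketch(const int64_t* keys, int64_t num_keys,
                         const float* default_values, int64_t default_dim,
                         int64_t value_len, float* output) {
  parallel_for(num_keys, [=](int64_t start, int64_t limit) {
    for (int64_t i = start; i < limit; ++i) {
      const float* default_v =
          default_values + (keys[i] % default_dim) * value_len;
      // The real loop asks filter_->Lookup for each key; the sketch just
      // materializes the key's default row.
      std::memcpy(output + i * value_len, default_v,
                  sizeof(float) * value_len);
    }
  });
}
```
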
GetAddressOfGpuValuePtr(ValuePtr* value_ptr, int64 index, bool copyback_flag, @@ -716,7 +801,7 @@ class EmbeddingVar : public ResourceBase { embedding::StorageType storage_type_; EmbeddingConfig emb_config_; FilterPolicy>* filter_; - std::function*, int, int64)> add_freq_fn_; + std::function*, int64, int64)> add_freq_fn_; std::function*, int64)> update_version_fn_; TF_DISALLOW_COPY_AND_ASSIGN(EmbeddingVar); diff --git a/tensorflow/core/framework/embedding/embedding_var_context.h b/tensorflow/core/framework/embedding/embedding_var_context.h new file mode 100644 index 00000000000..b0dd89a2851 --- /dev/null +++ b/tensorflow/core/framework/embedding/embedding_var_context.h @@ -0,0 +1,61 @@ +/* Copyright 2019 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CONTEXT_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CONTEXT_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/kernels/gpu_device_array.h" +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA + +namespace tensorflow{ +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +template +struct EmbeddingVarContext; + +template<> +struct EmbeddingVarContext { + public: + EmbeddingVarContext(OpKernelContext* op_ctx) + : worker_threads(op_ctx->device()->tensorflow_cpu_worker_threads()) {} + + const DeviceBase::CpuWorkerThreads* worker_threads; +}; + +#if GOOGLE_CUDA +template<> +struct EmbeddingVarContext { + public: + EmbeddingVarContext(OpKernelContext* op_ctx) + : worker_threads(op_ctx->device()->tensorflow_cpu_worker_threads()), + compute_stream(op_ctx->op_device_context()->stream()), + event_mgr(op_ctx->device()->tensorflow_gpu_device_info()->event_mgr), + gpu_device(op_ctx->eigen_gpu_device()) {} + + const DeviceBase::CpuWorkerThreads* worker_threads = nullptr; + se::Stream* compute_stream = nullptr; + EventMgr* event_mgr = nullptr; + const GPUDevice& gpu_device; +}; +#endif // GOOGLE_CUDA +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CONTEXT_H_ diff --git a/tensorflow/core/framework/embedding/filter_policy.h b/tensorflow/core/framework/embedding/filter_policy.h index 631af740d53..565201a844f 100644 --- a/tensorflow/core/framework/embedding/filter_policy.h +++ b/tensorflow/core/framework/embedding/filter_policy.h @@ -45,7 +45,7 @@ class FilterPolicy { const V* default_value_no_permission) = 0; virtual Status LookupOrCreateKey(K key, ValuePtr** val, - bool* is_filter) = 0; + bool* is_filter, int64 count) = 0; virtual int64 GetFreq(K key, ValuePtr* value_ptr) = 0; virtual int64 GetFreq(K key) = 0; diff --git a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h index 
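
The new `embedding_var_context.h` above is a device-tagged view over just the pieces of `OpKernelContext` each code path needs: the CPU specialization carries only the worker-thread pool, while the GPU one also snapshots the compute stream, `EventMgr`, and Eigen GPU device. A toy standalone mock of the same specialization pattern (all types here are stand-ins, not the TensorFlow ones):

```
// Device-tag template specialization, as used by EmbeddingVarContext.
struct CPUDevice {};
struct GPUDevice {};

struct WorkerThreads { int num_threads = 0; };
struct Stream {};  // stand-in for se::Stream

template <typename Device>
struct ContextSketch;  // primary template left undefined, like the header

template <>
struct ContextSketch<CPUDevice> {
  explicit ContextSketch(const WorkerThreads* wt) : worker_threads(wt) {}
  const WorkerThreads* worker_threads;
};

template <>
struct ContextSketch<GPUDevice> {
  ContextSketch(const WorkerThreads* wt, Stream* s)
      : worker_threads(wt), compute_stream(s) {}
  const WorkerThreads* worker_threads;
  Stream* compute_stream;
};

int main() {
  WorkerThreads wt{8};
  ContextSketch<CPUDevice> cpu_ctx(&wt);  // cheap: one pointer copied
  return cpu_ctx.worker_threads->num_threads == 8 ? 0 : 1;
}
```

Keeping the context a plain value type is what lets `EmbeddingVar` expose one `GetEmbeddings(const EmbeddingVarContext<Device>&, ...)` overload set instead of passing `OpKernelContext*` everywhere.
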
c649bf9e993..9c7535f670a 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h @@ -400,7 +400,7 @@ class HbmDramSsdStorage : public MultiTierStorage { mutex_lock l(*(ssd_->get_mutex())); mutex_lock l1(*(dram_->get_mutex())); - dram_cache_->add_to_rank(keys->data(), keys->size()); + dram_cache_->update(keys->data(), keys->size()); int64 dram_count = dram_cache_->size(); if (dram_count > dram_capacity_) { int k_size = dram_count - dram_capacity_; diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index db8999cff2f..529d7ae4549 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -243,6 +243,19 @@ class MultiTierStorage : public Storage { } } + void UpdateCache(const Tensor& indices, + const Tensor& indices_counts) override { + Schedule([this, indices, indices_counts]() { + cache_->update(indices, indices_counts); + }); + } + + void UpdateCache(const Tensor& indices) override { + Schedule([this, indices]() { + cache_->update(indices); + }); + } + protected: virtual void SetTotalDims(int64 total_dims) = 0; diff --git a/tensorflow/core/framework/embedding/nullable_filter_policy.h b/tensorflow/core/framework/embedding/nullable_filter_policy.h index c930baac0e1..e0e734eed81 100644 --- a/tensorflow/core/framework/embedding/nullable_filter_policy.h +++ b/tensorflow/core/framework/embedding/nullable_filter_policy.h @@ -39,11 +39,11 @@ class NullableFilterPolicy : public FilterPolicy { ValuePtr* value_ptr = nullptr; Status s = ev->LookupKey(key, &value_ptr); if (s.ok()) { - V* mem_val = ev->LookupPrimaryEmb(value_ptr); + V* mem_val = ev->LookupOrCreateEmb(value_ptr, default_value_ptr); memcpy(val, mem_val, sizeof(V) * ev->ValueLen()); } else { - memcpy(val, default_value_no_permission, - sizeof(V) * ev->ValueLen()); + memcpy(val, default_value_ptr, + sizeof(V) * ev->ValueLen()); } return Status::OK(); } @@ -57,7 +57,7 @@ class NullableFilterPolicy : public FilterPolicy { } Status LookupOrCreateKey(K key, ValuePtr** val, - bool* is_filter) override { + bool* is_filter, int64 count) override { *is_filter = true; return ev_->LookupOrCreateKey(key, val); } diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index cc22bb4712a..3915676e57d 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -180,6 +180,11 @@ class Storage { } } + virtual void UpdateCache(const Tensor& indices, + const Tensor& indices_counts) {} + + virtual void UpdateCache(const Tensor& indices) {} + protected: int64 alloc_len_ = 0; int64 total_dims_ = 0; diff --git a/tensorflow/core/framework/embedding/value_ptr.h b/tensorflow/core/framework/embedding/value_ptr.h index e8e4e133a54..ca7d234ed61 100644 --- a/tensorflow/core/framework/embedding/value_ptr.h +++ b/tensorflow/core/framework/embedding/value_ptr.h @@ -138,7 +138,7 @@ struct NormalHeader { freq_counter, freq_counter + 1); } - inline void AddFreq(int count) { + inline void AddFreq(int64 count) { __sync_bool_compare_and_swap(&freq_counter, freq_counter, freq_counter + count); } @@ -192,7 +192,7 @@ struct FixedLengthHeader { freq_counter, freq_counter + 1); } - inline void AddFreq(int count) { + inline void AddFreq(int64 count) { __sync_bool_compare_and_swap(&freq_counter, freq_counter, freq_counter + count); } @@ -237,7 +237,7 
@@ class ValuePtr { LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; } - virtual void AddFreq(int count) { + virtual void AddFreq(int64 count) { LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; } @@ -381,7 +381,7 @@ class NormalValuePtr : public LooseValuePtr { return ((NormalHeader*)this->ptr_)->AddFreq(); } - void AddFreq(int count) { + void AddFreq(int64 count) override { return ((NormalHeader*)this->ptr_)->AddFreq(count); } }; @@ -457,7 +457,7 @@ class NormalContiguousValuePtr : public LooseValuePtr { ((FixedLengthHeader*)this->ptr_)->AddFreq(); } - void AddFreq(int count) { + void AddFreq(int64 count) override { ((FixedLengthHeader*)this->ptr_)->AddFreq(count); } @@ -555,7 +555,7 @@ class NormalGPUValuePtr : public LooseValuePtr { ((FixedLengthHeader*)this->ptr_)->AddFreq(); } - void AddFreq(int64 count) { + void AddFreq(int64 count) override { ((FixedLengthHeader*)this->ptr_)->AddFreq(count); } diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index e189fc69978..1bf4f2c210b 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -153,6 +153,7 @@ class Node { bool IsSparseApplyAdagradOps() const { return type_string() == "SparseApplyAdagrad" || type_string() == "KvResourceSparseApplyAdagrad" || + type_string() == "KvResourceSparseApplyAdagradWithCounts" || type_string() == "ResourceSparseApplyAdagrad"; } bool IsApplyAdamOps() const { @@ -165,7 +166,9 @@ class Node { type_string() == "ResourceSparseApplyAdam" || type_string() == "KvResourceSparseApplyAdam" || type_string() == "ResourceSparseApplyAdamAsync" || - type_string() == "KvResourceSparseApplyAdamAsync"; + type_string() == "KvResourceSparseApplyAdamAsync" || + type_string() == "KvResourceSparseApplyAdamWithCounts" || + type_string() == "KvResourceSparseApplyAdamAsyncWithCounts"; } bool IsApplyFtrlOps() const { return type_string() == "ApplyFtrl" || @@ -179,9 +182,11 @@ class Node { type_string() == "ResourceGroupSparseApplyFtrl" || type_string() == "ResourceSparseApplyFtrl" || type_string() == "KvResourceSparseApplyFtrl" || + type_string() == "KvResourceSparseApplyFtrlWithCounts" || type_string() == "SparseApplyFtrlV2" || type_string() == "ResourceSparseApplyFtrlV2" || - type_string() == "KvResourceSparseApplyFtrlV2"; + type_string() == "KvResourceSparseApplyFtrlV2" || + type_string() == "KvResourceSparseApplyFtrlV2WithCounts"; } bool IsKvSparseApply() const { return type_string() == "KvResourceSparseApplyAdagrad" || @@ -191,7 +196,15 @@ class Node { type_string() == "KvResourceSparseApplyFtrl" || type_string() == "KvResourceSparseApplyFtrlV2" || type_string() == "KvResourceSparseApplyGradientDescent" || - type_string() == "KvResourceSparseApplyAdamW"; + type_string() == "KvResourceSparseApplyAdamW" || + type_string() == "KvResourceSparseApplyAdagradWithCounts" || + type_string() == "KvResourceSparseApplyAdagradDecayWithCounts" || + type_string() == "KvResourceSparseApplyAdamWithCounts" || + type_string() == "KvResourceSparseApplyAdamAsyncWithCounts" || + type_string() == "KvResourceSparseApplyFtrlWithCounts" || + type_string() == "KvResourceSparseApplyFtrlV2WithCounts" || + type_string() == "KvResourceSparseApplyGradientDescentWithCounts" || + type_string() == "KvResourceSparseApplyAdamWWithCounts"; } bool IsPlaceholder() const { return type_string() == "Placeholder"; } bool IsSwitch() const { return class_ == NC_SWITCH; } diff --git a/tensorflow/core/kernels/embedding_variable_ops_test.cc 
b/tensorflow/core/kernels/embedding_variable_ops_test.cc
index 1ee1111d7fc..efe327a7ec1 100644
--- a/tensorflow/core/kernels/embedding_variable_ops_test.cc
+++ b/tensorflow/core/kernels/embedding_variable_ops_test.cc
@@ -142,7 +142,7 @@ TEST(TensorBundleTest, TestEVShrinkL2) {
   for (int64 i=0; i < insert_num; ++i) {
     ValuePtr<float>* value_ptr = nullptr;
     Status s = emb_var->LookupOrCreateKey(i, &value_ptr);
-    typename TTypes<float>::Flat vflat = emb_var->flat(value_ptr);
+    typename TTypes<float>::Flat vflat = emb_var->flat(value_ptr, i);
     vflat += vflat.constant((float)i);
   }
 
@@ -182,7 +182,7 @@ TEST(TensorBundleTest, TestEVShrinkLockless) {
   for (int64 i=0; i < insert_num; ++i) {
     ValuePtr<float>* value_ptr = nullptr;
     Status s = emb_var->LookupOrCreateKey(i, &value_ptr);
-    typename TTypes<float>::Flat vflat = emb_var->flat(value_ptr);
+    typename TTypes<float>::Flat vflat = emb_var->flat(value_ptr, i);
     emb_var->UpdateVersion(value_ptr, i);
   }
 
@@ -281,7 +281,7 @@ TEST(EmbeddingVariableTest, TestEVExportSmallLockless) {
   for (int64 i = 0; i < 5; i++) {
     ValuePtr<float>* value_ptr = nullptr;
     variable->LookupOrCreateKey(i, &value_ptr);
-    typename TTypes<float>::Flat vflat = variable->flat(value_ptr);
+    typename TTypes<float>::Flat vflat = variable->flat(value_ptr, i);
     vflat(i) = 5.0;
   }
 
@@ -358,7 +358,7 @@ TEST(EmbeddingVariableTest, TestEVExportLargeLockless) {
   for (int64 i = 0; i < ev_size; i++) {
     ValuePtr<float>* value_ptr = nullptr;
     variable->LookupOrCreateKey(i, &value_ptr);
-    typename TTypes<float>::Flat vflat = variable->flat(value_ptr);
+    typename TTypes<float>::Flat vflat = variable->flat(value_ptr, i);
   }
 
   LOG(INFO) << "size:" << variable->Size();
@@ -420,7 +420,7 @@ void multi_insertion(EmbeddingVar<int64, float>* variable, int64 value_size){
   for (long j = 0; j < 5; j++) {
     ValuePtr<float>* value_ptr = nullptr;
     variable->LookupOrCreateKey(j, &value_ptr);
-    typename TTypes<float>::Flat vflat = variable->flat(value_ptr);
+    typename TTypes<float>::Flat vflat = variable->flat(value_ptr, j);
   }
 }
 
@@ -908,7 +908,7 @@ void BM_MULTIREAD_LOCKLESS(int iters, int thread_num) {
   for (int64 i = 0; i < InsertLoop; i++){
     ValuePtr<float>* value_ptr = nullptr;
     variable->LookupOrCreateKey(i, &value_ptr);
-    typename TTypes<float>::Flat vflat = variable->flat(value_ptr);
+    typename TTypes<float>::Flat vflat = variable->flat(value_ptr, i);
   }
 
   testing::StartTiming();
@@ -1196,7 +1196,7 @@ TEST(EmbeddingVariableTest, TestLRUCache) {
   for (int i = 0; i < num_access; i++){
     ids[i] = i % num_ids;
   }
-  cache->add_to_rank(ids, num_access);
+  cache->update(ids, num_access);
   int64 size = cache->get_evic_ids(evict_ids, num_evict);
   ASSERT_EQ(size, num_ids);
   ASSERT_EQ(cache->size(), 0);
@@ -1216,7 +1216,7 @@ TEST(EmbeddingVariableTest, TestLRUCacheGetCachedIds) {
   for (int i = 0; i < num_access; i++){
     ids[i] = i % num_ids;
   }
-  cache->add_to_rank(ids, num_access);
+  cache->update(ids, num_access);
   ASSERT_EQ(cache->size(), num_ids);
   int64* cached_ids = new int64[num_cache];
   int64* cached_freqs = new int64[num_cache];
@@ -1244,7 +1244,7 @@ TEST(EmbeddingVariableTest, TestLFUCacheGetCachedIds) {
   for (int i = 0; i < num_access; i++){
     ids[i] = i % num_ids;
   }
-  cache->add_to_rank(ids, num_access);
+  cache->update(ids, num_access);
   ASSERT_EQ(cache->size(), num_ids);
   int64* cached_ids = new int64[num_cache];
   int64* cached_freqs = new int64[num_cache];
@@ -1315,7 +1315,7 @@ TEST(EmbeddingVariableTest, TestLFUCache) {
   for (int i = 0; i < num_access; i++){
     ids[i] = i % num_ids;
   }
-  cache->add_to_rank(ids, num_access);
+  cache->update(ids, num_access);
   int64 size = cache->get_evic_ids(evict_ids, num_evict);
   ASSERT_EQ(size, num_ids);
   ASSERT_EQ(cache->size(), 0);
diff --git
a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops.cc index 0e4d82a22fb..ba251559f7b 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops.cc @@ -144,7 +144,7 @@ class GroupEmbeddingVariableLookupDenseCpuOp embedding_var->storage()->Schedule( [embedding_var, dense_values_tensor] { embedding::BatchCache* cache = embedding_var->Cache(); - cache->add_to_rank(dense_values_tensor); + cache->update(dense_values_tensor); }); } } diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cc index 92a599babb2..bb9888f642b 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cc @@ -176,7 +176,7 @@ class GroupEmbeddingVariableLookupCpuOp embedding_var->storage()->Schedule( [embedding_var, sp_values_tensor] { embedding::BatchCache *cache = embedding_var->Cache(); - cache->add_to_rank(sp_values_tensor); + cache->update(sp_values_tensor); }); } diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc index 97f44eb8ae6..9eb43b6127a 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc @@ -233,7 +233,7 @@ class GroupEmbeddingVarLookupOp if (ev->IsMultiLevel()) { ev->storage()->Schedule([ev, indices_host, N]() { embedding::BatchCache* cache = ev->Cache(); - cache->add_to_rank(indices_host, N); + cache->update(indices_host, N); delete[] indices_host; }); } diff --git a/tensorflow/core/kernels/incr_save_restore_ops.h b/tensorflow/core/kernels/incr_save_restore_ops.h index f2d3be3fb5e..0582697ad16 100644 --- a/tensorflow/core/kernels/incr_save_restore_ops.h +++ b/tensorflow/core/kernels/incr_save_restore_ops.h @@ -227,7 +227,7 @@ class IncrEVValueDumpIterator : public DumpIterator { } ValuePtr* value_ptr = NULL; TF_CHECK_OK(emb_var_->LookupOrCreateKey(*keys_iter_, &value_ptr)); - return emb_var_->flat(value_ptr)(col_idx_++); + return emb_var_->flat(value_ptr, *keys_iter_)(col_idx_++); } private: diff --git a/tensorflow/core/kernels/kv_variable_lookup_ops.cc b/tensorflow/core/kernels/kv_variable_lookup_ops.cc index 6b3139645c0..12ca0f66ec9 100644 --- a/tensorflow/core/kernels/kv_variable_lookup_ops.cc +++ b/tensorflow/core/kernels/kv_variable_lookup_ops.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/config.pb.h" #include "tensorflow/core/framework/embedding/embedding_var.h" +#include "tensorflow/core/framework/embedding/embedding_var_context.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/resource_mgr.h" @@ -47,6 +48,8 @@ limitations under the License. 
#endif //GOOGLE_CUDA namespace tensorflow { +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; #if GOOGLE_CUDA using se::DeviceMemoryBase; @@ -612,50 +615,13 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) #undef REGISTER_KERNELS #endif //GOOGLE_CUDA -template +template class KvResourceGatherOp : public OpKernel { public: explicit KvResourceGatherOp(OpKernelConstruction* c) : OpKernel(c) { - OP_REQUIRES_OK(c, c->GetAttr("is_inference", &is_inference_)); - bool is_inference; - TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference)); - is_inference_ |= is_inference; OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", &is_use_default_value_tensor_)); - if (is_use_default_value_tensor_) { - get_default_v_fn_ = [](TValue* default_v, TKey id, int64 index, - int64 total_dim, int64 len) { - return default_v + len * index; - }; - } else { - get_default_v_fn_ = [](TValue* default_v, TKey id, int64 index, - int64 total_dim, int64 len) { - return default_v + len * (id % total_dim) ; - }; - } - if (c->num_inputs() == 4) { - get_count_fn_ = [](const int32* count, int64 index) { - return count[index]; - }; - } else { - get_count_fn_ = [](const int32* count, int64 index) { - return 1; - }; - } - if (!is_inference_) { - lookup_fn_ = [](EmbeddingVar* ev, TKey key, - TValue* val, TValue* default_v, int count) { - ev->LookupOrCreate(key, val, default_v, count); - return Status::OK(); - }; - } else { - lookup_fn_ = [](EmbeddingVar* ev, TKey key, - TValue* val, TValue* default_v, int count) { - Status s = ev->Lookup(key, val, default_v); - return s; - }; - } } void Compute(OpKernelContext* c) override { @@ -672,23 +638,11 @@ class KvResourceGatherOp : public OpKernel { Tensor* out = nullptr; OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out)); - int32* counts = nullptr; - if (c->num_inputs() == 4) - counts = (int32*)c->input(3).data(); - if (N > 0) { auto out_flat = out->shaped({N, out->NumElements() / N}); TValue* out_base = &out_flat(0, 0); - auto indices_flat = indices.flat(); - const int64 indices_size = static_cast(indices_flat.dimension(0)); const int64 slice_elems = out_flat.dimension(1); - TValue* default_v = nullptr; - if (is_use_default_value_tensor_) { - default_v = (TValue*)c->input(2).data(); - } else { - default_v = ev->GetDefaultValuePtr(); - } OP_REQUIRES(c, ev->ValueLen() == slice_elems, errors::InvalidArgument( "ev's value_len should same with output's dimension(1)", @@ -698,54 +652,33 @@ class KvResourceGatherOp : public OpKernel { errors::InvalidArgument( "MultiLevel EV's Cache size ", ev->CacheSize(), " should large than IDs in batch ", N)); - const size_t slice_bytes = slice_elems * sizeof(TValue); - auto do_work = [this, indices_flat, - out_base, slice_elems, c, default_v, ev, counts] ( - int64 start, int64 limit) { - for (int64 i = start; i < limit; ++i) { - TValue* default_v_ptr = get_default_v_fn_( - default_v, indices_flat(i), i, ev->GetDefaultValueDim(), - ev->ValueLen()); - int32 count = get_count_fn_(counts, i); - OP_REQUIRES_OK(c, lookup_fn_(ev, indices_flat(i), - out_base + i * slice_elems, default_v_ptr, count)); - } - }; - auto worker_threads = c->device()->tensorflow_cpu_worker_threads(); - Shard(worker_threads->num_threads, - worker_threads->workers, indices_size, - slice_bytes, do_work); - if (ev->IsMultiLevel()) { - embedding::BatchCache* cache = ev->Cache(); - ev->storage()->Schedule([ev, indices]() { - embedding::BatchCache* cache = ev->Cache(); - cache->add_to_rank(indices); - }); + EmbeddingVarContext 
ev_ctx(c); + if (is_use_default_value_tensor_) { + ev->GetEmbeddings(ev_ctx, (TKey*)indices.data(), out_base, N, + reinterpret_cast(c->input(2).data())); + } else { + ev->GetEmbeddings(ev_ctx, (TKey*)indices.data(), out_base, N); + if (has_counts) { + const Tensor& indices_counts = c->input(2); + ev->UpdateCache(indices, indices_counts, true); + } else { + ev->UpdateCache(indices, true); + } } } } private: bool is_use_default_value_tensor_; - bool is_inference_; - std::function< - TValue*(TValue*, TKey, int64, int64, int64)> get_default_v_fn_; - std::function get_count_fn_; - std::function* ev, - TKey key, TValue* val, TValue* default_v, int count)> lookup_fn_; }; #define REGISTER_KERNELS(dev, ktype, vtype) \ REGISTER_KERNEL_BUILDER(Name("KvResourceGather") \ .Device(DEVICE_##dev) \ - .HostMemory("resource") \ - .HostMemory("indices") \ - .HostMemory("default_value") \ - .HostMemory("output") \ .TypeConstraint("dtype") \ .TypeConstraint("Tkeys"), \ - KvResourceGatherOp) + KvResourceGatherOp) #define REGISTER_KERNELS_ALL_INDICES(type) \ REGISTER_KERNELS(CPU, int32, type); \ @@ -755,6 +688,22 @@ TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDICES) #undef REGISTER_KERNELS_ALL_INDICES #undef REGISTER_KERNELS +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceGatherV1") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceGatherOp) + +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + #if GOOGLE_CUDA template class KvResourceGatherGPUOp : public OpKernel { @@ -973,7 +922,7 @@ class KvResourceGatherGPUOp : public OpKernel { if (ev->IsMultiLevel()) { ev->storage()->Schedule([ev, indices_host, N]() { embedding::BatchCache* cache = ev->Cache(); - cache->add_to_rank(indices_host, N); + cache->update(indices_host, N); delete []indices_host; }); } @@ -1011,28 +960,24 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU); #undef REGISTER_KERNELS_GPU #undef REGISTER_KERNELS_ALL #undef REGISTER_KERNELS -#endif // GOOGLE_CUDA #define REGISTER_KERNELS(dev, ktype, vtype) \ REGISTER_KERNEL_BUILDER(Name("KvResourceGatherV1") \ .Device(DEVICE_##dev) \ - .HostMemory("resource") \ - .HostMemory("indices") \ - .HostMemory("default_value") \ .HostMemory("counts") \ - .HostMemory("output") \ .TypeConstraint("dtype") \ .TypeConstraint("Tkeys"), \ - KvResourceGatherOp) + KvResourceGatherGPUOp) #define REGISTER_KERNELS_ALL(dev, type) \ REGISTER_KERNELS(dev, int32, type); \ REGISTER_KERNELS(dev, int64, type) -#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) -TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) -#undef REGISTER_KERNELS_CPU +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU #undef REGISTER_KERNELS_ALL #undef REGISTER_KERNELS +#endif // GOOGLE_CUDA template class EVGetFrequencyOp : public OpKernel { diff --git a/tensorflow/core/kernels/kv_variable_ops.cc b/tensorflow/core/kernels/kv_variable_ops.cc index 07129229f13..20ea6d3cb61 100644 --- a/tensorflow/core/kernels/kv_variable_ops.cc +++ b/tensorflow/core/kernels/kv_variable_ops.cc @@ -195,6 +195,17 @@ class InitializeKvVariableOp : public OpKernel { OP_REQUIRES_OK(c, c->GetAttr("slot_num", &slot_num_)); OP_REQUIRES_OK(c, 
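
With the lookup refactor above, the gather kernel shrinks to a dispatch: resolve per-row defaults either from the `default_value` input tensor or from the variable's own default table, then leave cache bookkeeping to `UpdateCache`, which this commit skips for training-graph gathers so that the backward `*WithCounts` ops perform the insertion instead. A condensed sketch of the branch structure with toy stand-in types:

```
// Condensed KvResourceGatherOp::Compute control flow; Ev/Ctx are stand-ins.
#include <cstdint>

struct Ctx {};
struct Ev {
  void GetEmbeddings(Ctx&, const int64_t*, float*, int64_t) {}
  void GetEmbeddings(Ctx&, const int64_t*, float*, int64_t,
                     float* /*per-row defaults*/) {}
  void UpdateCache(const int64_t*, int64_t, const int64_t* /*counts*/,
                   bool is_called_by_gather) {}
};

template <bool has_counts>  // KvResourceGatherV1 vs. KvResourceGather
void GatherSketch(Ev& ev, Ctx& c, const int64_t* indices, int64_t n,
                  float* out, float* default_value_tensor,
                  const int64_t* counts) {
  if (default_value_tensor != nullptr) {
    // is_use_default_value_tensor_: defaults arrive as an op input.
    ev.GetEmbeddings(c, indices, out, n, default_value_tensor);
  } else {
    ev.GetEmbeddings(c, indices, out, n);
    // Inference-mode variables still refresh the multi-tier cache here;
    // in training the gather-side update is a no-op and the backward op
    // inserts the new features.
    ev.UpdateCache(indices, n, has_counts ? counts : nullptr,
                   /*is_called_by_gather=*/true);
  }
}
```
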
c->GetAttr("record_freq", &record_freq_)); OP_REQUIRES_OK(c, c->GetAttr("record_version", &record_version_)); + int embedding_var_type= 0; + Status s = c->GetAttr("embedding_variable_type", &embedding_var_type); + if (!s.ok()) { + //Not InitializeKvVariableV2Op! + embedding_var_type = embedding::EmbeddingVariableType::MUTABLE; + } + is_inference_ = false; + TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference_)); + is_inference_ |= + (embedding_var_type == + embedding::EmbeddingVariableType::IMMUTABLE); int64 storage_type = 0; OP_REQUIRES_OK(c, c->GetAttr("storage_type", &storage_type)); @@ -315,7 +326,8 @@ class InitializeKvVariableOp : public OpKernel { max_element_size_, false_positive_probability_, counter_type_, default_value_dim_, default_value_no_permission_, - record_freq_, record_version_); + record_freq_, record_version_, + is_inference_); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( @@ -350,7 +362,8 @@ class InitializeKvVariableOp : public OpKernel { steps_to_live_, filter_freq_, max_freq_, l2_weight_threshold_, layout_, max_element_size_, false_positive_probability_, - counter_type_, 0, record_freq_, record_version_); + counter_type_, 0, record_freq_, record_version_, + is_inference_); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( @@ -385,7 +398,8 @@ class InitializeKvVariableOp : public OpKernel { false_positive_probability_, counter_type_, default_value_dim_, default_value_no_permission_, - record_freq_, record_version_), + record_freq_, record_version_, + is_inference_), primary_variable->GetAllocator()); return (*ptr)->Init(default_values, default_value_dim_); })); @@ -421,6 +435,7 @@ class InitializeKvVariableOp : public OpKernel { float default_value_no_permission_; bool record_freq_; bool record_version_; + bool is_inference_; }; #define REGISTER_KERNELS(ktype, vtype) \ @@ -436,6 +451,19 @@ TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) #undef REGISTER_KERNELS_ALL_INDEX #undef REGISTER_KERNELS +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("InitializeKvVariableV2Op") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + InitializeKvVariableOp); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + #if GOOGLE_CUDA #define REGISTER_KERNELS(ktype, vtype) \ REGISTER_KERNEL_BUILDER(Name("InitializeKvVariableOp") \ @@ -444,6 +472,20 @@ TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) .TypeConstraint("dtype"), \ InitializeKvVariableOp); +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("InitializeKvVariableV2Op") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + InitializeKvVariableOp); + #define REGISTER_GPU_KERNELS(type) \ REGISTER_KERNELS(int32, type); \ REGISTER_KERNELS(int64, type); diff --git a/tensorflow/core/kernels/kv_variable_ops.h b/tensorflow/core/kernels/kv_variable_ops.h index 69961839e57..a17a75d8124 100644 --- a/tensorflow/core/kernels/kv_variable_ops.h +++ b/tensorflow/core/kernels/kv_variable_ops.h @@ -756,7 +756,7 @@ Status DynamicRestoreValue(EmbeddingVar* ev, BundleReader* reader, st = 
ev->Import(restore_buff, read_key_num, kSavedPartitionNum, partition_id, partition_num, false, device); if (cache_for_restore_hbm) { - cache_for_restore_hbm->add_to_rank( + cache_for_restore_hbm->update( (K*)restore_buff.key_buffer, read_key_num, (int64*)restore_buff.version_buffer, (int64*)restore_buff.freq_buffer); @@ -781,7 +781,7 @@ Status DynamicRestoreValue(EmbeddingVar* ev, BundleReader* reader, ev->storage()->Schedule([ev, hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs]() { embedding::BatchCache* cache = ev->Cache(); - cache->add_to_rank(hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs); + cache->update(hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs); delete[] hbm_ids; delete[] hbm_freqs; }); @@ -981,7 +981,7 @@ Status EVRestoreNoPartition(EmbeddingVar* ev, BundleReader* reader, } st = ev->Import(restore_buff, read_key_num, 1, 0, 1, false, device); if (cache_for_restore_hbm) { - cache_for_restore_hbm->add_to_rank( + cache_for_restore_hbm->update( (K*)restore_buff.key_buffer, read_key_num, (int64*)restore_buff.version_buffer, (int64*)restore_buff.freq_buffer); @@ -1018,7 +1018,7 @@ Status EVRestoreNoPartition(EmbeddingVar* ev, BundleReader* reader, st = ev->Import(restore_buff, read_key_num, 1, 0, 1, true, device); if (cache_for_restore_hbm) { - cache_for_restore_hbm->add_to_rank( + cache_for_restore_hbm->update( (K*)restore_buff.key_buffer, read_key_num, (int64*)restore_buff.version_buffer, (int64*)restore_buff.freq_buffer); @@ -1043,7 +1043,7 @@ Status EVRestoreNoPartition(EmbeddingVar* ev, BundleReader* reader, ev->storage()->Schedule([ev, hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs]() { embedding::BatchCache* cache = ev->Cache(); - cache->add_to_rank(hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs); + cache->update(hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs); delete[] hbm_ids; delete[] hbm_freqs; }); @@ -1424,7 +1424,7 @@ Status EVRestoreDynamically(EmbeddingVar* ev, st = ev->Import(restore_buff, read_key_num, kSavedPartitionNum, partition_id, partition_num, false, device); if (cache_for_restore_hbm) { - cache_for_restore_hbm->add_to_rank( + cache_for_restore_hbm->update( (K*)restore_buff.key_buffer, read_key_num, (int64*)restore_buff.version_buffer, (int64*)restore_buff.freq_buffer); @@ -1477,7 +1477,7 @@ Status EVRestoreDynamically(EmbeddingVar* ev, st = ev->Import(restore_buff, read_key_num, kSavedPartitionNum, partition_id, partition_num, true, device); if (cache_for_restore_hbm) { - cache_for_restore_hbm->add_to_rank( + cache_for_restore_hbm->update( (K*)restore_buff.key_buffer, read_key_num, (int64*)restore_buff.version_buffer, (int64*)restore_buff.freq_buffer); @@ -1504,7 +1504,7 @@ Status EVRestoreDynamically(EmbeddingVar* ev, ev->storage()->Schedule([ev, hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs]() { embedding::BatchCache* cache = ev->Cache(); - cache->add_to_rank(hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs); + cache->update(hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs); delete[] hbm_ids; delete[] hbm_freqs; }); diff --git a/tensorflow/core/kernels/training_ali_ops.cc b/tensorflow/core/kernels/training_ali_ops.cc index d82ac1bf96e..8e8edceb2ff 100644 --- a/tensorflow/core/kernels/training_ali_ops.cc +++ b/tensorflow/core/kernels/training_ali_ops.cc @@ -67,7 +67,8 @@ struct ApplyAdagradDecay { } -template +template class KvSparseApplyAdagradOp : public OpKernel { public: explicit KvSparseApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -118,6 +119,15 @@ class KvSparseApplyAdagradOp : public OpKernel { OP_REQUIRES(ctx, 
inner_dim > 0, errors::InvalidArgument( "Inner dimension should be greater than zero.")); + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(6); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) {return counts[index];}; + } else { + get_count_fn = [](int64* counts, int64 index) {return 1;}; + } if (N > 0) { if (inner_dim > 0) { @@ -126,18 +136,20 @@ class KvSparseApplyAdagradOp : public OpKernel { T lr_scalar = lr.scalar()(); Tstep gs = global_step.scalar()(); auto do_work = [this, ctx, &indices_vec, var, accum, &grad_flat, - &gs, &lr_scalar] (int64 start_i, int64 limit_i) { + &gs, &lr_scalar, indices_counts, get_count_fn] + (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const TKey index = indices_vec(i); ValuePtr* value_ptr = nullptr; bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, - &is_filter, indices_as_pointer)); + &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto a = accum->flat(value_ptr); + auto a = accum->flat(value_ptr, index); auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr); + auto v = var->flat(value_ptr, index); a += g.square(); v -= g.constant(lr_scalar) * g * a.rsqrt(); } @@ -146,6 +158,11 @@ class KvSparseApplyAdagradOp : public OpKernel { const int64 cost = 1000; //very unreliable estimate for cost per step. auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, N, cost, do_work); + + if (has_counts) { + const Tensor& indices_counts = ctx->input(6); + var->UpdateCache(indices, indices_counts); + } } } } @@ -160,13 +177,25 @@ class KvSparseApplyAdagradOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvSparseApplyAdagradOp); \ + KvSparseApplyAdagradOp); \ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdagrad") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvSparseApplyAdagradOp); + KvSparseApplyAdagradOp);\ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdagradWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdagradOp); \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdagradWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdagradOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(int32, T, int32); \ REGISTER_KERNELS(int64, T, int32); \ @@ -424,6 +453,25 @@ DECLARE_GPU_SPEC(int64, double); .HostMemory("global_step") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ + KvSparseApplyAdagradGPUOp);\ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdagradWithCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("lr") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdagradGPUOp);\ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdagradWithCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + 
.TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ KvSparseApplyAdagradGPUOp); #define REGISTER_GPU_KERNELS(T) \ REGISTER_KERNELS(int32, T, int32); \ @@ -437,7 +485,8 @@ TF_CALL_float(REGISTER_GPU_KERNELS); #endif // GOOGLE_CUDA // Note, this op works on cpu only. -template +template class KvSparseApplyFtrlOp : public OpKernel { public: explicit KvSparseApplyFtrlOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -521,6 +570,16 @@ class KvSparseApplyFtrlOp : public OpKernel { "is not a non-negative scalar: ", l2_shrinkage->shape().DebugString())); } + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const int counts_input_index = has_l2_shrinkage ? 10 : 9; + const Tensor& counts_tensor = ctx->input(counts_input_index); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) {return counts[index];}; + } else { + get_count_fn = [](int64* counts, int64 index) {return 1;}; + } if (N > 0) { if (inner_dim > 0) { @@ -537,19 +596,20 @@ class KvSparseApplyFtrlOp : public OpKernel { auto do_work = [this, ctx, inner_dim, &var_, &indices_vec, &accum_, &linear_, &grad_flat, &lr_scalar, &l1_scalar, &l2_scalar, &lr_power, - &l2_shrinkage_scalar, &lr_power_scalar] - (int64 start_i, int64 limit_i) { - + &l2_shrinkage_scalar, &lr_power_scalar, + get_count_fn, indices_counts] + (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const TKey index = indices_vec(i); ValuePtr* value_ptr = nullptr; bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var_->LookupOrCreateKey(index, &value_ptr, - &is_filter, indices_as_pointer)); + &is_filter, indices_as_pointer, count)); if (is_filter) { - auto var = var_->flat(value_ptr); - auto accum = accum_->flat(value_ptr); - auto linear = linear_->flat(value_ptr); + auto var = var_->flat(value_ptr, index); + auto accum = accum_->flat(value_ptr, index); + auto linear = linear_->flat(value_ptr, index); auto grad = grad_flat.template chip<0>(i); // Use a macro to implement the computation here due to the templating of the @@ -598,6 +658,12 @@ class KvSparseApplyFtrlOp : public OpKernel { const int64 cost = 4500; //very unreliable estimate for cost per step. auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, N, cost, do_work); + + if (has_counts) { + const int counts_input_index = has_l2_shrinkage ? 
10 : 9; + const Tensor& indices_counts = ctx->input(counts_input_index); + var_->UpdateCache(indices, indices_counts); + } } } @@ -614,13 +680,25 @@ class KvSparseApplyFtrlOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyFtrlOp);\ + KvSparseApplyFtrlOp);\ REGISTER_KERNEL_BUILDER( \ Name("_OPT_KvResourceSparseApplyFtrl") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyFtrlOp); + KvSparseApplyFtrlOp);\ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyFtrlWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp);\ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyFtrlWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(int64, T); \ @@ -637,13 +715,25 @@ TF_CALL_float(REGISTER_CPU_KERNELS); .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyFtrlOp);\ + KvSparseApplyFtrlOp);\ REGISTER_KERNEL_BUILDER( \ Name("_OPT_KvResourceSparseApplyFtrlV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyFtrlOp); + KvSparseApplyFtrlOp)\ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyFtrlV2WithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp);\ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyFtrlV2WithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(int64, T); \ @@ -1167,7 +1257,8 @@ TF_CALL_double(REGISTER_CPU_KERNELS); #undef REGISTER_KERNELS // Note, this op works on cpu only. 
-template <typename T, typename Tindex, typename Tstep, bool indices_as_pointer>
+template <typename T, typename Tindex, typename Tstep, bool indices_as_pointer,
+          bool has_counts>
 class KvSparseApplyAdagradDecayOp : public OpKernel {
  public:
   explicit KvSparseApplyAdagradDecayOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -1242,6 +1333,15 @@ class KvSparseApplyAdagradDecayOp : public OpKernel {
         ctx, grad.dim_size(0) == N,
         errors::InvalidArgument(
             "grad must be the same size as indices in the first dimension."));
+    int64* indices_counts = nullptr;
+    std::function<int64(int64*, int64)> get_count_fn = 0;
+    if (has_counts) {
+      const Tensor& counts_tensor = ctx->input(10);
+      indices_counts = (int64*)counts_tensor.data();
+      get_count_fn = [](int64* counts, int64 index) {return counts[index];};
+    } else {
+      get_count_fn = [](int64* counts, int64 index) {return 1;};
+    }
 
     if (N > 0) {
       auto indices_vec = indices.vec<Tindex>();
@@ -1255,21 +1355,24 @@ class KvSparseApplyAdagradDecayOp : public OpKernel {
       auto grad_flat = grad.flat_outer_dims<T>();
       auto do_work = [this, ctx, &indices_vec, &var, &accum, &gs, &grad_flat,
           accum_decay_power_var, &decay_step_scalar,
-          &decay_rate_scalar, &decay_baseline_scalar, &lr_scalar]
-          (int64 start_i, int64 limit_i) {
+          &decay_rate_scalar, &decay_baseline_scalar, &lr_scalar,
+          get_count_fn, indices_counts]
+          (int64 start_i, int64 limit_i) {
         for (int64 i = start_i; i < limit_i; i++) {
           const Tindex index = indices_vec(i);
           ValuePtr<T>* value_ptr = nullptr;
           bool is_filter = false;
+          int64 count = get_count_fn(indices_counts, i);
           OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr,
-                                  &is_filter, indices_as_pointer));
+                                  &is_filter, indices_as_pointer, count));
+          var->UpdateVersion(value_ptr, gs);
           if (is_filter) {
-            auto a = accum->flat(value_ptr);
+            auto a = accum->flat(value_ptr, index);
             auto g = grad_flat.template chip<0>(i);
-            auto v = var->flat(value_ptr);
-            auto accum_decay_power = accum_decay_power_var->flat(value_ptr);
+            auto v = var->flat(value_ptr, index);
+            auto accum_decay_power = accum_decay_power_var->flat(value_ptr, index);
 
             if (gs / decay_step_scalar > accum_decay_power(0)) {
               a *= a.constant(decay_rate_scalar);
@@ -1284,6 +1387,10 @@ class KvSparseApplyAdagradDecayOp : public OpKernel {
       const int64 cost = 1000;
       auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
       Shard(worker_threads.num_threads, worker_threads.workers, N, cost, do_work);
+      if (has_counts) {
+        const Tensor& indices_counts = ctx->input(10);
+        var->UpdateCache(indices, indices_counts);
+      }
     }
   }
 
@@ -1303,7 +1410,7 @@ class KvSparseApplyAdagradDecayOp : public OpKernel {
                               .TypeConstraint<T>("T")                        \
                               .TypeConstraint<Tindices>("Tindices")          \
                               .TypeConstraint<Tstep>("Tstep"),               \
-                          KvSparseApplyAdagradDecayOp<T, Tindices, Tstep, false>);\
+                          KvSparseApplyAdagradDecayOp<T, Tindices, Tstep, false, false>);\
   REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdagradDecay")     \
                               .Device(DEVICE_CPU)                            \
                               .HostMemory("var")                             \
@@ -1312,7 +1419,25 @@ class KvSparseApplyAdagradDecayOp : public OpKernel {
                               .TypeConstraint<T>("T")                        \
                               .TypeConstraint<Tindices>("Tindices")          \
                               .TypeConstraint<Tstep>("Tstep"),               \
-                          KvSparseApplyAdagradDecayOp<T, Tindices, Tstep, true>);
+                          KvSparseApplyAdagradDecayOp<T, Tindices, Tstep, true, false>);\
+  REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdagradDecayWithCounts") \
+                              .Device(DEVICE_CPU)                            \
+                              .HostMemory("var")                             \
+                              .HostMemory("accum")                           \
+                              .HostMemory("accum_decay_power")               \
+                              .TypeConstraint<T>("T")                        \
+                              .TypeConstraint<Tindices>("Tindices")          \
+                              .TypeConstraint<Tstep>("Tstep"),               \
+                          KvSparseApplyAdagradDecayOp<T, Tindices, Tstep, false, true>);\
+  REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdagradDecayWithCounts") \
+                              .Device(DEVICE_CPU)                            \
+                              .HostMemory("var")                             \
+                              .HostMemory("accum")                           \
+                              .HostMemory("accum_decay_power")               \
+                              .TypeConstraint<T>("T")                        \
+                              .TypeConstraint<Tindices>("Tindices")          \
+                              .TypeConstraint<Tstep>("Tstep"),               \
+                          KvSparseApplyAdagradDecayOp<T, Tindices, Tstep, true, true>);
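// [Editor's note] Every "WithCounts" kernel in this patch repeats the same
// small pattern: a compile-time has_counts template flag selects between
// reading a per-key occurrence count from the extra int64 indices_counts
// input and falling back to a constant count of 1, and the chosen accessor
// is bound to a std::function once, before the per-key loop that Shard()
// parallelizes. Below is a minimal, self-contained sketch of that dispatch;
// ApplySparseUpdate and LookupOrCreate are illustrative stand-ins, not the
// DeepRec API.

#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

namespace sketch {

using int64 = std::int64_t;

// Stand-in for EmbeddingVar::LookupOrCreateKey: here it only reports the
// count that the update path handed to it.
void LookupOrCreate(int64 index, int64 count) {
  std::cout << "key " << index << " -> count " << count << "\n";
}

template <bool has_counts>
void ApplySparseUpdate(const std::vector<int64>& indices,
                       const int64* indices_counts) {
  // Bind the count accessor once, outside the loop, mirroring how the
  // kernels above build get_count_fn before capturing it in do_work.
  std::function<int64(const int64*, int64)> get_count_fn;
  if (has_counts) {
    get_count_fn = [](const int64* counts, int64 i) { return counts[i]; };
  } else {
    get_count_fn = [](const int64*, int64) -> int64 { return 1; };
  }
  for (int64 i = 0; i < static_cast<int64>(indices.size()); ++i) {
    // The real kernels pass this count into LookupOrCreateKey, which
    // (judging from the CounterFilter tests later in this patch) feeds
    // frequency-based feature admission.
    LookupOrCreate(indices[i], get_count_fn(indices_counts, i));
  }
}

}  // namespace sketch

int main() {
  // Keys are assumed already deduplicated by tf.unique_with_counts, so
  // each position carries that key's frequency in the batch.
  const std::vector<sketch::int64> ids = {3, 7, 9};
  const std::vector<sketch::int64> counts = {1, 2, 1};
  sketch::ApplySparseUpdate<true>(ids, counts.data());  // "WithCounts" path
  sketch::ApplySparseUpdate<false>(ids, nullptr);       // legacy path
}

// Binding the accessor once keeps the hot loop free of a per-key branch and
// limits the do_work capture list to two extra values, which is presumably
// why the kernels all share this shape.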
#define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int64, int32); \ @@ -1325,7 +1450,8 @@ TF_CALL_float(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS -template +template class KvSparseApplyAdamOp : public OpKernel { public: explicit KvSparseApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -1408,6 +1534,15 @@ class KvSparseApplyAdamOp : public OpKernel { ctx, grad.dim_size(0) == N, errors::InvalidArgument( "grad must be the same size as indices in the first dimension.")); + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(12); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) {return counts[index];}; + } else { + get_count_fn = [](int64* counts, int64 index) {return 1;}; + } if (N > 0) { T beta1_power_scalar = beta1_power.scalar()(); @@ -1422,7 +1557,8 @@ class KvSparseApplyAdamOp : public OpKernel { auto DoWork = [this, ctx, inner_dim, &var, &m, &v, &grad, &indices, &beta1_power_scalar, &beta2_power_scalar, &lr_scalar, &beta1_scalar, - &beta2_scalar, &epsilon_scalar, &alpha, &global_step] (int64 start_i, int64 limit_i) { + &beta2_scalar, &epsilon_scalar, &alpha, &global_step, + get_count_fn, indices_counts] (int64 start_i, int64 limit_i) { if (inner_dim > 0) { auto grad_flat = grad.flat_outer_dims(); auto indices_vec = indices.vec(); @@ -1433,13 +1569,14 @@ class KvSparseApplyAdamOp : public OpKernel { const Tindex index = indices_vec(i); ValuePtr* value_ptr = nullptr; bool is_filter =false; + int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, - &is_filter, indices_as_pointer)); + &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto var_i = var->flat(value_ptr); - auto m_a = m->flat(value_ptr); - auto v_a = v->flat(value_ptr); + auto var_i = var->flat(value_ptr, index); + auto m_a = m->flat(value_ptr, index); + auto v_a = v->flat(value_ptr, index); auto g = grad_flat.template chip<0>(i); m_a += (g - m_a) * (static_cast(1) - beta1_scalar); @@ -1453,6 +1590,10 @@ class KvSparseApplyAdamOp : public OpKernel { const int64 cost = 1000; auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, N, cost, DoWork); + if (has_counts) { + const Tensor& indices_counts = ctx->input(12); + var->UpdateCache(indices, indices_counts); + } } } @@ -1465,12 +1606,22 @@ class KvSparseApplyAdamOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyAdamOp);\ + KvSparseApplyAdamOp);\ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdam") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyAdamOp); + KvSparseApplyAdamOp);\ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdamWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdamOp);\ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdamOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ @@ -1744,7 +1895,7 @@ class KvSparseApplyAdamGPUOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ KvSparseApplyAdamGPUOp); \ - REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdam") \ + 
REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdam") \ .Device(DEVICE_GPU) \ .HostMemory("indices") \ .HostMemory("lr") \ @@ -1756,6 +1907,34 @@ class KvSparseApplyAdamGPUOp : public OpKernel { .HostMemory("global_step") \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ + KvSparseApplyAdamGPUOp);\ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdamWithCounts") \ + .Device(DEVICE_GPU) \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("beta1_power") \ + .HostMemory("beta2_power") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdamGPUOp); \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamWithCounts") \ + .Device(DEVICE_GPU) \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("beta1_power") \ + .HostMemory("beta2_power") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ KvSparseApplyAdamGPUOp); #define REGISTER_GPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ @@ -2242,7 +2421,8 @@ TF_CALL_double(REGISTER_GPU_KERNEL); #endif // End of #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #undef REGISTER_KERNELS -template +template class KvSparseApplyAdamAsyncOp : public OpKernel { public: explicit KvSparseApplyAdamAsyncOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -2333,7 +2513,15 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { ctx, grad.dim_size(0) == N, errors::InvalidArgument( "grad must be the same size as indices in the first dimension.")); - + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(12); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) {return counts[index];}; + } else { + get_count_fn = [](int64* counts, int64 index) {return 1;}; + } if (N > 0) { if (apply_sparse_rmsprop_) { auto indices_vec = indices.vec(); @@ -2345,19 +2533,21 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { const T epsilon_scalar = epsilon.scalar()(); auto do_work = [this, ctx, &indices_vec, &var, v, m, &grad_flat, - &beta2_scalar, &beta1_scalar, &epsilon_scalar, &lr_scalar, &global_step] - (int64 start_i, int64 limit_i) { + &beta2_scalar, &beta1_scalar, &epsilon_scalar, &lr_scalar, &global_step, + get_count_fn, indices_counts] + (int64 start_i, int64 limit_i) { Tstep gs = global_step.scalar()(); for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); ValuePtr* value_ptr = nullptr; bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, - &is_filter, indices_as_pointer)); + &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto v_ = v->flat(value_ptr); - auto m_ = m->flat(value_ptr); + auto v_ = v->flat(value_ptr, index); + auto m_ = m->flat(value_ptr, index); auto grad_ = grad_flat.template chip<0>(i); v_ = v_ * v_.constant(beta2_scalar) + @@ -2366,7 +2556,7 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { (v_ + v_.constant(epsilon_scalar)).rsqrt() * v_.constant(lr_scalar) * grad_; - auto v = var->flat(value_ptr); + auto v = var->flat(value_ptr, index); v -= m_; } } @@ -2388,7 +2578,8 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { auto do_work = 
[this, ctx, inner_dim, &var, &m, &v, &grad, &indices, &lr_scalar, &beta1_scalar, &beta1_power, &beta2_power, - &beta2_scalar, &epsilon_scalar, &alpha, &global_step] (int64 start_i, int64 limit_i) { + &beta2_scalar, &epsilon_scalar, &alpha, &global_step, + get_count_fn, indices_counts] (int64 start_i, int64 limit_i) { if (inner_dim > 0) { auto grad_flat = grad.flat_outer_dims(); @@ -2399,14 +2590,15 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { const Tindex index = indices_vec(i); ValuePtr* value_ptr = nullptr; bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, - &is_filter, indices_as_pointer)); + &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto m_a = m->flat(value_ptr); - auto v_a = v->flat(value_ptr); + auto m_a = m->flat(value_ptr, index); + auto v_a = v->flat(value_ptr, index); auto g = grad_flat.template chip<0>(i); - auto var_i = var->flat(value_ptr); + auto var_i = var->flat(value_ptr, index); m_a = m_a * beta1_scalar + g * (static_cast(1) - beta1_scalar); v_a = v_a * beta2_scalar + g.square() * (static_cast(1) - beta2_scalar); @@ -2423,6 +2615,10 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { beta1_power_scalar() *= beta1_scalar; beta2_power_scalar() *= beta2_scalar; } + if (has_counts) { + const Tensor& indices_counts = ctx->input(12); + var->UpdateCache(indices, indices_counts); + } } MaybeForwardRefInputToRefOutput(ctx, 0, 0); @@ -2439,13 +2635,25 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvSparseApplyAdamAsyncOp);\ + KvSparseApplyAdamAsyncOp);\ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamAsync") \ .Device(DEVICE_##D) \ .TypeConstraint("T") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvSparseApplyAdamAsyncOp); + KvSparseApplyAdamAsyncOp);\ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdamAsyncWithCounts") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncOp);\ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamAsyncWithCounts") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(CPU, T, int32, int32); \ @@ -2769,6 +2977,30 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncGPUOp); \ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdamAsyncWithCounts") \ + .Device(DEVICE_##D) \ + .HostMemory("lr") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncGPUOp); \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamAsyncWithCounts") \ + .Device(DEVICE_##D) \ + .HostMemory("lr") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ KvSparseApplyAdamAsyncGPUOp); #define REGISTER_GPU_KERNELS(T) \ REGISTER_KERNELS(GPU, T, int32, int32); \ @@ -2822,7 +3054,8 @@ 
DECLARE_GPU_SPEC_TYPE(double); #endif // End of GOOGLE_CUDA || TENSORFLOW_USE_ROCM -template +template class KvResourceSparseApplyGradientDescentOp : public OpKernel { public: explicit KvResourceSparseApplyGradientDescentOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -2871,6 +3104,15 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { ctx, grad.dim_size(0) == N, errors::InvalidArgument( "grad must be the same size as indices in the first dimension.")); + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(5); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) {return counts[index];}; + } else { + get_count_fn = [](int64* counts, int64 index) {return 1;}; + } if (N > 0) { auto indices_vec = indices.vec(); @@ -2880,17 +3122,19 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { if (inner_dim > 0) { auto grad_flat = grad.flat_outer_dims(); auto do_work = [this, ctx, &indices_vec, var, &grad_flat, &gs, - &lr_scalar] (int64 start_i, int64 limit_i) { + &lr_scalar, indices_counts, get_count_fn] + (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); ValuePtr* value_ptr = nullptr; bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, - &is_filter, indices_as_pointer)); + &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr); + auto v = var->flat(value_ptr, index); v -= g.constant(lr_scalar) * g; } } @@ -2898,6 +3142,12 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { const int64 cost = 1000; auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, N, cost, do_work); + if (has_counts) { + const Tensor& indices = ctx->input(5); + var->UpdateCache(indices, indices_counts); + } else { + var->UpdateCache(indices); + } } } @@ -2915,14 +3165,28 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvResourceSparseApplyGradientDescentOp);\ + KvResourceSparseApplyGradientDescentOp);\ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyGradientDescent")\ .Device(DEVICE_CPU) \ .HostMemory("var") \ .TypeConstraint("T") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvResourceSparseApplyGradientDescentOp); + KvResourceSparseApplyGradientDescentOp);\ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyGradientDescentWithCounts") \ + .Device(DEVICE_CPU) \ + .HostMemory("var") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvResourceSparseApplyGradientDescentOp);\ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyGradientDescentWithCounts")\ + .Device(DEVICE_CPU) \ + .HostMemory("var") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvResourceSparseApplyGradientDescentOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int64, int32); \ @@ -2935,7 +3199,8 @@ TF_CALL_float(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS -template +template class KvSparseApplyAdamWOp : public OpKernel { public: explicit KvSparseApplyAdamWOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -3023,6 
+3288,15 @@ class KvSparseApplyAdamWOp : public OpKernel { ctx, grad.dim_size(0) == N, errors::InvalidArgument( "grad must be the same size as indices in the first dimension.")); + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(13); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) {return counts[index];}; + } else { + get_count_fn = [](int64* counts, int64 index) {return 1;}; + } if (N > 0) { T beta1_power_scalar = beta1_power.scalar()(); @@ -3037,9 +3311,10 @@ class KvSparseApplyAdamWOp : public OpKernel { (static_cast(1) - beta1_power_scalar); auto DoWork = [this, ctx, inner_dim, &var, &m, &v, &grad, &indices, - &beta1_power_scalar, &beta2_power_scalar, &lr_scalar, &beta1_scalar, - &beta2_scalar, &epsilon_scalar, &alpha, &global_step, - &weight_decay_scalar] (int64 start_i, int64 limit_i) { + &beta1_power_scalar, &beta2_power_scalar, &lr_scalar, &beta1_scalar, + &beta2_scalar, &epsilon_scalar, &alpha, &global_step, + &weight_decay_scalar, get_count_fn, indices_counts] + (int64 start_i, int64 limit_i) { if (inner_dim > 0) { auto grad_flat = grad.flat_outer_dims(); auto indices_vec = indices.vec(); @@ -3050,13 +3325,14 @@ class KvSparseApplyAdamWOp : public OpKernel { const Tindex index = indices_vec(i); ValuePtr* value_ptr = nullptr; bool is_filter =false; + int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, - &is_filter, indices_as_pointer)); + &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto var_i = var->flat(value_ptr); - auto m_a = m->flat(value_ptr); - auto v_a = v->flat(value_ptr); + auto var_i = var->flat(value_ptr, index); + auto m_a = m->flat(value_ptr, index); + auto v_a = v->flat(value_ptr, index); auto g = grad_flat.template chip<0>(i); // m_a = beta1 * m + (1 - beta1) * g m_a += (g - m_a) * (static_cast(1) - beta1_scalar); @@ -3071,6 +3347,10 @@ class KvSparseApplyAdamWOp : public OpKernel { const int64 cost = 1000; auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, N, cost, DoWork); + if (has_counts) { + const Tensor& indices_counts = ctx->input(13); + var->UpdateCache(indices, indices_counts); + } } } @@ -3083,12 +3363,22 @@ class KvSparseApplyAdamWOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyAdamWOp);\ + KvSparseApplyAdamWOp);\ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamW") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyAdamWOp); + KvSparseApplyAdamWOp);\ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdamWWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdamWOp);\ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamWWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdamWOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ @@ -3379,6 +3669,36 @@ class KvSparseApplyAdamWGPUOp : public OpKernel { .HostMemory("weight_decay") \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ + KvSparseApplyAdamWGPUOp);\ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdamWWithCounts") \ + .Device(DEVICE_GPU) \ + .HostMemory("indices") \ + .HostMemory("lr") \ + 
.HostMemory("beta1_power") \ + .HostMemory("beta2_power") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("weight_decay") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdamWGPUOp); \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamWWithCounts") \ + .Device(DEVICE_GPU) \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("beta1_power") \ + .HostMemory("beta2_power") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("weight_decay") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ KvSparseApplyAdamWGPUOp); #define REGISTER_GPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ diff --git a/tensorflow/core/kernels/unique_ali_op.cc b/tensorflow/core/kernels/unique_ali_op.cc index 1a6cf01491e..28b5dad1990 100644 --- a/tensorflow/core/kernels/unique_ali_op.cc +++ b/tensorflow/core/kernels/unique_ali_op.cc @@ -178,6 +178,31 @@ class UniqueAliOp : public OpKernel { TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); REGISTER_UNIQUE(string) #undef REGISTER_UNIQUE + +#if GOOGLE_CUDA +#define REGISTER_UNIQUE(type) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts") \ + .Device(DEVICE_GPU) \ + .HostMemory("x") \ + .HostMemory("y") \ + .HostMemory("idx") \ + .HostMemory("count") \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueAliOp) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts") \ + .Device(DEVICE_GPU) \ + .HostMemory("x") \ + .HostMemory("y") \ + .HostMemory("idx") \ + .HostMemory("count") \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueAliOp); +TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); +REGISTER_UNIQUE(string) +#undef REGISTER_UNIQUE +#endif //GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("Unique") diff --git a/tensorflow/core/ops/kv_variable_ops.cc b/tensorflow/core/ops/kv_variable_ops.cc index a74c4df9670..4d003b4213e 100644 --- a/tensorflow/core/ops/kv_variable_ops.cc +++ b/tensorflow/core/ops/kv_variable_ops.cc @@ -168,6 +168,53 @@ value: the value to set the new tensor to use. dtype: the dtype of the value. )"); +REGISTER_OP("InitializeKvVariableV2Op") + .Input("resource_self: resource") + .Input("resource_primary: resource") + .Input("value: dtype") + .Input("empty_key: Tkeys") + .Attr("slot_num: int = 0") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type") + .Attr("shape: shape") + .Attr("initial_num_buckets: int = 131072") // 2^17 + .Attr("max_load_factor: float = 0.8") + .Attr("steps_to_live: int = 0") + .Attr("ht_type: string = ''") + .Attr("emb_index: int = 0") + .Attr("block_num: int = 1") + .Attr("slot_index: int = 0") + .Attr("ht_partition_num: int = 1000") + .Attr("filter_freq: int = 0") + .Attr("max_freq: int = 999999") + .Attr("max_element_size: int = 0") + .Attr("counter_type: type") + .Attr("false_positive_probability: float = -1.0") + .Attr("l2_weight_threshold: float =-1.0") + .Attr("layout: string = ''") + .Attr("storage_type: int = 0") + .Attr("storage_path: string = '.'") + .Attr("storage_size: list(int) = []") + .Attr("default_value_dim: int = 4096") + .Attr("default_value_no_permission: float = .0") + .Attr("record_freq: bool = false") + .Attr("record_version: bool = false") + .Attr("embedding_variable_type: int = 0") + .SetShapeFn([](InferenceContext* c) { + return Status::OK(); + }) + .Doc(R"( +Assigns a new value to a variable. 
+ +Any ReadVariableOp with a control dependency on this op is guaranteed to return +this value or a subsequent newer value of the variable. + +resource_self: handle to the resource in which to store the variable. +resource_primary: handle to the resource in which to store the variable. +value: the value to set the new tensor to use. +dtype: the dtype of the value. +)"); + REGISTER_OP("KvVarIsInitializedOp") .Input("resource: resource") .Output("is_initialized: bool") diff --git a/tensorflow/core/ops/training_ali_ops.cc b/tensorflow/core/ops/training_ali_ops.cc index 38bd2e0f441..d0c33e5e1c2 100644 --- a/tensorflow/core/ops/training_ali_ops.cc +++ b/tensorflow/core/ops/training_ali_ops.cc @@ -109,6 +109,28 @@ REGISTER_OP(name) \ .Doc(R"doc()doc") REGISTER_OP_BY_NAME("KvResourceSparseApplyAdagrad"); REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdagrad"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ +REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("accum: resource") \ + .Input("lr: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Input("indices_counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyAdagradShapeFn(c, true /* sparse */); \ + }) \ + .Doc(R"doc()doc") +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdagradWithCounts"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdagradWithCounts"); +#undef REGISTER_OP_BY_NAME static Status KvResourceApplyFtrlShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; @@ -149,6 +171,31 @@ REGISTER_OP(name) \ .Doc(R"doc()doc") REGISTER_OP_BY_NAME("KvResourceSparseApplyFtrl"); REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyFtrl"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ +REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("accum: resource") \ + .Input("linear: resource") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("lr: T") \ + .Input("l1: T") \ + .Input("l2: T") \ + .Input("lr_power: T") \ + .Input("indices_counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64, string}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyFtrlShapeFn(c, true /* sparse */);\ + }) \ + .Doc(R"doc()doc") +REGISTER_OP_BY_NAME("KvResourceSparseApplyFtrlWithCounts"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyFtrlWithCounts"); +#undef REGISTER_OP_BY_NAME #define REGISTER_OP_BY_NAME(name) \ REGISTER_OP(name) \ @@ -171,6 +218,31 @@ REGISTER_OP(name) \ }) REGISTER_OP_BY_NAME("KvResourceSparseApplyFtrlV2"); REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyFtrlV2"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ +REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("accum: resource") \ + .Input("linear: resource") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("lr: T") \ + .Input("l1: T") \ + .Input("l2: T") \ + .Input("l2_shrinkage: T") \ + .Input("lr_power: T") \ + .Input("indices_counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64, string}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyFtrlShapeFn(c, true /* sparse */);\ + }) 
+REGISTER_OP_BY_NAME("KvResourceSparseApplyFtrlV2WithCounts"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyFtrlV2WithCounts"); +#undef REGISTER_OP_BY_NAME static Status ApplyAdagradDecayShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; @@ -315,6 +387,33 @@ REGISTER_OP(name) \ .Doc(R"doc()doc") REGISTER_OP_BY_NAME("KvResourceSparseApplyAdagradDecay"); REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdagradDecay"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ +REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("accum: resource") \ + .Input("accum_decay_power: resource") \ + .Input("lr: T") \ + .Input("accum_decay_step: Tstep") \ + .Input("accum_decay_rate: T") \ + .Input("accum_baseline: T") \ + .Input("global_step: Tstep") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("indices_counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvApplyAdagradDecayShapeFn(c, true /* sparse */);\ + }) \ + .Doc(R"doc()doc") +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdagradDecayWithCounts"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdagradDecayWithCounts"); +#undef REGISTER_OP_BY_NAME static Status ApplyAdamAsyncShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; @@ -457,6 +556,35 @@ REGISTER_OP(name) \ .Doc(R"doc()doc") REGISTER_OP_BY_NAME("KvResourceSparseApplyAdam"); REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdam"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ +REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("m: resource") \ + .Input("v: resource") \ + .Input("beta1_power: T") \ + .Input("beta2_power: T") \ + .Input("lr: T") \ + .Input("beta1: T") \ + .Input("beta2: T") \ + .Input("epsilon: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Input("indices_counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64, string}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyAdamShapeFn(c, true /* sparse */);\ + }) \ + .Doc(R"doc()doc") +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdamWithCounts"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdamWithCounts"); +#undef REGISTER_OP_BY_NAME static Status KvApplyAdamAsyncShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; @@ -502,6 +630,35 @@ REGISTER_OP(name) \ }) REGISTER_OP_BY_NAME("KvResourceSparseApplyAdamAsync"); REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdamAsync"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ +REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("m: resource") \ + .Input("v: resource") \ + .Input("beta1_power: resource") \ + .Input("beta2_power: resource") \ + .Input("lr: T") \ + .Input("beta1: T") \ + .Input("beta2: T") \ + .Input("epsilon: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Input("indices_counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("apply_sparse_rmsprop: bool = false")\ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvApplyAdamAsyncShapeFn(c, true /* 
sparse */);\
+  })
+REGISTER_OP_BY_NAME("KvResourceSparseApplyAdamAsyncWithCounts");
+REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdamAsyncWithCounts");
+#undef REGISTER_OP_BY_NAME
 
 static Status KvApplyGradientDescentShapeFn(InferenceContext* c) {
   ShapeHandle unused;
@@ -529,6 +686,25 @@ REGISTER_OP(name) \
   .SetShapeFn(KvApplyGradientDescentShapeFn)
 REGISTER_OP_BY_NAME("KvResourceSparseApplyGradientDescent");
 REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyGradientDescent");
+#undef REGISTER_OP_BY_NAME
+
+#define REGISTER_OP_BY_NAME(name) \
+REGISTER_OP(name) \
+  .Input("var: resource") \
+  .Input("alpha: T") \
+  .Input("grad: T") \
+  .Input("indices: Tindices") \
+  .Input("global_step: Tstep") \
+  .Input("counts: int64") \
+  .Attr("T: numbertype") \
+  .Attr("Tindices: {int32, int64}") \
+  .Attr("Tstep: {int32, int64}") \
+  .Attr("use_locking: bool = false") \
+  .Attr("indices_as_pointer: bool = false") \
+  .SetShapeFn(KvApplyGradientDescentShapeFn)
+REGISTER_OP_BY_NAME("KvResourceSparseApplyGradientDescentWithCounts");
+REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyGradientDescentWithCounts");
+#undef REGISTER_OP_BY_NAME
 
 #define REGISTER_OP_BY_NAME(name) \
 REGISTER_OP(name) \
@@ -556,5 +732,35 @@ REGISTER_OP(name) \
   .Doc(R"doc()doc")
 REGISTER_OP_BY_NAME("KvResourceSparseApplyAdamW");
 REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdamW");
+#undef REGISTER_OP_BY_NAME
+
+#define REGISTER_OP_BY_NAME(name) \
+REGISTER_OP(name) \
+  .Input("var: resource") \
+  .Input("m: resource") \
+  .Input("v: resource") \
+  .Input("beta1_power: T") \
+  .Input("beta2_power: T") \
+  .Input("lr: T") \
+  .Input("beta1: T") \
+  .Input("beta2: T") \
+  .Input("epsilon: T") \
+  .Input("grad: T") \
+  .Input("indices: Tindices") \
+  .Input("global_step: Tstep") \
+  .Input("weight_decay: T") \
+  .Input("indices_counts: int64") \
+  .Attr("T: numbertype") \
+  .Attr("Tindices: {int32, int64, string}") \
+  .Attr("Tstep: {int32, int64}") \
+  .Attr("use_locking: bool = false") \
+  .Attr("indices_as_pointer: bool = false") \
+  .SetShapeFn([](InferenceContext* c) { \
+    return KvResourceApplyAdamShapeFn(c, true /* sparse */);\
+  }) \
+  .Doc(R"doc()doc")
+REGISTER_OP_BY_NAME("KvResourceSparseApplyAdamWWithCounts");
+REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdamWWithCounts");
+#undef REGISTER_OP_BY_NAME
 
 }  // namespace tensorflow
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 4782dcdd6d1..ef2265ec235 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -264,10 +264,13 @@ def _embedding_lookup_and_transform(params,
       gather_ids = data_flow_ops.dynamic_partition(new_ids, p_assignments, np)
       gather_blocknums = None
       gather_ev_init_value = None
+      gather_counts = None
       if isinstance(params[0], kv_variable_ops.DynamicEmbeddingVariable):
         gather_blocknums = data_flow_ops.dynamic_partition(blocknums, p_assignments, np)
         if ev_init_value is not None:
           gather_ev_init_value = data_flow_ops.dynamic_partition(ev_init_value, p_assignments, np)
+        if counts is not None:
+          gather_counts = data_flow_ops.dynamic_partition(counts, p_assignments, np)
 
       # Similarly, partition the original indices.
pindices = data_flow_ops.dynamic_partition(original_indices, p_assignments, np) @@ -297,7 +300,11 @@ def _embedding_lookup_and_transform(params, new_ev_init_value = None else: new_ev_init_value = gather_ev_init_value[p] - result = array_ops.gather(params[p], pids, ev_init_value=new_ev_init_value, counts=counts) + if counts is None: + new_counts = None + else: + new_counts = gather_counts[p] + result = array_ops.gather(params[p], pids, ev_init_value=new_ev_init_value, counts=new_counts) if transform_fn: # If transform_fn is provided, the clip_by_norm precedes # the transform and hence must be co-located. See below @@ -570,8 +577,8 @@ def embedding_lookup_sparse(params, segment_ids = math_ops.cast(segment_ids, dtypes.int32) ids = sp_ids.values - if isinstance(params[0], kv_variable_ops.EmbeddingVariable) and params[0]._filter_freq > 0: - ids, idx, counts = array_ops.unique_with_counts(ids) + if isinstance(params[0], kv_variable_ops.EmbeddingVariable) and params[0].need_counts(): + ids, idx, counts = array_ops.unique_with_counts(ids, out_idx=dtypes.int64) else: ids, idx = array_ops.unique(ids) counts = None diff --git a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py index 26ae99126b9..58b6083ef24 100644 --- a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py @@ -362,12 +362,12 @@ def runTestFtrl(self, var, g): initializer=init_ops.ones_initializer(dtypes.float32), ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.HBM)), partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) + emb1 = runTestFtrl(self, emb_var, g) with ops.device('/cpu:0'): emb_var2 = variable_scope.get_embedding_variable("var_2", embedding_dim=3, initializer=init_ops.ones_initializer(dtypes.float32), ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.DRAM)), partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - emb1 = runTestFtrl(self, emb_var, g) emb2 = runTestFtrl(self, emb_var2, g) for i in range(0, 6): for j in range(0, 3): diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index 21c6a6fa20e..02272d13a6b 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -41,6 +41,7 @@ from tensorflow.python.training import adagrad_decay from tensorflow.python.training import adagrad_decay_v2 from tensorflow.python.training import gradient_descent +from tensorflow.python.training import weight_decay_optimizers from tensorflow.python.training import saver as saver_module from tensorflow.python.training import training_util from tensorflow.python.ops import variables @@ -53,10 +54,160 @@ from tensorflow.python.saved_model import builder as saved_model_builder from tensorflow.python.saved_model import loader from tensorflow.core.protobuf import config_pb2 as config_pb3 +from tensorflow.python.platform import tf_logging as logging import time import random class EmbeddingVariableTest(test_util.TensorFlowTestCase): + def _CreateOptimizer(self, optimizer): + if optimizer == "Adagrad": + return adagrad.AdagradOptimizer(0.1) + elif optimizer == "AdagradDecay": + gs = training_util.get_or_create_global_step() + return adagrad_decay.AdagradDecayOptimizer(0.1, gs) + elif optimizer == 
"AdagradDecayV2": + gs = training_util.get_or_create_global_step() + return adagrad_decay_v2.AdagradDecayOptimizerV2(0.1, gs) + elif optimizer == "Adam": + return adam.AdamOptimizer(0.1) + elif optimizer == "AdamAsync": + return adam_async.AdamAsyncOptimizer(0.1) + elif optimizer == "FTRL": + return ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) + elif optimizer == "GradientDescent": + return gradient_descent.GradientDescentOptimizer(0.1) + elif optimizer == "AdamW": + return weight_decay_optimizers.AdamWOptimizer(0.01) + else: + logging.fatal("Optimizer name is invalid") + + def _OpitmizerTestTemplate(self, optimizer): + def runTest(self, var): + emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + opt = self._CreateOptimizer(optimizer) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + r, _, _ = sess.run([emb, train_op,loss]) + r, _, _ = sess.run([emb, train_op,loss]) + r, _, _ = sess.run([emb, train_op,loss]) + r, _, _ = sess.run([emb, train_op,loss]) + r, _, _ = sess.run([emb, train_op,loss]) + return r + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) + emb1 = runTest(self, var) + emb_var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) + emb2 = runTest(self, emb_var) + + for i in range(0, 6): + for j in range(0, 3): + self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) + + def _CounterFilterTestTemplate(self, optimizer): + with ops.device("/cpu:0"): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = self._CreateOptimizer(optimizer) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + emb1, top, l = sess.run([emb, train_op, loss]) + for val in emb1.tolist()[0]: + self.assertEqual(val, .0) + emb1, top, l = sess.run([emb, train_op, loss]) + for val in emb1.tolist()[0]: + self.assertNotEqual(val, 1.0) + + def _RecordFreqTestTemplate(self, optimizer): + checkpoint_directory = self.get_temp_dir() + os.environ["TF_RECORD_FREQ"] = "1" + with ops.device("/cpu:0"): + emb_var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32)) + emb = 
embedding_ops.embedding_lookup(emb_var, + math_ops.cast([1, 1, 1, 2, 2, 3], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = self._CreateOptimizer(optimizer) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + model_path = os.path.join(checkpoint_directory, + "model1.ckpt") + with self.test_session() as sess: + sess.run([init]) + sess.run([emb, train_op]) + sess.run([emb, train_op]) + save_path = saver.save(sess, model_path) + for name, shape in checkpoint_utils.list_variables(model_path): + if name == "var_1-freqs": + ckpt_value = checkpoint_utils.load_variable(model_path, name) + self.assertEqual(ckpt_value.tolist()[0], 6) + self.assertEqual(ckpt_value.tolist()[1], 4) + self.assertEqual(ckpt_value.tolist()[2], 2) + os.environ["TF_RECORD_FREQ"] = "0" + + def _RecordVersionTemplate(self, optimizer): + checkpoint_directory = self.get_temp_dir() + os.environ["TF_RECORD_VERSION"] = "1" + with ops.device("/cpu:0"): + emb_var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32)) + emb = embedding_ops.embedding_lookup(emb_var, + math_ops.cast([1, 1, 1, 2, 2, 3], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = self._CreateOptimizer(optimizer) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + model_path = os.path.join(checkpoint_directory, + "model1.ckpt") + with self.test_session() as sess: + sess.run([init]) + sess.run([emb, train_op]) + sess.run([emb, train_op]) + save_path = saver.save(sess, model_path) + for name, shape in checkpoint_utils.list_variables(model_path): + if name == "var_1-versions": + ckpt_value = checkpoint_utils.load_variable(model_path, name) + if "AdagradDecay" in optimizer: + self.assertEqual(ckpt_value.tolist()[0], 2) + self.assertEqual(ckpt_value.tolist()[1], 2) + self.assertEqual(ckpt_value.tolist()[2], 2) + else: + self.assertEqual(ckpt_value.tolist()[0], 1) + self.assertEqual(ckpt_value.tolist()[1], 1) + self.assertEqual(ckpt_value.tolist()[2], 1) + os.environ["TF_RECORD_VERSION"] = "0" + def testSaveVersionWithGlobalStepEviction(self): print("testSaveVersionWithGlobalStepEviction") checkpoint_directory = self.get_temp_dir() @@ -85,6 +236,94 @@ def testSaveVersionWithGlobalStepEviction(self): ckpt_value = checkpoint_utils.load_variable(model_path, name) self.assertEqual(ckpt_value.tolist()[0], 1) + def testFeatureColumnRecordFreqWithPartition(self): + print("testFeatureColumnRecordFreqWithPartition") + os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + columns = feature_column.sparse_column_with_embedding( + column_name="col_emb", + dtype=dtypes.int64, + partition_num=2) + W = feature_column.embedding_column(sparse_id_column=columns, + dimension=3, + initializer=init_ops.ones_initializer(dtypes.float32), + combiner="mean") + ids = {} + ids["col_emb"] = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + with ops.device("/cpu:0"): + emb= feature_column_ops.input_from_feature_columns( + 
columns_to_tensors=ids, feature_columns=[W]) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + model_path = os.path.join(checkpoint_directory, + "model1.ckpt") + with self.test_session() as sess: + sess.run([init]) + sess.run([emb, train_op]) + sess.run([emb, train_op]) + save_path = saver.save(sess, model_path) + for name, shape in checkpoint_utils.list_variables(model_path): + if "part_0-freqs" in name and name.endswith("-freqs"): + ckpt_value = checkpoint_utils.load_variable(model_path, name) + self.assertEqual(ckpt_value.tolist()[0], 6) + self.assertEqual(ckpt_value.tolist()[1], 2) + elif "part_1-freqs" in name and name.endswith("-freqs"): + ckpt_value = checkpoint_utils.load_variable(model_path, name) + self.assertEqual(ckpt_value.tolist()[0], 4) + os.environ["TF_RECORD_FREQ"] = "0" + + def testFeatureColumnRecordFreqSGDWithPartition(self): + print("testFeatureColumnRecordFreqSGDWithPartition") + os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + columns = feature_column.sparse_column_with_embedding( + column_name="col_emb", + dtype=dtypes.int64, + partition_num=2) + W = feature_column.embedding_column(sparse_id_column=columns, + dimension=3, + initializer=init_ops.ones_initializer(dtypes.float32), + combiner="mean") + ids = {} + ids["col_emb"] = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + with ops.device("/cpu:0"): + emb= feature_column_ops.input_from_feature_columns( + columns_to_tensors=ids, feature_columns=[W]) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + opt = gradient_descent.GradientDescentOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + model_path = os.path.join(checkpoint_directory, + "model1.ckpt") + with self.test_session() as sess: + sess.run([init]) + sess.run([emb, train_op]) + sess.run([emb, train_op]) + save_path = saver.save(sess, model_path) + for name, shape in checkpoint_utils.list_variables(model_path): + if "part_0-freqs" in name and name.endswith("-freqs"): + ckpt_value = checkpoint_utils.load_variable(model_path, name) + self.assertEqual(ckpt_value.tolist()[0], 6) + self.assertEqual(ckpt_value.tolist()[1], 2) + elif "part_1-freqs" in name and name.endswith("-freqs"): + ckpt_value = checkpoint_utils.load_variable(model_path, name) + self.assertEqual(ckpt_value.tolist()[0], 4) + os.environ["TF_RECORD_FREQ"] = "0" + def testDynamicDimensionEmbeddingVariable(self): print("testDynamicDimensionEmbeddingVariable") def runTestAdagradDecay(self, var, g): @@ -221,15 +460,20 @@ def testEmbeddingVariableForExport(self): var = variable_scope.get_embedding_variable("var_1", embedding_dim=3, initializer=init_ops.ones_initializer(dtypes.float32), steps_to_live=10000, ev_option=ev_config) emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + fun = math_ops.multiply(emb, 0.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + opt = adam.AdamOptimizer(0.01) + g_v = opt.compute_gradients(loss) + gs = training_util.get_or_create_global_step() + train_op = 
opt.apply_gradients(g_v, gs) init = variables.global_variables_initializer() keys, values, versions, freqs = var.export() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) - sess.run(emb) - sess.run(emb) - sess.run(emb) + sess.run(train_op) + sess.run(train_op) fetches = sess.run([keys, values, versions, freqs]) print(fetches) self.assertAllEqual([0, 1, 2, 5, 6, 7], fetches[0]) @@ -239,7 +483,7 @@ def testEmbeddingVariableForExport(self): [1., 1., 1.], [1., 1., 1.], [1., 1., 1.]], fetches[1]) - self.assertAllEqual([-1, -1, -1, -1, -1, -1], fetches[2]) + self.assertAllEqual([1, 1, 1, 1, 1, 1], fetches[2]) self.assertAllEqual([1, 1, 1, 1, 1, 1], fetches[3]) def testEmbeddingVariableForGetShape(self): @@ -249,13 +493,18 @@ def testEmbeddingVariableForGetShape(self): embedding_dim = 3, initializer=init_ops.ones_initializer(dtypes.float32)) emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + opt = adam.AdamOptimizer(0.01) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) shape = var.total_count() init = variables.global_variables_initializer() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) - sess.run([emb]) + sess.run([train_op]) self.assertAllEqual([6, 3], sess.run(shape)) def testEmbeddingVariableForMultiHashFunction(self): @@ -788,276 +1037,119 @@ def testEmbeddingVariableForBloomFilterInt16(self): def testEmbeddingVariableForAdagradDecayFilter(self): print("testEmbeddingVariableForAdagradDecayFilter") - var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) - emb = embedding_ops.embedding_lookup(var, math_ops.cast([1], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - emb1, top, l = sess.run([emb, train_op, loss]) - emb1, top, l = sess.run([emb, train_op, loss]) - emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertEqual(val, .0) - emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertNotEqual(val, 1.0) + self._CounterFilterTestTemplate("AdagradDecay") + + def testEmbeddingVariableForAdagradFilter(self): + print("testEmbeddingVariableForAdagradFilter") + self._CounterFilterTestTemplate("Adagrad") def testEmbeddingVariableForFtrlFilter(self): print("testEmbeddingVariableForFtrlFilter") - with ops.device('/cpu:0'): - var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), - 
initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - #var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb = embedding_ops.embedding_lookup(var, math_ops.cast([1], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - emb1, top, l = sess.run([emb, train_op, loss]) - emb1, top, l = sess.run([emb, train_op, loss]) - emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertEqual(val, .0) - emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertNotEqual(val, 1.0) + self._CounterFilterTestTemplate("FTRL") def testEmbeddingVariableForAdamAsyncFilter(self): print("testEmbeddingVariableForAdamAsynsFilter") - with ops.device('/cpu:0'): - var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - emb = embedding_ops.embedding_lookup(var, math_ops.cast([1], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adam_async.AdamAsyncOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - emb1, top, l = sess.run([emb, train_op, loss]) - emb1, top, l = sess.run([emb, train_op, loss]) - emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertEqual(val, .0) - emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertNotEqual(val, 1.0) + self._CounterFilterTestTemplate("AdamAsync") def testEmbeddingVariableForGradientDescentFilter(self): print("testEmbeddingVariableForGradientDescentFilter") - var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - emb = embedding_ops.embedding_lookup(var, math_ops.cast([1], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = gradient_descent.GradientDescentOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session() as sess: - 
sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - emb1, top, l = sess.run([emb, train_op, loss]) - emb1, top, l = sess.run([emb, train_op, loss]) - emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertEqual(val, .0) - emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertNotEqual(val, 1.0) + self._CounterFilterTestTemplate("GradientDescent") def testEmbeddingVariableForAdagradDecayV2Filter(self): print("testEmbeddingVariableForAdagradDecayV2Filter") - var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - emb = embedding_ops.embedding_lookup(var, math_ops.cast([1], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad_decay_v2.AdagradDecayOptimizerV2(0.1, gs) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - emb1, top, l = sess.run([emb, train_op, loss]) - emb1, top, l = sess.run([emb, train_op, loss]) - emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertEqual(val, .0) - emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertNotEqual(val, 1.0) + self._CounterFilterTestTemplate("AdagradDecayV2") def testEmbeddingVariableForAdamFilter(self): print("testEmbeddingVariableForAdamFilter") - with ops.device("/cpu:0"): - var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - emb = embedding_ops.embedding_lookup(var, math_ops.cast([1], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adam.AdamOptimizer(0.1, gs) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - emb1, top, l = sess.run([emb, train_op, loss]) - emb1, top, l = sess.run([emb, train_op, loss]) - emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertEqual(val, .0) - emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertNotEqual(val, 1.0) + self._CounterFilterTestTemplate("Adam") + + def testEmbeddingVariableForAdamWFilter(self): + print("testEmbeddingVariableForAdamWFilter") + self._CounterFilterTestTemplate("AdamW") def testEmbeddingVariableForGradientDescent(self): print("testEmbeddingVariableForGradientDescent") - def runTestGradientDescent(self, var): - emb = 
embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = gradient_descent.GradientDescentOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestGradientDescent(self, emb_var) - emb2 = runTestGradientDescent(self, var) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) + self._OpitmizerTestTemplate("GradientDescent") def testEmbeddingVariableForAdagrad(self): print("testEmbeddingVariableForAdagrad") - def runTestAdagrad(self, var): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad.AdagradOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdagrad(self, emb_var) - emb2 = runTestAdagrad(self, var) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) + self._OpitmizerTestTemplate("Adagrad") def testEmbeddingVariableForAdagradDecay(self): print("testEmbeddingVariableForAdagradDecay") - with ops.device('/cpu:0'): - def runTestAdagradDecay(self, var): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session() as sess: 
- sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - emb_var = variable_scope.get_embedding_variable("var_1", - initializer=init_ops.ones_initializer(dtypes.float32), - embedding_dim = 3, - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdagradDecay(self, emb_var) - emb2 = runTestAdagradDecay(self, var) + self._OpitmizerTestTemplate("AdagradDecay") - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) + def testEmbeddingVariableForAdagradDecayV2(self): + print("testEmbeddingVariableForAdagradDecayV2") + self._OpitmizerTestTemplate("AdagradDecayV2") + + def testEmbeddingVariableForAdam(self): + print("testEmbeddingVariableForAdam") + self._OpitmizerTestTemplate("Adam") + + def testEmbeddingVariableForAdamAsync(self): + print("testEmbeddingVariableForAdamAsync") + self._OpitmizerTestTemplate("AdamAsync") + + def testEmbeddingVariableForAdagradDecayRecrodFreq(self): + print("testEmbeddingVariableForAdagradDecayRecrodFreq") + self._RecordFreqTestTemplate("AdagradDecay") + + def testEmbeddingVariableForAdagradRecordFreq(self): + print("testEmbeddingVariableForAdagradRecordFreq") + self._RecordFreqTestTemplate("Adagrad") + + def testEmbeddingVariableForFtrlRecrodFreq(self): + print("testEmbeddingVariableForFtrlRecrodFreq") + self._RecordFreqTestTemplate("FTRL") + + def testEmbeddingVariableForAdamAsyncRecrodFreq(self): + print("testEmbeddingVariableForAdamAsyncRecrodFreq") + self._RecordFreqTestTemplate("AdamAsync") + + def testEmbeddingVariableForGradientDescentRecrodFreq(self): + print("testEmbeddingVariableForGradientDescentRecrodFreq") + self._RecordFreqTestTemplate("GradientDescent") + + def testEmbeddingVariableForAdagradDecayV2RecrodFreq(self): + print("testEmbeddingVariableForAdagradDecayV2RecrodFreq") + self._RecordFreqTestTemplate("AdagradDecayV2") + + def testEmbeddingVariableForAdamRecrodFreq(self): + print("testEmbeddingVariableForAdamRecrodFreq") + self._RecordFreqTestTemplate("Adam") + + def testEmbeddingVariableForAdamWRecrodFreq(self): + print("testEmbeddingVariableForAdamWRecrodFreq") + self._RecordFreqTestTemplate("AdamW") + + def testEmbeddingVariableForAdagradDecayRecrodVersion(self): + print("testEmbeddingVariableForAdagradDecayRecrodVersion") + self._RecordFreqTestTemplate("AdagradDecay") + + def testEmbeddingVariableForAdagradRecordVersion(self): + print("testEmbeddingVariableForAdagradRecordVersion") + self._RecordFreqTestTemplate("Adagrad") + + def testEmbeddingVariableForAdamAsyncRecrodVersion(self): + print("testEmbeddingVariableForAdamAsyncRecrodVersion") + self._RecordFreqTestTemplate("AdamAsync") + + def testEmbeddingVariableForGradientDescentRecrodVersion(self): + print("testEmbeddingVariableForGradientDescentRecrodVersion") + self._RecordFreqTestTemplate("GradientDescent") + + def testEmbeddingVariableForAdagradDecayV2RecrodVersion(self): + print("testEmbeddingVariableForAdagradDecayV2RecrodVersion") + self._RecordFreqTestTemplate("AdagradDecayV2") + + def testEmbeddingVariableForAdamRecrodVersion(self): + 
print("testEmbeddingVariableForAdamRecrodVersion") + self._RecordFreqTestTemplate("Adam") + + def testEmbeddingVariableForAdamWRecrodVersion(self): + print("testEmbeddingVariableForAdamWRecrodVersion") + self._RecordFreqTestTemplate("AdamW") def testEmbeddingVariableWeightedCategoricalColumn(self): print("testEmbeddingVariableWeightedCategoricalColumn") @@ -1149,145 +1241,6 @@ def runTestColumn(W): for j in range(0, 3): self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - def testEmbeddingVariableForAdagradDecayV2(self): - print("testEmbeddingVariableForAdagradDecayV2") - with ops.device('/cpu:0'): - def runTestAdagradDecayV2(self, var): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad_decay_v2.AdagradDecayOptimizerV2(0.1, gs) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdagradDecayV2(self, emb_var) - emb2 = runTestAdagradDecayV2(self, var) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - def testEmbeddingVariableForAdam(self): - print("testEmbeddingVariableForAdam") - def runTestAdam(self, var): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adam.AdamOptimizer(0.1, gs) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.device("/cpu:0"): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - var = variable_scope.get_variable("var_2", shape=[8, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdam(self, emb_var) - emb2 = runTestAdam(self, var) - - print(emb1.tolist()) - print(emb2.tolist()) - for i in range(0, 6): - for j in range(0, 3): - self.assertAlmostEqual(emb1.tolist()[i][j], emb2.tolist()[i][j], delta=1e-05) - - def 
testEmbeddingVariableForAdamAsync(self): - print("testEmbeddingVariableForAdamAsync") - def runTestAdamAsync(self, var): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adam_async.AdamAsyncOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.device("/cpu:0"): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=2)) - var = variable_scope.get_variable("var_2", shape=[8, 3], - initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdamAsync(self, emb_var) - emb2 = runTestAdamAsync(self, var) - - for i in range(0, 6): - for j in range(0, 3): - self.assertAllCloseAccordingToType(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - - def testEmbeddingVariableForFtrl(self): - print("testEmbeddingVariableForFtrl") - def runTestAdam(self, var): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.device("/cpu:0"): - emb_var = variable_scope.get_embedding_variable("var_1", embedding_dim=3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdam(self, emb_var) - emb2 = runTestAdam(self, var) - - #for i in range(0, 6): - # for j in range(0, 3): - # self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - def testEmbeddingVariableForAdagradDecayStep(self): print("testEmbeddingVariableForAdagradDecayStep") var = variable_scope.get_embedding_variable("var_1", @@ -1481,13 +1434,18 @@ def testEVInitializerWithKeyFetch(self): ev_option=ev_option) var_emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,3,4,5,6,7], dtypes.int64)) emb_emb = embedding_ops.embedding_lookup(emb_var, math_ops.cast([0,1,2,5,6,7,8,9,10], dtypes.int64)) + fun = math_ops.multiply(emb_emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, 
name='reduce_sum') + opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) emb1 = sess.run(var_emb) - emb2 = sess.run(emb_emb) + emb2, _ = sess.run([emb_emb, train_op]) self.assertListEqual(emb1.tolist()[0], emb2.tolist()[0]) self.assertListEqual(emb1.tolist()[1], emb2.tolist()[1]) self.assertListEqual(emb1.tolist()[2], emb2.tolist()[2]) @@ -1499,16 +1457,6 @@ def testEVInitializerWithKeyFetch(self): self.assertListEqual(emb1.tolist()[2], emb2.tolist()[8]) def testEVInitializerWithCounterFeatureFilter(self): - def testembedding(emb1, emb2): - is_match = 0 - for i in range(8): - for j in range(3): - if emb1.tolist()[i][j] != emb2.tolist()[3][j]: - break - if j == 2: - is_match = 1 - return is_match - print("testEVInitializerWithCounterFeatureFilter") with ops.Graph().as_default() as g, ops.device('/cpu:0'): var = variable_scope.get_variable("var", shape=[8,3], @@ -1521,33 +1469,28 @@ def testembedding(emb1, emb2): ev_option=ev_option) var_emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,3,4,5,6,7], dtypes.int64)) emb_emb = embedding_ops.embedding_lookup(emb_var, math_ops.cast([3], dtypes.int64)) + fun = math_ops.multiply(emb_emb, 0.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) emb1 = np.zeros([8,3]) - emb2 = sess.run(emb_emb) + emb2, _ = sess.run([emb_emb, train_op]) self.assertListEqual(emb1.tolist()[3], emb2.tolist()[0]) - emb2 = sess.run(emb_emb) + emb2, _ = sess.run([emb_emb, train_op]) self.assertListEqual(emb1.tolist()[3], emb2.tolist()[0]) - emb2 = sess.run(emb_emb) + emb2, _ = sess.run([emb_emb, train_op]) self.assertListEqual(emb1.tolist()[3], emb2.tolist()[0]) emb1 = sess.run(var_emb) emb2 = sess.run(emb_emb) self.assertListEqual(emb1.tolist()[3], emb2.tolist()[0]) def testEVInitializerWithBloomFeatureFilter(self): - def testembedding(emb1, emb2): - is_match = 0 - for i in range(8): - for j in range(3): - if emb1.tolist()[i][j] != emb2.tolist()[0][j]: - break - if j == 2: - is_match = 1 - return is_match - print("testEVInitializerWithBloomFeatureFilter") with ops.Graph().as_default() as g, ops.device('/cpu:0'): var = variable_scope.get_variable("var", shape=[8,3], @@ -1563,17 +1506,22 @@ def testembedding(emb1, emb2): ev_option=ev_option) var_emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,3,4,5,6,7], dtypes.int64)) emb_emb = embedding_ops.embedding_lookup(emb_var, math_ops.cast([3], dtypes.int64)) + fun = math_ops.multiply(emb_emb, 0.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) emb1 = 
np.zeros([8,3]) - emb2 = sess.run(emb_emb) + emb2, _ = sess.run([emb_emb, train_op]) self.assertListEqual(emb1.tolist()[3], emb2.tolist()[0]) - emb2 = sess.run(emb_emb) + emb2, _ = sess.run([emb_emb, train_op]) self.assertListEqual(emb1.tolist()[3], emb2.tolist()[0]) - emb2 = sess.run(emb_emb) + emb2, _ = sess.run([emb_emb, train_op]) self.assertListEqual(emb1.tolist()[3], emb2.tolist()[0]) emb1 = sess.run(var_emb) emb2 = sess.run(emb_emb) @@ -1593,7 +1541,7 @@ def runTest(self, var, g): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) + r, _, _ = sess.run([emb, train_op, loss]) r, _, _ = sess.run([emb, train_op, loss]) r, _, _ = sess.run([emb, train_op, loss]) r, _, _ = sess.run([emb, train_op, loss]) @@ -2083,14 +2031,13 @@ def testEmbeddingVariableForDefaultValueNoPermission(self): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) - emb1 = sess.run(emb) - emb1 = sess.run(emb) - emb1 = sess.run(emb) + emb1, _ = sess.run([emb, train_op]) + emb1, _ = sess.run([emb, train_op]) for val in emb1.tolist()[0]: self.assertAlmostEqual(val, .2, delta=1e-05) - emb1 = sess.run(emb) + emb1, _ = sess.run([emb, train_op]) for val in emb1.tolist()[0]: - self.assertEqual(val, .0) + self.assertNotEqual(val, .2) def testEmbeddingVariableForGetFrequencyAndVersion(self): print("testEmbeddingVariableForGetFrequencyAndVersion") @@ -2136,7 +2083,6 @@ def testEmbeddingVariableForInference(self): # modify graph for infer # emb.op.inputs[0].op.inputs[0].op._set_attr("is_inference", attr_value_pb2.AttrValue(b=True)) # set environment - os.environ["INFERENCE_MODE"] = "1" fun = math_ops.multiply(emb, 2.0, name='multiply') loss = math_ops.reduce_sum(fun, name='reduce_sum') init = variables.global_variables_initializer() @@ -2146,7 +2092,6 @@ def testEmbeddingVariableForInference(self): sess.run([emb, loss], feed_dict={'ids:0': [1,3,5]}) sess.run([emb, loss], feed_dict={'ids:0': [1,5,7]}) s = sess.run(shape) - del os.environ["INFERENCE_MODE"] self.assertAllEqual(np.array([0,3]), s) def testEmbeddingVariableForLookupTier(self): @@ -2157,7 +2102,7 @@ def testEmbeddingVariableForLookupTier(self): storage_opt = variables.StorageOption( storage_type=config_pb2.StorageType.DRAM_SSDHASH, storage_path=db_directory, - storage_size=[256]) + storage_size=[512]) ev_option = variables.EmbeddingVariableOption(storage_option=storage_opt) partitioner = partitioned_variables.fixed_size_partitioner(num_shards=2) with ops.device('/cpu:0'): @@ -2168,19 +2113,26 @@ def testEmbeddingVariableForLookupTier(self): ev_option = ev_option) ids = array_ops.placeholder(dtype=dtypes.int64, name='ids') emb = embedding_ops.embedding_lookup(emb_var, ids) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) init = variables.global_variables_initializer() tires = kv_variable_ops.lookup_tier(emb_var, math_ops.cast([1,2,3,4,5,6], dtypes.int64)) with self.test_session() as sess: sess.run([init]) - sess.run(emb, {ids:[1,2,3]}) - sess.run(emb, {ids:[1,2,4]}) - sess.run(emb, {ids:[1,2,2]}) - sess.run(emb, {ids:[1,2,5]}) + sess.run(train_op, {ids:[1,2,3]}) + sess.run(train_op, {ids:[1,2,4]}) + sess.run(train_op, {ids:[1,2,2]}) + 
sess.run(train_op, {ids:[1,2,5]}) result = sess.run(tires) del os.environ["TF_SSDHASH_ASYNC_COMPACTION"] del os.environ["TF_MULTI_TIER_EV_EVICTION_THREADS"] + print(result) for i in range(0, 6): if i == 2: self.assertEqual(result[i], 1) @@ -2188,6 +2140,11 @@ def testEmbeddingVariableForLookupTier(self): self.assertEqual(result[i], -1) else: self.assertEqual(result[i], 0) + sess.run(emb, {ids:[3]}) + result = sess.run(tires) + print(result) + for i in range(0, 5): + self.assertEqual(result[i], 0) @test_util.run_gpu_only def testEmbeddingVariableForHBMandDRAM(self): @@ -2412,6 +2369,54 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self): name == "var_1-freqs_filtered": self.assertEqual(0, shape[0]) del os.environ["TF_EV_SAVE_FILTERED_FEATURES"] + + def testEmbeddingVariableForMultiTierInference(self): + print("testEmbeddingVariableForMultiTierInference") + checkpoint_directory = self.get_temp_dir() + os.environ["TF_SSDHASH_ASYNC_COMPACTION"]="0" + os.environ["TF_RECORD_FREQ"] = "1" + with ops.Graph().as_default() as g, ops.device("/cpu:0"): + var = variable_scope.get_embedding_variable("var", embedding_dim=30) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1,2,2,3,4], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session() as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt")) + + with ops.Graph().as_default() as g, ops.device("/cpu:0"): + db_directory = self.get_temp_dir() + storage_opt = variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM_SSDHASH, + storage_path=db_directory, + storage_size=[256]) + ev_option = variables.EmbeddingVariableOption(storage_option=storage_opt) + emb_var = variable_scope.get_embedding_variable("var", + embedding_dim = 30, + initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = ev_option) + ids = array_ops.placeholder(dtype=dtypes.int64, name='ids') + emb = embedding_ops.embedding_lookup(emb_var, ids) + tires = kv_variable_ops.lookup_tier(emb_var, + math_ops.cast([1,2,3,4], dtypes.int64)) + saver = saver_module.Saver() + graph = ops.get_default_graph() + with self.test_session(graph = graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt")) + result = sess.run(tires) + self.assertAllEqual(result, [0, 0, 1, 1]) + sess.run(emb, feed_dict={ids:[3, 3]}) + result = sess.run(tires) + self.assertAllEqual(result, [0, 1, 0, 1]) + del os.environ["TF_SSDHASH_ASYNC_COMPACTION"] + del os.environ["TF_RECORD_FREQ"] def testEmbeddingVariableCustomDimForSaveAndRestore(self): print("testEmbeddingVariableCustomForSaveAndRestore") @@ -2431,6 +2436,13 @@ def testEmbeddingVariableCustomDimForSaveAndRestore(self): emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + fun = math_ops.multiply(emb, 0.0, name='multiply') + fun1 = math_ops.multiply(emb2, 0.0, name='multiply_1') + loss = math_ops.reduce_sum(fun + fun1, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = 
opt.apply_gradients(g_v, gs) saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() graph = ops.get_default_graph() @@ -2438,6 +2450,7 @@ def testEmbeddingVariableCustomDimForSaveAndRestore(self): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) + sess.run(train_op) emb_ori = sess.run(emb) emb_ori_2 = sess.run(emb2) save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) @@ -2466,7 +2479,7 @@ def testEmbeddingVariableCustomDimForSaveAndRestore(self): emb_val_2 = sess.run(emb2) self.assertAllEqual(emb_ori, emb_val[:,0:3]) self.assertAllEqual(emb_ori_2, emb_val_2[:,0:3]) - + with ops.Graph().as_default() as g, ops.device('/cpu:0'): var = variable_scope.get_embedding_variable("var_1", embedding_dim = 2, diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index e5c61cf9878..625e02a757b 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -367,6 +367,7 @@ def _init_from_args(self, self._handle_name = handle_name + ":0" self._dtype = initial_value.dtype.base_dtype self._constraint = constraint + self._gather_op = None if self._is_primary: self._slot_num = 0 else: @@ -379,7 +380,7 @@ def _init_from_args(self, if initial_value is not None: with ops.name_scope("Assign") as n, ops.colocate_with(self._handle): with ops.control_dependencies(None if self._is_primary else [self._primary.initializer]): - self._init_op = gen_kv_variable_ops.initialize_kv_variable_op( + self._init_op = gen_kv_variable_ops.initialize_kv_variable_v2_op( self._handle, self._primary._handle, variables._try_guard_against_uninitialized_dependencies(name, initial_value), @@ -405,6 +406,7 @@ def _init_from_args(self, default_value_no_permission = self._default_value_no_permission, record_freq = self._record_freq, record_version = self._record_version, + embedding_variable_type=config_pb2.EmbeddingVariableType.IMMUTABLE, name=n) set_attr_ops = [] @@ -418,8 +420,8 @@ def is_multi_tier(storage_type): config_pb2.StorageType.DRAM_PMEM_SSDHASH, config_pb2.StorageType.HBM_DRAM_SSDHASH] return storage_type in multi_level_list - - if self._is_primary and is_multi_tier(self._storage_type): + self._is_multi_tier = is_multi_tier(self._storage_type) + if self._is_primary and self._is_multi_tier: with ops.control_dependencies([self._init_op]): self._set_cache_strategy_op = gen_kv_variable_ops.kv_resource_init_cache_strategy_op( self._handle, @@ -441,6 +443,12 @@ def is_multi_tier(storage_type): def export(self): return gen_kv_variable_ops.kv_resource_export(self._handle, Tkeys=self._invalid_key_type) + def need_counts(self): + return (self._record_freq or (self._filter_freq > 0) or self._is_multi_tier) + @property + def gather_op(self): + return self._gather_op + def _init_from_proto(self, variable_def, import_scope=None): """Initializes from `VariableDef` proto.""" # Note that init_from_proto is currently not supported in Eager mode. 
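# --- Illustrative sketch (not part of the patch) --------------------------
# How the new need_counts() hook above is meant to be consumed. The real
# consumer is optimizer._resource_apply_sparse_duplicate_indices, changed
# further below in this patch: when an EmbeddingVariable records
# frequencies, applies a frequency filter, or sits on multi-tier storage,
# the optimizer forwards per-key occurrence counts so the *_with_counts
# kernels can update those statistics in the same apply pass. Pure-Python
# stand-ins are used here instead of TF ops; the real code also sums the
# gradient rows per unique id via unsorted_segment_sum.
from collections import Counter

def dedup_indices_with_counts(indices):
  # Mirrors array_ops.unique_with_counts: unique keys in first-seen order,
  # plus how often each key occurred in the original lookup batch.
  counts = Counter(indices)
  unique = list(dict.fromkeys(indices))
  return unique, [counts[k] for k in unique]

def sparse_apply_inputs(var, indices):
  # Hypothetical helper: decide whether counts must accompany the update.
  if var.need_counts():
    unique, counts = dedup_indices_with_counts(indices)
    return unique, counts                    # feed a *_with_counts kernel
  return list(dict.fromkeys(indices)), None  # plain kernel, no counts

# Example: with ids [3, 1, 3, 3] and need_counts() true,
# sparse_apply_inputs returns ([3, 1], [3, 1]).
# ---------------------------------------------------------------------------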
@@ -465,7 +473,8 @@ def _init_from_proto(self, variable_def, import_scope=None): cache_op = None if self._initializer_op.type == "NoOp": for op in self._initializer_op.control_inputs: - if op.type == "InitializeKvVariableOp": + if op.type == "InitializeKvVariableOp" or \ + op.type == "InitializeKvVariableV2Op": init_op = op self._init_op = op elif op.type == "KvResourceSetCacheStrategyOp": @@ -788,13 +797,16 @@ def sparse_read(self, indices, name=None, ev_init_value=None, counts=None): value = gen_kv_variable_ops.kv_resource_gather_v1(self._handle, indices, default_value, - counts, name=name) + counts, is_inference=True, + name=name) else: value = gen_kv_variable_ops.kv_resource_gather(self._handle, indices, default_value, is_use_default_value_tensor, + is_inference=True, name=name) + self._counts_tensor = counts return array_ops.identity(value) def to_proto(self, export_scope=None): diff --git a/tensorflow/python/training/adagrad.py b/tensorflow/python/training/adagrad.py index 46eec12748a..7f4b0f72ae4 100644 --- a/tensorflow/python/training/adagrad.py +++ b/tensorflow/python/training/adagrad.py @@ -136,18 +136,29 @@ def _hash_table_apply_sparse(self, grad, var, indices): indices, use_locking=self._use_locking) - def _resource_apply_sparse(self, grad, var, indices): + def _resource_apply_sparse(self, grad, var, indices, indices_counts=None): acc = self.get_slot(var, "accumulator") if isinstance(var, kv_variable_ops.EmbeddingVariable): global_step = training_util.get_or_create_global_step() - return training_ops.kv_resource_sparse_apply_adagrad( - var.handle, - acc.handle, - math_ops.cast(self._learning_rate_tensor, grad.dtype), - grad, - indices, - global_step, - use_locking=self._use_locking) + if indices_counts != None: + return training_ops.kv_resource_sparse_apply_adagrad_with_counts( + var.handle, + acc.handle, + math_ops.cast(self._learning_rate_tensor, grad.dtype), + grad, + indices, + global_step, + indices_counts, + use_locking=self._use_locking) + else: + return training_ops.kv_resource_sparse_apply_adagrad( + var.handle, + acc.handle, + math_ops.cast(self._learning_rate_tensor, grad.dtype), + grad, + indices, + global_step, + use_locking=self._use_locking) else: return training_ops.resource_sparse_apply_adagrad( var.handle, diff --git a/tensorflow/python/training/adagrad_decay.py b/tensorflow/python/training/adagrad_decay.py index a2c7e4900c5..6f3bc0f211f 100644 --- a/tensorflow/python/training/adagrad_decay.py +++ b/tensorflow/python/training/adagrad_decay.py @@ -191,24 +191,39 @@ def _apply_sparse(self, grad, var): grad.indices, use_locking=self._use_locking) - def _resource_apply_sparse(self, grad, var, indices): + def _resource_apply_sparse(self, grad, var, indices, indices_counts=None): acc = self.get_slot(var, "accumulator") acc_decay_power = self.get_slot(var, "accumulator_decay_power") with ops.device(var.device): global_step = array_ops.identity(self._global_step_on_worker) if isinstance(var, kv_variable_ops.EmbeddingVariable): - return training_ops.kv_resource_sparse_apply_adagrad_decay( - var.handle, - acc.handle, - acc_decay_power.handle, - math_ops.cast(self._learning_rate_tensor, grad.dtype), - self._accumulator_decay_step_tensor, - math_ops.cast(self._accumulator_decay_rate_tensor, grad.dtype.base_dtype), - math_ops.cast(self._accumulator_baseline_tensor, grad.dtype.base_dtype), - global_step, - grad, - indices, - use_locking=self._use_locking) + if indices_counts != None: + return training_ops.kv_resource_sparse_apply_adagrad_decay_with_counts( + var.handle, + 
acc.handle, + acc_decay_power.handle, + math_ops.cast(self._learning_rate_tensor, grad.dtype), + self._accumulator_decay_step_tensor, + math_ops.cast(self._accumulator_decay_rate_tensor, grad.dtype.base_dtype), + math_ops.cast(self._accumulator_baseline_tensor, grad.dtype.base_dtype), + global_step, + grad, + indices, + indices_counts, + use_locking=self._use_locking) + else: + return training_ops.kv_resource_sparse_apply_adagrad_decay( + var.handle, + acc.handle, + acc_decay_power.handle, + math_ops.cast(self._learning_rate_tensor, grad.dtype), + self._accumulator_decay_step_tensor, + math_ops.cast(self._accumulator_decay_rate_tensor, grad.dtype.base_dtype), + math_ops.cast(self._accumulator_baseline_tensor, grad.dtype.base_dtype), + global_step, + grad, + indices, + use_locking=self._use_locking) else: return training_ops.resource_sparse_apply_adagrad_decay( var.handle, diff --git a/tensorflow/python/training/adagrad_decay_v2.py b/tensorflow/python/training/adagrad_decay_v2.py index 082dab07602..d7d3c80c095 100644 --- a/tensorflow/python/training/adagrad_decay_v2.py +++ b/tensorflow/python/training/adagrad_decay_v2.py @@ -192,24 +192,39 @@ def _apply_sparse(self, grad, var): grad.indices, use_locking=self._use_locking) - def _resource_apply_sparse(self, grad, var, indices): + def _resource_apply_sparse(self, grad, var, indices, indices_counts = None): acc = self.get_slot(var, "accumulator") acc_decay_power = self.get_slot(var, "accumulator_decay_power") with ops.device(var.device): global_step = array_ops.identity(self._global_step_on_worker) if isinstance(var, kv_variable_ops.EmbeddingVariable): - return training_ops.kv_resource_sparse_apply_adagrad_decay( - var.handle, - acc.handle, - acc_decay_power.handle, - math_ops.cast(self._learning_rate_tensor, grad.dtype), - self._accumulator_decay_step_tensor, - math_ops.cast(self._accumulator_decay_rate_tensor, grad.dtype.base_dtype), - math_ops.cast(self._accumulator_baseline_tensor, grad.dtype.base_dtype), - global_step, - grad, - indices, - use_locking=self._use_locking) + if indices_counts != None: + return training_ops.kv_resource_sparse_apply_adagrad_decay_with_counts( + var.handle, + acc.handle, + acc_decay_power.handle, + math_ops.cast(self._learning_rate_tensor, grad.dtype), + self._accumulator_decay_step_tensor, + math_ops.cast(self._accumulator_decay_rate_tensor, grad.dtype.base_dtype), + math_ops.cast(self._accumulator_baseline_tensor, grad.dtype.base_dtype), + global_step, + grad, + indices, + indices_counts, + use_locking=self._use_locking) + else: + return training_ops.kv_resource_sparse_apply_adagrad_decay( + var.handle, + acc.handle, + acc_decay_power.handle, + math_ops.cast(self._learning_rate_tensor, grad.dtype), + self._accumulator_decay_step_tensor, + math_ops.cast(self._accumulator_decay_rate_tensor, grad.dtype.base_dtype), + math_ops.cast(self._accumulator_baseline_tensor, grad.dtype.base_dtype), + global_step, + grad, + indices, + use_locking=self._use_locking) else: return training_ops.resource_sparse_apply_adagrad_decay( var.handle, diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py index fa998f6cb7f..c45365a3456 100644 --- a/tensorflow/python/training/adam.py +++ b/tensorflow/python/training/adam.py @@ -266,21 +266,34 @@ def _resource_scatter_add(self, x, i, v): [resource_variable_ops.resource_scatter_add(x.handle, i, v)]): return x.value() - def _resource_apply_sparse(self, grad, var, indices): + def _resource_apply_sparse(self, grad, var, indices, indices_counts = None): m = 
self.get_slot(var, "m") v = self.get_slot(var, "v") beta1_power, beta2_power = self._get_beta_accumulators() if isinstance(var, kv_variable_ops.EmbeddingVariable): global_step = training_util.get_or_create_global_step() - return training_ops.kv_resource_sparse_apply_adam( - var.handle, m.handle, v.handle, - math_ops.cast(beta1_power, grad.dtype), - math_ops.cast(beta2_power, grad.dtype), - math_ops.cast(self._lr_t, grad.dtype), - math_ops.cast(self._beta1_t, grad.dtype), - math_ops.cast(self._beta2_t, grad.dtype), - math_ops.cast(self._epsilon_t, grad.dtype), - grad, indices, global_step, use_locking=self._use_locking) + if indices_counts != None: + return training_ops.kv_resource_sparse_apply_adam_with_counts( + var.handle, m.handle, v.handle, + math_ops.cast(beta1_power, grad.dtype), + math_ops.cast(beta2_power, grad.dtype), + math_ops.cast(self._lr_t, grad.dtype), + math_ops.cast(self._beta1_t, grad.dtype), + math_ops.cast(self._beta2_t, grad.dtype), + math_ops.cast(self._epsilon_t, grad.dtype), + grad, indices, global_step, indices_counts, + use_locking=self._use_locking) + else: + return training_ops.kv_resource_sparse_apply_adam( + var.handle, m.handle, v.handle, + math_ops.cast(beta1_power, grad.dtype), + math_ops.cast(beta2_power, grad.dtype), + math_ops.cast(self._lr_t, grad.dtype), + math_ops.cast(self._beta1_t, grad.dtype), + math_ops.cast(self._beta2_t, grad.dtype), + math_ops.cast(self._epsilon_t, grad.dtype), + grad, indices, global_step, + use_locking=self._use_locking) else: return self._resource_apply_sparse_shared(grad, var, indices, self._resource_scatter_add) diff --git a/tensorflow/python/training/adam_async.py b/tensorflow/python/training/adam_async.py index d72e9c98eb6..d2d5b75f928 100644 --- a/tensorflow/python/training/adam_async.py +++ b/tensorflow/python/training/adam_async.py @@ -210,21 +210,32 @@ def _hash_table_apply_sparse(self, grad, var, indices): use_locking=self._use_locking) return control_flow_ops.group(update_beta1, update_beta2) - def _resource_apply_sparse(self, grad, var, indices): + def _resource_apply_sparse(self, grad, var, indices, indices_counts = None): m = self.get_slot(var, "m") v = self.get_slot(var, "v") beta1_power = self.get_slot(var, 'beta1_power') beta2_power = self.get_slot(var, 'beta2_power') if isinstance(var, kv_variable_ops.EmbeddingVariable): global_step = training_util.get_or_create_global_step() - return training_ops.kv_resource_sparse_apply_adam_async( - var.handle, m.handle, v.handle, beta1_power.handle, beta2_power.handle, - math_ops.cast(self._lr_t, grad.dtype), - math_ops.cast(self._beta1_t, grad.dtype), - math_ops.cast(self._beta2_t, grad.dtype), - math_ops.cast(self._epsilon_t, grad.dtype), - grad, indices, global_step, use_locking=self._use_locking, - apply_sparse_rmsprop=self._apply_sparse_rmsprop) + if indices_counts != None: + return training_ops.kv_resource_sparse_apply_adam_async_with_counts( + var.handle, m.handle, v.handle, beta1_power.handle, beta2_power.handle, + math_ops.cast(self._lr_t, grad.dtype), + math_ops.cast(self._beta1_t, grad.dtype), + math_ops.cast(self._beta2_t, grad.dtype), + math_ops.cast(self._epsilon_t, grad.dtype), + grad, indices, global_step, indices_counts, + use_locking=self._use_locking, + apply_sparse_rmsprop=self._apply_sparse_rmsprop) + else: + return training_ops.kv_resource_sparse_apply_adam_async( + var.handle, m.handle, v.handle, beta1_power.handle, beta2_power.handle, + math_ops.cast(self._lr_t, grad.dtype), + math_ops.cast(self._beta1_t, grad.dtype), + 
math_ops.cast(self._beta2_t, grad.dtype), + math_ops.cast(self._epsilon_t, grad.dtype), + grad, indices, global_step, use_locking=self._use_locking, + apply_sparse_rmsprop=self._apply_sparse_rmsprop) else: return training_ops.resource_sparse_apply_adam_async( var.handle, m.handle, v.handle, beta1_power.handle, beta2_power.handle, diff --git a/tensorflow/python/training/ftrl.py b/tensorflow/python/training/ftrl.py index 161b6d45f7d..79fe8bf516a 100644 --- a/tensorflow/python/training/ftrl.py +++ b/tensorflow/python/training/ftrl.py @@ -246,12 +246,12 @@ def _apply_sparse(self, grad, var): math_ops.cast(self._learning_rate_power_tensor, var.dtype.base_dtype), use_locking=self._use_locking) - def _resource_apply_sparse(self, grad, var, indices): + def _create_ftrl_appy_v1_op(self, grad, var, indices, indices_counts=None): accum = self.get_slot(var, "accum") linear = self.get_slot(var, "linear") - if self._l2_shrinkage_regularization_strength <= 0.0: - if isinstance(var, kv_variable_ops.EmbeddingVariable): - return training_ops.kv_resource_sparse_apply_ftrl( + if isinstance(var, kv_variable_ops.EmbeddingVariable): + if indices_counts != None: + return training_ops.kv_resource_sparse_apply_ftrl_with_counts( var.handle, accum.handle, linear.handle, @@ -261,9 +261,10 @@ def _resource_apply_sparse(self, grad, var, indices): math_ops.cast(self._l1_regularization_strength_tensor, grad.dtype), math_ops.cast(self._l2_regularization_strength_tensor, grad.dtype), math_ops.cast(self._learning_rate_power_tensor, grad.dtype), + indices_counts, use_locking=self._use_locking) else: - return training_ops.resource_sparse_apply_ftrl( + return training_ops.kv_resource_sparse_apply_ftrl( var.handle, accum.handle, linear.handle, @@ -275,7 +276,37 @@ def _resource_apply_sparse(self, grad, var, indices): math_ops.cast(self._learning_rate_power_tensor, grad.dtype), use_locking=self._use_locking) else: - if isinstance(var, kv_variable_ops.EmbeddingVariable): + return training_ops.resource_sparse_apply_ftrl( + var.handle, + accum.handle, + linear.handle, + grad, + indices, + math_ops.cast(self._learning_rate_tensor, grad.dtype), + math_ops.cast(self._l1_regularization_strength_tensor, grad.dtype), + math_ops.cast(self._l2_regularization_strength_tensor, grad.dtype), + math_ops.cast(self._learning_rate_power_tensor, grad.dtype), + use_locking=self._use_locking) + + def _create_ftrl_appy_v2_op(self, grad, var, indices, indices_counts=None): + accum = self.get_slot(var, "accum") + linear = self.get_slot(var, "linear") + if isinstance(var, kv_variable_ops.EmbeddingVariable): + if indices_counts != None: + return training_ops.kv_resource_sparse_apply_ftrl_v2_with_counts( + var.handle, + accum.handle, + linear.handle, + grad, + indices, + math_ops.cast(self._learning_rate_tensor, grad.dtype), + math_ops.cast(self._l1_regularization_strength_tensor, grad.dtype), + math_ops.cast(self._l2_regularization_strength_tensor, grad.dtype), + math_ops.cast(self._l2_shrinkage_regularization_strength_tensor, + grad.dtype), + math_ops.cast(self._learning_rate_power_tensor, grad.dtype), + indices_counts, use_locking=self._use_locking) + else: return training_ops.kv_resource_sparse_apply_ftrl_v2( var.handle, accum.handle, @@ -289,8 +320,8 @@ def _resource_apply_sparse(self, grad, var, indices): grad.dtype), math_ops.cast(self._learning_rate_power_tensor, grad.dtype), use_locking=self._use_locking) - else: - return training_ops.resource_sparse_apply_ftrl_v2( + else: + return training_ops.resource_sparse_apply_ftrl_v2( var.handle, 
accum.handle, linear.handle, @@ -303,3 +334,9 @@ def _resource_apply_sparse(self, grad, var, indices): grad.dtype), math_ops.cast(self._learning_rate_power_tensor, grad.dtype), use_locking=self._use_locking) + + def _resource_apply_sparse(self, grad, var, indices, indices_counts=None): + if self._l2_shrinkage_regularization_strength <= 0.0: + return self._create_ftrl_appy_v1_op(grad, var, indices, indices_counts) + else: + return self._create_ftrl_appy_v2_op(grad, var, indices, indices_counts) diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py index 2fe67d86592..32a12a0554f 100644 --- a/tensorflow/python/training/gradient_descent.py +++ b/tensorflow/python/training/gradient_descent.py @@ -71,10 +71,17 @@ def _resource_apply_dense(self, grad, handle): def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): if isinstance(handle, kv_variable_ops.EmbeddingVariable): global_step = training_util.get_or_create_global_step() - return training_ops.kv_resource_sparse_apply_gradient_descent( - handle.handle, math_ops.cast(self._learning_rate_tensor, - grad.dtype.base_dtype), - grad, indices, global_step, use_locking=self._use_locking) + if handle.need_counts() and handle._counts_tensor is not None: + return training_ops.kv_resource_sparse_apply_gradient_descent_with_counts( + handle.handle, math_ops.cast(self._learning_rate_tensor, + grad.dtype.base_dtype), + grad, indices, global_step, + handle._counts_tensor, use_locking=self._use_locking) + else: + return training_ops.kv_resource_sparse_apply_gradient_descent( + handle.handle, math_ops.cast(self._learning_rate_tensor, + grad.dtype.base_dtype), + grad, indices, global_step, use_locking=self._use_locking) else: return resource_variable_ops.resource_scatter_add( handle.handle, indices, -grad * self._learning_rate) diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index c26fea7b463..2b765814c0d 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -83,6 +83,15 @@ def _deduplicate_indexed_slices(values, indices): array_ops.shape(unique_indices)[0]) return (summed_values, unique_indices) +def _deduplicate_indexed_slices_with_counts(values, indices): + """Sums `values` associated with any non-unique `indices` + and return counts of each count in `values`.""" + unique_indices, new_index_positions, indices_counts = \ + array_ops.unique_with_counts(indices, out_idx=dtypes.int64) + summed_values = math_ops.unsorted_segment_sum( + values, new_index_positions, + array_ops.shape(unique_indices)[0]) + return (summed_values, unique_indices, indices_counts) def _var_key(var): # TODO(ashankar): Consolidate handling for eager and graph @@ -232,6 +241,10 @@ def _get_processor(v): if v.op.type == "VarHandleOp": return _DenseResourceVariableProcessor(v) if v.op.type == "KvVarHandleOp": + from tensorflow.core.framework import attr_value_pb2 + from tensorflow.core.framework.embedding import config_pb2 + v._init_op._set_attr("embedding_variable_type", + attr_value_pb2.AttrValue(i=config_pb2.EmbeddingVariableType.MUTABLE)) return _DenseResourceVariableProcessor(v) if isinstance(v, variables.Variable): return _RefVariableProcessor(v) @@ -1074,11 +1087,26 @@ def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): Returns: An `Operation` which updates the value of the variable. 
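    For `EmbeddingVariable` handles whose `need_counts()` returns true, the
    deduplicated gradient is additionally paired with per-index occurrence
    counts (computed by `_deduplicate_indexed_slices_with_counts`, or taken
    from the variable's `_counts_tensor` when the gather op already produced
    them) and forwarded to `_resource_apply_sparse` as `indices_counts`.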
""" - summed_grad, unique_indices = _deduplicate_indexed_slices( - values=grad, indices=indices) - return self._resource_apply_sparse(summed_grad, handle, unique_indices) + from tensorflow.python.ops import kv_variable_ops + if isinstance(handle, kv_variable_ops.EmbeddingVariable) and handle.need_counts(): + if handle._counts_tensor is None: + summed_grad, unique_indices, indices_counts = \ + _deduplicate_indexed_slices_with_counts( + values=grad, indices=indices) + else: + summed_grad, unique_indices = _deduplicate_indexed_slices( + values=grad, indices=indices) + indices_counts = handle._counts_tensor + return self._resource_apply_sparse( + summed_grad, handle, unique_indices, indices_counts) + else: + summed_grad, unique_indices = _deduplicate_indexed_slices( + values=grad, indices=indices) + indices_counts = None + return self._resource_apply_sparse( + summed_grad, handle, unique_indices) - def _resource_apply_sparse(self, grad, handle, indices): + def _resource_apply_sparse(self, grad, handle, indices, indices_count=None): """Add ops to apply sparse gradients to the variable `handle`. Similar to `_apply_sparse`, the `indices` argument to this method has been @@ -1092,6 +1120,8 @@ def _resource_apply_sparse(self, grad, handle, indices): to be updated. indices: a `Tensor` of integral type representing the indices for which the gradient is nonzero. Indices are unique. + indices_count: a `Tensor` of integral type representing the count of + each index in `indices` when it is not None. Returns: An `Operation` which updates the value of the variable. diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py index c8ba90083b2..90a820d82f6 100644 --- a/tensorflow/python/training/slot_creator.py +++ b/tensorflow/python/training/slot_creator.py @@ -94,6 +94,8 @@ def _create_slot_var(primary, val, scope, validate_shape, shape, dtype, slot_con validate_shape=validate_shape, steps_to_live=primary._steps_to_live, ht_partition_num=primary._ht_partition_num) + slot._init_op._set_attr("embedding_variable_type", + attr_value_pb2.AttrValue(i=config_pb2.EmbeddingVariableType.MUTABLE)) else: filter_strategy = None if primary._filter_freq != 0: @@ -130,6 +132,8 @@ def _create_slot_var(primary, val, scope, validate_shape, shape, dtype, slot_con l2_weight_threshold=primary._l2_weight_threshold, filter_strategy=filter_strategy) ) + slot._init_op._set_attr("embedding_variable_type", + attr_value_pb2.AttrValue(i=config_pb2.EmbeddingVariableType.MUTABLE)) else: slot = variable_scope.get_variable( scope, diff --git a/tensorflow/python/training/weight_decay_optimizers.py b/tensorflow/python/training/weight_decay_optimizers.py index f783ce13209..bc4d5038a0e 100644 --- a/tensorflow/python/training/weight_decay_optimizers.py +++ b/tensorflow/python/training/weight_decay_optimizers.py @@ -351,23 +351,36 @@ def __init__(self, epsilon=epsilon, use_locking=use_locking, name=name) - - def _resource_apply_sparse(self, grad, var, indices): + def _resource_apply_sparse(self, grad, var, indices, indices_counts=None): if isinstance(var, kv_variable_ops.EmbeddingVariable): m = self.get_slot(var, "m") v = self.get_slot(var, "v") beta1_power, beta2_power = self._get_beta_accumulators() global_step = training_util.get_or_create_global_step() - return training_ops.kv_resource_sparse_apply_adam_w( - var.handle, m.handle, v.handle, - math_ops.cast(beta1_power, grad.dtype), - math_ops.cast(beta2_power, grad.dtype), - math_ops.cast(self._lr_t, grad.dtype), - math_ops.cast(self._beta1_t, 
grad.dtype), - math_ops.cast(self._beta2_t, grad.dtype), - math_ops.cast(self._epsilon_t, grad.dtype), - grad, indices, global_step, weight_decay=self._weight_decay, - use_locking=self._use_locking) + if indices_counts != None: + return training_ops.kv_resource_sparse_apply_adam_w_with_counts( + var.handle, m.handle, v.handle, + math_ops.cast(beta1_power, grad.dtype), + math_ops.cast(beta2_power, grad.dtype), + math_ops.cast(self._lr_t, grad.dtype), + math_ops.cast(self._beta1_t, grad.dtype), + math_ops.cast(self._beta2_t, grad.dtype), + math_ops.cast(self._epsilon_t, grad.dtype), + grad, indices, global_step, + weight_decay=self._weight_decay, + indices_counts=indices_counts, + use_locking=self._use_locking) + else: + return training_ops.kv_resource_sparse_apply_adam_w( + var.handle, m.handle, v.handle, + math_ops.cast(beta1_power, grad.dtype), + math_ops.cast(beta2_power, grad.dtype), + math_ops.cast(self._lr_t, grad.dtype), + math_ops.cast(self._beta1_t, grad.dtype), + math_ops.cast(self._beta2_t, grad.dtype), + math_ops.cast(self._epsilon_t, grad.dtype), + grad, indices, global_step, weight_decay=self._weight_decay, + use_locking=self._use_locking) else: scatter_add = self._resource_scatter_add decay_op = self._decay_weights_sparse_op(var, indices, scatter_add) From 90917172369cc8aebd9dc5e2de0a75ed2453fa0b Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 6 Jun 2023 16:18:55 +0800 Subject: [PATCH 14/91] [ModelZoo] Fix hung issue when using both SOK and SmartStaged simultaneously. (#882) Fix embedding_dim setting error that would cause segmentation fault in SOK at the same time. Signed-off-by: JunqiHu --- modelzoo/bst/train.py | 2 + modelzoo/dcn/train.py | 2 + modelzoo/dcnv2/train.py | 2 + modelzoo/deepfm/train.py | 2 + modelzoo/din/train.py | 124 ++++++++++-------- modelzoo/dlrm/train.py | 2 + modelzoo/dssm/train.py | 2 + modelzoo/esmm/train.py | 2 + .../features/group_embedding/dcnv2/train.py | 1 + modelzoo/masknet/train.py | 2 + modelzoo/mlperf/train.py | 2 + modelzoo/mmoe/train.py | 4 +- modelzoo/ple/train.py | 2 + modelzoo/simple_multitask/train.py | 2 + modelzoo/wide_and_deep/train.py | 2 + 15 files changed, 98 insertions(+), 55 deletions(-) diff --git a/modelzoo/bst/train.py b/modelzoo/bst/train.py index 8eda718a652..2fb5e4e90f5 100644 --- a/modelzoo/bst/train.py +++ b/modelzoo/bst/train.py @@ -1030,6 +1030,8 @@ def set_env_for_DeepRec(): 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' if args.group_embedding == "collective": tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" if __name__ == '__main__': parser = get_arg_parser() diff --git a/modelzoo/dcn/train.py b/modelzoo/dcn/train.py index 5bbd12055af..b8e1dba5d63 100644 --- a/modelzoo/dcn/train.py +++ b/modelzoo/dcn/train.py @@ -1006,6 +1006,8 @@ def set_env_for_DeepRec(): 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' if args.group_embedding == "collective": tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" if __name__ == '__main__': parser = get_arg_parser() diff --git a/modelzoo/dcnv2/train.py b/modelzoo/dcnv2/train.py index ce2e567ae76..7ac4c1a0358 100644 --- a/modelzoo/dcnv2/train.py +++ b/modelzoo/dcnv2/train.py @@ -1026,6 +1026,8 @@ def set_env_for_DeepRec(): 
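# Context for the modelzoo hunks below (intent inferred from the commit
# message, not stated in the diff itself): every train.py gains the same
# guard, pinning the GPU thread pool size when SmartStage is enabled on
# DeepRec, which is how the SOK + SmartStage hang described above is avoided:
#   if args.smartstaged and not args.tf:
#     os.environ["TF_GPU_THREAD_COUNT"] = "16"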
'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' if args.group_embedding == "collective": tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" if __name__ == '__main__': parser = get_arg_parser() diff --git a/modelzoo/deepfm/train.py b/modelzoo/deepfm/train.py index 93b6bb9d80a..896295b0ae6 100644 --- a/modelzoo/deepfm/train.py +++ b/modelzoo/deepfm/train.py @@ -888,6 +888,8 @@ def set_env_for_DeepRec(): 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' if args.group_embedding == "collective": tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" if __name__ == '__main__': parser = get_arg_parser() diff --git a/modelzoo/din/train.py b/modelzoo/din/train.py index ab484be160d..6273e0d15a4 100644 --- a/modelzoo/din/train.py +++ b/modelzoo/din/train.py @@ -22,6 +22,7 @@ import collections from tensorflow.python.client import timeline import json +import contextlib from tensorflow.python.ops import partitioned_variables @@ -40,9 +41,9 @@ LABEL_COLUMN = ['CLICKED'] TRAIN_DATA_COLUMNS = LABEL_COLUMN + UNSEQ_COLUMNS + SEQ_COLUMNS -EMBEDDING_DIM = 18 -HIDDEN_SIZE = 18 * 2 -ATTENTION_SIZE = 18 * 2 +EMBEDDING_DIM = 16 +HIDDEN_SIZE = 16 * 2 +ATTENTION_SIZE = 16 * 2 MAX_SEQ_LENGTH = 50 @@ -50,7 +51,7 @@ class DIN(): def __init__(self, feature_column=None, learning_rate=0.001, - embedding_dim=18, + embedding_dim=16, hidden_size=36, attention_size=36, inputs=None, @@ -503,56 +504,62 @@ def build_feature_columns(data_location=None): uid_cate_column = tf.feature_column.categorical_column_with_vocabulary_file( 'UID', uid_file, default_value=0) ev_opt = None - if not args.tf: - '''Feature Elimination of EmbeddingVariable Feature''' - if args.ev_elimination == 'gstep': - # Feature elimination based on global steps - evict_opt = tf.GlobalStepEvict(steps_to_live=4000) - elif args.ev_elimination == 'l2': - # Feature elimination based on l2 weight - evict_opt = tf.L2WeightEvict(l2_weight_threshold=1.0) - else: - evict_opt = None - '''Feature Filter of EmbeddingVariable Feature''' - if args.ev_filter == 'cbf': - # CBF-based feature filter - filter_option = tf.CBFFilter(filter_freq=3, - max_element_size=2**30, - false_positive_probability=0.01, - counter_type=tf.int64) - elif args.ev_filter == 'counter': - # Counter-based feature filter - filter_option = tf.CounterFilter(filter_freq=3) - else: - filter_option = None - ev_opt = tf.EmbeddingVariableOption(evict_option=evict_opt, - filter_option=filter_option) - - if args.ev: - '''Embedding Variable Feature with feature_column API''' - uid_cate_column = tf.feature_column.categorical_column_with_embedding( - 'UID', dtype=tf.string, ev_option=ev_opt) - elif args.adaptive_emb: - ''' Adaptive Embedding Feature Part 2 of 2 - Expcet the follow code, a dict, 'adaptive_mask_tensors', is need as the input of - 'tf.feature_column.input_layer(adaptive_mask_tensors=adaptive_mask_tensors)'. - For column 'COL_NAME',the value of adaptive_mask_tensors['$COL_NAME'] is a int32 - tensor with shape [batch_size]. 
- ''' - uid_cate_column = tf.feature_column.categorical_column_with_adaptive_embedding( - 'UID', - hash_bucket_size=100000, - dtype=tf.string, - ev_option=ev_opt) - elif args.dynamic_ev: - '''Dynamic-dimension Embedding Variable''' - print( - "Dynamic-dimension Embedding Variable isn't really enabled in model now." - ) - sys.exit() - - uid_emb_column = tf.feature_column.embedding_column( - uid_cate_column, dimension=EMBEDDING_DIM) + if args.group_embedding and not args.tf: + context = tf.feature_column.group_embedding_column_scope(name="categorical") + else: + context = contextlib.nullcontext() + with context: + + if not args.tf: + '''Feature Elimination of EmbeddingVariable Feature''' + if args.ev_elimination == 'gstep': + # Feature elimination based on global steps + evict_opt = tf.GlobalStepEvict(steps_to_live=4000) + elif args.ev_elimination == 'l2': + # Feature elimination based on l2 weight + evict_opt = tf.L2WeightEvict(l2_weight_threshold=1.0) + else: + evict_opt = None + '''Feature Filter of EmbeddingVariable Feature''' + if args.ev_filter == 'cbf': + # CBF-based feature filter + filter_option = tf.CBFFilter(filter_freq=3, + max_element_size=2**30, + false_positive_probability=0.01, + counter_type=tf.int64) + elif args.ev_filter == 'counter': + # Counter-based feature filter + filter_option = tf.CounterFilter(filter_freq=3) + else: + filter_option = None + ev_opt = tf.EmbeddingVariableOption(evict_option=evict_opt, + filter_option=filter_option) + + if args.ev: + '''Embedding Variable Feature with feature_column API''' + uid_cate_column = tf.feature_column.categorical_column_with_embedding( + 'UID', dtype=tf.string, ev_option=ev_opt) + elif args.adaptive_emb: + ''' Adaptive Embedding Feature Part 2 of 2 + Expcet the follow code, a dict, 'adaptive_mask_tensors', is need as the input of + 'tf.feature_column.input_layer(adaptive_mask_tensors=adaptive_mask_tensors)'. + For column 'COL_NAME',the value of adaptive_mask_tensors['$COL_NAME'] is a int32 + tensor with shape [batch_size]. + ''' + uid_cate_column = tf.feature_column.categorical_column_with_adaptive_embedding( + 'UID', + hash_bucket_size=100000, + dtype=tf.string, + ev_option=ev_opt) + elif args.dynamic_ev: + '''Dynamic-dimension Embedding Variable''' + print( + "Dynamic-dimension Embedding Variable isn't really enabled in model now." + ) + sys.exit() + + uid_emb_column = tf.feature_column.embedding_column( + uid_cate_column, dimension=EMBEDDING_DIM) # item item_cate_column = tf.feature_column.categorical_column_with_vocabulary_file( @@ -930,6 +937,11 @@ def get_arg_parser(): help='Whether to enable shuffle operation for Parquet Dataset. Default to False.', type=boolean_string, default=False) + parser.add_argument("--group_embedding", \ + help='Whether to enable Group Embedding. 
Default to None.', + type=str, + choices=[None, 'localized', 'collective'], + default=None) return parser @@ -1006,6 +1018,10 @@ def set_env_for_DeepRec(): os.environ['STOP_STATISTIC_STEP'] = '110' os.environ['MALLOC_CONF']= \ 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' + if args.group_embedding == "collective": + tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" if __name__ == '__main__': parser = get_arg_parser() diff --git a/modelzoo/dlrm/train.py b/modelzoo/dlrm/train.py index 8f4a5c75252..0789e9418b8 100644 --- a/modelzoo/dlrm/train.py +++ b/modelzoo/dlrm/train.py @@ -929,6 +929,8 @@ def set_env_for_DeepRec(): 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' if args.group_embedding == "collective": tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" if __name__ == '__main__': parser = get_arg_parser() diff --git a/modelzoo/dssm/train.py b/modelzoo/dssm/train.py index 35d741c693f..a757851711c 100644 --- a/modelzoo/dssm/train.py +++ b/modelzoo/dssm/train.py @@ -894,6 +894,8 @@ def set_env_for_DeepRec(): 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' if args.group_embedding == "collective": tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" if __name__ == '__main__': parser = get_arg_parser() diff --git a/modelzoo/esmm/train.py b/modelzoo/esmm/train.py index 8485e2da116..58219e19e3e 100755 --- a/modelzoo/esmm/train.py +++ b/modelzoo/esmm/train.py @@ -975,6 +975,8 @@ def set_env_for_DeepRec(): 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' if args.group_embedding == "collective": tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" def check_stock_tf(): import pkg_resources diff --git a/modelzoo/features/group_embedding/dcnv2/train.py b/modelzoo/features/group_embedding/dcnv2/train.py index 0f20139338d..478930442f8 100644 --- a/modelzoo/features/group_embedding/dcnv2/train.py +++ b/modelzoo/features/group_embedding/dcnv2/train.py @@ -59,6 +59,7 @@ from tensorflow.python.framework import ops os.environ["TF_GPU_THREAD_MODE"] = "global" +os.environ["TF_GPU_THREAD_COUNT"] = "16" import horovod.tensorflow as hvd diff --git a/modelzoo/masknet/train.py b/modelzoo/masknet/train.py index 102720bce44..0790f200b21 100644 --- a/modelzoo/masknet/train.py +++ b/modelzoo/masknet/train.py @@ -948,6 +948,8 @@ def set_env_for_DeepRec(): 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' if args.group_embedding == "collective": tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" if __name__ == '__main__': parser = get_arg_parser() diff --git a/modelzoo/mlperf/train.py b/modelzoo/mlperf/train.py index 644703886a7..db7e077250b 100644 --- a/modelzoo/mlperf/train.py +++ b/modelzoo/mlperf/train.py @@ -946,6 +946,8 @@ def set_env_for_DeepRec(): 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' if args.group_embedding == "collective":
tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" if __name__ == '__main__': diff --git a/modelzoo/mmoe/train.py b/modelzoo/mmoe/train.py index 7e737c79e3c..251e02c7a72 100644 --- a/modelzoo/mmoe/train.py +++ b/modelzoo/mmoe/train.py @@ -1,4 +1,4 @@ -import time +import time import argparse import numbers import tensorflow as tf @@ -934,6 +934,8 @@ def set_env_for_DeepRec(): 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' if args.group_embedding == "collective": tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" def check_stock_tf(): import pkg_resources diff --git a/modelzoo/ple/train.py b/modelzoo/ple/train.py index 0d21cb3b852..2ba98363bbf 100644 --- a/modelzoo/ple/train.py +++ b/modelzoo/ple/train.py @@ -999,6 +999,8 @@ def set_env_for_DeepRec(): 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' if args.group_embedding == "collective": tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" def check_stock_tf(): import pkg_resources diff --git a/modelzoo/simple_multitask/train.py b/modelzoo/simple_multitask/train.py index 8ccb9b4bbe4..ff90946c96d 100644 --- a/modelzoo/simple_multitask/train.py +++ b/modelzoo/simple_multitask/train.py @@ -867,6 +867,8 @@ def set_env_for_DeepRec(): 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' if args.group_embedding == "collective": tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" def check_stock_tf(): import pkg_resources diff --git a/modelzoo/wide_and_deep/train.py b/modelzoo/wide_and_deep/train.py index 90b6574e9eb..b4f4dbc7a65 100644 --- a/modelzoo/wide_and_deep/train.py +++ b/modelzoo/wide_and_deep/train.py @@ -962,6 +962,8 @@ def set_env_for_DeepRec(): 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' if args.group_embedding == "collective": tf.config.experimental.enable_distributed_strategy(strategy="collective") + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" if __name__ == '__main__': parser = get_arg_parser() From 8f9d678e0955244c99eb58740c9cde9147f258b6 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Tue, 6 Jun 2023 20:26:45 +0800 Subject: [PATCH 15/91] [Embedding] Fix shape validation in API shared_embedding_columns.
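For orientation, a condensed sketch of the scenario this patch addresses, adapted from the unit test added below (column names and values are illustrative only):

    import tensorflow as tf  # DeepRec build of TF 1.15

    # Two string categorical columns share one embedding table. When the table
    # is backed by a DeepRec EmbeddingVariable, its static shape appears to
    # carry only the embedding dimension, not (vocab_size, dimension).
    columns = [
        tf.feature_column.categorical_column_with_embedding("col_emb", dtype=tf.string),
        tf.feature_column.categorical_column_with_embedding("col_emb2", dtype=tf.string),
    ]
    shared = tf.feature_column.shared_embedding_columns(
        columns, dimension=3,
        shared_embedding_collection_name="xxxxx_shared")

Before this fix, reusing the shared collection for the second column compared the EmbeddingVariable's shape against the full (vocab_size, dimension) shape and raised a spurious ValueError; the patch relaxes the check to the dimension alone for EmbeddingVariable.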
(#881) Signed-off-by: candy.dc --- .../python/feature_column/feature_column.py | 3 ++ .../feature_column/feature_column_v2_test.py | 35 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index e8f58c9ae25..19b54c499a0 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -151,6 +151,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import embedding_ops from tensorflow.python.ops import init_ops +from tensorflow.python.ops import kv_variable_ops from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops @@ -2674,6 +2675,8 @@ def create_embedding(self, 'The feature_column library already adds a variable under the ' 'hood.'.format(shared_embedding_collection)) embedding_weights = shared_embedding_collection[0] + if isinstance(embedding_weights, kv_variable_ops.EmbeddingVariable): + embedding_shape = (self.dimension) if embedding_weights.get_shape() != embedding_shape: raise ValueError( 'Shared embedding collection {} contains variable {} of ' diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index 18146b71938..ff5935b708f 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -7670,6 +7670,41 @@ def testEmbeddingVariableForSharedEmbeddingColumns(self): for j in range(3): self.assertAlmostEqual(emb_r[i][j], emb_right[i][j]) + @test_util.run_deprecated_v1 + def testEmbeddingVariableForSharedEmbeddingColumnsMultiCol(self): + columns_list=[] + columns_list.append(fc.categorical_column_with_embedding("col_emb", dtype=dtypes.string)) + columns_list.append(fc.categorical_column_with_embedding("col_emb2", dtype=dtypes.string)) + W = fc.shared_embedding_columns(columns_list, + dimension=3, + initializer=init_ops.ones_initializer(dtypes.float32), + shared_embedding_collection_name="xxxxx_shared") + + ids={} + ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0]], values=["aaaa","bbbbb","ccc","4nn","5b"], dense_shape=[5, 5]) + ids["col_emb2"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0]], values=["aaaa","bbbbb","ccc","4nn","5b"], dense_shape=[5, 5]) + emb = fc_old.input_layer(ids, W) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables_lib.global_variables_initializer() + + with self.test_session() as sess: + sess.run(init) + sess.run([emb, train_op,loss]) + sess.run([emb, train_op,loss]) + emb_r, _, _ = sess.run([emb, train_op,loss]) + emb_right = [[0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214]] + for i in range(5): + for j in range(3): + self.assertAlmostEqual(emb_r[i][j], emb_right[i][j]) + @test_util.run_deprecated_v1 def testEmbeddingVariableForSharedEmbeddingColumnsWithPartitionNum(self): columns_list=[] From 500e5c9df2448928fbcdc0f0f2b7035216d516c7 Mon Sep 17 00:00:00 2001 From: JunqiHu Date: Tue, 6 Jun 2023 11:59:23 
+0800 Subject: [PATCH 16/91] [Embedding] Update logic of GroupEmbedding in feature_column API. tf.feature_column.input_layer would only construct GroupEmbeddingOp corresponding to the embedding columns passed in. Signed-off-by: JunqiHu --- .../feature_column/sequence_feature_column.py | 19 ++-- .../python/feature_column/feature_column.py | 28 +++--- .../feature_column/feature_column_v2.py | 91 +++++++++++++++---- .../feature_column/group_embedding_column.py | 31 +++---- tensorflow/python/ops/embedding_ops.py | 16 ++-- .../python/ops/group_embedding_lookup_ops.py | 4 +- 6 files changed, 120 insertions(+), 69 deletions(-) diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py index 8c5c5188185..07650a723d1 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py @@ -110,7 +110,6 @@ def sequence_input_layer( output_tensors = [] sequence_lengths = [] ordered_columns = [] - group_name_set = set() group_embedding_list = [] embedding_columns = [] @@ -122,8 +121,7 @@ def sequence_input_layer( ): if group_name != "": group_name_set.add(group_name) - output_tensor = None - output_tensors.append(output_tensor) # placeholder + output_tensors.append(None) # placeholder group_embedding_list.append(index) embedding_columns.append(column) sequence_lengths.append(None) @@ -142,13 +140,14 @@ def sequence_input_layer( ) sequence_lengths.append(sequence_length) - group_embedding_tensor = gec._get_global_group_embedding_scope( - group_name_set, builder, weight_collections, trainable - ) - for ind, column in zip(group_embedding_list, embedding_columns): - output_tensor, sequence_length = group_embedding_tensor[column] - output_tensors[ind] = output_tensor - sequence_lengths[ind] = sequence_length + if len(embedding_columns) > 0: + group_embedding_tensor = gec._get_global_group_embedding_scope( + embedding_columns, builder, weight_collections, trainable + ) + for ind, column in zip(group_embedding_list, embedding_columns): + output_tensor, sequence_length = group_embedding_tensor[column] + output_tensors[ind] = output_tensor + sequence_lengths[ind] = sequence_length fc._verify_static_batch_size_equality(output_tensors, ordered_columns) fc._verify_static_batch_size_equality(sequence_lengths, ordered_columns) diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 19b54c499a0..3d5e7a71330 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -201,7 +201,6 @@ def _get_logits(): # pylint: disable=missing-docstring builder = _LazyBuilder(features, adaptive_mask_tensors) output_tensors = [] ordered_columns = [] - group_name_set = set() group_embedding_list = [] embedding_columns = [] for index, column in enumerate(sorted(feature_columns, key=lambda x: x.name)): @@ -211,9 +210,7 @@ def _get_logits(): # pylint: disable=missing-docstring None, default_name=column._var_scope_name ): # pylint: disable=protected-access if group_name != "": - group_name_set.add(group_name) - output_tensor = None - output_tensors.append(output_tensor) # placeholder + output_tensors.append(None) # placeholder group_embedding_list.append(index) # for later gather embedding_columns.append(column) else: @@ -244,17 +241,18 @@ def _get_logits(): # pylint: 
disable=missing-docstring scope=variable_scope.get_variable_scope().name, ) - group_embedding_tensor = gec._get_global_group_embedding_scope( - group_name_set, builder, weight_collections, trainable - ) - for ind, column in zip(group_embedding_list, embedding_columns): - output_tensor, _ = group_embedding_tensor[column] - output_tensors[ind] = output_tensor - if cols_to_output_tensors is not None: - cols_to_output_tensors[column] = output_tensor - ops.add_to_collections( - ops.GraphKeys.ASYNC_EMBEDDING_OUTPUT_TENSORS, output_tensor - ) + if len(embedding_columns) > 0: + group_embedding_tensor = gec._get_global_group_embedding_scope( + embedding_columns, builder, weight_collections, trainable + ) + for ind, column in zip(group_embedding_list, embedding_columns): + output_tensor, _ = group_embedding_tensor[column] + output_tensors[ind] = output_tensor + if cols_to_output_tensors is not None: + cols_to_output_tensors[column] = output_tensor + ops.add_to_collections( + ops.GraphKeys.ASYNC_EMBEDDING_OUTPUT_TENSORS, output_tensor + ) _verify_static_batch_size_equality(output_tensors, ordered_columns) return array_ops.concat(output_tensors, -1) diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py index d22e6f50478..7ce730f9f38 100644 --- a/tensorflow/python/feature_column/feature_column_v2.py +++ b/tensorflow/python/feature_column/feature_column_v2.py @@ -4238,15 +4238,17 @@ def _from_config(cls, config, custom_objects=None, columns_by_name=None): @contextlib.contextmanager def group_embedding_column_scope(name='', params_num_per_group=sys.maxsize): global_group_embedding_scope = group_embedding_column._global_group_embedding_scope_list() - group_id = group_embedding_column._current_group_id() - if name == '': - name = "group_embedding_column_scope_{}".format(group_id) - group_id +=1 + if name != '': + name = "group_embedding_column_scope_{}".format(name) else: - name = "group_embedding_column_scope_{}".format(name) - fusion_embedding_scope = GroupEmbeddingScope(name, params_num_per_group) - global_group_embedding_scope.append(fusion_embedding_scope) - yield global_group_embedding_scope + name = "group_embedding_column_scope" + if len(global_group_embedding_scope) == 0: + fusion_embedding_scope = GroupEmbeddingScope(name) + global_group_embedding_scope.append(fusion_embedding_scope) + else: + fusion_embedding_scope = global_group_embedding_scope.pop(-1) + global_group_embedding_scope.append(fusion_embedding_scope) + yield fusion_embedding_scope class GroupEmbeddingScope(group_embedding_column.GroupEmbeddingScopeBase): def __init__(self, name=None, params_num_per_group=sys.maxsize): @@ -4264,33 +4266,86 @@ def add_column(self, embedding_column): "given {}".format(embedding_column)) self.embedding_columns.append(embedding_column) - def _get_dense_tensor(self, filter_ec, inputs, + def _get_dense_tensor(self, admitted_ec, inputs, weight_collections=None, trainable=None): + + output_tensors = [None for _ in range(len(admitted_ec))] + sequence_lengths = [0 for _ in range(len(admitted_ec))] + output_mapping = [] + embedding_weights = [] sp_ids = [] + sp_weights = [] combiners = [] - output_tensors = [] - sequence_lengths = [0 for _ in range(len(self.embedding_columns))] + non_sp_weights = [] is_sequence = False - for index, ec in enumerate(self.embedding_columns): - if ec in filter_ec: + + for index, ec in enumerate(admitted_ec): + sparse_tensor = ec.categorical_column._get_sparse_tensors( + inputs, weight_collections, trainable) + 
sp_id = sparse_tensor.id_tensor + sp_weight = sparse_tensor.weight_tensor + if sp_weight == None: + non_sp_weights.append(index) continue - sp_id = ec.categorical_column._get_sparse_tensors( - inputs, weight_collections, trainable).id_tensor #Special logic for sequence feature_column if isinstance(ec.categorical_column, fc_old._SequenceCategoricalColumn): sequence_lengths[index] = fc_utils.sequence_length_from_sparse_tensor( sp_id) is_sequence = True + + #prune invalid id and weights + sp_id, sp_weight = _prune_invalid_ids(sp_id, sp_weight) + if ec.combiner != "sum": + sp_id, sp_weight = _prune_invalid_weights(sp_id, sp_weight) + sp_ids.append(sp_id) + sp_weights.append(sp_weight) combiners.append(ec.combiner) with variable_scope.variable_scope(ec._var_scope_name): embedding_weight = ec.create_embedding(weight_collections, trainable) embedding_weights.append(embedding_weight) + output_mapping.append(index) + + if len(output_mapping) > 0: + weighted_outputs = embedding_ops.group_embedding_lookup_sparse( + embedding_weights, sp_ids, combiners, sp_weights, + is_sequence=is_sequence, params_num_per_group=self.params_num_per_group) + + for index, output in zip(output_mapping, weighted_outputs): + output_tensors[index] = output + + embedding_weights.clear() + sp_ids.clear() + combiners.clear() + output_mapping.clear() + is_sequence = False + del sp_weights + + for index in non_sp_weights: + ec = admitted_ec[index] + sparse_tensor = ec.categorical_column._get_sparse_tensors( + inputs, weight_collections, trainable) + sp_id = sparse_tensor.id_tensor + #Special logic for sequence feature_column + if isinstance(ec.categorical_column, fc_old._SequenceCategoricalColumn): + sequence_lengths[index] = fc_utils.sequence_length_from_sparse_tensor( + sp_id) + is_sequence = True + sp_ids.append(sp_id) + combiners.append(ec.combiner) + with variable_scope.variable_scope(ec._var_scope_name): + embedding_weight = ec.create_embedding(weight_collections, trainable) + embedding_weights.append(embedding_weight) + output_mapping.append(index) + + if len(output_mapping) > 0: + non_weighted_outputs = embedding_ops.group_embedding_lookup_sparse( + embedding_weights, sp_ids, combiners, is_sequence=is_sequence, params_num_per_group=self.params_num_per_group) + + for index, output in zip(output_mapping, non_weighted_outputs): + output_tensors[index] = output - output_tensors.extend(embedding_ops.group_embedding_lookup_sparse( - embedding_weights, sp_ids, combiners, - is_sequence=is_sequence, params_num_per_group=self.params_num_per_group)) return output_tensors, sequence_lengths class EmbeddingColumn( diff --git a/tensorflow/python/feature_column/group_embedding_column.py b/tensorflow/python/feature_column/group_embedding_column.py index b40bcb26c47..d91df26ffbb 100644 --- a/tensorflow/python/feature_column/group_embedding_column.py +++ b/tensorflow/python/feature_column/group_embedding_column.py @@ -5,7 +5,6 @@ import sys _global_fusion_embedding_scope = [] -_group_id = 0 _group_embedding_tensor = dict() def _global_group_embedding_scope_list(): @@ -16,30 +15,26 @@ def _current_group_embedding_scope(): global _global_fusion_embedding_scope return None if len(_global_fusion_embedding_scope) == 0 else _global_fusion_embedding_scope[-1] -def _get_global_group_embedding_scope(group_names, +def _get_global_group_embedding_scope(embedding_columns, builder=None, weight_collections=None, trainable=True): global _group_embedding_tensor global _global_fusion_embedding_scope + fused_scope = _global_fusion_embedding_scope[-1] 
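+  # Only the innermost (most recent) scope is consulted. Columns whose fused
+  # lookup was already materialized land in filter_ec and are served from the
+  # _group_embedding_tensor cache; first-seen columns are collected into
+  # admitted_ec and computed by fused_scope._get_dense_tensor below.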
filter_ec, admitted_ec = [], [] - for fused_scope in _global_fusion_embedding_scope: - if fused_scope.name in group_names: - for ec in fused_scope.embedding_columns: - if ec in _group_embedding_tensor: - filter_ec.append(ec) - else: - admitted_ec.append(ec) - fused_output, sequence_lengths = fused_scope._get_dense_tensor( - filter_ec, builder, weight_collections, trainable) - - for ec, output, sequence_length in zip(admitted_ec, fused_output, sequence_lengths): #Ordered - _group_embedding_tensor[ec] = (output, sequence_length) + for ec in embedding_columns: + if ec in _group_embedding_tensor: + filter_ec.append(ec) + else: + admitted_ec.append(ec) + fused_output, sequence_lengths = fused_scope._get_dense_tensor( + admitted_ec, builder, weight_collections, trainable) + + for ec, output, sequence_length in zip(admitted_ec, fused_output, sequence_lengths): #Ordered + _group_embedding_tensor[ec] = (output, sequence_length) return _group_embedding_tensor -def _current_group_id(): - global _group_id - return _group_id class GroupEmbeddingScopeBase(object): def __init__(self, name=None, params_num_per_group=sys.maxsize): @@ -51,7 +46,7 @@ def add_column(self, embedding_column): raise NotImplementedError("Valid EmbeddingColumn should be " "specified by successor.") - def _get_dense_tensor(self, filter_ec, inputs, weight_collections=None, trainable=None, is_sequence=False): + def _get_dense_tensor(self, admitted_ec, inputs, weight_collections=None, trainable=None, is_sequence=False): raise NotImplementedError("should be implement in successor.") def get_dense_tensor(self, transformation_cache, state_manager): diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py index ef2265ec235..8c98d2b59f3 100644 --- a/tensorflow/python/ops/embedding_ops.py +++ b/tensorflow/python/ops/embedding_ops.py @@ -1642,6 +1642,14 @@ def group_embedding_lookup_sparse(params, if not isinstance(params, list): params = [params] + #Currently do not support PartitionedVariable. + for index, param in enumerate(params): + if isinstance(param, variables.PartitionedVariable): + tmp_param = list(param) + if len(tmp_param) != 1: + raise TypeError("PartitionedVariable not support in 'group_embedding_lookup_sparse'. ") + params[index] = tmp_param[0] + ignore_weights = sp_weights is None if len(combiners) != len(sp_ids): @@ -1655,14 +1663,9 @@ def group_embedding_lookup_sparse(params, raise ValueError('len of combiners must be equal to len of sp_weights' ) - # # Currently not doing unique - strategy = get_group_lookup_strategy() if strategy == DistStrategy.COLLECTIVE: for (index, param) in enumerate(params): - if isinstance(param, variables.PartitionedVariable): - raise TypeError("PartitionedVariable not support in 'group_embedding_lookup_sparse'. 
" - ) param.target_gpu = -1 try: @@ -1690,6 +1693,7 @@ def group_embedding_lookup_sparse(params, if not isinstance(sp_id, sparse_tensor.SparseTensor): try: # assume RaggedTensor sp_id = sp_id.to_sparse() + sp_ids[index] = sp_id except: raise ValueError('sp_id is neither SparseTensor nor RaggedTensor!') @@ -1796,7 +1800,7 @@ def group_embedding_lookup_sparse(params, (ev_handlers[group_id])[-num_remainder:], (ev_sp_values[group_id])[-num_remainder:], (ev_sp_indices[group_id])[-num_remainder:], - (ev_sp_weights[group_id])[-num_remainder:], + sub_ev_sp_weight, ev_combiners[group_id], (ev_dense_shapes[group_id])[-num_remainder:], dim, diff --git a/tensorflow/python/ops/group_embedding_lookup_ops.py b/tensorflow/python/ops/group_embedding_lookup_ops.py index 494ea4e2ab0..73809966986 100644 --- a/tensorflow/python/ops/group_embedding_lookup_ops.py +++ b/tensorflow/python/ops/group_embedding_lookup_ops.py @@ -43,8 +43,8 @@ def group_embedding_var_lookup(params, default_value, combiners, dimensions, - ignore_weights, - is_use_default_value_tensor, + ignore_weights=ignore_weights, + is_use_default_value_tensor=is_use_default_value_tensor, is_sequence=is_sequence) @ops.RegisterGradient("GroupEmbeddingVarLookup") From 6f32d3a5f770d35ae28159ae9b188c64b7c58501 Mon Sep 17 00:00:00 2001 From: JunqiHu Date: Tue, 6 Jun 2023 15:08:45 +0800 Subject: [PATCH 17/91] [Embedding] Modify calculation logic of embedding lookup sparse combiner. Signed-off-by: JunqiHu --- .../fused_embedding_common.cu.h | 30 +++++-- .../fused_embedding_local_ops_gpu.cu.cc | 2 +- .../fused_embedding_post_ops_gpus.cu.cc | 2 +- ...dding_lookup_sparse_backward_base_ops.cu.h | 4 +- ...edding_lookup_sparse_forward_base_ops.cu.h | 35 ++++---- ...oup_embedding_lookup_sparse_forward_ops.cc | 85 ++++++++++++++----- 6 files changed, 113 insertions(+), 45 deletions(-) diff --git a/tensorflow/core/kernels/fused_embedding/fused_embedding_common.cu.h b/tensorflow/core/kernels/fused_embedding/fused_embedding_common.cu.h index eff9e7c0782..e62b32e3534 100644 --- a/tensorflow/core/kernels/fused_embedding/fused_embedding_common.cu.h +++ b/tensorflow/core/kernels/fused_embedding/fused_embedding_common.cu.h @@ -29,27 +29,45 @@ struct IndicePair { enum Combiner { Mean, Sum, Sqrtn }; -template -__forceinline__ __device__ float Combine(const float in, const int feature_num); +template +__forceinline__ __device__ float Combine(const float in, const T feature_num); template <> -__forceinline__ __device__ float Combine(const float in, +__forceinline__ __device__ float Combine(const float in, const int feature_num) { return in / sqrtf(feature_num); } template <> -__forceinline__ __device__ float Combine(const float in, +__forceinline__ __device__ float Combine(const float in, const int feature_num) { return in / feature_num; } template <> -__forceinline__ __device__ float Combine(const float in, +__forceinline__ __device__ float Combine(const float in, const int feature_num) { return in; } +template <> +__forceinline__ __device__ float Combine(const float in, + const float feature_num) { + return in / sqrtf(feature_num); +} + +template <> +__forceinline__ __device__ float Combine(const float in, + const float feature_num) { + return in / feature_num; +} + +template <> +__forceinline__ __device__ float Combine(const float in, + const float feature_num) { + return in; +} + template __forceinline__ __device__ float CombineGrad(const float grad, const int feature_num); @@ -77,4 +95,4 @@ __forceinline__ __device__ float CombineGrad(const float grad, #endif // 
GOOGLE_CUDA -#endif // TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_FUSED_EMBEDDING_COMMON_CU_H_ \ No newline at end of file +#endif // TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_FUSED_EMBEDDING_COMMON_CU_H_ diff --git a/tensorflow/core/kernels/fused_embedding/fused_embedding_local_ops_gpu.cu.cc b/tensorflow/core/kernels/fused_embedding/fused_embedding_local_ops_gpu.cu.cc index 266b960a32c..ee1bdc54ea9 100644 --- a/tensorflow/core/kernels/fused_embedding/fused_embedding_local_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/fused_embedding/fused_embedding_local_ops_gpu.cu.cc @@ -77,7 +77,7 @@ __global__ void EmbeddingLookUp(const float* emb_variable, } // combine - out = Combine(out, feature_num); + out = Combine(out, feature_num); // store the embedding vector embedding_vector[blockIdx.x * emb_vec_size + threadIdx.x] = out; diff --git a/tensorflow/core/kernels/fused_embedding/fused_embedding_post_ops_gpus.cu.cc b/tensorflow/core/kernels/fused_embedding/fused_embedding_post_ops_gpus.cu.cc index 3325bd69a6d..5fe061fc4f4 100644 --- a/tensorflow/core/kernels/fused_embedding/fused_embedding_post_ops_gpus.cu.cc +++ b/tensorflow/core/kernels/fused_embedding/fused_embedding_post_ops_gpus.cu.cc @@ -59,7 +59,7 @@ __global__ void ApplyCombiner(float* emb_vectors, const int* row_emptiness_flag, } const int feature_num = feature_nums[blockIdx.x]; const float emb_element = emb_vectors[offset]; - emb_vectors[offset] = Combine(emb_element, feature_num); + emb_vectors[offset] = Combine(emb_element, feature_num); } template diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h index 884f4f0fb4a..5c352144234 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h @@ -274,7 +274,7 @@ class GroupEmbeddingLookupBackWard { { if (tile_size <= 32) { - const int block_size = batch_size / 64 * tile_size + 1; + const int block_size = batch_size * tile_size / 64 + 1; fn<<>>(batch_size, max_norm_, nums_, dimension_, d_args_); @@ -361,4 +361,4 @@ class GroupLookupBackWardBaseOp : public OpKernel { } // namespace tensorflow -#endif // GOOGLE_CUDA \ No newline at end of file +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h index 28e63565eef..665c2a6703e 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h @@ -116,14 +116,14 @@ __global__ void WeightedEmbeddingVarComputeFn( } float out = 0.0f; - - // #pragma unroll + float total_batch_weight = 0.0f; if (feature_num > 0) { for (int j = 0; j < feature_num; ++j) { size_t feature_indices = value_offset + j; int64_t embedding_offset = feature_indices * dimension; TValue sum = args[ev_id].emb_variable_[embedding_offset + tid]; TValue sp_weights = args[ev_id].sp_weights_[feature_indices]; + total_batch_weight += sp_weights; if (max_norm >= 0.0) { if (tid == 0) { l2_sum = 0.0; @@ -138,7 +138,7 @@ __global__ void WeightedEmbeddingVarComputeFn( } out = __fmaf_rn(sum, sp_weights, out); } - out = Combine(out, feature_num); + out = Combine(out, total_batch_weight); } 
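+ // Note the semantics change: the combiner now normalizes by the
+ // accumulated weight total rather than by the raw feature count, e.g.
+ // with Mean and weights {0.2, 0.3, 0.5} the weighted sum is divided by
+ // their sum 1.0 instead of by 3; Sum still leaves the value unchanged.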
args[ev_id].emb_vector_[bid * dimension + tid] = out; } @@ -169,7 +169,7 @@ __global__ void WeightedVariableComputeFn( } TValue out = 0.0f; - + TValue total_batch_weight = 0.0f; const TValue* emb_variable = args[ev_id].emb_variable_; // #pragma unroll if (feature_num > 0) { @@ -177,6 +177,7 @@ __global__ void WeightedVariableComputeFn( size_t feature_indices = value_offset + i; int embedding_indices = int(args[ev_id].sp_values_[feature_indices]); TValue sp_weights = args[ev_id].sp_weights_[embedding_indices]; + total_batch_weight += sp_weights; TValue emb_element = emb_variable[feature_indices]; if (max_norm >= 0.0f) { // calc l2 norm of this emb row(per block) and compare with @@ -196,7 +197,7 @@ __global__ void WeightedVariableComputeFn( } out = __fmaf_rn(emb_element, sp_weights, out); } - out = Combine(out, feature_num); + out = Combine(out, total_batch_weight); } args[ev_id].emb_vector_[bid * emb_vec_size + tid] = out; } @@ -227,7 +228,7 @@ __global__ void EmbeddingVarComputeFn( feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; } TValue out = 0.0; - + // #pragma unroll if (feature_num > 0) { for (int j = 0; j < feature_num; ++j) { @@ -247,7 +248,7 @@ __global__ void EmbeddingVarComputeFn( } out += sum; } - out = Combine(out, feature_num); + out = Combine(out, feature_num); } args[ev_id].emb_vector_[bid * dimension + tid] = out; } @@ -303,7 +304,7 @@ __global__ void VariableComputeFn( } out += emb_element; } - out = Combine(out, feature_num); + out = Combine(out, feature_num); } args[ev_id].emb_vector_[bid * emb_vec_size + tid] = out; } @@ -352,7 +353,7 @@ __global__ void NormalEmbeddingVarComputeFn( } out += sum; } - out = Combine(out, feature_num); + out = Combine(out, feature_num); } args[ev_id].emb_vector_[bid * dimension + tid] = out; } @@ -406,7 +407,7 @@ __global__ void NormalVariableComputeFn( } out += emb_element; } - out = Combine(out, feature_num); + out = Combine(out, feature_num); } args[ev_id].emb_vector_[bid * emb_vec_size + tid] = out; } @@ -434,8 +435,8 @@ __global__ void NormalWeightedEmbeddingVarComputeFn( } else { feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; } - TValue out = 0.0; - + TValue out = 0.0f; + TValue total_batch_weight = 0.0f; // #pragma unroll if (feature_num > 0) { for (int j = 0; j < feature_num; ++j) { @@ -443,6 +444,7 @@ __global__ void NormalWeightedEmbeddingVarComputeFn( int64_t embedding_offset = feature_indices * dimension; TValue sum = args[ev_id].emb_variable_[embedding_offset + tid]; TValue sp_weights = args[ev_id].sp_weights_[feature_indices]; + total_batch_weight += sp_weights; if (max_norm >= 0.0) { if (tid == 0) { l2_sum[0] = 0.0; @@ -457,7 +459,7 @@ __global__ void NormalWeightedEmbeddingVarComputeFn( } out = __fmaf_rn(sum, sp_weights, out); } - out = Combine(out, feature_num); + out = Combine(out, total_batch_weight); } args[ev_id].emb_vector_[bid * dimension + tid] = out; } @@ -485,7 +487,7 @@ __global__ void NormalWeightedVariableComputeFn( feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; } TValue out = 0.0f; - + TValue total_batch_weight = 0.0f; const TValue* emb_variable = args[ev_id].emb_variable_; // #pragma unroll @@ -496,6 +498,7 @@ __global__ void NormalWeightedVariableComputeFn( TValue emb_element = emb_variable[embedding_indices * emb_vec_size + tid]; TValue sp_weights = args[ev_id].sp_weights_[feature_indices]; + total_batch_weight += sp_weights; // printf("indices is %d emb_element is %f\n", indices, emb_element); if (max_norm >= 0.0f) { // calc l2 norm of this emb 
row(per block) and compare with @@ -515,7 +518,7 @@ __global__ void NormalWeightedVariableComputeFn( } out = __fmaf_rn(emb_element, sp_weights, out); } - out = Combine(out, feature_num); + out = Combine(out, feature_num); } args[ev_id].emb_vector_[bid * emb_vec_size + tid] = out; } @@ -556,7 +559,7 @@ class GroupEmbeddingLookupForWard { { if (tile_size <= 32) { - const int block_size = batch_size / 64 * tile_size + 1; + const int block_size = batch_size * tile_size / 64 + 1; compute_fn<<>>(batch_size, dimension_, max_norm_, ev_nums_, d_args_); } else { diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cc index bb9888f642b..e05aadbd350 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cc @@ -214,6 +214,7 @@ class GroupEmbeddingVariableLookupCpuOp int64 start, int64 end) { for (int64 i = start; i < end; ++i) { #if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + __m512 batch_total_weights = _mm512_set1_ps(0.0f); int tmp_length = (m_dimension + 15) / 16; __m512 tmp_embedding[tmp_length]; for (int i = 0; i < tmp_length; ++i) { @@ -221,14 +222,13 @@ class GroupEmbeddingVariableLookupCpuOp } int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; int batch_num = batch_nums[i] - batch_offset; - __m512 _bs = _mm512_set1_ps(batch_num); for (int j = 0; j < batch_num; ++j) { int unique_indice = unique_idx[batch_offset + j]; float *u_embedding = unique_embedding_data + unique_indice * m_dimension; __m512 _weights = _mm512_set1_ps(*(sp_weights + batch_offset + j)); - _weights = _mm512_div_ps(_weights, _bs); + batch_total_weights = _mm512_add_ps(batch_total_weights, _weights); for (int d = 0; d < m_dimension; d += 16) { int index = d / 16; int remain = m_dimension - d; @@ -239,14 +239,17 @@ class GroupEmbeddingVariableLookupCpuOp } } + if (batch_num == 0) batch_total_weights = _mm512_set1_ps(1.0f); for (int d = 0; d < m_dimension; d += 16) { int index = d / 16; int remain = m_dimension - d; __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + tmp_embedding[index] = _mm512_div_ps(tmp_embedding[index], batch_total_weights); _mm512_mask_storeu_ps(gather_embedding + i * m_dimension + d, mask, tmp_embedding[index]); } #else + TValue batch_total_weights = 0.0f; std::vector tmp_embedding(m_dimension, 0.0f); int batch_offset = i == 0 ? 
0 : batch_nums[i - 1]; int batch_num = batch_nums[i] - batch_offset; @@ -254,12 +257,18 @@ class GroupEmbeddingVariableLookupCpuOp int unique_indice = unique_idx[batch_offset + j]; float *u_embedding = unique_embedding_data + unique_indice * m_dimension; - TValue sp_weight = sp_weights[batch_offset + j] / batch_num; + TValue sp_weight = sp_weights[batch_offset + j]; + batch_total_weights += sp_weight; for (int d = 0; d < m_dimension; ++d) { tmp_embedding[d] = std::fma(*(u_embedding + d), sp_weight, tmp_embedding[d]); } } + + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] /= batch_total_weights; + } + memcpy(gather_embedding + i * m_dimension, tmp_embedding.data(), sizeof(float) * m_dimension); #endif @@ -331,6 +340,7 @@ class GroupEmbeddingVariableLookupCpuOp int64 start, int64 end) { for (int64 i = start; i < end; ++i) { #if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + TValue batch_total_weights = 0.0f; int tmp_length = (m_dimension + 15) / 16; __m512 tmp_embedding[tmp_length]; for (int i = 0; i < tmp_length; ++i) { @@ -338,14 +348,13 @@ class GroupEmbeddingVariableLookupCpuOp } int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; int batch_num = batch_nums[i] - batch_offset; - __m512 _bs = _mm512_set1_ps(sqrtf(batch_num)); for (int j = 0; j < batch_num; ++j) { int unique_indice = unique_idx[batch_offset + j]; float *u_embedding = unique_embedding_data + unique_indice * m_dimension; - __m512 _weights = - _mm512_set1_ps(*(sp_weights + batch_offset + j)); - _weights = _mm512_div_ps(_weights, _bs); + TValue local_weight = *(sp_weights + batch_offset + j); + __m512 _weights = _mm512_set1_ps(local_weight); + batch_total_weights = std::fma(local_weight, local_weight, batch_total_weights); for (int d = 0; d < m_dimension; d += 16) { int index = d / 16; int remain = m_dimension - d; @@ -355,15 +364,23 @@ class GroupEmbeddingVariableLookupCpuOp _item, _weights, tmp_embedding[index], mask); } } + __m512 _total_weights; + if (batch_num != 0) { + _total_weights = _mm512_set1_ps(sqrtf(batch_total_weights)); + } else { + _total_weights = _mm512_set1_ps(1.0f); + } for (int d = 0; d < m_dimension; d += 16) { int index = d / 16; int remain = m_dimension - d; __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + tmp_embedding[index] = _mm512_div_ps(tmp_embedding[index], _total_weights); _mm512_mask_storeu_ps(gather_embedding + i * m_dimension + d, mask, tmp_embedding[index]); } #else + TValue batch_total_weights = 0.0f; std::vector tmp_embedding(m_dimension, 0.0f); int batch_offset = i == 0 ? 
0 : batch_nums[i - 1]; int batch_num = batch_nums[i] - batch_offset; @@ -371,13 +388,23 @@ class GroupEmbeddingVariableLookupCpuOp int unique_indice = unique_idx[batch_offset + j]; float *u_embedding = unique_embedding_data + unique_indice * m_dimension; - TValue sp_weight = - sp_weights[batch_offset + j] / sqrtf(batch_num); + TValue sp_weight = sp_weights[batch_offset + j]; + batch_total_weights = std::fma(sp_weight, sp_weight, batch_total_weights); for (int d = 0; d < m_dimension; ++d) { tmp_embedding[d] = std::fma(u_embedding[d], sp_weight, tmp_embedding[d]); } } + + if (batch_num != 0) { + batch_total_weights = sqrtf(batch_total_weights); + } else { + batch_total_weights = 1.0f; + } + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] /= batch_total_weights; + } + memcpy(gather_embedding + i * m_dimension, tmp_embedding.data(), sizeof(float) * m_dimension); #endif @@ -488,6 +515,7 @@ class GroupVariableLookupCpuOp : public GroupLookupBaseCpuOp { embedding_variable](int64 start, int64 end) { for (int64 i = start; i < end; ++i) { #if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + __m512 batch_total_weights = _mm512_set1_ps(0.0f); int tmp_length = (m_dimension + 15) / 16; __m512 tmp_embedding[tmp_length]; for (int i = 0; i < tmp_length; ++i) { @@ -500,8 +528,7 @@ class GroupVariableLookupCpuOp : public GroupLookupBaseCpuOp { int unique_id = unique[unique_indice]; __m512 _weights = _mm512_set1_ps(*(sp_weights + batch_offset + j)); - __m512 _bs = _mm512_set1_ps(batch_num); - _weights = _mm512_div_ps(_weights, _bs); + batch_total_weights = _mm512_add_ps(batch_total_weights, _weights); const float *embedding_ptr = embedding_variable + unique_id * m_dimension; @@ -514,28 +541,34 @@ class GroupVariableLookupCpuOp : public GroupLookupBaseCpuOp { _item, _weights, tmp_embedding[index], mask); } } - + if (batch_num == 0) batch_total_weights = _mm512_set1_ps(1.0f); for (int d = 0; d < m_dimension; d += 16) { int index = d / 16; int remain = m_dimension - d; __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + tmp_embedding[index] = _mm512_div_ps(tmp_embedding[index], batch_total_weights); _mm512_mask_storeu_ps(emb_vectors + i * m_dimension + d, mask, tmp_embedding[index]); } #else + TValue batch_total_weights = 0.0f; std::vector tmp_embedding(m_dimension, 0.0f); int batch_offset = i == 0 ? 
0 : batch_nums[i - 1]; int batch_num = batch_nums[i] - batch_offset; for (int j = 0; j < batch_num; ++j) { int unique_indice = unique_idx[batch_offset + j]; int unique_id = unique[unique_indice]; - TValue sp_weight = sp_weights[batch_offset + j] / batch_num; + TValue sp_weight = sp_weights[batch_offset + j]; + batch_total_weights += sp_weight; for (int d = 0; d < m_dimension; ++d) { tmp_embedding[d] = std::fma(embedding_variable[unique_id * m_dimension + d], sp_weight, tmp_embedding[d]); } } + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] /= batch_total_weights; + } memcpy(emb_vectors + i * m_dimension, tmp_embedding.data(), sizeof(float) * m_dimension); #endif @@ -605,6 +638,7 @@ class GroupVariableLookupCpuOp : public GroupLookupBaseCpuOp { embedding_variable](int64 start, int64 end) { for (int64 i = start; i < end; ++i) { #if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + TValue batch_total_weights = 0.0f; int tmp_length = (m_dimension + 15) / 16; __m512 tmp_embedding[tmp_length]; for (int i = 0; i < tmp_length; ++i) { @@ -615,10 +649,9 @@ class GroupVariableLookupCpuOp : public GroupLookupBaseCpuOp { for (int j = 0; j < batch_num; ++j) { int unique_indice = unique_idx[batch_offset + j]; int unique_id = unique[unique_indice]; - __m512 _weights = - _mm512_set1_ps(*(sp_weights + batch_offset + j)); - __m512 _bs = _mm512_set1_ps(sqrtf(batch_num)); - _weights = _mm512_div_ps(_weights, _bs); + TValue local_weight = *(sp_weights + batch_offset + j); + __m512 _weights = _mm512_set1_ps(local_weight); + batch_total_weights = std::fma(local_weight, local_weight, batch_total_weights); const float *embedding_ptr = embedding_variable + unique_id * m_dimension; for (int d = 0; d < m_dimension; d += 16) { @@ -630,29 +663,43 @@ class GroupVariableLookupCpuOp : public GroupLookupBaseCpuOp { _item, _weights, tmp_embedding[index], mask); } } + + __m512 _total_weights; + if (batch_num != 0) { + _total_weights = _mm512_set1_ps(sqrtf(batch_total_weights)); + } else { + _total_weights = _mm512_set1_ps(1.0f); + } for (int d = 0; d < m_dimension; d += 16) { int index = d / 16; int remain = m_dimension - d; __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + tmp_embedding[index] = _mm512_div_ps(tmp_embedding[index], _total_weights); _mm512_mask_storeu_ps(emb_vectors + i * m_dimension + d, mask, tmp_embedding[index]); } #else + TValue batch_total_weights = 0.0f; std::vector tmp_embedding(m_dimension, 0.0f); int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; int batch_num = batch_nums[i] - batch_offset; for (int j = 0; j < batch_num; ++j) { int unique_indice = unique_idx[batch_offset + j]; int unique_id = unique[unique_indice]; - TValue sp_weight = - sp_weights[batch_offset + j] / sqrtf(batch_num); + TValue sp_weight = sp_weights[batch_offset + j]; + batch_total_weights = std::fma(sp_weight, sp_weight, batch_total_weights); for (int d = 0; d < m_dimension; ++d) { tmp_embedding[d] = std::fma(embedding_variable[unique_id * m_dimension + d], sp_weight, tmp_embedding[d]); } } + if (batch_num != 0) { + batch_total_weights = sqrtf(batch_total_weights); + } else { + batch_total_weights = 1.0f; + } memcpy(emb_vectors + i * m_dimension, tmp_embedding.data(), sizeof(float) * m_dimension); #endif From 9f6308d5ed8aa7c3d6d2ad113c95937b0fb8ad5e Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Wed, 7 Jun 2023 17:09:53 +0800 Subject: [PATCH 18/91] [Runtime] Dispatch expensive ops via multiple threads in threadpool.
(#884) If there are lots of expensive ops that are ready at the same time, use multiple child threads to dispatch these ready ops. This is because dispatching ops on thread pool can be expensive. Signed-off-by: Tao Peng --- tensorflow/core/common_runtime/executor.cc | 56 +++++++++++++++++++--- tensorflow/core/framework/op_kernel.h | 5 +- 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 2d473b1b6b4..fd38329a1fa 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/executor.h" +#include #include #include #include @@ -350,6 +351,11 @@ class ExecutorState { // The deadline for the session to complete by. Empty if unspecified. absl::optional deadline_; + // Maximum number of kernels that can be scheduled inline. If lots of kernels + // are ready at the same time, scheduling them in one thread can be very slow. + // TODO(fishx): Make it configurable if necessary. + static constexpr uint64 kInlineScheduleReadyThreshold = 500; + // Not owned. //RendezvousInterface* rendezvous_; Rendezvous* rendezvous_; @@ -1339,6 +1345,7 @@ void ExecutorState::ScheduleReady( { const TaggedNode* curr_expensive_node = nullptr; + TaggedNodeSeq expensive_nodes; if (inline_ready == nullptr) { // Schedule to run all the ready ops in thread pool. for (auto& tagged_node : *ready) { @@ -1352,10 +1359,8 @@ void ExecutorState::ScheduleReady( inline_ready->push_back(tagged_node); } else { if (curr_expensive_node) { - // Dispatch to another thread since there is plenty of work to - // do for this thread. - RunTask(std::bind(&ExecutorState::Process, this, - *curr_expensive_node, scheduled_nsec)); + // push_back expensive nodes, we will schedule them later. + expensive_nodes.push_back(*curr_expensive_node); } curr_expensive_node = &tagged_node; } @@ -1367,8 +1372,47 @@ void ExecutorState::ScheduleReady( } else { // There are inline nodes to run already. We dispatch this expensive // node to other thread. - RunTask(std::bind(&ExecutorState::Process, this, *curr_expensive_node, - scheduled_nsec)); + expensive_nodes.push_back(*curr_expensive_node); + } + } + + if (!expensive_nodes.empty()) { + if (expensive_nodes.size() < kInlineScheduleReadyThreshold) { + for (auto& tagged_node : expensive_nodes) { + RunTask(std::bind(&ExecutorState::Process, this, tagged_node, + scheduled_nsec)); + } + } else { + // There are too many ready expensive nodes. Schedule them in child + // threads. + // TODO(fishx): Apply the same optimization to cheap ops as well since + // executing lots of cheap ops in one thread can potentially be the + // bottleneck as well.
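+        // With kInlineScheduleReadyThreshold = 500, e.g. 1200 ready expensive
+        // nodes are split into chunks of 500, 500 and 200; each chunk becomes
+        // one task on a child thread, and that task re-issues one RunTask per
+        // node, spreading the dispatch cost across threads instead of paying
+        // it all in this thread.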
+ auto it = expensive_nodes.begin(); + while (it < expensive_nodes.end()) { + auto end = it; + std::advance(end, kInlineScheduleReadyThreshold); + if (end > expensive_nodes.end()) { + end = expensive_nodes.end(); + } + TaggedNodeSeq ready_chunk{it, end}; + RunTask( + [this, ready_chunk = std::move(ready_chunk), scheduled_nsec]() { + profiler::TraceMe activity( + [&]() { + return strings::StrCat( + "ExecutorState::ScheduleReady::" + "ChildThreadExpensiveNodes#", + "ready_chunk_size=", ready_chunk.size(), "#"); + }, + profiler::GetTFTraceMeLevel(/*is_expensive=*/false)); + for (auto& tagged_node : ready_chunk) { + RunTask(std::bind(&ExecutorState::Process, this, tagged_node, + scheduled_nsec)); + } + }); + it = end; + } } } } diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h index f6a6f798305..65ad40bbbf8 100644 --- a/tensorflow/core/framework/op_kernel.h +++ b/tensorflow/core/framework/op_kernel.h @@ -159,9 +159,8 @@ class OpKernel { // updates may result in one or more updates being ignored. This does not // affect correctness but may slow down the update frequency. cost_estimate_.store( - (kCostDecay - 1) * cost_estimate_.load(std::memory_order_relaxed) / - kCostDecay + - (elapsed_cycles / kCostDecay), + ((kCostDecay - 1) * cost_estimate_.load(std::memory_order_relaxed) + + elapsed_cycles) / kCostDecay, std::memory_order_relaxed); } From a6a797f1768ace4a00113018d06c440ee3a6c4e9 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Thu, 8 Jun 2023 15:33:46 +0800 Subject: [PATCH 19/91] [Runtime] Fix memory leak when a graph node is invalid. (#885) Signed-off-by: Tao Peng --- tensorflow/core/common_runtime/immutable_executor_state.cc | 1 + tensorflow/core/graph/graph_constructor.cc | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/immutable_executor_state.cc b/tensorflow/core/common_runtime/immutable_executor_state.cc index 3f05fb65250..661c62417cc 100644 --- a/tensorflow/core/common_runtime/immutable_executor_state.cc +++ b/tensorflow/core/common_runtime/immutable_executor_state.cc @@ -133,6 +133,7 @@ Status ImmutableExecutorState::Initialize() { Status s = params_.create_kernel(n->def(), &item->kernel); if (!s.ok()) { + params_.delete_kernel(item->kernel); item->kernel = nullptr; s = AttachDef(s, *n); return s; diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index 9ab9aa7cae0..f42e5fb12ad 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -1062,7 +1062,7 @@ void GraphConstructor::DFS(int cur_node, std::vector* cur_branch, std::find(cur_branch->begin(), cur_branch->end(), next_node); LOG(WARNING) << "Cycle detected:"; while (iter != cur_branch->end()) { - LOG(WARNING) << SummarizeNodeDef(get_node_def(*iter)); + LOG(WARNING) << *iter; ++iter; } LOG(WARNING) << "End of cycle"; From 21764bb040d31884b2cef417aec5338569c04a99 Mon Sep 17 00:00:00 2001 From: Tongxuan Liu Date: Thu, 8 Jun 2023 16:55:13 +0800 Subject: [PATCH 20/91] [CIBUILD] Update default TF_CUDA_COMPUTE_CAPABILITIES to 7.0,7.5,8.0,8.6 Signed-off-by: Tongxuan Liu --- configure.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.py b/configure.py index cfbd09305aa..362479981b2 100644 --- a/configure.py +++ b/configure.py @@ -36,7 +36,7 @@ _DEFAULT_CUDA_VERSION = '11' _DEFAULT_CUDNN_VERSION = '8' _DEFAULT_TENSORRT_VERSION = '8' -_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '6.0,6.1,7.0,7.5,8.0' 
+_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '7.0,7.5,8.0,8.6'
 
 _TF_OPENCL_VERSION = '1.2'
 _DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'

From f5e1247c4ff6bb9b9037a7f111ff63be0d4307ae Mon Sep 17 00:00:00 2001
From: Tongxuan Liu
Date: Thu, 8 Jun 2023 17:00:07 +0800
Subject: [PATCH 21/91] [Docs] Update COMMITTERS.md.

Signed-off-by: Tongxuan Liu
---
 COMMITTERS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/COMMITTERS.md b/COMMITTERS.md
index 881dc88b2f5..21854a3ab4c 100644
--- a/COMMITTERS.md
+++ b/COMMITTERS.md
@@ -16,6 +16,7 @@
 - Wei LIN, weilin.lw@alibaba-inc.com, Alibaba
 - Ruozhou ZANG, zrzn@foxmail.com, MetaApp
 - Jin OUYANG, oyjmical@gmail.com, Kuaishou
+- Yue SONG, 13810271944@163.com, Kuaishou
 - Changqing LI, changqing.li@intel.com, Intel
 - Pujiang HE, pujiang.he@intel.com, Intel
 - Weifei YU, weifei.yu@intel.com, Intel

From b40b8b8dfc4223aedff7ea1e5e329753fcd905ef Mon Sep 17 00:00:00 2001
From: Tongxuan Liu
Date: Thu, 8 Jun 2023 17:00:17 +0800
Subject: [PATCH 22/91] [Docs] Update cases of configuring
 TF_CUDA_COMPUTE_CAPABILITIES for H100.

Signed-off-by: Tongxuan Liu
---
 docs/docs_en/DeepRec-Compile-And-Install.md | 7 +++++--
 docs/docs_zh/DeepRec-Compile-And-Install.md | 7 ++++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/docs/docs_en/DeepRec-Compile-And-Install.md b/docs/docs_en/DeepRec-Compile-And-Install.md
index 1fbe923b30d..0a170177353 100644
--- a/docs/docs_en/DeepRec-Compile-And-Install.md
+++ b/docs/docs_en/DeepRec-Compile-And-Install.md
@@ -47,12 +47,15 @@ Configure TF_CUDA_COMPUTE_CAPABILITIES could improve performance, please follow
 | Volta (V100)       | 7.0     |
 | Turing (T4)        | 7.5     |
 | Ampere (A10, A100) | 8.0+8.6 |
+| Hopper (H100, H800) | 9.0    |
 
-If you need to compile DeepRec wheel that run on different GPU architecture, configure TF_CUDA_COMPUTE_CAPABILITIES such as:
+If you need to compile a DeepRec wheel that runs on different GPU architectures, configure TF_CUDA_COMPUTE_CAPABILITIES; by default TF_CUDA_COMPUTE_CAPABILITIES is "7.0,7.5,8.0,8.6" (the CI build uses an A10 card).
+
+For example, if you want to run DeepRec on H100 and A100 GPU cards, set TF_CUDA_COMPUTE_CAPABILITIES as follows:
 
 ```bash
-export TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.0"
+export TF_CUDA_COMPUTE_CAPABILITIES="8.0,8.6,9.0"
 ```
 
 **Configuration**
diff --git a/docs/docs_zh/DeepRec-Compile-And-Install.md b/docs/docs_zh/DeepRec-Compile-And-Install.md
index b526ce9f2d8..20df07aa252 100644
--- a/docs/docs_zh/DeepRec-Compile-And-Install.md
+++ b/docs/docs_zh/DeepRec-Compile-And-Install.md
@@ -46,13 +46,14 @@
 | Volta (V100)       | 7.0     |
 | Turing (T4)        | 7.5     |
 | Ampere (A10, A100) | 8.0+8.6 |
+| Hopper (H100, H800) | 9.0    |
 
-如果希望编译出支持不同GPU卡上执行的版本，可以配置多个值，比如DeepRec中默认配置为"6.0,6.1,7.0,7.5,8.0"
+如果希望编译出支持不同GPU卡上执行的版本，可以配置多个值，比如DeepRec中默认配置为"7.0,7.5,8.0,8.6" (当前CIBUILD使用A10卡)
 
-比如配置环境变量TF_CUDA_COMPUTE_CAPABILITIES方法:
+如果编译的DeepRec需要执行在H100和A100的GPU卡上，配置环境变量TF_CUDA_COMPUTE_CAPABILITIES:
 
 ```bash
-export TF_CUDA_COMPUTE_CAPABILITIES="7.5,8.0"
+export TF_CUDA_COMPUTE_CAPABILITIES="8.0,8.6,9.0"
 ```
 
 ```bash

From 7e1db924fa91a393fb5f3de9833b37590f524038 Mon Sep 17 00:00:00 2001
From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com>
Date: Fri, 9 Jun 2023 15:48:36 +0800
Subject: [PATCH 23/91] [Embedding] Fix issue of missing params while
 constructing the GroupEmbeddingScope.
(#887) Signed-off-by: JunqiHu --- tensorflow/python/feature_column/feature_column_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py index 7ce730f9f38..eb3cb3bc702 100644 --- a/tensorflow/python/feature_column/feature_column_v2.py +++ b/tensorflow/python/feature_column/feature_column_v2.py @@ -4243,7 +4243,7 @@ def group_embedding_column_scope(name='', params_num_per_group=sys.maxsize): else: name = "group_embedding_column_scope" if len(global_group_embedding_scope) == 0: - fusion_embedding_scope = GroupEmbeddingScope(name) + fusion_embedding_scope = GroupEmbeddingScope(name, params_num_per_group) global_group_embedding_scope.append(fusion_embedding_scope) else: fusion_embedding_scope = global_group_embedding_scope.pop(-1) From 087a332323037642c466c48ed60f155e6e1d0f5b Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Wed, 14 Jun 2023 18:31:07 +0800 Subject: [PATCH 24/91] [Graph] Fix the device placement bug of stage_subgraph_on_cpu in distributed scenarios. (#891) Signed-off-by: chenbangduo.cbd --- .../gpu/gpu_stage_subgraph_on_cpu_pass.cc | 29 ++++--------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/tensorflow/core/common_runtime/gpu/gpu_stage_subgraph_on_cpu_pass.cc b/tensorflow/core/common_runtime/gpu/gpu_stage_subgraph_on_cpu_pass.cc index 1fdecdcee18..eccfe193ccf 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_stage_subgraph_on_cpu_pass.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_stage_subgraph_on_cpu_pass.cc @@ -48,43 +48,26 @@ class StageSubGraphOnCPUPass : public GraphOptimizationPass { std::unique_ptr new_graph(new Graph(OpRegistry::Global())); CopyGraph(*graph, new_graph.get()); - // Get CPU Device - std::string cpu_device_name=""; - const DeviceSet* device_set = options.device_set; - GetCPUDevice(cpu_device_name, device_set); - if (cpu_device_name.empty()) { - LOG(INFO) << "Failed to Get CPU Device. " - << "StageSubGraphOnCPU Optimization is disabled."; - return Status::OK(); - } - // Place Stage SubGraph on CPU. - PlaceStageSubGraphOnCPU(cpu_device_name, new_graph.get()); + PlaceStageSubGraphOnCPU(new_graph.get()); options.graph->swap(new_graph); return Status::OK(); } private: - void GetCPUDevice(std::string& cpu_device_name, const DeviceSet* device_set) { - const auto& devices = device_set->devices(); - for (auto iter = devices.begin(); iter != devices.end(); iter++) { - if ((*iter)->device_type() == "CPU") { - cpu_device_name = (*iter)->name(); - return; - } - } - } - void PlaceStageSubGraphOnCPU(const std::string& cpu_device_name, - Graph* graph) { + void PlaceStageSubGraphOnCPU(Graph* graph) { for (Node* n : graph->op_nodes()) { if (n->IsStage()) { std::vector start_node; for (const Edge* e : n->in_edges()) start_node.emplace_back(e->src()); - auto set_stage_subgraph_node_device = [cpu_device_name](Node *node) { + auto set_stage_subgraph_node_device = [](Node *node) { + std::string cpu_device_name; + TF_CHECK_OK(DeviceNameUtils::DeviceNameToCpuDeviceName( + node->assigned_device_name(), &cpu_device_name)); node->set_assigned_device_name(cpu_device_name); }; ReverseDFSFrom(*graph, start_node, From 4bc502709a4ac2b0c51cae4f7c05b2d4576e5fe0 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Fri, 16 Jun 2023 10:47:10 +0800 Subject: [PATCH 25/91] [Allocator] Optimize EV allocator performance. 
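Before the allocator patch continues below, a note on the stage-subgraph placement fix above (#891): it stops picking an arbitrary CPU device out of the `DeviceSet` and instead derives each node's task-local CPU device from that node's assigned device name, which is what keeps placement correct in distributed jobs. An illustrative call to the helper it relies on (`DeviceNameUtils::DeviceNameToCpuDeviceName` is existing TensorFlow API; the device string here is made up):

```cpp
#include <string>
#include "tensorflow/core/util/device_name_utils.h"

int main() {
  std::string cpu_name;
  // Keeps the /job:worker/replica:0/task:3 prefix and swaps in CPU:0,
  // so the stage subgraph stays on the same task as the original node.
  TF_CHECK_OK(tensorflow::DeviceNameUtils::DeviceNameToCpuDeviceName(
      "/job:worker/replica:0/task:3/device:GPU:1", &cpu_name));
  // cpu_name should now be "/job:worker/replica:0/task:3/device:CPU:0".
  return 0;
}
```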
(#893)

Signed-off-by: Tao Peng
---
 tensorflow/core/BUILD                    |  2 +
 tensorflow/core/framework/ev_allocator.h | 67 ++++++++++++++++++------
 tensorflow/workspace.bzl                 | 11 ++++
 third_party/readerwriterqueue.BUILD      |  5 +-
 4 files changed, 66 insertions(+), 19 deletions(-)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 777cde4a40d..dde1098ebae 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2946,6 +2946,7 @@ tf_cuda_library(
         ":framework_internal_headers_lib",
         "//third_party/eigen3",
         ":lib",
+        "@readerwriterqueue_archive//:readerwriterqueue",
     ] + if_static(
         extra_deps = [
             ":framework_internal_impl",
@@ -3062,6 +3063,7 @@ tf_cuda_library(
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
+        "@readerwriterqueue_archive//:readerwriterqueue",
         "//tensorflow/core/platform/default/build_config:platformlib",
         "//tensorflow/core:framework/bfloat16",
         "//tensorflow/core:framework/numeric_types",
diff --git a/tensorflow/core/framework/ev_allocator.h b/tensorflow/core/framework/ev_allocator.h
index fd2a20a2b92..5028d45c4d9 100644
--- a/tensorflow/core/framework/ev_allocator.h
+++ b/tensorflow/core/framework/ev_allocator.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include
 #include
 #include
+#include
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/framework/tracking_allocator.h"
@@ -163,26 +164,58 @@ class FreeList {
   }
 
   int PopBatch(int N, void** ret) {
-    if (list_.size() >= N) {
-      for (int i = 0; i < N; ++i) {
-        ret[i] = list_.back();
-        list_.pop_back();
-      }
-      return N;
-    } else {
-      auto loop = list_.size();
-      for (int i = 0; i < loop; ++i) {
-        ret[i] = list_.back();
-        list_.pop_back();
-      }
-      return loop;
+    int count = list_.size();
+    if (count > N) {
+      count = N;
+    }
+    for (int i = 0; i < count; ++i) {
+      ret[i] = list_.back();
+      list_.pop_back();
     }
+
+    return count;
   }
 
  private:
  std::list list_;
 };
 
+class FreeQueue {
+ public:
+  void Push(void* ptr) {
+    q_.enqueue(ptr);
+  }
+
+  bool TryPop(void** ret) {
+    return q_.try_dequeue(*ret);
+  }
+
+  // PushBatch and PopBatch do not guarantee an ordering.
+  void PushBatch(int N, void** ptrs) {
+    for (int i = 0; i < N; ++i) {
+      q_.enqueue(ptrs[i]);
+    }
+  }
+
+  int PopBatch(int N, void** ret) {
+    int pop_count = 0;
+    while (pop_count < N) {
+      bool succeeded = q_.try_dequeue(ret[pop_count]);
+      if (!succeeded) {
+        break;
+      }
+      ++pop_count;
+    }
+
+    return pop_count;
+  }
+
+ private:
+  // NOTE(TODO): Consider using concurrentqueue instead, so that we can
+  // delete the mutex in Bin.
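`FreeQueue` above wraps moodycamel's `ReaderWriterQueue` (whose member declaration follows below), a lock-free single-producer/single-consumer queue added to the workspace later in this patch. A minimal usage sketch of the two calls the wrapper relies on:

```cpp
#include <cstdio>
#include "readerwriterqueue.h"  // from the readerwriterqueue_archive dep

int main() {
  moodycamel::ReaderWriterQueue<void*> q;
  int slot = 0;
  q.enqueue(&slot);           // producer side; does not block on a lock
  void* out = nullptr;
  if (q.try_dequeue(out)) {   // consumer side; false when the queue is empty
    std::printf("popped %p\n", out);
  }
  return 0;
}
```

Note that `Bin` still guards the queue with its mutex, hence the NOTE above about switching to the multi-producer `concurrentqueue` to drop that lock.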
+ moodycamel::ReaderWriterQueue q_; +}; + template class Chunk { public: @@ -266,7 +299,7 @@ class Bin { size_t BatchAllocate(size_t num, void** ret) { mutex_lock l(mu_); - auto allocated = free_list_.PopBatch(num, ret); + auto allocated = free_queue_.PopBatch(num, ret); auto remains = num - allocated; if (remains == 0) { return num; @@ -304,7 +337,7 @@ class Bin { void BatchDeallocate(std::vector &ptrs) { mutex_lock l(mu_); - free_list_.PushBatch(ptrs.size(), ptrs.data()); + free_queue_.PushBatch(ptrs.size(), ptrs.data()); } size_t BinSize() const { @@ -325,7 +358,7 @@ class Bin { PageMap* page_map_ = nullptr GUARDED_BY(mu_); Chunk* current_chunk_ = nullptr GUARDED_BY(mu_); - FreeList free_list_ GUARDED_BY(mu_); + FreeQueue free_queue_ GUARDED_BY(mu_); std::vector*> chunks_ GUARDED_BY(mu_); }; @@ -354,7 +387,7 @@ class Arena { bin = it->second; } } - + return bin->BatchAllocate(num, ret); } diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 5bd123ce00f..9e99cd19904 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -329,6 +329,17 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ], ) + tf_http_archive( + name = "readerwriterqueue_archive", + build_file = clean_dep("//third_party:readerwriterqueue.BUILD"), + sha256 = "fc68f55bbd49a8b646462695e1777fb8f2c0b4f342d5e6574135211312ba56c1", + strip_prefix = "readerwriterqueue-1.0.6", + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/cameron314/readerwriterqueue/archive/v1.0.6.tar.gz", + "https://github.com/cameron314/readerwriterqueue/archive/v1.0.6.tar.gz", + ], + ) + tf_http_archive( name = "png_archive", # PNG Reference Library License build_file = clean_dep("//third_party:png.BUILD"), diff --git a/third_party/readerwriterqueue.BUILD b/third_party/readerwriterqueue.BUILD index afcf230b38e..f87ee6df79c 100644 --- a/third_party/readerwriterqueue.BUILD +++ b/third_party/readerwriterqueue.BUILD @@ -3,7 +3,8 @@ package(default_visibility = ["//visibility:public"]) cc_library( name = "readerwriterqueue", hdrs = [ - "atomicops.h", "readerwriterqueue.h", - ], + "atomicops.h"], + includes = ["."], + visibility = ["//visibility:public"], ) From c18f3f909254fa4fc43fb603ad57c8ad57a73758 Mon Sep 17 00:00:00 2001 From: Duyi-Wang Date: Fri, 16 Jun 2023 11:01:43 +0800 Subject: [PATCH 26/91] [Grappler] Disable MatMul fused with LeakyRule when MKL is disabled. (#892) Signed-off-by: Duyi-Wang --- tensorflow/core/grappler/optimizers/remapper.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index 6d944738d8a..97b244bccae 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -531,9 +531,13 @@ bool FindContractionWithBiasAndActivation( if (!IsMatMul(*contraction_node_def) && IsGelu(*node_def)) return false; // Currently, only (conv | matmul) + bias + leakyrelu is enabled +#ifdef INTEL_MKL if ((!IsConv2D(*contraction_node_def) && !IsMatMul(*contraction_node_def)) && IsLeakyRelu(*node_def)) return false; +#else + if (!IsConv2D(*contraction_node_def) && IsLeakyRelu(*node_def)) return false; +#endif //! 
INTEL_MKL // Currently, only matmul + bias + tanh is enable if (!IsMatMul(*contraction_node_def) && IsTanh(*node_def)) return false; From 43a7a055663e3b63bcd4af5abb5b30905a1e48e0 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Mon, 19 Jun 2023 20:49:31 +0800 Subject: [PATCH 27/91] [Embedding] Move insertions of new features into the backward process when using HBM multi-tier storage. (#883) Signed-off-by: lixy9474 --- tensorflow/core/BUILD | 3 +- .../core/framework/embedding/batch.cu.cc | 36 +- tensorflow/core/framework/embedding/batch.h | 11 +- .../framework/embedding/bloom_filter_policy.h | 36 +- tensorflow/core/framework/embedding/cache.h | 1 + .../core/framework/embedding/cache_factory.h | 1 + .../embedding/counter_filter_policy.h | 37 +- .../framework/embedding/embedding_var.cu.cc | 26 +- .../core/framework/embedding/embedding_var.h | 78 ++- .../embedding/embedding_var_context.h | 2 + .../core/framework/embedding/filter_policy.h | 10 +- .../embedding/hbm_dram_ssd_storage.h | 180 +++++-- .../framework/embedding/hbm_dram_storage.h | 141 +++++- .../intra_thread_copy_id_allocator.h | 71 +++ .../embedding/multi_tier_storage.cu.cc | 137 ++++++ .../framework/embedding/multi_tier_storage.h | 11 + .../embedding/nullable_filter_policy.h | 44 +- tensorflow/core/framework/embedding/storage.h | 21 + .../core/framework/embedding/storage_config.h | 4 +- tensorflow/core/kernels/BUILD | 3 +- .../kernels/embedding_variable_ops_test.cc | 33 +- ..._embedding_lookup_sparse_forward_ops.cu.cc | 125 +---- .../core/kernels/kv_variable_lookup_ops.cc | 189 +------ .../core/kernels/training_ali_op_helpers.h | 103 ++-- tensorflow/core/kernels/training_ali_ops.cc | 465 ++++-------------- .../python/ops/embedding_variable_ops_test.py | 2 - tensorflow/python/ops/kv_variable_ops.py | 3 +- 27 files changed, 955 insertions(+), 818 deletions(-) create mode 100644 tensorflow/core/framework/embedding/intra_thread_copy_id_allocator.h create mode 100644 tensorflow/core/framework/embedding/multi_tier_storage.cu.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index dde1098ebae..925f164d8b8 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -3025,7 +3025,8 @@ tf_cuda_library( "framework/embedding/batch.h", "framework/embedding/gpu_hash_table.cu.cc", "framework/embedding/gpu_hash_table.h", - "framework/embedding/embedding_var.cu.cc" + "framework/embedding/embedding_var.cu.cc", + "framework/embedding/multi_tier_storage.cu.cc" ], ) + select({ "//tensorflow:windows": [], diff --git a/tensorflow/core/framework/embedding/batch.cu.cc b/tensorflow/core/framework/embedding/batch.cu.cc index efb48595f00..929fcfcbee5 100644 --- a/tensorflow/core/framework/embedding/batch.cu.cc +++ b/tensorflow/core/framework/embedding/batch.cu.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/core/framework/register_types.h" namespace tensorflow { +namespace embedding { template __global__ void BatchCopy(V** batch, V* val_base, int value_len, int limit) { @@ -57,6 +58,24 @@ TF_CALL_int32(REGISTER_KERNELS_ALL_INDEX) TF_CALL_int64(REGISTER_KERNELS_ALL_INDEX) #undef REGISTER_KERNELS_ALL_INDEX +template +__global__ void CopyEmbedding(V** batch, V** batch_data_space, + int total_dims, int limit) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int item_id = i / total_dims; + int item_pos = i % total_dims; + + if (i < limit * total_dims) { + *(batch_data_space[item_id] + item_pos) = *(batch[item_id] + item_pos); + } +} + +#define REGISTER_KERNELS_ALL_INDEX(T) \ + template __global__ void CopyEmbedding(T**, T**, int, int); +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +} // namespace embedding + template __global__ void SparseApplyAdagradGPU(V** a, V** v, const V* g, V lr, int embedding_dim, long long int limit) { @@ -199,22 +218,5 @@ __global__ void SparseApplyAdamWGPU(V** var, V** m, V** v, TF_CALL_float(REGISTER_KERNELS_ALL_INDEX) TF_CALL_double(REGISTER_KERNELS_ALL_INDEX) #undef REGISTER_KERNELS_ALL_INDEX - -template -__global__ void CopyEmbedding(V** batch, V** batch_data_space, - int total_dims, int limit) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - int item_id = i / total_dims; - int item_pos = i % total_dims; - - if (i < limit * total_dims) { - *(batch_data_space[item_id] + item_pos) = *(batch[item_id] + item_pos); - } -} - -#define REGISTER_KERNELS_ALL_INDEX(T) \ - template __global__ void CopyEmbedding(T**, T**, int, int); -TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) -#undef REGISTER_KERNELS_ALL_INDEX } // namespace tensorflow #endif // GOOGLE_CUDA diff --git a/tensorflow/core/framework/embedding/batch.h b/tensorflow/core/framework/embedding/batch.h index 7580349fe21..a259ec20eaa 100644 --- a/tensorflow/core/framework/embedding/batch.h +++ b/tensorflow/core/framework/embedding/batch.h @@ -18,6 +18,7 @@ limitations under the License. 
#if GOOGLE_CUDA namespace tensorflow { +namespace embedding { template __global__ void BatchCopy(V** batch, V* val_base, int value_len, @@ -27,6 +28,11 @@ template __global__ void BatchUnpack(V** dev_value_address, V* memcpy_buffer_gpu, int value_len, int limit); +template +__global__ void CopyEmbedding(V** batch, V** batch_data_space, + int total_dims, int limit); +} //namespace embedding + template __global__ void SparseApplyAdagradGPU(V** a, V** v, const V* g, V lr, int embedding_dim, long long int limit); @@ -53,11 +59,6 @@ template __global__ void SparseApplyAdamWGPU(V** var, V** m, V** v, const V* g, V alpha, V beta1, V beta2, V epsilon, V weight_decay, int embedding_dim, long long int limit); - -template -__global__ void CopyEmbedding(V** batch, V** batch_data_space, - int total_dims, int limit); - } // namespace tensorflow #endif // GOOGLE_CUDA diff --git a/tensorflow/core/framework/embedding/bloom_filter_policy.h b/tensorflow/core/framework/embedding/bloom_filter_policy.h index 22fb45e78d5..1ac76b51fc1 100644 --- a/tensorflow/core/framework/embedding/bloom_filter_policy.h +++ b/tensorflow/core/framework/embedding/bloom_filter_policy.h @@ -57,7 +57,7 @@ class BloomFilterPolicy : public FilterPolicy { GenerateSeed(config.kHashFunc); } - Status Lookup(EV* ev, K key, V* val, const V* default_value_ptr, + Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { ValuePtr* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); @@ -70,6 +70,40 @@ class BloomFilterPolicy : public FilterPolicy { return Status::OK(); } +#if GOOGLE_CUDA + void BatchLookup(const EmbeddingVarContext& ctx, + const K* keys, V* output, + int64 num_of_keys, + V* default_value_ptr, + V* default_value_no_permission) override { + std::vector*> value_ptr_list(num_of_keys, nullptr); + ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); + std::vector embedding_ptr(num_of_keys, nullptr); + auto do_work = [this, value_ptr_list, &embedding_ptr, + default_value_ptr, default_value_no_permission] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + ValuePtr* value_ptr = value_ptr_list[i]; + if (value_ptr != nullptr) { + embedding_ptr[i] = + ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + } else { + embedding_ptr[i] = default_value_no_permission; + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, + 1000, do_work); + auto stream = ctx.compute_stream; + auto event_mgr = ctx.event_mgr; + ev_->CopyEmbeddingsToBuffer( + output, num_of_keys, embedding_ptr.data(), + stream, event_mgr, ctx.gpu_device); + } +#endif //GOOGLE_CUDA + void LookupOrCreate(K key, V* val, const V* default_value_ptr, ValuePtr** value_ptr, int count, const V* default_value_no_permission) override { diff --git a/tensorflow/core/framework/embedding/cache.h b/tensorflow/core/framework/embedding/cache.h index df5870f71a7..67553e8738f 100644 --- a/tensorflow/core/framework/embedding/cache.h +++ b/tensorflow/core/framework/embedding/cache.h @@ -34,6 +34,7 @@ class BatchCache { update((K*)t.data(), t.NumElements(), nullptr, (int64*)counts_tensor.data()); } + virtual size_t get_evic_ids(K* evic_ids, size_t k_size) = 0; virtual size_t get_cached_ids(K* cached_ids, size_t k_size, int64* cached_versions, diff --git a/tensorflow/core/framework/embedding/cache_factory.h b/tensorflow/core/framework/embedding/cache_factory.h index bc1d339bcef..8d4cf68261c 100644 --- 
a/tensorflow/core/framework/embedding/cache_factory.h +++ b/tensorflow/core/framework/embedding/cache_factory.h @@ -16,6 +16,7 @@ limitations under the License. #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_FACTORY_H_ #include "cache.h" +#include "tensorflow/core/framework/embedding/config.pb.h" namespace tensorflow { namespace embedding { diff --git a/tensorflow/core/framework/embedding/counter_filter_policy.h b/tensorflow/core/framework/embedding/counter_filter_policy.h index 14d18f65605..5d0711585b4 100644 --- a/tensorflow/core/framework/embedding/counter_filter_policy.h +++ b/tensorflow/core/framework/embedding/counter_filter_policy.h @@ -28,7 +28,7 @@ class CounterFilterPolicy : public FilterPolicy { : config_(config), ev_(ev){ } - Status Lookup(EV* ev, K key, V* val, const V* default_value_ptr, + Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { ValuePtr* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); @@ -41,6 +41,41 @@ class CounterFilterPolicy : public FilterPolicy { return Status::OK(); } +#if GOOGLE_CUDA + void BatchLookup(const EmbeddingVarContext& ctx, + const K* keys, V* output, + int64 num_of_keys, + V* default_value_ptr, + V* default_value_no_permission) override { + std::vector*> value_ptr_list(num_of_keys, nullptr); + ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); + std::vector embedding_ptr(num_of_keys, nullptr); + auto do_work = [this, keys, value_ptr_list, &embedding_ptr, + default_value_ptr, default_value_no_permission] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + ValuePtr* value_ptr = value_ptr_list[i]; + int64 freq = GetFreq(keys[i], value_ptr); + if (value_ptr != nullptr && freq >= config_.filter_freq) { + embedding_ptr[i] = + ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + } else { + embedding_ptr[i] = default_value_no_permission; + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, + 1000, do_work); + auto stream = ctx.compute_stream; + auto event_mgr = ctx.event_mgr; + ev_->CopyEmbeddingsToBuffer( + output, num_of_keys, embedding_ptr.data(), + stream, event_mgr, ctx.gpu_device); + } +#endif //GOOGLE_CUDA + void LookupOrCreate(K key, V* val, const V* default_value_ptr, ValuePtr** value_ptr, int count, const V* default_value_no_permission) override { diff --git a/tensorflow/core/framework/embedding/embedding_var.cu.cc b/tensorflow/core/framework/embedding/embedding_var.cu.cc index 94c90fa3c62..0c0be83ec1d 100644 --- a/tensorflow/core/framework/embedding/embedding_var.cu.cc +++ b/tensorflow/core/framework/embedding/embedding_var.cu.cc @@ -45,9 +45,7 @@ void SyncWithEventMgr(se::Stream* stream, template void EmbeddingVar::SetDefaultValueOfNewFeatures( const K* keys, int64 size, const std::list& init_cursor, - V** memcpy_address, V* default_values, - std::function get_default_v_fn, - se::Stream* compute_stream, EventMgr* event_mgr, + V** memcpy_address, se::Stream* compute_stream, EventMgr* event_mgr, const Eigen::GpuDevice& gpu_device) { if (init_cursor.size() > 0) { int64 total = init_cursor.size(); @@ -67,15 +65,17 @@ void EmbeddingVar::SetDefaultValueOfNewFeatures( value_address[i] = *((V**)((char*)(value_ptr->GetPtr()) + sizeof(FixedLengthHeader))) + storage_->GetOffset(emb_config_.emb_index); - default_value_address[i] = get_default_v_fn( - default_values, keys[*it], *it, GetDefaultValueDim(), ValueLen()); + default_value_address[i] = + 
default_value_ + + (keys[i] % emb_config_.default_value_dim) % value_len_; } DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * 2 * sizeof(V*)); compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, total * 2 * sizeof(V*)); int block_dim = 128; TF_CHECK_OK(GpuLaunchKernel( - CopyEmbedding, (total * value_len_ + block_dim - 1) / block_dim, + embedding::CopyEmbedding, + (total * value_len_ + block_dim - 1) / block_dim, block_dim, 0, gpu_device.stream(), dev_default_value_address, dev_value_address, value_len_, total)); SyncWithEventMgr(compute_stream, event_mgr); @@ -95,9 +95,8 @@ void EmbeddingVar::SetDefaultValueOfNewFeatures( #define REGISTER_KERNELS(ktype, vtype) \ template void EmbeddingVar::SetDefaultValueOfNewFeatures( \ - const ktype*, int64, const std::list&, vtype**, vtype*, \ - std::function, se::Stream*, \ - EventMgr*, const Eigen::GpuDevice& gpu_device); + const ktype*, int64, const std::list&, vtype**, \ + se::Stream*, EventMgr*, const Eigen::GpuDevice& gpu_device); #define REGISTER_KERNELS_ALL(type) \ REGISTER_KERNELS(int32, type); \ REGISTER_KERNELS(int64, type) @@ -110,7 +109,7 @@ TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) template void EmbeddingVar::CopyEmbeddingsToBuffer( - V* val_base, int64 size, int64 slice_elems, V** memcpy_address, + V* val_base, int64 size, V** memcpy_address, se::Stream* compute_stream, EventMgr* event_mgr, const Eigen::GpuDevice& gpu_device) { int block_dim = 128; @@ -121,13 +120,14 @@ void EmbeddingVar::CopyEmbeddingsToBuffer( int limit = size; int length = ValueLen(); TF_CHECK_OK(GpuLaunchKernel( - BatchCopy, (limit + block_dim - 1) / block_dim * length, block_dim, 0, + embedding::BatchCopy, + (limit + block_dim - 1) / block_dim * length, block_dim, 0, gpu_device.stream(), dev_value_address, val_base, length, limit)); SyncWithEventMgr(compute_stream, event_mgr); } #define REGISTER_KERNELS(ktype, vtype) \ template void EmbeddingVar::CopyEmbeddingsToBuffer( \ - vtype*, int64, int64, vtype**, se::Stream*, EventMgr*, \ + vtype*, int64, vtype**, se::Stream*, EventMgr*, \ const Eigen::GpuDevice& gpu_device); #define REGISTER_KERNELS_ALL(type) \ REGISTER_KERNELS(int32, type); \ @@ -178,7 +178,7 @@ void EmbeddingVar::CopyEmbeddingsFromCPUToGPU( int block_dim = 128; TF_CHECK_OK(GpuLaunchKernel( - BatchUnpack, (total + block_dim - 1) / block_dim * value_len, + embedding::BatchUnpack, (total + block_dim - 1) / block_dim * value_len, block_dim, 0, gpu_device.stream(), dev_value_address, memcpy_buffer_gpu, value_len, total)); diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index e3c0c2a0b24..201afd8bf5a 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -162,6 +162,14 @@ class EmbeddingVar : public ResourceBase { return storage_->Get(key, value_ptr); } + void BatchLookupKey(const EmbeddingVarContext& ctx, + const K* keys, + ValuePtr** value_ptr_list, + int64 num_of_keys) { + storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys, + emb_config_.total_num(storage_->GetAllocLen())); + } + Status LookupOrCreateKey(K key, ValuePtr** value_ptr, bool* is_filter, bool indices_as_pointer, int64 count = 1) { @@ -232,7 +240,7 @@ class EmbeddingVar : public ResourceBase { Status Lookup(K key, V* val, V* default_v) { const V* default_value_ptr = (default_v == nullptr) ? 
default_value_ : default_v; - return filter_->Lookup(this, key, val, default_value_ptr, + return filter_->Lookup(key, val, default_value_ptr, default_value_no_permission_); } @@ -244,7 +252,7 @@ class EmbeddingVar : public ResourceBase { V* default_v = default_value_ + (keys[i] % emb_config_.default_value_dim) * value_len_; - filter_->Lookup(this, keys[i], + filter_->Lookup(keys[i], output + i * value_len_, default_v, default_value_no_permission_); } @@ -275,6 +283,65 @@ class EmbeddingVar : public ResourceBase { worker_threads->workers, num_of_keys, value_len_ * sizeof(V), do_work); } +#if GOOGLE_CUDA + void GetEmbeddings(const EmbeddingVarContext& context, + const K* keys, + V* output, + int64 num_of_keys) { + filter_->BatchLookup(context, keys, output, + num_of_keys, default_value_, + default_value_no_permission_); + } + + void BatchLookupOrCreateEmb( + const EmbeddingVarContext& ctx, + V** var_ptr, + ValuePtr** value_ptrs, + const K* indices, + int64 num_of_keys, + IntraThreadCopyIdAllocator* thread_copy_id_alloc) { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> init_cursor_list( + num_worker_threads + 1); + uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); + + auto do_work_get_ptrs = [this, value_ptrs, &init_cursor_list, + &thread_copy_id_alloc, main_thread_id, var_ptr] (int64 start, int64 limit) { + int copy_id = + thread_copy_id_alloc->GetCopyIdOfThread(main_thread_id); + for (int i = start; i < limit; i++) { + bool is_need_set_default_value = false; + var_ptr[i] = LookupOrCreateEmb( + value_ptrs[i], is_need_set_default_value); + if (is_need_set_default_value) { + init_cursor_list[copy_id].emplace_back(i); + } + } + }; + const int64 unit_cost = 1000; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, + num_of_keys, unit_cost, do_work_get_ptrs); + + // Merge copies of init_cursor_list + for (int i = 1; i < (worker_threads->num_threads + 1); i++) { + if (init_cursor_list[i].size() > 0) { + init_cursor_list[0].splice(init_cursor_list[0].end(), + init_cursor_list[i]); + } + } + + auto stream = ctx.compute_stream; + auto event_mgr = ctx.event_mgr; + + SetDefaultValueOfNewFeatures( + indices, num_of_keys, + init_cursor_list[0], + var_ptr, stream, event_mgr, + ctx.gpu_device); + } +#endif void LookupOrCreate(K key, V* val, V* default_v, int count = 1) { const V* default_value_ptr = @@ -325,7 +392,7 @@ class EmbeddingVar : public ResourceBase { #if GOOGLE_CUDA void CopyEmbeddingsToBuffer( V* val_base, int64 size, - int64 slice_elems, V** memcpy_address, + V** memcpy_address, se::Stream* compute_stream, EventMgr* event_mgr, const Eigen::GpuDevice& gpu_device); @@ -333,8 +400,7 @@ class EmbeddingVar : public ResourceBase { void SetDefaultValueOfNewFeatures( const K* keys, int64 size, const std::list& init_cursor, - V** memcpy_address, V* default_values, - std::function get_default_v_fn, + V** memcpy_address, se::Stream* compute_stream, EventMgr* event_mgr, const Eigen::GpuDevice& gpu_device); @@ -745,7 +811,7 @@ class EmbeddingVar : public ResourceBase { V* default_v = default_value_ + (keys[i] % emb_config_.default_value_dim) * value_len_; - filter_->Lookup(this, keys[i], + filter_->Lookup(keys[i], output + i * value_len_, default_v, default_value_no_permission_); } diff --git a/tensorflow/core/framework/embedding/embedding_var_context.h b/tensorflow/core/framework/embedding/embedding_var_context.h index b0dd89a2851..fea85132e29 100644 --- 
a/tensorflow/core/framework/embedding/embedding_var_context.h +++ b/tensorflow/core/framework/embedding/embedding_var_context.h @@ -48,11 +48,13 @@ struct EmbeddingVarContext { : worker_threads(op_ctx->device()->tensorflow_cpu_worker_threads()), compute_stream(op_ctx->op_device_context()->stream()), event_mgr(op_ctx->device()->tensorflow_gpu_device_info()->event_mgr), + gpu_allocator(op_ctx->device()->GetAllocator(AllocatorAttributes())), gpu_device(op_ctx->eigen_gpu_device()) {} const DeviceBase::CpuWorkerThreads* worker_threads = nullptr; se::Stream* compute_stream = nullptr; EventMgr* event_mgr = nullptr; + Allocator* gpu_allocator= nullptr; const GPUDevice& gpu_device; }; #endif // GOOGLE_CUDA diff --git a/tensorflow/core/framework/embedding/filter_policy.h b/tensorflow/core/framework/embedding/filter_policy.h index 565201a844f..53c1b69f608 100644 --- a/tensorflow/core/framework/embedding/filter_policy.h +++ b/tensorflow/core/framework/embedding/filter_policy.h @@ -41,9 +41,17 @@ class FilterPolicy { const V* default_value_ptr, ValuePtr** value_ptr, int count, const V* default_value_no_permission) = 0; - virtual Status Lookup(EV* ev, K key, V* val, const V* default_value_ptr, + virtual Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) = 0; +#if GOOGLE_CUDA + virtual void BatchLookup(const EmbeddingVarContext& context, + const K* keys, V* output, + int64 num_of_keys, + V* default_value_ptr, + V* default_value_no_permission) = 0; +#endif //GOOGLE_CUDA + virtual Status LookupOrCreateKey(K key, ValuePtr** val, bool* is_filter, int64 count) = 0; diff --git a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h index 9c7535f670a..c4eadbd2614 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h @@ -28,7 +28,8 @@ class HbmDramSsdStorage : public MultiTierStorage { MultiTierStorage(sc, name), dram_capacity_(-1) { hbm_ = new HbmStorageWithCpuKv(sc, gpu_alloc_, lc); - dram_ = new DramStorage(sc, cpu_alloc_, lc, new LocklessHashMapCPU(gpu_alloc_)); + dram_ = new DramStorage(sc, cpu_alloc_, lc, + new LocklessHashMapCPU(gpu_alloc_)); ssd_ = new SsdHashStorage(sc, cpu_alloc_, lc); } @@ -65,15 +66,41 @@ class HbmDramSsdStorage : public MultiTierStorage { Status Get(K key, ValuePtr** value_ptr) override { Status s = hbm_->Get(key, value_ptr); - if (!s.ok()) { - s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; } - if (!s.ok()) { - s = ssd_->Get(key, value_ptr); + s = dram_->Get(key, value_ptr); + if (s.ok()) { + AddCopyBackFlagToValuePtr(value_ptr, COPYBACK); + return s; + } + s = ssd_->Get(key, value_ptr); + if (s.ok()) { + AddCopyBackFlagToValuePtr(value_ptr, COPYBACK_AND_DESTROY); + return s; } return s; } + void BatchGet(const EmbeddingVarContext& ctx, + const K* keys, + ValuePtr** value_ptr_list, + int64 num_of_keys, + int64 value_len) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> + copyback_cursor_list(num_worker_threads + 1); + std::vector*>> + ssd_value_ptr_list(num_worker_threads + 1); + + BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, + copyback_cursor_list, ssd_value_ptr_list); + + CopyEmbeddingsFromDramToHbm( + ctx, keys, value_ptr_list, copyback_cursor_list[0], + ssd_value_ptr_list[0], value_len); + } + void Insert(K key, ValuePtr* value_ptr) override { hbm_->Insert(key, value_ptr); } @@ -98,34 +125,9 @@ class HbmDramSsdStorage : public 
MultiTierStorage { { mutex_lock l(memory_pool_mu_); gpu_value_ptr->SetPtr(embedding_mem_pool_->Allocate()); - } - // Not found in HBM, Lookup in DRAM - s = dram_->Get(key, value_ptr); - if (s.ok()) { - // copy dram value to hbm - CopyToGpuValuePtr(gpu_value_ptr, *value_ptr, size); - *value_ptr = gpu_value_ptr; - s = hbm_->TryInsert(key, *value_ptr); - if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate((*value_ptr)->GetValue(0, 0)); - } - delete *value_ptr; - return hbm_->Get(key, value_ptr); - } else { - return s; - } - } - // Not found in DRAM, Lookup in SSD - s = ssd_->Get(key, value_ptr); - if (s.ok()) { - CopyToGpuValuePtr(gpu_value_ptr, *value_ptr, size); - ssd_->DestroyValuePtr(*value_ptr); - *value_ptr = gpu_value_ptr; - } else { *value_ptr = gpu_value_ptr; } + s = hbm_->TryInsert(key, *value_ptr); // Insert Failed if (!s.ok()) { @@ -516,6 +518,121 @@ class HbmDramSsdStorage : public MultiTierStorage { cpu_ptr->GetPtr(), sizeof(FixedLengthHeader)); } + private: + void BatchGetValuePtrs( + const EmbeddingVarContext& ctx, + const K* keys, + ValuePtr** value_ptr_list, + int64 num_of_keys, + std::vector>& copyback_cursor_list, + std::vector*>>& ssd_value_ptr_list) { + int num_worker_threads = ctx.worker_threads->num_threads; + IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); + uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); + auto do_work = [this, keys, value_ptr_list, &thread_copy_id_alloc, + main_thread_id, ©back_cursor_list, + &ssd_value_ptr_list] + (int64 start, int64 limit) { + int copy_id = + thread_copy_id_alloc.GetCopyIdOfThread(main_thread_id); + for (int64 i = start; i < limit; i++) { + Status s = Get(keys[i], &value_ptr_list[i]); + if (s.ok()) { + int64 copyback_flag = + (int64)value_ptr_list[i] >> copyback_flag_offset_bits_; + RemoveCopyBackFlagInValuePtr(&value_ptr_list[i]); + if (copyback_flag == COPYBACK) { + copyback_cursor_list[copy_id].emplace_back(i); + } else if (copyback_flag == COPYBACK_AND_DESTROY) { + copyback_cursor_list[copy_id].emplace_back(i); + ssd_value_ptr_list[copy_id].emplace_back(value_ptr_list[i]); + } + } else { + value_ptr_list[i] = nullptr; + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, + 1000, do_work); + + for (int i = 1; i < worker_threads->num_threads + 1; i++) { + if (copyback_cursor_list[i].size()>0) { + copyback_cursor_list[0].splice(copyback_cursor_list[0].end(), + copyback_cursor_list[i]); + } + if (ssd_value_ptr_list[i].size()>0) { + ssd_value_ptr_list[0].splice(ssd_value_ptr_list[0].end(), + ssd_value_ptr_list[i]); + } + } + } + + void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, + const K* keys, + ValuePtr** value_ptr_list, + std::list& copyback_cursors, + std::list*>& ssd_value_ptrs, + int64 value_len) { + int64 total = copyback_cursors.size(); + std::vector*> gpu_value_ptrs(total); + std::vector copyback_keys(total); + std::vector memory_index(total); + //Create Hbm ValuePtrs. 
+ { + int64 i = 0; + auto it = copyback_cursors.cbegin(); + //Mutex with eviction thread + mutex_lock l(memory_pool_mu_); + for ( ; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); + V* val_ptr = embedding_mem_pool_->Allocate(); + bool flag = gpu_value_ptr->SetPtr(val_ptr); + if (!flag) { + embedding_mem_pool_->Deallocate(val_ptr); + } + memcpy((char *)gpu_value_ptr->GetPtr(), + (char *)value_ptr_list[j]->GetPtr(), + sizeof(FixedLengthHeader)); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; + } + } + MultiTierStorage::CopyEmbeddingsFromDramToHbm( + ctx, keys, value_ptr_list, copyback_cursors, + memory_index, gpu_value_ptrs, value_len); + + //Insert copyback ids to hbm hash table. + auto do_insert = [this, copyback_keys, gpu_value_ptrs] + (int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) + hbm_->Insert(copyback_keys[i], gpu_value_ptrs[i]); + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, + total, 100000, do_insert); + + for (auto it = ssd_value_ptrs.cbegin(); + it != ssd_value_ptrs.cend(); ++it) { + ssd_->DestroyValuePtr(*it); + } + } + + void AddCopyBackFlagToValuePtr( + ValuePtr** value_ptr, CopyBackFlag copyback_flag) { + int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; + tmp = ((int64)*value_ptr) | tmp; + *value_ptr = reinterpret_cast*>(tmp); + } + + void RemoveCopyBackFlagInValuePtr(ValuePtr** value_ptr) { + int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; + tmp = ((int64)*value_ptr) & tmp; + *value_ptr = reinterpret_cast*>(tmp); + } private: HbmStorageWithCpuKv* hbm_ = nullptr; @@ -528,6 +645,7 @@ class HbmDramSsdStorage : public MultiTierStorage { int64 dram_capacity_; std::deque*> dram_value_ptr_out_of_date_; mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ + const int copyback_flag_offset_bits_ = 60; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/hbm_dram_storage.h b/tensorflow/core/framework/embedding/hbm_dram_storage.h index fe6ca2adcb4..2921f873908 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_storage.h @@ -21,6 +21,7 @@ limitations under the License. 
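A note on the `AddCopyBackFlagToValuePtr`/`RemoveCopyBackFlagInValuePtr` pair above, which recurs in hbm_dram_storage.h below: the copy-back state is smuggled into the top bits of the `ValuePtr*` itself, since on typical 64-bit platforms user-space addresses leave bits 60 and up clear. A stand-alone sketch of the tagging trick, with illustrative names:

```cpp
#include <cassert>
#include <cstdint>

enum CopyBackFlag { NOT_COPYBACK = 0, COPYBACK = 1, COPYBACK_AND_DESTROY = 2 };
constexpr int kFlagOffsetBits = 60;

template <typename T>
T* Tag(T* p, CopyBackFlag f) {
  return reinterpret_cast<T*>(reinterpret_cast<int64_t>(p) |
                              (static_cast<int64_t>(f) << kFlagOffsetBits));
}

template <typename T>
CopyBackFlag FlagOf(T* p) {
  return static_cast<CopyBackFlag>(reinterpret_cast<int64_t>(p) >>
                                   kFlagOffsetBits);
}

template <typename T>
T* Untag(T* p) {
  constexpr int64_t kMask = (1LL << kFlagOffsetBits) - 1;
  return reinterpret_cast<T*>(reinterpret_cast<int64_t>(p) & kMask);
}

int main() {
  int v = 42;
  int* tagged = Tag(&v, COPYBACK);
  assert(FlagOf(tagged) == COPYBACK);  // flag recovered from the high bits
  assert(Untag(tagged) == &v);         // mask restores the usable pointer
  return 0;
}
```

The tagged pointer must always be untagged before dereferencing, which is why `BatchGetValuePtrs` strips the flag immediately after reading it.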
#include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/framework/embedding/hbm_storage_iterator.h" +#include "tensorflow/core/framework/embedding/intra_thread_copy_id_allocator.h" #include "tensorflow/core/platform/stream_executor.h" namespace tensorflow { @@ -41,7 +42,8 @@ class HbmDramStorage : public MultiTierStorage { : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name) { hbm_ = new HbmStorageWithCpuKv(sc, gpu_alloc, lc); - dram_ = new DramStorage(sc, cpu_alloc, lc, new LocklessHashMapCPU(gpu_alloc)); + dram_ = new DramStorage(sc, cpu_alloc, lc, + new LocklessHashMapCPU(gpu_alloc)); } ~HbmDramStorage() override { @@ -54,12 +56,34 @@ class HbmDramStorage : public MultiTierStorage { Status Get(K key, ValuePtr** value_ptr) override { Status s = hbm_->Get(key, value_ptr); - if (!s.ok()) { - s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = dram_->Get(key, value_ptr); + if (s.ok()) { + AddCopyBackFlagToValuePtr(value_ptr, COPYBACK); + return s; } return s; } + void BatchGet(const EmbeddingVarContext& ctx, + const K* keys, + ValuePtr** value_ptr_list, + int64 num_of_keys, + int64 value_len) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> + copyback_cursor_list(num_worker_threads + 1); + + BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, + copyback_cursor_list); + + CopyEmbeddingsFromDramToHbm( + ctx, keys, value_ptr_list, copyback_cursor_list[0], + value_len); + } + void Insert(K key, ValuePtr* value_ptr) override { hbm_->Insert(key, value_ptr); } @@ -84,18 +108,7 @@ class HbmDramStorage : public MultiTierStorage { { mutex_lock l(memory_pool_mu_); gpu_value_ptr->SetPtr(embedding_mem_pool_->Allocate()); - } - s = dram_->Get(key, value_ptr); - if (s.ok()) { - // copy dram value to hbm - V* cpu_data_address = (*value_ptr)->GetValue(0, 0); - V* gpu_data_address = gpu_value_ptr->GetValue(0, 0); - cudaMemcpy(gpu_data_address, cpu_data_address, - size * sizeof(V), cudaMemcpyHostToDevice); *value_ptr = gpu_value_ptr; - memcpy(gpu_value_ptr->GetPtr(), - (*value_ptr)->GetPtr(), - sizeof(FixedLengthHeader)); } s = hbm_->TryInsert(key, *value_ptr); @@ -435,6 +448,105 @@ class HbmDramStorage : public MultiTierStorage { void SetTotalDims(int64 total_dims) override { dram_->SetTotalDims(total_dims); } + private: + void BatchGetValuePtrs(const EmbeddingVarContext& ctx, + const K* keys, + ValuePtr** value_ptr_list, + int64 num_of_keys, + std::vector>& copyback_cursor_list) { + int num_worker_threads = ctx.worker_threads->num_threads; + IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); + uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); + auto do_work = [this, keys, value_ptr_list, &thread_copy_id_alloc, + main_thread_id, ©back_cursor_list] + (int64 start, int64 limit) { + int copy_id = + thread_copy_id_alloc.GetCopyIdOfThread(main_thread_id); + for (int64 i = start; i < limit; i++) { + Status s = Get(keys[i], &value_ptr_list[i]); + if (s.ok()) { + int64 copyback_flag = + (int64)value_ptr_list[i] >> copyback_flag_offset_bits_; + RemoveCopyBackFlagInValuePtr(&value_ptr_list[i]); + if (copyback_flag == CopyBackFlag::COPYBACK) { + copyback_cursor_list[copy_id].emplace_back(i); + } + } else { + value_ptr_list[i] = nullptr; + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, + 1000, do_work); + + for (int i = 1; i < 
worker_threads->num_threads + 1; i++) { + if (copyback_cursor_list[i].size()>0) { + copyback_cursor_list[0].splice(copyback_cursor_list[0].end(), + copyback_cursor_list[i]); + } + } + } + + void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, + const K* keys, + ValuePtr** value_ptr_list, + std::list& copyback_cursors, + int64 value_len) { + int64 total = copyback_cursors.size(); + std::vector*> gpu_value_ptrs(total); + std::vector copyback_keys(total); + std::vector memory_index(total); + //Create Hbm ValuePtrs. + { + int64 i = 0; + auto it = copyback_cursors.cbegin(); + //Mutex with eviction thread + mutex_lock l(memory_pool_mu_); + for ( ; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); + V* val_ptr = embedding_mem_pool_->Allocate(); + bool flag = gpu_value_ptr->SetPtr(val_ptr); + if (!flag) { + embedding_mem_pool_->Deallocate(val_ptr); + } + memcpy((char *)gpu_value_ptr->GetPtr(), + (char *)value_ptr_list[j]->GetPtr(), + sizeof(FixedLengthHeader)); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; + } + } + MultiTierStorage::CopyEmbeddingsFromDramToHbm( + ctx, keys, value_ptr_list, copyback_cursors, + memory_index, gpu_value_ptrs, value_len); + + //Insert copyback ids to hbm hash table. + auto do_insert = [this, copyback_keys, gpu_value_ptrs] + (int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) + hbm_->Insert(copyback_keys[i], gpu_value_ptrs[i]); + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, + total, 100000, do_insert); + } + + void AddCopyBackFlagToValuePtr( + ValuePtr** value_ptr, CopyBackFlag copyback_flag) { + int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; + tmp = ((int64)*value_ptr) | tmp; + *value_ptr = reinterpret_cast*>(tmp); + } + + void RemoveCopyBackFlagInValuePtr(ValuePtr** value_ptr) { + int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; + tmp = ((int64)*value_ptr) & tmp; + *value_ptr = reinterpret_cast*>(tmp); + } private: HbmStorageWithCpuKv* hbm_ = nullptr; @@ -442,6 +554,7 @@ class HbmDramStorage : public MultiTierStorage { EmbeddingMemoryPool* embedding_mem_pool_ = nullptr; Allocator* gpu_alloc_; mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ + const int copyback_flag_offset_bits_ = 60; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/intra_thread_copy_id_allocator.h b/tensorflow/core/framework/embedding/intra_thread_copy_id_allocator.h new file mode 100644 index 00000000000..9da534328f9 --- /dev/null +++ b/tensorflow/core/framework/embedding/intra_thread_copy_id_allocator.h @@ -0,0 +1,71 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_INTRA_THREAD_COPY_ID_ALLOCATOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_INTRA_THREAD_COPY_ID_ALLOCATOR_H_ + +#include "tensorflow/core/lib/core/spin_rw_lock.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/types.h" +#include +#include +#include +namespace tensorflow{ + +// Allocate a copy id for each thread +class IntraThreadCopyIdAllocator { + public: + IntraThreadCopyIdAllocator(int num_threads): num_worker_threads_(num_threads) { + is_occupy_flag_.reset(new bool[num_worker_threads_]); + memset(is_occupy_flag_.get(), 0, sizeof(bool) * num_worker_threads_); + } + + int64 GetCopyIdOfThread(uint64 main_thread_id) { + uint64 thread_id = Env::Default()->GetCurrentThreadId(); + if (thread_id == main_thread_id) { + return num_worker_threads_; + } else { + int copy_id = -1; + { + spin_rd_lock l(mu_); + auto iter = hash_map_.find(thread_id); + if (iter != hash_map_.end()) { + copy_id = iter->second; + return copy_id; + } + } + if (copy_id == -1) { + // bind a new thread to a local cursor_list + copy_id = thread_id % num_worker_threads_; + while (!__sync_bool_compare_and_swap( + &(is_occupy_flag_[copy_id]), false, true)) { + copy_id = (copy_id + 1) % num_worker_threads_; + } + { + spin_wr_lock l(mu_); + hash_map_.insert(std::pair(thread_id, copy_id)); + } + return copy_id; + } + } + } + + private: + int num_worker_threads_; + std::unique_ptr is_occupy_flag_; + std::map hash_map_; + mutable easy_spinrwlock_t mu_ = EASY_SPINRWLOCK_INITIALIZER; +}; +} //namespace tensorflow +#endif //TENSORFLOW_CORE_FRAMEWORK_INTRA_THREAD_COPY_ID_ALLOCATOR_H_ diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc b/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc new file mode 100644 index 00000000000..de275183d22 --- /dev/null +++ b/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc @@ -0,0 +1,137 @@ +/* Copyright 2019 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
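`IntraThreadCopyIdAllocator` above gives every worker thread a stable private slot, so sharded loops can collect results without locking and merge them once at the end. A usage sketch mirroring the splice pattern in the storage code (illustrative; `std::thread` stands in for TensorFlow's `Shard`, and the snippet assumes it is built inside the DeepRec tree):

```cpp
#include <cstdint>
#include <list>
#include <thread>
#include <vector>
#include "tensorflow/core/framework/embedding/intra_thread_copy_id_allocator.h"
#include "tensorflow/core/platform/env.h"

int main() {
  const int kNumWorkers = 4;
  tensorflow::IntraThreadCopyIdAllocator alloc(kNumWorkers);
  uint64_t main_tid = tensorflow::Env::Default()->GetCurrentThreadId();
  std::vector<std::list<int64_t>> per_copy(kNumWorkers + 1);

  std::vector<std::thread> workers;
  for (int w = 0; w < kNumWorkers; ++w) {
    workers.emplace_back([&, w] {
      // Stable per-thread slot: no lock needed when appending to per_copy[id].
      int id = alloc.GetCopyIdOfThread(main_tid);
      for (int64_t i = w; i < 100; i += kNumWorkers) per_copy[id].push_back(i);
    });
  }
  for (auto& t : workers) t.join();

  // Merge every thread's list into slot 0, as the BatchGet paths above do.
  for (int i = 1; i <= kNumWorkers; ++i)
    per_copy[0].splice(per_copy[0].end(), per_copy[i]);
  return per_copy[0].size() == 100 ? 0 : 1;
}
```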
+==============================================================================*/
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#include "tensorflow/core/framework/embedding/multi_tier_storage.h"
+#include "tensorflow/core/framework/embedding/value_ptr.h"
+#include "tensorflow/core/framework/embedding/batch.h"
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/gpu_kernel_helper.h"
+
+namespace tensorflow {
+using se::DeviceMemoryBase;
+using se::Stream;
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+void SyncWithEventMgr(se::Stream* stream,
+                      EventMgr* event_mgr);
+
+namespace embedding{
+template
+void MultiTierStorage::CopyEmbeddingsFromDramToHbm(
+    const EmbeddingVarContext& ctx,
+    const K* keys,
+    ValuePtr** value_ptr_list,
+    std::list& copyback_cursor,
+    const std::vector& memory_index,
+    const std::vector*>& gpu_value_ptrs,
+    int value_len) {
+  if (copyback_cursor.size() > 0) {
+    int total = copyback_cursor.size();
+    // Allocate memcpy buffers on CPU and GPU.
+    Allocator* gpu_alloc = ctx.gpu_allocator;
+    V* memcpy_buffer_gpu = (V*)gpu_alloc->AllocateRaw(
+        Allocator::kAllocatorAlignment,
+        total * value_len * sizeof(V));
+    V* memcpy_buffer_cpu = (V*)cpu_allocator()->AllocateRaw(
+        Allocator::kAllocatorAlignment,
+        total * value_len * sizeof(V));
+
+    // Copy embeddings on CPU to the buffer on CPU.
+    auto do_work = [memory_index,
+                    memcpy_buffer_cpu, value_ptr_list,
+                    gpu_value_ptrs,
+                    value_len, this] (int64 start, int64 limit) {
+      for (int i = start; i < limit; i++) {
+        int j = memory_index[i];
+        memcpy(memcpy_buffer_cpu + i * value_len,
+               value_ptr_list[j]->GetValue(0, 0), value_len * sizeof(V));
+        value_ptr_list[j] = gpu_value_ptrs[i];
+      }
+    };
+    auto worker_threads = ctx.worker_threads;
+    Shard(worker_threads->num_threads,
+          worker_threads->workers, total,
+          1000, do_work);
+
+    // Copy embeddings from the CPU buffer to the GPU buffer.
+    auto compute_stream = ctx.compute_stream;
+    auto event_mgr = ctx.event_mgr;
+    DeviceMemoryBase gpu_buffer_dst_ptr(
+        memcpy_buffer_gpu, total * value_len * sizeof(V));
+    compute_stream->ThenMemcpy(
+        &gpu_buffer_dst_ptr, memcpy_buffer_cpu, total * value_len * sizeof(V));
+    SyncWithEventMgr(compute_stream, event_mgr);
+
+    // Copy the GPU addresses of the embeddings to the GPU.
+    V** value_address = (V**)cpu_allocator()->AllocateRaw(
+        Allocator::kAllocatorAlignment, sizeof(V*) * total);
+    V** dev_value_address = (V**)gpu_alloc->AllocateRaw(
+        Allocator::kAllocatorAlignment, sizeof(V*) * total);
+    int64 i = 0;
+    auto it = copyback_cursor.cbegin();
+    for (; it != copyback_cursor.cend(); ++it, ++i) {
+      // Get the cursor
+      int64 cursor = *it;
+      gpu_value_ptrs[i]->SetInitialized(0);
+      value_address[i] = gpu_value_ptrs[i]->GetValue(0, 0);
+    }
+    DeviceMemoryBase gpu_addr_dst_ptr(dev_value_address, total * sizeof(V*));
+    compute_stream->ThenMemcpy(&gpu_addr_dst_ptr, value_address, total * sizeof(V*));
+
+    // Copy each embedding to its corresponding address.
+    int block_dim = 128;
+    TF_CHECK_OK(GpuLaunchKernel(
+        BatchUnpack, (total + block_dim - 1) / block_dim * value_len,
+        block_dim, 0, ctx.gpu_device.stream(),
+        dev_value_address, memcpy_buffer_gpu,
+        value_len, total));
+    SyncWithEventMgr(compute_stream, event_mgr);
+
+    gpu_alloc->DeallocateRaw(dev_value_address);
+    gpu_alloc->DeallocateRaw(memcpy_buffer_gpu);
+    cpu_allocator()->DeallocateRaw(value_address);
+    cpu_allocator()->DeallocateRaw(memcpy_buffer_cpu);
+  }
+}
+#define REGISTER_KERNELS(ktype, vtype) \
+    template void MultiTierStorage::CopyEmbeddingsFromDramToHbm( \
+        const EmbeddingVarContext&, const ktype*, ValuePtr**,\
+        std::list&, const std::vector&,\
+        const std::vector*>&, int);
+#define REGISTER_KERNELS_ALL(type) \
+  REGISTER_KERNELS(int32, type); \
+  REGISTER_KERNELS(int64, type)
+#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type)
+TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU)
+#undef REGISTER_KERNELS_CPU
+
+#undef REGISTER_KERNELS_ALL
+#undef REGISTER_KERNELS
+} // namespace embedding
+} // namespace tensorflow
+
+#endif //GOOGLE_CUDA
diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h
index 529d7ae4549..e948eb7be6b 100644
--- a/tensorflow/core/framework/embedding/multi_tier_storage.h
+++ b/tensorflow/core/framework/embedding/multi_tier_storage.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/embedding/cache_thread_pool_creator.h"
 #include "tensorflow/core/framework/embedding/config.pb.h"
 #include "tensorflow/core/framework/embedding/cpu_hash_map_kv.h"
+#include "tensorflow/core/framework/embedding/embedding_var_context.h"
 #include "tensorflow/core/framework/embedding/eviction_manager.h"
 #include "tensorflow/core/framework/embedding/globalstep_shrink_policy.h"
 #include "tensorflow/core/framework/embedding/kv_interface.h"
@@ -285,6 +286,16 @@ class MultiTierStorage : public Storage {
   void KeepInvalidValuePtr(ValuePtr* value_ptr) {
     value_ptr_out_of_date_.emplace_back(value_ptr);
   }
+
+#if GOOGLE_CUDA
+  void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& context,
+                                   const K* keys,
+                                   ValuePtr** value_ptr_list,
+                                   std::list& copyback_cursors,
+                                   const std::vector& memory_index,
+                                   const std::vector*>& gpu_value_ptrs,
+                                   int value_len);
+#endif //GOOGLE_CUDA
 
  private:
   virtual Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) {}
diff --git a/tensorflow/core/framework/embedding/nullable_filter_policy.h b/tensorflow/core/framework/embedding/nullable_filter_policy.h
index e0e734eed81..0f3ae6a0050 100644
--- a/tensorflow/core/framework/embedding/nullable_filter_policy.h
+++ b/tensorflow/core/framework/embedding/nullable_filter_policy.h
@@ -34,20 +34,54 @@ class NullableFilterPolicy : public FilterPolicy {
       : config_(config), ev_(ev), storage_(storage) {
   }
 
-  Status Lookup(EV* ev, K key, V* val, const V* default_value_ptr,
+  Status Lookup(K key, V* val, const V* default_value_ptr,
                 const V* default_value_no_permission) override {
     ValuePtr* value_ptr = nullptr;
-    Status s = ev->LookupKey(key, &value_ptr);
+    Status s = ev_->LookupKey(key, &value_ptr);
     if (s.ok()) {
-      V* mem_val = ev->LookupOrCreateEmb(value_ptr, default_value_ptr);
-      memcpy(val, mem_val, sizeof(V) * ev->ValueLen());
+      V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr);
+      memcpy(val, mem_val, sizeof(V) * ev_->ValueLen());
     } else {
       memcpy(val, default_value_ptr,
-             sizeof(V) * ev->ValueLen());
+             sizeof(V) * ev_->ValueLen());
     }
     return Status::OK();
   }
 
+#if GOOGLE_CUDA
+  void BatchLookup(const EmbeddingVarContext& ctx,
+                   const K* keys, V* output,
+                   int64 num_of_keys,
+                   V*
default_value_ptr, + V* default_value_no_permission) override { + std::vector*> value_ptr_list(num_of_keys, nullptr); + ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); + std::vector embedding_ptr(num_of_keys, nullptr); + auto do_work = [this, keys, value_ptr_list, &embedding_ptr, + default_value_ptr, default_value_no_permission] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + ValuePtr* value_ptr = value_ptr_list[i]; + if (value_ptr != nullptr) { + embedding_ptr[i] = + ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + } else { + embedding_ptr[i] = default_value_ptr; + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, + 1000, do_work); + auto stream = ctx.compute_stream; + auto event_mgr = ctx.event_mgr; + ev_->CopyEmbeddingsToBuffer( + output, num_of_keys, embedding_ptr.data(), + stream, event_mgr, ctx.gpu_device); + } +#endif //GOOGLE_CUDA + void LookupOrCreate(K key, V* val, const V* default_value_ptr, ValuePtr** value_ptr, int count, const V* default_value_no_permission) override { diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index 3915676e57d..2fe84b57088 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -23,8 +23,15 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/framework/embedding/embedding_memory_pool.h" #include "tensorflow/core/util/work_sharder.h" +#include "tensorflow/core/framework/device_base.h" +#if GOOGLE_CUDA +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#endif namespace tensorflow { +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; const int kSavedPartitionNum = 1000; @@ -43,6 +50,8 @@ class FilterPolicy; template class GPUHashTable; +template +struct EmbeddingVarContext; namespace embedding { template @@ -54,6 +63,11 @@ class Storage { TF_DISALLOW_COPY_AND_ASSIGN(Storage); virtual Status Get(K key, ValuePtr** value_ptr) = 0; + virtual void BatchGet(const EmbeddingVarContext& ctx, + const K* key, + ValuePtr** value_ptr_list, + int64 num_of_keys, + int64 value_len) {} virtual Status Contains(K key) = 0; virtual void Insert(K key, ValuePtr** value_ptr, size_t alloc_len) = 0; virtual void InsertToDram(K key, ValuePtr** value_ptr, @@ -185,6 +199,13 @@ class Storage { virtual void UpdateCache(const Tensor& indices) {} + virtual void UpdateCache(const K* indices, + int64 num_indices, + const Tensor& indices_counts) {} + + virtual void UpdateCache(const K* keys, + int64 num_indices) {} + protected: int64 alloc_len_ = 0; int64 total_dims_ = 0; diff --git a/tensorflow/core/framework/embedding/storage_config.h b/tensorflow/core/framework/embedding/storage_config.h index 4ba642cc527..85e44879dcb 100644 --- a/tensorflow/core/framework/embedding/storage_config.h +++ b/tensorflow/core/framework/embedding/storage_config.h @@ -16,8 +16,8 @@ limitations under the License. 
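The BatchGet and UpdateCache hooks added to Storage above are virtual no-ops by default, so single-tier storages need not change; a multi-tier storage is expected to override them. A minimal sketch of such an override (the class name and shard cost are illustrative, not taken from this patch):

```
// Hypothetical override: resolve each key against the CPU-resident map in
// parallel, leaving nullptr for keys that are not yet stored.
template <class K, class V>
void SomeMultiTierStorage<K, V>::BatchGet(
    const EmbeddingVarContext<GPUDevice>& ctx, const K* keys,
    ValuePtr<V>** value_ptr_list, int64 num_of_keys, int64 value_len) {
  auto do_work = [this, keys, value_ptr_list](int64 start, int64 limit) {
    for (int64 i = start; i < limit; i++) {
      ValuePtr<V>* value_ptr = nullptr;
      value_ptr_list[i] =
          this->Get(keys[i], &value_ptr).ok() ? value_ptr : nullptr;
    }
  };
  auto worker_threads = ctx.worker_threads;
  Shard(worker_threads->num_threads, worker_threads->workers,
        num_of_keys, 1000, do_work);
}
```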
#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_CONFIG_H_ #include "tensorflow/core/framework/embedding/cache.h" -#include "tensorflow/core/framework/embedding/config.pb.h" - +#include "tensorflow/core/framework/embedding/embedding_config.h" +#include "tensorflow/core/framework/embedding/value_ptr.h" namespace tensorflow { namespace embedding { struct StorageConfig { diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 2e8814f62fa..14788ede450 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -2890,7 +2890,8 @@ cuda_library( "//tensorflow/core:common_runtime/gpu/gpu_event_mgr.h" ], srcs = [ - "//tensorflow/core:framework/embedding/embedding_var.cu.cc" + "//tensorflow/core:framework/embedding/embedding_var.cu.cc", + "//tensorflow/core:framework/embedding/multi_tier_storage.cu.cc" ], copts = tf_copts() + ["-g"] + if_cuda(["-DGOOGLE_CUDA=1"]) + diff --git a/tensorflow/core/kernels/embedding_variable_ops_test.cc b/tensorflow/core/kernels/embedding_variable_ops_test.cc index efe327a7ec1..0159ccbc25d 100644 --- a/tensorflow/core/kernels/embedding_variable_ops_test.cc +++ b/tensorflow/core/kernels/embedding_variable_ops_test.cc @@ -455,14 +455,25 @@ TEST(EmbeddingVariableTest, TestMultiInsertion) { ASSERT_EQ(variable->Size(), total_size); } -void InsertAndLookup(EmbeddingVar* variable, int64 *keys, long ReadLoops, int value_size){ +void InsertAndLookup(EmbeddingVar* variable, + int64 *keys, long ReadLoops, int value_size){ + float *default_value_fake = (float *)malloc((value_size)*sizeof(float)); + for (int j = 0; j < value_size; j++) { + default_value_fake[j] = -1.0; + } for (long j = 0; j < ReadLoops; j++) { - int64 *val = (int64 *)malloc((value_size+1)*sizeof(int64)); - variable->LookupOrCreate(keys[j], val, &(keys[j])); - variable->LookupOrCreate(keys[j], val, (&keys[j]+1)); - ASSERT_EQ(keys[j] , val[0]); + float *val = (float *)malloc((value_size+1)*sizeof(float)); + float *default_value = (float *)malloc((value_size)*sizeof(float)); + for (int k = 0; k < value_size; k++) { + default_value[k] = (float)keys[j]; + } + variable->LookupOrCreate(keys[j], val, default_value); + variable->LookupOrCreate(keys[j], val, default_value_fake); + ASSERT_EQ(default_value[0] , val[0]); free(val); + free(default_value); } + free(default_value_fake); } void MultiBloomFilter(EmbeddingVar* var, int value_size, int64 i) { @@ -796,11 +807,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt8) { TEST(EmbeddingVariableTest, TestInsertAndLookup) { int64 value_size = 128; - Tensor value(DT_INT64, TensorShape({value_size})); - test::FillValues(&value, std::vector(value_size, 10)); - auto storage = embedding::StorageFactory::Create( + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10)); + auto storage = embedding::StorageFactory::Create( embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", + auto variable = new EmbeddingVar("EmbeddingVar", storage, EmbeddingConfig(), cpu_allocator()); variable->Init(value, 1); @@ -809,15 +820,11 @@ TEST(EmbeddingVariableTest, TestInsertAndLookup) { bool* flag = (bool *)malloc(sizeof(bool)*max); srand((unsigned)time(NULL)); int64 *keys = (int64 *)malloc(sizeof(int64)*InsertLoops); - long *counter = (long *)malloc(sizeof(long)*InsertLoops); for (long i = 0; i < max; i++) { flag[i] = 0; } - for (long i = 0; i < InsertLoops; i++) { - counter[i] = 1; - } int index = 0; while (index < InsertLoops) { long j = rand() 
% max; diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc index 9eb43b6127a..f9b9363e1aa 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc @@ -41,17 +41,6 @@ class GroupEmbeddingVarLookupOp : GroupEmbeddingLookupForwardBaseOp(c) { OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", &is_use_default_value_tensor_)); - - if (is_use_default_value_tensor_) { - get_default_v_fn_ = [](TValue* default_v, TFKey id, int64 index, - int64 total_dim, - int64 len) { return default_v + len * index; }; - } else { - get_default_v_fn_ = [](TValue* default_v, TFKey id, int64 index, - int64 total_dim, int64 len) { - return default_v + len * (id % total_dim); - }; - } bool is_inference; TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference)); if (!is_inference) { @@ -71,11 +60,8 @@ class GroupEmbeddingVarLookupOp is_use_default_value_tensor, n, device); }; } - } - ~GroupEmbeddingVarLookupOp() { delete[] occupy_flag_; } - void Compute(OpKernelContext* ctx) override { const auto& device = ctx->eigen_device(); TValue* default_v = nullptr; @@ -137,106 +123,18 @@ class GroupEmbeddingVarLookupOp ev->GetDefaultValueDim(), true, N, device); } } else { - auto out_flat = - out_tensor.shaped({N, out_tensor.NumElements() / N}); - const int64 slice_elems = out_flat.dimension(1); - const size_t slice_bytes = slice_elems * sizeof(TValue); - TValue** memcpy_address = new TValue*[N]; - TFKey* indices_host = new TFKey[N]; - - auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); - int64 num_threads = worker_threads->num_threads; - if (occupy_flag_ == nullptr) { - mutex_lock l(m_init_occupy_flag_); - // double check - if (occupy_flag_ == nullptr) { - occupy_flag_ = new bool[num_threads]; - memset(occupy_flag_, 0, sizeof(bool) * num_threads); - } - } - std::vector> init_cursor_list(num_threads + 1); - std::vector> copyback_cursor_list(num_threads + 1); - - volatile bool is_cpu_indices_ready = false; - // Copy ids from GPU to CPU for CPU Lookup. + Tensor indices_host( + sp_values_tensor.dtype(), sp_values_tensor.shape()); + //Copy ids from GPU to CPU for CPU Lookup. 
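+      // Staging through a host-side Tensor (rather than the raw
+      // "new TFKey[N]" buffer this replaces) ties the buffer's lifetime
+      // to this op invocation instead of a deferred delete[].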
auto stream = ctx->op_device_context()->stream(); auto event_mgr = ctx->device()->tensorflow_gpu_device_info()->event_mgr; - - se::DeviceMemoryBase gpu_src(const_cast(key_base), - N * sizeof(TFKey)); - stream->ThenMemcpy(indices_host, gpu_src, N * sizeof(TFKey)); + se::DeviceMemoryBase gpu_src(const_cast(key_base), N * sizeof(TFKey)); + stream->ThenMemcpy(indices_host.data(), gpu_src, N * sizeof(TFKey)); SyncWithEventMgr(stream, event_mgr); - - uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); - auto do_work = [this, indices_host, out_base, slice_elems, ctx, ev, - memcpy_address, &init_cursor_list, - ©back_cursor_list, main_thread_id, - num_threads](int64 start, int64 limit) { - uint64 thread_id = Env::Default()->GetCurrentThreadId(); - int64 position; - if (thread_id == main_thread_id) { - position = num_threads; - } else { - position = -1; - { - spin_rd_lock l(mu_); - auto iter = hash_map_.find(thread_id); - if (iter != hash_map_.end()) { - position = iter->second; - } - } - - if (position == -1) { - // bind a new thread to a local cursor_list - position = thread_id % num_threads; - while (!__sync_bool_compare_and_swap(&(occupy_flag_[position]), - false, true)) { - position = (position + 1) % num_threads; - } - { - spin_wr_lock l(mu_); - hash_map_.insert(std::pair(thread_id, position)); - } - } - } - ev->LookupWithFreqBatch(indices_host, memcpy_address, start, limit, - init_cursor_list[position], - copyback_cursor_list[position]); - }; - Shard(num_threads, worker_threads->workers, N, slice_bytes, do_work); - for (int i = 1; i < num_threads + 1; i++) { - if (init_cursor_list[i].size() > 0) { - init_cursor_list[0].splice(init_cursor_list[0].end(), - init_cursor_list[i]); - } - if (copyback_cursor_list[i].size() > 0) { - copyback_cursor_list[0].splice(copyback_cursor_list[0].end(), - copyback_cursor_list[i]); - } - } - // Pointers in memcpy_address here will - // be cast to ValuePtr* in this funcation. 
- ev->AllocateMemoryForNewFeatures(memcpy_address, init_cursor_list[0]); - - ev->SetDefaultValueOfNewFeatures( - indices_host, N, init_cursor_list[0], memcpy_address, default_v, - get_default_v_fn_, stream, event_mgr, ctx->eigen_gpu_device()); - - ev->CopyEmbeddingsFromCPUToGPU(indices_host, copyback_cursor_list[0], - memcpy_address, stream, event_mgr, - ctx->eigen_gpu_device(), worker_threads); - - ev->CopyEmbeddingsToBuffer(out_base, N, slice_elems, memcpy_address, - stream, event_mgr, ctx->eigen_gpu_device()); - delete[] memcpy_address; - - if (ev->IsMultiLevel()) { - ev->storage()->Schedule([ev, indices_host, N]() { - embedding::BatchCache* cache = ev->Cache(); - cache->update(indices_host, N); - delete[] indices_host; - }); - } + EmbeddingVarContext ev_ctx(ctx); + ev->GetEmbeddings(ev_ctx, (TFKey*)indices_host.data(), + out_base, N); + ev->UpdateCache(indices_host); } TensorShape emb_vectors_tensor_shape; @@ -307,15 +205,10 @@ class GroupEmbeddingVarLookupOp } private: - std::map hash_map_; - std::function get_default_v_fn_; std::function* ev, const TFKey* key, TValue* val, TValue* default_v, int32 default_v_num, bool is_use_default_value_tensor, size_t n, const Eigen::GpuDevice& device)> lookup_fn_; - mutable easy_spinrwlock_t mu_ = EASY_SPINRWLOCK_INITIALIZER; - bool* occupy_flag_{nullptr}; - mutex m_init_occupy_flag_; bool is_use_default_value_tensor_; }; diff --git a/tensorflow/core/kernels/kv_variable_lookup_ops.cc b/tensorflow/core/kernels/kv_variable_lookup_ops.cc index 12ca0f66ec9..57af5ed916e 100644 --- a/tensorflow/core/kernels/kv_variable_lookup_ops.cc +++ b/tensorflow/core/kernels/kv_variable_lookup_ops.cc @@ -132,15 +132,6 @@ class KvResourceLookupIDOp : public OpKernel { auto worker_threads = c->device()->tensorflow_cpu_worker_threads(); Shard(worker_threads->num_threads, worker_threads->workers, indices_size, 100, do_work); - - if (ev->IsMultiLevel()) { - ev->storage()->Schedule([ev, indices]() { - embedding::BatchCache* cache = ev->Cache(); - if (cache) { - cache->add_to_prefetch_list(indices); - } - }); - } } } }; @@ -279,7 +270,6 @@ class KvResourceLookupIDGPUOp : public OpKernel { ev->SetDefaultValueOfNewFeatures( indices_flat.data(), indices_size, init_cursor_list[0], memcpy_address, - default_v, get_default_v_fn, stream, event_mgr, c->eigen_gpu_device()); @@ -290,14 +280,6 @@ class KvResourceLookupIDGPUOp : public OpKernel { event_mgr, c->eigen_gpu_device(), worker_threads, out_base); - if (ev->IsMultiLevel()) { - ev->storage()->Schedule([ev, indices]() { - embedding::BatchCache* cache = ev->Cache(); - if (cache) { - cache->add_to_prefetch_list(indices); - } - }); - } delete[] memcpy_address; } } @@ -427,14 +409,6 @@ class KvResourceCollectEmbeddingOp : public OpKernel { Shard(worker_threads->num_threads, worker_threads->workers, indices_size, slice_bytes, do_work); - if (ev->IsMultiLevel()) { - ev->storage()->Schedule([ev, indices]() { - embedding::BatchCache* cache = ev->Cache(); - if (cache) { - cache->add_to_cache(indices); - } - }); - } } } @@ -570,19 +544,11 @@ class KvResourceCollectEmbeddingGPUOp : public OpKernel { auto event_mgr = c->device()->tensorflow_gpu_device_info()->event_mgr; ev->CopyEmbeddingsToBuffer( out_base, indices_size, - slice_elems, memcpy_address, + memcpy_address, stream, event_mgr, c->eigen_gpu_device()); delete[] memcpy_address; - if (ev->IsMultiLevel()) { - ev->storage()->Schedule([ev, indices]() { - embedding::BatchCache* cache = ev->Cache(); - if (cache) { - cache->add_to_cache(indices); - } - }); - } } } @@ -705,33 +671,13 @@ 
TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) #undef REGISTER_KERNELS #if GOOGLE_CUDA -template +template class KvResourceGatherGPUOp : public OpKernel { public: explicit KvResourceGatherGPUOp(OpKernelConstruction* c) : OpKernel(c) { OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", &is_use_default_value_tensor_)); - if (is_use_default_value_tensor_) { - get_default_v_fn_ = [](TValue* default_v, TKey id, int64 index, - int64 total_dim, int64 len) { - return default_v + len * index; - }; - } else { - get_default_v_fn_ = [](TValue* default_v, TKey id, int64 index, - int64 total_dim, int64 len) { - return default_v + len * (id % total_dim) ; - }; - } - if (c->num_inputs() == 4) { - get_count_fn_ = [](const int32* count, int64 index) { - return count[index]; - }; - } else { - get_count_fn_ = [](const int32* count, int64 index) { - return 1; - }; - } bool is_inference; TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference)); if (!is_inference) { @@ -753,10 +699,6 @@ class KvResourceGatherGPUOp : public OpKernel { } } - ~KvResourceGatherGPUOp() { - delete[] occupy_flag_; - } - void Compute(OpKernelContext* c) override { EmbeddingVar* ev = nullptr; OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &ev)); @@ -771,22 +713,6 @@ class KvResourceGatherGPUOp : public OpKernel { Tensor* out = nullptr; OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out)); - int32* counts = nullptr; - if (c->num_inputs() == 4) - counts = (int32*)c->input(3).data(); - - int64 num_threads = c->device() - ->tensorflow_cpu_worker_threads() - ->num_threads; - if (occupy_flag_ == nullptr) { - mutex_lock l(m_init_occupy_flag_); - //double check - if (occupy_flag_ == nullptr) { - occupy_flag_ = new bool[num_threads]; - memset(occupy_flag_, 0, sizeof(bool) * num_threads); - } - } - if (N > 0) { auto out_flat = out->shaped({N, out->NumElements() / N}); TValue* out_base = &out_flat(0, 0); @@ -828,103 +754,23 @@ class KvResourceGatherGPUOp : public OpKernel { indices_size, device); } } else { - TValue** memcpy_address = new TValue*[indices_size]; - TKey* indices_host = new TKey[N]; + Tensor indices_host(indices.dtype(), indices.shape()); //Copy ids from GPU to CPU for CPU Lookup. 
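+          // Same host-staging pattern as the group-embedding lookup above:
+          // the ids are brought back to a host Tensor once, then the whole
+          // multi-tier lookup and copy-back is delegated to
+          // ev->GetEmbeddings().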
auto stream = c->op_device_context()->stream(); auto event_mgr = c->device()->tensorflow_gpu_device_info()->event_mgr; se::DeviceMemoryBase gpu_src( const_cast(&indices_flat(0)), N * sizeof(TKey)); - stream->ThenMemcpy(indices_host, gpu_src, N * sizeof(TKey)); + stream->ThenMemcpy(indices_host.data(), gpu_src, N * sizeof(TKey)); SyncWithEventMgr(stream, event_mgr); - auto worker_threads = c->device()->tensorflow_cpu_worker_threads(); - std::vector> init_cursor_list( - worker_threads->num_threads + 1); - std::vector> copyback_cursor_list( - worker_threads->num_threads + 1); - uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); - auto do_work = [this, indices_host, - out_base, slice_elems, c, ev, - memcpy_address, &init_cursor_list, - ©back_cursor_list, main_thread_id, - num_threads] (int64 start, int64 limit) { - uint64 thread_id = Env::Default()->GetCurrentThreadId(); - int64 position; - if (thread_id == main_thread_id) { - position = num_threads; - } else { - position = -1; - { - spin_rd_lock l(mu_); - auto iter = hash_map_.find(thread_id); - if (iter != hash_map_.end()) { - position = iter->second; - } - } - if (position == -1) { - // bind a new thread to a local cursor_list - position = thread_id % num_threads; - while (!__sync_bool_compare_and_swap(&(occupy_flag_[position]), - false, true)) { - position = (position + 1) % num_threads; - } - { - spin_wr_lock l(mu_); - hash_map_.insert(std::pair(thread_id, position)); - } - } - } - ev->LookupWithFreqBatch(indices_host, memcpy_address, - start, limit, init_cursor_list[position], - copyback_cursor_list[position]); - }; - Shard(worker_threads->num_threads, worker_threads->workers, indices_size, - slice_bytes, do_work); - for (int i = 1; i < worker_threads->num_threads + 1; i++) { - if (init_cursor_list[i].size()>0) { - init_cursor_list[0].splice(init_cursor_list[0].end(), - init_cursor_list[i]); - } - if (copyback_cursor_list[i].size()>0) { - copyback_cursor_list[0].splice(copyback_cursor_list[0].end(), - copyback_cursor_list[i]); - } - } - //Pointers in memcpy_address here will - //be cast to ValuePtr* in this funcation. 
- ev->AllocateMemoryForNewFeatures( - memcpy_address, - init_cursor_list[0]); - - ev->SetDefaultValueOfNewFeatures( - indices_host, indices_size, - init_cursor_list[0], memcpy_address, - default_v, get_default_v_fn_, - stream, event_mgr, - c->eigen_gpu_device()); - - ev->CopyEmbeddingsFromCPUToGPU( - indices_host, - copyback_cursor_list[0], - memcpy_address, - stream, event_mgr, - c->eigen_gpu_device(), - worker_threads); - - ev->CopyEmbeddingsToBuffer( - out_base, indices_size, - slice_elems, memcpy_address, - stream, event_mgr, - c->eigen_gpu_device()); - delete []memcpy_address; - - if (ev->IsMultiLevel()) { - ev->storage()->Schedule([ev, indices_host, N]() { - embedding::BatchCache* cache = ev->Cache(); - cache->update(indices_host, N); - delete []indices_host; - }); + EmbeddingVarContext ev_ctx(c); + ev->GetEmbeddings(ev_ctx, (TKey*)indices_host.data(), + out_base, N); + if (has_counts) { + const Tensor& indices_counts = c->input(2); + ev->UpdateCache(indices_host, indices_counts, true); + } else { + ev->UpdateCache(indices_host, true); } } } @@ -932,17 +778,10 @@ class KvResourceGatherGPUOp : public OpKernel { private: bool is_use_default_value_tensor_; - std::function< - TValue*(TValue*, TKey, int64, int64, int64)> get_default_v_fn_; - std::function get_count_fn_; std::function* ev, const TKey* key, TValue* val, TValue* default_v, int32 default_v_num, bool is_use_default_value_tensor, size_t n, const Eigen::GpuDevice& device)> lookup_fn_; - std::map hash_map_; - mutable easy_spinrwlock_t mu_ = EASY_SPINRWLOCK_INITIALIZER; - bool* occupy_flag_ = nullptr; - mutex m_init_occupy_flag_; }; #define REGISTER_KERNELS(dev, ktype, vtype) \ @@ -950,7 +789,7 @@ class KvResourceGatherGPUOp : public OpKernel { .Device(DEVICE_##dev) \ .TypeConstraint("dtype") \ .TypeConstraint("Tkeys"), \ - KvResourceGatherGPUOp) + KvResourceGatherGPUOp) #define REGISTER_KERNELS_ALL(dev, type) \ REGISTER_KERNELS(dev, int32, type); \ @@ -967,7 +806,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU); .HostMemory("counts") \ .TypeConstraint("dtype") \ .TypeConstraint("Tkeys"), \ - KvResourceGatherGPUOp) + KvResourceGatherGPUOp) #define REGISTER_KERNELS_ALL(dev, type) \ REGISTER_KERNELS(dev, int32, type); \ diff --git a/tensorflow/core/kernels/training_ali_op_helpers.h b/tensorflow/core/kernels/training_ali_op_helpers.h index a71222a83f9..e013a6a2bae 100644 --- a/tensorflow/core/kernels/training_ali_op_helpers.h +++ b/tensorflow/core/kernels/training_ali_op_helpers.h @@ -118,66 +118,32 @@ EmbeddingVariableInputLockHolder MaybeLockEmbeddingVariableInputMutexesInO return EmbeddingVariableInputLockHolder(std::move(vars), std::move(locks)); } -// Allocate a copy id for each thread -class ThreadCopyIdAllocator { - public: - ThreadCopyIdAllocator(int num_threads): num_worker_threads_(num_threads) { - is_occupy_flag_ = new bool[num_worker_threads_]; - memset(is_occupy_flag_, 0, sizeof(bool) * num_worker_threads_); - } - - ~ThreadCopyIdAllocator() { - delete[] is_occupy_flag_; - } - - int64 GetCopyIdOfThread(uint64 main_thread_id) { - uint64 thread_id = Env::Default()->GetCurrentThreadId(); - if (thread_id == main_thread_id) { - return num_worker_threads_; - } else { - int64 copy_id = -1; - { - spin_rd_lock l(mu_); - auto iter = hash_map_.find(thread_id); - if (iter != hash_map_.end()) { - copy_id = iter->second; - return copy_id; - } - } - if (copy_id == -1) { - // bind a new thread to a local cursor_list - copy_id = thread_id % num_worker_threads_; - while (!__sync_bool_compare_and_swap( - &(is_occupy_flag_[copy_id]), 
false, true)) { - copy_id = (copy_id + 1) % num_worker_threads_; - } - { - spin_wr_lock l(mu_); - hash_map_.insert(std::pair(thread_id, copy_id)); - } - return copy_id; - } - } - } - - private: - int num_worker_threads_; - bool* is_occupy_flag_ = nullptr; - std::map hash_map_; - mutable easy_spinrwlock_t mu_ = EASY_SPINRWLOCK_INITIALIZER; -}; - template void LookupKeyAndSetVersion( OpKernelContext* ctx, EmbeddingVar* var, ValuePtr** value_ptrs, Tstep gs, const K* indices, - const int64 task_size, bool indices_as_pointer) { + int64 task_size, bool indices_as_pointer, + int counts_index) { + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (counts_index != -1) { + const Tensor& counts_tensor = ctx->input(counts_index); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) { + return counts[index];}; + } else { + get_count_fn = [](int64* counts, int64 index) {return 1;}; + } + auto lookup_key_and_set_version_fn = [var, value_ptrs, gs, - indices, indices_as_pointer] (int64 start, int64 limit) { + indices, indices_as_pointer, + indices_counts, get_count_fn] (int64 start, int64 limit) { ValuePtr* value_ptr = nullptr; for (int i = start; i < limit; i++) { bool is_filter = false; - var->LookupOrCreateKey(indices[i], &value_ptr, &is_filter, indices_as_pointer); + int64 count = get_count_fn(indices_counts, i); + var->LookupOrCreateKey(indices[i], &value_ptr, + &is_filter, indices_as_pointer, count); value_ptrs[i] = value_ptr; var->UpdateVersion(value_ptr, gs); } @@ -188,6 +154,39 @@ void LookupKeyAndSetVersion( worker_threads->workers, task_size, unit_cost, lookup_key_and_set_version_fn); } + +template +void LookupOrCreateEmbedding( + OpKernelContext* ctx, + std::vector*, V**>>& vars, + ValuePtr** value_ptrs, + const K* indices, + int64 num_of_keys, + IntraThreadCopyIdAllocator* thread_copy_id_alloc) { + for (auto it: vars) { + EmbeddingVar* var = it.first; + V** var_ptr = it.second; + EmbeddingVarContext ev_ctx(ctx); + var->BatchLookupOrCreateEmb( + ev_ctx, var_ptr, value_ptrs, + indices, num_of_keys, thread_copy_id_alloc); + } +} + +template +void GetEmbeddingPointers( + OpKernelContext* ctx, + std::vector*, V**>>& vars, + const K* indices, Tstep gs, bool indices_as_pointer, + int counts_index, int64 num_of_keys, + IntraThreadCopyIdAllocator* thread_copy_id_alloc) { + std::vector*> value_ptrs(num_of_keys); + LookupKeyAndSetVersion(ctx, vars[0].first, value_ptrs.data(), + gs, indices, num_of_keys, + indices_as_pointer, counts_index); + LookupOrCreateEmbedding(ctx, vars, value_ptrs.data(), + indices, num_of_keys, thread_copy_id_alloc); +} } // end namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_TRAINING_ALI_OP_HELPERS_H_ diff --git a/tensorflow/core/kernels/training_ali_ops.cc b/tensorflow/core/kernels/training_ali_ops.cc index 8e8edceb2ff..b11908a71bf 100644 --- a/tensorflow/core/kernels/training_ali_ops.cc +++ b/tensorflow/core/kernels/training_ali_ops.cc @@ -22,6 +22,7 @@ limitations under the License. 
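The two helpers above are what the GPU optimizer kernels in training_ali_ops.cc (next diff) call in place of their former per-op LookupEmbeddingPointers methods. A condensed sketch of a call site for a variable plus one slot, with illustrative names (var, accum, v, a) and N host-resident ids:

```
// Sketch: batch-resolve embedding pointers for a variable and its slot.
template <class TKey, class T>
void ResolvePointersSketch(OpKernelContext* ctx,
                           EmbeddingVar<TKey, T>* var,
                           EmbeddingVar<TKey, T>* accum,
                           T** v, T** a, const TKey* indices_host,
                           int64 gs, int64 N,
                           IntraThreadCopyIdAllocator* copy_id_alloc) {
  std::vector<std::pair<EmbeddingVar<TKey, T>*, T**>> vars(2);
  vars[0] = std::make_pair(var, v);    // parameter table
  vars[1] = std::make_pair(accum, a);  // optimizer slot
  GetEmbeddingPointers(ctx, vars, indices_host, gs,
                       /*indices_as_pointer=*/false,
                       /*counts_index=*/-1, N, copy_id_alloc);
}
```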
#include #include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/embedding/intra_thread_copy_id_allocator.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/kv_variable_ops.h" @@ -208,7 +209,8 @@ TF_CALL_float(REGISTER_CPU_KERNELS); #undef REGISTER_KERNELS #if GOOGLE_CUDA -template +template class KvSparseApplyAdagradGPUOp : public OpKernel { public: explicit KvSparseApplyAdagradGPUOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -217,55 +219,8 @@ class KvSparseApplyAdagradGPUOp : public OpKernel { int num_worker_threads = ctx->device() ->tensorflow_cpu_worker_threads() ->num_threads; - thread_copy_id_alloc_ = new ThreadCopyIdAllocator(num_worker_threads); - - get_default_v_fn_ = - [](T* default_v, TKey id, - int64 index, int64 total_dim, int64 len) { - return default_v + len * (id % total_dim); - }; - } - - ~KvSparseApplyAdagradGPUOp() { - delete thread_copy_id_alloc_; - } - - void LookupEmbeddingPointers( - OpKernelContext* ctx, EmbeddingVar* var, - EmbeddingVar* accum, ValuePtr** value_ptrs, - std::vector>& init_cursor_list, - T** v, T**a, const int64 task_size) { - uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); - auto do_work_get_ptrs = - [var, accum, value_ptrs, a, v, - &init_cursor_list, this, - main_thread_id] (int64 start, int64 limit) { - int copy_id = - thread_copy_id_alloc_->GetCopyIdOfThread(main_thread_id); - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - a[i] = accum->LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - v[i] = var->LookupOrCreateEmb( - value_ptrs[i], var->GetDefaultValue(0)); - if (is_need_set_default_value) { - init_cursor_list[copy_id].emplace_back(i); - } - } - }; - const int64 unit_cost = 1000; - auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); - Shard(worker_threads->num_threads, - worker_threads->workers, - task_size, unit_cost, do_work_get_ptrs); - - // Merge copies of init_cursor_list - for (int i = 1; i < (worker_threads->num_threads + 1); i++) { - if (init_cursor_list[i].size() > 0) { - init_cursor_list[0].splice(init_cursor_list[0].end(), - init_cursor_list[i]); - } - } + thread_copy_id_alloc_.reset( + new IntraThreadCopyIdAllocator(num_worker_threads)); } void ApplyGradients( @@ -353,44 +308,30 @@ class KvSparseApplyAdagradGPUOp : public OpKernel { N, ctx->get_allocator(AllocatorAttributes()), var, accum, key_base, grad_base, lr_scalar, gs, device); } else { - ValuePtr** value_ptrs = new ValuePtr*[N]; - TKey* indices_host = nullptr; + Tensor indices_temp_host(indices.dtype(), indices.shape()); + const Tensor* indices_host_ptr = nullptr; //Copy ids from GPU to CPU for CPU Lookup. 
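+        // When indices_as_pointer is set the ids are already host-resident
+        // (they arrive as pointers), so the device-to-host memcpy below is
+        // skipped and the input tensor is aliased directly.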
auto stream = ctx->op_device_context()->stream(); auto event_mgr = ctx->device()->tensorflow_gpu_device_info()->event_mgr; if (!indices_as_pointer) { - indices_host = new TKey[N]; + indices_host_ptr = &indices_temp_host; se::DeviceMemoryBase gpu_src( const_cast(&indices_flat(0)), N * sizeof(TKey)); - stream->ThenMemcpy(indices_host, gpu_src, N * sizeof(TKey)); + stream->ThenMemcpy(indices_host_ptr->data(), gpu_src, N * sizeof(TKey)); SyncWithEventMgr(stream, event_mgr); } else { - indices_host = const_cast(&indices_flat(0)); + indices_host_ptr = &indices; } - // Lookup ValuePtrs of ids and set version of each id in parallel - LookupKeyAndSetVersion(ctx, var, value_ptrs, - gs, indices_host, N, - indices_as_pointer); - // Get pointers to embeddings and - // check which ids need to be initialized + int counts_index = has_counts ? 6 : -1; T** v = new T*[N * 2]; T** a = v + N; - int num_worker_threads = ctx->device() - ->tensorflow_cpu_worker_threads() - ->num_threads; - std::vector> init_cursor_list( - num_worker_threads + 1); - LookupEmbeddingPointers(ctx, var, accum, - value_ptrs, init_cursor_list, - v, a, N); - - accum->SetDefaultValueOfNewFeatures( - indices_host, N, - init_cursor_list[0], - a, accum->GetDefaultValuePtr(), - get_default_v_fn_, stream, - event_mgr, ctx->eigen_device()); + std::vector*, T**>> vars(2); + vars[0] = std::pair*, T**>(var, v); + vars[1] = std::pair*, T**>(accum, a); + GetEmbeddingPointers(ctx, vars, (TKey*)indices_host_ptr->data(), + gs, indices_as_pointer, + counts_index, N, thread_copy_id_alloc_.get()); ApplyGradients( var, accum, v, a, @@ -399,11 +340,12 @@ class KvSparseApplyAdagradGPUOp : public OpKernel { stream, event_mgr, ctx->eigen_device()); - delete[] v; - delete[] value_ptrs; - if (!indices_as_pointer) { - delete[] indices_host; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(counts_index); + var->UpdateCache(*indices_host_ptr, counts_tensor); } + + delete[] v; } } } @@ -411,8 +353,7 @@ class KvSparseApplyAdagradGPUOp : public OpKernel { private: bool use_exclusive_lock_; - ThreadCopyIdAllocator* thread_copy_id_alloc_ = nullptr; - std::function get_default_v_fn_; + std::unique_ptr thread_copy_id_alloc_; }; namespace functor { @@ -444,7 +385,7 @@ DECLARE_GPU_SPEC(int64, double); .HostMemory("global_step") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvSparseApplyAdagradGPUOp);\ + KvSparseApplyAdagradGPUOp);\ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdagrad") \ .Device(DEVICE_GPU) \ .TypeConstraint("T") \ @@ -453,7 +394,7 @@ DECLARE_GPU_SPEC(int64, double); .HostMemory("global_step") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvSparseApplyAdagradGPUOp);\ + KvSparseApplyAdagradGPUOp);\ REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdagradWithCounts") \ .Device(DEVICE_GPU) \ .TypeConstraint("T") \ @@ -462,7 +403,7 @@ DECLARE_GPU_SPEC(int64, double); .HostMemory("indices_counts") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvSparseApplyAdagradGPUOp);\ + KvSparseApplyAdagradGPUOp);\ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdagradWithCounts") \ .Device(DEVICE_GPU) \ .TypeConstraint("T") \ @@ -472,7 +413,7 @@ DECLARE_GPU_SPEC(int64, double); .HostMemory("indices_counts") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvSparseApplyAdagradGPUOp); + KvSparseApplyAdagradGPUOp); #define REGISTER_GPU_KERNELS(T) \ REGISTER_KERNELS(int32, T, int32); \ REGISTER_KERNELS(int64, T, int32); \ @@ -1633,7 +1574,8 @@ TF_CALL_float(REGISTER_CPU_KERNELS); 
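The Adam, AdamAsync and AdamW kernels below follow the same scheme as Adagrad above: has_counts is a compile-time flag selecting the *WithCounts op variants, and a hard-coded counts_index (6 for Adagrad, 12 for Adam/AdamAsync, 13 for AdamW in this patch) names the op input carrying per-id frequencies. A minimal sketch of the shared tail (function name is illustrative):

```
// Sketch: the common "update cache with frequencies" tail of the
// *WithCounts optimizer variants.
template <class TKey, class T>
void MaybeUpdateCacheSketch(OpKernelContext* ctx, bool has_counts,
                            int counts_index,
                            EmbeddingVar<TKey, T>* var,
                            const Tensor& indices_host) {
  if (has_counts) {
    // e.g. input 6 (Adagrad), 12 (Adam/AdamAsync) or 13 (AdamW).
    const Tensor& counts_tensor = ctx->input(counts_index);
    var->UpdateCache(indices_host, counts_tensor);
  }
}
```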
#undef REGISTER_KERNELS #if GOOGLE_CUDA -template +template class KvSparseApplyAdamGPUOp : public OpKernel { public: explicit KvSparseApplyAdamGPUOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -1642,58 +1584,8 @@ class KvSparseApplyAdamGPUOp : public OpKernel { int num_worker_threads = ctx->device() ->tensorflow_cpu_worker_threads() ->num_threads; - thread_copy_id_alloc_ = new ThreadCopyIdAllocator(num_worker_threads); - - get_default_v_fn_ = - [](T* default_v, Tindex id, - int64 index, int64 total_dim, int64 len) { - return default_v + len * (id % total_dim); - }; - } - - ~KvSparseApplyAdamGPUOp() { - delete thread_copy_id_alloc_; - } - - void LookupEmbeddingPointers( - OpKernelContext* ctx, EmbeddingVar* var, - EmbeddingVar* m, EmbeddingVar* v, - ValuePtr** value_ptrs, - std::vector>& init_cursor_list, - T** var_ptr, T** m_ptr, T** v_ptr, const int64 task_size) { - uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); - auto do_work_get_ptrs = - [var, m, v, value_ptrs, var_ptr, m_ptr, v_ptr, - &init_cursor_list, this, - main_thread_id] (int64 start, int64 limit) { - int copy_id = - thread_copy_id_alloc_->GetCopyIdOfThread(main_thread_id); - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - m_ptr[i] = m->LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - v_ptr[i] = v->LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - var_ptr[i] = var->LookupOrCreateEmb( - value_ptrs[i], var->GetDefaultValue(0)); - if (is_need_set_default_value) { - init_cursor_list[copy_id].emplace_back(i); - } - } - }; - const int64 unit_cost = 1000; - auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); - Shard(worker_threads->num_threads, - worker_threads->workers, - task_size, unit_cost, do_work_get_ptrs); - - // Merge copies of init_cursor_list - for (int i = 1; i < (worker_threads->num_threads + 1); i++) { - if (init_cursor_list[i].size() > 0) { - init_cursor_list[0].splice(init_cursor_list[0].end(), - init_cursor_list[i]); - } - } + thread_copy_id_alloc_.reset( + new IntraThreadCopyIdAllocator(num_worker_threads)); } void ApplyGradients( @@ -1814,7 +1706,6 @@ class KvSparseApplyAdamGPUOp : public OpKernel { auto indices_flat = indices.flat(); auto grad_flat = grad.flat_outer_dims(); int64 gs = global_step.scalar()(); - ValuePtr** value_ptrs = new ValuePtr*[N]; T beta1_power_scalar = beta1_power.scalar()(); T beta2_power_scalar = beta2_power.scalar()(); T lr_scalar = lr.scalar()(); @@ -1825,42 +1716,20 @@ class KvSparseApplyAdamGPUOp : public OpKernel { Eigen::numext::sqrt(static_cast(1) - beta2_power_scalar) / (static_cast(1) - beta1_power_scalar); - // Lookup ValuePtrs of ids and set version of each id in parallel - LookupKeyAndSetVersion(ctx, var, value_ptrs, - gs, indices_flat.data(), N, - indices_as_pointer); - - // Get pointers to embeddings and - // check which ids need to be initialized + int counts_index = has_counts ? 
12 : -1; T** var_ptr = new T*[N * 3]; T** m_ptr = var_ptr + N; T** v_ptr = m_ptr + N; - int num_worker_threads = ctx->device() - ->tensorflow_cpu_worker_threads() - ->num_threads; - std::vector> init_cursor_list( - num_worker_threads + 1); - LookupEmbeddingPointers(ctx, var, m, v, - value_ptrs, init_cursor_list, - var_ptr, m_ptr, v_ptr, N); + std::vector*, T**>> vars(3); + vars[0] = std::pair*, T**>(var, var_ptr); + vars[1] = std::pair*, T**>(m, m_ptr); + vars[2] = std::pair*, T**>(v, v_ptr); + GetEmbeddingPointers(ctx, vars, indices_flat.data(), + gs, indices_as_pointer, + counts_index, N, thread_copy_id_alloc_.get()); auto stream = ctx->op_device_context()->stream(); auto event_mgr = ctx->device()->tensorflow_gpu_device_info()->event_mgr; - - m->SetDefaultValueOfNewFeatures( - indices_flat.data(), N, - init_cursor_list[0], - m_ptr, m->GetDefaultValuePtr(), - get_default_v_fn_, stream, - event_mgr, ctx->eigen_gpu_device()); - - v->SetDefaultValueOfNewFeatures( - indices_flat.data(), N, - init_cursor_list[0], - v_ptr, v->GetDefaultValuePtr(), - get_default_v_fn_, stream, - event_mgr, ctx->eigen_gpu_device()); - ApplyGradients( var, m, v, var_ptr, m_ptr, v_ptr, alpha, @@ -1869,15 +1738,19 @@ class KvSparseApplyAdamGPUOp : public OpKernel { stream, event_mgr, ctx->eigen_gpu_device()); + if (has_counts) { + const Tensor& counts_tensor = ctx->input(counts_index); + var->UpdateCache(indices, counts_tensor); + } + delete[] var_ptr; - delete[] value_ptrs; } } } private: bool use_exclusive_lock_; - ThreadCopyIdAllocator* thread_copy_id_alloc_ = nullptr; + std::unique_ptr thread_copy_id_alloc_; std::function get_default_v_fn_; }; @@ -1894,7 +1767,7 @@ class KvSparseApplyAdamGPUOp : public OpKernel { .HostMemory("global_step") \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyAdamGPUOp); \ + KvSparseApplyAdamGPUOp); \ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdam") \ .Device(DEVICE_GPU) \ .HostMemory("indices") \ @@ -1907,7 +1780,7 @@ class KvSparseApplyAdamGPUOp : public OpKernel { .HostMemory("global_step") \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyAdamGPUOp);\ + KvSparseApplyAdamGPUOp);\ REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdamWithCounts") \ .Device(DEVICE_GPU) \ .HostMemory("indices") \ @@ -1921,7 +1794,7 @@ class KvSparseApplyAdamGPUOp : public OpKernel { .HostMemory("indices_counts") \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyAdamGPUOp); \ + KvSparseApplyAdamGPUOp); \ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamWithCounts") \ .Device(DEVICE_GPU) \ .HostMemory("indices") \ @@ -1935,7 +1808,7 @@ class KvSparseApplyAdamGPUOp : public OpKernel { .HostMemory("indices_counts") \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyAdamGPUOp); + KvSparseApplyAdamGPUOp); #define REGISTER_GPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ REGISTER_KERNELS(T, int64); @@ -2670,7 +2543,8 @@ TF_CALL_double(REGISTER_CPU_KERNELS); #undef REGISTER_KERNELS #if GOOGLE_CUDA -template +template class KvSparseApplyAdamAsyncGPUOp : public OpKernel { public: explicit KvSparseApplyAdamAsyncGPUOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -2680,58 +2554,8 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { int num_worker_threads = ctx->device() ->tensorflow_cpu_worker_threads() ->num_threads; - thread_copy_id_alloc_ = new ThreadCopyIdAllocator(num_worker_threads); - - get_default_v_fn_ = - [](T* default_v, Tindex id, - int64 index, int64 total_dim, int64 len) { - return 
default_v + len * (id % total_dim); - }; - } - - ~KvSparseApplyAdamAsyncGPUOp() { - delete thread_copy_id_alloc_; - } - - void LookupEmbeddingPointers( - OpKernelContext* ctx, EmbeddingVar* var, - EmbeddingVar* m, EmbeddingVar* v, - ValuePtr** value_ptrs, - std::vector>& init_cursor_list, - T** var_ptr, T** m_ptr, T** v_ptr, const int64 task_size) { - uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); - auto do_work_get_ptrs = - [var, m, v, value_ptrs, var_ptr, m_ptr, v_ptr, - &init_cursor_list, this, - main_thread_id] (int64 start, int64 limit) { - int copy_id = - thread_copy_id_alloc_->GetCopyIdOfThread(main_thread_id); - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - m_ptr[i] = m->LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - v_ptr[i] = v->LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - var_ptr[i] = var->LookupOrCreateEmb( - value_ptrs[i], var->GetDefaultValue(0)); - if (is_need_set_default_value) { - init_cursor_list[copy_id].emplace_back(i); - } - } - }; - const int64 unit_cost = 1000; - auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); - Shard(worker_threads->num_threads, - worker_threads->workers, - task_size, unit_cost, do_work_get_ptrs); - - // Merge copies of init_cursor_list - for (int i = 1; i < (worker_threads->num_threads + 1); i++) { - if (init_cursor_list[i].size() > 0) { - init_cursor_list[0].splice(init_cursor_list[0].end(), - init_cursor_list[i]); - } - } + thread_copy_id_alloc_.reset( + new IntraThreadCopyIdAllocator(num_worker_threads)); } void ApplyGradients( @@ -2887,47 +2711,32 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { auto beta1_power_scalar = beta1_power.scalar(); auto beta2_power_scalar = beta2_power.scalar(); - ValuePtr** value_ptrs = new ValuePtr*[N]; - Tindex* indices_host = new Tindex[N]; + Tensor indices_temp_host(indices.dtype(), indices.shape()); + const Tensor* indices_host_ptr = nullptr; //Copy ids from GPU to CPU for CPU Lookup. auto stream = ctx->op_device_context()->stream(); auto event_mgr = ctx->device()->tensorflow_gpu_device_info()->event_mgr; - se::DeviceMemoryBase gpu_src( - const_cast(indices_vec.data()), N * sizeof(Tindex)); - stream->ThenMemcpy(indices_host, gpu_src, N * sizeof(Tindex)); - SyncWithEventMgr(stream, event_mgr); - // Lookup ValuePtrs of ids and set version of each id in parallel - LookupKeyAndSetVersion(ctx, var, value_ptrs, - gs, indices_host, N, - indices_as_pointer); + if (!indices_as_pointer) { + indices_host_ptr = &indices_temp_host; + se::DeviceMemoryBase gpu_src( + const_cast(&indices_vec(0)), N * sizeof(Tindex)); + stream->ThenMemcpy(indices_host_ptr->data(), gpu_src, N * sizeof(Tindex)); + SyncWithEventMgr(stream, event_mgr); + } else { + indices_host_ptr = &indices; + } - // Get pointers to embeddings and - // check which ids need to be initialized + int counts_index = has_counts ? 
12 : -1; T** var_ptr = new T*[N * 3]; T** m_ptr = var_ptr + N; T** v_ptr = m_ptr + N; - int num_worker_threads = ctx->device() - ->tensorflow_cpu_worker_threads() - ->num_threads; - std::vector> init_cursor_list( - num_worker_threads + 1); - LookupEmbeddingPointers(ctx, var, m, v, - value_ptrs, init_cursor_list, - var_ptr, m_ptr, v_ptr, N); - - m->SetDefaultValueOfNewFeatures( - indices_host, N, - init_cursor_list[0], - m_ptr, m->GetDefaultValuePtr(), - get_default_v_fn_, stream, - event_mgr, ctx->eigen_device()); - - v->SetDefaultValueOfNewFeatures( - indices_host, N, - init_cursor_list[0], - v_ptr, v->GetDefaultValuePtr(), - get_default_v_fn_, stream, - event_mgr, ctx->eigen_device()); + std::vector*, T**>> vars(3); + vars[0] = std::pair*, T**>(var, var_ptr); + vars[1] = std::pair*, T**>(m, m_ptr); + vars[2] = std::pair*, T**>(v, v_ptr); + GetEmbeddingPointers(ctx, vars, (Tindex*)indices_host_ptr->data(), + gs, indices_as_pointer, + counts_index, N, thread_copy_id_alloc_.get()); ApplyGradients( var, m, v, var_ptr, @@ -2940,9 +2749,12 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { stream, event_mgr, ctx->eigen_device()); + if (has_counts) { + const Tensor& counts_tensor = ctx->input(counts_index); + var->UpdateCache(*indices_host_ptr, counts_tensor); + } + delete[] var_ptr; - delete[] value_ptrs; - delete[] indices_host; } } MaybeForwardRefInputToRefOutput(ctx, 0, 0); @@ -2951,8 +2763,7 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { private: bool use_exclusive_lock_; bool apply_sparse_rmsprop_; - ThreadCopyIdAllocator* thread_copy_id_alloc_ = nullptr; - std::function get_default_v_fn_; + std::unique_ptr thread_copy_id_alloc_; }; #define REGISTER_KERNELS(D, T, Tindices, Tstep) \ @@ -2966,7 +2777,7 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvSparseApplyAdamAsyncGPUOp); \ + KvSparseApplyAdamAsyncGPUOp); \ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamAsync") \ .Device(DEVICE_##D) \ .HostMemory("lr") \ @@ -2977,7 +2788,7 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvSparseApplyAdamAsyncGPUOp); \ + KvSparseApplyAdamAsyncGPUOp); \ REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdamAsyncWithCounts") \ .Device(DEVICE_##D) \ .HostMemory("lr") \ @@ -2989,7 +2800,7 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvSparseApplyAdamAsyncGPUOp); \ + KvSparseApplyAdamAsyncGPUOp); \ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamAsyncWithCounts") \ .Device(DEVICE_##D) \ .HostMemory("lr") \ @@ -3001,7 +2812,7 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tindices") \ .TypeConstraint("Tstep"), \ - KvSparseApplyAdamAsyncGPUOp); + KvSparseApplyAdamAsyncGPUOp); #define REGISTER_GPU_KERNELS(T) \ REGISTER_KERNELS(GPU, T, int32, int32); \ REGISTER_KERNELS(GPU, T, int64, int32); \ @@ -3390,7 +3201,8 @@ TF_CALL_float(REGISTER_CPU_KERNELS); #undef REGISTER_KERNELS #if GOOGLE_CUDA -template +template class KvSparseApplyAdamWGPUOp : public OpKernel { public: explicit KvSparseApplyAdamWGPUOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -3399,58 +3211,8 @@ class KvSparseApplyAdamWGPUOp : public OpKernel { int num_worker_threads = ctx->device() ->tensorflow_cpu_worker_threads() ->num_threads; - thread_copy_id_alloc_ = new 
ThreadCopyIdAllocator(num_worker_threads); - - get_default_v_fn_ = - [](T* default_v, Tindex id, - int64 index, int64 total_dim, int64 len) { - return default_v + len * (id % total_dim); - }; - } - - ~KvSparseApplyAdamWGPUOp() { - delete thread_copy_id_alloc_; - } - - void LookupEmbeddingPointers( - OpKernelContext* ctx, EmbeddingVar* var, - EmbeddingVar* m, EmbeddingVar* v, - ValuePtr** value_ptrs, - std::vector>& init_cursor_list, - T** var_ptr, T** m_ptr, T** v_ptr, const int64 task_size) { - uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); - auto do_work_get_ptrs = - [var, m, v, value_ptrs, var_ptr, m_ptr, v_ptr, - &init_cursor_list, this, - main_thread_id] (int64 start, int64 limit) { - int copy_id = - thread_copy_id_alloc_->GetCopyIdOfThread(main_thread_id); - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - m_ptr[i] = m->LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - v_ptr[i] = v->LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - var_ptr[i] = var->LookupOrCreateEmb( - value_ptrs[i], var->GetDefaultValue(0)); - if (is_need_set_default_value) { - init_cursor_list[copy_id].emplace_back(i); - } - } - }; - const int64 unit_cost = 1000; - auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); - Shard(worker_threads->num_threads, - worker_threads->workers, - task_size, unit_cost, do_work_get_ptrs); - - // Merge copies of init_cursor_list - for (int i = 1; i < (worker_threads->num_threads + 1); i++) { - if (init_cursor_list[i].size() > 0) { - init_cursor_list[0].splice(init_cursor_list[0].end(), - init_cursor_list[i]); - } - } + thread_copy_id_alloc_.reset( + new IntraThreadCopyIdAllocator(num_worker_threads)); } void ApplyGradients( @@ -3572,7 +3334,6 @@ class KvSparseApplyAdamWGPUOp : public OpKernel { auto indices_flat = indices.flat(); auto grad_flat = grad.flat_outer_dims(); int64 gs = global_step.scalar()(); - ValuePtr** value_ptrs = new ValuePtr*[N]; T beta1_power_scalar = beta1_power.scalar()(); T beta2_power_scalar = beta2_power.scalar()(); T lr_scalar = lr.scalar()(); @@ -3585,41 +3346,20 @@ class KvSparseApplyAdamWGPUOp : public OpKernel { (static_cast(1) - beta1_power_scalar); // Lookup ValuePtrs of ids and set version of each id in parallel - LookupKeyAndSetVersion(ctx, var, value_ptrs, - gs, indices_flat.data(), N, - indices_as_pointer); - - // Get pointers to embeddings and - // check which ids need to be initialized + int counts_index = has_counts ? 
13 : -1; T** var_ptr = new T*[N * 3]; T** m_ptr = var_ptr + N; T** v_ptr = m_ptr + N; - int num_worker_threads = ctx->device() - ->tensorflow_cpu_worker_threads() - ->num_threads; - std::vector> init_cursor_list( - num_worker_threads + 1); - LookupEmbeddingPointers(ctx, var, m, v, - value_ptrs, init_cursor_list, - var_ptr, m_ptr, v_ptr, N); + std::vector*, T**>> vars(3); + vars[0] = std::pair*, T**>(var, var_ptr); + vars[1] = std::pair*, T**>(m, m_ptr); + vars[2] = std::pair*, T**>(v, v_ptr); + GetEmbeddingPointers(ctx, vars, indices_flat.data(), + gs, indices_as_pointer, + counts_index, N, thread_copy_id_alloc_.get()); auto stream = ctx->op_device_context()->stream(); auto event_mgr = ctx->device()->tensorflow_gpu_device_info()->event_mgr; - - m->SetDefaultValueOfNewFeatures( - indices_flat.data(), N, - init_cursor_list[0], - m_ptr, m->GetDefaultValuePtr(), - get_default_v_fn_, stream, - event_mgr, ctx->eigen_gpu_device()); - - v->SetDefaultValueOfNewFeatures( - indices_flat.data(), N, - init_cursor_list[0], - v_ptr, v->GetDefaultValuePtr(), - get_default_v_fn_, stream, - event_mgr, ctx->eigen_gpu_device()); - ApplyGradients( var, m, v, var_ptr, m_ptr, v_ptr, alpha, @@ -3629,16 +3369,19 @@ class KvSparseApplyAdamWGPUOp : public OpKernel { stream, event_mgr, ctx->eigen_gpu_device()); + if (has_counts) { + const Tensor& counts_tensor = ctx->input(counts_index); + var->UpdateCache(indices, counts_tensor); + } + delete[] var_ptr; - delete[] value_ptrs; } } } private: bool use_exclusive_lock_; - ThreadCopyIdAllocator* thread_copy_id_alloc_ = nullptr; - std::function get_default_v_fn_; + std::unique_ptr thread_copy_id_alloc_; }; #define REGISTER_KERNELS(T, Tindices) \ @@ -3655,7 +3398,7 @@ class KvSparseApplyAdamWGPUOp : public OpKernel { .HostMemory("weight_decay") \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyAdamWGPUOp); \ + KvSparseApplyAdamWGPUOp); \ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamW") \ .Device(DEVICE_GPU) \ .HostMemory("indices") \ @@ -3669,7 +3412,7 @@ class KvSparseApplyAdamWGPUOp : public OpKernel { .HostMemory("weight_decay") \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyAdamWGPUOp);\ + KvSparseApplyAdamWGPUOp);\ REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdamWWithCounts") \ .Device(DEVICE_GPU) \ .HostMemory("indices") \ @@ -3684,7 +3427,7 @@ class KvSparseApplyAdamWGPUOp : public OpKernel { .HostMemory("indices_counts") \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyAdamWGPUOp); \ + KvSparseApplyAdamWGPUOp); \ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamWWithCounts") \ .Device(DEVICE_GPU) \ .HostMemory("indices") \ @@ -3699,7 +3442,7 @@ class KvSparseApplyAdamWGPUOp : public OpKernel { .HostMemory("indices_counts") \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ - KvSparseApplyAdamWGPUOp); + KvSparseApplyAdamWGPUOp); #define REGISTER_GPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ REGISTER_KERNELS(T, int64); diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index 02272d13a6b..2a88ea32c8b 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -2132,7 +2132,6 @@ def testEmbeddingVariableForLookupTier(self): result = sess.run(tires) del os.environ["TF_SSDHASH_ASYNC_COMPACTION"] del os.environ["TF_MULTI_TIER_EV_EVICTION_THREADS"] - print(result) for i in range(0, 6): if i == 2: self.assertEqual(result[i], 1) @@ 
-2142,7 +2141,6 @@ def testEmbeddingVariableForLookupTier(self): self.assertEqual(result[i], 0) sess.run(emb, {ids:[3]}) result = sess.run(tires) - print(result) for i in range(0, 5): self.assertEqual(result[i], 0) diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 625e02a757b..e6140c9c149 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -368,6 +368,7 @@ def _init_from_args(self, self._dtype = initial_value.dtype.base_dtype self._constraint = constraint self._gather_op = None + self._counts_tensor = None if self._is_primary: self._slot_num = 0 else: @@ -799,6 +800,7 @@ def sparse_read(self, indices, name=None, ev_init_value=None, counts=None): default_value, counts, is_inference=True, name=name) + self._counts_tensor = counts else: value = gen_kv_variable_ops.kv_resource_gather(self._handle, indices, @@ -806,7 +808,6 @@ def sparse_read(self, indices, name=None, ev_init_value=None, counts=None): is_use_default_value_tensor, is_inference=True, name=name) - self._counts_tensor = counts return array_ops.identity(value) def to_proto(self, export_scope=None): From c0f9b1be99f0fd4e25ad2352d5d55af26276adab Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Wed, 21 Jun 2023 09:38:53 +0800 Subject: [PATCH 28/91] [Embedding] Fix memory leak to avoid OOM. (#897) Signed-off-by: Tao Peng --- tensorflow/core/framework/embedding/embedding_var.h | 3 +++ tensorflow/core/framework/embedding/filter_policy.h | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 201afd8bf5a..85ab9ba51a1 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -798,6 +798,9 @@ class EmbeddingVar : public ResourceBase { TypedAllocator::Deallocate(alloc_, default_value_no_permission_, value_len_); } + if (filter_) { + delete filter_; + } } private: diff --git a/tensorflow/core/framework/embedding/filter_policy.h b/tensorflow/core/framework/embedding/filter_policy.h index 53c1b69f608..49a48014cde 100644 --- a/tensorflow/core/framework/embedding/filter_policy.h +++ b/tensorflow/core/framework/embedding/filter_policy.h @@ -27,10 +27,10 @@ struct RestoreBuffer { char* freq_buffer = nullptr; ~RestoreBuffer() { - delete key_buffer; - delete value_buffer; - delete version_buffer; - delete freq_buffer; + delete []key_buffer; + delete []value_buffer; + delete []version_buffer; + delete []freq_buffer; } }; From 869677fc7a913f41efcebff6d007d1a34d2cc439 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Mon, 26 Jun 2023 15:55:55 +0800 Subject: [PATCH 29/91] [Embedding] Refine APIs for foward-backward optimization. 
(#894) Signed-off-by: lixy9474 --- .../framework/embedding/bloom_filter_policy.h | 74 ++++ .../embedding/counter_filter_policy.h | 14 + .../core/framework/embedding/embedding_var.h | 151 +++++--- .../core/framework/embedding/filter_policy.h | 7 + .../embedding/hbm_dram_ssd_storage.h | 128 ++++++- .../framework/embedding/hbm_dram_storage.h | 134 ++++++- .../framework/embedding/multi_tier_storage.h | 12 + .../embedding/nullable_filter_policy.h | 33 ++ tensorflow/core/framework/embedding/storage.h | 17 +- tensorflow/core/graph/embedding_pass.cc | 21 +- tensorflow/core/graph/graph.h | 3 + .../core/kernels/kv_variable_lookup_ops.cc | 360 +----------------- tensorflow/core/kernels/training_ali_ops.cc | 24 +- .../ops/embedding_variable_ops_gpu_test.py | 2 +- .../python/ops/embedding_variable_ops_test.py | 51 ++- 15 files changed, 595 insertions(+), 436 deletions(-) diff --git a/tensorflow/core/framework/embedding/bloom_filter_policy.h b/tensorflow/core/framework/embedding/bloom_filter_policy.h index 1ac76b51fc1..c7a1f901ab3 100644 --- a/tensorflow/core/framework/embedding/bloom_filter_policy.h +++ b/tensorflow/core/framework/embedding/bloom_filter_policy.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/filter_policy.h" +#include "tensorflow/core/framework/embedding/intra_thread_copy_id_allocator.h" namespace tensorflow { @@ -102,6 +103,70 @@ class BloomFilterPolicy : public FilterPolicy { output, num_of_keys, embedding_ptr.data(), stream, event_mgr, ctx.gpu_device); } + + void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, + const K* keys, ValuePtr** value_ptrs_list, + int64 num_of_keys) { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> lookup_or_create_ids(num_worker_threads); + std::vector> + lookup_or_create_cursor(num_worker_threads); + std::vector*>> + lookup_or_create_ptrs(num_worker_threads); + IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); + std::vector> + not_found_cursor_list(num_worker_threads + 1); + uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); + + auto do_work = [this, keys, value_ptrs_list, + &lookup_or_create_ids, + &lookup_or_create_ptrs, + &lookup_or_create_cursor, + main_thread_id, + &thread_copy_id_alloc] + (int64 start, int64 limit) { + int copy_id = + thread_copy_id_alloc.GetCopyIdOfThread(main_thread_id); + for (int i = start; i < limit; i++) { + if (GetBloomFreq(keys[i]) >= config_.filter_freq) { + lookup_or_create_ids[copy_id].emplace_back(keys[i]); + lookup_or_create_ptrs[copy_id].emplace_back(value_ptrs_list[i]); + lookup_or_create_cursor[copy_id].emplace_back(i); + } else { + AddFreq(keys[i], 1); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, + 1000, do_work); + + std::vector total_ids(num_of_keys); + std::vector*> total_ptrs(num_of_keys); + std::vector total_cursors(num_of_keys); + int num_of_admit_id = 0; + for (int i = 0; i < num_worker_threads; i++) { + if (lookup_or_create_ids[i].size() > 0) { + memcpy(total_ids.data() + num_of_admit_id, + lookup_or_create_ids[i].data(), + sizeof(K) * lookup_or_create_ids[i].size()); + memcpy(total_ptrs.data() + num_of_admit_id, + lookup_or_create_ptrs[i].data(), + sizeof(ValuePtr*) * lookup_or_create_ptrs[i].size()); + memcpy(total_cursors.data() + num_of_admit_id, + lookup_or_create_cursor[i].data(), + sizeof(int) * lookup_or_create_cursor[i].size()); + 
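+        // Compact each worker's admitted ids/value-ptrs/cursors into the
+        // shared arrays; num_of_admit_id is the running write offset.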
num_of_admit_id += lookup_or_create_ids[i].size(); + } + } + + ev_->BatchLookupOrCreateKey(ctx, total_ids.data(), total_ptrs.data(), + num_of_keys, not_found_cursor_list); + for (int i = 0; i < total_ptrs.size(); i++) { + value_ptrs_list[total_cursors[i]] = total_ptrs[i]; + } + } #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, @@ -119,6 +184,7 @@ class BloomFilterPolicy : public FilterPolicy { Status LookupOrCreateKey(K key, ValuePtr** val, bool* is_filter, int64 count) override { + *val = nullptr; if ((GetFreq(key, *val) + count) >= config_.filter_freq) { *is_filter = true; return ev_->LookupOrCreateKey(key, val); @@ -140,6 +206,14 @@ class BloomFilterPolicy : public FilterPolicy { return bloom_counter_; } + bool is_admit(K key, ValuePtr* value_ptr) override { + if (value_ptr == nullptr) { + return false; + } else { + return GetFreq(key, value_ptr) >= config_.filter_freq; + } + } + private: int64 GetBloomFreq(K key) { std::vector hash_val; diff --git a/tensorflow/core/framework/embedding/counter_filter_policy.h b/tensorflow/core/framework/embedding/counter_filter_policy.h index 5d0711585b4..ec83ee16d6d 100644 --- a/tensorflow/core/framework/embedding/counter_filter_policy.h +++ b/tensorflow/core/framework/embedding/counter_filter_policy.h @@ -74,6 +74,16 @@ class CounterFilterPolicy : public FilterPolicy { output, num_of_keys, embedding_ptr.data(), stream, event_mgr, ctx.gpu_device); } + + void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, + const K* keys, ValuePtr** value_ptrs_list, + int64 num_of_keys) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> + not_found_cursor_list(num_worker_threads + 1); + ev_->BatchLookupOrCreateKey(ctx, keys, value_ptrs_list, + num_of_keys, not_found_cursor_list); + } #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, @@ -200,6 +210,10 @@ class CounterFilterPolicy : public FilterPolicy { return Status::OK(); } + bool is_admit(K key, ValuePtr* value_ptr) override { + return (GetFreq(key, value_ptr) >= config_.filter_freq); + } + private: EmbeddingConfig config_; EV* ev_; diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 85ab9ba51a1..ca5838ea37a 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -205,19 +205,6 @@ class EmbeddingVar : public ResourceBase { update_version_fn_(value_ptr, gs); } - Status LookupOrCreateKey(K key, ValuePtr** value_ptr, - int64 update_version, embedding::CopyBackFlag &need_copyback) { - Status s = storage_->GetOrCreate(key, value_ptr, - emb_config_.total_num(storage_->GetAllocLen()), need_copyback); - TF_CHECK_OK(s); - if (emb_config_.is_primary() && - emb_config_.steps_to_live != 0 && - update_version != -1) { - (*value_ptr)->SetStep(update_version); - } - return s; - } - void BatchCommit(const std::vector& keys, const std::vector*>& value_ptrs) { TF_CHECK_OK(storage_->BatchCommit(keys, value_ptrs)); @@ -283,6 +270,57 @@ class EmbeddingVar : public ResourceBase { worker_threads->workers, num_of_keys, value_len_ * sizeof(V), do_work); } + + void GetOrCreateKey(const EmbeddingVarContext& context, + const Tensor& keys_tensor, + ValuePtr** value_ptrs, + int64 num_of_keys) { + const K* keys = (K*)keys_tensor.data(); + auto do_work = [this, keys, value_ptrs] (int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + bool is_filter = false; + 
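+        // The filter policy creates the ValuePtr for admitted keys;
+        // is_filter reports whether keys[i] has reached the admission
+        // frequency threshold yet.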
filter_->LookupOrCreateKey(keys[i], &value_ptrs[i], &is_filter, 1); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, + num_of_keys, value_len_ * sizeof(V), do_work); + + storage_->AddToCachePrefetchList(keys_tensor); + } + + void GatherEmbeddings(const EmbeddingVarContext& context, + const Tensor& keys_tensor, + ValuePtr** value_ptrs, + V* output, + int64 num_of_keys) { + const K* keys = (K*)keys_tensor.data(); + auto do_work = [this, keys, value_ptrs, output] + (int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); + add_freq_fn_(value_ptrs[i], 1, emb_config_.filter_freq); + V* value = nullptr; + if (is_admit) { + V* default_v = + default_value_ + + (keys[i] % emb_config_.default_value_dim) * value_len_; + value = LookupOrCreateEmb(value_ptrs[i], default_v); + } else { + value = default_value_no_permission_; + } + memcpy(output + i * value_len_, value, sizeof(V) * value_len_); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + + storage_->AddToCache(keys_tensor); + } + #if GOOGLE_CUDA void GetEmbeddings(const EmbeddingVarContext& context, const K* keys, @@ -293,6 +331,62 @@ class EmbeddingVar : public ResourceBase { default_value_no_permission_); } + void GetOrCreateKey(const EmbeddingVarContext& context, + const Tensor& keys_tensor, + ValuePtr** value_ptrs, + int64 num_of_keys) { + const K* keys = (K*)keys_tensor.data(); + filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); + storage_->AddToCachePrefetchList(keys_tensor); + } + + void BatchLookupOrCreateKey( + const EmbeddingVarContext& context, + const K* keys, + ValuePtr** value_ptrs, + int64 num_of_keys, + std::vector>& not_found_cursor_list) { + storage_->BatchGetOrCreate(context, keys, value_ptrs, num_of_keys, + emb_config_.total_num(storage_->GetAllocLen()), + not_found_cursor_list); + } + + void GatherEmbeddings(const EmbeddingVarContext& context, + const Tensor& keys_tensor, + ValuePtr** value_ptrs, + V* output, + int64 num_of_keys) { + std::vector embedding_ptr(num_of_keys); + const K* keys = (K*)keys_tensor.data(); + auto do_work = [this, keys, value_ptrs, output, &embedding_ptr] + (int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); + add_freq_fn_(value_ptrs[i], 1, emb_config_.filter_freq); + if (is_admit) { + V* default_v = + default_value_ + + (keys[i] % emb_config_.default_value_dim) * value_len_; + embedding_ptr[i] = LookupOrCreateEmb(value_ptrs[i], default_v); + } else { + embedding_ptr[i] = default_value_no_permission_; + } + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + + auto stream = context.compute_stream; + auto event_mgr = context.event_mgr; + CopyEmbeddingsToBuffer( + output, num_of_keys, embedding_ptr.data(), + stream, event_mgr, context.gpu_device); + + storage_->AddToCache(keys_tensor); + } + void BatchLookupOrCreateEmb( const EmbeddingVarContext& ctx, V** var_ptr, @@ -352,37 +446,6 @@ class EmbeddingVar : public ResourceBase { add_freq_fn_(value_ptr, count, emb_config_.filter_freq); } - void LookupWithFreqBatch(const K* keys, - V** memcpy_address, int64 start, int64 limit, - std::list& init_cursor, - std::list& 
copyback_cursor) { - ValuePtr* value_ptr = nullptr; - for (int64 i = start; i < limit; i++) { - embedding::CopyBackFlag copyback_flag = - embedding::CopyBackFlag::NOT_COPYBACK; - TF_CHECK_OK(LookupOrCreateKey(keys[i], &value_ptr, -1, copyback_flag)); - memcpy_address[i] = GetAddressOfGpuValuePtr(value_ptr, i, copyback_flag, - init_cursor, copyback_cursor); - } - } - - void LookupWithFreqBatch(const K* keys, - V** memcpy_address, int64 start, int64 limit, - std::list& init_cursor, - std::list& copyback_cursor, - int64* output_value_ptrs) { - ValuePtr* value_ptr = nullptr; - for (int64 i = start; i < limit; i++) { - embedding::CopyBackFlag copyback_flag = - embedding::CopyBackFlag::NOT_COPYBACK; - TF_CHECK_OK(LookupOrCreateKey(keys[i], &value_ptr, -1, copyback_flag)); - value_ptr->AddFreq(); - output_value_ptrs[i] = (int64)value_ptr; - memcpy_address[i] = GetAddressOfGpuValuePtr(value_ptr, i, copyback_flag, - init_cursor, copyback_cursor); - } - } - void BatchInitEmb(int64 size, V** memcpy_address, V* default_value, bool* init_flags, int64 value_len) { filter_->BatchInitEmb(size, memcpy_address, default_value, diff --git a/tensorflow/core/framework/embedding/filter_policy.h b/tensorflow/core/framework/embedding/filter_policy.h index 49a48014cde..51dddba3e9a 100644 --- a/tensorflow/core/framework/embedding/filter_policy.h +++ b/tensorflow/core/framework/embedding/filter_policy.h @@ -50,6 +50,11 @@ class FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) = 0; + + virtual void BatchLookupOrCreateKey( + const EmbeddingVarContext& ctx, + const K* keys, ValuePtr** value_ptrs_list, + int64 num_of_keys) = 0; #endif //GOOGLE_CUDA virtual Status LookupOrCreateKey(K key, ValuePtr** val, @@ -70,6 +75,8 @@ class FilterPolicy { int64 partition_num, bool is_filter, V* default_values) = 0; + + virtual bool is_admit(K key, ValuePtr* value_ptr) = 0; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h index c4eadbd2614..6210e27ab16 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h @@ -101,6 +101,31 @@ class HbmDramSsdStorage : public MultiTierStorage { ssd_value_ptr_list[0], value_len); } + void BatchGetOrCreate( + const EmbeddingVarContext& ctx, + const K* keys, + ValuePtr** value_ptr_list, + int64 num_of_keys, + int64 value_len, + std::vector>& not_fountd_cursor_list) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> + copyback_cursor_list(num_worker_threads + 1); + std::vector*>> + ssd_value_ptr_list(num_worker_threads + 1); + + BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, + copyback_cursor_list, ssd_value_ptr_list, + ¬_fountd_cursor_list); + + CopyEmbeddingsFromDramToHbm( + ctx, keys, value_ptr_list, copyback_cursor_list[0], + ssd_value_ptr_list[0], value_len); + + CreateValuePtrs(ctx, keys, value_ptr_list, + not_fountd_cursor_list[0], value_len); + } + void Insert(K key, ValuePtr* value_ptr) override { hbm_->Insert(key, value_ptr); } @@ -525,13 +550,30 @@ class HbmDramSsdStorage : public MultiTierStorage { ValuePtr** value_ptr_list, int64 num_of_keys, std::vector>& copyback_cursor_list, - std::vector*>>& ssd_value_ptr_list) { + std::vector*>>& ssd_value_ptr_list, + std::vector>* not_found_cursor_list = nullptr) { int num_worker_threads = ctx.worker_threads->num_threads; IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); 
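+    // not_found_cursor_list is optional: when the caller supplies it, each
+    // worker thread records the cursors of keys absent from every tier so
+    // that the caller can create those entries afterwards.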
uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); + + std::function>*, + int64, int)> set_not_found_list = 0; + if (not_found_cursor_list != nullptr) { + set_not_found_list = + [](std::vector>* not_found_cursor_list, + int64 i, int copy_id) { + (*not_found_cursor_list)[copy_id].emplace_back(i); + }; + } else { + set_not_found_list = + [](std::vector>* not_found_cursor_list, + int64 i, int copy_id) {}; + } + auto do_work = [this, keys, value_ptr_list, &thread_copy_id_alloc, main_thread_id, ©back_cursor_list, - &ssd_value_ptr_list] + &ssd_value_ptr_list, set_not_found_list, + ¬_found_cursor_list] (int64 start, int64 limit) { int copy_id = thread_copy_id_alloc.GetCopyIdOfThread(main_thread_id); @@ -549,6 +591,7 @@ class HbmDramSsdStorage : public MultiTierStorage { } } else { value_ptr_list[i] = nullptr; + set_not_found_list(not_found_cursor_list, i, copy_id); } } }; @@ -567,6 +610,16 @@ class HbmDramSsdStorage : public MultiTierStorage { ssd_value_ptr_list[i]); } } + + if (not_found_cursor_list != nullptr) { + for (int i = 1; i < worker_threads->num_threads + 1; i++) { + if ((*not_found_cursor_list)[i].size()>0) { + (*not_found_cursor_list)[0].splice( + (*not_found_cursor_list)[0].end(), + (*not_found_cursor_list)[i]); + } + } + } } void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, @@ -606,10 +659,22 @@ class HbmDramSsdStorage : public MultiTierStorage { memory_index, gpu_value_ptrs, value_len); //Insert copyback ids to hbm hash table. - auto do_insert = [this, copyback_keys, gpu_value_ptrs] + auto do_insert = [this, copyback_keys, gpu_value_ptrs, + memory_index, value_ptr_list] (int64 start, int64 limit) { - for (int64 i = start; i < limit; i++) - hbm_->Insert(copyback_keys[i], gpu_value_ptrs[i]); + for (int64 i = start; i < limit; i++) { + Status s = hbm_->TryInsert( + copyback_keys[i], gpu_value_ptrs[i]); + if (!s.ok()) { + { + mutex_lock l(memory_pool_mu_); + embedding_mem_pool_->Deallocate( + gpu_value_ptrs[i]->GetValue(0, 0)); + } + delete gpu_value_ptrs[i]; + hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); + } + } }; auto worker_threads = ctx.worker_threads; Shard(worker_threads->num_threads, worker_threads->workers, @@ -621,6 +686,59 @@ class HbmDramSsdStorage : public MultiTierStorage { } } + void CreateValuePtrs(const EmbeddingVarContext& ctx, + const K* keys, + ValuePtr** value_ptr_list, + std::list& not_found_cursors, + int64 value_len) { + int64 total = not_found_cursors.size(); + if (total > 0) { + std::vector*>> insert_pairs(total); + std::vector cursor_index(total); + //Create Hbm ValuePtrs. + { + int64 i = 0; + auto it = not_found_cursors.cbegin(); + //Mutex with eviction thread + mutex_lock l(memory_pool_mu_); + for ( ; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); + V* val_ptr = embedding_mem_pool_->Allocate(); + bool flag = gpu_value_ptr->SetPtr(val_ptr); + if (!flag) { + embedding_mem_pool_->Deallocate(val_ptr); + } + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; + } + } + + //Insert copyback ids to hbm hash table. 
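+      //TryInsert can lose the race with a concurrent insert of the same key;
+      //on failure, free the tentative ValuePtr and reuse the winner's entry.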
+ auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index] + (int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) { + Status s = hbm_->TryInsert( + insert_pairs[i].first, insert_pairs[i].second); + if (!s.ok()) { + { + mutex_lock l(memory_pool_mu_); + embedding_mem_pool_->Deallocate( + insert_pairs[i].second->GetValue(0, 0)); + } + delete insert_pairs[i].second; + hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, + total, 100000, do_insert); + } + } + void AddCopyBackFlagToValuePtr( ValuePtr** value_ptr, CopyBackFlag copyback_flag) { int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; diff --git a/tensorflow/core/framework/embedding/hbm_dram_storage.h b/tensorflow/core/framework/embedding/hbm_dram_storage.h index 2921f873908..ed7d197555f 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_storage.h @@ -88,6 +88,28 @@ class HbmDramStorage : public MultiTierStorage { hbm_->Insert(key, value_ptr); } + void BatchGetOrCreate( + const EmbeddingVarContext& ctx, + const K* keys, + ValuePtr** value_ptr_list, + int64 num_of_keys, + int64 value_len, + std::vector>& not_fountd_cursor_list) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> + copyback_cursor_list(num_worker_threads + 1); + + BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, + copyback_cursor_list, ¬_fountd_cursor_list); + + CopyEmbeddingsFromDramToHbm( + ctx, keys, value_ptr_list, copyback_cursor_list[0], + value_len); + + CreateValuePtrs(ctx, keys, value_ptr_list, + not_fountd_cursor_list[0], value_len); + } + void Insert(K key, ValuePtr** value_ptr, size_t alloc_len) override { hbm_->Insert(key, value_ptr, alloc_len); @@ -449,16 +471,34 @@ class HbmDramStorage : public MultiTierStorage { dram_->SetTotalDims(total_dims); } private: - void BatchGetValuePtrs(const EmbeddingVarContext& ctx, - const K* keys, - ValuePtr** value_ptr_list, - int64 num_of_keys, - std::vector>& copyback_cursor_list) { + void BatchGetValuePtrs( + const EmbeddingVarContext& ctx, + const K* keys, + ValuePtr** value_ptr_list, + int64 num_of_keys, + std::vector>& copyback_cursor_list, + std::vector>* not_found_cursor_list = nullptr) { int num_worker_threads = ctx.worker_threads->num_threads; IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); + + std::function>*, + int64, int)> set_not_found_list = 0; + if (not_found_cursor_list != nullptr) { + set_not_found_list = + [](std::vector>* not_found_cursor_list, + int64 i, int copy_id) { + (*not_found_cursor_list)[copy_id].emplace_back(i); + }; + } else { + set_not_found_list = + [](std::vector>* not_found_cursor_list, + int64 i, int copy_id) {}; + } + auto do_work = [this, keys, value_ptr_list, &thread_copy_id_alloc, - main_thread_id, ©back_cursor_list] + main_thread_id, ©back_cursor_list, + set_not_found_list, ¬_found_cursor_list] (int64 start, int64 limit) { int copy_id = thread_copy_id_alloc.GetCopyIdOfThread(main_thread_id); @@ -473,6 +513,7 @@ class HbmDramStorage : public MultiTierStorage { } } else { value_ptr_list[i] = nullptr; + set_not_found_list(not_found_cursor_list, i, copy_id); } } }; @@ -487,6 +528,16 @@ class HbmDramStorage : public MultiTierStorage { copyback_cursor_list[i]); } } + + if (not_found_cursor_list != nullptr) { + for (int i = 1; i 
< worker_threads->num_threads + 1; i++) { + if ((*not_found_cursor_list)[i].size()>0) { + (*not_found_cursor_list)[0].splice( + (*not_found_cursor_list)[0].end(), + (*not_found_cursor_list)[i]); + } + } + } } void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, @@ -525,16 +576,81 @@ class HbmDramStorage : public MultiTierStorage { memory_index, gpu_value_ptrs, value_len); //Insert copyback ids to hbm hash table. - auto do_insert = [this, copyback_keys, gpu_value_ptrs] + auto do_insert = [this, copyback_keys, gpu_value_ptrs, + memory_index, value_ptr_list] (int64 start, int64 limit) { - for (int64 i = start; i < limit; i++) - hbm_->Insert(copyback_keys[i], gpu_value_ptrs[i]); + for (int64 i = start; i < limit; i++) { + Status s = hbm_->TryInsert( + copyback_keys[i], gpu_value_ptrs[i]); + if (!s.ok()) { + { + mutex_lock l(memory_pool_mu_); + embedding_mem_pool_->Deallocate( + gpu_value_ptrs[i]->GetValue(0, 0)); + } + delete gpu_value_ptrs[i]; + hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); + } + } }; auto worker_threads = ctx.worker_threads; Shard(worker_threads->num_threads, worker_threads->workers, total, 100000, do_insert); } + void CreateValuePtrs(const EmbeddingVarContext& ctx, + const K* keys, + ValuePtr** value_ptr_list, + std::list& not_found_cursors, + int64 value_len) { + int64 total = not_found_cursors.size(); + if (total > 0) { + std::vector*>> insert_pairs(total); + std::vector cursor_index(total); + //Create Hbm ValuePtrs. + { + int64 i = 0; + auto it = not_found_cursors.cbegin(); + //Mutex with eviction thread + mutex_lock l(memory_pool_mu_); + for ( ; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); + V* val_ptr = embedding_mem_pool_->Allocate(); + bool flag = gpu_value_ptr->SetPtr(val_ptr); + if (!flag) { + embedding_mem_pool_->Deallocate(val_ptr); + } + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; + } + } + + //Insert copyback ids to hbm hash table. 
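+      //As above, a failed TryInsert means another thread won the race: free
+      //the local ValuePtr and fetch the entry that is already in the table.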
+ auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index] + (int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) { + Status s = hbm_->TryInsert( + insert_pairs[i].first, insert_pairs[i].second); + if (!s.ok()) { + { + mutex_lock l(memory_pool_mu_); + embedding_mem_pool_->Deallocate( + insert_pairs[i].second->GetValue(0, 0)); + } + delete insert_pairs[i].second; + hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, + total, 100000, do_insert); + } + } + void AddCopyBackFlagToValuePtr( ValuePtr** value_ptr, CopyBackFlag copyback_flag) { int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index e948eb7be6b..277c20157cd 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -257,6 +257,18 @@ class MultiTierStorage : public Storage { }); } + void AddToCachePrefetchList(const Tensor& indices) override { + Schedule([this, indices]() { + cache_->add_to_prefetch_list(indices); + }); + } + + void AddToCache(const Tensor& indices) override { + Schedule([this, indices]() { + cache_->add_to_cache(indices); + }); + } + protected: virtual void SetTotalDims(int64 total_dims) = 0; diff --git a/tensorflow/core/framework/embedding/nullable_filter_policy.h b/tensorflow/core/framework/embedding/nullable_filter_policy.h index 0f3ae6a0050..34a9976a4f1 100644 --- a/tensorflow/core/framework/embedding/nullable_filter_policy.h +++ b/tensorflow/core/framework/embedding/nullable_filter_policy.h @@ -80,6 +80,35 @@ class NullableFilterPolicy : public FilterPolicy { output, num_of_keys, embedding_ptr.data(), stream, event_mgr, ctx.gpu_device); } + + void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, + const K* keys, ValuePtr** value_ptrs, + int64 num_of_keys) { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> + not_found_cursor_list(num_worker_threads + 1); + ev_->BatchLookupOrCreateKey(ctx, keys, value_ptrs, + num_of_keys, not_found_cursor_list); + std::vector var_ptrs(num_of_keys); + auto do_work = [this, value_ptrs, &var_ptrs] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + bool is_need_set_default_value = false; + var_ptrs[i] = ev_->LookupOrCreateEmb( + value_ptrs[i], is_need_set_default_value); + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, + 1000, do_work); + + ev_->SetDefaultValueOfNewFeatures( + keys, num_of_keys, + not_found_cursor_list[0], + var_ptrs.data(), ctx.compute_stream, + ctx.event_mgr, ctx.gpu_device); + } #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, @@ -195,6 +224,10 @@ class NullableFilterPolicy : public FilterPolicy { return Status::OK(); } + bool is_admit(K key, ValuePtr* value_ptr) override { + return true; + } + private: EmbeddingConfig config_; embedding::Storage* storage_; diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index 2fe84b57088..2ac2e8f6523 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -63,11 +63,21 @@ class Storage { TF_DISALLOW_COPY_AND_ASSIGN(Storage); virtual Status Get(K key, ValuePtr** 
value_ptr) = 0; +#if GOOGLE_CUDA virtual void BatchGet(const EmbeddingVarContext& ctx, const K* key, ValuePtr** value_ptr_list, int64 num_of_keys, int64 value_len) {} + + virtual void BatchGetOrCreate( + const EmbeddingVarContext& ctx, + const K* key, + ValuePtr** value_ptr_list, + int64 num_of_keys, + int64 value_len, + std::vector>& not_found_cursor_list) {} +#endif //GOOGLE_CUDA virtual Status Contains(K key) = 0; virtual void Insert(K key, ValuePtr** value_ptr, size_t alloc_len) = 0; virtual void InsertToDram(K key, ValuePtr** value_ptr, @@ -199,12 +209,9 @@ class Storage { virtual void UpdateCache(const Tensor& indices) {} - virtual void UpdateCache(const K* indices, - int64 num_indices, - const Tensor& indices_counts) {} + virtual void AddToCachePrefetchList(const Tensor& indices) {} - virtual void UpdateCache(const K* keys, - int64 num_indices) {} + virtual void AddToCache(const Tensor& indices) {} protected: int64 alloc_len_ = 0; diff --git a/tensorflow/core/graph/embedding_pass.cc b/tensorflow/core/graph/embedding_pass.cc index ce2e79ff253..5abc68ed3ec 100644 --- a/tensorflow/core/graph/embedding_pass.cc +++ b/tensorflow/core/graph/embedding_pass.cc @@ -34,20 +34,18 @@ void VLogGraphDebugString(Graph* g) { // Embedding ForwardBackward Joint Optimization, should before smart-stage class EmbeddingForwardBackwardJointOptimizationPass : public GraphOptimizationPass { public: - EmbeddingForwardBackwardJointOptimizationPass() : GraphOptimizationPass() { + EmbeddingForwardBackwardJointOptimizationPass() : GraphOptimizationPass() {} + + Status Run(const GraphOptimizationPassOptions& options) override { + bool embedding_fbj_opt = false; tensorflow::ReadBoolFromEnvVar("TF_EMBEDDING_FBJ_OPT", - /*default_val=*/false, &embedding_fbj_opt_); - if (!embedding_fbj_opt_) { + /*default_val=*/false, &embedding_fbj_opt); + if (!embedding_fbj_opt) { VLOG(2) << "Graph Optimization Pass TF_EMBEDDING_FBJ_OPT is off."; + return Status::OK(); } else { VLOG(2) << "Graph Optimization Pass TF_EMBEDDING_FBJ_OPT is on."; } - } - - Status Run(const GraphOptimizationPassOptions& options) override { - if (!embedding_fbj_opt_) { - return Status::OK(); - } if (options.graph == nullptr) { // TODO(apassos) returning OK feels weird here as we can't do anything @@ -147,7 +145,7 @@ class EmbeddingForwardBackwardJointOptimizationPass : public GraphOptimizationPa } else if (0 == e->src_output() && e->dst()->type_string() == "Reshape") { Node* reshape = e->dst(); for (const Edge *e : reshape->out_edges()) { - if (0 == e->src_output() && e->dst()->type_string() == "Unique") { + if (0 == e->src_output() && e->dst()->IsUnique()) { TF_RETURN_IF_ERROR(GetApplyOpNode(e->dst(), apply_node)); *apply_edge = const_cast(e); } @@ -215,9 +213,6 @@ class EmbeddingForwardBackwardJointOptimizationPass : public GraphOptimizationPa g->RemoveNode(node); return Status::OK(); } - - private: - bool embedding_fbj_opt_ = false; }; REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 23, diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 1bf4f2c210b..0e7e032c9a5 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -206,6 +206,9 @@ class Node { type_string() == "KvResourceSparseApplyGradientDescentWithCounts" || type_string() == "KvResourceSparseApplyAdamWWithCounts"; } + bool IsUnique() const { + return type_string() == "Unique" || type_string() == "UniqueWithCounts"; + } bool IsPlaceholder() const { return type_string() == "Placeholder"; } bool IsSwitch() const { return class_ == 
NC_SWITCH; } bool IsMerge() const { return class_ == NC_MERGE; } diff --git a/tensorflow/core/kernels/kv_variable_lookup_ops.cc b/tensorflow/core/kernels/kv_variable_lookup_ops.cc index 57af5ed916e..dc1566239a1 100644 --- a/tensorflow/core/kernels/kv_variable_lookup_ops.cc +++ b/tensorflow/core/kernels/kv_variable_lookup_ops.cc @@ -95,7 +95,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) #undef REGISTER_KERNELS_ALL #undef REGISTER_KV_LOOKUP_RESOURCE -template +template class KvResourceLookupIDOp : public OpKernel { public: explicit KvResourceLookupIDOp(OpKernelConstruction* c) : OpKernel(c) { @@ -119,19 +119,10 @@ class KvResourceLookupIDOp : public OpKernel { auto indices_flat = indices.flat(); const int64 indices_size = static_cast(indices_flat.dimension(0)); - auto do_work = [this, indices_flat, - out_base, ev] (int64 start, int64 limit) { - for (int64 i = start; i < limit; ++i) { - ValuePtr* value_ptr; - bool is_filter = false; - ev->LookupOrCreateKey(indices_flat(i), &value_ptr, &is_filter, false); - *(out_base + i) = (int64)value_ptr; - } - }; - - auto worker_threads = c->device()->tensorflow_cpu_worker_threads(); - Shard(worker_threads->num_threads, worker_threads->workers, indices_size, - 100, do_work); + EmbeddingVarContext ev_ctx(c); + ev->GetOrCreateKey(ev_ctx, indices, + reinterpret_cast**>(out_base), + indices_size); } } }; @@ -139,11 +130,9 @@ class KvResourceLookupIDOp : public OpKernel { #define REGISTER_KERNELS(dev, ktype, vtype) \ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceLookupID") \ .Device(DEVICE_##dev) \ - .HostMemory("resource") \ - .HostMemory("indices") \ .TypeConstraint("dtype") \ .TypeConstraint("Tkeys"), \ - KvResourceLookupIDOp) + KvResourceLookupIDOp) #define REGISTER_KERNELS_ALL(dev, type) \ REGISTER_KERNELS(dev, int32, type); \ REGISTER_KERNELS(dev, int64, type) @@ -155,142 +144,6 @@ TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) #undef REGISTER_KERNELS #if GOOGLE_CUDA -template -class KvResourceLookupIDGPUOp : public OpKernel { - public: - explicit KvResourceLookupIDGPUOp(OpKernelConstruction* c) : OpKernel(c) { - } - - ~KvResourceLookupIDGPUOp() { - delete[] occupy_flag_; - } - - void Compute(OpKernelContext* c) override { - EmbeddingVar* ev = nullptr; - OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &ev)); - core::ScopedUnref unref_me(ev); - const Tensor& indices = c->input(1); - const int64 N = indices.NumElements(); - - TensorShape result_shape = indices.shape(); - - Tensor* out = nullptr; - OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out)); - OP_REQUIRES(c, !ev->IsSingleHbm(), - errors::InvalidArgument( - "EV with HBM storage can't be used in KvResourceLookupIDGPUOp.")); - - if (N > 0) { - auto out_flat = out->flat(); - int64* out_base = &out_flat(0); - - auto indices_flat = indices.flat(); - const int64 indices_size = static_cast(indices_flat.dimension(0)); - TValue** memcpy_address = new TValue*[indices_size]; - auto worker_threads = c->device()->tensorflow_cpu_worker_threads(); - int64 num_threads = worker_threads->num_threads; - if (occupy_flag_ == nullptr) { - mutex_lock l(m_init_occupy_flag_); - //double check - if (occupy_flag_ == nullptr) { - occupy_flag_ = new bool[num_threads]; - memset(occupy_flag_, 0, sizeof(bool) * num_threads); - } - } - std::vector> init_cursor_list( - worker_threads->num_threads + 1); - std::vector> copyback_cursor_list( - worker_threads->num_threads + 1); - uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); - //Lookup memory address of features - auto do_work = [this, indices_flat, - 
out_base, c, ev, - memcpy_address, &init_cursor_list, - ©back_cursor_list, main_thread_id, - num_threads] (int64 start, int64 limit) { - uint64 thread_id = Env::Default()->GetCurrentThreadId(); - int64 position; - if (thread_id == main_thread_id) { - position = num_threads; - } else { - position = -1; - { - spin_rd_lock l(mu_); - auto iter = hash_map_.find(thread_id); - if (iter != hash_map_.end()) { - position = iter->second; - } - } - - if (position == -1) { - // bind a new thread to a local cursor_list - position = thread_id % num_threads; - while (!__sync_bool_compare_and_swap(&(occupy_flag_[position]), - false, true)) { - position = (position + 1) % num_threads; - } - { - spin_wr_lock l(mu_); - hash_map_.insert(std::pair(thread_id, position)); - } - } - } - ev->LookupWithFreqBatch(indices_flat.data(), memcpy_address, - start, limit, init_cursor_list[position], - copyback_cursor_list[position], out_base); - }; - Shard(worker_threads->num_threads, worker_threads->workers, indices_size, - 100000, do_work); - //Merge init_cursor_list and copyback_cursor_list - for (int i = 1; i < worker_threads->num_threads + 1; i++) { - if (init_cursor_list[i].size()>0) { - init_cursor_list[0].splice(init_cursor_list[0].end(), - init_cursor_list[i]); - } - if (copyback_cursor_list[i].size()>0) { - copyback_cursor_list[0].splice(copyback_cursor_list[0].end(), - copyback_cursor_list[i]); - } - } - //Pointers in memcpy_address here will - //be cast to ValuePtr* in this funcation. - auto stream = c->op_device_context()->stream(); - auto event_mgr = c->device()->tensorflow_gpu_device_info()->event_mgr; - - ev->AllocateMemoryForNewFeatures( - memcpy_address, - init_cursor_list[0]); - std::function< - TValue*(TValue*, TKey, int64, int64, int64)> get_default_v_fn; - get_default_v_fn = [](TValue* default_v, TKey id, int64 index, - int64 total_dim, int64 len) { - return default_v + len * (id % total_dim); - }; - TValue* default_v = ev->GetDefaultValuePtr(); - ev->SetDefaultValueOfNewFeatures( - indices_flat.data(), indices_size, - init_cursor_list[0], memcpy_address, - stream, event_mgr, - c->eigen_gpu_device()); - - ev->CopyEmbeddingsFromCPUToGPU( - indices_flat.data(), - copyback_cursor_list[0], - memcpy_address, stream, - event_mgr, c->eigen_gpu_device(), - worker_threads, out_base); - - delete[] memcpy_address; - } - } - - private: - std::map hash_map_; - mutable easy_spinrwlock_t mu_ = EASY_SPINRWLOCK_INITIALIZER; - bool* occupy_flag_ = nullptr; - mutex m_init_occupy_flag_; -}; - #define REGISTER_KERNELS(dev, ktype, vtype) \ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceLookupID") \ .Device(DEVICE_##dev) \ @@ -298,7 +151,7 @@ class KvResourceLookupIDGPUOp : public OpKernel { .HostMemory("pointer") \ .TypeConstraint("dtype") \ .TypeConstraint("Tkeys"), \ - KvResourceLookupIDGPUOp) + KvResourceLookupIDOp) #define REGISTER_KERNELS_ALL(dev, type) \ REGISTER_KERNELS(dev, int32, type); \ REGISTER_KERNELS(dev, int64, type) @@ -310,45 +163,10 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) #undef REGISTER_KERNELS #endif // GOOGLE_CUDA -template +template class KvResourceCollectEmbeddingOp : public OpKernel { public: - explicit KvResourceCollectEmbeddingOp(OpKernelConstruction* c) : OpKernel(c) { - OP_REQUIRES_OK(c, - c->GetAttr("is_use_default_value_tensor", - &is_use_default_value_tensor_)); - if (is_use_default_value_tensor_) { - get_default_v_fn_ = [](TValue* default_v, TKey id, int64 index, - int64 total_dim, int64 len) { - return default_v + len * index; - }; - } else { - get_default_v_fn_ = [](TValue* 
default_v, TKey id, int64 index, - int64 total_dim, int64 len) { - return default_v + len * (id % total_dim) ; - }; - } - if (c->num_inputs() == 5) { - get_count_fn_ = [](const int32* count, int64 index) { - return count[index]; - }; - } else { - get_count_fn_ = [](const int32* count, int64 index) { - return 1; - }; - } - lookup_fn_ = [](EmbeddingVar* ev, TKey key, - TValue* val, TValue* default_v, int count) { - if (key) { - TValue* mem_val = - ev->LookupOrCreateEmb((ValuePtr*)key, default_v); - memcpy(val, mem_val, sizeof(TValue) * ev->ValueLen()); - } else { - memcpy(val, default_v, sizeof(TValue) * ev->ValueLen()); - } - return Status::OK(); - }; - } + explicit KvResourceCollectEmbeddingOp(OpKernelConstruction* c) : OpKernel(c) {} void Compute(OpKernelContext* c) override { EmbeddingVar* ev = nullptr; @@ -365,10 +183,6 @@ class KvResourceCollectEmbeddingOp : public OpKernel { Tensor* out = nullptr; OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out)); - int32* counts = nullptr; - if (c->num_inputs() == 5) - counts = (int32*)c->input(4).data(); - if (N > 0) { auto out_flat = out->shaped({N, out->NumElements() / N}); TValue* out_base = &out_flat(0, 0); @@ -377,12 +191,6 @@ class KvResourceCollectEmbeddingOp : public OpKernel { auto pointer_flat = pointer.flat(); const int64 indices_size = static_cast(indices_flat.dimension(0)); const int64 slice_elems = out_flat.dimension(1); - TValue* default_v = nullptr; - if (is_use_default_value_tensor_) { - default_v = (TValue*)c->input(3).data(); - } else { - default_v = ev->GetDefaultValuePtr(); - } OP_REQUIRES(c, ev->ValueLen() == slice_elems, errors::InvalidArgument( "ev's value_len should same with output's dimension(1)", @@ -393,32 +201,12 @@ class KvResourceCollectEmbeddingOp : public OpKernel { "MultiLevel EV's Cache size ", ev->CacheSize(), " should large than IDs in batch ", N)); const size_t slice_bytes = slice_elems * sizeof(TValue); - auto do_work = [this, indices_flat, pointer_flat, - out_base, slice_elems, c, default_v, ev, counts] ( - int64 start, int64 limit) { - for (int64 i = start; i < limit; ++i) { - TValue* default_v_ptr = get_default_v_fn_( - default_v, indices_flat(i), i, ev->GetDefaultValueDim(), - ev->ValueLen()); - int32 count = get_count_fn_(counts, i); - OP_REQUIRES_OK(c, lookup_fn_(ev, pointer_flat(i), - out_base + i * slice_elems, default_v_ptr, count)); - } - }; - auto worker_threads = c->device()->tensorflow_cpu_worker_threads(); - Shard(worker_threads->num_threads, - worker_threads->workers, indices_size, - slice_bytes, do_work); + EmbeddingVarContext ev_ctx(c); + ev->GatherEmbeddings(ev_ctx, indices, + (ValuePtr**)pointer.data(), + out_base, N); } } - - private: - bool is_use_default_value_tensor_; - std::function< - TValue*(TValue*, TKey, int64, int64, int64)> get_default_v_fn_; - std::function get_count_fn_; - std::function* ev, - TKey key, TValue* val, TValue* default_v, int count)> lookup_fn_; }; #define REGISTER_KERNELS(dev, ktype, vtype) \ @@ -431,7 +219,7 @@ class KvResourceCollectEmbeddingOp : public OpKernel { .HostMemory("output") \ .TypeConstraint("dtype") \ .TypeConstraint("Tkeys"), \ - KvResourceCollectEmbeddingOp) + KvResourceCollectEmbeddingOp) #define REGISTER_KERNELS_ALL(dev, type) \ REGISTER_KERNELS(dev, int32, type); \ @@ -443,124 +231,6 @@ TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) #undef REGISTER_KERNELS #if GOOGLE_CUDA -template -class KvResourceCollectEmbeddingGPUOp : public OpKernel { - public: - explicit KvResourceCollectEmbeddingGPUOp(OpKernelConstruction* c) : OpKernel(c) { - 
OP_REQUIRES_OK(c, - c->GetAttr("is_use_default_value_tensor", - &is_use_default_value_tensor_)); - if (is_use_default_value_tensor_) { - get_default_v_fn_ = [](TValue* default_v, TKey id, int64 index, - int64 total_dim, int64 len) { - return default_v + len * index; - }; - } else { - get_default_v_fn_ = [](TValue* default_v, TKey id, int64 index, - int64 total_dim, int64 len) { - return default_v + len * (id % total_dim) ; - }; - } - if (c->num_inputs() == 5) { - get_count_fn_ = [](const int32* count, int64 index) { - return count[index]; - }; - } else { - get_count_fn_ = [](const int32* count, int64 index) { - return 1; - }; - } - lookup_fn_ = [](EmbeddingVar* ev, TKey key, - TValue* val, TValue* default_v, int count) { - if (key) { - TValue* mem_val = ev->LookupOrCreateEmb((ValuePtr*)key, default_v); - memcpy(val, mem_val, sizeof(TValue) * ev->ValueLen()); - } else { - memcpy(val, default_v, sizeof(TValue) * ev->ValueLen()); - } - return Status::OK(); - }; - } - - void Compute(OpKernelContext* c) override { - EmbeddingVar* ev = nullptr; - OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &ev)); - core::ScopedUnref unref_me(ev); - const Tensor& indices = c->input(1); - const Tensor& pointer = c->input(2); - const int64 N = indices.NumElements(); - - TensorShape result_shape = indices.shape(); - TensorShape value_shape({ev->ValueLen()}); - result_shape.AppendShape(value_shape); - - Tensor* out = nullptr; - OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out)); - - int32* counts = nullptr; - if (c->num_inputs() == 5) - counts = (int32*)c->input(4).data(); - - OP_REQUIRES(c, !ev->IsSingleHbm(), errors::InvalidArgument( - "EV with HBM storage can't be used in KvResourceCollectEmbeddingOp.")); - - if (N > 0) { - auto out_flat = out->shaped({N, out->NumElements() / N}); - TValue* out_base = &out_flat(0, 0); - - auto indices_flat = indices.flat(); - auto pointer_flat = pointer.flat(); - const int64 indices_size = static_cast(indices_flat.dimension(0)); - const int64 slice_elems = out_flat.dimension(1); - TValue* default_v = nullptr; - if (is_use_default_value_tensor_) { - default_v = (TValue*)c->input(3).data(); - } else { - default_v = ev->GetDefaultValuePtr(); - } - OP_REQUIRES(c, ev->ValueLen() == slice_elems, - errors::InvalidArgument( - "ev's value_len should same with output's dimension(1)", - std::to_string(slice_elems), std::to_string(ev->ValueLen()))); - OP_REQUIRES(c, !ev->IsMultiLevel() || - (ev->IsMultiLevel() && ev->CacheSize() >= N), - errors::InvalidArgument( - "MultiLevel EV's Cache size ", ev->CacheSize(), - " should large than IDs in batch ", N)); - const size_t slice_bytes = slice_elems * sizeof(TValue); - TValue** memcpy_address = new TValue*[indices_size]; - auto do_work = [pointer_flat, memcpy_address] (int64 start, int64 limit) { - for (int64 i = start; i < limit; i++) { - ValuePtr* value_ptr = (ValuePtr*)pointer_flat(i); - memcpy_address[i] = value_ptr->GetValue(0, 0); - } - }; - auto worker_threads = c->device()->tensorflow_cpu_worker_threads(); - Shard(worker_threads->num_threads, - worker_threads->workers, indices_size, - slice_bytes, do_work); - - auto stream = c->op_device_context()->stream(); - auto event_mgr = c->device()->tensorflow_gpu_device_info()->event_mgr; - ev->CopyEmbeddingsToBuffer( - out_base, indices_size, - memcpy_address, - stream, event_mgr, - c->eigen_gpu_device()); - - delete[] memcpy_address; - } - } - - private: - bool is_use_default_value_tensor_; - std::function< - TValue*(TValue*, TKey, int64, int64, int64)> get_default_v_fn_; 
- std::function get_count_fn_; - std::function* ev, - TKey key, TValue* val, TValue* default_v, int count)> lookup_fn_; -}; - #define REGISTER_KERNELS(dev, ktype, vtype) \ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceCollectEmbedding") \ .Device(DEVICE_##dev) \ @@ -569,7 +239,7 @@ class KvResourceCollectEmbeddingGPUOp : public OpKernel { .HostMemory("default_value") \ .TypeConstraint("dtype") \ .TypeConstraint("Tkeys"), \ - KvResourceCollectEmbeddingGPUOp) + KvResourceCollectEmbeddingOp) #define REGISTER_KERNELS_ALL(dev, type) \ REGISTER_KERNELS(dev, int32, type); \ diff --git a/tensorflow/core/kernels/training_ali_ops.cc b/tensorflow/core/kernels/training_ali_ops.cc index b11908a71bf..839ce82feef 100644 --- a/tensorflow/core/kernels/training_ali_ops.cc +++ b/tensorflow/core/kernels/training_ali_ops.cc @@ -160,7 +160,7 @@ class KvSparseApplyAdagradOp : public OpKernel { auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, N, cost, do_work); - if (has_counts) { + if (has_counts && !indices_as_pointer) { const Tensor& indices_counts = ctx->input(6); var->UpdateCache(indices, indices_counts); } @@ -340,7 +340,7 @@ class KvSparseApplyAdagradGPUOp : public OpKernel { stream, event_mgr, ctx->eigen_device()); - if (has_counts) { + if (has_counts && !indices_as_pointer) { const Tensor& counts_tensor = ctx->input(counts_index); var->UpdateCache(*indices_host_ptr, counts_tensor); } @@ -600,7 +600,7 @@ class KvSparseApplyFtrlOp : public OpKernel { auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, N, cost, do_work); - if (has_counts) { + if (has_counts && !indices_as_pointer) { const int counts_input_index = has_l2_shrinkage ? 
10 : 9; const Tensor& indices_counts = ctx->input(counts_input_index); var_->UpdateCache(indices, indices_counts); @@ -1328,7 +1328,7 @@ class KvSparseApplyAdagradDecayOp : public OpKernel { const int64 cost = 1000; auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, N, cost, do_work); - if (has_counts) { + if (has_counts && !indices_as_pointer) { const Tensor& indices_counts = ctx->input(10); var->UpdateCache(indices, indices_counts); } @@ -1531,7 +1531,7 @@ class KvSparseApplyAdamOp : public OpKernel { const int64 cost = 1000; auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, N, cost, DoWork); - if (has_counts) { + if (has_counts && !indices_as_pointer) { const Tensor& indices_counts = ctx->input(12); var->UpdateCache(indices, indices_counts); } @@ -1738,7 +1738,7 @@ class KvSparseApplyAdamGPUOp : public OpKernel { stream, event_mgr, ctx->eigen_gpu_device()); - if (has_counts) { + if (has_counts && !indices_as_pointer) { const Tensor& counts_tensor = ctx->input(counts_index); var->UpdateCache(indices, counts_tensor); } @@ -2488,7 +2488,7 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { beta1_power_scalar() *= beta1_scalar; beta2_power_scalar() *= beta2_scalar; } - if (has_counts) { + if (has_counts && !indices_as_pointer) { const Tensor& indices_counts = ctx->input(12); var->UpdateCache(indices, indices_counts); } @@ -2749,7 +2749,7 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { stream, event_mgr, ctx->eigen_device()); - if (has_counts) { + if (has_counts && !indices_as_pointer) { const Tensor& counts_tensor = ctx->input(counts_index); var->UpdateCache(*indices_host_ptr, counts_tensor); } @@ -2780,6 +2780,7 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { KvSparseApplyAdamAsyncGPUOp); \ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamAsync") \ .Device(DEVICE_##D) \ + .HostMemory("indices") \ .HostMemory("lr") \ .HostMemory("beta1") \ .HostMemory("beta2") \ @@ -2803,6 +2804,7 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { KvSparseApplyAdamAsyncGPUOp); \ REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamAsyncWithCounts") \ .Device(DEVICE_##D) \ + .HostMemory("indices") \ .HostMemory("lr") \ .HostMemory("beta1") \ .HostMemory("beta2") \ @@ -2953,7 +2955,7 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { const int64 cost = 1000; auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, N, cost, do_work); - if (has_counts) { + if (has_counts && !indices_as_pointer) { const Tensor& indices = ctx->input(5); var->UpdateCache(indices, indices_counts); } else { @@ -3158,7 +3160,7 @@ class KvSparseApplyAdamWOp : public OpKernel { const int64 cost = 1000; auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, N, cost, DoWork); - if (has_counts) { + if (has_counts && !indices_as_pointer) { const Tensor& indices_counts = ctx->input(13); var->UpdateCache(indices, indices_counts); } @@ -3369,7 +3371,7 @@ class KvSparseApplyAdamWGPUOp : public OpKernel { stream, event_mgr, ctx->eigen_gpu_device()); - if (has_counts) { + if (has_counts && !indices_as_pointer) { const Tensor& counts_tensor = ctx->input(counts_index); var->UpdateCache(indices, counts_tensor); } diff --git a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py 
b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py index 58b6083ef24..0a62f93af54 100644 --- a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py @@ -503,7 +503,7 @@ def runTestAdagrad(self, var, g): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) - sess.run([train_op], {ids:[1,2,3]}) + sess.run([train_op], {ids:[1,1,2,3]}) sess.run([train_op], {ids:[1,2,4]}) sess.run([train_op], {ids:[1,2,2]}) sess.run([train_op], {ids:[1,2,5]}) diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index 2a88ea32c8b..1175342f410 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -2180,7 +2180,7 @@ def runTestAdagrad(self, var, g): self.assertEqual(result[i], 0) sess.run([train_op], {ids:[3, 5]}) - sess.run([train_op], {ids:[4]}) + sess.run([train_op], {ids:[4, 4]}) r1 = sess.run(emb, {ids:[1,2,4,5]}) r2 = sess.run(emb, {ids:[3]}) r = r1.tolist() + r2.tolist() @@ -2248,8 +2248,8 @@ def runTestAdagrad(self, var, g): self.assertEqual(result[i], 0) r1 = sess.run(emb, {ids:[1,2,5,6]}) - r2 = sess.run(emb, {ids:[4]}) - r3 = sess.run(emb, {ids:[3]}) + r2 = sess.run(emb, {ids:[4, 4]}) + r3 = sess.run(emb, {ids:[3, 3]}) r = r1.tolist() + r2.tolist() + r3.tolist() return r @@ -2501,5 +2501,50 @@ def testEmbeddingVariableCustomDimForSaveAndRestore(self): self.assertAllEqual(emb_ori_2[:,0:2], emb_val_2) del os.environ["TF_EV_RESTORE_CUSTOM_DIM"] + def testCPUFbjOpt(self): + print("testCPUFbjOpt") + os.environ["TF_EMBEDDING_FBJ_OPT"] = "True" + self._OpitmizerTestTemplate("Adagrad") + del os.environ["TF_EMBEDDING_FBJ_OPT"] + + + def testCPUFbjOptWithCounterFilter(self): + print("testCPUFbjOpt") + os.environ["TF_EMBEDDING_FBJ_OPT"] = "True" + self._CounterFilterTestTemplate("Adagrad") + del os.environ["TF_EMBEDDING_FBJ_OPT"] + + def testCPUFbjOptWithBloomFilter(self): + print("testCPUFbjOptWithBloomFilter") + os.environ["TF_EMBEDDING_FBJ_OPT"] = "True" + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CBFFilter( + filter_freq=3, + max_element_size = 5, + false_positive_probability = 0.01))) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + emb1, top, l = sess.run([emb, train_op, loss]) + emb1, top, l = sess.run([emb, train_op, loss]) + emb1, top, l = sess.run([emb, train_op, loss]) + for val in emb1.tolist()[0]: + self.assertEqual(val, .0) + emb1, top, l = sess.run([emb, train_op, loss]) + for val in emb1.tolist()[0]: + self.assertNotEqual(val, 1.0) + del os.environ["TF_EMBEDDING_FBJ_OPT"] + if __name__ == "__main__": googletest.main() From 3c982f93da7e89e919dc8a81b89267d9c23f20d6 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Tue, 
27 Jun 2023 14:57:00 +0800 Subject: [PATCH 30/91] [Allocator] Make ARENA_ARRAY_SIZE to be configurable. (#899) Signed-off-by: Tao Peng --- tensorflow/core/framework/ev_allocator.cc | 2 +- tensorflow/core/framework/ev_allocator.h | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/framework/ev_allocator.cc b/tensorflow/core/framework/ev_allocator.cc index eb79e57f180..fd7db948365 100644 --- a/tensorflow/core/framework/ev_allocator.cc +++ b/tensorflow/core/framework/ev_allocator.cc @@ -43,7 +43,7 @@ class CPUChunk : public Chunk { : Chunk(chunk_size, slot_size) {} ~CPUChunk() { - delete start_; + port::AlignedFree(start_); } void GetMemBlock() override { diff --git a/tensorflow/core/framework/ev_allocator.h b/tensorflow/core/framework/ev_allocator.h index 5028d45c4d9..d3251b14782 100644 --- a/tensorflow/core/framework/ev_allocator.h +++ b/tensorflow/core/framework/ev_allocator.h @@ -27,11 +27,12 @@ limitations under the License. #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/env_var.h" #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) -#define ARENA_ARRAY_SIZE 1024 +#define ARENA_ARRAY_SIZE 128 namespace tensorflow { @@ -465,7 +466,7 @@ class ThreadLocalBin { } } -private: + private: void FlushBackToArena(int num) { std::unordered_map*, std::vector> bin_ptr_map; for (int i = 0; i < num; i++) { @@ -480,7 +481,7 @@ class ThreadLocalBin { } } -private: + private: size_t t_bin_size_; PageMap *page_map_ = nullptr; // not owned Arena *arena_ = nullptr; // not owned @@ -544,9 +545,17 @@ class EVAllocatorImpl { pthread_key_create(&key_, ThreadLocalCacheCleanup); page_map_ = new PageMap(); page_map_->Init(); - arenas_ = new std::vector>(ARENA_ARRAY_SIZE, page_map_); + + int64 arena_array_size = ARENA_ARRAY_SIZE; + Status s = ReadInt64FromEnvVar("ARENA_ARRAY_SIZE", + ARENA_ARRAY_SIZE, &arena_array_size); + if (!s.ok()) { + LOG(ERROR) << "Read ARENA_ARRAY_SIZE env error: " << s.error_message(); + } + LOG(INFO) << "EVAllocator set arena array size: " << arena_array_size; + + arenas_ = new std::vector>(arena_array_size, page_map_); arena_cur_index = 0; - } ~EVAllocatorImpl() { From 0734a9cb1757b6a94286c69e09b1bc07fd85fde5 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Wed, 28 Jun 2023 12:04:33 +0800 Subject: [PATCH 31/91] [Stage] Reimplement PrefetchRunner in C++. 
(#890) Signed-off-by: chenbangduo.cbd --- docs/docs_en/Stage.md | 26 +- docs/docs_zh/Stage.md | 26 +- tensorflow/cc/BUILD | 10 + tensorflow/cc/training/prefetch_runner.cc | 267 +++++++++++++ tensorflow/cc/training/prefetch_runner.h | 128 +++++++ tensorflow/core/protobuf/config.proto | 27 +- tensorflow/python/BUILD | 44 ++- tensorflow/python/client/session.py | 9 + tensorflow/python/ops/prefetch.py | 152 +++++--- tensorflow/python/ops/prefetch_runner.cc | 50 +++ tensorflow/python/ops/prefetch_runner.h | 41 ++ tensorflow/python/ops/prefetch_runner.i | 30 ++ tensorflow/python/ops/prefetch_runner.py | 355 ------------------ tensorflow/python/ops/prefetch_runner_hook.py | 50 +++ tensorflow/python/ops/prefetch_test.py | 96 +---- tensorflow/python/tensorflow.i | 2 + .../python/training/async_embedding_stage.py | 31 +- 17 files changed, 790 insertions(+), 554 deletions(-) create mode 100644 tensorflow/cc/training/prefetch_runner.cc create mode 100644 tensorflow/cc/training/prefetch_runner.h create mode 100644 tensorflow/python/ops/prefetch_runner.cc create mode 100644 tensorflow/python/ops/prefetch_runner.h create mode 100644 tensorflow/python/ops/prefetch_runner.i delete mode 100644 tensorflow/python/ops/prefetch_runner.py create mode 100644 tensorflow/python/ops/prefetch_runner_hook.py diff --git a/docs/docs_en/Stage.md b/docs/docs_en/Stage.md index 91dd8e04e58..ec029cf06c2 100644 --- a/docs/docs_en/Stage.md +++ b/docs/docs_en/Stage.md @@ -8,18 +8,20 @@ A TensorFlow training task is usually composed of sample data reading and graph `tf.staged`, prefetch the input `features`, and return the prefetched tensor. -| parameter | description | default value | -| ----------------------- | ------------------------------------------------------------ | ------------------------------------------------------ | -| features | The tensor that needs to be executed asynchronously: tensor, list of tensor (each element in the list is a tensor) or dict of tensor (the keys in the dict are all strings, and the values are all tensors) | required | -| capacity | The maximum number of cached `items` asynchronous execution results. | 1 | -| num_threads | Number of threads to execute `items` asynchronously. | 1 | -| items | A list of feed_dict keys that `items` depends on | None, `items` does not depend on feed_dict | -| feed_generator | A generator object for the value of the feed_dict that `items` depends on. A generator object in Python is a method that yields a list. Through this generator object, users can use pure Python for flexible data preprocessing, similar to tensor_pack, see examples for interface and usage. | None, `features` does not depend on feed_dict | -| closed_exception_types | Exception types recognized as graceful exits | (`tf.errors.OutOfRangeError`, `errors.CancelledError`) | -| ignored_exception_types | Exception types that are recognized to be ignored and skipped | () | -| use_stage_subgraph_thread_pool | Whether to run the Stage subgraph on an independent thread pool, you need to create an independent thread pool first | False(Optional, if it is True, a separate thread pool must be created first) | -| stage_subgraph_thread_pool_id | If you enable the stage subgraph to run on the independent thread pool to specify the independent thread pool index, you need to create an independent thread pool first, and enable the use_stage_subgraph_thread_pool option. 
| 0, The index range is [0, the number of independent thread pools created - 1] | -| stage_subgraph_stream_id | In the GPU Multi-Stream scenario, the index of gpu stream used by stage subgraph | 0 (optional, 0 means that the stage subgraph shares the gpu stream used by the main graph, the index range is [0, total number of GPU streams -1]) | +| parameter | description | default value | +| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | +| features | The tensor that needs to be executed asynchronously: tensor, list of tensor (each element in the list is a tensor) or dict of tensor (the keys in the dict are all strings, and the values are all tensors). | required | +| feed_dict | A dictionary that maps graph elements to tensors. | {} | +| capacity | The maximum number of cached `items` asynchronous execution results. | 1 | +| num_threads | Number of threads to execute `items` asynchronously. | 1 | +| num_client | Number of clients of prefetched sample. | 1 | +| timeout_millis | Max milliseconds put op can take. | 300000 ms | +| closed_exception_types | Exception types recognized as graceful exits. | (`tf.errors.OUT_OF_RANGE`,) | +| ignored_exception_types | Exception types that are recognized to be ignored and skipped. | () | +| use_stage_subgraph_thread_pool | Whether to run the Stage subgraph on an independent thread pool, you need to create an independent thread pool first. | False (If it is True, a separate thread pool must be created first) | +| stage_subgraph_thread_pool_id | If you enable the stage subgraph to run on the independent thread pool to specify the independent thread pool index, you need to create an independent thread pool first, and enable the use_stage_subgraph_thread_pool option. | 0, The index range is [0, the number of independent thread pools created - 1] | +| stage_subgraph_stream_id | In the GPU Multi-Stream scenario, the index of gpu stream used by stage subgraph. | 0 (0 means that the stage subgraph shares the gpu stream used by the main graph, the index range is [0, total number of GPU streams -1]) | +| name | Name of prefetching operations. | None (Automatic generated) | Adds `tf.make_prefetch_hook()`hook when create session. 
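+
+A minimal usage sketch of the updated interface (`build_model` and the random
+input below are illustrative placeholders, not part of the API):
+
+```python
+import tensorflow as tf
+
+# Any tensor, list of tensors or dict of tensors can be prefetched.
+features = {"ids": tf.random.uniform([128], maxval=1000, dtype=tf.int64)}
+# Two background threads keep up to four prefetched results buffered.
+prefetched = tf.staged(features, capacity=4, num_threads=2)
+loss = build_model(prefetched["ids"])
+
+with tf.train.MonitoredTrainingSession(
+    hooks=[tf.make_prefetch_hook()]) as sess:
+  while not sess.should_stop():
+    sess.run(loss)
+```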
diff --git a/docs/docs_zh/Stage.md b/docs/docs_zh/Stage.md index 02f67a31bf0..20fffa70d21 100644 --- a/docs/docs_zh/Stage.md +++ b/docs/docs_zh/Stage.md @@ -8,18 +8,20 @@ `tf.staged`,对输入的 `features` 进行预取,返回预取后的 tensor。 -| 参数 | 含义 | 默认值 | -| ----------------------- | ------------------------------------------------------------ | ------------------------------------------------------ | -| features | 需要异步化执行的 op,可以是 tensor、list of tensor(list 中每一个元素都是 tensor ) 或者 dict of tensor (dict 中的 key 都是 string,value 都是 tensor) | 必选参数 | -| capacity | 缓存的 `items`异步化执行结果的最大个数。 | 1 | -| num_threads | 异步化执行 `items`的线程数。 | 1 | -| items | `items`依赖的 feed_dict 的 key 的列表 | None,即 `items`不依赖 feed_dict | -| feed_generator | `items`依赖的 feed_dict 的 value 的 generator 对象。Python 中一个 generator 对象是一种通过 yield 产生 list 的方法。通过这个 generator 对象,用户可以使用纯 Python 进行灵活的数据预处理,类似于 tensor_pack,接口与用法见示例。 | None,即 `features`不依赖 feed_dict | -| closed_exception_types | 被识别为正常退出的异常类型 | (`tf.errors.OutOfRangeError`, `errors.CancelledError`) | -| ignored_exception_types | 被识别可忽略跳过的异常类型 | () | -| use_stage_subgraph_thread_pool | 是否在独立线程池上运行Stage子图,需要先创建独立线程池 | False(可选,若为True则必须先创建独立线程池) | -| stage_subgraph_thread_pool_id | 如果开启了在独立线程池上运行Stage子图,用于指定独立线程池索引,需要先创建独立线程池,并打开use_stage_subgraph_thread_pool选项。 | 0,索引范围为[0, 创建的独立线程池数量-1] | -| stage_subgraph_stream_id | GPU Multi-Stream 场景下, stage子图执行使用的gpu stream的索引 | 0(可选,0表示stage子图共享计算主图使用的gpu stream, 索引范围为[0, gpu stream总数-1]) | +| 参数 | 含义 | 默认 | +| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------ | +| features | 需要异步化执行的 op,可以是 tensor、list of tensor(list 中每一个元素都是 tensor ) 或者 dict of tensor (dict 中的 key 都是 string,value 都是 tensor)| 必选参数 | +| feed_dict | 将stage子图元素映射为对应tensor的字典 | {} | +| capacity | 缓存的 `items`异步化执行结果的最大个数 | 1 | +| num_threads | 异步化执行 `items`的线程数 | 1 | +| num_clients | 消耗预取结果的消费者数量 | 1 | +| timeout_millis | 预取结果等待缓存区可用的最大等待时间,超时后本次预取结果将会被丢弃 | 300000 ms | +| closed_exception_types | 被识别为正常退出的异常类型 | (`tf.errors.OUT_OF_RANGE`,) | +| ignored_exception_types | 被识别可忽略跳过的异常类型 | () | +| use_stage_subgraph_thread_pool | 是否在独立线程池上运行Stage子图,需要先创建独立线程池 | False(若为True则必须先创建独立线程池) | +| stage_subgraph_thread_pool_id | 如果开启了在独立线程池上运行Stage子图,用于指定独立线程池索引,需要先创建独立线程池,并打开use_stage_subgraph_thread_pool选项 | 0,索引范围为[0, 创建的独立线程池数量-1] | +| stage_subgraph_stream_id | GPU Multi-Stream 场景下, stage子图执行使用的gpu stream的索引 | 0 (0表示stage子图共享计算主图使用的gpu stream, 索引范围为[0, gpu stream总数-1]) | +| name | 预取操作的名称 | None (表示自动生成) | Session中加入`tf.make_prefetch_hook()`hook diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 07de89f997e..49b3c4de3a3 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -736,6 +736,16 @@ tf_cc_binary( ], ) +cc_library( + name = "prefetch_runner", + srcs = ["training/prefetch_runner.cc"], + hdrs = ["training/prefetch_runner.h"], + deps = [ + ":coordinator", + "//tensorflow/core:core_cpu", + ], +) + cc_library( name = "queue_runner", srcs = ["training/queue_runner.cc"], diff --git a/tensorflow/cc/training/prefetch_runner.cc b/tensorflow/cc/training/prefetch_runner.cc new file mode 100644 index 00000000000..b543056ca81 --- /dev/null +++ b/tensorflow/cc/training/prefetch_runner.cc @@ -0,0 +1,267 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/training/prefetch_runner.h"
+#include "tensorflow/core/platform/default/logging.h"
+
+namespace tensorflow {
+/*----------------------------- PrefetchRunner -------------------------------*/
+
+PrefetchRunner::PrefetchRunner(std::string graph_key, std::string runner_name,
+                               Session* sess, Coordinator* coord,
+                               PrefetchRunnerOptions& params)
+    : graph_key_(graph_key),
+      name_(runner_name),
+      sess_(sess),
+      coord_(coord),
+      params_(params),
+      thread_nums_(0),
+      is_running_(false) {}
+
+PrefetchRunner::~PrefetchRunner() {
+  Join();
+}
+
+std::string PrefetchRunner::graph_key() const {
+  return graph_key_;
+}
+
+std::string PrefetchRunner::name() const {
+  return name_;
+}
+
+bool PrefetchRunner::IsRunning() const {
+  return is_running_;
+}
+
+void PrefetchRunner::Start() {
+  {
+    mutex_lock l(mu_);
+    thread_nums_ = params_.fetch_ops().size();
+  }
+
+  thread_pool_.resize(thread_nums_);
+  for (size_t i = 0; i < thread_nums_; i++)
+    thread_pool_[i].reset(new std::thread(&PrefetchRunner::Run, this, i));
+
+  if (coord_)
+    cancel_thread_.reset(new std::thread(&PrefetchRunner::Stop, this));
+
+  is_running_ = true;
+}
+
+void PrefetchRunner::Stop() {
+  if (coord_)
+    coord_->WaitForStop();
+
+  sess_->Run({}, {}, {params_.cancel_op()}, nullptr);
+  is_running_ = false;
+}
+
+Status PrefetchRunner::Join() {
+  for (size_t i = 0; i < thread_pool_.size(); i++) {
+    if (thread_pool_[i] != nullptr && thread_pool_[i]->joinable())
+      thread_pool_[i]->join();
+  }
+
+  if (cancel_thread_ != nullptr && cancel_thread_->joinable()) {
+    cancel_thread_->join();
+  }
+
+  return Status::OK();
+}
+
+void PrefetchRunner::Run(size_t index) {
+  Status status;
+
+  // initialize tensor buffer.
+  status = sess_->Run({}, {}, {params_.resume_op()}, nullptr);
+  if (status != Status::OK()) {
+    DealWithCancelledUnexpectedError(status);
+    return;
+  }
+
+  // get the value generators: map each consumer (feed) op in the main graph
+  // to the tensor that produces its value.
+  auto feed_in_tensors = params_.named_feed_input_tensors();
+  std::vector<std::string> val_consume_ops;
+  std::vector<std::string> val_gen_tensors;
+  val_consume_ops.reserve(feed_in_tensors.size());
+  val_gen_tensors.reserve(feed_in_tensors.size());
+  for (auto iter = feed_in_tensors.begin(); iter != feed_in_tensors.end();
+       ++iter) {
+    val_consume_ops.emplace_back(iter->first);
+    val_gen_tensors.emplace_back(iter->second);
+  }
+
+  std::vector<Tensor> gen_val;
+  std::vector<std::pair<std::string, Tensor>> gen_inputs;
+  gen_val.reserve(feed_in_tensors.size());
+  gen_inputs.reserve(feed_in_tensors.size());
+  while (true) {
+    if (TF_PREDICT_FALSE(coord_ && coord_->ShouldStop())) {
+      mutex_lock l(mu_);
+      thread_nums_--;
+      return;
+    }
+
+    // generate feed tensors
+    if (!val_gen_tensors.empty()) {
+      gen_val.clear();
+      gen_inputs.clear();
+      status = sess_->Run(params_.run_options(), {}, val_gen_tensors, {},
+                          &gen_val, nullptr);
+      if (TF_PREDICT_FALSE(status != Status::OK())) {
+        if (!CheckRunErrorStatus(status))
+          return;
+      }
+
+      for (size_t i = 0; i < val_gen_tensors.size(); i++) {
+        gen_inputs.emplace_back(val_consume_ops[i], gen_val[i]);
+      }
+    }
+
+    // run prefetch subgraph.
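+    // Each iteration feeds the freshly generated values and runs one fetch
+    // op (a tensor-buffer put); the put blocks while the buffer is full,
+    // bounded by timeout_millis, which paces prefetching to the consumer.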
+ status = sess_->Run(params_.run_options(), gen_inputs, {}, + {params_.fetch_ops(index)}, nullptr, nullptr); + if (TF_PREDICT_FALSE(status != Status::OK())) { + if (!CheckRunErrorStatus(status)) + return; + } + } +} + +/// Only Status Code is in `ignored_exceptions`, return `true`, +/// otherwise return `false`. +bool PrefetchRunner::CheckRunErrorStatus(Status& s) { + auto& closed_e = params_.closed_exceptions(); + auto& ignored_e = params_.ignored_exceptions(); + if (error::CANCELLED == s.code()) { + DealWithCancelledError(); + return false; + } else if (std::count(closed_e.begin(), closed_e.end(), s.code()) != 0) { + DealWithClosedError(); + return false; + } else if (std::count(ignored_e.begin(), ignored_e.end(), s.code()) != 0) { + DealWithIgnoredError(s); + return true; + } + + DealWithCancelledUnexpectedError(s); + return false; +} + +void PrefetchRunner::DealWithCancelledError() { + LOG(INFO) << "PrefetchRunner <" << name_ << "> Prefetching was cancelled."; + { + mutex_lock l (mu_); + thread_nums_--; + } +} + +void PrefetchRunner::DealWithClosedError() { + LOG(INFO) << "PrefetchRunner <" << name_ << "> Prefetching was closed."; + { + mutex_lock l(mu_); + thread_nums_--; + if (thread_nums_ == 0) + sess_->Run({}, {}, {params_.close_op()}, nullptr); + } +} + +void PrefetchRunner::DealWithIgnoredError(Status& s) { + LOG(WARNING) << "PrefetchRunner <" << name_ + << "> Corrupted inputs were ignored in prefetching: " + << s.error_message(); +} + +void PrefetchRunner::DealWithCancelledUnexpectedError(Status& s) { + LOG(ERROR) << "PrefetchRunner <" << name_ + << "> Prefetching was cancelled unexpectedly: " + << s.error_message(); + if (coord_) + coord_->RequestStop(); + + { + mutex_lock l (mu_); + thread_nums_--; + } +} + +/*----------------------------- PrefetchRunnerMgr ----------------------------*/ + +/*static*/ PrefetchRunnerMgr* PrefetchRunnerMgr::singleton() { + static PrefetchRunnerMgr* instance = new PrefetchRunnerMgr; + return instance; +} + +PrefetchRunnerMgr::PrefetchRunnerMgr() {} + +PrefetchRunnerMgr::~PrefetchRunnerMgr() { + register_runner_options_.clear(); + prefetch_runners_.clear(); + coords_.clear(); +} + +Status PrefetchRunnerMgr::RegisterPrefetchRunner( + std::string graph_key, std::string runner_name, + PrefetchRunnerOptions& params) { + if (register_runner_options_[graph_key].count(runner_name)) + return Status(errors::AlreadyExists("PrefetchRunner <" + runner_name + + "> has already existed in graph <" + + graph_key + ">.")); + + register_runner_options_[graph_key][runner_name] = params; + return Status::OK(); +} + +Status PrefetchRunnerMgr::StartRunners(std::string graph_key, Session* sess) { + if (register_runner_options_.count(graph_key) == 0) + return Status( + errors::NotFound("graph <" + graph_key + "> has no PrefetchRunner")); + + if (prefetch_runners_.count(sess) != 0) + return Status(errors::AlreadyExists( + "PrefetchRunners has already started in Session.")); + + // Create and Start the PrefetchRunners. 
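+  // One Coordinator per session: each runner registers with it so that
+  // StopRunners can request a coordinated stop and join every thread.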
+  coords_[sess].reset(new Coordinator());
+
+  for (auto option : register_runner_options_[graph_key]) {
+    std::unique_ptr<PrefetchRunner> runner(new PrefetchRunner(
+        graph_key, option.first, sess, coords_[sess].get(), option.second));
+    prefetch_runners_[sess].insert(runner.get());
+    coords_[sess]->RegisterRunner(std::move(runner));
+  }
+
+  for (auto runner : prefetch_runners_[sess])
+    runner->Start();
+
+  return Status::OK();
+}
+
+Status PrefetchRunnerMgr::StopRunners(std::string graph_key, Session* sess) {
+  if (prefetch_runners_.count(sess) == 0)
+    return Status(errors::NotFound("No PrefetchRunners run in Session"));
+
+  coords_[sess]->RequestStop();
+  for (auto runner : prefetch_runners_[sess])
+    runner->Join();
+
+  coords_.erase(sess);
+  prefetch_runners_.erase(sess);
+
+  return Status::OK();
+}
+
+}  // end of namespace tensorflow
diff --git a/tensorflow/cc/training/prefetch_runner.h b/tensorflow/cc/training/prefetch_runner.h
new file mode 100644
index 00000000000..ab7c2f726c9
--- /dev/null
+++ b/tensorflow/cc/training/prefetch_runner.h
@@ -0,0 +1,128 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CC_TRAINING_PREFETCH_RUNNER_H_
+#define TENSORFLOW_CC_TRAINING_PREFETCH_RUNNER_H_
+
+#include <atomic>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/cc/training/coordinator.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+
+/// PrefetchRunner class is responsible for prefetching tensors by repeatedly
+/// running the given ops.
+class PrefetchRunner : public RunnerInterface {
+ public:
+  PrefetchRunner(std::string graph_key, std::string runner_name,
+                 Session* sess, Coordinator* coord,
+                 PrefetchRunnerOptions& params);
+
+  /// The destructor joins all the threads.
+  ~PrefetchRunner();
+
+  std::string name() const;
+
+  std::string graph_key() const;
+
+  /// Returns true iff the runner is running.
+  bool IsRunning() const override;
+
+  /// Starts the prefetch runner with the given session.
+  void Start();
+
+  /// Requests to stop and runs the cancel op. It is called in a separate
+  /// thread when a coordinator is set. If there is no coordinator it should
+  /// be called before calling Join.
+  void Stop();
+
+  /// Joins all the threads. Returns OK if all threads run successfully;
+  /// otherwise returns the first captured failure status.
+  Status Join() final;
+
+ private:
+  std::string graph_key_;
+  std::string name_;
+  /// Parameters required for the session to run.
+  PrefetchRunnerOptions params_;
+  Session* sess_;       // not owned
+  Coordinator* coord_;  // not owned
+
+  mutex mu_;
+  size_t thread_nums_ GUARDED_BY(mu_);
+  std::atomic<bool> is_running_;
+  std::atomic<bool> force_stop_;
+  std::vector<std::unique_ptr<std::thread>> thread_pool_;
+  std::unique_ptr<std::thread> cancel_thread_;
+
+  /// Run prefetch subgraph.
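+  /// Body of the i-th prefetch thread; `index` selects which fetch op this
+  /// thread repeats until cancellation, closure, or an unexpected error.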
+  void Run(size_t index);
+
+  /// Check the return status of a Session::Run call; return `true` if
+  /// execution can continue, otherwise return `false`.
+  bool CheckRunErrorStatus(Status& s);
+
+  void DealWithCancelledError();
+
+  void DealWithClosedError();
+
+  void DealWithIgnoredError(Status& s);
+
+  void DealWithCancelledUnexpectedError(Status& s);
+};
+
+/// PrefetchRunnerMgr class is used to manage PrefetchRunners.
+class PrefetchRunnerMgr {
+ public:
+  static PrefetchRunnerMgr* singleton();
+  /// Add a new PrefetchRunner to PrefetchRunnerMgr.
+  Status RegisterPrefetchRunner(std::string graph_key, std::string runner_name,
+                                PrefetchRunnerOptions& params);
+
+  /// Start all PrefetchRunners.
+  Status StartRunners(std::string graph_key, Session* sess);
+
+  /// Stop all PrefetchRunners.
+  Status StopRunners(std::string graph_key, Session* sess);
+
+ private:
+  PrefetchRunnerMgr();
+  ~PrefetchRunnerMgr();
+
+  // map<runner_name, PrefetchRunnerOptions>
+  typedef std::unordered_map<std::string, PrefetchRunnerOptions>
+      name_runner_options_map_t;
+  // map<graph_key, name_runner_options_map_t>
+  std::unordered_map<std::string, name_runner_options_map_t>
+      register_runner_options_;
+
+  // map<Session*, Coordinator>
+  std::map<Session*, std::unique_ptr<Coordinator>> coords_;
+  // map<Session*, PrefetchRunner*>. PrefetchRunner* is not owned.
+  std::map<Session*, std::set<PrefetchRunner*>> prefetch_runners_;
+};
+
+}  // end of namespace tensorflow
+
+#endif  // TENSORFLOW_CC_TRAINING_PREFETCH_RUNNER_H_
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 11617b2087b..5b5c5b07f6f 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -11,6 +11,7 @@ option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobu
 import "tensorflow/core/framework/cost_graph.proto";
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/step_stats.proto";
+import "tensorflow/core/lib/core/error_codes.proto";
 import "tensorflow/core/protobuf/cluster.proto";
 import "tensorflow/core/protobuf/debug.proto";
 import "tensorflow/core/protobuf/rewriter_config.proto";
@@ -216,11 +217,35 @@ message GPUOptions {
   bool cuda_graph_enable_jit = 11;
 }
 
+// Options passed to the prefetch runner.
+message PrefetchRunnerOptions {
+  // RunOptions used when the prefetch subgraph is run in a session.
+  RunOptions run_options = 1;
+  // Ops repeated by the prefetch runner, one per thread.
+  repeated string fetch_ops = 2;
+  // Op that stops fetch_ops when the prefetch runner stops.
+  string cancel_op = 3;
+  // Op that restarts fetch_ops when this runner starts.
+  string resume_op = 4;
+  // Op that closes the data buffer.
+  string close_op = 5;
+  // (Optional) named_feed_input_tensors is in the format
+  // map<feed_op, value_generator_op>.
+  // `feed_op` is the consumer op in the main graph.
+  // `value_generator_op` is the generator op that produces the value for
+  // `feed_op`.
+  map<string, string> named_feed_input_tensors = 6;
+  // (Optional) Exception types indicating that the prefetching is normally
+  // finished.
+  repeated error.Code closed_exceptions = 7;
+  // (Optional) Exception types indicating that the prefetching can continue.
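+  // For example, error.INVALID_ARGUMENT can be listed so that records that
+  // fail to parse are skipped instead of stopping the runner.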
+ repeated error.Code ignored_exceptions= 8; +} + // Options passed to the async embedding message AsyncEmbeddingOptions { // Prefetch threads num int32 threads_num = 1; - // Prefetch buffer size + // Prefetch buffer size int32 capacity = 2; // Use stage subgraph thread pool for stage subgraph or not bool use_stage_subgraph_thread_pool = 3; diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index d43a5ea4f85..edaf20749d9 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -136,8 +136,8 @@ py_library( ":nn", ":ops", ":platform", - ":prefetch", - ":prefetch_runner", + ":prefetch", + ":prefetch_runner_hook", ":proto_ops", ":pywrap_tensorflow", ":rnn_ops_gen", @@ -2966,21 +2966,27 @@ py_library( ":framework", ":framework_ops", ":framework_for_generated_wrappers", + ":prefetch_runner_hook", ], ) -py_library( +cc_library( name = "prefetch_runner", - srcs = ["ops/prefetch_runner.py"], - srcs_version = "PY2AND3", + srcs = ["ops/prefetch_runner.cc"], + hdrs = ["ops/prefetch_runner.h"], deps = [ - ":array_ops", - ":clip_ops", - ":control_flow_ops", - ":tensor_buffer_ops_gen", - ":framework", - ":framework_for_generated_wrappers", + "//tensorflow/c:c_api", + "//tensorflow/c:c_api_internal", + "//tensorflow/c:tf_status_helper", + "//tensorflow/cc:prefetch_runner", ], + alwayslink = 1, +) + +py_library( + name = "prefetch_runner_hook", + srcs = ["ops/prefetch_runner_hook.py"], + srcs_version = "PY2AND3", ) py_library( @@ -4670,8 +4676,8 @@ py_library( ":util", ":variable_scope", ":variables", - ":prefetch", - ":prefetch_runner", + ":prefetch", + ":prefetch_runner_hook", "//tensorflow/core:protos_all_py", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/distribute:distribute_coordinator_context", @@ -5228,6 +5234,7 @@ tf_py_wrap_cc( "lib/io/file_io.i", "lib/io/py_record_reader.i", "lib/io/py_record_writer.i", + "ops/prefetch_runner.i", "platform/base.i", "platform/stacktrace_handler.i", "pywrap_tfe.i", @@ -5257,6 +5264,7 @@ tf_py_wrap_cc( ":kernel_registry", ":numpy_lib", ":safe_ptr", + ":prefetch_runner", ":py_exception_registry", ":py_func_lib", ":py_record_reader_lib", @@ -6293,11 +6301,10 @@ tf_py_test( srcs = ["training/async_embedding_stage_test.py"], additional_deps = [ ":training", - ":prefetch", - ":prefetch_runner", - ":variables", - ":math_ops", - "framework", + ":prefetch", + ":variables", + ":math_ops", + "framework", ], ) @@ -6647,7 +6654,6 @@ py_test( ":embedding_ops", ":prefetch", #":tensor_buffer_ops_gen", - ":prefetch_runner", ":state_ops", "//tensorflow/contrib/layers:layers_py", "//third_party/py/numpy", diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py index c4a39ecc521..fafc5a67df3 100644 --- a/tensorflow/python/client/session.py +++ b/tensorflow/python/client/session.py @@ -61,6 +61,11 @@ def sess_str(self): """The TensorFlow process to which this session will connect.""" raise NotImplementedError('sess_str') + @property + def c_session(self): + """The underlying Session""" + raise NotImplementedError('c_session') + def run(self, fetches, feed_dict=None, options=None, run_metadata=None): """Runs operations in the session. See `BaseSession.run()` for details.""" raise NotImplementedError('run') @@ -788,6 +793,10 @@ def graph_def(self): def sess_str(self): return self._target + @property + def c_session(self): + return self._session + def as_default(self): """Returns a context manager that makes this object the default session. 
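To see how these pieces fit together, here is a minimal sketch of the registration path that `tf.staged` performs internally. The op names are hypothetical, and in normal use `tf.staged` fills the proto for you via `fill_prefetch_runner_options` (below):

```python
from tensorflow.core.protobuf import config_pb2
from tensorflow.python import pywrap_tensorflow as prefetch_runner
from tensorflow.python.framework import errors, ops

options = config_pb2.PrefetchRunnerOptions()
# One fetch op per prefetch thread; these op names are hypothetical.
options.fetch_ops.extend(['buffer/put'] * 2)
options.cancel_op = 'buffer/cancel'
options.resume_op = 'buffer/resume'
options.close_op = 'buffer/close'
# OUT_OF_RANGE marks a normal end of input rather than a failure.
options.closed_exceptions.append(errors.OUT_OF_RANGE)

# Runners are registered per graph key and started later by the session hook.
graph_key = ops.get_default_graph()._graph_key
prefetch_runner.TF_RegisterPrefetchRunner(graph_key, 'demo_prefetch_runner',
                                          options)
```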
diff --git a/tensorflow/python/ops/prefetch.py b/tensorflow/python/ops/prefetch.py index fd6c697c329..5ddc29db31d 100644 --- a/tensorflow/python/ops/prefetch.py +++ b/tensorflow/python/ops/prefetch.py @@ -20,17 +20,19 @@ import collections -from tensorflow.python.framework import sparse_tensor +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python import pywrap_tensorflow as prefetch_runner +from tensorflow.python.client.session import _REGISTERED_EXPANSIONS +from tensorflow.python.framework import errors from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import gen_tensor_buffer_ops +from tensorflow.python.ops.prefetch_runner_hook import PrefetchRunnerHook from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export - -from tensorflow.python.ops import gen_tensor_buffer_ops -from tensorflow.python.ops.prefetch_runner import PrefetchRunner - ops.NotDifferentiable('TensorBufferPut') ops.NotDifferentiable('TensorBufferTake') ops.NotDifferentiable('TensorBufferCancel') @@ -38,30 +40,64 @@ PREFETCH = "prefetch" @tf_export(v1=["make_prefetch_hook"]) -def make_prefetch_hook(daemon=True, start=True): - """Create PrefetchRunner.Hook for prefetching. - - Args: - daemon: (Optional.) Whether the threads should be marked as `daemons`, - meaning they don't block program exit. - start: (Optional.) If `False` threads would not be started. +def make_prefetch_hook(): + """Create PrefetchRunnerHook for prefetching. Returns: - A PrefetchRunner.Hook for prefetching. + A PrefetchRunnerHook for prefetching. """ - return PrefetchRunner.Hook(PREFETCH, daemon=daemon, start=start) + return PrefetchRunnerHook() + +def fill_prefetch_runner_options(options, + fetch_tensors, + cancel_fetching, + resume_fetching, + close_fetching, + feed_dict={}, + closed_exception_types=(errors.OUT_OF_RANGE,), + ignored_exception_types=(), + use_stage_subgraph_thread_pool=False, + stage_subgraph_thread_pool_id=0): + def _feed_fn(feed, feed_val): + for tensor_type, _, feed_fn, _ in _REGISTERED_EXPANSIONS: + if isinstance(feed, tensor_type): + return feed_fn(feed, feed_val) + raise TypeError('Feed argument %r has invalid type %r' % (feed, type(feed))) + + options.run_options.use_stage_subgraph_thread_pool = \ + use_stage_subgraph_thread_pool + options.run_options.stage_subgraph_thread_pool_id = \ + stage_subgraph_thread_pool_id + options.fetch_ops.extend([x.name for x in fetch_tensors]) + options.cancel_op = cancel_fetching.name + options.resume_op = resume_fetching.name + options.close_op = close_fetching.name + + feed_dict = nest.flatten_dict_items(feed_dict) + for feed, feed_val in feed_dict.items(): + for subfeed, subfeed_val in _feed_fn(feed, feed_val): + if not isinstance(subfeed_val, ops.Tensor): + raise TypeError('The value of a feed must be a tf.Tensor object. 
' 'but ' + str(feed) + ' was fed with ' + str(type(feed_val)))
+      options.named_feed_input_tensors[subfeed.name] = subfeed_val.name
+
+  for err_code in closed_exception_types:
+    options.closed_exceptions.append(err_code)
+
+  for err_code in ignored_exception_types:
+    options.ignored_exceptions.append(err_code)
 
 @tf_export(v1=["staged"])
 def staged(
     features,
-    feed_list=None,
-    feed_generator=None,
+    feed_dict={},
     capacity=1,
     num_threads=1,
     num_clients=1,
     timeout_millis=300000,
-    closed_exception_types=None,
-    ignored_exception_types=None,
+    closed_exception_types=(errors.OUT_OF_RANGE,),
+    ignored_exception_types=(),
     use_stage_subgraph_thread_pool=False,
     stage_subgraph_thread_pool_id = 0,
     stage_subgraph_stream_id = 0,
@@ -70,10 +106,13 @@ def staged(
 
   Args:
     features: Nest structure of tensors to prefetch.
-    feed_list: (Optional.) A list of `feed_dict` keys. See
-      @{tf.Session.run} for details of the allowable feed key types.
-    feed_generator: (Optional.) A generator function lambda sess: iterator
-      that yields a list of `feed_dict` values.
+    feed_dict: (Optional.) A dictionary that maps graph elements to values.
+      Each key in `feed_dict` can be one of the following types:
+      * `tf.Tensor` or `tf.compat.v1.placeholder`: the value should be a tensor.
+      * `tf.SparseTensor`: the value should be a `tf.compat.v1.SparseTensorValue`.
+      * a nested tuple of `Tensor`s or `SparseTensor`s: the value should be a
+        nested tuple with the same structure that maps to their corresponding
+        values as above.
     capacity: (Optional.) Max number of samples to keep in the buffer.
     num_threads: (Optional.) Number of threads for prefetching. 1 by
      default.
@@ -83,7 +122,7 @@
      default.
     closed_exception_types: (Optional.) Exception types indicating that the
       prefetching is normally finished. Defaults to
-      `(tf.errors.OutOfRangeError, StopIteration)`.
+      `(errors.OUT_OF_RANGE,)`.
     ignored_exception_types: (Optional.) Exception types indicating that the
       prefetching can continue. Defaults to `()`.
     use_stage_subgraph_thread_pool: (Optional.)
Use stage subgraph thread pool @@ -183,30 +222,30 @@ def staged( next_tensor_or_nones.popleft()) prefetched = nest.pack_sequence_as( features, next_tensor_or_sparse_tensor_or_nones) - runner = PrefetchRunner( - fetch_ops=[fetch_tensors] * num_threads, - cancel_op=cancel_fetching, - resume_op=resume_fetching, - close_op=close_fetching, - feed_list=feed_list, - feed_generator=feed_generator, - closed_exception_types=closed_exception_types, - ignored_exception_types=ignored_exception_types, - use_stage_subgraph_thread_pool=use_stage_subgraph_thread_pool, - stage_subgraph_thread_pool_id=stage_subgraph_thread_pool_id) - ops.add_to_collection(PREFETCH, runner) + + runner_options = config_pb2.PrefetchRunnerOptions() + fill_prefetch_runner_options(runner_options, [fetch_tensors]*num_threads, + cancel_fetching, resume_fetching, + close_fetching, feed_dict, + closed_exception_types, ignored_exception_types, + use_stage_subgraph_thread_pool, + stage_subgraph_thread_pool_id) + + graph_key = ops.get_default_graph()._graph_key + prefetch_runner.TF_RegisterPrefetchRunner(graph_key, name+"_prefetch_runner", + runner_options) + return prefetched @tf_export(v1=["prefetch_join"]) def prefetch_join( thread_to_features, - feed_list=None, - feed_generator=None, + feed_dict={}, capacity=1, num_clients=1, timeout_millis=300000, - closed_exception_types=None, - ignored_exception_types=None, + closed_exception_types=(errors.OUT_OF_RANGE,), + ignored_exception_types=(), name=None): """Prefetch samples from thread_to_features list. @@ -215,10 +254,13 @@ def prefetch_join( Args: thread_to_features: List of nest structure of tensors for each thread. - feed_list: (Optional.) A list of `feed_dict` keys. See - @{tf.Session.run} for details of the allowable feed key types. - feed_generator: (Optional.) A generator function lambda sess: iterator - that yields a list of `feed_dict` values. + feed_dict: (Optional.) A dictionary that maps graph elements to values. + Each key in `feed_dict` can be one of the following types: + * `tf.Tensor` or `tf.compat.v1.placeholder`: the value should be a tensor. + * `tf.SparseTensor`: the value should be a `tf.compat.v1.SparseTensorValue`. + * a nested tuple of `Tensor`s or `SparseTensor`s, the value should be a + nested tuple with the same structure that maps to their corresponding + values as above. capacity: (Optional.) Max number of samples to keep in the buffer. num_clients: (Optional.) Number of clients of prefetched sample. 1 by default. 
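The per-thread variant follows the same pattern; in this sketch (tensor values invented for illustration) each entry of `thread_to_features` is served by its own prefetch thread:

```python
import tensorflow as tf

# One feature structure per prefetch thread; values are illustrative only.
thread_to_features = [
    {'ids': tf.constant([1, 2, 3], dtype=tf.int64)},
    {'ids': tf.constant([4, 5, 6], dtype=tf.int64)},
]
batch = tf.prefetch_join(thread_to_features, capacity=4)

with tf.train.MonitoredTrainingSession(
    hooks=[tf.make_prefetch_hook()]) as sess:
  print(sess.run(batch['ids']))
```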
@@ -327,14 +369,18 @@ def prefetch_join( prefetched = nest.pack_sequence_as( thread_to_features[0], next_tensor_or_sparse_tensor_or_nones) - runner = PrefetchRunner( - fetch_ops=thread_to_fetch_tensors, - cancel_op=cancel_fetching, - resume_op=resume_fetching, - close_op=close_fetching, - feed_list=feed_list, - feed_generator=feed_generator, - closed_exception_types=closed_exception_types, - ignored_exception_types=ignored_exception_types) - ops.add_to_collection(PREFETCH, runner) + runner_options = config_pb2.PrefetchRunnerOptions() + fill_prefetch_runner_options(runner_options, + thread_to_fetch_tensors, + cancel_fetching, + resume_fetching, + close_fetching, + feed_dict, + closed_exception_types, + ignored_exception_types, + False, 0) + graph_key = ops.get_default_graph()._graph_key + prefetch_runner.TF_RegisterPrefetchRunner(graph_key, name+"_prefetch_runner", + runner_options) + return prefetched diff --git a/tensorflow/python/ops/prefetch_runner.cc b/tensorflow/python/ops/prefetch_runner.cc new file mode 100644 index 00000000000..4d2e2b9b984 --- /dev/null +++ b/tensorflow/python/ops/prefetch_runner.cc @@ -0,0 +1,50 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/training/prefetch_runner.h" +#include "tensorflow/python/ops/prefetch_runner.h" + +namespace tensorflow { + +void TF_RegisterPrefetchRunner(const char* graph_key, const char* runner_name, + const void* proto, size_t proto_len, + TF_Status* status) { + tensorflow::PrefetchRunnerOptions options; + if (!options.ParseFromArray(proto, proto_len)) { + status->status = + errors::InvalidArgument("Unparseable PrefetchRunnerOptions"); + return; + } + + auto prefetch_runner_mgr = tensorflow::PrefetchRunnerMgr::singleton(); + status->status = prefetch_runner_mgr->RegisterPrefetchRunner( + graph_key, runner_name, options); +} + +void TF_StartPrefetchRunners(const char* graph_key, TF_Session* session, + TF_Status* status) { + auto prefetch_runner_mgr = tensorflow::PrefetchRunnerMgr::singleton(); + status->status = + prefetch_runner_mgr->StartRunners(graph_key, session->session); +} + +void TF_StopPrefetchRunners(const char* graph_key, TF_Session* session, + TF_Status* status) { + auto prefetch_runner_mgr = tensorflow::PrefetchRunnerMgr::singleton(); + status->status = + prefetch_runner_mgr->StopRunners(graph_key, session->session); +} + +} // end of namespace tensorflow diff --git a/tensorflow/python/ops/prefetch_runner.h b/tensorflow/python/ops/prefetch_runner.h new file mode 100644 index 00000000000..1ae82db5107 --- /dev/null +++ b/tensorflow/python/ops/prefetch_runner.h @@ -0,0 +1,41 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_PYTHON_OPS_PREFETCH_RUNNER_H_ +#define TENSORFLOW_PYTHON_OPS_PREFETCH_RUNNER_H_ + +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/tf_status_internal.h" +#include "tensorflow/c/tf_status.h" + +namespace tensorflow { + +// Register a PrefetchRunner to PrefetchRunnerMgr. +void TF_RegisterPrefetchRunner(const char* graph_key, + const char* runner_name, + const void* proto, size_t proto_len, + TF_Status* status); + +// Start PrefetchRunners managed by PrefetchRunnerMgr. +void TF_StartPrefetchRunners(const char* graph_key, TF_Session* session, + TF_Status* status); + +// Stop PrefetchRunners managed by PrefetchRunnerMgr. +void TF_StopPrefetchRunners(const char* graph_key, TF_Session* session, + TF_Status* status); + +} // end of namespace tensorflow + +#endif // End of TENSORFLOW_PYTHON_OPS_PREFETCH_RUNNER_H_ diff --git a/tensorflow/python/ops/prefetch_runner.i b/tensorflow/python/ops/prefetch_runner.i new file mode 100644 index 00000000000..870d56ed616 --- /dev/null +++ b/tensorflow/python/ops/prefetch_runner.i @@ -0,0 +1,30 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +%include "tensorflow/python/platform/base.i" + +%{ +#include "tensorflow/python/ops/prefetch_runner.h" +%} + +%rename("_TF_RegisterPrefetchRunner") TF_RegisterPrefetchRunner; + +%include "tensorflow/python/ops/prefetch_runner.h" + +%insert("python") %{ + def TF_RegisterPrefetchRunner(graph_key, runner_name, runner_options): + opt_str = runner_options.SerializeToString() + _TF_RegisterPrefetchRunner(graph_key, runner_name, opt_str) +%} diff --git a/tensorflow/python/ops/prefetch_runner.py b/tensorflow/python/ops/prefetch_runner.py deleted file mode 100644 index e5653725c89..00000000000 --- a/tensorflow/python/ops/prefetch_runner.py +++ /dev/null @@ -1,355 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Prefetch runner for prefetching ops. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import itertools -import threading -import weakref - -from six.moves import xrange - -from tensorflow.python.client import session as session_lib -from tensorflow.python.eager import context -from tensorflow.python.framework import errors -from tensorflow.python.framework import ops -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.training import session_run_hook -from tensorflow.core.protobuf import config_pb2 - -class PrefetchRunner(object): # pylint: disable=useless-object-inheritance - """Prefetch tensors by repeating running given ops. - - The `PrefetchRunner`, combined with the `Coordinator` provides a way to - compute tensors asynchronously using multiple threads. - """ - - class Hook(session_run_hook.SessionRunHook): - """SessionRunHook that starts prefetch runners after session creation.""" - def __init__(self, collection, daemon=True, start=True): - """Build PrefetchRunner.Hook. - - Args: - collection: Name of the runner collection. - daemon: (Optional.) Whether the threads should be marked as `daemons`, - meaning they don't block program exit. - start: (Optional.) If `False` threads would not be started. - """ - super(PrefetchRunner.Hook, self).__init__() - self._collection = collection - self._daemon = daemon - self._start = start - - def after_create_session(self, session, coord): - self.create_threads(sess=session, coord=coord) - - def create_threads(self, sess=None, coord=None): - """Create threads for runners in specific collection. - - It starts threads for all runners collected in the graph. It returns - the list of all threads. - - Args: - sess: `Session` used to run the stage ops. Defaults to the - default session. - coord: (Optional.) `Coordinator` for coordinating the started threads. - - Raises: - ValueError: if `sess` is None and there isn't any default session. - TypeError: if `sess` is not a `tf.Session` object. - - Returns: - A list of threads. - """ - if sess is None: - sess = ops.get_default_session() - if not sess: - raise ValueError("Cannot start threads: No default session is " - "registered. Use `with sess.as_default()` or use " - "explicit session in create_threads") - - if not isinstance(sess, session_lib.SessionInterface): - if sess.__class__.__name__ in [ - "MonitoredSession", "SingularMonitoredSession"]: - return [] - raise TypeError("sess must be a `tf.Session` object. " - "Given class: {}".format(sess.__class__)) - - with sess.graph.as_default(): - threads = [] - for runner in ops.get_collection(self._collection): - threads.extend(runner.create_threads( - sess, coord=coord, daemon=self._daemon, start=self._start)) - return threads - - def __init__( - self, - fetch_ops, - cancel_op, - resume_op, - close_op, - feed_list=None, - feed_generator=None, - closed_exception_types=None, - ignored_exception_types=None, - use_stage_subgraph_thread_pool=False, - stage_subgraph_thread_pool_id=0): - """Create a PrefetchRunner. - - When you later call the `create_threads()` method, the `PrefetchRunner` will - create threads for `fetch_ops`. Each thread will prefetch - in parallel. - - Args: - fetch_ops: Ops that repeats by this runner for each thread. 
- cancel_op: Op that stops fetch_ops on stop of this runner. - resume_op: Op that restarts fetch_ops on start of this runner. - close_op: Op that closes the data buffer. - feed_list: (Optional.) A list of `feed_dict` keys. See - @{tf.Session.run} for details of the allowable feed key types. - feed_generator: (Optional.) A generator function lambda sess: iterator - that yields a list of `feed_dict` values. - closed_exception_types: (Optional.) Exception types indicating that the - prefetching is normally finished. Defaults to - `(tf.errors.OutOfRangeError, StopIteration)`. - ignored_exception_types: (Optional.) Exception types indicating that the - prefetching can continue. Defaults to `()`. - use_stage_subgraph_thread_pool: (Optional.) Use stage subgraph thread pool - to run stage graph or not. - stage_subgraph_thread_pool_id: (Optional.) Specifies the stage subgraph - thread pool to use when enable use_stage_subgraph_thread_pool. 0 by default. - """ - try: - executing_eagerly = context.executing_eagerly() - except: # pylint: disable=bare-except - executing_eagerly = context.in_eager_mode() - else: - executing_eagerly = False - if not executing_eagerly: - self._name = ops.get_default_graph().unique_name(self.__class__.__name__) - else: - self._name = context.context().scope_name - self._fetch_ops = fetch_ops - self._cancel_op = cancel_op - self._resume_op = resume_op - self._close_op = close_op - if (feed_list is None) != (feed_generator is None): - raise ValueError("feed_list and feed_generator must both exits") - self._feed_list = list(feed_list) if feed_list else None - self._feed_generator = feed_generator - if not closed_exception_types: - self._closed_exception_types = (errors.OutOfRangeError, StopIteration) - else: - self._closed_exception_types = tuple(closed_exception_types) - if not ignored_exception_types: - self._ignored_exception_types = () - else: - self._ignored_exception_types = tuple(ignored_exception_types) - self._lock = threading.Lock() - self._runs_per_session = weakref.WeakKeyDictionary() - self._exceptions_raised = [] - self._use_stage_subgraph_thread_pool = use_stage_subgraph_thread_pool - self._stage_subgraph_thread_pool_id = stage_subgraph_thread_pool_id - - @property - def name(self): - """Name of this runner.""" - return self._name - - @property - def feed_list(self): - """List of feeds used in prefetch ops.""" - return self._feed_list - - @property - def num_threads(self): - """The number of running threads.""" - return len(self._fetch_ops) - - @property - def closed_exception_types(self): - """Exception types indicating that prefetching is normally finished.""" - return self._closed_exception_types - - @property - def exceptions_raised(self): - """Exceptions raised but not handled by the `PrefetchRunner` threads. - - Exceptions raised in `PrefetchRunner` threads are handled in one of two ways - depending on whether or not a `Coordinator` was passed to - `create_threads()`: - - * With a `Coordinator`, exceptions are reported to the coordinator and - forgotten by the `PrefetchRunner`. - * Without a `Coordinator`, exceptions are captured by the `PrefetchRunner` - and made available in this `exceptions_raised` property. - - Returns: - A list of Python `Exception` objects. The list is empty if no exception - was captured. (No exceptions are captured when using a Coordinator.) - """ - return self._exceptions_raised - - # pylint: disable=broad-except - def run(self, sess, coord, index): - """Run prefetching in thread. - - Args: - sess: A `Session`. 
- coord: A `Coordinator` object for reporting errors and checking stop - conditions. - index: Index of current thread. - """ - decremented = False - try: - sess.run(self._resume_op) - run_fetch = sess.make_callable( - self._fetch_ops[index], self._feed_list, True) - close = sess.make_callable(self._close_op) - feed_list = self._feed_list if self._feed_list else [] - if self._feed_generator: - feed_iterator = self._feed_generator(sess) - else: - feed_iterator = itertools.repeat([]) - run_options = config_pb2.RunOptions() - run_options.use_stage_subgraph_thread_pool = self._use_stage_subgraph_thread_pool - run_options.stage_subgraph_thread_pool_id = self._stage_subgraph_thread_pool_id - while True: - try: - # Use `next` instead of `for .. in` to reraise exception in generator. - feed = next(feed_iterator) - if coord and coord.should_stop(): - break - if not isinstance(feed, (list, tuple)): - raise ValueError( - 'feed_generator must generate a tuple, not {} ({})'.format( - feed, type(feed).__name__)) - if len(feed) != len(feed_list): - raise ValueError( - 'feed_generator must generate a tuple of {} items, not {} ' - '({} items)'.format( - len(feed_list), feed, len(feed))) - run_fetch(*feed, options=run_options) - except errors.CancelledError: - logging.info("Prefetching was cancelled.") - return - except self._closed_exception_types as e: # pylint: disable=catching-non-exception - logging.info("Prefetching was closed.") - with self._lock: - self._runs_per_session[sess] -= 1 - decremented = True - if self._runs_per_session[sess] == 0: - try: - close() - except Exception: - pass - return - except self._ignored_exception_types as e: # pylint: disable=catching-non-exception - logging.warning( - "Corrupted inputs were ignored in prefetching:\n\n%s", e) - continue - except Exception as e: - if coord: - coord.request_stop(e) - if not isinstance(e, errors.CancelledError) and \ - not isinstance(e, self._closed_exception_types) and \ - not isinstance(e, self._ignored_exception_types): - logging.error( - "Prefetching was cancelled unexpectedly:\n\n%s", e) - raise - else: - with self._lock: - self._exceptions_raised.append(e) - raise - finally: - if not decremented: - with self._lock: - self._runs_per_session[sess] -= 1 - - def cancel_on_stop(self, sess, coord): - """Clean up resources on stop. - - Args: - sess: A `Session`. - coord: A `Coordinator` object for reporting errors and checking stop - conditions. - """ - coord.wait_for_stop() - try: - cancel = sess.make_callable(self._cancel_op) - cancel() - except Exception: - pass - # pylint: enable=broad-except - - def create_threads(self, sess, coord=None, daemon=False, start=False): - """Create threads to prefetch for the given session. - - This method requires a session in which the graph was launched. It creates - a list of threads, optionally starting them. - - The `coord` argument is an optional coordinator that the threads will use - to terminate together and report exceptions. If a coordinator is given, - this method starts an additional thread to cancel when the coordinator - requests a stop. - - If previously created threads for the given session are still running, no - new threads will be created. - - Args: - sess: A `Session`. - coord: (Optional.) `Coordinator` object for reporting errors and checking - stop conditions. - daemon: (Optional.) Boolean. If `True` make the threads daemon threads. - start: (Optional.) Boolean. If `True` starts the threads. If `False` the - caller must call the `start()` method of the returned threads. 
- - Returns: - A list of threads. - """ - with self._lock: - try: - if self._runs_per_session[sess] > 0: - # Already started: no new threads to return. - return [] - except KeyError: - pass - self._runs_per_session[sess] = self.num_threads - self._exceptions_raised = [] - - ret_threads = [] - for i in xrange(self.num_threads): - ret_threads.append(threading.Thread( - target=self.run, - args=(sess, coord, i), - name="PrefetchThread-%s-%s" % (self.name, i))) - if coord: - name = "CancelOnStopThread-%s" % self.name - ret_threads.append(threading.Thread( - target=self.cancel_on_stop, - args=(sess, coord), - name=name)) - for t in ret_threads: - if coord: - coord.register_thread(t) - if daemon: - t.daemon = True - if start: - t.start() - return ret_threads diff --git a/tensorflow/python/ops/prefetch_runner_hook.py b/tensorflow/python/ops/prefetch_runner_hook.py new file mode 100644 index 00000000000..dddb0347ad9 --- /dev/null +++ b/tensorflow/python/ops/prefetch_runner_hook.py @@ -0,0 +1,50 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Prefetch runner for prefetching ops. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import threading + +from tensorflow.python.framework import ops +from tensorflow.python.training import session_run_hook +from tensorflow.python import pywrap_tensorflow as prefetch_runner + +class PrefetchRunnerHook(session_run_hook.SessionRunHook): + """ + PrefetchRunnerHook that starts prefetch runners after session creation and + stops prefetch runners before session close. 
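+  Stopping is driven by a daemon thread that waits on the coordinator and
+  then calls TF_StopPrefetchRunners on the session's underlying C session.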
+ """ + def __init__(self): + """Build PrefetchRunnerHook.""" + super(PrefetchRunnerHook, self).__init__() + + def cancel_on_stop(self, session, coord, graph_key): + coord.wait_for_stop() + prefetch_runner.TF_StopPrefetchRunners(graph_key, session.c_session) + + def after_create_session(self, session, coord): + session._extend_graph() + graph_key = ops.get_default_graph()._graph_key + prefetch_runner.TF_StartPrefetchRunners(graph_key, session.c_session) + self._stop_thread = threading.Thread(target=self.cancel_on_stop, + args=(session, coord, graph_key), + name="prefetch_runner_cancel_on_stop") + coord.register_thread(self._stop_thread) + self._stop_thread.daemon=True + self._stop_thread.start() diff --git a/tensorflow/python/ops/prefetch_test.py b/tensorflow/python/ops/prefetch_test.py index fa68b4291c3..a1978c77e9d 100644 --- a/tensorflow/python/ops/prefetch_test.py +++ b/tensorflow/python/ops/prefetch_test.py @@ -49,7 +49,7 @@ def test_simple(self): with self.test_session(use_gpu=True, graph=graph) as sess: coord = coordinator.Coordinator() - prefetch.make_prefetch_hook().create_threads(sess, coord) + prefetch.make_prefetch_hook().after_create_session(sess, coord) for _ in xrange(capacity * 3): self.assertAllClose(value, sess.run(y), rtol=1e-6) coord.request_stop() @@ -67,7 +67,7 @@ def test_string(self): with self.test_session(use_gpu=True, graph=graph) as sess: coord = coordinator.Coordinator() - prefetch.make_prefetch_hook().create_threads(sess, coord) + prefetch.make_prefetch_hook().after_create_session(sess, coord) for _ in xrange(capacity * 3): self.assertEqual(value, sess.run(y).decode()) coord.request_stop() @@ -93,7 +93,7 @@ def test_sparse(self): indices_data = sess.run(indices) dense_shape_data = sess.run(dense_shape) coord = coordinator.Coordinator() - prefetch.make_prefetch_hook().create_threads(sess, coord) + prefetch.make_prefetch_hook().after_create_session(sess, coord) for _ in xrange(3): prefetched = sess.run(y) self.assertAllClose(values_data, prefetched.values, rtol=1e-6) @@ -125,7 +125,7 @@ def test_list(self): dense_shape_data = sess.run(dense_shape) x2_data = sess.run(x2) coord = coordinator.Coordinator() - prefetch.make_prefetch_hook().create_threads(sess, coord) + prefetch.make_prefetch_hook().after_create_session(sess, coord) for _ in xrange(3): prefetched = sess.run(y) self.assertAllClose(values_data, prefetched[0].values, rtol=1e-6) @@ -160,7 +160,7 @@ def test_dict(self): dense_shape_data = sess.run(dense_shape) x2_data = sess.run(x2) coord = coordinator.Coordinator() - prefetch.make_prefetch_hook().create_threads(sess, coord) + prefetch.make_prefetch_hook().after_create_session(sess, coord) for _ in xrange(3): prefetched = sess.run(y) self.assertAllClose(values_data, prefetched['bar'].values, rtol=1e-6) @@ -171,27 +171,22 @@ def test_dict(self): coord.request_stop() def test_dict_from_feeds(self): - def my_generator_fn3(_): - for i in xrange(3): - yield [i] - with ops.Graph().as_default() as graph: with ops.device('/cpu:0'): values = array_ops.constant([1, 1, 1], dtype=dtypes.int64) indices = array_ops.constant( - ([0, 0], [0, 1], [0, 2]), dtype=dtypes.int64) + ([0, 0], [0, 1], [0, 2]), dtype=dtypes.int64) dense_shape = array_ops.constant([3, 3], dtype=dtypes.int64) - x1 = sparse_tensor.SparseTensor(values=values, indices=indices, dense_shape=dense_shape) x2 = array_ops.constant(42.0, dtype=dtypes.float32, shape=[]) x3 = array_ops.placeholder(dtypes.int32, shape=[]) x = {'foo': x2, 'bar': x1, 'foobar': x3} + feed_tensor = array_ops.constant(2, 
dtype=dtypes.int32) with ops.device(test.gpu_device_name()): y = prefetch.staged( - x, feed_list=[x3], feed_generator=my_generator_fn3, - timeout_millis=1000) + x, feed_dict={x3: feed_tensor}, timeout_millis=1000) graph.finalize() @@ -201,7 +196,7 @@ def my_generator_fn3(_): dense_shape_data = sess.run(dense_shape) x2_data = sess.run(x2) coord = coordinator.Coordinator() - prefetch.make_prefetch_hook().create_threads(sess, coord) + prefetch.make_prefetch_hook().after_create_session(sess, coord) for i in xrange(3): prefetched = sess.run(y) self.assertAllClose(values_data, prefetched['bar'].values, rtol=1e-6) @@ -209,78 +204,7 @@ def my_generator_fn3(_): self.assertAllClose( dense_shape_data, prefetched['bar'].dense_shape, rtol=1e-6) self.assertAllClose(x2_data, prefetched['foo'], rtol=1e-6) - self.assertAllClose(i, prefetched['foobar'], rtol=1e-6) - coord.request_stop() - - def test_dict_from_feeds_with_session_run(self): - with ops.Graph().as_default() as graph: - ph = array_ops.placeholder(dtypes.int32, shape=[]) - count_op = array_ops.constant(100) + ph - def my_generator_fn100(sess): - for i in xrange(100): - yield [sess.run(count_op, feed_dict={ph: i})] - - with ops.device('/cpu:0'): - x3 = array_ops.placeholder(dtypes.int32, shape=[]) - with ops.device(test.gpu_device_name()): - yy = prefetch.staged( - x3, - feed_list=[x3], - feed_generator=my_generator_fn100, - capacity=4, - num_threads=4, - timeout_millis=1000) - - graph.finalize() - - with self.test_session(use_gpu=True, graph=graph) as sess: - coord = coordinator.Coordinator() - prefetch.make_prefetch_hook().create_threads(sess, coord) - for i in xrange(99): - print(i, ':', sess.run(yy), end=', ') - print('done.') - coord.request_stop() - - def test_corrupted_inputs(self): - def csv_generator(_): - for i in xrange(10): - if i < 9: - yield [u'abc,def'] - else: - yield [u'corrupted"record,xyz'] - - with ops.Graph().as_default() as graph: - with ops.device('/cpu:0'): - x1 = array_ops.placeholder(dtypes.string, shape=[]) - x2 = array_ops.constant(42.0, dtype=dtypes.float32, shape=[]) - decoded_x1 = parsing_ops.decode_csv(x1, record_defaults=[[''], ['']], use_quote_delim=False) - x = {'x1': decoded_x1, 'x2': x2} - y = prefetch.staged( - x, - feed_list=[x1], - feed_generator=csv_generator, - ignored_exception_types=(errors.InvalidArgumentError,), - timeout_millis=1000) - - graph.finalize() - - with self.test_session(use_gpu=True, graph=graph) as sess: - x2_data = sess.run(x2) - coord = coordinator.Coordinator() - prefetch.make_prefetch_hook().create_threads(sess, coord) - for _ in xrange(9): - try: - prefetched = sess.run(y) - except errors.OutOfRangeError: - break - self.assertEqual( - [u'abc', u'def'], - [s.decode() for s in prefetched['x1']]) - self.assertAllClose(x2_data, prefetched['x2'], rtol=1e-6) - try: - prefetched = sess.run(y) - except errors.OutOfRangeError: - pass + self.assertAllClose(2, prefetched['foobar'], rtol=1e-6) coord.request_stop() def test_preemption_retry(self): diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i index 3aef568e4e0..ad322de1a1f 100644 --- a/tensorflow/python/tensorflow.i +++ b/tensorflow/python/tensorflow.i @@ -59,3 +59,5 @@ limitations under the License. 
%include "tensorflow/python/util/traceme.i" %include "tensorflow/python/util/scoped_annotation.i" + +%include "tensorflow/python/ops/prefetch_runner.i" \ No newline at end of file diff --git a/tensorflow/python/training/async_embedding_stage.py b/tensorflow/python/training/async_embedding_stage.py index 89871ff0935..32433387c1c 100644 --- a/tensorflow/python/training/async_embedding_stage.py +++ b/tensorflow/python/training/async_embedding_stage.py @@ -28,8 +28,8 @@ @tf_export(v1=["async_embedding_mark_node"]) def async_embedding_mark_node(embedding_tensor): """ mark embedding lookup output - Args: - embedding_tensor: output tensor of embedding lookup function, + Args: + embedding_tensor: output tensor of embedding lookup function, usually it is consumed by hidden layers in the neural network. """ ops.add_to_collections(ops.GraphKeys.ASYNC_EMBEDDING_OUTPUT_TENSORS, embedding_tensor) @@ -77,7 +77,7 @@ def stage(self, graph): logging.info('async embedding stage begin') logging.info('async embedding thread num: ' + str(self._threads_num)) logging.info('async embedding capacity: ' + str(self._capacity)) - + self._save_graph(graph, "graph_before_async_embedding") self._find_start_node(graph) self._mark_nodes_status() @@ -130,7 +130,7 @@ def _find_start_node(self, graph): raise Exception("No boundary ops found") # travel all inputs node of boundary, remove the node if it is in - # candidate_boundary_ops, and collect all no input node + # candidate_boundary_ops, and collect all no input node is_visited = set() control_flow_nodes = set() no_data_input_nodes =set() @@ -144,7 +144,7 @@ def _find_start_node(self, graph): # check io staged get node is existed or not if (visit_node.type == "TensorBufferTake"): is_find_io_staged_get = True - + # if meet node in candidate_boundary_ops and is not boundary_node # remove it from candidate_boundary_ops if visit_node != boundary_node and \ @@ -178,9 +178,9 @@ def _find_start_node(self, graph): candidate_boundary_ops.add(node) elif self._is_variable_init_op(node): candidate_boundary_ops.add(node) - + self._start_nodes = candidate_boundary_ops - + def _mark_nodes_status(self): active_stack = list(self._start_nodes) @@ -259,7 +259,7 @@ def _perform_stage(self): num_threads=self._threads_num, capacity=self._capacity, timeout_millis=1000*60*60*3, - closed_exception_types= (errors.OutOfRangeError,), + closed_exception_types= (errors.OUT_OF_RANGE,), use_stage_subgraph_thread_pool = self._use_stage_subgraph_thread_pool, stage_subgraph_thread_pool_id = self._stage_subgraph_thread_pool_id) @@ -289,7 +289,7 @@ def _perform_stage(self): break if self._stage_put_node is None: raise Exception('no stage put node is found') - + self._stage_get_node = target_output.op.inputs[0].op logging.info('async embedding stage_get_node: {}'.format(self._stage_get_node.name)) @@ -327,7 +327,7 @@ def _perform_stage(self): control_outputs.add(control_input_node) control_inputs.append(self._stage_get_node) active_node._control_inputs = control_inputs - + # switch inactive node control output from active node to stage put node control_outputs = list(control_outputs) control_outputs.extend(self._stage_put_node.control_inputs) @@ -349,7 +349,7 @@ def _find_node_cycle_dependent_on_stage(self, node, visited_set=set()): input_node.type == "Identity" and input_node.inputs[0].op != self._stage_get_node: # skip data edge from inactive variable node to active node return [node] - + for control_input in node.control_inputs: if control_input != self._stage_get_node and \ control_input not in 
self._active_nodes: @@ -381,20 +381,20 @@ def _print_ops_path(self, tag, ops): path += ")" path += " --> " self._print_one_log(tag, path) - + def _check_graph(self): visited_nodes = set() nodes = self._find_node_cycle_dependent_on_stage(self._stage_get_node, visited_nodes) if nodes: self._print_ops_path('find node cycle dependent on stage', nodes) raise Exception('check graph failed, find node cycle dependent on stage') - + def _get_op_device_str(self, op): dev_str = op.device if dev_str == None: return '' return dev_str - + def _set_op_device(self, op, dev_str): origin_device = self._get_op_device_str(op) if origin_device == '': @@ -412,7 +412,7 @@ def _set_op_device(self, op, dev_str): device += '/' + field_ device += dev_str op._set_device(device) - + def _place_io_embedding_subgraph_on_cpu(self, graph): dev_str = '/device:CPU:0' @@ -463,4 +463,3 @@ def _place_io_embedding_subgraph_on_cpu(self, graph): # 5. place stage_closed_nodes on cpu for stage_closed_node in stage_closed_nodes: self._set_op_device(stage_closed_node, dev_str) - From 26c2755166b562900b20012aed2cec550f94b8db Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 27 Jun 2023 21:05:07 -0700 Subject: [PATCH 32/91] [Embedding] Refactor SingleHBM and GroupEmbedding code. (#896) Signed-off-by: JunqiHu --- .../feature_column/sequence_feature_column.py | 1 - .../core/framework/embedding/embedding_var.h | 20 +- .../framework/embedding/gpu_hash_map_kv.h | 68 +++--- .../framework/embedding/gpu_hash_table.cu.cc | 196 ++++++++++++------ .../core/framework/embedding/gpu_hash_table.h | 9 +- .../core/framework/embedding/kv_interface.h | 7 +- .../framework/embedding/single_tier_storage.h | 12 +- tensorflow/core/framework/embedding/storage.h | 8 +- .../group_embedding_lookup_ops.cc | 83 +------- ...dding_lookup_sparse_backward_base_ops.cu.h | 56 ++--- ...edding_lookup_sparse_forward_base_ops.cu.h | 32 ++- ...embedding_lookup_sparse_forward_base_ops.h | 4 - ...oup_embedding_lookup_sparse_forward_ops.cc | 75 ++----- ..._embedding_lookup_sparse_forward_ops.cu.cc | 43 +--- .../core/kernels/kv_variable_lookup_ops.cc | 33 +-- .../core/kernels/training_ali_ops_gpu.cu.cc | 52 +++-- .../feature_column/feature_column_v2.py | 1 + 17 files changed, 324 insertions(+), 376 deletions(-) diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py index 07650a723d1..8c855e4f783 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py @@ -120,7 +120,6 @@ def sequence_input_layer( None, default_name=column._var_scope_name ): if group_name != "": - group_name_set.add(group_name) output_tensors.append(None) # placeholder group_embedding_list.append(index) embedding_columns.append(column) diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index ca5838ea37a..ae5760bfbc0 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -326,9 +326,14 @@ class EmbeddingVar : public ResourceBase { const K* keys, V* output, int64 num_of_keys) { - filter_->BatchLookup(context, keys, output, - num_of_keys, default_value_, - default_value_no_permission_); + if (IsSingleHbm()) { + storage_->BatchLookup(context.gpu_device, keys, + output, num_of_keys, 
default_value_); + } else { + filter_->BatchLookup(context, keys, output, + num_of_keys, default_value_, + default_value_no_permission_); + } } void GetOrCreateKey(const EmbeddingVarContext& context, @@ -811,10 +816,9 @@ class EmbeddingVar : public ResourceBase { } void LookupOrCreate(const K* key, V* val, V* default_v, - int32 default_v_num, bool is_use_default_value_tensor, - size_t n, const Eigen::GpuDevice& device) { + int32 default_v_num, size_t n, const Eigen::GpuDevice& device) { storage_->BatchLookupOrCreate(key, val, default_v, default_v_num, - is_use_default_value_tensor, n, device); + n, device); } void LookupOrCreateKey(const K* key, int32* item_idxs, size_t n, @@ -823,10 +827,10 @@ class EmbeddingVar : public ResourceBase { } void Lookup(const K* key, V* val, V* default_v, - int32 default_v_num, bool is_use_default_value_tensor, + int32 default_v_num, size_t n, const Eigen::GpuDevice& device) { storage_->BatchLookup(key, val, default_v, default_v_num, - is_use_default_value_tensor, n, device); + n, device); } int32 SlotNum() { diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h index 82edf045f60..56542237a3e 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h @@ -67,33 +67,37 @@ class GPUHashMapKV : public KVInterface { Status BatchLookupOrCreateKeys(const K* keys, size_t n, int32* item_idxs, const Eigen::GpuDevice& device) { - mutex_lock lock(lock_); - int remaining_size = - n + *(hash_table_->start_idx) - - hash_table_->mem_bank_num * hash_table_->initial_bank_size; - if (remaining_size > 0) { - Resize(remaining_size); + if (n > 0) { + mutex_lock lock(lock_); + int remaining_size = + n + *(hash_table_->start_idx) - + hash_table_->mem_bank_num * hash_table_->initial_bank_size; + if (remaining_size > 0) { + Resize(remaining_size); + } + functor::KvLookupInsertKey()( + keys, item_idxs, n, hash_table_, hash_table_->start_idx, + device.stream()); } - functor::KvLookupInsertKey()( - keys, item_idxs, n, hash_table_, hash_table_->start_idx, - device.stream()); return Status::OK(); } Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, - int32 default_v_num, - bool is_use_default_value_tensor, size_t n, + int32 default_v_num, size_t n, const Eigen::GpuDevice& device) { - int32* item_idxs = - TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); - BatchLookupOrCreateKeys(keys, n, item_idxs, device); - functor::KvLookupCreateEmb()( - keys, val, default_v, value_len_, item_idxs, n, config_.emb_index, - default_v_num, is_use_default_value_tensor, hash_table_->d_bank_ptrs, - hash_table_->d_existence_flag_ptrs, - (config_.block_num * (1 + config_.slot_num)), - hash_table_->initial_bank_size, device.stream()); - TypedAllocator::Deallocate(alloc_, item_idxs, n); + if (n > 0) { + int32* item_idxs = + TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); + BatchLookupOrCreateKeys(keys, n, item_idxs, device); + functor::KvLookupCreateEmb()( + keys, val, default_v, value_len_, item_idxs, n, config_.emb_index, + default_v_num, hash_table_->d_bank_ptrs, + hash_table_->d_existence_flag_ptrs, + (config_.block_num * (1 + config_.slot_num)), + hash_table_->initial_bank_size, device.stream()); + TypedAllocator::Deallocate(alloc_, item_idxs, n); + } + return Status::OK(); } @@ -256,11 +260,23 @@ class GPUHashMapKV : public KVInterface { GPUHashTable* HashTable() override { return hash_table_; } - Status BatchLookup(const K* keys, V* 
val, V* default_v, int32 default_v_num, - bool is_use_default_value_tensor, size_t n, - const Eigen::GpuDevice& device) override { - functor::KvLookupKey()( - keys, val, n, value_len_, static_hash_table_, device.stream()); + Status BatchLookup(const Eigen::GpuDevice& device, const K* keys, + V* val, size_t n, const V* default_v) override { + if (n > 0) { + if (is_inference_) { + functor::KvLookupKey, K, V>()( + keys, val, n, value_len_, config_.emb_index, + (config_.block_num * (1 + config_.slot_num)), + static_hash_table_, default_v, + config_.default_value_dim, device.stream()); + } else { + functor::KvLookupKey, K, V>()( + keys, val, n, value_len_, config_.emb_index, + (config_.block_num * (1 + config_.slot_num)), + hash_table_, default_v, + config_.default_value_dim, device.stream()); + } + } return Status::OK(); } diff --git a/tensorflow/core/framework/embedding/gpu_hash_table.cu.cc b/tensorflow/core/framework/embedding/gpu_hash_table.cu.cc index b56bd5b7210..e730471ee50 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_table.cu.cc +++ b/tensorflow/core/framework/embedding/gpu_hash_table.cu.cc @@ -37,6 +37,11 @@ namespace cg = cooperative_groups; namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; +namespace { +const size_t BLOCK_SIZE = 128; +const size_t STRIDE = 1; +const size_t TILE_SIZE = 4; +} template class gpu_hash_map_tf_allocator { public: @@ -228,16 +233,13 @@ struct KvInitStaticMap { cudaMemsetAsync(map.get_num_success(), 0, sizeof(atomicT), stream)); auto n = std::min((size_t)65535, num_to_insert); - auto const block_size = 128; - auto stride = 1; - auto const tile_size = 4; auto const grid_size = - (tile_size * n + stride * block_size - 1) / (stride * block_size); + (TILE_SIZE * n + STRIDE * BLOCK_SIZE - 1) / (STRIDE * BLOCK_SIZE); TF_CHECK_OK(GpuLaunchKernel( - kv_initialize_static_map, thrust::equal_to>, - grid_size, block_size, 0, stream, keys, n, dimension, + grid_size, BLOCK_SIZE, 0, stream, keys, n, dimension, map.get_device_mutable_view(), map.get_num_success(), cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); @@ -255,11 +257,82 @@ struct KvInitStaticMap { template , typename KeyEqual = thrust::equal_to> -__global__ void kv_lookup_key_kernel(const Key* key_first, const V* value_srcs, - V* value_first, size_t num_items, - int32 dimension, ViewT map_views, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) { +__global__ void kv_lookup_dynamic_key_kernel( + const Key* key_first, V** value_srcs, V* value_first, const V* default_v, + int32 default_v_num, size_t num_items, int32 dimension, ViewT* submap_views, + uint32_t num_submaps, int32 slot_idx, int32 slot_num, int32 bank_size, + Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) { + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel(); + + while (key_idx < num_items) { + auto key = *(key_first + key_idx); + int32 found_value = empty_value_sentinel; + + for (auto i = 0; i < num_submaps; ++i) { + auto submap_view = submap_views[i]; + auto found = submap_view.find(tile, key, hash, key_equal); + if (found != submap_view.end()) { + found_value = found->second; + break; + } + } + if (found_value == empty_value_sentinel) { + for (int id = tile.thread_rank(); id < dimension; id += tile_size) { + value_first[key_idx * dimension + id] = + default_v[key % default_v_num * dimension + id]; + } + } else { + auto bank_idx = found_value / 
bank_size; + auto offset_in_bank = found_value % bank_size; + auto slot_offset = bank_idx * slot_num + slot_idx; + for (int id = tile.thread_rank(); id < dimension; id += tile_size) { + value_first[key_idx * dimension + id] = + value_srcs[slot_offset][offset_in_bank * dimension + id]; + } + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } +} + +template +struct KvLookupKey, Key, V> { + void operator()(const Key* keys, V* vals, int32 num_items, int32 dimension, + int32 slot_idx, int32 slot_num, + GPUHashTable* hash_table, const V* default_v, + int32 default_v_num, cudaStream_t stream) { + using mutableViewT = typename cuco::dynamic_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::mutable_view_type; + using ViewT = typename cuco::dynamic_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::view_type; + + auto& map = hash_table->hash_table->map_; + + auto const grid_size = (TILE_SIZE * num_items + STRIDE * BLOCK_SIZE - 1) / + (STRIDE * BLOCK_SIZE); + TF_CHECK_OK(GpuLaunchKernel( + kv_lookup_dynamic_key_kernel, + grid_size, BLOCK_SIZE, 0, stream, keys, hash_table->d_bank_ptrs, vals, + default_v, default_v_num, num_items, dimension, + map.get_submap_views().data().get(), map.get_submaps().size(), slot_idx, + slot_num, hash_table->initial_bank_size, + cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); + } +}; + +template , + typename KeyEqual = thrust::equal_to> +__global__ void kv_lookup_static_key_kernel(const Key* key_first, + const V* value_srcs, V* value_first, + const V* default_v, int32 default_v_num, + size_t num_items, int32 dimension, + ViewT map_views, Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) { auto grid = cooperative_groups::this_grid(); auto block = cooperative_groups::this_thread_block(); auto tile = cooperative_groups::tiled_partition(block); @@ -276,8 +349,13 @@ __global__ void kv_lookup_key_kernel(const Key* key_first, const V* value_srcs, found_value = found->second; } - if (tile.thread_rank() == 0) { - for (auto id = threadIdx.x; id < dimension; id += blockDim.x) { + if (found_value == empty_value_sentinel) { + for (int id = tile.thread_rank(); id < dimension; id += tile_size) { + value_first[key_idx * dimension + id] = + default_v[key % default_v_num * dimension + id]; + } + } else { + for (int id = tile.thread_rank(); id < dimension; id += tile_size) { value_first[key_idx * dimension + id] = value_srcs[found_value + id]; } } @@ -286,24 +364,23 @@ __global__ void kv_lookup_key_kernel(const Key* key_first, const V* value_srcs, } template -struct KvLookupKey { +struct KvLookupKey, Key, V> { void operator()(const Key* keys, V* vals, int32 num_items, int32 dimension, - GPUStaticHashTable* hash_table, cudaStream_t stream) { + int32 slot_idx, int32 slot_num, + GPUStaticHashTable* hash_table, const V* default_v, + int32 default_v_num, cudaStream_t stream) { using ViewT = typename cuco::static_map< Key, int32, cuda::thread_scope_device, gpu_hash_map_tf_allocator>::device_view; auto& map = hash_table->hash_table->map_; - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_items + stride * block_size - 1) / - (stride * block_size); + auto const grid_size = (TILE_SIZE * num_items + STRIDE * BLOCK_SIZE - 1) / + (STRIDE * BLOCK_SIZE); TF_CHECK_OK(GpuLaunchKernel( - kv_lookup_key_kernel, grid_size, - block_size, 0, stream, keys, hash_table->values_d, vals, num_items, - dimension, map.get_device_view(), cuco::detail::MurmurHash3_32{}, - 
thrust::equal_to{})); + kv_lookup_static_key_kernel, + grid_size, BLOCK_SIZE, 0, stream, keys, hash_table->values_d, vals, + default_v, default_v_num, num_items, dimension, map.get_device_view(), + cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); } }; @@ -394,16 +471,13 @@ struct KvLookupInsertKey { auto n = std::min(capacity_remaining, num_to_insert); - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = - (tile_size * n + stride * block_size - 1) / (stride * block_size); + auto const grid_size = (TILE_SIZE * n + STRIDE * BLOCK_SIZE - 1) / + (STRIDE * BLOCK_SIZE); TF_CHECK_OK(GpuLaunchKernel( kv_lookup_and_insert_key_kernel< - block_size, tile_size, Key, mutableViewT, ViewT, + BLOCK_SIZE, TILE_SIZE, Key, mutableViewT, ViewT, cuco::detail::MurmurHash3_32, thrust::equal_to>, - grid_size, block_size, 0, stream, key_first, value_first, n, + grid_size, BLOCK_SIZE, 0, stream, key_first, value_first, n, map.get_submap_mutable_views().data().get(), map.get_submap_views().data().get(), map.get_submaps().size(), map.get_num_successes(), start_idx, submap_idx, @@ -424,9 +498,8 @@ struct KvLookupInsertKey { template __global__ void kv_lookup_or_create_emb_kernel( const Key* key_first, Value* val, Value* default_v, int64 dim, - bool is_use_default_value_tensor, int32* item_idxs, int32 slot_idx, - Value** d_banks, bool** d_flags, int32 slot_num, int32 default_v_num, - int32 bank_size) { + int32* item_idxs, int32 slot_idx, Value** d_banks, + bool** d_flags, int32 slot_num, int32 default_v_num, int32 bank_size) { auto item_idx = blockIdx.x; auto item_pos = item_idxs[item_idx]; auto bank_idx = item_pos / bank_size; @@ -437,19 +510,13 @@ __global__ void kv_lookup_or_create_emb_kernel( if (stored == false) { d_flags[slot_offset][offset_in_bank] = true; for (auto id = threadIdx.x; id < dim; id += blockDim.x) { - int32 default_v_idx; - if (is_use_default_value_tensor) { - default_v_idx = item_idx % default_v_num; - } else { - default_v_idx = *(key_first + item_idx) % default_v_num; - } + int32 default_v_idx = *(key_first + item_idx) % default_v_num; d_banks[slot_offset][offset_in_bank * dim + id] = default_v[default_v_idx * dim + id]; } } for (auto id = threadIdx.x; id < dim; id += blockDim.x) { - val[item_idx * dim + id] = - d_banks[slot_offset][offset_in_bank * dim + id]; + val[item_idx * dim + id] = d_banks[slot_offset][offset_in_bank * dim + id]; } } @@ -457,7 +524,7 @@ template struct KvLookupCreateEmb { void operator()(const Key* key_first, Value* val, Value* default_v, int64 dim, int32* item_idxs, int32 num_items, int32 slot_idx, - int32 default_v_num, bool is_use_default_value_tensor, + int32 default_v_num, Value** d_banks, bool** d_flags, int32 slot_num, int32 bank_size, cudaStream_t stream) { auto const block_size = 256; @@ -465,7 +532,7 @@ struct KvLookupCreateEmb { TF_CHECK_OK( GpuLaunchKernel(kv_lookup_or_create_emb_kernel, grid_size, block_size, 0, stream, key_first, val, default_v, dim, - is_use_default_value_tensor, item_idxs, slot_idx, + item_idxs, slot_idx, d_banks, d_flags, slot_num, default_v_num, bank_size)); } }; @@ -608,22 +675,35 @@ struct KvEmbGetSnapshot { } // namespace functor -#define REGISTER_ALL_TYPE(type) \ - template struct functor::KvInitStaticMap; \ - template struct functor::KvInitStaticMap; \ - template struct functor::KvLookupKey; \ - template struct functor::KvLookupKey; \ - template struct functor::KvLookupInsertKey; \ - template struct functor::KvLookupInsertKey; \ - template struct functor::KvLookupCreateEmb; 
\ - template struct functor::KvLookupCreateEmb; \ - template struct functor::KvKeyGetSnapshot; \ - template struct functor::KvKeyGetSnapshot; \ - template struct functor::KvEmbGetSnapshot; \ - template struct functor::KvEmbGetSnapshot; \ - template struct functor::KvUpdateEmb; \ +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvInitStaticMap; \ + template struct functor::KvInitStaticMap; \ + template struct functor::KvLookupInsertKey; \ + template struct functor::KvLookupInsertKey; \ + template struct functor::KvLookupCreateEmb; \ + template struct functor::KvLookupCreateEmb; \ + template struct functor::KvKeyGetSnapshot; \ + template struct functor::KvKeyGetSnapshot; \ + template struct functor::KvEmbGetSnapshot; \ + template struct functor::KvEmbGetSnapshot; \ + template struct functor::KvUpdateEmb; \ template struct functor::KvUpdateEmb; TF_CALL_REAL_NUMBER_TYPES(REGISTER_ALL_TYPE) + +#define REGISTER_LOOKUP_KERNEL_ALL(hash_table, type) \ + template struct functor::KvLookupKey, int32, type>; \ + template struct functor::KvLookupKey, int64, type > ; +#define REGISTER_INFERENCE_LOOKUP_KERNEL(type) \ + REGISTER_LOOKUP_KERNEL_ALL(GPUHashTable, type) +#define REGISTER_TRAINING_LOOKUP_KERNEL(type) \ + REGISTER_LOOKUP_KERNEL_ALL(GPUStaticHashTable, type) + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_INFERENCE_LOOKUP_KERNEL) +TF_CALL_REAL_NUMBER_TYPES(REGISTER_TRAINING_LOOKUP_KERNEL) + +#undef REGISTER_INFERENCE_LOOKUP_KERNEL +#undef REGISTER_TRAINING_LOOKUP_KERNEL +#undef REGISTER_LOOKUP_KERNEL_ALL_TYPE #undef REGISTER_ALL_TYPE } // namespace tensorflow diff --git a/tensorflow/core/framework/embedding/gpu_hash_table.h b/tensorflow/core/framework/embedding/gpu_hash_table.h index 076f3e767c7..a42354ea266 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_table.h +++ b/tensorflow/core/framework/embedding/gpu_hash_table.h @@ -73,11 +73,12 @@ class GPUHashTable { namespace functor { -template +template struct KvLookupKey { void operator()(const Key* key_first, V* value_first, int32 num_items, - int32 dimension, GPUStaticHashTable* hash_table, - cudaStream_t stream); + int32 dimension, int32 slot_idx, int32 slot_num, + HashTable* hash_table, + const V* default_v, int32 default_v_num, cudaStream_t stream); }; template @@ -99,7 +100,7 @@ template struct KvLookupCreateEmb { void operator()(const Key* key_first, Value* val, Value* default_v, int64 dim, int32* item_idxs, int32 num_items, int32 slot_idx, - int32 default_v_num, bool is_use_default_value_tensor, + int32 default_v_num, Value** d_banks, bool** d_flags, int32 slot_num, int32 bank_size, cudaStream_t stream); }; diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index 64e0c4685f0..40108a140cc 100644 --- a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -101,7 +101,7 @@ class KVInterface { virtual Iterator* GetIterator() { return nullptr; } virtual Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, - int32 default_v_num, bool is_use_default_value_tensor, + int32 default_v_num, size_t n, const Eigen::GpuDevice& device) { return Status::OK(); } @@ -110,9 +110,8 @@ class KVInterface { return Status::OK(); } - virtual Status BatchLookup(const K* keys, V* val, V* default_v, - int32 default_v_num, bool is_use_default_value_tensor, - size_t n, const Eigen::GpuDevice& device) { + virtual Status BatchLookup(const Eigen::GpuDevice& device, + const K* keys, V* val, size_t n, const V* default_v) { return 
Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookup in KVInterface."); } diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h index ad9dc4e15b6..2ebc4e3dc40 100644 --- a/tensorflow/core/framework/embedding/single_tier_storage.h +++ b/tensorflow/core/framework/embedding/single_tier_storage.h @@ -444,10 +444,10 @@ class HbmStorage : public SingleTierStorage { } void BatchLookupOrCreate(const K* key, V* val, V* default_v, - int32 default_v_num, bool is_use_default_value_tensor, + int32 default_v_num, size_t n, const Eigen::GpuDevice& device) override { SingleTierStorage::kv_->BatchLookupOrCreate(key, val, default_v, default_v_num, - is_use_default_value_tensor, n, device); + n, device); } void BatchLookupOrCreateKeys(const K* key, int32* item_idxs, size_t n, @@ -455,11 +455,9 @@ class HbmStorage : public SingleTierStorage { SingleTierStorage::kv_->BatchLookupOrCreateKeys(key, n, item_idxs, device); } - void BatchLookup(const K* key, V* val, V* default_v, - int32 default_v_num, bool is_use_default_value_tensor, - size_t n, const Eigen::GpuDevice& device) override { - SingleTierStorage::kv_->BatchLookup(key, val, default_v, default_v_num, - is_use_default_value_tensor, n, device); + void BatchLookup(const Eigen::GpuDevice& device, const K* keys, V* val, + size_t n, const V* default_v) override { + SingleTierStorage::kv_->BatchLookup(device, keys, val, n, default_v); } int64 GetSnapshot(std::vector* key_list, diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index 2ac2e8f6523..7a7deaae483 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -134,13 +134,11 @@ class Storage { const DeviceBase::CpuWorkerThreads* worker_threads) = 0; virtual void BatchLookupOrCreate(const K* key, V* val, V* default_v, - int32 default_v_num, bool is_use_default_value_tensor, - size_t n, const Eigen::GpuDevice& device) {} + int32 default_v_num, size_t n, const Eigen::GpuDevice& device) {} virtual void BatchLookupOrCreateKeys(const K* key, int32* item_idxs, size_t n, const Eigen::GpuDevice& device) {} - virtual void BatchLookup(const K* keys, V* val, V* default_v, - int32 default_v_num, bool is_use_default_value_tensor, - size_t n, const Eigen::GpuDevice& device) {} + virtual void BatchLookup(const Eigen::GpuDevice& device, const K* keys, V* val, + size_t n, const V* default_v) {} virtual void ImportToHbm(const std::vector& keys, const std::vector& values, const Eigen::GpuDevice* device, const EmbeddingConfig& emb_config) {}; diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops.cc index ba251559f7b..1a875738d20 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops.cc @@ -25,9 +25,9 @@ namespace tensorflow { #define USING_BASE_CLASS_MEMBER \ using GroupLookupBaseCpuOp::m_num_lookup; \ using GroupLookupBaseCpuOp::m_dimension; \ - using GroupLookupBaseCpuOp::m_is_use_default_value_tensor; \ - using GroupLookupBaseCpuOp::m_get_default_v_fn; \ - using GroupLookupBaseCpuOp::m_lookup_fn; + using GroupLookupBaseCpuOp::m_is_use_default_value_tensor; + +using CPUDevice = Eigen::ThreadPoolDevice; template class GroupEmbeddingVariableLookupDenseCpuOp @@ -38,32 +38,6 @@ class GroupEmbeddingVariableLookupDenseCpuOp : 
GroupLookupBaseCpuOp(c) { OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", &m_is_use_default_value_tensor)); - bool is_inference; - TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference)); - - if (m_is_use_default_value_tensor) { - m_get_default_v_fn = [](TValue* default_v, TKey id, int64 index, - int64 total_dim, - int64 len) { return default_v + len * index; }; - } else { - m_get_default_v_fn = [](TValue* default_v, TKey id, int64 index, - int64 total_dim, int64 len) { - return default_v + len * (id % total_dim); - }; - } - if (!is_inference) { - m_lookup_fn = [](EmbeddingVar* ev, TKey key, TValue* val, - TValue* default_v, int count) { - ev->LookupOrCreate(key, val, default_v, count); - return Status::OK(); - }; - } else { - m_lookup_fn = [](EmbeddingVar* ev, TKey key, TValue* val, - TValue* default_v, int count) { - ev->LookupOrCreate(key, val, default_v); - return Status::OK(); - }; - } } void Compute(OpKernelContext* ctx) override { @@ -79,6 +53,7 @@ class GroupEmbeddingVariableLookupDenseCpuOp core::ScopedUnref unref_me(embedding_var); const Tensor& dense_values_tensor = ctx->input(m_num_lookup + i); + auto dense_values = dense_values_tensor.flat().data(); int nnz = dense_values_tensor.NumElements(); auto dense_values_tensor_shape = dense_values_tensor.shape(); @@ -98,55 +73,15 @@ class GroupEmbeddingVariableLookupDenseCpuOp embedding_var->CacheSize(), " should large than IDs in batch ", nnz)); - // Stage 1 - Tensor unique_idx_tensor; - Tensor unique_tensor; - Tensor unique_counter; - - UniqueWithoutAxis( - ctx, dense_values_tensor, &unique_idx_tensor, &unique_tensor, - &unique_counter, 0, this->partition_size_, this->serial_, - this->unique_ratio_hint_, this->map_flag_); - - ctx->set_output(m_num_lookup + i, unique_tensor); - ctx->set_output(2 * m_num_lookup + i, unique_idx_tensor); - - auto* unique = unique_tensor.flat().data(); - auto* unique_idx = unique_idx_tensor.flat().data(); - - TValue* default_v = nullptr; + EmbeddingVarContext ev_ctx(ctx); if (m_is_use_default_value_tensor) { - default_v = - reinterpret_cast(ctx->input(m_num_lookup * 2).data()); + embedding_var->GetEmbeddings(ev_ctx, dense_values, gather_embedding, + nnz, reinterpret_cast(ctx->input(m_num_lookup * 4 + 1).data())); } else { - default_v = embedding_var->GetDefaultValuePtr(); + embedding_var->GetEmbeddings(ev_ctx, dense_values, gather_embedding, nnz); + embedding_var->UpdateCache(dense_values_tensor, true); } - int slice_bytes = nnz * m_dimension * 1000; - auto do_lookup = [this, ctx, embedding_var, unique, default_v, unique_idx, - gather_embedding](int64 start, int64 end) { - for (int k = start; k < end; ++k) { - auto indices = unique_idx[k]; - TKey unique_id = unique[indices]; - TValue* default_v_ptr = m_get_default_v_fn( - default_v, unique_id, indices, - embedding_var->GetDefaultValueDim(), embedding_var->ValueLen()); - OP_REQUIRES_OK(ctx, m_lookup_fn(embedding_var, unique_id, - gather_embedding + k * m_dimension, - default_v_ptr, 1 /*count*/)); - } - }; - Shard(worker_threads->num_threads, worker_threads->workers, nnz, - slice_bytes, do_lookup); - - if (embedding_var->IsMultiLevel()) { - embedding::BatchCache* cache = embedding_var->Cache(); - embedding_var->storage()->Schedule( - [embedding_var, dense_values_tensor] { - embedding::BatchCache* cache = embedding_var->Cache(); - cache->update(dense_values_tensor); - }); - } } } }; diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h 
b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h index 5c352144234..551086bfd4d 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h @@ -106,36 +106,38 @@ __global__ void ComputeSparseGradFn( // each thread corresponding to one element in the embedding vector const int tid = tile.thread_rank(); - for (int idx = 0; idx < num_lookups; ++idx) { - const int value_offset = args[idx].offset_indices_[bid]; - int feature_num; - if (bid == (batch_size - 1)) { - feature_num = args[idx].nnz_ - value_offset; - } else { - feature_num = args[idx].offset_indices_[bid + 1] - value_offset; - } + if (bid < batch_size && tid < dimension ) { + for (int idx = 0; idx < num_lookups; ++idx) { + const int value_offset = args[idx].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[idx].nnz_ - value_offset; + } else { + feature_num = args[idx].offset_indices_[bid + 1] - value_offset; + } - if (feature_num > 0) { - float grad = args[idx].grads_[bid * dimension + tid]; - grad = CombineGrad(grad, feature_num); - for (int i = 0; i < feature_num; i++) { - float grad_i = grad; - if (max_norm > 0.0f) { - int64_t indices = int(args[idx].sp_values_[value_offset + i]); - float emb_element = - args[idx].emb_variable_[indices * dimension + tid]; - if (tid == 0) { - l2_sum = 0.0f; - } - tile.shfl(l2_sum, 0); - atomicAdd(&l2_sum, emb_element * emb_element); - tile.sync(); - float l2_norm = sqrtf(l2_sum); - if (l2_norm > max_norm) { - grad_i *= max_norm / l2_norm; + if (feature_num > 0) { + float grad = args[idx].grads_[bid * dimension + tid]; + grad = CombineGrad(grad, feature_num); + for (int i = 0; i < feature_num; i++) { + float grad_i = grad; + if (max_norm > 0.0f) { + int64_t indices = int(args[idx].sp_values_[value_offset + i]); + float emb_element = + args[idx].emb_variable_[indices * dimension + tid]; + if (tid == 0) { + l2_sum = 0.0f; + } + tile.shfl(l2_sum, 0); + atomicAdd(&l2_sum, emb_element * emb_element); + tile.sync(); + float l2_norm = sqrtf(l2_sum); + if (l2_norm > max_norm) { + grad_i *= max_norm / l2_norm; + } } + args[idx].grads_output_[(value_offset + i) * dimension + tid] = grad_i; } - args[idx].grads_output_[(value_offset + i) * dimension + tid] = grad_i; } } } diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h index 665c2a6703e..1eef2152c58 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h @@ -59,22 +59,35 @@ __global__ void SetToIntMaxSTG128(const int batch_size, int* values_offset) { } } +__device__ void FilledEmptyRowNumber(int batch_size, volatile int* values_offset) { + const int thread_offset = blockIdx.x * blockDim.x + threadIdx.x; + const int int_max = 0x7fffffff; + if (thread_offset > 1) { + if (thread_offset < batch_size) { + while (values_offset[thread_offset] == int_max) { + const int compare = values_offset[thread_offset-1]; + if (compare != int_max) { + atomicMin((int*)values_offset + thread_offset, compare); + } + } + } + } else { + if (values_offset[thread_offset] == int_max) { + values_offset[thread_offset] = 0; + } + } +} + __global__ void 
CalcPerElementRowOffset(int batch_size, int nnz, int stride, const int64_t* indices, - volatile int* values_offset) { + int* values_offset) { const int thread_offset = blockIdx.x * blockDim.x + threadIdx.x; - const int int_max = 0x7fffffff; if (thread_offset < nnz) { const int64_t element_row = indices[stride*thread_offset]; atomicMin((int*)values_offset + int(element_row), thread_offset); - __syncthreads(); - if (thread_offset < int(batch_size - 1)) { - while (values_offset[thread_offset + 1] == int_max) { - } - const int compare = values_offset[thread_offset + 1]; - atomicMin((int*)values_offset + thread_offset, compare); - } } + __syncthreads(); + FilledEmptyRowNumber(batch_size, values_offset); } inline void launch_cal_per_element_row_offset(const int batch_size, int nnz, int stride, @@ -85,7 +98,6 @@ inline void launch_cal_per_element_row_offset(const int batch_size, int nnz, int int blocks = (batch_size - 1) / threads + 1; SetToIntMaxSTG128<<>>(batch_size, offset_indices); - blocks = (nnz - 1) / threads + 1; CalcPerElementRowOffset<<>>( batch_size, nnz, stride, sp_indices, offset_indices); diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h index a49890dcbc9..001e3cddb49 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h @@ -43,10 +43,6 @@ class GroupLookupBaseCpuOp : public OpKernel { } protected: - std::function m_get_default_v_fn; - std::function* ev, TKey key, TValue* val, - TValue* default_v, int count)> - m_lookup_fn; // float max_norm_; int m_num_lookup; int m_dimension; diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cc index e05aadbd350..fd644f903aa 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cc @@ -16,6 +16,7 @@ limitations under the License. 
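The `SetToIntMaxSTG128` / `CalcPerElementRowOffset` / `FilledEmptyRowNumber` rework in the forward_base_ops.cu.h hunk above splits offset computation from empty-row filling. A sequential NumPy sketch of what the pipeline computes, under the assumption of row-major sorted COO indices (the CUDA version does the same with one thread per element and atomicMin; this is an illustration, not the kernel itself):

```python
import numpy as np

INT_MAX = 0x7FFFFFFF

def per_element_row_offset(batch_size, row_indices):
    # SetToIntMaxSTG128: initialize all row offsets to INT_MAX
    offsets = np.full(batch_size, INT_MAX, dtype=np.int64)
    # CalcPerElementRowOffset: atomicMin of each element's position
    # into its row's slot
    for pos, row in enumerate(row_indices):
        offsets[row] = min(offsets[row], pos)
    # FilledEmptyRowNumber: rows 0 and 1 fall back to 0 when empty;
    # later empty rows spin until they can copy the previous row's
    # offset (a forward fill, sequentially)
    for r in (0, 1):
        if r < batch_size and offsets[r] == INT_MAX:
            offsets[r] = 0
    for r in range(2, batch_size):
        if offsets[r] == INT_MAX:
            offsets[r] = offsets[r - 1]
    return offsets

# rows 1 and 2 are empty; rows 0 and 3 hold two values each
print(per_element_row_offset(4, [0, 0, 3, 3]))  # -> [0 0 0 2]
```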
#include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/embedding_var.h" +#include "tensorflow/core/framework/embedding/embedding_var_context.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/resource_mgr.h" @@ -27,9 +28,9 @@ namespace tensorflow { using GroupLookupBaseCpuOp::m_num_lookup; \ using GroupLookupBaseCpuOp::m_dimension; \ using GroupLookupBaseCpuOp::m_is_use_default_value_tensor; \ - using GroupLookupBaseCpuOp::m_is_sequence; \ - using GroupLookupBaseCpuOp::m_get_default_v_fn; \ - using GroupLookupBaseCpuOp::m_lookup_fn; + using GroupLookupBaseCpuOp::m_is_sequence; + +using CPUDevice = Eigen::ThreadPoolDevice; template class GroupEmbeddingVariableLookupCpuOp @@ -39,36 +40,9 @@ class GroupEmbeddingVariableLookupCpuOp public: explicit GroupEmbeddingVariableLookupCpuOp(OpKernelConstruction *c) : GroupLookupBaseCpuOp(c) { - bool is_inference; - TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference)); - - if (!is_inference) { - m_lookup_fn = [](EmbeddingVar *ev, TKey key, TValue *val, - TValue *default_v, int count) { - ev->LookupOrCreate(key, val, default_v, count); - return Status::OK(); - }; - } else { - m_lookup_fn = [](EmbeddingVar *ev, TKey key, TValue *val, - TValue *default_v, int count) { - ev->LookupOrCreate(key, val, default_v); - return Status::OK(); - }; - } OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", &m_is_use_default_value_tensor)); - - if (m_is_use_default_value_tensor) { - m_get_default_v_fn = [](TValue *default_v, TKey id, int64 index, - int64 total_dim, - int64 len) { return default_v + len * index; }; - } else { - m_get_default_v_fn = [](TValue *default_v, TKey id, int64 index, - int64 total_dim, int64 len) { - return default_v + len * (id % total_dim); - }; - } } void Compute(OpKernelContext *ctx) override { @@ -137,14 +111,6 @@ class GroupEmbeddingVariableLookupCpuOp batch_nums[k] += batch_nums[k - 1]; } - TValue *default_v = nullptr; - if (m_is_use_default_value_tensor) { - default_v = - reinterpret_cast(ctx->input(m_num_lookup * 4 + 1).data()); - } else { - default_v = embedding_var->GetDefaultValuePtr(); - } - // Stage 2 Tensor unique_embedding; unique_shape.AppendShape({static_cast(m_dimension)}); @@ -154,30 +120,13 @@ class GroupEmbeddingVariableLookupCpuOp ctx, ctx->allocate_temp(DataTypeToEnum::v(), unique_shape, &unique_embedding, attr)); auto unique_embedding_data = unique_embedding.flat().data(); - - int slice_bytes = unique_nnz * m_dimension * 1000; - auto do_lookup = [this, ctx, embedding_var, unique, default_v, - unique_embedding_data](int64 start, int64 end) { - for (int k = start; k < end; ++k) { - TValue *default_v_ptr = m_get_default_v_fn( - default_v, unique[k], k, embedding_var->GetDefaultValueDim(), - embedding_var->ValueLen()); - OP_REQUIRES_OK(ctx, - m_lookup_fn(embedding_var, unique[k], - unique_embedding_data + k * m_dimension, - default_v_ptr, 1 /*count*/)); - } - }; - Shard(worker_threads->num_threads, worker_threads->workers, unique_nnz, - slice_bytes /*cost*/, do_lookup); - - if (embedding_var->IsMultiLevel()) { - embedding::BatchCache *cache = embedding_var->Cache(); - embedding_var->storage()->Schedule( - [embedding_var, sp_values_tensor] { - embedding::BatchCache *cache = embedding_var->Cache(); - cache->update(sp_values_tensor); - }); + EmbeddingVarContext ev_ctx(ctx); + if (m_is_use_default_value_tensor) { + embedding_var->GetEmbeddings(ev_ctx, unique, 
unique_embedding_data, + unique_nnz, reinterpret_cast(ctx->input(m_num_lookup * 4 + 1).data())); + } else { + embedding_var->GetEmbeddings(ev_ctx, unique, unique_embedding_data, unique_nnz); + embedding_var->UpdateCache(unique_tensor, unique_counter, true/*called_by_gather*/); } std::vector default_weights(nnz, 1.0); @@ -205,7 +154,7 @@ class GroupEmbeddingVariableLookupCpuOp &gather_embedding_tensor)); auto gather_embedding = gather_embedding_tensor->flat().data(); - slice_bytes = nnz / batch_size * m_dimension * 1000; + int slice_bytes = nnz / batch_size * m_dimension * 1000; // todo: clean these redundant code if (this->m_combiner == "mean") { auto embedding_var_mean_combiner = [this, &gather_embedding, batch_nums, diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc index f9b9363e1aa..7cb1cfc098f 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc @@ -41,25 +41,6 @@ class GroupEmbeddingVarLookupOp : GroupEmbeddingLookupForwardBaseOp(c) { OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", &is_use_default_value_tensor_)); - bool is_inference; - TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference)); - if (!is_inference) { - lookup_fn_ = [](EmbeddingVar* ev, const TFKey* key, - TValue* val, TValue* default_v, int32 default_v_num, - bool is_use_default_value_tensor, - size_t n, const Eigen::GpuDevice& device) { - ev->LookupOrCreate(key, val, default_v, default_v_num, - is_use_default_value_tensor, n, device); - }; - } else { - lookup_fn_ = [](EmbeddingVar* ev, const TFKey* key, - TValue* val, TValue* default_v, int32 default_v_num, - bool is_use_default_value_tensor, - size_t n, const Eigen::GpuDevice& device) { - ev->Lookup(key, val, default_v, default_v_num, - is_use_default_value_tensor, n, device); - }; - } } void Compute(OpKernelContext* ctx) override { @@ -106,7 +87,8 @@ class GroupEmbeddingVarLookupOp OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::value, {N * dimension}, &out_tensor)); TValue* out_base = out_tensor.flat().data(); - + + EmbeddingVarContext ev_ctx(ctx); if (ev->IsSingleHbm()) { if (is_use_default_value_tensor_) { Tensor default_values(ctx->input(5 * this->num_lookups_)); @@ -114,17 +96,13 @@ class GroupEmbeddingVarLookupOp auto default_values_matrix = default_values.shaped({default_value_num, dimension}); TValue* default_v_base = &default_values_matrix(0, 0); - lookup_fn_(ev, key_base, out_base, default_v_base, - default_value_num, is_use_default_value_tensor_, N, - device); - + ev->GetEmbeddings(ev_ctx, key_base, out_base, N); } else { - lookup_fn_(ev, key_base, out_base, ev->GetDefaultValuePtr(), - ev->GetDefaultValueDim(), true, N, device); + ev->GetEmbeddings(ev_ctx, key_base, out_base, N); } } else { - Tensor indices_host( - sp_values_tensor.dtype(), sp_values_tensor.shape()); + TensorShape indices_host_shape = sp_values_tensor.shape(); + Tensor indices_host(sp_indices_tensor.dtype(), indices_host_shape); //Copy ids from GPU to CPU for CPU Lookup. 
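Across these lookup hunks the `is_use_default_value_tensor` flag no longer threads into the storage layer: when no explicit default tensor is supplied, a missing key's initial row is derived from the key itself. A hedged NumPy sketch of that selection rule, matching the `key % default_v_num * dimension` addressing used in the CUDA kernels earlier in this patch (values here are made up for illustration):

```python
import numpy as np

# table of default rows: default_value_dim rows for an embedding of dim 3
default_values = np.arange(12, dtype=np.float32).reshape(4, 3)

def default_row(key, default_values):
    # a missing key indexes the default table by key modulo its row count,
    # rather than by the element's position in the batch as some paths
    # did before this refactor
    return default_values[key % default_values.shape[0]]

print(default_row(10, default_values))  # key 10 -> row 10 % 4 == 2
```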
auto stream = ctx->op_device_context()->stream(); auto event_mgr = ctx->device()->tensorflow_gpu_device_info()->event_mgr; @@ -132,9 +110,8 @@ class GroupEmbeddingVarLookupOp stream->ThenMemcpy(indices_host.data(), gpu_src, N * sizeof(TFKey)); SyncWithEventMgr(stream, event_mgr); EmbeddingVarContext ev_ctx(ctx); - ev->GetEmbeddings(ev_ctx, (TFKey*)indices_host.data(), - out_base, N); - ev->UpdateCache(indices_host); + ev->GetEmbeddings(ev_ctx, (TFKey*)indices_host.data(), out_base, N); + ev->UpdateCache(indices_host, true); } TensorShape emb_vectors_tensor_shape; @@ -205,10 +182,6 @@ class GroupEmbeddingVarLookupOp } private: - std::function* ev, const TFKey* key, - TValue* val, TValue* default_v, int32 default_v_num, - bool is_use_default_value_tensor, - size_t n, const Eigen::GpuDevice& device)> lookup_fn_; bool is_use_default_value_tensor_; }; diff --git a/tensorflow/core/kernels/kv_variable_lookup_ops.cc b/tensorflow/core/kernels/kv_variable_lookup_ops.cc index dc1566239a1..c69aec8ebb9 100644 --- a/tensorflow/core/kernels/kv_variable_lookup_ops.cc +++ b/tensorflow/core/kernels/kv_variable_lookup_ops.cc @@ -348,26 +348,7 @@ class KvResourceGatherGPUOp : public OpKernel { OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", &is_use_default_value_tensor_)); - bool is_inference; - TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference)); - if (!is_inference) { - lookup_fn_ = [](EmbeddingVar* ev, const TKey* key, - TValue* val, TValue* default_v, int32 default_v_num, - bool is_use_default_value_tensor, - size_t n, const Eigen::GpuDevice& device) { - ev->LookupOrCreate(key, val, default_v, default_v_num, - is_use_default_value_tensor, n, device); - }; - } else { - lookup_fn_ = [](EmbeddingVar* ev, const TKey* key, - TValue* val, TValue* default_v, int32 default_v_num, - bool is_use_default_value_tensor, - size_t n, const Eigen::GpuDevice& device) { - ev->Lookup(key, val, default_v, default_v_num, - is_use_default_value_tensor, n, device); - }; } - } void Compute(OpKernelContext* c) override { EmbeddingVar* ev = nullptr; @@ -406,6 +387,7 @@ class KvResourceGatherGPUOp : public OpKernel { "MultiLevel EV's Cache size ", ev->CacheSize(), " should large than IDs in batch ", N)); const size_t slice_bytes = slice_elems * sizeof(TValue); + EmbeddingVarContext ev_ctx(c); if (ev->IsSingleHbm()) { const TKey* key_base = &indices_flat(0); const Device& device = c->eigen_device(); @@ -415,13 +397,9 @@ class KvResourceGatherGPUOp : public OpKernel { auto default_values_matrix = default_values.shaped( {default_value_num, ev->ValueLen()}); TValue* default_v_base = &default_values_matrix(0, 0); - lookup_fn_(ev, key_base, out_base, default_v_base, - default_value_num, is_use_default_value_tensor_, - indices_size, device); + ev->GetEmbeddings(ev_ctx, key_base, out_base, N); } else { - lookup_fn_(ev, key_base, out_base, ev->GetDefaultValuePtr(), - ev->GetDefaultValueDim(), is_use_default_value_tensor_, - indices_size, device); + ev->GetEmbeddings(ev_ctx, key_base, out_base, N); } } else { Tensor indices_host(indices.dtype(), indices.shape()); @@ -448,10 +426,6 @@ class KvResourceGatherGPUOp : public OpKernel { private: bool is_use_default_value_tensor_; - std::function* ev, const TKey* key, - TValue* val, TValue* default_v, int32 default_v_num, - bool is_use_default_value_tensor, - size_t n, const Eigen::GpuDevice& device)> lookup_fn_; }; #define REGISTER_KERNELS(dev, ktype, vtype) \ @@ -613,4 +587,3 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_ALL) #endif // GOOGLE_CUDA } // namespace 
tensorflow - diff --git a/tensorflow/core/kernels/training_ali_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ali_ops_gpu.cu.cc index fecc7c555a5..a16319cf03e 100644 --- a/tensorflow/core/kernels/training_ali_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/training_ali_ops_gpu.cu.cc @@ -44,8 +44,9 @@ __device__ Eigen::half impl_rsqrt(Eigen::half x) { return __float2half(rsqrt(__half2float(x))); } -template -__global__ void kv_sparse_apply_adagrad_kernel(int32* item_idxs, +template +__global__ void kv_sparse_apply_adagrad_kernel(const TKey* key_base, + int32* item_idxs, int64 dim, Value** d_banks, bool** d_flags, @@ -72,13 +73,15 @@ __global__ void kv_sparse_apply_adagrad_kernel(int32* item_idxs, if (var_default_v != nullptr && var_stored == false) { d_flags[var_slot_offset][offset_in_bank] = true; for (auto id = threadIdx.x; id < dim; id += blockDim.x) { - d_banks[var_slot_offset][offset_in_bank * dim + id] = var_default_v[(item_idx % var_default_v_num) * dim + id]; + d_banks[var_slot_offset][offset_in_bank * dim + id] = + var_default_v[(*(key_base + item_idx) % var_default_v_num) * dim + id]; } } if (acc_default_v != nullptr && acc_stored == false) { d_flags[acc_slot_offset][offset_in_bank] = true; for (auto id = threadIdx.x; id < dim; id += blockDim.x) { - d_banks[acc_slot_offset][offset_in_bank * dim + id] = acc_default_v[(item_idx % acc_default_v_num) * dim + id]; + d_banks[acc_slot_offset][offset_in_bank * dim + id] = + acc_default_v[(*(key_base + item_idx) % acc_default_v_num) * dim + id]; } } for (auto id = threadIdx.x; id < dim; id += blockDim.x) { @@ -106,9 +109,9 @@ struct KvSparseApplyAdagrad { auto const block_size = 256; auto const grid_size = num_items; GPUHashTable* hashtable = var->HashTable(); - TF_CHECK_OK(GpuLaunchKernel(kv_sparse_apply_adagrad_kernel, + TF_CHECK_OK(GpuLaunchKernel(kv_sparse_apply_adagrad_kernel, grid_size, block_size, 0, device.stream(), - item_idxs, var->ValueLen(), hashtable->d_bank_ptrs, + key_base, item_idxs, var->ValueLen(), hashtable->d_bank_ptrs, hashtable->d_existence_flag_ptrs, var->EmbIdx(), accum->EmbIdx(), var->SlotNum(), hashtable->initial_bank_size, @@ -234,8 +237,9 @@ T blockReduceSum(T val) return val; } -template -__global__ void kv_sparse_apply_ftrl_kernel(int32* item_idxs, +template +__global__ void kv_sparse_apply_ftrl_kernel(const TKey* key_base, + int32* item_idxs, int64 dim, Value** d_banks, bool** d_flags, @@ -275,19 +279,22 @@ __global__ void kv_sparse_apply_ftrl_kernel(int32* item_idxs, if (var_default_v != nullptr && var_stored == false) { d_flags[var_slot_offset][offset_in_bank] = true; for (auto id = threadIdx.x; id < dim; id += blockDim.x) { - d_banks[var_slot_offset][offset_in_bank * dim + id] = var_default_v[(item_idx % var_default_v_num) * dim + id]; + d_banks[var_slot_offset][offset_in_bank * dim + id] = + var_default_v[(*(key_base + item_idx) % var_default_v_num) * dim + id]; } } if (acc_default_v != nullptr && acc_stored == false) { d_flags[acc_slot_offset][offset_in_bank] = true; for (auto id = threadIdx.x; id < dim; id += blockDim.x) { - d_banks[acc_slot_offset][offset_in_bank * dim + id] = acc_default_v[(item_idx % acc_default_v_num) * dim + id]; + d_banks[acc_slot_offset][offset_in_bank * dim + id] = + acc_default_v[(*(key_base + item_idx) % acc_default_v_num) * dim + id]; } } if (linear_default_v != nullptr && linear_stored == false) { d_flags[linear_slot_offset][offset_in_bank] = true; for (auto id = threadIdx.x; id < dim; id += blockDim.x) { - d_banks[linear_slot_offset][offset_in_bank * dim + id] = 
linear_default_v[(item_idx % linear_default_v_num) * dim + id]; + d_banks[linear_slot_offset][offset_in_bank * dim + id] = + linear_default_v[(*(key_base + item_idx) % linear_default_v_num) * dim + id]; } } Value linear_tmp = 0; @@ -363,9 +370,9 @@ struct KvSparseApplyFtrl { auto const block_size = 256; auto const grid_size = num_items; auto hashtable = var->HashTable(); - TF_CHECK_OK(GpuLaunchKernel(kv_sparse_apply_ftrl_kernel, + TF_CHECK_OK(GpuLaunchKernel(kv_sparse_apply_ftrl_kernel, grid_size, block_size, (var->ValueLen()) * sizeof(T), device.stream(), - item_idxs, var->ValueLen(), hashtable->d_bank_ptrs, + key_base, item_idxs, var->ValueLen(), hashtable->d_bank_ptrs, hashtable->d_existence_flag_ptrs, var->EmbIdx(), accum->EmbIdx(), linear->EmbIdx(), var->SlotNum(), hashtable->initial_bank_size, @@ -376,8 +383,9 @@ struct KvSparseApplyFtrl { } }; -template -__global__ void KvSparseApplyAdamAsyncKernel(int32 *item_idxs, +template +__global__ void KvSparseApplyAdamAsyncKernel(const TKey* key_base, + int32 *item_idxs, int64 dim, T **d_banks, bool **d_flags, @@ -424,19 +432,22 @@ __global__ void KvSparseApplyAdamAsyncKernel(int32 *item_idxs, if (var_default_v != nullptr && var_stored == false) { d_flags[var_slot_offset][offset_in_bank] = true; for (auto id = threadIdx.x; id < dim; id += blockDim.x) { - d_banks[var_slot_offset][offset_in_bank*dim + id] = var_default_v[(item_idx%var_default_v_num)*dim + id]; + d_banks[var_slot_offset][offset_in_bank*dim + id] = + var_default_v[(*(key_base + item_idx)%var_default_v_num)*dim + id]; } } if (v_default_v != nullptr && v_stored == false) { d_flags[v_slot_offset][offset_in_bank] = true; for (auto id = threadIdx.x; id < dim; id += blockDim.x) { - d_banks[v_slot_offset][offset_in_bank*dim + id] = v_default_v[(item_idx%v_default_v_num)*dim + id]; + d_banks[v_slot_offset][offset_in_bank*dim + id] = + v_default_v[(*(key_base + item_idx)%v_default_v_num)*dim + id]; } } if (m_default_v != nullptr && m_stored == false) { d_flags[m_slot_offset][offset_in_bank] = true; for (auto id = threadIdx.x; id < dim; id += blockDim.x) { - d_banks[m_slot_offset][offset_in_bank*dim + id] = m_default_v[(item_idx%m_default_v_num)*dim + id]; + d_banks[m_slot_offset][offset_in_bank*dim + id] = + m_default_v[(*(key_base + item_idx)%m_default_v_num)*dim + id]; } } @@ -494,9 +505,10 @@ struct KvSparseApplyAdamAsync { auto const block_size = 256; auto const grid_size = N; auto hashtable = var->HashTable(); - TF_CHECK_OK(GpuLaunchKernel(KvSparseApplyAdamAsyncKernel, + TF_CHECK_OK(GpuLaunchKernel(KvSparseApplyAdamAsyncKernel, grid_size, block_size, 0, d.stream(), - item_idxs, var->ValueLen(), hashtable->d_bank_ptrs, + indices_vec.data(), item_idxs, var->ValueLen(), + hashtable->d_bank_ptrs, hashtable->d_existence_flag_ptrs, var->EmbIdx(), v->EmbIdx(), m->EmbIdx(), var->SlotNum(), hashtable->initial_bank_size, beta1_scalar.data(), diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py index eb3cb3bc702..c5266b375a2 100644 --- a/tensorflow/python/feature_column/feature_column_v2.py +++ b/tensorflow/python/feature_column/feature_column_v2.py @@ -4332,6 +4332,7 @@ def _get_dense_tensor(self, admitted_ec, inputs, sequence_lengths[index] = fc_utils.sequence_length_from_sparse_tensor( sp_id) is_sequence = True + sp_id, _ = _prune_invalid_ids(sp_id, None) sp_ids.append(sp_id) combiners.append(ec.combiner) with variable_scope.variable_scope(ec._var_scope_name): From 7a4cab44ceee82d84c9dc2ec23f43083405337da Mon Sep 17 
00:00:00 2001 From: Chen Bangduo Date: Fri, 30 Jun 2023 09:20:15 +0800 Subject: [PATCH 33/91] [Graph] Support IteratorGetNext for SmartStage as a starting node for (#901) searching. Support IteratorGetNext node as the starting node for searching the boundary of stage subgraph. Signed-off-by: chenbangduo.cbd --- tensorflow/cc/BUILD | 2 +- tensorflow/core/BUILD | 1 + tensorflow/core/graph/smart_stage_pass.cc | 516 +++++++++++++----- tensorflow/core/protobuf/config.proto | 31 +- .../python/training/smart_stage_options.py | 111 ++++ tensorflow/python/training/training.py | 1 + 6 files changed, 528 insertions(+), 134 deletions(-) create mode 100644 tensorflow/python/training/smart_stage_options.py diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 49b3c4de3a3..94000b22a08 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -742,7 +742,7 @@ cc_library( hdrs = ["training/prefetch_runner.h"], deps = [ ":coordinator", - "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_base_no_ops", ], ) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 925f164d8b8..8ae5b4f156c 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -3504,6 +3504,7 @@ tf_cuda_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", "//third_party/eigen3", + "//tensorflow/cc:prefetch_runner", "//tensorflow/core/grappler/utils:functions", "//tensorflow/core/profiler:nvtx_utils", "//tensorflow/core/profiler/lib:traceme", diff --git a/tensorflow/core/graph/smart_stage_pass.cc b/tensorflow/core/graph/smart_stage_pass.cc index b30e0c8db20..087ff14a00d 100644 --- a/tensorflow/core/graph/smart_stage_pass.cc +++ b/tensorflow/core/graph/smart_stage_pass.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/cc/training/prefetch_runner.h" #include "tensorflow/core/common_runtime/optimization_registry.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/graph/graph_constructor.h" @@ -29,18 +30,19 @@ namespace tensorflow { class SmartStagePass : public GraphOptimizationPass { public: Status Run(const GraphOptimizationPassOptions& options) override { - if (options.session_options == nullptr) { + if (options.session_options == nullptr) return Status::OK(); - } bool is_enable_smart_stage = options.session_options->config.graph_options() .optimizer_options().do_smart_stage(); - if (is_enable_smart_stage) { + if (is_enable_smart_stage) LOG(INFO) << "Run SmartStage Optimization"; - } else { + else return Status::OK(); - } + + auto smart_stage_options = options.session_options->config.graph_options() + .optimizer_options().smart_stage_options(); Graph *graph = options.graph->get(); if (graph == nullptr) @@ -48,208 +50,466 @@ class SmartStagePass : public GraphOptimizationPass { std::unique_ptr new_graph(new Graph(OpRegistry::Global())); CopyGraph(*graph, new_graph.get()); - // Get Target Node. - std::vector target_nodes; - GetTargetNodesName(target_nodes); - - SmartStageGraph(new_graph, target_nodes); + TF_RETURN_IF_ERROR(SmartStageGraph(new_graph, smart_stage_options)); options.graph->swap(new_graph); return Status::OK(); } private: - void GetTargetNodesName(std::vector & target_nodes) { + Status SmartStageGraph(std::unique_ptr& g, + const SmartStageOptions& options) { + // Try to find Stage, UnStage node. 
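Per the config.proto and smart_stage_options.py entries in the diffstat above, the pass now reads tuning knobs from a `smart_stage_options` message under `GraphOptions.optimizer_options`. A hedged sketch of enabling it from Python; the field names follow the C++ accessors used later in this patch (`capacity()`, `timeout_millis()`, `name()`, `stage_subgraph_stream_id()`), but the exact proto spelling and the helper in the new smart_stage_options.py are assumptions:

```python
import tensorflow as tf

config = tf.ConfigProto()
opt = config.graph_options.optimizer_options
# existing gate checked by SmartStagePass::Run
opt.do_smart_stage = True
# assumed fields of the new SmartStageOptions message
opt.smart_stage_options.capacity = 4
opt.smart_stage_options.timeout_millis = 300000

with tf.Session(config=config) as sess:
    pass  # build and run the training graph as usual
```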
+ Node* stage_node; + Node* unstage_node; + TF_RETURN_IF_ERROR(GetStageUnStageNode(g, stage_node, unstage_node)); + if (stage_node != nullptr && unstage_node != nullptr) { + VLOG(1) + << "SmartStage: Start searching from a user-specified position."; + Status s = + SmartStageFromStageUnStageNode(g, stage_node, unstage_node); + return s; + } + + // Try to find IteratorGetNext node. + Node* get_next_node = nullptr; + TF_RETURN_IF_ERROR(GetIteratorGetNextNode(g, get_next_node)); + if (get_next_node != nullptr) { + VLOG(1) << "SmartStage: Start searching from IteratorGetNext node"; + Status s = SmartStageFromIteratorGetNextNode(g, get_next_node, options); + return s; + } + + LOG(WARNING) << "SmartStage: Failed to get starting position, SmartStage " + "is disabled. Or manually specify the starting position " + "using the 'tf.staged' interface"; + return Status::OK(); + } + + void GetTargetNodesName(std::unordered_set& target_nodes) { std::string tn; ReadStringFromEnvVar("TARGET_NODES_NAME", "", &tn); for (std::string s : str_util::Split(tn, ';')) { - target_nodes.push_back(s.substr(0, s.find_last_of(':'))); + target_nodes.insert(s.substr(0, s.find_last_of(':'))); } } - void SmartStageGraph(std::unique_ptr& g, - const std::vector& target_nodes) { - // Get Stage and UnStage node. - std::map stage_node_map; - std::map unstage_node_map; - for (Node* n : g.get()->op_nodes()) { + Status GetStageUnStageNode(const std::unique_ptr& g, + Node*& stage_node, Node*& unstage_node) { + stage_node = nullptr; + unstage_node = nullptr; + + unsigned int stage_counter = 0; + unsigned int unstage_counter = 0; + for (Node* n : g->op_nodes()) { if (n->IsStage()) { - std::string name = n->def().attr().at("shared_name").s(); - stage_node_map[name] = n; + stage_node = n; + stage_counter++; } else if (n->IsUnstage()) { - std::string name = n->def().attr().at("shared_name").s(); - unstage_node_map[name] = n; + unstage_node = n; + unstage_counter++; } } - for (auto it = stage_node_map.begin(); it != stage_node_map.end(); ++it) { - if (unstage_node_map.find(it->first) != unstage_node_map.end()) { - StageGraph(g.get(), it->second, unstage_node_map[it->first], - target_nodes); - } + if (stage_counter != unstage_counter) + return errors::Internal( + "the number of Stage nodes and UnStage nodes does not match."); + + if (stage_counter > 1) + return errors::Internal("there are multiple Stage nodes in the graph."); + + if (unstage_counter > 1) + return errors::Internal("there are multiple UnStage nodes in the graph."); + + if (stage_counter == 1) { + const std::string& s1 = stage_node->def().attr().at("shared_name").s(); + const std::string& s2 = unstage_node->def().attr().at("shared_name").s(); + if (s1 != s2) + return errors::Internal( + "the Stage node and the UnStage node in the graph do not match."); } + + return Status::OK(); } - void StageGraph(Graph* dest, Node* stage_node, Node* unstage_node, - const std::vector& target_nodes) { - std::string s1 = stage_node->def().attr().at("shared_name").s(); - std::string s2 = unstage_node->def().attr().at("shared_name").s(); - CHECK(s1 == s2); + Status GetIteratorGetNextNode(const std::unique_ptr& g, + Node*& get_next_node) { + get_next_node = nullptr; + unsigned int counter = 0; + for (Node* n : g->op_nodes()) { + if (n->type_string() == "IteratorGetNext") { + counter++; + get_next_node = n; + } + } + + if (counter > 1) + return errors::Internal( + "there are multiple IteratorGetNext nodes in the graph."); + return Status::OK(); + } + + Status 
+  Status SmartStageFromStageUnStageNode(std::unique_ptr<Graph>& g,
+                                        Node* stage_node, Node* unstage_node) {
+    // gather start_nodes and relink edge.
     std::vector<const Edge*> out_edges;
+    out_edges.reserve(unstage_node->out_edges().size());
     for (const Edge* e : unstage_node->out_edges()) {
-      if (!e->IsControlEdge()) {
-        out_edges.push_back(e);
-      }
+      // skip control edge.
+      if (e->IsControlEdge())
+        continue;
+      out_edges.emplace_back(e);
     }
-
-    std::unordered_set<Node*> source_node_set;
+    std::unordered_set<const Node*> start_nodes;
     for (const Edge* out_edge : out_edges) {
-      const Edge* in_edge = NULL;
+      // gather start node.
+      start_nodes.insert(out_edge->dst());
+      // reconnect edges cut by Stage node and UnStage node.
+      const Edge* in_edge = nullptr;
       int index = out_edge->src_output();
-      Status s = stage_node->input_edge(index, &in_edge);
-      TF_CHECK_OK(s);
-      Node* dst = out_edge->dst();
-      s = dest->UpdateEdge(in_edge->src(), in_edge->src_output(),
-                           out_edge->dst(), out_edge->dst_input());
-      TF_CHECK_OK(s);
-      source_node_set.insert(dst);
+      TF_RETURN_IF_ERROR(stage_node->input_edge(index, &in_edge));
+      TF_RETURN_IF_ERROR(g->UpdateEdge(in_edge->src(), in_edge->src_output(),
+                                       out_edge->dst(), out_edge->dst_input()));
     }
-
     std::vector<const Edge*> in_edges;
-    for (auto* e : stage_node->in_edges()) {
-      in_edges.push_back(e);
-    }
-
+    in_edges.reserve(stage_node->in_edges().size());
+    for (const Edge* e : stage_node->in_edges())
+      in_edges.emplace_back(e);
     for (const Edge* e : in_edges) {
-      dest->RemoveEdge(e);
+      g->RemoveEdge(e);
     }

-    std::vector<const Edge*> edge_vec;
-    GetStagingEdges(*dest, source_node_set, target_nodes, edge_vec);
+    std::unordered_set<const Node*> compute_graph_nodes;
+    MarkComputeGraph(g, compute_graph_nodes);
+
+    std::vector<const Edge*> stage_edges;
+    GetStageEdges(g, start_nodes, compute_graph_nodes, stage_edges);
+    Status s = AddStageNodeToGraph(g, stage_node, unstage_node, stage_edges,
+                                   SmartStageOptions());
+    return s;
+  }
+
+  Status SmartStageFromIteratorGetNextNode(std::unique_ptr<Graph>& g,
+                                           const Node* get_next_node,
+                                           const SmartStageOptions& options) {
+    // gather start_nodes
+    std::unordered_set<const Node*> start_nodes;
+    start_nodes.insert(get_next_node);
+
+    std::unordered_set<const Node*> compute_graph_nodes;
+    MarkComputeGraph(g, compute_graph_nodes);
+
+    std::vector<const Edge*> stage_edges;
+    GetStageEdges(g, start_nodes, compute_graph_nodes, stage_edges);

-    ModifyGraph(dest, stage_node, unstage_node, edge_vec);
+    Status s = AddStageNodeToGraph(g, nullptr, nullptr, stage_edges, options);
+    return s;
   }

-  void GetStagingEdges(const Graph& dest,
-                       const std::unordered_set<Node*>& source_node_set,
-                       const std::vector<std::string>& target_nodes,
-                       std::vector<const Edge*>& edge_vec) {
-    std::queue<Node*> q;
-    for (Node* n : dest.op_nodes()) {
+  void MarkComputeGraph(const std::unique_ptr<Graph>& g,
+                        std::unordered_set<const Node*>& compute_graph_nodes) {
+    // get target nodes.
+    std::unordered_set<std::string> target_nodes;
+    GetTargetNodesName(target_nodes);
+
+    // mark compute graph
+    std::queue<const Node*> queue;
+    for (const Node* n : g->op_nodes()) {
       if (n->IsVariable() || n->IsKvVarHandle() || n->IsPlaceholder() ||
           n->IsControlFlow() || n->type_string() == "VarHandleOp" ||
-          std::find(target_nodes.begin(), target_nodes.end(), n->name()) !=
-          target_nodes.end()) {
-        q.push(n);
+          target_nodes.count(n->name()) != 0) {
+        queue.push(n);
       }
     }
-
-    std::vector<bool> is_var_relate(dest.num_node_ids(), false);
-    while (!q.empty()) {
-      const Node* node = q.front();
-      q.pop();
-      is_var_relate[node->id()] = true;
+    while (!queue.empty()) {
+      const Node* node = queue.front();
+      queue.pop();
+      compute_graph_nodes.insert(node);
       for (const Edge* e : node->out_edges()) {
         if (e->dst()->type_string() == "_OPT_KvResourceLookupID") {
           continue;
-        } else if (!is_var_relate[e->dst()->id()]) {
-          q.push(e->dst());
+        } else if (compute_graph_nodes.count(e->dst()) == 0) {
+          queue.push(e->dst());
         }
       }
     }
+  }

-    std::queue<Node*> queue;
-    for (Node *n : source_node_set) {
+  void GetStageEdges(const std::unique_ptr<Graph>& g,
+                     const std::unordered_set<const Node*>& start_nodes,
+                     const std::unordered_set<const Node*>& compute_graph_nodes,
+                     std::vector<const Edge*>& stage_edges) {
+    std::queue<const Node*> queue;
+    for (const Node* n : start_nodes) {
       queue.push(n);
     }

-    std::unordered_set<Node*> has_visit_node;
+    std::unordered_set<const Node*> has_visit_node;
     while (!queue.empty()) {
-      Node *n = queue.front();
+      const Node* n = queue.front();
       queue.pop();
-      if (has_visit_node.find(n) != has_visit_node.end()) {
+      if (has_visit_node.count(n) != 0)
         continue;
-      }
       has_visit_node.insert(n);
-      for (auto edge : n->out_edges()) {
-        Node *dst = edge->dst();
-        if (is_var_relate[dst->id()]) {
-          edge_vec.push_back(edge);
-        } else {
+      for (const Edge* edge : n->out_edges()) {
+        const Node* dst = edge->dst();
+        if (compute_graph_nodes.count(dst) != 0)
+          stage_edges.push_back(edge);
+        else
           queue.push(dst);
-        }
       }
     }
   }

-  void ModifyGraph(Graph* dest, Node* stage_node, Node* unstage_node,
-                   std::vector<const Edge*>& edge_vec) {
-    std::vector<DataType> type_vec;
-    int i = 0;
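+  // Builds the stage-side TensorBufferPut node. When a user-created Stage
+  // node exists, its attributes (container, capacity, shared_name, timeout)
+  // are reused; otherwise the attributes are taken from SmartStageOptions.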
+  Status GenerateStageNode(std::unique_ptr<Graph>& g, const Node* stage_node,
+                           const std::vector<NodeDefBuilder::NodeOut>& src_list,
+                           const SmartStageOptions& options,
+                           Node*& new_stage_node,
+                           std::string& stage_node_name) {
+    NodeDef stage_node_def;
+    if (stage_node) {
+      stage_node_name = stage_node->name();
+      auto builder =
+          NodeDefBuilder(stage_node_name, "TensorBufferPut")
+              .Device(stage_node->requested_device())
+              .Input(src_list)
+              .Attr("container", stage_node->def().attr().at("container"))
+              .Attr("shared_capacity",
+                    stage_node->def().attr().at("shared_capacity"))
+              .Attr("shared_name", stage_node->def().attr().at("shared_name"))
+              .Attr("timeout_millis",
+                    stage_node->def().attr().at("timeout_millis"));
+
+      if (stage_node->def().attr().contains("_stream_id"))
+        builder.Attr("_stream_id", stage_node->def().attr().at("_stream_id"));
+
+      TF_RETURN_IF_ERROR(builder.Finalize(&stage_node_def));
+    } else {
+      std::string name_prefix = "prefetch";
+      if (!options.name().empty())
+        name_prefix = options.name();
+      stage_node_name = name_prefix + "/TensorBufferPut";
+
+      auto builder = NodeDefBuilder(stage_node_name, "TensorBufferPut")
+                         .Input(src_list)
+                         .Attr("shared_capacity", options.capacity())
+                         .Attr("shared_name", name_prefix)
+                         .Attr("timeout_millis", options.timeout_millis());
+
+      if (options.stage_subgraph_stream_id() > 0)
+        builder.Attr("_stream_id", options.stage_subgraph_stream_id());
+
+      TF_RETURN_IF_ERROR(builder.Finalize(&stage_node_def));
+    }
+
+    Status s;
+    new_stage_node = g->AddNode(stage_node_def, &s);
+    return s;
+  }
+
+  Status GenerateUnStageNode(std::unique_ptr<Graph>& g,
+                             const Node* unstage_node,
+                             const std::vector<DataType>& type_vec,
+                             const SmartStageOptions& options,
+                             Node*& new_unstage_node,
+                             std::string& unstage_node_name) {
+    NodeDef unstage_node_def;
+    if (unstage_node) {
+      unstage_node_name = unstage_node->name();
+      TF_RETURN_IF_ERROR(
+          NodeDefBuilder(unstage_node_name, "TensorBufferTake")
+              .Device(unstage_node->requested_device())
+              .Attr("container", unstage_node->def().attr().at("container"))
+              .Attr("dtypes", DataTypeSlice(type_vec))
+              .Attr("shared_capacity",
+                    unstage_node->def().attr().at("shared_capacity"))
+              .Attr("shared_name", unstage_node->def().attr().at("shared_name"))
+              .Attr("shared_threads",
+                    unstage_node->def().attr().at("shared_threads"))
+              .Finalize(&unstage_node_def));
+    } else {
+      std::string name_prefix = "prefetch";
+      if (!options.name().empty())
+        name_prefix = options.name();
+      unstage_node_name = name_prefix + "/TensorBufferTake";
+
+      int num_clients = 1;
+      if (options.num_clients() > 1)
+        num_clients = options.num_clients();
+
+      TF_RETURN_IF_ERROR(NodeDefBuilder(unstage_node_name, "TensorBufferTake")
+                             .Attr("dtypes", DataTypeSlice(type_vec))
+                             .Attr("shared_capacity", options.capacity())
+                             .Attr("shared_name", name_prefix)
+                             .Attr("shared_threads", num_clients)
+                             .Finalize(&unstage_node_def));
+    }
+
+    Status s;
+    new_unstage_node = g->AddNode(unstage_node_def, &s);
+    return s;
+  }
+
+  Status GenerateStageControlNodes(std::unique_ptr<Graph>& g,
+                                   const SmartStageOptions& options,
+                                   std::string& cancel_node_name,
+                                   std::string& resume_node_name,
+                                   std::string& close_node_name) {
+    std::string name_prefix = "prefetch";
+    if (!options.name().empty())
+      name_prefix = options.name();
+
+    Status s;
+    // Create cancel op.
+    NodeDef cancel_node_def;
+    cancel_node_name = name_prefix + "/TensorBufferCancel";
+    TF_RETURN_IF_ERROR(NodeDefBuilder(cancel_node_name, "TensorBufferCancel")
+                           .Attr("shared_name", name_prefix)
+                           .Attr("shared_capacity", options.capacity())
+                           .Finalize(&cancel_node_def));
+    g->AddNode(cancel_node_def, &s);
+    TF_RETURN_IF_ERROR(s);
+
+    // Create resume op.
+    NodeDef resume_node_def;
+    resume_node_name = name_prefix + "/TensorBufferResume";
+    TF_RETURN_IF_ERROR(NodeDefBuilder(resume_node_name, "TensorBufferCancel")
+                           .Attr("is_cancelled", false)
+                           .Attr("shared_name", name_prefix)
+                           .Attr("shared_capacity", options.capacity())
+                           .Finalize(&resume_node_def));
+    g->AddNode(resume_node_def, &s);
+    TF_RETURN_IF_ERROR(s);
+
+    // Create close op.
+    NodeDef close_node_def;
+    close_node_name = name_prefix + "/TensorBufferClose";
+    TF_RETURN_IF_ERROR(NodeDefBuilder(close_node_name, "TensorBufferClose")
+                           .Attr("shared_name", name_prefix)
+                           .Attr("shared_capacity", options.capacity())
+                           .Finalize(&close_node_def));
+    g->AddNode(close_node_def, &s);
+
+    return s;
+  }
+
+  void CreatePrefetchRunner(const SmartStageOptions& options,
+                            const std::string& fetch_op,
+                            const std::string& cancel_op,
+                            const std::string& resume_op,
+                            const std::string& close_op) {
+    auto runner_options = options.runner_options();
+    for (size_t i = 0; i < options.num_threads(); i++)
+      runner_options.add_fetch_ops(fetch_op);
+    runner_options.set_cancel_op(cancel_op);
+    runner_options.set_resume_op(resume_op);
+    runner_options.set_close_op(close_op);
+
+    std::string name_prefix = "prefetch";
+    if (!options.name().empty())
+      name_prefix = options.name();
+
+    auto prefetch_runner_mgr = PrefetchRunnerMgr::singleton();
+    prefetch_runner_mgr->RegisterPrefetchRunner(
+        options.graph_key(), name_prefix + "_prefetch_runner", runner_options);
+  }
+
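+  // Inserts the new stage/unstage pair into the graph: each distinct source
+  // tensor is staged only once (deduplicated via edge_map), producer edges
+  // are routed into TensorBufferPut, consumer edges are fed from
+  // TensorBufferTake, and the rewritten graph is checked for cycles.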
+  Status AddStageNodeToGraph(std::unique_ptr<Graph>& g,
+                             Node* stage_node,
+                             Node* unstage_node,
+                             std::vector<const Edge*>& stage_edges,
+                             const SmartStageOptions& options) {
+    int index = 0;
     std::map<std::string, int> edge_map;
+    std::vector<DataType> type_vec;
     std::vector<NodeDefBuilder::NodeOut> src_list;
     std::map<const Edge*, int> edge_to_stage;
     std::map<const Edge*, int> edge_to_unstage;
-    for (const Edge* e : edge_vec) {
+    for (const Edge* e : stage_edges) {
       if (e->IsControlEdge()) {
-        // control flow is implemented by stage node and unstage node, remove control edge.
-        dest->RemoveEdge(e);
+        // control flow is implemented by stage node and unstage node, remove
+        // control edge.
+        g->RemoveEdge(e);
         continue;
       }
       std::string name = e->src()->name() + std::to_string(e->src_output());
-      if (edge_map.find(name) == edge_map.end()) {
+      if (edge_map.count(name) == 0) {
         type_vec.push_back(e->src()->output_type(e->src_output()));
-        src_list.emplace_back(e->src()->name(), e->src_output(), e->src()->output_type(e->src_output()));
-        edge_to_stage[e] = i;
-        edge_map[name] = i;
-        ++i;
+        src_list.emplace_back(e->src()->name(), e->src_output(),
+                              e->src()->output_type(e->src_output()));
+        edge_to_stage[e] = index;
+        edge_map[name] = index;
+        ++index;
       }
       edge_to_unstage[e] = edge_map[name];
     }

-    NodeDef node_def_stage;
-    TF_CHECK_OK(NodeDefBuilder(stage_node->name(), "TensorBufferPut")
-        .Device(stage_node->requested_device())
-        .Input(src_list)
-        .Attr("container", stage_node->def().attr().at("container"))
-        .Attr("shared_capacity", stage_node->def().attr().at("shared_capacity"))
-        .Attr("shared_name", stage_node->def().attr().at("shared_name"))
-        .Attr("timeout_millis", stage_node->def().attr().at("timeout_millis"))
-        .Finalize(&node_def_stage));
-    if (stage_node->def().attr().contains("_stream_id")) {
-      auto stream_id_attr = stage_node->def().attr().at("_stream_id");
-      node_def_stage.mutable_attr()->insert({"_stream_id", stream_id_attr});
-    }
-    Status s;
-    Node* stage_xxx = dest->AddNode(node_def_stage, &s);
-    TF_CHECK_OK(s);
-    dest->RemoveNode(stage_node);
-
-    NodeDef node_def_unstage;
-    TF_CHECK_OK(NodeDefBuilder(unstage_node->name(), "TensorBufferTake")
-        .Device(unstage_node->requested_device())
-        .Attr("container", unstage_node->def().attr().at("container"))
-        .Attr("dtypes", DataTypeSlice(type_vec))
-        .Attr("shared_capacity", unstage_node->def().attr().at("shared_capacity"))
-        .Attr("shared_name", unstage_node->def().attr().at("shared_name"))
-        .Attr("shared_threads", unstage_node->def().attr().at("shared_threads"))
-        .Finalize(&node_def_unstage));
-    Node* unstage_xxx = dest->AddNode(node_def_unstage, &s);
-    TF_CHECK_OK(s);
-    dest->RemoveNode(unstage_node);
+    Node* new_stage_node;
+    std::string stage_node_name;
+    TF_RETURN_IF_ERROR(GenerateStageNode(g, stage_node, src_list, options,
+                                         new_stage_node, stage_node_name));
+
+    Node* new_unstage_node;
+    std::string unstage_node_name;
+    TF_RETURN_IF_ERROR(GenerateUnStageNode(g, unstage_node, type_vec, options,
+                                           new_unstage_node,
+                                           unstage_node_name));

     for (auto it = edge_to_stage.begin(); it != edge_to_stage.end(); ++it) {
       const Edge* e = it->first;
-      dest->AddEdge(e->src(), e->src_output(), stage_xxx, it->second);
+      g->AddEdge(e->src(), e->src_output(), new_stage_node, it->second);
     }

     for (auto it = edge_to_unstage.begin(); it != edge_to_unstage.end(); ++it) {
       const Edge* e = it->first;
-      Status s = dest->UpdateEdge(unstage_xxx, it->second, e->dst(), e->dst_input());
-      TF_CHECK_OK(s);
+      TF_RETURN_IF_ERROR(
+          g->UpdateEdge(new_unstage_node, it->second, e->dst(), e->dst_input()));
+    }
+
+    if (stage_node != nullptr && unstage_node != nullptr) {
+      g->RemoveNode(stage_node);
+      g->RemoveNode(unstage_node);
+    } else {
+      // need to create `tensor_buffer_cancel`, `tensor_buffer_resume`,
+      // `tensor_buffer_close` and PrefetchRunner.
+      std::string cancel_node_name;
+      std::string resume_node_name;
+      std::string close_node_name;
+      TF_RETURN_IF_ERROR(GenerateStageControlNodes(
+          g, options, cancel_node_name, resume_node_name, close_node_name));
+      CreatePrefetchRunner(options, stage_node_name, cancel_node_name,
+                           resume_node_name, close_node_name);
+    }
+
+    TF_RETURN_IF_ERROR(CheckGraphCircle(new_stage_node, new_unstage_node));
+
+    return Status::OK();
+  }
+
+  Status CheckGraphCircle(Node* stage_node, Node* unstage_node) {
+    std::unordered_set<const Node*> accessed;
+    std::queue<const Node*> queue;
+    queue.push(unstage_node);
+
+    while (!queue.empty()) {
+      const Node* node = queue.front();
+      queue.pop();
+
+      if (accessed.count(node) != 0)
+        continue;
+
+      accessed.insert(node);
+
+      if (node == stage_node)
+        return errors::Internal(
+            "there is a cycle in the graph after smart stage.");
+
+      for (const Edge* e : node->out_edges())
+        queue.push(e->dst());
     }
+    return Status::OK();
   }
 };

diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 5b5c5b07f6f..98d7f55fb5a 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -241,6 +241,26 @@ message PrefetchRunnerOptions {
   repeated error.Code ignored_exceptions= 8;
 }

+// Options passed to the smart stage pass
+message SmartStageOptions {
+  // Max number of samples to keep in the buffer.
+  int32 capacity = 1;
+  // Number of threads for prefetching.
+  int32 num_threads = 2;
+  // Number of clients of prefetched sample.
+  int32 num_clients = 3;
+  // Max milliseconds put op can take.
+  int32 timeout_millis = 4;
+  // Specifies which stream to use for the Stage subgraph.
+  int32 stage_subgraph_stream_id = 5;
+  // Options passed to the prefetch runner.
+  PrefetchRunnerOptions runner_options = 6;
+  // Key of Graph.
+  string graph_key = 7;
+  // Name of prefetching operations.
+ string name = 8; +} + // Options passed to the async embedding message AsyncEmbeddingOptions { // Prefetch threads num @@ -303,11 +323,12 @@ message OptimizerOptions { bool do_op_fusion = 7; int32 micro_batch_num = 9; bool do_smart_stage = 10; - bool stage_subgraph_on_cpu = 11; - bool do_async_embedding = 12; - AsyncEmbeddingOptions async_embedding_options = 13; - bool device_placement_optimization = 14; - bool stage_multi_stream = 15; + SmartStageOptions smart_stage_options = 11; + bool stage_subgraph_on_cpu = 12; + bool do_async_embedding = 13; + AsyncEmbeddingOptions async_embedding_options = 14; + bool device_placement_optimization = 15; + bool stage_multi_stream = 16; } message GraphOptions { diff --git a/tensorflow/python/training/smart_stage_options.py b/tensorflow/python/training/smart_stage_options.py new file mode 100644 index 00000000000..a8fbf9ddbf1 --- /dev/null +++ b/tensorflow/python/training/smart_stage_options.py @@ -0,0 +1,111 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.framework import errors +from tensorflow.python.framework import ops +from tensorflow.python.util.tf_export import tf_export + +PREFETCH = "prefetch" + +@tf_export(v1=["SmartStageOptions"]) +def SmartStageOptions( + capacity=1, + num_threads=1, + num_clients=1, + timeout_millis=300000, + closed_exception_types=(errors.OUT_OF_RANGE,), + ignored_exception_types=(), + use_stage_subgraph_thread_pool=False, + stage_subgraph_thread_pool_id=0, + stage_subgraph_stream_id=0, + graph=None, + name=None): + """Generate SmartStageOptions. + + Args: + capacity: (Optional.) Max number of samples to keep in the buffer. + num_threads: (Optional.) Number of threads for prefetching. 1 by + default. + num_clients: (Optional.) Number of clients of prefetched sample. 1 by + default. + timeout_millis: (Optional.) Max milliseconds put op can take, 5 min by + default. + closed_exception_types: (Optional.) Exception types indicating that the + prefetching is normally finished. Defaults to + `(errors.OUT_OF_RANGE,)`. + ignored_exception_types: (Optional.) Exception types indicating that the + prefetching can continue. Defaults to `()`. + use_stage_subgraph_thread_pool: (Optional.) Use stage subgraph thread pool + to run stage graph or not. + stage_subgraph_thread_pool_id: (Optional.) Specifies the stage subgraph + thread pool to use when enable use_stage_subgraph_thread_pool. 0 by default. + stage_subgraph_stream_id: (Optional.) Specifies which stream to use for the + Stage subgraph. The default value is 0. + graph: (Optional.) Specify the graph for SmartStage, which is the graph + passed to the Session. + name: (Optional.) Name of prefetching operations. + + Returns: + SmartStageOptions. 
+ """ + options = config_pb2.SmartStageOptions() + if capacity < 1: + raise ValueError('capacity must >= 1') + options.capacity = capacity + + if num_threads < 1: + raise ValueError('num_threads must >= 1') + options.num_threads = num_threads + + if num_clients < 1: + raise ValueError('num_clients must >= 1') + options.num_clients = num_clients + + if timeout_millis <= 0: + raise ValueError('timeout_millis must > 0') + options.timeout_millis = timeout_millis + + for err_code in closed_exception_types: + options.runner_options.closed_exceptions.append(err_code) + + for err_code in ignored_exception_types: + options.runner_options.ignored_exceptions.append(err_code) + + options.runner_options.run_options.use_stage_subgraph_thread_pool = \ + use_stage_subgraph_thread_pool + + if stage_subgraph_thread_pool_id < 0: + raise ValueError('stage_subgraph_thread_pool_id must >= 0') + options.runner_options.run_options.stage_subgraph_thread_pool_id = \ + stage_subgraph_thread_pool_id + + if stage_subgraph_stream_id < 0: + raise ValueError('stage_subgraph_stream_id >= 0') + options.stage_subgraph_stream_id = stage_subgraph_stream_id + + if graph is None: + graph = ops.get_default_graph() + options.graph_key = graph._graph_key + + if name is None: + name = ops.get_default_graph().unique_name(PREFETCH) + options.name = name + + return options diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py index 043236b038c..ad390e93b5c 100644 --- a/tensorflow/python/training/training.py +++ b/tensorflow/python/training/training.py @@ -114,6 +114,7 @@ from tensorflow.python.training.training_util import get_or_create_global_step from tensorflow.python.training.warm_starting_util import VocabInfo from tensorflow.python.training.warm_starting_util import warm_start +from tensorflow.python.training.smart_stage_options import SmartStageOptions from tensorflow.python.pywrap_tensorflow import do_quantize_training_on_graphdef from tensorflow.python.pywrap_tensorflow import NewCheckpointReader from tensorflow.python.util.tf_export import tf_export From 2c90ada9444f4087b9133b1abd3b316b329f43f1 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Mon, 3 Jul 2023 15:57:32 +0800 Subject: [PATCH 34/91] [Docs] Update document for SmartStage. (#902) Signed-off-by: chenbangduo.cbd --- docs/docs_en/Smart-Stage.md | 119 ++++++++++++++++++++++++++----- docs/docs_zh/Smart-Stage.md | 136 ++++++++++++++++++++++++++++++------ 2 files changed, 217 insertions(+), 38 deletions(-) diff --git a/docs/docs_en/Smart-Stage.md b/docs/docs_en/Smart-Stage.md index b322f034838..f8ff4731fea 100644 --- a/docs/docs_en/Smart-Stage.md +++ b/docs/docs_en/Smart-Stage.md @@ -8,30 +8,116 @@ DeepRec provides the stage feature, which can realize the asynchronous execution ## Feature -There is a stage in the user's original graph, by enabling the smart stage feature, it automatically optimizes the maximum possible stage range and modifies the actual physical calculation graph (without affecting the Graphdef), improving performance. +By enabling the smart stage feature, it automatically optimizes the maximum possible stage range from a certain starting node and modifies the actual physical calculation graph (without affecting the Graphdef), improving performance. +## API +### 1. Automatically SmartStage (Recommend) +The premise of automatic SmartStage is that the model uses the `tf.data.Iterator` interface to read sample data from `tf.data.Dataset`. + +1. 
+
+1. The `tf.SmartStageOptions` interface returns the configuration for executing the stage subgraph, and its parameters are as follows:
+
+| parameter | description | default value |
+| --- | --- | --- |
+| capacity | Maximum number of cached asynchronous execution results. | 1 |
+| num_threads | Number of threads that execute the stage subgraph asynchronously. | 1 |
+| num_clients | Number of consumers of the prefetched samples. | 1 |
+| timeout_millis | Maximum time in milliseconds the put op may wait for buffer space; the prefetched result is discarded on timeout. | 300000 ms |
+| closed_exception_types | Exception types recognized as graceful exits. | (`tf.errors.OUT_OF_RANGE`,) |
+| ignored_exception_types | Exception types that are ignored and skipped. | () |
+| use_stage_subgraph_thread_pool | Whether to run the stage subgraph on an independent thread pool; the thread pool must be created first. | False (if True, an independent thread pool must be created first) |
+| stage_subgraph_thread_pool_id | Index of the independent thread pool to use when use_stage_subgraph_thread_pool is enabled; the thread pool must be created first. | 0, the index range is [0, number of independent thread pools created - 1] |
+| stage_subgraph_stream_id | In the GPU Multi-Stream scenario, the index of the GPU stream used by the stage subgraph. | 0 (0 means the stage subgraph shares the GPU stream used by the main graph; the index range is [0, total number of GPU streams - 1]) |
+| graph | The Graph to be optimized by SmartStage; it must be the same Graph that is passed to the Session. | None (use the default graph) |
+| name | Name of the prefetching operations. | None (automatically generated) |
+
+> For how to create an independent thread pool or use GPU Multi-Stream, please refer to [Pipeline-Stage](./Stage.md).
+
+2. The configuration generated by the `tf.SmartStageOptions` interface needs to be assigned to `tf.ConfigProto`.
+   ```python
+   sess_config = tf.ConfigProto()
+   smart_stage_options = tf.SmartStageOptions(capacity=40, num_threads=4)
+   sess_config.graph_options.optimizer_options.smart_stage_options.CopyFrom(smart_stage_options)
+   ```
+
+3. Set the following options in `tf.ConfigProto` to enable SmartStage.
+   - CPU scenario
+     ```python
+     sess_config = tf.ConfigProto()
+     sess_config.graph_options.optimizer_options.do_smart_stage = True
+     ```
+   - GPU scenario
+     ```python
+     sess_config = tf.ConfigProto()
+     sess_config.graph_options.optimizer_options.do_smart_stage = True
+     sess_config.graph_options.optimizer_options.stage_subgraph_on_cpu = True
+     ```
+4. Add the `tf.make_prefetch_hook()` hook to the Session.
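+
+Putting steps 1-4 together, a minimal sketch of the enablement code (the
+`capacity` and `num_threads` values are illustrative; a full runnable example
+is given in the Example section below):
+
+```python
+sess_config = tf.ConfigProto()
+sess_config.graph_options.optimizer_options.do_smart_stage = True
+smart_stage_options = tf.SmartStageOptions(capacity=40, num_threads=4)
+sess_config.graph_options.optimizer_options.smart_stage_options.CopyFrom(
+    smart_stage_options)
+
+with tf.train.MonitoredTrainingSession(
+    config=sess_config, hooks=[tf.make_prefetch_hook()]) as sess:
+  ...  # training loop
+```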
+
+### 2. SmartStage when Graph contains Stage
+The original graph has already been split manually using the `tf.staged` interface.
+> For more details on `tf.staged`, please refer to [Pipeline-Stage](./Stage.md).
+
+1. Set the following options in `tf.ConfigProto` to enable SmartStage.
+   - CPU scenario
+     ```python
+     sess_config = tf.ConfigProto()
+     sess_config.graph_options.optimizer_options.do_smart_stage = True
+     ```
+   - GPU scenario
+     ```python
+     sess_config = tf.ConfigProto()
+     sess_config.graph_options.optimizer_options.do_smart_stage = True
+     sess_config.graph_options.optimizer_options.stage_subgraph_on_cpu = True
+     ```
+2. Add the `tf.make_prefetch_hook()` hook to the Session.

-**Attention**:The prerequisite for this feature is that there is at least one stage in the user's original image
-
-## API
-ConfigProro defines the following configuration options.
+## Example
+#### Automatic SmartStage (Recommended)
+```python
+import tensorflow as tf

-- CPU scenario
+def parse_csv(value):
+  v = tf.io.decode_csv(value, record_defaults=[[''], ['']])
+  return v

-```python
-sess_config = tf.ConfigProto()
-sess_config.graph_options.optimizer_options.do_smart_stage = True
-```
-- GPU scenario
+dataset = tf.data.TextLineDataset('./test_data.csv')
+dataset = dataset.batch(2)
+dataset = dataset.map(parse_csv, num_parallel_calls=2)
+dataset_output_types = tf.data.get_output_types(dataset)
+dataset_output_shapes = tf.data.get_output_shapes(dataset)
+iterator = tf.data.Iterator.from_structure(dataset_output_types, dataset_output_shapes)
+xx = iterator.get_next()
+xx = list(xx)
+
+init_op = iterator.make_initializer(dataset)

-```python
-sess_config = tf.ConfigProto()
-sess_config.graph_options.optimizer_options.do_smart_stage = True
-sess_config.graph_options.optimizer_options.stage_subgraph_on_cpu = True
-```
+var = tf.get_variable("var", shape=[100, 3], initializer=tf.ones_initializer())
+xx[0] = tf.string_to_hash_bucket(xx[0], num_buckets=10)
+xx[0] = tf.nn.embedding_lookup(var, xx[0])
+xx[1] = tf.concat([xx[1], ['xxx']], axis=0)
+target = tf.concat([tf.as_string(xx[0]), [xx[1], xx[1]]], 0)

-## Example
+config = tf.ConfigProto()
+# enable smart stage
+config.graph_options.optimizer_options.do_smart_stage = True
+smart_stage_options = tf.SmartStageOptions(capacity=1, num_threads=1)
+config.graph_options.optimizer_options.smart_stage_options.CopyFrom(smart_stage_options)
+
+# For GPU training, consider enabling the following options for better performance
+# config.graph_options.optimizer_options.stage_subgraph_on_cpu = True
+
+# mark target node
+tf.train.mark_target_node([target])

+scaffold = tf.train.Scaffold(
+    local_init_op=tf.group(tf.local_variables_initializer(), init_op))
+with tf.train.MonitoredTrainingSession(config=config, scaffold=scaffold,
+                                       hooks=[tf.make_prefetch_hook()]) as sess:
+  for i in range(5):
+    print(sess.run([target]))
+```
+#### SmartStage when Graph contains Stage
 ```python
 import tensorflow as tf
@@ -54,6 +140,7 @@ config = tf.ConfigProto()
 config.graph_options.optimizer_options.do_smart_stage = True
 # For GPU training, consider enabling the following options for better performance
 # config.graph_options.optimizer_options.stage_subgraph_on_cpu = True
+
 # mark target node
 tf.train.mark_target_node([target])

diff --git a/docs/docs_zh/Smart-Stage.md b/docs/docs_zh/Smart-Stage.md
index 3445d31e83f..027cbec3d1b 100644
--- a/docs/docs_zh/Smart-Stage.md
+++ b/docs/docs_zh/Smart-Stage.md
@@ -5,29 +5,120 @@ DeepRec已经提供了stage 功能,该功能可以实现IO Bound操作和计

 由于`tf.staged`需要用户指定stage的边界,一方面会增加使用难度,另一方面会导致stage颗粒度不够精细,难以做到更多op的异步执行。因此我们提出了SmartStage功能。用户不需要对TF Graph有OP级别理解的情况下,就可以使stage发挥最大的性能提升。

 ## 功能说明
-在用户的原图中有stage阶段的前提下,通过开启smart stage功能,自动化的寻优最大可以stage的范围,修改实际物理计算图(不影响Graphdef图),从而提高性能。
+通过开启smart stage功能,自动化地寻找最大可以stage的范围,修改实际物理计算图(不影响Graphdef图),从而提高性能。
+
+## 用户接口
+### 1. 
自动SmartStage(推荐) +自动SmartStage的前提是模型使用了`tf.data.Iterator`接口从`tf.data.Dataset`中读取样本数据。 + +1. `tf.SmartStageOptions`接口返回执行stage子图的配置,其参数如下: + +| 参数 | 含义 | 默认 | +| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------ | +| capacity | 缓存的异步化执行结果的最大个数 | 1 | +| num_threads | 异步化执行stage子图的线程数 | 1 | +| num_clients | 消耗预取结果的消费者数量 | 1 | +| timeout_millis | 预取结果等待缓存区可用的最大等待时间,超时后本次预取结果将会被丢弃 | 300000 ms | +| closed_exception_types | 被识别为正常退出的异常类型 | (`tf.errors.OUT_OF_RANGE`,) | +| ignored_exception_types | 被识别可忽略跳过的异常类型 | () | +| use_stage_subgraph_thread_pool | 是否在独立线程池上运行Stage子图,需要先创建独立线程池 | False(若为True则必须先创建独立线程池) | +| stage_subgraph_thread_pool_id | 如果开启了在独立线程池上运行Stage子图,用于指定独立线程池索引,需要先创建独立线程池,并打开use_stage_subgraph_thread_pool选项 | 0,索引范围为[0, 创建的独立线程池数量-1] | +| stage_subgraph_stream_id | GPU Multi-Stream 场景下, stage子图执行使用的gpu stream的索引 | 0 (0表示stage子图共享计算主图使用的gpu stream, 索引范围为[0, gpu stream总数-1]) | +| graph | 需要执行SmartStage优化的Graph,需要与传递给Session的Graph相同 | None (表示使用默认Graph) | +| name | 预取操作的名称 | None (表示自动生成) | + +> 关于如何创建独立线程池以及如何使用GPU Multi-Stream,请参见[流水线](./Stage.md)。 + +2. `tf.SmartStageOptions`接口生成的配置需要赋值给`tf.ConfigProto`。 + ```python + sess_config = tf.ConfigProto() + smart_stage_options = tf.SmartStageOptions(capacity=40, num_threads=4) + sess_config.graph_options.optimizer_options.smart_stage_options.CopyFrom(smart_stage_options) + ``` +3. 设置`tf.ConfigProto`中的如下选项来开启SmartStage。 + - CPU场景 + ```python + sess_config = tf.ConfigProto() + sess_config.graph_options.optimizer_options.do_smart_stage = True # 开启SmartStage + ``` + + - GPU场景 + ```python + sess_config = tf.ConfigProto() + sess_config.graph_options.optimizer_options.do_smart_stage = True # 开启SmartStage + sess_config.graph_options.optimizer_options.stage_subgraph_on_cpu = True # 针对GPU场景优化的选项 + ``` + +4. Session中加入`tf.make_prefetch_hook()` hook + +### 2. 图中存在Stage阶段时的SmartStage +原图已经使用`tf.staged`接口手动分图。 +> 关于`tf.staged`接口请参见[流水线](./Stage.md)。 + +1. 直接设置`tf.ConfigProto`中的相关选项即可开启SmartStage。 + **CPU场景** + ```python + sess_config = tf.ConfigProto() + sess_config.graph_options.optimizer_options.do_smart_stage = True # 开启SmartStage + ``` + + **GPU场景** + ```python + sess_config = tf.ConfigProto() + sess_config.graph_options.optimizer_options.do_smart_stage = True # 开启SmartStage + sess_config.graph_options.optimizer_options.stage_subgraph_on_cpu = True # 针对GPU场景优化的选项 + ``` + +2. 
Session中加入`tf.make_prefetch_hook()` hook

-**注意**:该功能的先决条件是,用户的原图中存在至少一个stage阶段
-
-## 用户接口(CPU 场景)
-ConfigProro中定义了如下配置选项
+## 代码示例
+### 自动SmartStage(推荐)

-```python
-sess_config = tf.ConfigProto()
-sess_config.graph_options.optimizer_options.do_smart_stage = True # 通用优化选项
-```
-## 用户接口(GPU 场景)
+```python
+import tensorflow as tf

-ConfigProro中定义了如下配置选项
+def parse_csv(value):
+  v = tf.io.decode_csv(value, record_defaults=[[''], ['']])
+  return v
+
+dataset = tf.data.TextLineDataset('./test_data.csv')
+dataset = dataset.batch(2)
+dataset = dataset.map(parse_csv, num_parallel_calls=2)
+dataset_output_types = tf.data.get_output_types(dataset)
+dataset_output_shapes = tf.data.get_output_shapes(dataset)
+iterator = tf.data.Iterator.from_structure(dataset_output_types, dataset_output_shapes)
+xx = iterator.get_next()
+xx = list(xx)
+
+init_op = iterator.make_initializer(dataset)

-```python
-sess_config = tf.ConfigProto()
-sess_config.graph_options.optimizer_options.do_smart_stage = True
-sess_config.graph_options.optimizer_options.stage_subgraph_on_cpu = True # 针对GPU训练优化的选项
-```
+var = tf.get_variable("var", shape=[100, 3], initializer=tf.ones_initializer())
+xx[0] = tf.string_to_hash_bucket(xx[0], num_buckets=10)
+xx[0] = tf.nn.embedding_lookup(var, xx[0])
+xx[1] = tf.concat([xx[1], ['xxx']], axis=0)
+target = tf.concat([tf.as_string(xx[0]), [xx[1], xx[1]]], 0)

-## 代码示例
+config = tf.ConfigProto()
+# enable smart stage
+config.graph_options.optimizer_options.do_smart_stage = True
+smart_stage_options = tf.SmartStageOptions(capacity=1, num_threads=1)
+config.graph_options.optimizer_options.smart_stage_options.CopyFrom(smart_stage_options)
+
+# 对于GPU训练,可以考虑开启以下选项来获得更好的性能
+# config.graph_options.optimizer_options.stage_subgraph_on_cpu = True
+
+# mark target 节点
+tf.train.mark_target_node([target])
+
+scaffold = tf.train.Scaffold(
+    local_init_op=tf.group(tf.local_variables_initializer(), init_op))
+with tf.train.MonitoredTrainingSession(config=config, scaffold=scaffold,
+                                       hooks=[tf.make_prefetch_hook()]) as sess:
+  for i in range(5):
+    print(sess.run([target]))
+```
+
+### 图中存在Stage阶段时的SmartStage
 ```python
 import tensorflow as tf
@@ -50,15 +141,18 @@ config = tf.ConfigProto()
 config.graph_options.optimizer_options.do_smart_stage = True
 # 对于GPU训练,可以考虑开启以下选项来获得更好的性能
 # config.graph_options.optimizer_options.stage_subgraph_on_cpu = True
+
 # mark target 节点
 tf.train.mark_target_node([target])

 with tf.train.MonitoredTrainingSession(config=config,
                                        hooks=[tf.make_prefetch_hook()]) as sess:
   for i in range(5):
     print(sess.run([target]))
 ```

-## 性能对比(CPU场景)
+
+## 性能对比
+### CPU场景
 在modelzoo中的DLRM模型中测试该功能

 机型为Aliyun ECS 实例 ecs.hfg7.8xlarge
@@ -74,9 +168,7 @@ with tf.train.MonitoredTrainingSession(config=config,
 | DLRM | w/o smart stage | 201 (baseline) |
 | DLRM | w/ smart stage | 212 (+ 1.05x) |

-## 性能对比(GPU场景)
-
-------
+### GPU场景

 在modelzoo中的模型测试该功能在GPU训练场景下的性能。

From e810009b96ab2a7ccda7a5a736859d1f9a28b241 Mon Sep 17 00:00:00 2001
From: shijieliu
Date: Tue, 4 Jul 2023 15:58:27 +0800
Subject: [PATCH 35/91] update sok to 1.20 (#898)

Signed-off-by: aleliu
---
 tensorflow/tools/pip_package/build_sok.sh | 2 +-
 tensorflow/workspace.bzl                  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/pip_package/build_sok.sh b/tensorflow/tools/pip_package/build_sok.sh
index caad68de90e..2c99ceb5ac1 100755
--- a/tensorflow/tools/pip_package/build_sok.sh
+++ b/tensorflow/tools/pip_package/build_sok.sh
@@ -17,4 +17,4 @@ export SOK_COMPILE_GPU_SM="70;75;80"
 cd 
./bazel-DeepRec/external/hugectr/sparse_operation_kit "${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel -pip install ./dist/merlin_sok-1.1.4-cp38-cp38-linux_x86_64.whl \ No newline at end of file +pip install ./dist/merlin_sok-1.2.0-cp38-cp38-linux_x86_64.whl diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 9e99cd19904..3495efd182d 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -1372,9 +1372,9 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): http_archive( name = "hugectr", # Apache License 2.0 build_file = "//third_party:hugectr.BUILD", - strip_prefix = "HugeCTR-23.05.01", + strip_prefix = "HugeCTR-23.06.00", urls = [ - "https://github.com/NVIDIA-Merlin/HugeCTR/archive/refs/tags/v23.05.01.tar.gz", + "https://github.com/NVIDIA-Merlin/HugeCTR/archive/refs/tags/v23.06.00.tar.gz", ], ) From 3c4796b3ff94e6eaf89b63fd5cff82f503bf4ade Mon Sep 17 00:00:00 2001 From: RTXUX Date: Wed, 5 Jul 2023 11:01:57 +0800 Subject: [PATCH 36/91] [Dockerfile] Fix syntax error in Dockerfile. (#904) Signed-off-by: RTXUX --- .../Dockerfile.base-py3.6-cu114-ubuntu18.04 | 2 +- .../Dockerfile.base-py3.6-cu116-ubuntu18.04 | 2 +- .../Dockerfile.base-py3.8-cu117-ubuntu22.04 | 2 +- .../Dockerfile.base-py3.8-ubuntu20.04 | 2 +- .../Dockerfile.devel-py3.6-cu116-ubuntu18.04 | 16 +++++++-------- .../Dockerfile.devel-py3.8-cu116-ubuntu20.04 | 20 +++++++++---------- .../Dockerfile.devel-py3.8-cu117-ubuntu22.04 | 20 +++++++++---------- .../Dockerfile.devel-py3.8-ubuntu20.04 | 20 +++++++++---------- .../Dockerfile.devel-py3.8-ubuntu22.04 | 20 +++++++++---------- 9 files changed, 52 insertions(+), 52 deletions(-) diff --git a/cibuild/dockerfiles/Dockerfile.base-py3.6-cu114-ubuntu18.04 b/cibuild/dockerfiles/Dockerfile.base-py3.6-cu114-ubuntu18.04 index 0a0c14866d1..9baef145fa3 100644 --- a/cibuild/dockerfiles/Dockerfile.base-py3.6-cu114-ubuntu18.04 +++ b/cibuild/dockerfiles/Dockerfile.base-py3.6-cu114-ubuntu18.04 @@ -19,7 +19,7 @@ RUN apt-get update && \ RUN pip install \ astor==0.8.1 \ - numpy==1.16.6 && \ + numpy==1.16.6 \ protobuf==3.17.3 && \ pip install --no-deps \ keras-preprocessing==1.0.5 diff --git a/cibuild/dockerfiles/Dockerfile.base-py3.6-cu116-ubuntu18.04 b/cibuild/dockerfiles/Dockerfile.base-py3.6-cu116-ubuntu18.04 index a73ce082d8a..00d06b42ff0 100644 --- a/cibuild/dockerfiles/Dockerfile.base-py3.6-cu116-ubuntu18.04 +++ b/cibuild/dockerfiles/Dockerfile.base-py3.6-cu116-ubuntu18.04 @@ -19,7 +19,7 @@ RUN apt-get update && \ RUN pip install \ astor==0.8.1 \ - numpy==1.16.6 && \ + numpy==1.16.6 \ protobuf==3.17.3 && \ pip install --no-deps \ keras-preprocessing==1.0.5 diff --git a/cibuild/dockerfiles/Dockerfile.base-py3.8-cu117-ubuntu22.04 b/cibuild/dockerfiles/Dockerfile.base-py3.8-cu117-ubuntu22.04 index 879bbeb81c4..5a923d3ddf6 100644 --- a/cibuild/dockerfiles/Dockerfile.base-py3.8-cu117-ubuntu22.04 +++ b/cibuild/dockerfiles/Dockerfile.base-py3.8-cu117-ubuntu22.04 @@ -51,7 +51,7 @@ RUN pip install wheel==0.37.1 RUN pip install \ astor==0.8.1 \ - numpy==1.16.6 && \ + numpy==1.16.6 \ protobuf==3.17.3 && \ pip install --no-deps \ keras-preprocessing==1.0.5 diff --git a/cibuild/dockerfiles/Dockerfile.base-py3.8-ubuntu20.04 b/cibuild/dockerfiles/Dockerfile.base-py3.8-ubuntu20.04 index c37ac4bcf81..21d87e2b31f 100644 --- a/cibuild/dockerfiles/Dockerfile.base-py3.8-ubuntu20.04 +++ b/cibuild/dockerfiles/Dockerfile.base-py3.8-ubuntu20.04 @@ -19,7 +19,7 @@ RUN apt-get update && \ RUN pip install \ astor==0.8.1 \ - numpy==1.16.6 && \ + numpy==1.16.6 \ protobuf==3.17.3 && \ 
pip install --no-deps \ keras-preprocessing==1.0.5 diff --git a/cibuild/dockerfiles/Dockerfile.devel-py3.6-cu116-ubuntu18.04 b/cibuild/dockerfiles/Dockerfile.devel-py3.6-cu116-ubuntu18.04 index 5b1b4775a2c..4f562794afd 100644 --- a/cibuild/dockerfiles/Dockerfile.devel-py3.6-cu116-ubuntu18.04 +++ b/cibuild/dockerfiles/Dockerfile.devel-py3.6-cu116-ubuntu18.04 @@ -5,12 +5,12 @@ RUN apt-get install -y libz-dev RUN apt-get install -y openjdk-8-jdk RUN pip install \ - h5py==2.10.0 && \ - spicy==0.16.0 && \ - portpicker==1.4.0 && \ - sklearn==0.0 && \ - tensorflow-estimator==1.15.0 && \ - grpcio==1.47.0 && \ - grpcio-tools==1.47.0 && \ - pyarrow==2.0.0 && \ + h5py==2.10.0 \ + spicy==0.16.0 \ + portpicker==1.4.0 \ + sklearn==0.0 \ + tensorflow-estimator==1.15.0 \ + grpcio==1.47.0 \ + grpcio-tools==1.47.0 \ + pyarrow==2.0.0 \ fastparquet==0.6.0 diff --git a/cibuild/dockerfiles/Dockerfile.devel-py3.8-cu116-ubuntu20.04 b/cibuild/dockerfiles/Dockerfile.devel-py3.8-cu116-ubuntu20.04 index 8b09b240763..5a318d41d09 100644 --- a/cibuild/dockerfiles/Dockerfile.devel-py3.8-cu116-ubuntu20.04 +++ b/cibuild/dockerfiles/Dockerfile.devel-py3.8-cu116-ubuntu20.04 @@ -5,14 +5,14 @@ RUN apt-get install -y libz-dev RUN apt-get install -y openjdk-8-jdk RUN pip install \ - h5py==2.10.0 && \ - spicy==1.5.4 && \ - scikit-learn==0.24.2 && \ - portpicker==1.4.0 && \ - sklearn==0.0 && \ - tensorflow-estimator==1.15.2 && \ - grpcio==1.47.0 && \ - grpcio-tools==1.47.0 && \ - pyarrow==2.0.0 && \ - pandas==1.1.5 && \ + h5py==2.10.0 \ + spicy==1.5.4 \ + scikit-learn==0.24.2 \ + portpicker==1.4.0 \ + sklearn==0.0 \ + tensorflow-estimator==1.15.2 \ + grpcio==1.47.0 \ + grpcio-tools==1.47.0 \ + pyarrow==2.0.0 \ + pandas==1.1.5 \ fastparquet==0.6.0.post1 diff --git a/cibuild/dockerfiles/Dockerfile.devel-py3.8-cu117-ubuntu22.04 b/cibuild/dockerfiles/Dockerfile.devel-py3.8-cu117-ubuntu22.04 index 98248a20987..557aa8f0403 100644 --- a/cibuild/dockerfiles/Dockerfile.devel-py3.8-cu117-ubuntu22.04 +++ b/cibuild/dockerfiles/Dockerfile.devel-py3.8-cu117-ubuntu22.04 @@ -5,14 +5,14 @@ RUN apt-get install -y libz-dev RUN apt-get install -y openjdk-8-jdk RUN pip install \ - h5py==2.10.0 && \ - spicy==1.5.4 && \ - scikit-learn==0.24.2 && \ - portpicker==1.4.0 && \ - sklearn==0.0 && \ - tensorflow-estimator==1.15.2 && \ - grpcio==1.47.0 && \ - grpcio-tools==1.47.0 && \ - pyarrow==2.0.0 && \ - pandas==1.1.5 && \ + h5py==2.10.0 \ + spicy==1.5.4 \ + scikit-learn==0.24.2 \ + portpicker==1.4.0 \ + sklearn==0.0 \ + tensorflow-estimator==1.15.2 \ + grpcio==1.47.0 \ + grpcio-tools==1.47.0 \ + pyarrow==2.0.0 \ + pandas==1.1.5 \ fastparquet==0.6.0.post1 diff --git a/cibuild/dockerfiles/Dockerfile.devel-py3.8-ubuntu20.04 b/cibuild/dockerfiles/Dockerfile.devel-py3.8-ubuntu20.04 index 673a600097e..9fd5988151c 100644 --- a/cibuild/dockerfiles/Dockerfile.devel-py3.8-ubuntu20.04 +++ b/cibuild/dockerfiles/Dockerfile.devel-py3.8-ubuntu20.04 @@ -5,14 +5,14 @@ RUN apt-get install -y libz-dev RUN apt-get install -y openjdk-8-jdk RUN pip install \ - h5py==2.10.0 && \ - spicy==1.5.4 && \ - scikit-learn==0.24.2 && \ - portpicker==1.4.0 && \ - sklearn==0.0 && \ - tensorflow-estimator==1.15.2 && \ - grpcio==1.47.0 && \ - grpcio-tools==1.47.0 && \ - pyarrow==2.0.0 && \ - pandas==1.1.5 && \ + h5py==2.10.0 \ + spicy==1.5.4 \ + scikit-learn==0.24.2 \ + portpicker==1.4.0 \ + sklearn==0.0 \ + tensorflow-estimator==1.15.2 \ + grpcio==1.47.0 \ + grpcio-tools==1.47.0 \ + pyarrow==2.0.0 \ + pandas==1.1.5 \ fastparquet==0.6.0.post1 diff --git 
a/cibuild/dockerfiles/Dockerfile.devel-py3.8-ubuntu22.04 b/cibuild/dockerfiles/Dockerfile.devel-py3.8-ubuntu22.04 index 43dc4f1c0a0..f0edce7d0bd 100644 --- a/cibuild/dockerfiles/Dockerfile.devel-py3.8-ubuntu22.04 +++ b/cibuild/dockerfiles/Dockerfile.devel-py3.8-ubuntu22.04 @@ -5,14 +5,14 @@ RUN apt-get install -y libz-dev RUN apt-get install -y openjdk-8-jdk RUN pip install \ - h5py==2.10.0 && \ - spicy==1.5.4 && \ - scikit-learn==0.24.2 && \ - portpicker==1.4.0 && \ - sklearn==0.0 && \ - tensorflow-estimator==1.15.2 && \ - grpcio==1.47.0 && \ - grpcio-tools==1.47.0 && \ - pyarrow==2.0.0 && \ - pandas==1.1.5 && \ + h5py==2.10.0 \ + spicy==1.5.4 \ + scikit-learn==0.24.2 \ + portpicker==1.4.0 \ + sklearn==0.0 \ + tensorflow-estimator==1.15.2 \ + grpcio==1.47.0 \ + grpcio-tools==1.47.0 \ + pyarrow==2.0.0 \ + pandas==1.1.5 \ fastparquet==0.6.0.post1 From 96d66abfc4df30ace9e27a57bfb11ffa9404e411 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Tue, 11 Jul 2023 11:40:16 +0800 Subject: [PATCH 37/91] [Embedding] Add performance test for EmbeddingVariable. (#910) Signed-off-by: lixy9474 --- tensorflow/core/kernels/BUILD | 23 + ...edding_variable_memory_performance_test.cc | 438 ++++++++++++++++++ 2 files changed, 461 insertions(+) create mode 100644 tensorflow/core/kernels/embedding_variable_memory_performance_test.cc diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 14788ede450..7d494726886 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -460,6 +460,29 @@ tf_cuda_cc_test( ], ) +tf_cuda_cc_test( + name = "embedding_variable_memory_performance_test", + srcs = ["embedding_variable_memory_performance_test.cc"], + extra_copts = ["-fexceptions", "-g"], + deps = [ + ":io", + ":ops_testutil", + ":ops_util", + "//tensorflow/core:all_kernels", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:direct_session_internal", + "//tensorflow/core/util/tensor_bundle", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:testlib", + "//third_party/eigen3", + "//tensorflow/core:test_main", + ], +) + cc_library( name = "tensor_flag_utils", srcs = [ diff --git a/tensorflow/core/kernels/embedding_variable_memory_performance_test.cc b/tensorflow/core/kernels/embedding_variable_memory_performance_test.cc new file mode 100644 index 00000000000..6aa30b3ab40 --- /dev/null +++ b/tensorflow/core/kernels/embedding_variable_memory_performance_test.cc @@ -0,0 +1,438 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/common_runtime/gpu/gpu_device.h" +#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" +#endif //GOOGLE_CUDA + +#include +#include +#include "tensorflow/core/framework/embedding/kv_interface.h" +#include "tensorflow/core/framework/embedding/cache.h" +#include "tensorflow/core/kernels/kv_variable_ops.h" +#ifdef TENSORFLOW_USE_JEMALLOC +#include "jemalloc/jemalloc.h" +#endif + +namespace tensorflow { +struct ProcMemory { + long size; // total program size + long resident; // resident set size + long share; // shared pages + long trs; // text (code) + long lrs; // library + long drs; // data/stack + long dt; // dirty pages + + ProcMemory() : size(0), resident(0), share(0), + trs(0), lrs(0), drs(0), dt(0) {} +}; + +ProcMemory getProcMemory() { + ProcMemory m; + FILE* fp = fopen("/proc/self/statm", "r"); + if (fp == NULL) { + LOG(ERROR) << "Fail to open /proc/self/statm."; + return m; + } + + if (fscanf(fp, "%ld %ld %ld %ld %ld %ld %ld", + &m.size, &m.resident, &m.share, + &m.trs, &m.lrs, &m.drs, &m.dt) != 7) { + fclose(fp); + LOG(ERROR) << "Fail to fscanf /proc/self/statm."; + return m; + } + fclose(fp); + + return m; +} + +double getSize() { + ProcMemory m = getProcMemory(); + return m.size; +} + +double getResident() { + ProcMemory m = getProcMemory(); + return m.resident; +} +namespace embedding { +EmbeddingVar* CreateEmbeddingVar( + int value_size, Tensor& default_value, + int64 default_value_dim) { + Allocator* allocator = ev_allocator(); + auto embedding_config = EmbeddingConfig( + 0, 0, 1, 0, "emb_var", 0, + 0, 999999, -1.0, "light", + 0, -1.0, DT_UINT64, default_value_dim, + 0.0, false, false, false); + auto storage = + embedding::StorageFactory::Create( + embedding::StorageConfig( + embedding::StorageType::DRAM, "", + {1024, 1024, 1024, 1024}, "light", + embedding_config), + allocator, + "emb_var"); + auto ev = new EmbeddingVar( + "emb_var", + storage, + embedding_config, + allocator); + ev->Init(default_value, default_value_dim); + return ev; +} + +void GenerateSkewIds(int num_of_ids, float skew_factor, + std::vector& hot_ids_list, + std::vector& 
cold_ids_list) { + int num_of_hot_ids = num_of_ids * (1 - skew_factor); + int num_of_cold_ids = num_of_ids - num_of_hot_ids; + std::set hot_ids_set; + std::set cold_ids_set; + hot_ids_list.resize(num_of_hot_ids); + cold_ids_list.resize(num_of_cold_ids); + srand((unsigned)time(NULL)); + //Generate hot ids + for (int i = 0; i < num_of_hot_ids; i++) { + bool flag = false; + int64 key; + do { + key = rand() % 100000000; + flag = hot_ids_set.insert(key).second; + hot_ids_list[i] = key; + } while (!flag); + } + //Generate cold ids + for (int i = 0; i < num_of_cold_ids; i++) { + bool flag = false; + int64 key; + do { + key = rand() % 100000000; + if (hot_ids_set.find(key) != hot_ids_set.end()) { + flag = false; + } else { + flag = cold_ids_set.insert(key).second; + cold_ids_list[i] = key; + } + } while (!flag); + } +} + +void InitSkewInputBatch(std::vector>& input_batches, + float skew_factor, + const std::vector& hot_ids_list, + const std::vector& cold_ids_list) { + srand((unsigned)time(NULL)); + int num_of_hot_ids = hot_ids_list.size(); + int num_of_cold_ids = cold_ids_list.size(); + int num_of_batch = input_batches.size(); + for (int i = 0; i < input_batches.size(); i++) { + for (int j = 0; j < input_batches[i].size(); j++) { + int tmp = rand() % 10; + if ((float)tmp * 0.1 < skew_factor) { + int pos = rand() % num_of_hot_ids; + input_batches[i][j] = hot_ids_list[pos]; + } else { + int pos = rand() % num_of_cold_ids; + input_batches[i][j] = cold_ids_list[pos]; + } + } + } +} + + +void GenerateSkewInput(int num_of_ids, float skew_factor, + std::vector>& input_batches) { + std::vector hot_ids_list; + std::vector cold_ids_list; + //Generate hot ids + GenerateSkewIds(num_of_ids, skew_factor, + hot_ids_list, cold_ids_list); + //Select id for each batch + InitSkewInputBatch(input_batches, skew_factor, + hot_ids_list, cold_ids_list); +} + +void thread_lookup_or_create( + EmbeddingVar* ev, + const int64* input_batch, + float** outputs, int value_size, + int start, int end) { + ValuePtr* value_ptr = nullptr; + bool is_filter = false; + for (int i = start; i < end; i++) { + ev->LookupOrCreateKey(input_batch[i], &value_ptr, &is_filter, false); + auto val = ev->flat(value_ptr, input_batch[i]); + memcpy(outputs[i], &val(0), sizeof(float) * value_size); + } +} + +double PerfLookupOrCreate( + const std::vector>& input_batches, + int num_thread) { + int value_size = 32; + int64 default_value_dim = 4096; + Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0 ; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim); + std::vector worker_threads(num_thread); + double total_time = 0.0; + timespec start, end; + for (int k = 0; k < input_batches.size(); k++) { + //Allocate Outputs for each batch + std::vector outputs(input_batches[k].size()); + for (int i = 0; i < outputs.size(); i++) { + outputs[i] = + (float*)cpu_allocator()->AllocateRaw(0, sizeof(float) * value_size); + } + //Execution + std::vector> thread_task_range(num_thread); + for (int i = 0; i < num_thread; i++) { + int st = input_batches[k].size() / num_thread * i; + int ed = input_batches[k].size() / num_thread * (i + 1); + ed = (ed > input_batches[k].size()) ? 
input_batches[k].size() : ed; + thread_task_range[i].first = st; + thread_task_range[i].second = ed; + } + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < num_thread; i++) { + worker_threads[i] = std::thread(thread_lookup_or_create, + ev, input_batches[k].data(), + outputs.data(), value_size, + thread_task_range[i].first, + thread_task_range[i].second); + } + for (int i = 0; i < num_thread; i++) { + worker_threads[i].join(); + } + clock_gettime(CLOCK_MONOTONIC, &end); + if (k > 10) + total_time += ((double)(end.tv_sec - start.tv_sec) * + 1000000000 + end.tv_nsec - start.tv_nsec); + //Check + for (int i = 0; i < input_batches[k].size(); i++) { + int64 key = input_batches[k][i]; + float* output = outputs[i]; + for (int j = 0; j < value_size; j++) { + float val = default_value_matrix(key % default_value_dim, j); + if (output[j] != val) { + LOG(INFO)<<"Value Error: outputs["<DeallocateRaw(ptr); + } + } + ev->Unref(); + return total_time; +} + +TEST(EmbeddingVariablePerformanceTest, TestLookupOrCreate) { + int num_of_batch = 100; + int batch_size = 1024 * 128; + int num_of_ids = 5000000; + std::vector> input_batches(num_of_batch); + for (int i = 0; i < num_of_batch; i++) { + input_batches[i].resize(batch_size); + } + LOG(INFO)<<"[TestLookupOrCreate] Start generating skew input"; + GenerateSkewInput(num_of_ids, 0.8, input_batches); + LOG(INFO)<<"[TestLookupOrCreate] Finish generating skew input"; + std::vector num_thread_vec({1, 2, 4, 8, 16}); + for (auto num_thread: num_thread_vec) { + LOG(INFO)<<"[TestLookupOrCreate] Test LookupOrCreate With" + <* ev, + const int64* input_batch, + float** outputs, int value_size, + int start, int end) { + ValuePtr* value_ptr = nullptr; + bool is_filter = false; + for (int i = start; i < end; i++) { + ev->LookupKey(input_batch[i], &value_ptr); + auto val = ev->flat(value_ptr, input_batch[i]); + memcpy(outputs[i], &val(0), sizeof(float) * value_size); + } +} + +double PerfLookup( + EmbeddingVar* ev, + const std::vector>& input_batches, + int num_thread, + int value_size, float* default_value, + int64 default_value_dim) { + std::vector worker_threads(num_thread); + double total_time = 0.0; + timespec start, end; + for (int k = 0; k < input_batches.size(); k++) { + //Allocate Outputs for each batch + std::vector outputs(input_batches[k].size()); + for (int i = 0; i < outputs.size(); i++) { + outputs[i] = + (float*)cpu_allocator()->AllocateRaw(0, sizeof(float) * value_size); + } + //Execution + std::vector> thread_task_range(num_thread); + for (int i = 0; i < num_thread; i++) { + int st = input_batches[k].size() / num_thread * i; + int ed = input_batches[k].size() / num_thread * (i + 1); + ed = (ed > input_batches[k].size()) ? 
input_batches[k].size() : ed; + thread_task_range[i].first = st; + thread_task_range[i].second = ed; + } + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < num_thread; i++) { + worker_threads[i] = std::thread(thread_lookup, + ev, input_batches[k].data(), + outputs.data(), value_size, + thread_task_range[i].first, + thread_task_range[i].second); + } + for (int i = 0; i < num_thread; i++) { + worker_threads[i].join(); + } + clock_gettime(CLOCK_MONOTONIC, &end); + if (k > 10) + total_time += ((double)(end.tv_sec - start.tv_sec) * + 1000000000 + end.tv_nsec - start.tv_nsec); + //Check + for (int i = 0; i < input_batches[k].size(); i++) { + int64 key = input_batches[k][i]; + float* output = outputs[i]; + for (int j = 0; j < value_size; j++) { + float val = default_value[(key % default_value_dim) * value_size + j]; + if (output[j] != val) { + LOG(INFO)<<"Value Error: outputs["<DeallocateRaw(ptr); + } + } + return total_time; +} + +TEST(EmbeddingVariablePerformanceTest, TestLookup) { + int num_of_batch = 100; + int batch_size = 1024 * 128; + int num_of_ids = 5000000; + int value_size = 32; + int64 default_value_dim = 4096; + float skew_factor = 0.8; + + LOG(INFO)<<"[TestLookup] Start initializing EV storage."; + std::vector hot_ids_list; + std::vector cold_ids_list; + GenerateSkewIds(num_of_ids, skew_factor, hot_ids_list, cold_ids_list); + + Tensor default_value( + DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0 ; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim); + ValuePtr* value_ptr = nullptr; + bool is_filter = false; + for (int i = 0; i < hot_ids_list.size(); i++) { + ev->LookupOrCreateKey(hot_ids_list[i], &value_ptr, &is_filter, false); + } + for (int i = 0; i < cold_ids_list.size(); i++) { + ev->LookupOrCreateKey(cold_ids_list[i], &value_ptr, &is_filter, false); + } + LOG(INFO)<<"[TestLookup] End initializing EV storage."; + + LOG(INFO)<<"[TestLookup] Start generating skew input"; + std::vector> input_batches(num_of_batch); + for (int i = 0; i < num_of_batch; i++) { + input_batches[i].resize(batch_size); + } + InitSkewInputBatch(input_batches, skew_factor, hot_ids_list, cold_ids_list); + LOG(INFO)<<"[TestLookup] Finish generating skew input"; + std::vector num_thread_vec({1, 2, 4, 8, 16}); + for (auto num_thread: num_thread_vec) { + LOG(INFO)<<"[TestLookup] Test Lookup With "<Unref(); +} +} //namespace embedding +} //namespace tensorflow From 56cc51efeabe3e713bd39d038cf51eca23eb7087 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 11 Jul 2023 04:37:38 -0700 Subject: [PATCH 38/91] [Embedding] Refactor the restore interface of EmbeddingVariable. 
(#903) support restore parameters from single or partitioned EmbeddingVariable Signed-off-by: JunqiHu --- .../framework/embedding/bloom_filter_policy.h | 45 +- .../embedding/counter_filter_policy.h | 88 +- .../embedding/dram_leveldb_storage.h | 9 +- .../framework/embedding/dram_pmem_storage.h | 9 +- .../framework/embedding/dram_ssd_storage.h | 36 +- .../core/framework/embedding/embedding_var.h | 139 +-- .../embedding/embedding_var_restore.h | 740 ++++++++++++ .../core/framework/embedding/filter_policy.h | 69 +- .../embedding/hbm_dram_ssd_storage.h | 216 ++-- .../framework/embedding/hbm_dram_storage.h | 227 ++-- .../embedding/hbm_storage_iterator.h | 4 + .../framework/embedding/multi_tier_storage.h | 57 +- .../embedding/nullable_filter_policy.h | 76 +- .../framework/embedding/single_tier_storage.h | 86 +- tensorflow/core/framework/embedding/storage.h | 103 +- tensorflow/core/kernels/BUILD | 4 + .../kernels/embedding_variable_ops_test.cc | 57 +- tensorflow/core/kernels/kv_variable_ops.h | 1024 ----------------- .../kernels/kv_variable_save_restore_ops.cc | 44 +- .../ops/embedding_variable_ops_gpu_test.py | 206 +++- .../python/ops/embedding_variable_ops_test.py | 227 +++- 21 files changed, 1754 insertions(+), 1712 deletions(-) create mode 100644 tensorflow/core/framework/embedding/embedding_var_restore.h diff --git a/tensorflow/core/framework/embedding/bloom_filter_policy.h b/tensorflow/core/framework/embedding/bloom_filter_policy.h index c7a1f901ab3..29b85e5bb4e 100644 --- a/tensorflow/core/framework/embedding/bloom_filter_policy.h +++ b/tensorflow/core/framework/embedding/bloom_filter_policy.h @@ -31,9 +31,13 @@ const static std::vector default_seeds = { template class BloomFilterPolicy : public FilterPolicy { + using FilterPolicy::ev_; + using FilterPolicy::config_; + public: - BloomFilterPolicy(const EmbeddingConfig& config, EV* ev) - : config_(config), ev_(ev) { + BloomFilterPolicy(const EmbeddingConfig& config, EV* ev) : + FilterPolicy(config, ev) { + switch (config_.counter_type){ case DT_UINT64: VLOG(2) << "The type of bloom counter is uint64"; @@ -303,16 +307,18 @@ class BloomFilterPolicy : public FilterPolicy { } } - Status Import(RestoreBuffer& restore_buff, - int64 key_num, - int bucket_num, - int64 partition_id, - int64 partition_num, - bool is_filter) override { + Status Restore(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool to_dram, bool is_incr, RestoreBuffer& restore_buff) override { K* key_buff = (K*)restore_buff.key_buffer; V* value_buff = (V*)restore_buff.value_buffer; int64* version_buff = (int64*)restore_buff.version_buffer; int64* freq_buff = (int64*)restore_buff.freq_buffer; + if (to_dram) { + LOG(FATAL)<<"BloomFilter dosen't support ImportToDRAM"; + return Status::OK(); + } + for (auto i = 0; i < key_num; ++i) { // this can describe by graph(Mod + DynamicPartition), // but memory waste and slow @@ -333,33 +339,19 @@ class BloomFilterPolicy : public FilterPolicy { SetBloomFreq(key_buff[i], freq_buff[i]); } if (new_freq >= config_.filter_freq){ - ev_->CreateKey(key_buff[i], &value_ptr); + ev_->CreateKey(key_buff[i], &value_ptr, to_dram); if (config_.steps_to_live != 0 || config_.record_version) { value_ptr->SetStep(version_buff[i]); } if (!is_filter){ ev_->LookupOrCreateEmb(value_ptr, - value_buff + i * ev_->ValueLen()); + value_buff + i * ev_->ValueLen()); } else { ev_->LookupOrCreateEmb(value_ptr, - ev_->GetDefaultValue(key_buff[i])); + ev_->GetDefaultValue(key_buff[i])); } } } - if 
(ev_->IsMultiLevel() && !ev_->IsUseHbm() && config_.is_primary()) { - ev_->UpdateCache(key_buff, key_num, version_buff, freq_buff); - } - return Status::OK(); - } - - Status ImportToDram(RestoreBuffer& restore_buff, - int64 key_num, - int bucket_num, - int64 partition_id, - int64 partition_num, - bool is_filter, - V* default_values) override { - LOG(FATAL)<<"BloomFilter dosen't support ImportToDRAM"; return Status::OK(); } @@ -455,11 +447,8 @@ class BloomFilterPolicy : public FilterPolicy { } } } - private: void* bloom_counter_; - EmbeddingConfig config_; - EV* ev_; std::vector seeds_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/counter_filter_policy.h b/tensorflow/core/framework/embedding/counter_filter_policy.h index ec83ee16d6d..c9f19f34cd2 100644 --- a/tensorflow/core/framework/embedding/counter_filter_policy.h +++ b/tensorflow/core/framework/embedding/counter_filter_policy.h @@ -23,10 +23,13 @@ namespace tensorflow { template class CounterFilterPolicy : public FilterPolicy { + using FilterPolicy::ev_; + using FilterPolicy::config_; + using FilterPolicy::LookupOrCreateEmbInternal; + public: - CounterFilterPolicy(const EmbeddingConfig& config, EV* ev) - : config_(config), ev_(ev){ - } + CounterFilterPolicy(const EmbeddingConfig& config, EV* ev) : + FilterPolicy(config, ev) {} Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { @@ -115,60 +118,13 @@ class CounterFilterPolicy : public FilterPolicy { return value_ptr->GetFreq(); } - Status Import(RestoreBuffer& restore_buff, - int64 key_num, - int bucket_num, - int64 partition_id, - int64 partition_num, - bool is_filter) override { - K* key_buff = (K*)restore_buff.key_buffer; - V* value_buff = (V*)restore_buff.value_buffer; - int64* version_buff = (int64*)restore_buff.version_buffer; - int64* freq_buff = (int64*)restore_buff.freq_buffer; - for (auto i = 0; i < key_num; ++i) { - // this can describe by graph(Mod + DynamicPartition), - // but memory waste and slow - if (*(key_buff + i) % bucket_num % partition_num != partition_id) { - LOG(INFO) << "skip EV key:" << *(key_buff + i); - continue; - } - ValuePtr* value_ptr = nullptr; - ev_->CreateKey(key_buff[i], &value_ptr); - if (!is_filter) { - if (freq_buff[i] >= config_.filter_freq) { - value_ptr->SetFreq(freq_buff[i]); - } else { - value_ptr->SetFreq(config_.filter_freq); - } - } else { - value_ptr->SetFreq(freq_buff[i]); - } - if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); - } - if (value_ptr->GetFreq() >= config_.filter_freq) { - if (!is_filter) { - ev_->LookupOrCreateEmb(value_ptr, - value_buff + i * ev_->ValueLen()); - } else { - ev_->LookupOrCreateEmb(value_ptr, - ev_->GetDefaultValue(key_buff[i])); - } - } - } - if (ev_->IsMultiLevel() && !ev_->IsUseHbm() && config_.is_primary()) { - ev_->UpdateCache(key_buff, key_num, version_buff, freq_buff); - } - return Status::OK(); + bool is_admit(K key, ValuePtr* value_ptr) override { + return (GetFreq(key, value_ptr) >= config_.filter_freq); } - Status ImportToDram(RestoreBuffer& restore_buff, - int64 key_num, - int bucket_num, - int64 partition_id, - int64 partition_num, - bool is_filter, - V* default_values) override { + Status Restore(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool to_dram, bool is_incr, RestoreBuffer& restore_buff) override { K* key_buff = (K*)restore_buff.key_buffer; V* value_buff = (V*)restore_buff.value_buffer; int64* 
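// Illustrative sketch (not from the patch): one common realization of the
// GetBloomFreq()/SetBloomFreq() pair used above is a counting Bloom filter
// in count-min style: each seed hashes the key into one slot of
// bloom_counter_, and the frequency estimate is the minimum over those
// slots. EstimatedFreq is a hypothetical helper (the real counter width
// follows config_.counter_type); Hash64 is TensorFlow's seeded hash.
#include <algorithm>
#include <cstdint>
#include <vector>
#include "tensorflow/core/lib/hash/hash.h"

inline uint64_t EstimatedFreq(const std::vector<uint64_t>& counters,
                              const std::vector<int64_t>& seeds,
                              int64_t key) {
  uint64_t min_count = ~0ull;
  for (int64_t seed : seeds) {
    uint64_t slot = tensorflow::Hash64(
        reinterpret_cast<const char*>(&key), sizeof(key), seed) %
        counters.size();
    min_count = std::min(min_count, counters[slot]);
  }
  return min_count;  // the id is admitted once this reaches filter_freq
}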
version_buff = (int64*)restore_buff.version_buffer; @@ -181,7 +137,7 @@ class CounterFilterPolicy : public FilterPolicy { continue; } ValuePtr* value_ptr = nullptr; - ev_->CreateKeyOnDram(key_buff[i], &value_ptr); + ev_->CreateKey(key_buff[i], &value_ptr, to_dram); if (!is_filter) { if (freq_buff[i] >= config_.filter_freq) { value_ptr->SetFreq(freq_buff[i]); @@ -195,28 +151,12 @@ class CounterFilterPolicy : public FilterPolicy { value_ptr->SetStep(version_buff[i]); } if (value_ptr->GetFreq() >= config_.filter_freq) { - if (!is_filter) { - ev_->LookupOrCreateEmb(value_ptr, - value_buff + i * ev_->ValueLen(), ev_allocator()); - } else { - ev_->LookupOrCreateEmb(value_ptr, - default_values + - (key_buff[i] % config_.default_value_dim) - * ev_->ValueLen(), - ev_allocator()); - } + LookupOrCreateEmbInternal(is_filter, to_dram, i, value_len, + value_ptr, value_buff, key_buff); } } return Status::OK(); } - - bool is_admit(K key, ValuePtr* value_ptr) override { - return (GetFreq(key, value_ptr) >= config_.filter_freq); - } - - private: - EmbeddingConfig config_; - EV* ev_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/dram_leveldb_storage.h b/tensorflow/core/framework/embedding/dram_leveldb_storage.h index 7a7f4986f62..c6c64e14865 100644 --- a/tensorflow/core/framework/embedding/dram_leveldb_storage.h +++ b/tensorflow/core/framework/embedding/dram_leveldb_storage.h @@ -68,10 +68,9 @@ class DramLevelDBStore : public MultiTierStorage { } void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len) override { + size_t alloc_len, bool to_dram = false) override { dram_->Insert(key, value_ptr, alloc_len); } - Status GetOrCreate(K key, ValuePtr** value_ptr, size_t size, CopyBackFlag &need_copyback) override { LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " @@ -112,12 +111,6 @@ class DramLevelDBStore : public MultiTierStorage { return false; } - bool IsUsePersistentStorage() override { - /*The return value is set to false temporarily, - because the corresponding interface is not implemented.*/ - return false; - } - void iterator_mutex_lock() override { leveldb_->get_mutex()->lock(); } diff --git a/tensorflow/core/framework/embedding/dram_pmem_storage.h b/tensorflow/core/framework/embedding/dram_pmem_storage.h index c6ff328f6fc..47b6115e801 100644 --- a/tensorflow/core/framework/embedding/dram_pmem_storage.h +++ b/tensorflow/core/framework/embedding/dram_pmem_storage.h @@ -76,10 +76,9 @@ class DramPmemStorage : public MultiTierStorage { } void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len) override { + size_t alloc_len, bool to_dram = false) override { dram_->Insert(key, value_ptr, alloc_len); } - Status GetOrCreate(K key, ValuePtr** value_ptr, size_t size, CopyBackFlag &need_copyback) override { LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " @@ -95,12 +94,6 @@ class DramPmemStorage : public MultiTierStorage { return false; } - bool IsUsePersistentStorage() override { - /*The return value is set to false temporarily, - because the corresponding interface is not implemented.*/ - return false; - } - Status GetOrCreate(K key, ValuePtr** value_ptr, size_t size) override { Status s = dram_->Get(key, value_ptr); diff --git a/tensorflow/core/framework/embedding/dram_ssd_storage.h b/tensorflow/core/framework/embedding/dram_ssd_storage.h index 9ba794edf7e..675395c667d 100644 --- a/tensorflow/core/framework/embedding/dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/dram_ssd_storage.h @@ -69,7 +69,7 @@ class DramSsdHashStorage : public MultiTierStorage { } 
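// Illustrative note (not from the patch): after this refactor every filter
// implements the single entry point
//   Restore(key_num, bucket_num, partition_id, partition_num, value_len,
//           is_filter, to_dram, is_incr, restore_buff)
// in place of the old Import()/ImportToDram() pair. The destination tier
// (to_dram) and incremental-checkpoint handling (is_incr) travel as
// arguments, and the key-skipping rule shared by all filters stays
//   key % bucket_num % partition_num != partition_id.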
void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len) override { + size_t alloc_len, bool to_dram = false) override { dram_->Insert(key, value_ptr, alloc_len); } @@ -210,27 +210,27 @@ class DramSsdHashStorage : public MultiTierStorage { return key_list->size() + ssd_rec_desc->key_list.size(); } - void RestoreSsdHashmap( - K* key_list, int64* key_file_id_list, - int64* key_offset_list, int64 num_of_keys, - int64* file_list, int64* invalid_record_count_list, - int64* record_count_list, int64 num_of_files, - const std::string& ssd_emb_file_name) override { + Status RestoreSSD(int64 emb_index, int64 emb_slot_num, int64 value_len, + const std::string& ssd_emb_file_name, EmbeddingVar* ev, + RestoreSSDBuffer& restore_buff) override { + int64 alloc_len = Storage::ComputeAllocLen(value_len); std::map file_id_map; - for (int64 i = 0; i < num_of_files; i++) { - file_id_map[file_list[i]] = i; + for (int64 i = 0; i < restore_buff.num_of_files; i++) { + file_id_map[restore_buff.file_list_buf[i]] = i; } - ssd_hash_->CopyEmbFilesFromCkpt( - file_list, invalid_record_count_list, - record_count_list, num_of_files, - ssd_emb_file_name); - - ssd_hash_->Import(key_list, key_file_id_list, - key_offset_list, num_of_keys, - file_id_map); + ssd_hash_->CopyEmbFilesFromCkpt(restore_buff.file_list_buf, + restore_buff.invalid_record_count_list_buf, + restore_buff.record_count_list_buf, + restore_buff.num_of_files, + ssd_emb_file_name); + + ssd_hash_->Import(restore_buff.key_list_buf, + restore_buff.key_file_id_list_buf, + restore_buff.key_offset_list_buf, + restore_buff.num_of_keys, + file_id_map); } - Status Eviction(K* evict_ids, int64 evict_size) override { ValuePtr* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index ae5760bfbc0..204c758e3ba 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/embedding_var_context.h" +#include "tensorflow/core/framework/embedding/embedding_var_restore.h" #include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/framework/embedding/filter_factory.h" #include "tensorflow/core/framework/embedding/gpu_hash_map_kv.h" @@ -36,6 +37,7 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/storage.h" #include "tensorflow/core/framework/embedding/storage_factory.h" #include "tensorflow/core/framework/typed_allocator.h" +#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" namespace tensorflow { using CPUDevice = Eigen::ThreadPoolDevice; @@ -191,14 +193,9 @@ class EmbeddingVar : public ResourceBase { return s; } - void CreateKey(K key, ValuePtr** value_ptr) { + void CreateKey(K key, ValuePtr** value_ptr, bool to_dram) { storage_->Insert(key, value_ptr, - emb_config_.total_num(storage_->GetAllocLen())); - } - - void CreateKeyOnDram(K key, ValuePtr** value_ptr) { - storage_->InsertToDram(key, value_ptr, - emb_config_.total_num(storage_->GetAllocLen())); + emb_config_.total_num(storage_->GetAllocLen()), to_dram); } void UpdateVersion(ValuePtr* value_ptr, int64 gs) { @@ -575,122 +572,14 @@ class EmbeddingVar : public ResourceBase { return emb_config_.DebugString(); } - Status Import(RestoreBuffer& restore_buff, - int64 key_num, - int bucket_num, - int64 partition_id, - int64 partition_num, - bool is_filter, - const Eigen::GpuDevice* device) { - if (IsMultiLevel() && IsUseHbm()) { - Status s = Status::OK(); -#if GOOGLE_CUDA - V* default_value_host = nullptr; - if (is_filter) { - default_value_host = new V[emb_config_.default_value_dim * value_len_]; - cudaMemcpy(default_value_host, default_value_, - sizeof(V) * emb_config_.default_value_dim * value_len_, - cudaMemcpyDeviceToHost); - } - s = filter_->ImportToDram(restore_buff, key_num, bucket_num, - partition_id, partition_num, is_filter, default_value_host); - delete[] default_value_host; -#endif //GOOGLE_CUDA - return s; - } else if (IsSingleHbm()) { -#if GOOGLE_CUDA - K* key_buff = (K*)restore_buff.key_buffer; - V* value_buff = (V*)restore_buff.value_buffer; - std::vector key_import; - std::vector value_import; - for (auto i = 0; i < key_num; ++ i) { - if (*(key_buff + i) % bucket_num % partition_num != partition_id) { - LOG(INFO) << "skip EV key:" << *(key_buff + i); - continue; - } - key_import.emplace_back(*(key_buff + i)); - register auto row_offset = value_buff + i * value_len_; - for (int j = 0; j < value_len_; j++) { - value_import.emplace_back(*(row_offset + j)); - } - } - storage_->ImportToHbm(key_import, value_import, device, emb_config_); -#endif //GOOGLE_CUDA - return Status::OK(); - } else { - return filter_->Import(restore_buff, key_num, bucket_num, - partition_id, partition_num, is_filter); - } - } - - void ImportToHbm(K* ids, int64 size) { - storage_->ImportToHbm(ids, size, - value_len_, emb_config_.emb_index); - } - - void RestoreSsdHashmap( - K* key_list, int64* key_file_id_list, - int64* key_offset_list, int64 num_of_keys, - int64* file_list, int64* invalid_record_count_list, - int64* record_count_list, int64 num_of_files, - const std::string& ssd_emb_file_name) { - storage_-> - RestoreSsdHashmap( - key_list, key_file_id_list, - key_offset_list, num_of_keys, - file_list, invalid_record_count_list, - record_count_list, num_of_files, - ssd_emb_file_name); - } - - void LoadSsdData( - const string& old_file_prefix, - K* key_list, int64* key_file_id_list, - int64* key_offset_list, int64 num_of_keys) { - int64 alloc_len = storage_->ComputeAllocLen(value_len_); - for (int64 i = 0; i < num_of_keys; i++) { - ValuePtr* value_ptr = nullptr; - LookupOrCreateKey(key_list[i], &value_ptr); - - int64 file_id = key_file_id_list[i]; - int64 key_offset = key_offset_list[i]; - // Read data from embedding files on SSD. 
Data are stored in - // NormalContiguousValuePtr temporarily. - std::stringstream ss; - ss < tmp_value_ptr(alloc_, - alloc_len * (emb_config_.slot_num + 1)); - void* ptr = tmp_value_ptr.GetPtr(); - memcpy(ptr, file_addr + key_offset, - sizeof(FixedLengthHeader) - + alloc_len * sizeof(V) * (emb_config_.slot_num + 1)); - munmap(file_addr, - sizeof(FixedLengthHeader) - + alloc_len * sizeof(V) - * (emb_config_.slot_num + 1) - + key_offset); - close(fd); - //Copy Data to ValuePtr, data of slots are set by primary here. - for (int j = 0; j < emb_config_.slot_num + 1; j++) { - V* value = tmp_value_ptr.GetValue(j, alloc_len * j); - if (value != nullptr) { - value_ptr->GetOrAllocate(alloc_, value_len_, value, - j, alloc_len * j); - } - } - value_ptr->SetFreq(tmp_value_ptr.GetFreq()); - value_ptr->SetStep(tmp_value_ptr.GetStep()); - } + void Restore(const std::string& name_string, + const std::string& file_name_string, int64 partition_id, + int64 partition_num, bool is_incr, BundleReader* reader, + bool reset_version = false, + const Eigen::GpuDevice* device = nullptr) { + return storage_->Restore(name_string, file_name_string, partition_id, + partition_num, value_len_, is_incr, reset_version, + emb_config_, device, reader, this, filter_); } int64 GetSnapshot(std::vector* key_list, @@ -756,6 +645,10 @@ class EmbeddingVar : public ResourceBase { return emb_config_.emb_index; } + int64 GetEmbeddingSlotNum() { + return emb_config_.slot_num; + } + Allocator* GetAllocator() { return alloc_; } diff --git a/tensorflow/core/framework/embedding/embedding_var_restore.h b/tensorflow/core/framework/embedding/embedding_var_restore.h new file mode 100644 index 00000000000..821ef7485e8 --- /dev/null +++ b/tensorflow/core/framework/embedding/embedding_var_restore.h @@ -0,0 +1,740 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/embedding/embedding_var.h" +#include "tensorflow/core/framework/embedding/embedding_config.h" +#include "tensorflow/core/framework/embedding/filter_policy.h" +#include "tensorflow/core/framework/embedding/storage.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" + +namespace tensorflow { +using GPUDevice = Eigen::GpuDevice; + +template +class EmbeddingVar; + + +namespace { + const int kSavedPartitionNum = 1000; + const size_t kBufferSize = 8 << 20; + constexpr char kPartStr[] = "part_"; + + constexpr char kPartOffsetTensorSuffsix[] = "-partition_offset"; + constexpr char kPartFilterOffsetTensorSuffsix[] = + "-partition_filter_offset"; + constexpr char kKeySuffix[] = "-keys"; + constexpr char kValueSuffix[] = "-values"; + constexpr char kVersionSuffix[] = "-versions"; + constexpr char kFreqSuffix[] = "-freqs"; + + constexpr char kIncrPartOffsetTensorSuffsix[] = "-incr_partition_offset"; + constexpr char kIncrKeySuffix[] = "-sparse_incr_keys"; + constexpr char kIncrValueSuffix[] = "-sparse_incr_values"; + constexpr char kIncrVersionSuffix[] = "-sparse_incr_versions"; + constexpr char kIncrFreqSuffix[] = "-sparse_incr_freqs"; +} // namespace + +template +int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer) { + TensorShape shape; + Status st; + st = reader->LookupTensorShape(record_key, &shape); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + st = reader->LookupHeader(record_key, sizeof(K) * shape.dim_size(0)); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + size_t bytes_read = 0; + *buffer = new K[shape.dim_size(0)]; + st = reader->LookupSegment(record_key, sizeof(K) * shape.dim_size(0), + (char*)*buffer, bytes_read); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + return shape.dim_size(0); +} + +template +struct RestoreSSDBuffer { + int64* file_list_buf = nullptr; + int64* invalid_record_count_list_buf = nullptr; + int64* record_count_list_buf = nullptr; + K* key_list_buf = nullptr; + int64* key_file_id_list_buf = nullptr; + int64* key_offset_list_buf = nullptr; + int64 num_of_keys = 0; + int64 num_of_files = 0; + + explicit RestoreSSDBuffer(BundleReader* ssd_record_reader) { + num_of_files = ReadRecord(ssd_record_reader, "files", &file_list_buf); + + ReadRecord(ssd_record_reader, "invalid_record_count", + &invalid_record_count_list_buf); + ReadRecord(ssd_record_reader, "record_count", &record_count_list_buf); + num_of_keys = ReadRecord(ssd_record_reader, "keys", &key_list_buf); + + ReadRecord(ssd_record_reader, "keys_file_id", &key_file_id_list_buf); + ReadRecord(ssd_record_reader, 
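// Illustrative note (not from the patch): for an EV named "emb", a full
// checkpoint stores the tensors "emb-keys", "emb-values", "emb-versions"
// and "emb-freqs", their low-frequency counterparts under the "_filtered"
// suffix (e.g. "emb-keys_filtered"), and the virtual-partition index under
// "emb-partition_offset" / "emb-partition_filter_offset"; incremental
// checkpoints use the "-sparse_incr_*" suffixes instead. The constants
// above are the single source of those names.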
"keys_offset", &key_offset_list_buf); + } + + ~RestoreSSDBuffer() { + delete[] file_list_buf; + delete[] invalid_record_count_list_buf; + delete[] record_count_list_buf; + delete[] key_list_buf; + delete[] key_file_id_list_buf; + delete[] key_offset_list_buf; + } +}; + +struct RestoreArgs { + std::string m_file_name_string; + std::string m_name_string; + std::string m_tensor_key; + std::string m_tensor_value; + std::string m_tensor_version; + std::string m_tensor_freq; + std::vector m_loaded_parts; + int64 m_partition_id; + int64 m_partition_num; + int64 m_idx; + int m_old_dim; + bool m_is_incr; + bool m_reset_version; + bool m_has_freq; + bool m_has_filter; + bool m_is_oldform; + RestoreArgs(const std::string name_string, + const std::string file_name_string, + int64 partition_id, + int64 partition_num, + bool is_incr, + bool reset_version): + m_name_string(name_string), m_file_name_string(file_name_string), + m_partition_id(partition_id), m_partition_num(partition_num), + m_idx(0), m_old_dim(0), m_is_incr(is_incr), + m_reset_version(reset_version), m_has_freq(true), + m_has_filter(true), m_is_oldform(false) {} + RestoreArgs() = default; +}; + +template +class CheckpointLoader { + public: + CheckpointLoader(embedding::Storage* storage, EmbeddingVar* ev, + FilterPolicy>* filter, + const std::string& name_string, + const std::string& file_name_string, int64 partition_id, + int64 partition_num, bool is_incr, bool reset_version, + BundleReader* reader) + : storage_(storage), ev_(ev), filter_(filter), reader_(reader) { + restore_args_ = RestoreArgs(name_string, file_name_string, partition_id, + partition_num, is_incr, reset_version); + } + + void RestoreCkpt(const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device) { + /* Step 1: Restore SSD ckpt Data (Optional) + Step 2; Restore model ckpt */ + RestoreSSD(); + + std::vector tensor_name_vec; + InitPartNumAndLoadedParts(tensor_name_vec); + + RestoreBuffer restore_buff(kBufferSize); + for (auto& tensor_name : tensor_name_vec) { + RestoreInternal(tensor_name, emb_config, device, restore_buff); + } + + } + + void RestoreInternal(const std::string& name_string, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + RestoreBuffer& restore_buff) { + Status s = EVInitTensorNameAndShape(name_string); + if (!s.ok()) { + LOG(ERROR) << "EVInitTensorNameAndShape fail:" << s.ToString(); + return; + } + + Tensor part_offset_tensor; + Tensor part_filter_offset_tensor; + if (!restore_args_.m_is_oldform) { + /****** InitPartOffsetTensor ******/ + TensorShape part_offset_shape, part_filter_offset_shape; + DataType part_offset_type, part_filter_offset_type; + string offset_tensor_name; + if (!restore_args_.m_is_incr) { + offset_tensor_name = name_string + kPartOffsetTensorSuffsix; + } else { + offset_tensor_name = name_string + kIncrPartOffsetTensorSuffsix; + } + + string offset_filter_tensor_name = + name_string + kPartFilterOffsetTensorSuffsix; + Status s = reader_->LookupDtypeAndShape( + offset_tensor_name, &part_offset_type, &part_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail:" << s.error_message(); + } + s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, + &part_filter_offset_type, + &part_filter_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } + part_offset_tensor = + Tensor(cpu_allocator(), part_offset_type, part_offset_shape); + part_filter_offset_tensor = Tensor( + cpu_allocator(), part_filter_offset_type, part_filter_offset_shape); + s = 
reader_->Lookup(offset_tensor_name, &part_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail:" << s.error_message(); + } + + s = reader_->Lookup(offset_filter_tensor_name, + &part_filter_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } + } + auto part_offset_flat = part_offset_tensor.flat(); + auto part_filter_offset_flat = part_filter_offset_tensor.flat(); + + if (restore_args_.m_is_oldform) { + VLOG(1) << "old form, EV name:" << name_string + << ", partition_id:" << restore_args_.m_partition_id + << ", new partition num:" << restore_args_.m_partition_num; + int64 new_dim = ev_->ValueLen(); + TensorShape key_shape; + Status st = + reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); + if (!st.ok()) { + } + int tot_key_num = key_shape.dim_size(0); + Status s = EVRestoreFeatures(tot_key_num, 0, 0, 0, 0, restore_buff, + new_dim, emb_config, device); + if (!s.ok()) { + LOG(ERROR) << "EVRestoreFeaturesOld fail: " << s.error_message(); + } + } else { + int64 new_dim = ev_->ValueLen(); + VLOG(1) << "new form checkpoint... :" << name_string + << " , partition_id:" << restore_args_.m_partition_id + << " , partition_num:" << restore_args_.m_partition_num; + for (size_t i = 0; i < restore_args_.m_loaded_parts.size(); i++) { + int subpart_id = restore_args_.m_loaded_parts[i]; + size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; + size_t value_unit_bytes_new = sizeof(V) * new_dim; + int subpart_offset = part_offset_flat(subpart_id); + int tot_key_num = part_offset_flat(subpart_id + 1) - subpart_offset; + int64 key_part_offset = subpart_offset * sizeof(K); + int64 value_part_offset = + subpart_offset * sizeof(V) * restore_args_.m_old_dim; + int64 version_part_offset = subpart_offset * sizeof(int64); + int64 freq_part_offset = subpart_offset * sizeof(int64); + VLOG(1) << "dynamically load ev : " << name_string + << ", subpartid:" << subpart_id; + + EVRestoreFeatures(tot_key_num, key_part_offset, value_part_offset, + version_part_offset, freq_part_offset, restore_buff, + new_dim, emb_config, device); + + if (restore_args_.m_has_filter) { + Status s = EVRestoreFilteredFeatures( + subpart_id, new_dim, restore_buff, part_filter_offset_flat, + emb_config, device); + if (!s.ok()) { + LOG(ERROR) << "EVRestoreFilteredFeatures fail: " << s.error_message(); + } + } + } + } + } + + private: + void RestoreSSD() { + std::string name_string_temp(restore_args_.m_name_string); + std::string new_str = "_"; + int64 pos = name_string_temp.find("/"); + while (pos != std::string::npos) { + name_string_temp.replace(pos, 1, new_str.data(), 1); + pos = name_string_temp.find("/"); + } + std::string ssd_record_file_name = restore_args_.m_file_name_string + "-" + + name_string_temp + "-ssd_record"; + if (Env::Default()->FileExists(ssd_record_file_name + ".index").ok()) { + std::string ssd_emb_file_name = restore_args_.m_file_name_string + "-" + + name_string_temp + "-emb_files"; + BundleReader ssd_record_reader(Env::Default(), ssd_record_file_name); + RestoreSSDBuffer ssd_buffer(&ssd_record_reader); + VLOG(1) << "Loading SSD record... 
" << ssd_record_file_name; + storage_->RestoreSSD(ev_->GetEmbeddingIndex(), + ev_->GetEmbeddingSlotNum(), ev_->ValueLen(), + ssd_emb_file_name, ev_, ssd_buffer); + } + } + + bool IsOldCheckpoint(const std::string& curr_partid_str, + const std::string& kPartOffsetTensorSuffsix) { + if (restore_args_.m_name_string.find(kPartStr) == std::string::npos) { + string tensor_name = restore_args_.m_name_string; + TensorShape part_offset_shape; + DataType part_offset_type; + Status st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (st.ok()) return false; + + string part_id = std::to_string(0); + tensor_name = restore_args_.m_name_string + "/" + kPartStr + part_id; + + Status form_st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (form_st.ok()) return false; + } else { + string part_id = std::to_string(0); + size_t part_pos = restore_args_.m_name_string.find(kPartStr); + size_t part_size = strlen(kPartStr); + size_t cur_part_size = curr_partid_str.size(); + + string pre_subname = restore_args_.m_name_string.substr(0, part_pos); + string post_subname = restore_args_.m_name_string.substr( + part_pos + part_size + cur_part_size); + string tensor_name = pre_subname + kPartStr + part_id + post_subname; + + TensorShape part_offset_shape; + DataType part_offset_type; + Status form_st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (form_st.ok()) return false; + pre_subname = + restore_args_.m_name_string.substr(0, part_pos - 1); /* var1*/ + post_subname = restore_args_.m_name_string.substr(part_pos + part_size + + cur_part_size); + tensor_name = pre_subname + post_subname; + + Status st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (st.ok()) return false; + } + + return true; + } + + void InitPartNumAndLoadedParts(std::vector& tensor_name_vec) { + std::string tmp_key_suffix; + std::string tmp_kPartOffsetTensorSuffsix; + if (!restore_args_.m_is_incr) { + tmp_key_suffix = kKeySuffix; + tmp_kPartOffsetTensorSuffsix = kPartOffsetTensorSuffsix; + } else { + tmp_key_suffix = kIncrKeySuffix; + tmp_kPartOffsetTensorSuffsix = kIncrPartOffsetTensorSuffsix; + } + + restore_args_.m_loaded_parts.reserve(kSavedPartitionNum); + int orig_partnum = 0; + const string& curr_partid_str = std::to_string(restore_args_.m_partition_id); + size_t part_pos = restore_args_.m_name_string.find(kPartStr); + + if (IsOldCheckpoint(curr_partid_str, tmp_kPartOffsetTensorSuffsix)) { + restore_args_.m_is_oldform = true; + } + + if (part_pos == std::string::npos) { + for (;; orig_partnum++) { + string part_id = std::to_string(orig_partnum); + string tensor_name = + restore_args_.m_name_string + "/" + kPartStr + part_id; + string tensor_key = tensor_name + tmp_key_suffix; + TensorShape key_shape; + Status st = reader_->LookupTensorShape(tensor_key, &key_shape); + if (!st.ok()) { + break; + } + tensor_name_vec.emplace_back(tensor_name); + } + if (orig_partnum == 0) { + tensor_name_vec.emplace_back(restore_args_.m_name_string); + } + for (int i = 0; i < kSavedPartitionNum; ++i) { + restore_args_.m_loaded_parts.push_back(i); + } + } else { + for (;; orig_partnum++) { + string part_id = std::to_string(orig_partnum); + string pre_subname = restore_args_.m_name_string.substr(0, part_pos); + string post_subname = restore_args_.m_name_string.substr( + part_pos + 
strlen(kPartStr) + curr_partid_str.size()); + string tensor_name = pre_subname + kPartStr + part_id + post_subname; + string tensor_key = tensor_name + tmp_key_suffix; + TensorShape key_shape; + Status st = reader_->LookupTensorShape(tensor_key, &key_shape); + if (!st.ok()) { + break; + } + tensor_name_vec.emplace_back(tensor_name); + } + if (orig_partnum == 0) { + string pre_subname = restore_args_.m_name_string.substr(0, part_pos - 1); + string post_subname = restore_args_.m_name_string.substr( + part_pos + strlen(kPartStr) + curr_partid_str.size()); + string tmp_name = pre_subname + post_subname; + tensor_name_vec.emplace_back(tmp_name); + } + for (int i = 0; i < kSavedPartitionNum; i++) { + if (i % restore_args_.m_partition_num == restore_args_.m_partition_id) { + restore_args_.m_loaded_parts.push_back(i); + } + } + } + for (auto& tensor_name : tensor_name_vec) { + VLOG(1) << "**** " << restore_args_.m_name_string << " " << tensor_name + << " ****"; + } + } + + Status EVInitTensorNameAndShape(const std::string& tensor_name) { + if (!restore_args_.m_is_incr) { + restore_args_.m_tensor_key = tensor_name + kKeySuffix; + restore_args_.m_tensor_value = tensor_name + kValueSuffix; + restore_args_.m_tensor_version = tensor_name + kVersionSuffix; + restore_args_.m_tensor_freq = tensor_name + kFreqSuffix; + } else { + restore_args_.m_tensor_key = tensor_name + kIncrKeySuffix; + restore_args_.m_tensor_value = tensor_name + kIncrValueSuffix; + restore_args_.m_tensor_version = tensor_name + kIncrVersionSuffix; + restore_args_.m_tensor_freq = tensor_name + kIncrFreqSuffix; + } + + TensorShape key_shape, value_shape, version_shape, freq_shape; + + Status st = + reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_value, &value_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_version, + &version_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_key, + sizeof(K) * key_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_value, + sizeof(V) * value_shape.dim_size(0) * + value_shape.dim_size(1)); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_version, + sizeof(int64) * version_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_freq, &freq_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + freq_shape = version_shape; + } else { + return st; + } + } + st = reader_->LookupHeader(restore_args_.m_tensor_freq, + sizeof(int64) * freq_shape.dim_size(0)); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + restore_args_.m_has_freq = false; + } else { + return st; + } + } + restore_args_.m_old_dim = value_shape.dim_size(1); + + if (!restore_args_.m_is_oldform) { + TensorShape key_filter_shape, version_filter_shape, freq_filter_shape; + st = reader_->LookupTensorShape(restore_args_.m_tensor_key + "_filtered", + &key_filter_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + key_filter_shape = key_shape; + restore_args_.m_has_filter = false; + } else { + return st; + } + } + st = reader_->LookupTensorShape( + restore_args_.m_tensor_version + "_filtered", &version_filter_shape); + if ((!st.ok()) && (st.code() != error::NOT_FOUND)) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_key + "_filtered", + 
sizeof(K) * key_filter_shape.dim_size(0)); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + restore_args_.m_has_filter = false; + } else { + return st; + } + } + st = reader_->LookupHeader(restore_args_.m_tensor_version + "_filtered", + sizeof(K) * version_filter_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_freq + "_filtered", + &freq_filter_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + freq_filter_shape = freq_shape; + } else { + return st; + } + } + + st = reader_->LookupHeader(restore_args_.m_tensor_freq + "_filtered", + sizeof(K) * freq_filter_shape.dim_size(0)); + if (!st.ok() && st.code() != error::NOT_FOUND) { + return st; + } + } + return st; + } + + Status EVRestoreFeatures(int tot_key_num, int64 key_part_offset, + int64 value_part_offset, int64 version_part_offset, + int64 freq_part_offset, RestoreBuffer& restore_buff, + int64 new_dim, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device) { + size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; + size_t value_unit_bytes_new = sizeof(V) * new_dim; + int64 tot_key_bytes_read(0); + int64 tot_value_bytes_read(0); + int64 tot_version_bytes_read(0); + int64 tot_freq_bytes_read(0); + size_t key_bytes_read = 0; + size_t value_bytes_read = 0; + size_t version_bytes_read = 0; + size_t freq_bytes_read = 0; + + while (tot_key_num > 0) { + size_t read_key_num = std::min( + std::min(kBufferSize / sizeof(K), kBufferSize / value_unit_bytes), + kBufferSize / sizeof(int64)); + read_key_num = std::min(read_key_num, kBufferSize / value_unit_bytes_new); + read_key_num = std::min((int)read_key_num, tot_key_num); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_key, key_part_offset + tot_key_bytes_read, + read_key_num * sizeof(K), restore_buff.key_buffer, key_bytes_read); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_value, value_part_offset + tot_value_bytes_read, + read_key_num * value_unit_bytes, restore_buff.value_buffer, + value_bytes_read); + if (!restore_args_.m_reset_version) { + reader_->LookupSegmentOffset( + restore_args_.m_tensor_version, + version_part_offset + tot_version_bytes_read, + read_key_num * sizeof(int64), restore_buff.version_buffer, + version_bytes_read); + if (version_bytes_read == 0) { + memset(restore_buff.version_buffer, -1, sizeof(int64) * read_key_num); + } + } else { + int64* version_tmp = (int64*)restore_buff.version_buffer; + memset(version_tmp, 0, read_key_num * sizeof(int64)); + } + + if (restore_args_.m_has_freq) { + reader_->LookupSegmentOffset( + restore_args_.m_tensor_freq, freq_part_offset + tot_freq_bytes_read, + read_key_num * sizeof(int64), restore_buff.freq_buffer, + freq_bytes_read); + if (freq_bytes_read == 0) { + int64* freq_tmp = (int64*)restore_buff.freq_buffer; + for (int64 i = 0; i < read_key_num; i++) { + freq_tmp[i] = (ev_->MinFreq() == 0) ? 1 : ev_->MinFreq(); + } + } + } else { + int64* freq_tmp = (int64*)restore_buff.freq_buffer; + for (int64 i = 0; i < read_key_num; i++) { + freq_tmp[i] = (ev_->MinFreq() == 0) ? 
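// Illustrative sketch (not from the patch): the loop above streams the
// checkpoint through fixed kBufferSize (8MB) staging buffers, so each pass
// reads as many keys as the tightest buffer allows -- keys, int64
// versions/freqs, rows at the saved dim, and rows at the new dim.
// BatchKeys is a hypothetical helper restating that cap:
#include <algorithm>
#include <cstddef>
#include <cstdint>

inline size_t BatchKeys(size_t buf_bytes, size_t key_size,
                        size_t old_row_bytes, size_t new_row_bytes,
                        int64_t keys_left) {
  size_t n = std::min(buf_bytes / key_size, buf_bytes / sizeof(int64_t));
  n = std::min(n, buf_bytes / old_row_bytes);
  n = std::min(n, buf_bytes / new_row_bytes);
  return std::min(n, static_cast<size_t>(keys_left));
}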
1 : ev_->MinFreq(); + } + } + if (key_bytes_read > 0) { + read_key_num = key_bytes_read / sizeof(K); + Status st = RestoreCustomDim(new_dim, read_key_num, value_unit_bytes, + value_bytes_read, value_unit_bytes_new, + restore_buff); + if (!st.ok()) { + LOG(FATAL) << "EV Restore fail:" << st.ToString(); + } + + st = storage_->RestoreFeatures( + read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, + restore_args_.m_partition_num, new_dim, false, restore_args_.m_is_incr, + emb_config, device, + filter_, restore_buff); + if (!st.ok()) { + LOG(FATAL) << "EV Restore fail:" << st.ToString(); + } + } + + tot_key_num -= read_key_num; + tot_key_bytes_read += key_bytes_read; + tot_value_bytes_read += value_bytes_read; + tot_version_bytes_read += version_bytes_read; + tot_freq_bytes_read += freq_bytes_read; + } + + return Status::OK(); + } + + Status EVRestoreFilteredFeatures( + int64 subpart_id, int64 value_len, RestoreBuffer& restore_buff, + typename TTypes::Flat part_filter_offset_flat, + const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device) { + int subpart_filter_offset = part_filter_offset_flat(subpart_id); + int tot_key_filter_num = + part_filter_offset_flat(subpart_id + 1) - subpart_filter_offset; + int64 key_filter_part_offset = subpart_filter_offset * sizeof(K); + int64 version_filter_part_offset = subpart_filter_offset * sizeof(int64); + int64 freq_filter_part_offset = subpart_filter_offset * sizeof(int64); + + VLOG(1) << "key_filter_num: " << tot_key_filter_num + << ", subpart_filter_offset: " << subpart_filter_offset; + + size_t key_filter_bytes_read = 0; + size_t version_filter_bytes_read = 0; + size_t freq_filter_bytes_read = 0; + + while (tot_key_filter_num > 0) { + size_t read_key_num = + std::min(kBufferSize / sizeof(K), kBufferSize / sizeof(int64)); + read_key_num = std::min((int)read_key_num, tot_key_filter_num); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_key + "_filtered", + key_filter_part_offset + key_filter_bytes_read, + read_key_num * sizeof(K), restore_buff.key_buffer, + key_filter_bytes_read); + if (!restore_args_.m_reset_version) { + reader_->LookupSegmentOffset( + restore_args_.m_tensor_version + "_filtered", + version_filter_part_offset + version_filter_bytes_read, + read_key_num * sizeof(int64), restore_buff.version_buffer, + version_filter_bytes_read); + } else { + int64* version_tmp = (int64*)restore_buff.version_buffer; + memset(version_tmp, 0, read_key_num * sizeof(int64)); + } + reader_->LookupSegmentOffset( + restore_args_.m_tensor_freq + "_filtered", + freq_filter_part_offset + freq_filter_bytes_read, + read_key_num * sizeof(int64), restore_buff.freq_buffer, + freq_filter_bytes_read); + if (key_filter_bytes_read > 0) { + read_key_num = key_filter_bytes_read / sizeof(K); + VLOG(2) << "restore, read_key_num:" << read_key_num; + Status st = storage_->RestoreFeatures( + read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, + restore_args_.m_partition_num, value_len, true, restore_args_.m_is_incr, + emb_config, device, + filter_, restore_buff); + if (!st.ok()) return st; + tot_key_filter_num -= read_key_num; + } + } + return Status::OK(); + } + + Status RestoreCustomDim(int new_dim, int read_key_num, + size_t value_unit_bytes, size_t value_bytes_read, + size_t value_unit_bytes_new, + RestoreBuffer& restore_buff) { + bool restore_customDim; + TF_CHECK_OK(ReadBoolFromEnvVar("TF_EV_RESTORE_CUSTOM_DIM", false, + &restore_customDim)); + if (restore_customDim && ev_->IsUseHbm()) { + return errors::FailedPrecondition( + "HBM EV 
not and custom dim," + "are not supported used together"); + } + if (restore_customDim && restore_args_.m_old_dim != new_dim) { + VLOG(2) << "restore, read_value_reshape dim: from " + << restore_args_.m_old_dim << " to " << new_dim; + if (read_key_num * value_unit_bytes != value_bytes_read) { + return tensorflow::errors::FailedPrecondition( + "Expected read_key_num * value_unit_bytes == " + "value_bytes_read, but got read_key_num * value_unit_bytes " + "!= value_bytes_read!"); + } + + std::unique_ptr tmp_ptr(new char[kBufferSize]); + size_t read_once = std::min(value_unit_bytes, value_unit_bytes_new); + for (int i = 0; i < read_key_num; ++i) { + memcpy(tmp_ptr.get() + i * value_unit_bytes_new, + restore_buff.value_buffer + i * value_unit_bytes, read_once); + if (restore_args_.m_old_dim >= new_dim) continue; + auto p = ev_->GetDefaultValue(restore_args_.m_idx++); + memcpy(tmp_ptr.get() + i * value_unit_bytes_new + value_unit_bytes, + p + value_unit_bytes, value_unit_bytes_new - value_unit_bytes); + } + auto tmp = tmp_ptr.release(); + tmp_ptr.reset(restore_buff.value_buffer); + restore_buff.value_buffer = tmp; + } + return Status::OK(); + } + + private: + embedding::Storage* storage_; + EmbeddingVar* ev_; + FilterPolicy>* filter_; + BundleReader* reader_; + RestoreArgs restore_args_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ diff --git a/tensorflow/core/framework/embedding/filter_policy.h b/tensorflow/core/framework/embedding/filter_policy.h index 51dddba3e9a..559a6796246 100644 --- a/tensorflow/core/framework/embedding/filter_policy.h +++ b/tensorflow/core/framework/embedding/filter_policy.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_POLICY_H_ #include "tensorflow/core/framework/embedding/embedding_config.h" +#include "tensorflow/core/framework/embedding/emb_file.h" namespace tensorflow { @@ -26,6 +27,13 @@ struct RestoreBuffer { char* version_buffer = nullptr; char* freq_buffer = nullptr; + explicit RestoreBuffer(size_t buffer_size) { + key_buffer = new char[buffer_size]; + value_buffer = new char[buffer_size]; + version_buffer = new char[buffer_size]; + freq_buffer = new char[buffer_size]; + } + ~RestoreBuffer() { delete []key_buffer; delete []value_buffer; @@ -34,9 +42,18 @@ struct RestoreBuffer { } }; +template +class RestoreSSDBuffer; + +template +class ValuePtr; + template class FilterPolicy { public: + FilterPolicy(const EmbeddingConfig& config, EV* ev) : + config_(config), ev_(ev) {} + virtual void LookupOrCreate(K key, V* val, const V* default_value_ptr, ValuePtr** value_ptr, int count, const V* default_value_no_permission) = 0; @@ -61,22 +78,48 @@ class FilterPolicy { bool* is_filter, int64 count) = 0; virtual int64 GetFreq(K key, ValuePtr* value_ptr) = 0; + virtual int64 GetFreq(K key) = 0; - virtual Status Import(RestoreBuffer& restore_buff, - int64 key_num, - int bucket_num, - int64 partition_id, - int64 partition_num, - bool is_filter) = 0; - virtual Status ImportToDram(RestoreBuffer& restore_buff, - int64 key_num, - int bucket_num, - int64 partition_id, - int64 partition_num, - bool is_filter, - V* default_values) = 0; virtual bool is_admit(K key, ValuePtr* value_ptr) = 0; + + virtual Status Restore(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool to_dram, bool is_incr, RestoreBuffer& restore_buff) = 0; + + protected: + void LookupOrCreateEmbInternal(bool is_filter, bool to_dram, + int i, int 
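// Illustrative sketch (not from the patch): per row, the buffer rewrite in
// RestoreCustomDim above copies the overlapping prefix of the saved
// embedding and, when the dim grew, pads the tail from that key's
// default-value row. ResizeRow is a hypothetical per-row restatement:
#include <algorithm>
#include <cstring>

inline void ResizeRow(const float* saved, int old_dim, float* out,
                      int new_dim, const float* default_row) {
  int copy = std::min(old_dim, new_dim);
  std::memcpy(out, saved, copy * sizeof(float));
  if (new_dim > old_dim)  // grown: the tail comes from the default value
    std::memcpy(out + old_dim, default_row + old_dim,
                (new_dim - old_dim) * sizeof(float));
}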
value_len, + ValuePtr* value_ptr, + V* value_src, K* key_src) { + + if (!is_filter) { + ev_->LookupOrCreateEmb(value_ptr, value_src + i * ev_->ValueLen()); + return; + } else { + if (to_dram) { +#if GOOGLE_CUDA + std::vector default_value_host; + default_value_host.resize(config_.default_value_dim * value_len); + cudaMemcpy(default_value_host.data(), ev_->GetDefaultValuePtr(), + sizeof(V) * config_.default_value_dim * value_len, + cudaMemcpyDeviceToHost); + ev_->LookupOrCreateEmb(value_ptr, + default_value_host.data() + + (key_src[i] % config_.default_value_dim) + * ev_->ValueLen()); +#endif + return; + } else { + ev_->LookupOrCreateEmb(value_ptr, ev_->GetDefaultValue(key_src[i])); + return; + } + } + } + + protected: + EmbeddingConfig config_; + EV* ev_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h index 6210e27ab16..72a3ef4483c 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h @@ -8,6 +8,7 @@ #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" #include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" namespace tensorflow { using se::DeviceMemoryBase; @@ -16,6 +17,9 @@ using se::Stream; template class ValuePtr; +template +class CheckpointLoader; + void SyncWithEventMgr(se::Stream* stream, EventMgr* event_mgr); namespace embedding { @@ -131,15 +135,13 @@ class HbmDramSsdStorage : public MultiTierStorage { } void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len) override { - hbm_->Insert(key, value_ptr, alloc_len); - } - - void InsertToDram(K key, ValuePtr** value_ptr, - int64 alloc_len) override { - dram_->Insert(key, value_ptr, alloc_len); + size_t alloc_len, bool to_dram = false) override { + if (to_dram) { + dram_->Insert(key, value_ptr, alloc_len); + } else { + hbm_->Insert(key, value_ptr, alloc_len); + } } - Status GetOrCreate(K key, ValuePtr** value_ptr, size_t size) override { Status s = hbm_->Get(key, value_ptr); @@ -193,79 +195,6 @@ class HbmDramSsdStorage : public MultiTierStorage { dram_cache_ = new LRUCache(); } - void ImportToHbm( - K* ids, int64 size, int64 value_len, int64 emb_index) override { - V* memcpy_buffer_cpu = new V[size * value_len]; - V** value_address = new V*[size]; - V* memcpy_buffer_gpu = - (V*)gpu_alloc_->AllocateRaw( - Allocator::kAllocatorAlignment, - size * value_len * sizeof(V)); - V* dev_value_address = - (V*)gpu_alloc_->AllocateRaw( - Allocator::kAllocatorAlignment, - size * sizeof(V*)); - ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; - ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; - { - //Mutex with other Import Ops - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < size; i++) { - dram_->Get(ids[i], &cpu_value_ptrs[i]); - gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - gpu_value_ptrs[i]->SetPtr(val_ptr); - memcpy((char *)gpu_value_ptrs[i]->GetPtr(), - (char *)cpu_value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader)); - } - } - //Split from above for loop for minize the cost of mutex lock - //TODO: Speed up with intra parallelism - std::vector*> invalid_value_ptrs; - for (int64 i = 0; i < size; i++) { - memcpy(memcpy_buffer_cpu + i * value_len, - cpu_value_ptrs[i]->GetValue(emb_index, - Storage::GetOffset(emb_index)), value_len * sizeof(V)); - Status s = 
hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); - if (!s.ok()) { - invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); - hbm_->Get(ids[i], &gpu_value_ptrs[i]); - } - gpu_value_ptrs[i]->SetInitialized(emb_index); - value_address[i] = gpu_value_ptrs[i]->GetValue( - emb_index, Storage::GetOffset(emb_index)); - } - cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, - size * value_len * sizeof(V), cudaMemcpyHostToDevice); - cudaMemcpy(dev_value_address, value_address, - size * sizeof(V*), cudaMemcpyHostToDevice); - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate(invalid_value_ptrs); - } - int block_dim = 128; - void* args[] = { - (void*)&dev_value_address, - (void*)&memcpy_buffer_gpu, - (void*)&value_len, - (void*)&size}; - - cudaLaunchKernel( - (void *)BatchUnpack, - (size + block_dim - 1) / block_dim * value_len, - block_dim, - args, 0, NULL); - cudaDeviceSynchronize(); - - delete[] memcpy_buffer_cpu; - delete[] cpu_value_ptrs; - delete[] gpu_value_ptrs; - delete[] value_address; - gpu_alloc_->DeallocateRaw(dev_value_address); - gpu_alloc_->DeallocateRaw(memcpy_buffer_gpu); - } - void CopyEmbeddingsFromCPUToGPU( int total, const K* keys, const std::list& copyback_cursor, @@ -373,12 +302,6 @@ class HbmDramSsdStorage : public MultiTierStorage { return false; } - bool IsUsePersistentStorage() override { - /*The return value is set to false temporarily, - because the corresponding interface is not implemented.*/ - return false; - } - void iterator_mutex_lock() override { ssd_->get_mutex()->lock(); } @@ -543,7 +466,126 @@ class HbmDramSsdStorage : public MultiTierStorage { cpu_ptr->GetPtr(), sizeof(FixedLengthHeader)); } + + void Restore(const std::string& name_string, + const std::string& file_name_string, + int64 partition_id, int64 partition_num, + int64 value_len, bool is_incr, bool reset_version, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + BundleReader* reader, EmbeddingVar* ev, + FilterPolicy>* filter) override { + + CheckpointLoader restorer(reinterpret_cast*>(this), ev, + filter, name_string, file_name_string, + partition_id, partition_num, + is_incr, reset_version, reader); + restorer.RestoreCkpt(emb_config, device); + + int64 num_of_hbm_ids = + std::min(MultiTierStorage::cache_capacity_, + (int64)MultiTierStorage::cache_->size()); + if (num_of_hbm_ids > 0) { + K* hbm_ids = new K[num_of_hbm_ids]; + int64* hbm_freqs = new int64[num_of_hbm_ids]; + int64* hbm_versions = nullptr; + MultiTierStorage::cache_->get_cached_ids(hbm_ids, num_of_hbm_ids, + hbm_versions, hbm_freqs); + ImportToHbm(hbm_ids, num_of_hbm_ids, value_len, emb_config.emb_index); + MultiTierStorage::cache_thread_pool_->Schedule( + [this, hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs]() { + MultiTierStorage::cache_->update(hbm_ids, num_of_hbm_ids, + hbm_versions, hbm_freqs); + delete[] hbm_ids; + delete[] hbm_freqs; + }); + } + } + + Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool is_incr, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) override { + Status s = filter->Restore(key_num, bucket_num, partition_id, + partition_num, value_len, is_filter, + true/*to_dram*/, is_incr, restore_buff); + + MultiTierStorage::cache_->update((K*)restore_buff.key_buffer, key_num, + (int64*)restore_buff.version_buffer, + (int64*)restore_buff.freq_buffer); + return s; + } private: + void ImportToHbm(K* ids, int64 size, int64 value_len, 
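// Illustrative note (not from the patch): restore on the HBM-backed tiers
// is now two-phase. Phase 1: CheckpointLoader lands every restored id in
// DRAM (RestoreFeatures above passes to_dram=true to filter->Restore) and
// feeds the restored versions/freqs into the cache. Phase 2: Restore()
// asks the cache for the hottest ids -- at most cache_capacity_ of them --
// copies them into HBM in one batch via ImportToHbm(), and finishes the
// cache bookkeeping on the background thread pool.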
int64 emb_index) { + V* memcpy_buffer_cpu = new V[size * value_len]; + V** value_address = new V*[size]; + V* memcpy_buffer_gpu = + (V*)gpu_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, + size * value_len * sizeof(V)); + V* dev_value_address = + (V*)gpu_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, + size * sizeof(V*)); + ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; + ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; + { + //Mutex with other Import Ops + mutex_lock l(memory_pool_mu_); + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); + V* val_ptr = embedding_mem_pool_->Allocate(); + gpu_value_ptrs[i]->SetPtr(val_ptr); + memcpy((char *)gpu_value_ptrs[i]->GetPtr(), + (char *)cpu_value_ptrs[i]->GetPtr(), + sizeof(FixedLengthHeader)); + } + } + //Split from above for loop for minize the cost of mutex lock + //TODO: Speed up with intra parallelism + std::vector*> invalid_value_ptrs; + for (int64 i = 0; i < size; i++) { + memcpy(memcpy_buffer_cpu + i * value_len, + cpu_value_ptrs[i]->GetValue(emb_index, + Storage::GetOffset(emb_index)), + value_len * sizeof(V)); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); + hbm_->Get(ids[i], &gpu_value_ptrs[i]); + } + gpu_value_ptrs[i]->SetInitialized(emb_index); + value_address[i] = gpu_value_ptrs[i]->GetValue( + emb_index, Storage::GetOffset(emb_index)); + } + cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, + size * value_len * sizeof(V), cudaMemcpyHostToDevice); + cudaMemcpy(dev_value_address, value_address, + size * sizeof(V*), cudaMemcpyHostToDevice); + { + mutex_lock l(memory_pool_mu_); + embedding_mem_pool_->Deallocate(invalid_value_ptrs); + } + int block_dim = 128; + void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, + (void*)&value_len, (void*)&size}; + + cudaLaunchKernel((void *)BatchUnpack, + (size + block_dim - 1) / block_dim * value_len, + block_dim, args, 0, NULL); + cudaDeviceSynchronize(); + + delete[] memcpy_buffer_cpu; + delete[] cpu_value_ptrs; + delete[] gpu_value_ptrs; + delete[] value_address; + gpu_alloc_->DeallocateRaw(dev_value_address); + gpu_alloc_->DeallocateRaw(memcpy_buffer_gpu); + } + void BatchGetValuePtrs( const EmbeddingVarContext& ctx, const K* keys, diff --git a/tensorflow/core/framework/embedding/hbm_dram_storage.h b/tensorflow/core/framework/embedding/hbm_dram_storage.h index ed7d197555f..ce8e9a91643 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_storage.h @@ -23,6 +23,7 @@ limitations under the License. 
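// Illustrative sketch (not from the patch): ImportToHbm above is batched on
// purpose -- one cudaMemcpy for all packed rows, one for the per-row
// destination pointers, then a kernel scatters row i from the staging
// buffer into value_address[i]. BatchUnpack is assumed to be the
// equivalent of this minimal kernel:
template <typename V>
__global__ void BatchUnpackSketch(V** dst, const V* src, int value_len,
                                  int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per element
  if (idx < n * value_len) {
    dst[idx / value_len][idx % value_len] = src[idx];
  }
}
// The launch shape matches: block_dim = 128 threads per block and
// (size + block_dim - 1) / block_dim * value_len blocks cover all
// size * value_len elements.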
#include "tensorflow/core/framework/embedding/hbm_storage_iterator.h" #include "tensorflow/core/framework/embedding/intra_thread_copy_id_allocator.h" #include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" namespace tensorflow { using se::DeviceMemoryBase; @@ -31,6 +32,9 @@ using se::Stream; template class ValuePtr; +template +class CheckpointLoader; + void SyncWithEventMgr(se::Stream* stream, EventMgr* event_mgr); namespace embedding { @@ -38,12 +42,15 @@ template class HbmDramStorage : public MultiTierStorage { public: HbmDramStorage(const StorageConfig& sc, Allocator* gpu_alloc, - Allocator* cpu_alloc, LayoutCreator* lc, const std::string& name) - : gpu_alloc_(gpu_alloc), - MultiTierStorage(sc, name) { + Allocator* cpu_alloc, LayoutCreator* lc, + const std::string& name) + : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name) { hbm_ = new HbmStorageWithCpuKv(sc, gpu_alloc, lc); - dram_ = new DramStorage(sc, cpu_alloc, lc, - new LocklessHashMapCPU(gpu_alloc)); + StorageConfig storage_config = StorageConfig(); + storage_config.layout_type = LayoutType::NORMAL_CONTIGUOUS; + dram_ = new DramStorage(sc, cpu_alloc, + LayoutCreatorFactory::Create(storage_config), + new LocklessHashMapCPU(gpu_alloc)); } ~HbmDramStorage() override { @@ -111,15 +118,13 @@ class HbmDramStorage : public MultiTierStorage { } void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len) override { - hbm_->Insert(key, value_ptr, alloc_len); - } - - void InsertToDram(K key, ValuePtr** value_ptr, - int64 alloc_len) override { - dram_->Insert(key, value_ptr, alloc_len); + size_t alloc_len, bool to_dram = false) override { + if (to_dram) { + dram_->Insert(key, value_ptr, alloc_len); + } else { + hbm_->Insert(key, value_ptr, alloc_len); + } } - Status GetOrCreate(K key, ValuePtr** value_ptr, size_t size) override { Status s = hbm_->Get(key, value_ptr); @@ -163,79 +168,6 @@ class HbmDramStorage : public MultiTierStorage { return Status::OK(); } - void ImportToHbm( - K* ids, int64 size, int64 value_len, int64 emb_index) override { - V* memcpy_buffer_cpu = new V[size * value_len]; - V** value_address = new V*[size]; - V* memcpy_buffer_gpu = - (V*)gpu_alloc_->AllocateRaw( - Allocator::kAllocatorAlignment, - size * value_len * sizeof(V)); - V* dev_value_address = - (V*)gpu_alloc_->AllocateRaw( - Allocator::kAllocatorAlignment, - size * sizeof(V*)); - ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; - ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; - { - //Mutex with other Import Ops - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < size; i++) { - dram_->Get(ids[i], &cpu_value_ptrs[i]); - gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - gpu_value_ptrs[i]->SetPtr(val_ptr); - memcpy((char *)gpu_value_ptrs[i]->GetPtr(), - (char *)cpu_value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader)); - } - } - //Split from above for loop for minize the cost of mutex lock - //TODO: Speed up with intra parallelism - std::vector*> invalid_value_ptrs; - for (int64 i = 0; i < size; i++) { - memcpy(memcpy_buffer_cpu + i * value_len, - cpu_value_ptrs[i]->GetValue(emb_index, - Storage::GetOffset(emb_index)), value_len * sizeof(V)); - Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); - if (!s.ok()) { - invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); - hbm_->Get(ids[i], &gpu_value_ptrs[i]); - } - gpu_value_ptrs[i]->SetInitialized(emb_index); - value_address[i] = gpu_value_ptrs[i]->GetValue( - emb_index, 
Storage::GetOffset(emb_index)); - } - cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, - size * value_len * sizeof(V), cudaMemcpyHostToDevice); - cudaMemcpy(dev_value_address, value_address, - size * sizeof(V*), cudaMemcpyHostToDevice); - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate(invalid_value_ptrs); - } - int block_dim = 128; - void* args[] = { - (void*)&dev_value_address, - (void*)&memcpy_buffer_gpu, - (void*)&value_len, - (void*)&size}; - - cudaLaunchKernel( - (void *)BatchUnpack, - (size + block_dim - 1) / block_dim * value_len, - block_dim, - args, 0, NULL); - cudaDeviceSynchronize(); - - delete[] memcpy_buffer_cpu; - delete[] cpu_value_ptrs; - delete[] gpu_value_ptrs; - delete[] value_address; - gpu_alloc_->DeallocateRaw(dev_value_address); - gpu_alloc_->DeallocateRaw(memcpy_buffer_gpu); - } - void CopyEmbeddingsFromCPUToGPU( int total, const K* keys, const std::list& copyback_cursor, @@ -329,10 +261,6 @@ class HbmDramStorage : public MultiTierStorage { return false; } - bool IsUsePersistentStorage() override { - return false; - } - void iterator_mutex_lock() override { return; } @@ -466,7 +394,59 @@ class HbmDramStorage : public MultiTierStorage { } } + void Restore(const std::string& name_string, + const std::string& file_name_string, + int64 partition_id, int64 partition_num, + int64 value_len, bool is_incr, bool reset_version, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + BundleReader* reader, EmbeddingVar* ev, + FilterPolicy>* filter) override { + + CheckpointLoader restorer(reinterpret_cast*>(this), + ev, filter, name_string, file_name_string, + partition_id, partition_num, + is_incr, reset_version, reader); + + restorer.RestoreCkpt(emb_config, device); + + int64 num_of_hbm_ids = + std::min(MultiTierStorage::cache_capacity_, + (int64)MultiTierStorage::cache_->size()); + if (num_of_hbm_ids > 0) { + K* hbm_ids = new K[num_of_hbm_ids]; + int64* hbm_freqs = new int64[num_of_hbm_ids]; + int64* hbm_versions = nullptr; + MultiTierStorage::cache_->get_cached_ids(hbm_ids, num_of_hbm_ids, + hbm_versions, hbm_freqs); + ImportToHbm(hbm_ids, num_of_hbm_ids, value_len, emb_config.emb_index); + MultiTierStorage::cache_thread_pool_->Schedule( + [this, hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs]() { + MultiTierStorage::cache_->update(hbm_ids, num_of_hbm_ids, + hbm_versions, hbm_freqs); + delete[] hbm_ids; + delete[] hbm_freqs; + }); + } + } + protected: + Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool is_incr, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) override { + Status s = filter->Restore(key_num, bucket_num, partition_id, + partition_num, value_len, is_filter, + true/*to_dram*/, is_incr, restore_buff); + + MultiTierStorage::cache_->update((K*)restore_buff.key_buffer, key_num, + (int64*)restore_buff.version_buffer, + (int64*)restore_buff.freq_buffer); + return s; + } + void SetTotalDims(int64 total_dims) override { dram_->SetTotalDims(total_dims); } @@ -664,6 +644,73 @@ class HbmDramStorage : public MultiTierStorage { *value_ptr = reinterpret_cast*>(tmp); } + void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { + V* memcpy_buffer_cpu = new V[size * value_len]; + V** value_address = new V*[size]; + V* memcpy_buffer_gpu = + (V*)gpu_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, + size * value_len * sizeof(V)); + V* dev_value_address = + 
(V*)gpu_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, + size * sizeof(V*)); + ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; + ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; + { + //Mutex with other Import Ops + mutex_lock l(memory_pool_mu_); + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); + V* val_ptr = embedding_mem_pool_->Allocate(); + gpu_value_ptrs[i]->SetPtr(val_ptr); + memcpy((char *)gpu_value_ptrs[i]->GetPtr(), + (char *)cpu_value_ptrs[i]->GetPtr(), + sizeof(FixedLengthHeader)); + } + } + //Split from above for loop for minize the cost of mutex lock + //TODO: Speed up with intra parallelism + std::vector*> invalid_value_ptrs; + for (int64 i = 0; i < size; i++) { + memcpy(memcpy_buffer_cpu + i * value_len, + cpu_value_ptrs[i]->GetValue(emb_index, + Storage::GetOffset(emb_index)), value_len * sizeof(V)); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); + hbm_->Get(ids[i], &gpu_value_ptrs[i]); + } + gpu_value_ptrs[i]->SetInitialized(emb_index); + value_address[i] = gpu_value_ptrs[i]->GetValue( + emb_index, Storage::GetOffset(emb_index)); + } + cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, + size * value_len * sizeof(V), cudaMemcpyHostToDevice); + cudaMemcpy(dev_value_address, value_address, + size * sizeof(V*), cudaMemcpyHostToDevice); + { + mutex_lock l(memory_pool_mu_); + embedding_mem_pool_->Deallocate(invalid_value_ptrs); + } + int block_dim = 128; + void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, + (void*)&value_len, (void*)&size}; + + cudaLaunchKernel((void *)BatchUnpack, + (size + block_dim - 1) / block_dim * value_len, + block_dim, args, 0, NULL); + cudaDeviceSynchronize(); + + delete[] memcpy_buffer_cpu; + delete[] cpu_value_ptrs; + delete[] gpu_value_ptrs; + delete[] value_address; + gpu_alloc_->DeallocateRaw(dev_value_address); + gpu_alloc_->DeallocateRaw(memcpy_buffer_gpu); + } + private: HbmStorageWithCpuKv* hbm_ = nullptr; DramStorage* dram_ = nullptr; diff --git a/tensorflow/core/framework/embedding/hbm_storage_iterator.h b/tensorflow/core/framework/embedding/hbm_storage_iterator.h index 3fa74d67237..4831b940bb8 100644 --- a/tensorflow/core/framework/embedding/hbm_storage_iterator.h +++ b/tensorflow/core/framework/embedding/hbm_storage_iterator.h @@ -25,6 +25,10 @@ class ValuePtr; namespace embedding { class Iterator; +namespace { + const int kSavedPartitionNum = 1000; +} + template class PartitionedCheckpointData { public: diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index 277c20157cd..ac82f3911fb 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/config.pb.h" #include "tensorflow/core/framework/embedding/cpu_hash_map_kv.h" #include "tensorflow/core/framework/embedding/embedding_var_context.h" +#include "tensorflow/core/framework/embedding/embedding_var_restore.h" #include "tensorflow/core/framework/embedding/eviction_manager.h" #include "tensorflow/core/framework/embedding/globalstep_shrink_policy.h" #include "tensorflow/core/framework/embedding/kv_interface.h" @@ -79,11 +80,6 @@ class MultiTierStorage : public Storage { return cache_; } - void InsertToDram(K key, ValuePtr** value_ptr, - int64 alloc_len) override { - LOG(FATAL)<<"InsertToDram in MultiTierStorage shouldn't be called"; - } - void InitCache(embedding::CacheStrategy cache_strategy) override { cache_ = CacheFactory::Create(cache_strategy, name_); eviction_manager_ = EvictionManagerCreator::Create(); @@ -182,22 +178,6 @@ class MultiTierStorage : public Storage { return; } - void RestoreSsdHashmap( - K* key_list, int64* key_file_id_list, - int64* key_offset_list, int64 num_of_keys, - int64* file_list, int64* invalid_record_count_list, - int64* record_count_list, int64 num_of_files, - const std::string& ssd_emb_file_name) override { - LOG(FATAL)<<"The Storage dosen't have ssd storage" - <<" or this storage hasn't suppported" - <<" RestoreSsdHashmap yet"; - } - - void ImportToHbm( - K* ids, int64 size, int64 value_len, int64 emb_index) override { - LOG(FATAL)<<"This Storage dosen't have a HBM storage."; - } - bool IsMultiLevel() override { return true; } @@ -257,6 +237,10 @@ class MultiTierStorage : public Storage { }); } + virtual bool IsUseHbm() override { + return false; + } + void AddToCachePrefetchList(const Tensor& indices) override { Schedule([this, indices]() { cache_->add_to_prefetch_list(indices); @@ -270,6 +254,37 @@ class MultiTierStorage : public Storage { } protected: + Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool is_incr, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) override { + Status s = filter->Restore(key_num, bucket_num, partition_id, + partition_num, value_len, is_filter, + false/*to_dram*/, is_incr, restore_buff); + + if (emb_config.is_primary()) { + K* key_buff = (K*)restore_buff.key_buffer; + V* value_buff = (V*)restore_buff.value_buffer; + int64* version_buff = (int64*)restore_buff.version_buffer; + int64* freq_buff = (int64*)restore_buff.freq_buffer; + if (cache_) { + cache_->update(key_buff, key_num, version_buff, freq_buff); + auto cache_size = CacheSize(); + if (cache_->size() > cache_size) { + int64 evict_size = cache_->size() - cache_size; + std::vector evict_ids(evict_size); + size_t true_size = + cache_->get_evic_ids(evict_ids.data(), evict_size); + Eviction(evict_ids.data(), true_size); + } + } + return s; + } + return s; + } + virtual void SetTotalDims(int64 total_dims) = 0; void DeleteFromEvictionManager() { diff --git a/tensorflow/core/framework/embedding/nullable_filter_policy.h b/tensorflow/core/framework/embedding/nullable_filter_policy.h index 34a9976a4f1..0c5ce80886a 100644 --- a/tensorflow/core/framework/embedding/nullable_filter_policy.h +++ b/tensorflow/core/framework/embedding/nullable_filter_policy.h @@ -28,11 +28,14 @@ class Storage; template class NullableFilterPolicy : public FilterPolicy { + using FilterPolicy::ev_; + using FilterPolicy::config_; + using FilterPolicy::LookupOrCreateEmbInternal; + public: 
   NullableFilterPolicy(const EmbeddingConfig& config,
-      EV* ev, embedding::Storage<K, V>* storage)
-      : config_(config), ev_(ev), storage_(storage) {
-  }
+      EV* ev, embedding::Storage<K, V>* storage) :
+      FilterPolicy<K, V, EV>(config, ev), storage_(storage) {}
 
   Status Lookup(K key, V* val, const V* default_value_ptr,
       const V* default_value_no_permission) override {
@@ -143,53 +146,10 @@ class NullableFilterPolicy : public FilterPolicy<K, V, EV> {
     }
   }
 
-  Status Import(RestoreBuffer& restore_buff,
-                int64 key_num,
-                int bucket_num,
-                int64 partition_id,
-                int64 partition_num,
-                bool is_filter) override {
-    K* key_buff = (K*)restore_buff.key_buffer;
-    V* value_buff = (V*)restore_buff.value_buffer;
-    int64* version_buff = (int64*)restore_buff.version_buffer;
-    int64* freq_buff = (int64*)restore_buff.freq_buffer;
-    for (auto i = 0; i < key_num; ++i) {
-      // this can describe by graph(Mod + DynamicPartition),
-      // but memory waste and slow
-      if (*(key_buff + i) % bucket_num % partition_num != partition_id) {
-        LOG(INFO) << "skip EV key:" << *(key_buff + i);
-        continue;
-      }
-      ValuePtr<V>* value_ptr = nullptr;
-      ev_->CreateKey(key_buff[i], &value_ptr);
-      if (config_.filter_freq !=0 || ev_->IsMultiLevel()
-          || config_.record_freq) {
-        value_ptr->SetFreq(freq_buff[i]);
-      }
-      if (config_.steps_to_live != 0 || config_.record_version) {
-        value_ptr->SetStep(version_buff[i]);
-      }
-      if (!is_filter) {
-        ev_->LookupOrCreateEmb(value_ptr,
-            value_buff + i * ev_->ValueLen());
-      }else {
-        ev_->LookupOrCreateEmb(value_ptr,
-            ev_->GetDefaultValue(key_buff[i]));
-      }
-    }
-    if (ev_->IsMultiLevel() && !ev_->IsUseHbm() && config_.is_primary()) {
-      ev_->UpdateCache(key_buff, key_num, version_buff, freq_buff);
-    }
-    return Status::OK();
-  }
-
-  Status ImportToDram(RestoreBuffer& restore_buff,
-                int64 key_num,
-                int bucket_num,
-                int64 partition_id,
-                int64 partition_num,
-                bool is_filter,
-                V* default_values) override {
+  Status Restore(int64 key_num, int bucket_num, int64 partition_id,
+                 int64 partition_num, int64 value_len, bool is_filter,
+                 bool to_dram, bool is_incr,
+                 RestoreBuffer& restore_buff) override {
     K* key_buff = (K*)restore_buff.key_buffer;
     V* value_buff = (V*)restore_buff.value_buffer;
     int64* version_buff = (int64*)restore_buff.version_buffer;
@@ -202,7 +162,7 @@ class NullableFilterPolicy : public FilterPolicy<K, V, EV> {
         continue;
       }
       ValuePtr<V>* value_ptr = nullptr;
-      ev_->CreateKeyOnDram(key_buff[i], &value_ptr);
+      ev_->CreateKey(key_buff[i], &value_ptr, to_dram);
       if (config_.filter_freq !=0 || ev_->IsMultiLevel()
           || config_.record_freq) {
         value_ptr->SetFreq(freq_buff[i]);
@@ -210,16 +170,8 @@ class NullableFilterPolicy : public FilterPolicy<K, V, EV> {
       if (config_.steps_to_live != 0 || config_.record_version) {
         value_ptr->SetStep(version_buff[i]);
       }
-      if (!is_filter) {
-        ev_->LookupOrCreateEmb(value_ptr,
-            value_buff + i * ev_->ValueLen(), ev_allocator());
-      } else {
-        ev_->LookupOrCreateEmb(value_ptr,
-            default_values +
-                (key_buff[i] % config_.default_value_dim)
-                 * ev_->ValueLen(),
-            ev_allocator());
-      }
+      LookupOrCreateEmbInternal(is_filter, to_dram, i, value_len,
+                                value_ptr, value_buff, key_buff);
     }
     return Status::OK();
   }
@@ -229,9 +181,7 @@ class NullableFilterPolicy : public FilterPolicy<K, V, EV> {
   }
 
  private:
-  EmbeddingConfig config_;
   embedding::Storage<K, V>* storage_;
-  EV* ev_;
 };
 
 } // tensorflow
diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h
index 2ebc4e3dc40..54bf1f76c14 100644
--- a/tensorflow/core/framework/embedding/single_tier_storage.h
+++ b/tensorflow/core/framework/embedding/single_tier_storage.h
@@ -110,7 +110,7 @@ class SingleTierStorage : public Storage<K, V> {
   }
 
   virtual void Insert(K key, ValuePtr<V>** value_ptr,
-      size_t alloc_len) override {
+      size_t alloc_len, bool to_dram = false) override {
     do {
       *value_ptr = layout_creator_->Create(alloc_, alloc_len);
       Status s = kv_->Insert(key, *value_ptr);
@@ -127,11 +127,6 @@ class SingleTierStorage : public Storage<K, V> {
     LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in SingleTireStorage.";
   }
 
-  void InsertToDram(K key, ValuePtr<V>** value_ptr,
-      int64 alloc_len) override {
-    LOG(FATAL)<<"InsertToDram in SingleTierStorage shouldn't be called";
-  }
-
   Status GetOrCreate(K key, ValuePtr<V>** value_ptr,
       size_t size) override {
     Status s = kv_->Lookup(key, value_ptr);
@@ -304,20 +299,6 @@ class SingleTierStorage : public Storage<K, V> {
     return nullptr;
   }
 
-  void RestoreSsdHashmap(
-      K* key_list, int64* key_file_id_list,
-      int64* key_offset_list, int64 num_of_keys,
-      int64* file_list, int64* invalid_record_count_list,
-      int64* record_count_list, int64 num_of_files,
-      const std::string& ssd_emb_file_name) override {
-    LOG(FATAL)<<"The Storage dosen't have ssd storage.";
-  }
-
-  virtual void ImportToHbm (
-      K* ids, int64 size, int64 value_len, int64 emb_index) override {
-    LOG(FATAL)<<"This Storage dosen't have a HBM storage.";
-  }
-
   Status Shrink(const ShrinkArgs& shrink_args) override {
     mutex_lock l(Storage<K, V>::mu_);
     shrink_policy_->Shrink(shrink_args);
@@ -350,10 +331,6 @@ class SingleTierStorage : public Storage<K, V> {
     return false;
   }
 
-  bool IsUsePersistentStorage() override {
-    return false;
-  }
-
   void iterator_mutex_lock() override {
     return;
   }
@@ -377,7 +354,18 @@ class SingleTierStorage : public Storage<K, V> {
     value_ptr->Destroy(alloc_);
     delete value_ptr;
   }
-
+ protected:
+  virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id,
+                                 int64 partition_num, int64 value_len, bool is_filter,
+                                 bool is_incr, const EmbeddingConfig& emb_config,
+                                 const Eigen::GpuDevice* device,
+                                 FilterPolicy<K, V, EmbeddingVar<K, V>>* filter,
+                                 RestoreBuffer& restore_buff) override {
+    Status s = filter->Restore(key_num, bucket_num, partition_id,
+                               partition_num, value_len, is_filter,
+                               false/*to_dram*/, is_incr, restore_buff);
+    return s;
+  }
 protected:
   KVInterface<K, V>* kv_;
   ShrinkPolicy<K, V>* shrink_policy_;
@@ -446,8 +434,10 @@ class HbmStorage : public SingleTierStorage<K, V> {
   void BatchLookupOrCreate(const K* key, V* val, V* default_v,
       int32 default_v_num, size_t n,
       const Eigen::GpuDevice& device) override {
-    SingleTierStorage<K, V>::kv_->BatchLookupOrCreate(key, val, default_v, default_v_num,
-        n, device);
+    SingleTierStorage<K, V>::kv_->BatchLookupOrCreate(key, val,
+                                                      default_v,
+                                                      default_v_num,
+                                                      n, device);
   }
 
   void BatchLookupOrCreateKeys(const K* key, int32* item_idxs, size_t n,
@@ -473,20 +463,37 @@ class HbmStorage : public SingleTierStorage<K, V> {
     return key_list->size();
   }
 
-  void ImportToHbm(
-      const std::vector<K>& keys, const std::vector<V>& values,
-      const Eigen::GpuDevice* device,
-      const EmbeddingConfig& emb_config) override {
-    GPUHashMapKV<K, V>* gpu_kv =
-        dynamic_cast<GPUHashMapKV<K, V>*>(SingleTierStorage<K, V>::kv_);
-    gpu_kv->Import(keys, values, device, emb_config);
-  }
-
   GPUHashTable<K, V>* HashTable() override {
     return SingleTierStorage<K, V>::kv_->HashTable();
   }
-
 protected:
+  Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id,
+                         int64 partition_num, int64 value_len, bool is_filter,
+                         bool is_incr, const EmbeddingConfig& emb_config,
+                         const Eigen::GpuDevice* device,
+                         FilterPolicy<K, V, EmbeddingVar<K, V>>* filter,
+                         RestoreBuffer& restore_buff) override {
+    K* key_buff = (K*)restore_buff.key_buffer;
+    V* value_buff = (V*)restore_buff.value_buffer;
+    std::vector<K> key_import;
+    std::vector<V> value_import;
+    for (auto i = 0; i < key_num; ++i) {
+      if (*(key_buff + i) % bucket_num % partition_num != partition_id) {
+        LOG(INFO) << "skip EV key:" << *(key_buff + i);
+        continue;
+      }
+      key_import.emplace_back(*(key_buff + i));
+      auto row_offset = value_buff + i * value_len;
+      for (int j = 0; j < value_len; j++) {
+        value_import.emplace_back(*(row_offset + j));
+      }
+    }
+    GPUHashMapKV<K, V>* gpu_kv =
+        dynamic_cast<GPUHashMapKV<K, V>*>(SingleTierStorage<K, V>::kv_);
+    gpu_kv->Import(key_import, value_import, device, emb_config);
+    return Status::OK();
+  }
+
   void SetTotalDims(int64 total_dims) override {}
 };
 
@@ -512,8 +519,9 @@ class HbmStorageWithCpuKv: public SingleTierStorage<K, V> {
     } while (!(SingleTierStorage<K, V>::kv_->Lookup(key, &value_ptr)).ok());
   }
 
-  void Insert(K key, ValuePtr<V>** value_ptr, size_t alloc_len) override {
-    SingleTierStorage<K, V>::Insert(key, value_ptr, alloc_len);
+  void Insert(K key, ValuePtr<V>** value_ptr,
+      size_t alloc_len, bool to_dram = false) override {
+    SingleTierStorage<K, V>::Insert(key, value_ptr, alloc_len, to_dram);
   }
 
   Status TryInsert(K key, ValuePtr<V>* value_ptr) {
diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h
index 7a7deaae483..604041f49c8 100644
--- a/tensorflow/core/framework/embedding/storage.h
+++ b/tensorflow/core/framework/embedding/storage.h
@@ -17,11 +17,15 @@ limitations under the License.
 
 #include "tensorflow/core/framework/embedding/cache.h"
 #include "tensorflow/core/framework/embedding/config.pb.h"
+#include "tensorflow/core/framework/embedding/embedding_memory_pool.h"
+#include "tensorflow/core/framework/embedding/embedding_var_restore.h"
+#include "tensorflow/core/framework/embedding/filter_policy.h"
 #include "tensorflow/core/framework/embedding/kv_interface.h"
 #include "tensorflow/core/framework/embedding/shrink_policy.h"
 #include "tensorflow/core/framework/embedding/storage_config.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/framework/embedding/embedding_memory_pool.h"
+
+#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
 #include "tensorflow/core/util/work_sharder.h"
 #include "tensorflow/core/framework/device_base.h"
 #if GOOGLE_CUDA
@@ -33,7 +37,8 @@ namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 using GPUDevice = Eigen::GpuDevice;
 
-const int kSavedPartitionNum = 1000;
+template <class K, class V>
+class CheckpointLoader;
 
 template <class V>
 class ValuePtr;
@@ -44,9 +49,6 @@ class EmbeddingVar;
 template <class K, class V>
 struct SsdRecordDescriptor;
 
-template <class K, class V, class EV>
-class FilterPolicy;
-
 template <class K, class V>
 class GPUHashTable;
 
@@ -56,6 +58,7 @@ namespace embedding {
 
 template <class K, class V>
 class Storage {
+  friend class CheckpointLoader<K, V>;
  public:
   explicit Storage(const StorageConfig& storage_config)
       : storage_config_(storage_config) {}
@@ -79,9 +82,8 @@ class Storage {
       std::vector<std::list<int64>>& not_found_cursor_list) {}
 #endif //GOOGLE_CUDA
   virtual Status Contains(K key) = 0;
-  virtual void Insert(K key, ValuePtr<V>** value_ptr, size_t alloc_len) = 0;
-  virtual void InsertToDram(K key, ValuePtr<V>** value_ptr,
-      int64 alloc_len) = 0;
+  virtual void Insert(K key, ValuePtr<V>** value_ptr,
+      size_t alloc_len, bool to_dram = false) = 0;
   virtual void Insert(K key, ValuePtr<V>* value_ptr) = 0;
   virtual void SetAllocLen(int64 value_len, int slot_num) = 0;
   virtual void SetValueLen(int64 value_len) {}
@@ -110,12 +112,6 @@ class Storage {
       const EmbeddingConfig& emb_config,
       SsdRecordDescriptor<K, V>* ssd_rec_desc) = 0;
   virtual embedding::Iterator* GetIterator() = 0;
-  virtual void RestoreSsdHashmap(
-      K* key_list, int64* key_file_id_list,
-      int64* key_offset_list, int64 num_of_keys,
-      int64* file_list, int64* invalid_record_count_list,
-      int64* record_count_list, int64 num_of_files,
-      const std::string& ssd_emb_file_name) = 0;
 
   virtual Status Shrink(const ShrinkArgs& shrink_args) = 0;
   virtual Status BatchCommit(const std::vector<K>& keys,
@@ -139,9 +135,6 @@ class Storage {
       const Eigen::GpuDevice& device) {}
   virtual void BatchLookup(const Eigen::GpuDevice& device, const K* keys,
       V* val, size_t n, const V* default_v) {}
-  virtual void ImportToHbm(const std::vector<K>& keys,
-      const std::vector<V>& values, const Eigen::GpuDevice* device,
-      const EmbeddingConfig& emb_config) {};
   virtual GPUHashTable<K, V>* HashTable() {
     return nullptr;
   }
@@ -152,7 +145,7 @@ class Storage {
   virtual bool IsMultiLevel() = 0;
   virtual bool IsUseHbm() = 0;
   virtual bool IsSingleHbm() = 0;
-  virtual bool IsUsePersistentStorage() = 0;
+  virtual bool IsUsePersistentStorage() { return false; }
   virtual void iterator_mutex_lock() = 0;
   virtual void iterator_mutex_unlock() = 0;
   virtual void Schedule(std::function<void()> fn) = 0;
@@ -164,8 +157,6 @@ class Storage {
       const std::vector<ValuePtr<V>*>& value_ptr_list) = 0;
   virtual void AllocateMemoryForNewFeatures(
       ValuePtr<V>** value_ptr_list, int64 num_of_value_ptrs) = 0;
-  virtual void ImportToHbm(K* ids, int64 size, int64 value_len,
-      int64 emb_index) = 0;
 
   inline mutex* get_mutex() { return &mu_; }
   inline int64 GetAllocLen() { return alloc_len_; }
@@ -210,6 +201,78 @@ class Storage {
 
   virtual void AddToCachePrefetchList(const Tensor& indices) {}
   virtual void AddToCache(const Tensor& indices) {}
+
+  virtual void Restore(const std::string& name_string,
+      const std::string& file_name_string, int64 partition_id,
+      int64 partition_num, int64 value_len, bool is_incr,
+      bool reset_version, const EmbeddingConfig& emb_config,
+      const Eigen::GpuDevice* device, BundleReader* reader,
+      EmbeddingVar<K, V>* ev,
+      FilterPolicy<K, V, EmbeddingVar<K, V>>* filter) {
+    CheckpointLoader<K, V> restorer(reinterpret_cast<Storage<K, V>*>(this), ev,
+                                    filter, name_string, file_name_string,
+                                    partition_id, partition_num, is_incr,
+                                    reset_version, reader);
+    restorer.RestoreCkpt(emb_config, device);
+  }
+
+ protected:
+  virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id,
+                                 int64 partition_num, int64 value_len, bool is_filter,
+                                 bool is_incr, const EmbeddingConfig& emb_config,
+                                 const Eigen::GpuDevice* device,
+                                 FilterPolicy<K, V, EmbeddingVar<K, V>>* filter,
+                                 RestoreBuffer& restore_buff) {
+    return Status::OK();
+  }
+
+  virtual Status RestoreSSD(int64 emb_index, int64 emb_slot_num,
+                            int64 value_len,
+                            const std::string& ssd_emb_file_name,
+                            EmbeddingVar<K, V>* ev,
+                            RestoreSSDBuffer<K>& restore_buff) {
+    int64 alloc_len = Storage<K, V>::ComputeAllocLen(value_len);
+    auto* alloc = ev->GetAllocator();
+    for (int64 i = 0; i < restore_buff.num_of_keys; i++) {
+      ValuePtr<V>* value_ptr = nullptr;
+      ev->LookupOrCreateKey(restore_buff.key_list_buf[i], &value_ptr);
+      value_ptr->SetInitialized(emb_index);
+      int64 file_id = restore_buff.key_file_id_list_buf[i];
+      int64 key_offset = restore_buff.key_offset_list_buf[i];
+      // Read data from embedding files on SSD. Data are stored in
+      // NormalContiguousValuePtr temporarily.
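+      // The file is mapped from offset 0, so the mapped length must cover
+      // key_offset plus one full record (the FixedLengthHeader and the
+      // embeddings of all slots).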
+      std::stringstream ss;
+      ss << ssd_emb_file_name << "/" << file_id << ".emb";
+      int fd = open(ss.str().data(), O_RDONLY);
+      char* file_addr = (char*)mmap(nullptr,
+          sizeof(FixedLengthHeader) +
+              alloc_len * sizeof(V) * (emb_slot_num + 1) +
+              key_offset,
+          PROT_READ, MAP_PRIVATE, fd, 0);
+
+      NormalContiguousValuePtr<V> tmp_value_ptr(alloc,
+          alloc_len * (emb_slot_num + 1));
+      void* ptr = tmp_value_ptr.GetPtr();
+      memcpy(ptr, file_addr + key_offset,
+          sizeof(FixedLengthHeader) +
+              alloc_len * sizeof(V) * (emb_slot_num + 1));
+      munmap(file_addr,
+          sizeof(FixedLengthHeader) +
+              alloc_len * sizeof(V) * (emb_slot_num + 1) +
+              key_offset);
+      close(fd);
+      // Copy data to the ValuePtr; the data of all slots is set by the
+      // primary here.
+      for (int j = 0; j < emb_slot_num + 1; j++) {
+        V* value = tmp_value_ptr.GetValue(j, alloc_len * j);
+        if (value != nullptr) {
+          value_ptr->GetOrAllocate(alloc, value_len, value, j, alloc_len * j);
+        }
+      }
+      value_ptr->SetFreq(tmp_value_ptr.GetFreq());
+      value_ptr->SetStep(tmp_value_ptr.GetStep());
+    }
+    return Status::OK();
+  }
 
 protected:
   int64 alloc_len_ = 0;
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 7d494726886..647d730bbe0 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2929,7 +2929,9 @@ cuda_library(
     }),
     visibility = ["//visibility:public"],
     deps = [
+        ":save_restore_tensor",
         "//tensorflow/core:stream_executor",
+        "//tensorflow/core/util/tensor_bundle",
         "//tensorflow/core/kernels:gpu_device_array"] +
         ["//third_party/eigen3"]
 )
@@ -5432,6 +5434,8 @@ tf_kernel_library(
         "@sparsehash_c11//:dense_hash_map",
         "@libcuckoo//:libcuckoo",
         ":unique_ali_op",
+        ":save_restore_tensor",
+        "//tensorflow/core/util/tensor_bundle",
         "@com_github_google_leveldb//:leveldb",] + DYNAMIC_DEPS + mkl_deps() +
         if_cuda(["@cub_archive//:cub",
         ":fused_embedding_common_cuh",
diff --git a/tensorflow/core/kernels/embedding_variable_ops_test.cc b/tensorflow/core/kernels/embedding_variable_ops_test.cc
index 0159ccbc25d..408a2bfd16c 100644
--- a/tensorflow/core/kernels/embedding_variable_ops_test.cc
+++ b/tensorflow/core/kernels/embedding_variable_ops_test.cc
@@ -1357,21 +1357,48 @@ TEST(EmbeddingVariableTest, TestCacheRestore) {
       storage, emb_config, cpu_allocator());
   variable->Init(value, 1);
   variable->InitCache(CacheStrategy::LFU);
-  RestoreBuffer buf;
-  buf.key_buffer = new char[6 * sizeof(int64)];
-  buf.version_buffer = new char[6 * sizeof(int64)];
-  buf.freq_buffer = new char[6 * sizeof(int64)];
-  buf.value_buffer = new char[24 * sizeof(float)];
-  for (int i = 1; i < 7; i++) {
-    ((int64*)buf.key_buffer)[i-1] = i;
-    ((int64*)buf.version_buffer)[i-1] = 1;
-    ((int64*)buf.freq_buffer)[i-1] = i * 10;
-  }
-  variable->Import(buf, 6, 1, 0, 1, false, nullptr);
-
-  ASSERT_EQ(variable->storage()->Size(0), 4);
-  ASSERT_EQ(variable->storage()->Size(1), 2);
-  delete storage;
+
+  Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1}));
+
+  int64 ev_size = 7;
+  int64 cache_size = 3;
+  for (int64 i = 1; i < cache_size; i++) {
+    ValuePtr<float>* value_ptr = nullptr;
+    variable->LookupOrCreateKey(i, &value_ptr);
+    typename TTypes<float>::Flat vflat = variable->flat(value_ptr, i);
+    value_ptr->AddFreq(2);
+  }
+  for (int64 i = cache_size; i < ev_size; i++) {
+    ValuePtr<float>* value_ptr = nullptr;
+    variable->LookupOrCreateKey(i, &value_ptr);
+    typename TTypes<float>::Flat vflat = variable->flat(value_ptr, i);
+    value_ptr->AddFreq(1);
+  }
+
+  LOG(INFO) << "size:" << variable->Size();
+
+  BundleWriter writer(Env::Default(), Prefix("foo"));
+  DumpEmbeddingValues(variable, "var/part_0", &writer, &part_offset_tensor);
+  TF_ASSERT_OK(writer.Finish());
+
+  auto imported_storage = embedding::StorageFactory::Create<int64, float>(
+      embedding::StorageConfig(embedding::DRAM_SSDHASH,
+                               testing::TmpDir(),
+                               size, "normal_contiguous",
+                               emb_config),
+      cpu_allocator(),
+      "EmbeddingVar1");
+  auto imported_variable = new EmbeddingVar<int64, float>("EmbeddingVar1",
+      imported_storage, emb_config, cpu_allocator());
+  imported_variable->Init(value, 1);
+  imported_variable->InitCache(CacheStrategy::LFU);
+
+  BundleReader reader(Env::Default(), Prefix("foo"));
+  std::string name_string("var");
+  imported_variable->Restore(name_string, Prefix("foo"), 0, 1, false, &reader, false);
+
+  ASSERT_EQ(imported_storage->Size(0), ev_size - cache_size);
+  ASSERT_EQ(imported_storage->Size(1), 2);
 }
 
 void t1_gpu(KVInterface<int64, float>* hashmap) {
diff --git a/tensorflow/core/kernels/kv_variable_ops.h b/tensorflow/core/kernels/kv_variable_ops.h
index a17a75d8124..b6b29acbedc 100644
--- a/tensorflow/core/kernels/kv_variable_ops.h
+++ b/tensorflow/core/kernels/kv_variable_ops.h
@@ -434,7 +434,6 @@ Status DumpEmbeddingValues(EmbeddingVar<K, V>* ev,
       }
     }
   }
-  // LOG(INFO) << "EV:" << tensor_key << ", key_list_parts:" << key_list_parts.size();
 
   auto part_offset_flat = part_offset_tensor->flat<int32>();
   part_offset_flat(0) = 0;
@@ -588,1029 +587,6 @@ Status DumpEmbeddingValues(EmbeddingVar<K, V>* ev,
   return Status::OK();
 }
 
-namespace {
-const static string part_str = "part_";
-}
-
-template <typename K, typename V>
-Status DynamicRestoreValue(EmbeddingVar<K, V>* ev, BundleReader* reader,
-    std::string name_string, int orig_partnum, const GPUDevice* device,
-    int64 partition_id = 0, int64 partition_num = 1, bool reset_version = false) {
-  string curr_partid_str = std::to_string(partition_id);
-  bool filter_flag = true;
-  embedding::BatchCache<K>* cache_for_restore_hbm = nullptr;
-  if (ev->IsMultiLevel() && ev->IsUseHbm()) {
-    auto cache_strategy = ev->storage()->CacheStrategy();
-    cache_for_restore_hbm = embedding::CacheFactory::Create<K>(
-        cache_strategy, "hbm_restore_cache for " + name_string);
-  }
-  for (int i = 0; i < orig_partnum; i++) {
-    string part_id = std::to_string(i);
-    string pre_subname =
-        name_string.substr(0, name_string.find("part_"));
-    string post_subname =
-        name_string.substr(name_string.find("part_")
-            + part_str.size() + curr_partid_str.size());
-    string tensor_name =
-        pre_subname + part_str + part_id + post_subname;
-
-    string tensor_key = tensor_name + "-keys";
-    string tensor_value = tensor_name + "-values";
-    string tensor_version = tensor_name + "-versions";
-    string tensor_freq = tensor_name + "-freqs";
-
-    TensorShape key_shape, value_shape, version_shape, freq_shape;
-    Status st = reader->LookupTensorShape(tensor_key, &key_shape);
-    if (!st.ok()) {
-      return st;
-    }
-    st = reader->LookupTensorShape(tensor_value, &value_shape);
-    if (!st.ok()) {
-      return st;
-    }
-    if (!reset_version) {
-      st = reader->LookupTensorShape(tensor_version, &version_shape);
-      if (!st.ok()) {
-        return st;
-      }
-    }
-
-    st = reader->LookupTensorShape(tensor_freq, &freq_shape);
-    if (!st.ok()) {
-      if (st.code() == error::NOT_FOUND) {
-        freq_shape = version_shape;
-      }else {
-        return st;
-      }
-    }
-
-    st = reader->LookupHeader(tensor_key, sizeof(K) * key_shape.dim_size(0));
-    if (!st.ok()) {
-      return st;
-    }
-    st = reader->LookupHeader(tensor_value,
-        sizeof(V) * value_shape.dim_size(0) * value_shape.dim_size(1));
-    if (!st.ok()) {
-      return st;
-    }
-    if (!reset_version) {
-      st = reader->LookupHeader(tensor_version,
-          sizeof(int64) * version_shape.dim_size(0));
-      if (!st.ok()) {
-        return st;
-      }
-    }
-    st = reader->LookupHeader(tensor_freq,
-        sizeof(int64) * freq_shape.dim_size(0));
-    if (!st.ok()) {
-      if (st.code() == error::NOT_FOUND) {
-        filter_flag = false;
-      }else {
-        return st;
-      }
-    }
-
-    size_t buffer_size = 8 << 20;
-    RestoreBuffer restore_buff;
-    restore_buff.key_buffer = new char[buffer_size];
-    restore_buff.value_buffer = new char[buffer_size];
-    restore_buff.version_buffer = new char[buffer_size];
-    restore_buff.freq_buffer = new char[buffer_size];
-    int64 newDim = ev->ValueLen();
-    size_t value_unit_bytes_new = sizeof(V) * newDim;
-    int64 idx = 0;
-    bool restore_customDim;
-    TF_CHECK_OK(ReadBoolFromEnvVar(
-        "TF_EV_RESTORE_CUSTOM_DIM", false, &restore_customDim));
-    size_t key_bytes_read = 0;
-    size_t value_bytes_read = 0;
-    size_t version_bytes_read = 0;
-    size_t freq_bytes_read = 0;
-    int64 tot_key_num = key_shape.dim_size(0);
-    size_t value_unit_bytes = sizeof(V) * value_shape.dim_size(1);
-
-    while(tot_key_num > 0) {
-      size_t read_key_num = std::min(std::min(buffer_size / sizeof(K),
-          buffer_size / value_unit_bytes), buffer_size / sizeof(int64));
-      read_key_num = std::min(read_key_num, buffer_size / value_unit_bytes_new);
-      read_key_num = std::min((int64)read_key_num, tot_key_num);
-      reader->LookupSegment(tensor_key, read_key_num * sizeof(K),
-          restore_buff.key_buffer, key_bytes_read);
-      reader->LookupSegment(tensor_value, read_key_num * value_unit_bytes,
-          restore_buff.value_buffer, value_bytes_read);
-      if (!reset_version) {
-        reader->LookupSegment(tensor_version, read_key_num * sizeof(int64),
-            restore_buff.version_buffer, version_bytes_read);
-        if (version_bytes_read == 0) {
-          memset(restore_buff.version_buffer, -1, sizeof(int64) * read_key_num);
-        }
-      } else {
-        int64 *version_tmp = (int64*)restore_buff.version_buffer;
-        memset(version_tmp, 0, read_key_num * sizeof(int64));
-      }
-      if (filter_flag) {
-        reader->LookupSegment(tensor_freq, (read_key_num + 1)* sizeof(int64),
-            restore_buff.freq_buffer, freq_bytes_read);
-        if (freq_bytes_read == 0) {
-          int64 *freq_tmp = (int64 *)restore_buff.freq_buffer;
-          for (int64 i = 0; i < read_key_num; i++) {
-            freq_tmp[i] = (ev->MinFreq() == 0) ? 1 : ev->MinFreq();
-          }
-        }
-      }else {
-        int64 *freq_tmp = (int64 *)restore_buff.freq_buffer;
-        for (int64 i = 0; i < read_key_num; i++) {
-          freq_tmp[i] = (ev->MinFreq() == 0) ? 1 : ev->MinFreq();
-        }
-      }
-
-      if (key_bytes_read > 0) {
-        read_key_num = key_bytes_read / sizeof(K);
-        VLOG(2) << "repartition, read_key_num:" << read_key_num;
-        if (restore_customDim && value_shape.dim_size(1) != newDim) {
-          VLOG(2) << "restore, read_value_reshape dim: from "
-                  << value_shape.dim_size(1) << " to " << newDim;
-          if (read_key_num * value_unit_bytes != value_bytes_read) {
-            return tensorflow::errors::FailedPrecondition(
-                "Expected read_key_num * value_unit_bytes == value_bytes_read, "
-                "but got read_key_num * value_unit_bytes != value_bytes_read!");
-          }
-
-          std::unique_ptr<char[]> tmp_ptr(new char[buffer_size]);
-          size_t read_once = std::min(value_unit_bytes, value_unit_bytes_new);
-          for (int i = 0; i < read_key_num; ++i) {
-            memcpy(tmp_ptr.get() + i * value_unit_bytes_new,
-                restore_buff.value_buffer + i * value_unit_bytes, read_once);
-            if (value_shape.dim_size(1) >= newDim) continue;
-            auto p = ev->GetDefaultValue(idx);
-            ++idx;
-            memcpy(tmp_ptr.get() + i * value_unit_bytes_new +
-                value_unit_bytes,
-                p + value_unit_bytes,
-                value_unit_bytes_new - value_unit_bytes);
-          }
-          auto tmp = tmp_ptr.release();
-          tmp_ptr.reset(restore_buff.value_buffer);
-          restore_buff.value_buffer = tmp;
-        }
-        st = ev->Import(restore_buff, read_key_num, kSavedPartitionNum,
-            partition_id, partition_num, false, device);
-        if (cache_for_restore_hbm) {
-          cache_for_restore_hbm->update(
-              (K*)restore_buff.key_buffer, read_key_num,
-              (int64*)restore_buff.version_buffer,
-              (int64*)restore_buff.freq_buffer);
-        }
-        if (!st.ok()) {
-          return st;
-        }
-        tot_key_num -= read_key_num;
-      }
-    }
-  }
-  if (cache_for_restore_hbm) {
-    int64 cache_capacity = ev->CacheSize();
-    int64 num_of_hbm_ids =
-        std::min(cache_capacity, (int64)cache_for_restore_hbm->size());
-    K* hbm_ids = new K[num_of_hbm_ids];
-    int64* hbm_freqs = new int64[num_of_hbm_ids];
-    int64* hbm_versions = nullptr;
-    cache_for_restore_hbm->get_cached_ids(
-        hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs);
-    ev->ImportToHbm(hbm_ids, num_of_hbm_ids);
-    ev->storage()->Schedule([ev, hbm_ids, num_of_hbm_ids,
-        hbm_versions, hbm_freqs]() {
-      embedding::BatchCache<K>* cache = ev->Cache();
-      cache->update(hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs);
-      delete[] hbm_ids;
-      delete[] hbm_freqs;
-    });
-    delete cache_for_restore_hbm;
-  }
-  return Status::OK();
-}
-
-template <typename K, typename V>
-Status EVRestoreNoPartition(EmbeddingVar<K, V>* ev, BundleReader* reader,
-    std::string tensor_key, std::string tensor_value,
-    std::string tensor_version, std::string tensor_freq,
-    const GPUDevice* device,
-    bool reset_version=false) {
-  TensorShape key_shape;
-  TensorShape value_shape;
-  TensorShape version_shape;
-  TensorShape freq_shape;
-  TensorShape key_filter_shape;
-  TensorShape version_filter_shape;
-  TensorShape freq_filter_shape;
-
-  Status st;
-  reader->LookupTensorShape(tensor_key, &key_shape);
-  reader->LookupTensorShape(tensor_value, &value_shape);
-  reader->LookupTensorShape(tensor_version, &version_shape);
-  st = reader->LookupTensorShape(tensor_freq, &freq_shape);
-  if (!st.ok()) {
-    if (st.code() == error::NOT_FOUND) {
-      freq_shape = version_shape;
-    }else {
-      return st;
-    }
-  }
-  st = reader->LookupTensorShape(tensor_key + "_filtered", &key_filter_shape);
-  if (!st.ok()) {
-    if (st.code() == error::NOT_FOUND) {
-      key_filter_shape = key_shape;
-    }else {
-      return st;
-    }
-  }
-  st = reader->LookupTensorShape(tensor_version + "_filtered",
-      &version_filter_shape);
-  if (!st.ok()) {
-    if (st.code() == error::NOT_FOUND) {
-      version_filter_shape = version_shape;
-    }else {
-      return st;
-    }
-  }
-  st = reader->LookupTensorShape(tensor_freq + "_filtered",
-      &freq_filter_shape);
-  if (!st.ok()) {
-    if (st.code() == error::NOT_FOUND) {
-      freq_filter_shape = freq_shape;
-    }else {
-      return st;
-    }
-  }
-
-  bool filter_flag = true;
-  bool restore_filter_flag = true;
-  st = reader->LookupHeader(tensor_key,
-      sizeof(K) * key_shape.dim_size(0));
-  if (!st.ok())
-    return st;
-  st = reader->LookupHeader(tensor_value,
-      sizeof(V) * value_shape.dim_size(0) * value_shape.dim_size(1));
-  if (!st.ok())
-    return st;
-  st = reader->LookupHeader(tensor_version,
-      sizeof(int64) * version_shape.dim_size(0));
-  if (!st.ok())
-    return st;
-  st = reader->LookupHeader(tensor_freq,
-      sizeof(int64) * freq_shape.dim_size(0));
-  if (!st.ok()) {
-    if (st.code() == error::NOT_FOUND) {
-      filter_flag = false;
-    }else {
-      return st;
-    }
-  }
-  st = reader->LookupHeader(tensor_key + "_filtered",
-      sizeof(K) * key_filter_shape.dim_size(0));
-  if (!st.ok()){
-    if (st.code() == error::NOT_FOUND){
-      restore_filter_flag=false;
-    } else {
-      return st;
-    }
-  }
-  st = reader->LookupHeader(tensor_version + "_filtered",
-      sizeof(K) * version_filter_shape.dim_size(0));
-  if (!st.ok() && st.code() != error::NOT_FOUND){
-    return st;
-  }
-  st = reader->LookupHeader(tensor_freq + "_filtered",
-      sizeof(K) * freq_filter_shape.dim_size(0));
-  if (!st.ok() && st.code() != error::NOT_FOUND){
-    return st;
-  }
-  embedding::BatchCache<K>* cache_for_restore_hbm = nullptr;
-  if (ev->IsMultiLevel() && ev->IsUseHbm()) {
-    auto cache_strategy = ev->storage()->CacheStrategy();
-    cache_for_restore_hbm = embedding::CacheFactory::Create<K>(
-        cache_strategy, "hbm_restore_cache for " + tensor_key);
-  }
-
-  size_t buffer_size = 8 << 20;
-  RestoreBuffer restore_buff;
-  restore_buff.key_buffer = new char[buffer_size];
-  restore_buff.value_buffer = new char[buffer_size];
-  restore_buff.version_buffer = new char[buffer_size];
-  restore_buff.freq_buffer = new char[buffer_size];
-  int64 newDim = ev->ValueLen();
-  size_t value_unit_bytes_new = sizeof(V) * newDim;
-  int64 idx = 0;
-  bool restore_customDim;
-  TF_CHECK_OK(ReadBoolFromEnvVar(
-      "TF_EV_RESTORE_CUSTOM_DIM", false, &restore_customDim));
-  size_t key_bytes_read = 0;
-  size_t value_bytes_read = 0;
-  size_t version_bytes_read = 0;
-  size_t freq_bytes_read = 0;
-  size_t key_filter_bytes_read = 0;
-  size_t version_filter_bytes_read = 0;
-  size_t freq_filter_bytes_read = 0;
-
-  int64 tot_key_num = key_shape.dim_size(0);
-  size_t value_unit_bytes = sizeof(V) * value_shape.dim_size(1);
-  std::string key_str = "|";
-  while(tot_key_num > 0) {
-    size_t read_key_num = std::min(
-        std::min(buffer_size / sizeof(K),
-            buffer_size / value_unit_bytes), buffer_size / sizeof(int64));
-    read_key_num = std::min(read_key_num, buffer_size / value_unit_bytes_new);
-    read_key_num = std::min((int64)read_key_num, tot_key_num);
-    reader->LookupSegment(tensor_key, read_key_num * sizeof(K),
-        restore_buff.key_buffer, key_bytes_read);
-    reader->LookupSegment(tensor_value, read_key_num * value_unit_bytes,
-        restore_buff.value_buffer, value_bytes_read);
-    if (!reset_version) {
-      reader->LookupSegment(tensor_version, read_key_num * sizeof(int64),
-          restore_buff.version_buffer, version_bytes_read);
-      if (version_bytes_read == 0) {
-        memset(restore_buff.version_buffer, -1, sizeof(int64) * read_key_num);
-      }
-    } else {
-      int64 *version_tmp = (int64*)restore_buff.version_buffer;
-      memset(version_tmp, 0, read_key_num * sizeof(int64));
-    }
-    if (filter_flag) {
-      reader->LookupSegment(tensor_freq, read_key_num * sizeof(int64),
-          restore_buff.freq_buffer, freq_bytes_read);
-      if (freq_bytes_read == 0) {
-        int64 *freq_tmp = (int64 *)restore_buff.freq_buffer;
-        for (int64 i = 0; i < read_key_num; i++) {
-          freq_tmp[i] = (ev->MinFreq() == 0) ? 1 : ev->MinFreq();
-        }
-      }
-    } else {
-      int64 *freq_tmp = (int64 *)restore_buff.freq_buffer;
-      for (int64 i = 0; i < read_key_num; i++) {
-        freq_tmp[i] = (ev->MinFreq() == 0) ? 1 : ev->MinFreq();
-      }
-    }
-    if (key_bytes_read > 0) {
-      read_key_num = key_bytes_read / sizeof(K);
-      VLOG(2) << "restore, read_key_num:" << read_key_num;
-
-      if (restore_customDim && value_shape.dim_size(1) != newDim) {
-        VLOG(2) << "restore, read_value_reshape dim: from "
-                << value_shape.dim_size(1) << " to " << newDim;
-        if (read_key_num * value_unit_bytes != value_bytes_read) {
-          return tensorflow::errors::FailedPrecondition(
-              "Expected read_key_num * value_unit_bytes == value_bytes_read, "
-              "but got read_key_num * value_unit_bytes != value_bytes_read!");
-        }
-
-        std::unique_ptr<char[]> tmp_ptr(new char[buffer_size]);
-        size_t read_once = std::min(value_unit_bytes, value_unit_bytes_new);
-        for (int i = 0; i < read_key_num; ++i) {
-          memcpy(tmp_ptr.get() + i * value_unit_bytes_new,
-              restore_buff.value_buffer + i * value_unit_bytes, read_once);
-          if (value_shape.dim_size(1) >= newDim) continue;
-          auto p = ev->GetDefaultValue(idx);
-          ++idx;
-          memcpy(tmp_ptr.get() + i * value_unit_bytes_new +
-              value_unit_bytes,
-              p + value_unit_bytes, value_unit_bytes_new - value_unit_bytes);
-        }
-        auto tmp = tmp_ptr.release();
-        tmp_ptr.reset(restore_buff.value_buffer);
-        restore_buff.value_buffer = tmp;
-      }
-      st = ev->Import(restore_buff, read_key_num, 1, 0, 1, false, device);
-      if (cache_for_restore_hbm) {
-        cache_for_restore_hbm->update(
-            (K*)restore_buff.key_buffer, read_key_num,
-            (int64*)restore_buff.version_buffer,
-            (int64*)restore_buff.freq_buffer);
-      }
-      if (!st.ok())
-        return st;
-      tot_key_num -= read_key_num;
-    }
-  }
-
-  if (restore_filter_flag) {
-    int64 tot_key_filter_num = key_filter_shape.dim_size(0);
-    while (tot_key_filter_num > 0) {
-      size_t read_key_num = std::min(buffer_size / sizeof(K),
-          buffer_size / sizeof(int64));
-      read_key_num = std::min((int64)read_key_num, tot_key_filter_num);
-      reader->LookupSegment(tensor_key + "_filtered",
-          read_key_num * sizeof(K), restore_buff.key_buffer,
-          key_filter_bytes_read);
-      if (!reset_version) {
-        reader->LookupSegment(tensor_version + "_filtered",
-            read_key_num * sizeof(int64), restore_buff.version_buffer,
-            version_filter_bytes_read);
-      } else {
-        int64 *version_tmp = (int64*)restore_buff.version_buffer;
-        memset(version_tmp, 0, read_key_num * sizeof(int64));
-      }
-      reader->LookupSegment(tensor_freq + "_filtered",
-          read_key_num * sizeof(int64), restore_buff.freq_buffer,
-          freq_filter_bytes_read);
-      if (key_filter_bytes_read > 0) {
-        read_key_num = key_filter_bytes_read / sizeof(K);
-        VLOG(2) << "restore, read_key_num:" << read_key_num;
-
-        st = ev->Import(restore_buff, read_key_num, 1, 0, 1, true, device);
-        if (cache_for_restore_hbm) {
-          cache_for_restore_hbm->update(
-              (K*)restore_buff.key_buffer, read_key_num,
-              (int64*)restore_buff.version_buffer,
-              (int64*)restore_buff.freq_buffer);
-        }
-        if (!st.ok())
-          return st;
-        tot_key_filter_num -= read_key_num;
-      }
-    }
-  }
-
-  if (cache_for_restore_hbm) {
-    int64 cache_capacity = ev->CacheSize();
-    int64 num_of_hbm_ids =
-        std::min(cache_capacity, (int64)cache_for_restore_hbm->size());
-    K* hbm_ids = new K[num_of_hbm_ids];
-    int64* hbm_freqs = new int64[num_of_hbm_ids];
-    int64* hbm_versions = nullptr;
-    cache_for_restore_hbm->get_cached_ids(
-        hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs);
-    ev->ImportToHbm(hbm_ids, num_of_hbm_ids);
-    ev->storage()->Schedule([ev, hbm_ids, num_of_hbm_ids,
-        hbm_versions, hbm_freqs]() {
-      embedding::BatchCache<K>* cache = ev->Cache();
-      cache->update(hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs);
-      delete[] hbm_ids;
-      delete[] hbm_freqs;
-    });
-    delete cache_for_restore_hbm;
-  }
-
-  return Status::OK();
-}
-
-inline bool IsOldCheckpoint(const std::string& name_string,
-    const std::string& curr_partid_str, BundleReader* reader,
-    const std::string& part_offset_tensor_suffix) {
-  // then check whether checkpoint is in old form
-  bool is_oldform = false;
-
-  string part_id = std::to_string(0);
-  string pre_subname =
-      name_string.substr(0, name_string.find(part_str));
-  string post_subname = name_string.substr(
-      name_string.find(part_str) + part_str.size() + curr_partid_str.size());
-  string tensor_name = pre_subname + part_str + part_id + post_subname;
-
-  TensorShape part_offset_shape;
-  DataType part_offset_type;
-  Status form_st = reader->LookupDtypeAndShape(
-      tensor_name + part_offset_tensor_suffix,
-      &part_offset_type, &part_offset_shape);
-  if (!form_st.ok()) {
-    is_oldform = true;
-  }
-  return is_oldform;
-}
-
-template <typename K, typename V>
-Status EVRestoreOldFromCheckpoint(EmbeddingVar<K, V>* ev,
-    const std::string& name_string, const std::string& curr_partid_str,
-    const std::string& key_suffix, int partition_id,
-    BundleReader* reader, int partition_num,
-    const GPUDevice* device, bool reset_version=false) {
-  // first get original partition number
-  int orig_partnum = 0;
-  for (; ; orig_partnum++) {
-    string part_id = std::to_string(orig_partnum);
-    string pre_subname = name_string.substr(0, name_string.find(part_str));
-    string post_subname = name_string.substr(name_string.find(part_str)
-        + part_str.size() + curr_partid_str.size());
-    string tensor_name = pre_subname + part_str + part_id + post_subname;
-
-    string tensor_key = tensor_name + key_suffix;
-    TensorShape key_shape;
-    Status st = reader->LookupTensorShape(tensor_key, &key_shape);
-    if (!st.ok()) {
-      break;
-    }
-  }
-
-  VLOG(1) << "old form, EV name:" << name_string
-          << ", partition_id:" << curr_partid_str
-          << ", old partition_num:" << orig_partnum
-          << ", new partition num:" << partition_num;
-  Status s = DynamicRestoreValue(ev, reader, name_string,
-      orig_partnum, device, partition_id, partition_num, reset_version);
-  if (!s.ok()) {
-    LOG(FATAL) << "EV restoring fail:" << s.ToString();
-  }
-  return s;
-}
-
-template <typename K, typename V>
-Status EVRestoreDynamically(EmbeddingVar<K, V>* ev,
-    const std::string& name_string, int partition_id,
-    int partition_num, OpKernelContext* context,
-    BundleReader* reader, const std::string& part_offset_tensor_suffix,
-    const std::string& key_suffix, const std::string& value_suffix,
-    const std::string& version_suffix, const std::string& freq_suffix,
-    bool reset_version = false, const Eigen::GpuDevice* device = nullptr) {
-
-  // first check whether there is partition
-  if (name_string.find(part_str) == std::string::npos) {
-    Status s = EVRestoreNoPartition(
-        ev, reader, name_string + key_suffix,
-        name_string + value_suffix, name_string + version_suffix,
-        name_string + freq_suffix, device, reset_version);
-    if (!s.ok()) {
-      LOG(FATAL) << "EV restoring fail:" << s.ToString();
-    }
-    return s;
-  }
-
-  const string& curr_partid_str = std::to_string(partition_id);
-  auto is_oldform = IsOldCheckpoint(name_string, curr_partid_str,
-      reader, part_offset_tensor_suffix);
-
-  if (is_oldform) {
-    EVRestoreOldFromCheckpoint(ev, name_string, curr_partid_str, key_suffix,
-        partition_id, reader, partition_num, device, reset_version);
-  } else {
-    // first find out which sub parts we should load
-    bool filter_flag = true;
-    bool restore_filter_flag = true;
-    std::vector<int64> loaded_parts;
-    for (int i = 0; i < kSavedPartitionNum; i++) {
-      if (i % partition_num == partition_id) {
-        loaded_parts.push_back(i);
-      }
-    }
-
-    // then we use primary partition number to compose with
-    // sub partition number
-    VLOG(1) << "new form:" << name_string
-            << ", partition_id:" << partition_id
-            << ", partition_num:" << partition_num;
-
-    embedding::BatchCache<K>* cache_for_restore_hbm = nullptr;
-    if (ev->IsMultiLevel() && ev->IsUseHbm()) {
-      auto cache_strategy = ev->storage()->CacheStrategy();
-      cache_for_restore_hbm = embedding::CacheFactory::Create<K>(
-          cache_strategy, "hbm_restore_cache for " + name_string);
-    }
-
-    int orig_partnum = 0;
-    size_t buffer_size = 8 << 20;
-    RestoreBuffer restore_buff;
-    restore_buff.key_buffer = new char[buffer_size];
-    restore_buff.value_buffer = new char[buffer_size];
-    restore_buff.version_buffer = new char[buffer_size];
-    restore_buff.freq_buffer = new char[buffer_size];
-    int64 newDim = ev->ValueLen();
-    size_t value_unit_bytes_new = sizeof(V) * newDim;
-    int64 idx = 0;
-    bool restore_customDim;
-    TF_CHECK_OK(ReadBoolFromEnvVar(
-        "TF_EV_RESTORE_CUSTOM_DIM", false, &restore_customDim));
-    for (; ; orig_partnum++) {
-      string part_id = std::to_string(orig_partnum);
-      string pre_subname = name_string.substr(0, name_string.find(part_str));
-      string post_subname = name_string.substr(name_string.find(part_str)
-          + part_str.size() + curr_partid_str.size());
-      string tensor_name = pre_subname + part_str + part_id + post_subname;
-
-      // first check whether is old ckpt form
-      string tensor_key = tensor_name + key_suffix;
-      string tensor_value = tensor_name + value_suffix;
-      string tensor_version = tensor_name + version_suffix;
-      string tensor_freq = tensor_name + freq_suffix;
-      TensorShape key_shape, value_shape, version_shape, freq_shape;
-      TensorShape key_filter_shape, version_filter_shape, freq_filter_shape;
-      Status st = reader->LookupTensorShape(tensor_key, &key_shape);
-      if (!st.ok()) {
-        VLOG(1) << "ev part " << tensor_key
-                << " not exist, reach the end of restoring";
-        break;
-      }
-      st = reader->LookupTensorShape(tensor_value, &value_shape);
-      if (!st.ok()) {
-        break;
-      }
-      st = reader->LookupTensorShape(tensor_version, &version_shape);
-      if (!st.ok()) {
-        break;
-      }
-      st = reader->LookupTensorShape(tensor_freq, &freq_shape);
-      if (!st.ok()) {
-        if (st.code() == error::NOT_FOUND) {
-          freq_shape = version_shape;
-        } else {
-          return st;
-        }
-      }
-      st = reader->LookupTensorShape(tensor_key + "_filtered",
-          &key_filter_shape);
-      if (!st.ok()) {
-        if (st.code() == error::NOT_FOUND) {
-          key_filter_shape = key_shape;
-        } else {
-          return st;
-        }
-      }
-      st = reader->LookupTensorShape(tensor_version + "_filtered",
-          &version_filter_shape);
-      if (!st.ok()) {
-        if (st.code() == error::NOT_FOUND) {
-          version_filter_shape = version_shape;
-        } else {
-          return st;
-        }
-      }
-      st = reader->LookupTensorShape(tensor_freq + "_filtered",
-          &freq_filter_shape);
-      if (!st.ok()) {
-        if (st.code() == error::NOT_FOUND) {
-          freq_filter_shape = freq_shape;
-        }else {
-          return st;
-        }
-      }
-
-      reader->LookupHeader(tensor_key, sizeof(K) * key_shape.dim_size(0));
-      if (!st.ok()) {
-        break;
-      }
-      st = reader->LookupHeader(tensor_value,
-          sizeof(V) * value_shape.dim_size(0) * value_shape.dim_size(1));
-      if (!st.ok()) {
-        break;
-      }
-      st = reader->LookupHeader(tensor_version,
-          sizeof(int64) * version_shape.dim_size(0));
-      if (!st.ok()) {
-        break;
-      }
-      st = reader->LookupHeader(tensor_freq,
-          sizeof(int64) * freq_shape.dim_size(0));
-      if (!st.ok()) {
-        if (st.code() == error::NOT_FOUND) {
-          filter_flag = false;
-        }else {
-          return st;
-        }
-      }
-      st = reader->LookupHeader(tensor_key + "_filtered",
-          sizeof(K) * key_filter_shape.dim_size(0));
-      if (!st.ok()){
-        if (st.code() == error::NOT_FOUND){
-          restore_filter_flag=false;
-        }else {
-          return st;
-        }
-      }
-      st = reader->LookupHeader(tensor_version + "_filtered",
-          sizeof(K) * version_filter_shape.dim_size(0));
-      if (!st.ok() && st.code() != error::NOT_FOUND){
-        return st;
-      }
-      st = reader->LookupHeader(tensor_freq + "_filtered",
-          sizeof(K) * freq_filter_shape.dim_size(0));
-      if (!st.ok() && st.code() != error::NOT_FOUND){
-        return st;
-      }
-
-      TensorShape part_offset_shape, part_filter_offset_shape;
-      DataType part_offset_type, part_filter_offset_type;
-      string offset_tensor_name = tensor_name + part_offset_tensor_suffix;
-      string offset_filter_tensor_name =
-          tensor_name + "-partition_filter_offset";
-      st = reader->LookupDtypeAndShape(offset_tensor_name,
-          &part_offset_type, &part_offset_shape);
-      if (!st.ok()) {
-        LOG(FATAL) << "EV restoring fail:" << st.ToString();
-      }
-      st = reader->LookupDtypeAndShape(offset_filter_tensor_name,
-          &part_filter_offset_type, &part_filter_offset_shape);
-      if (!st.ok()) {
-        LOG(FATAL) << "EV restoring fail:" << st.ToString();
-      }
-      Tensor part_offset_tensor(cpu_allocator(),
-          part_offset_type, part_offset_shape);
-      if (!st.ok()) {
-        LOG(FATAL) << "EV restoring fail:" << st.ToString();
-      }
-      Tensor part_filter_offset_tensor(cpu_allocator(),
-          part_offset_type, part_offset_shape);
-      if (!st.ok()) {
-        LOG(FATAL) << "EV restoring fail:" << st.ToString();
-      }
-      st = reader->Lookup(offset_tensor_name, &part_offset_tensor);
-      if (!st.ok()) {
-        LOG(FATAL) << "EV restoring fail:" << st.ToString();
-      }
-      auto part_offset_flat = part_offset_tensor.flat<int32>();
-      st = reader->Lookup(offset_filter_tensor_name, &part_filter_offset_tensor);
-      if (!st.ok()) {
-        LOG(FATAL) << "EV restoring fail:" << st.ToString();
-      }
-      auto part_filter_offset_flat = part_filter_offset_tensor.flat<int32>();
-
-      for (size_t i = 0; i < loaded_parts.size(); i++) {
-        int subpart_id = loaded_parts[i];
-        int subpart_offset = part_offset_flat(subpart_id);
-
-        size_t value_unit_bytes = sizeof(V) * value_shape.dim_size(1);
-        int64 tot_key_num = part_offset_flat(subpart_id + 1) - subpart_offset;
-        int64 key_part_offset = subpart_offset * sizeof(K);
-        int64 value_part_offset = subpart_offset * value_unit_bytes;
-        int64 version_part_offset = subpart_offset * sizeof(int64);
-        int64 freq_part_offset = subpart_offset * sizeof(int64);
-
-        VLOG(1) << "dynamically load ev : " << name_string
-                << ", subpartid:" << loaded_parts[i]
-                << ", subpart_offset:" << subpart_offset
-                << ", partition_id:" << partition_id
-                << ", partition_num:" << partition_num
-                << ", keynum:" << tot_key_num;
-
-        int64 tot_key_bytes_read(0);
-        int64 tot_value_bytes_read(0);
-        int64 tot_version_bytes_read(0);
-        int64 tot_freq_bytes_read(0);
-        size_t key_bytes_read = 0;
-        size_t value_bytes_read = 0;
-        size_t version_bytes_read = 0;
-        size_t freq_bytes_read = 0;
-        while(tot_key_num > 0) {
-          size_t read_key_num = std::min(std::min(buffer_size / sizeof(K),
-              buffer_size / value_unit_bytes), buffer_size / sizeof(int64));
-          read_key_num = std::min(read_key_num, buffer_size / value_unit_bytes_new);
-          read_key_num = std::min((int64)read_key_num, tot_key_num);
-          reader->LookupSegmentOffset(tensor_key,
-              key_part_offset + tot_key_bytes_read, read_key_num * sizeof(K),
-              restore_buff.key_buffer, key_bytes_read);
-
-          reader->LookupSegmentOffset(tensor_value,
-              value_part_offset + tot_value_bytes_read,
-              read_key_num * value_unit_bytes, restore_buff.value_buffer,
-              value_bytes_read);
-          if (!reset_version) {
-            reader->LookupSegmentOffset(tensor_version,
-                version_part_offset + tot_version_bytes_read,
-                read_key_num * sizeof(int64), restore_buff.version_buffer,
-                version_bytes_read);
-            if (version_bytes_read == 0) {
-              memset(restore_buff.version_buffer, -1, sizeof(int64) * read_key_num);
-            }
-          } else {
-            int64 *version_tmp = (int64*)restore_buff.version_buffer;
-            memset(version_tmp, 0, read_key_num * sizeof(int64));
-          }
-          if (filter_flag) {
-            reader->LookupSegmentOffset(tensor_freq,
-                freq_part_offset + tot_freq_bytes_read,
-                read_key_num * sizeof(int64), restore_buff.freq_buffer,
-                freq_bytes_read);
-            if (freq_bytes_read == 0) {
-              int64 *freq_tmp = (int64 *)restore_buff.freq_buffer;
-              for (int64 i = 0; i < read_key_num; i++) {
-                freq_tmp[i] = (ev->MinFreq() == 0) ? 1 : ev->MinFreq();
-              }
-            }
-          } else {
-            int64 *freq_tmp = (int64 *)restore_buff.freq_buffer;
-            for (int64 i = 0; i < read_key_num; i++) {
-              freq_tmp[i] = (ev->MinFreq() == 0) ? 1 : ev->MinFreq();
-            }
-          }
-          if (key_bytes_read > 0) {
-            read_key_num = key_bytes_read / sizeof(K);
-            VLOG(2) << "restore, read_key_num:" << read_key_num;
-            if (restore_customDim && value_shape.dim_size(1) != newDim) {
-              VLOG(2) << "restore, read_value_reshape dim: from "
-                      << value_shape.dim_size(1) << " to " << newDim;
-              if (read_key_num * value_unit_bytes != value_bytes_read) {
-                return tensorflow::errors::FailedPrecondition(
-                    "Expected read_key_num * value_unit_bytes == "
-                    "value_bytes_read, but got read_key_num * value_unit_bytes "
-                    "!= value_bytes_read!");
-              }
-
-              std::unique_ptr<char[]> tmp_ptr(new char[buffer_size]);
-              size_t read_once =
-                  std::min(value_unit_bytes, value_unit_bytes_new);
-              for (int i = 0; i < read_key_num; ++i) {
-                memcpy(tmp_ptr.get() + i * value_unit_bytes_new,
-                    restore_buff.value_buffer + i * value_unit_bytes,
-                    read_once);
-                if (value_shape.dim_size(1) >= newDim) continue;
-                auto p = ev->GetDefaultValue(idx);
-                ++idx;
-                memcpy(tmp_ptr.get() + i * value_unit_bytes_new +
-                    value_unit_bytes,
-                    p + value_unit_bytes,
-                    value_unit_bytes_new - value_unit_bytes);
-              }
-              auto tmp = tmp_ptr.release();
-              tmp_ptr.reset(restore_buff.value_buffer);
-              restore_buff.value_buffer = tmp;
-            }
-
-            st = ev->Import(restore_buff, read_key_num, kSavedPartitionNum,
-                partition_id, partition_num, false, device);
-            if (cache_for_restore_hbm) {
-              cache_for_restore_hbm->update(
-                  (K*)restore_buff.key_buffer, read_key_num,
-                  (int64*)restore_buff.version_buffer,
-                  (int64*)restore_buff.freq_buffer);
-            }
-            if (!st.ok()) {
-              LOG(FATAL) << "EV restoring fail:" << st.ToString();
-            }
-          }
-          tot_key_num -= read_key_num;
-          tot_key_bytes_read += key_bytes_read;
-          tot_value_bytes_read += value_bytes_read;
-          tot_version_bytes_read += version_bytes_read;
-          tot_freq_bytes_read += freq_bytes_read;
-        }
-
-        if (restore_filter_flag) {
-          int subpart_filter_offset = part_filter_offset_flat(subpart_id);
-          int64 key_filter_part_offset = subpart_filter_offset * sizeof(K);
-          int64 version_filter_part_offset = subpart_filter_offset * sizeof(int64);
-          int64 freq_filter_part_offset = subpart_filter_offset * sizeof(int64);
-          int64 tot_key_filter_num =
-              part_filter_offset_flat(subpart_id + 1) - subpart_filter_offset;
-          size_t key_filter_bytes_read = 0;
-          size_t version_filter_bytes_read = 0;
-          size_t freq_filter_bytes_read = 0;
-          while (tot_key_filter_num > 0) {
-            size_t read_key_num =
-                std::min(buffer_size / sizeof(K), buffer_size / sizeof(int64));
-            read_key_num = std::min((int64)read_key_num, tot_key_filter_num);
-            reader->LookupSegmentOffset(tensor_key + "_filtered",
-                key_filter_part_offset + key_filter_bytes_read,
-                read_key_num * sizeof(K), restore_buff.key_buffer,
-                key_filter_bytes_read);
-            if (!reset_version) {
-              reader->LookupSegmentOffset(tensor_version + "_filtered",
-                  version_filter_part_offset + version_filter_bytes_read,
-                  read_key_num * sizeof(int64), restore_buff.version_buffer,
-                  version_filter_bytes_read);
-            } else {
-              int64 *version_tmp = (int64*)restore_buff.version_buffer;
-              memset(version_tmp, 0, read_key_num * sizeof(int64));
-            }
-            reader->LookupSegmentOffset(tensor_freq + "_filtered",
-                freq_filter_part_offset + freq_filter_bytes_read,
-                read_key_num * sizeof(int64), restore_buff.freq_buffer,
-                freq_filter_bytes_read);
-            if (key_filter_bytes_read > 0) {
-              read_key_num = key_filter_bytes_read / sizeof(K);
-              VLOG(2) << "restore, read_key_num:" << read_key_num;
-              st = ev->Import(restore_buff, read_key_num, kSavedPartitionNum,
-                  partition_id, partition_num, true, device);
-              if (cache_for_restore_hbm) {
-                cache_for_restore_hbm->update(
-                    (K*)restore_buff.key_buffer, read_key_num,
-                    (int64*)restore_buff.version_buffer,
-                    (int64*)restore_buff.freq_buffer);
-              }
-              if (!st.ok())
-                return st;
-              tot_key_filter_num -= read_key_num;
-            }
-          }
-        }
-      }
-    }
-
-    if (cache_for_restore_hbm) {
-      int64 cache_capacity = ev->CacheSize();
-      int64 num_of_hbm_ids =
-          std::min(cache_capacity, (int64)cache_for_restore_hbm->size());
-      K* hbm_ids = new K[num_of_hbm_ids];
-      int64* hbm_freqs = new int64[num_of_hbm_ids];
-      int64* hbm_versions = nullptr;
-      cache_for_restore_hbm->get_cached_ids(
-          hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs);
-      ev->ImportToHbm(hbm_ids, num_of_hbm_ids);
-      ev->storage()->Schedule([ev, hbm_ids, num_of_hbm_ids,
-          hbm_versions, hbm_freqs]() {
-        embedding::BatchCache<K>* cache = ev->Cache();
-        cache->update(hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs);
-        delete[] hbm_ids;
-        delete[] hbm_freqs;
-      });
-      delete cache_for_restore_hbm;
-    }
-  }
-  return Status::OK();
-}
-
-
-template <typename K>
-int64 ReadRecord(
-    BundleReader* reader,
-    const string& record_key,
-    K** buffer) {
-  TensorShape shape;
-  Status st;
-  reader->LookupTensorShape(record_key, &shape);
-  st = reader->LookupHeader(record_key,
-      sizeof(K) * shape.dim_size(0));
-  if (!st.ok()) {
-    LOG(FATAL)<<"Restore record "<<record_key<<" failed.";
-  }
-  size_t bytes_read = 0;
-  *buffer = new K[shape.dim_size(0)];
-  reader->LookupSegment(
-      record_key, sizeof(K) * shape.dim_size(0),
-      (char*)*buffer, bytes_read);
-  return shape.dim_size(0);
-}
-
-template <typename K, typename V>
-void RestoreSsdRecord(
-    EmbeddingVar<K, V>* ev,
-    const std::string& ssd_record_file_name,
-    const std::string& ssd_emb_file_name) {
-  BundleReader ssd_record_reader(Env::Default(),
-      ssd_record_file_name);
-  //Read the data of embedding files
-  int64* file_list = nullptr;
-  int64 num_of_files =
-      ReadRecord(&ssd_record_reader, "files", &file_list);
-
-  int64* invalid_record_count_list = nullptr;
-  ReadRecord(&ssd_record_reader,
-      "invalid_record_count",
-      &invalid_record_count_list);
-
-  int64* record_count_list = nullptr;
-  ReadRecord(&ssd_record_reader,
-      "record_count",
-      &record_count_list);
-
-  //Read the data of keys
-  K* key_list = nullptr;
-  int64 num_of_keys =
-      ReadRecord(&ssd_record_reader, "keys", &key_list);
-
-  int64* key_file_id_list = nullptr;
-  ReadRecord(&ssd_record_reader, "keys_file_id", &key_file_id_list);
-
-  int64* key_offset_list = nullptr;
-  ReadRecord(&ssd_record_reader, "keys_offset", &key_offset_list);
-
-  //Import the meta of keys to SSDHashKV
-  ev->RestoreSsdHashmap(key_list, key_file_id_list,
-      key_offset_list, num_of_keys,
-      file_list, invalid_record_count_list,
-      record_count_list, num_of_files,
-      ssd_emb_file_name);
-  delete[] key_list;
-  delete[] key_file_id_list;
-  delete[] key_offset_list;
-  delete[] file_list;
-  delete[] invalid_record_count_list;
-  delete[] record_count_list;
-}
-
-template <typename K, typename V>
-void LoadSsdData(
-    EmbeddingVar<K, V>* ev,
-    const std::string& ssd_record_file_name,
-    const std::string& ssd_emb_file_name) {
-  BundleReader ssd_record_reader(Env::Default(),
-      ssd_record_file_name);
-  std::string record_key;
-
-  K* key_list = nullptr;
-  int64 num_of_keys =
-      ReadRecord(&ssd_record_reader, "keys", &key_list);
-
-  int64* key_file_id_list = nullptr;
-  ReadRecord(&ssd_record_reader, "keys_file_id", &key_file_id_list);
-
-  int64* key_offset_list = nullptr;
-  ReadRecord(&ssd_record_reader, "keys_offset", &key_offset_list);
-
-  //Load keys and embedding data on ssd
-  ev->LoadSsdData(ssd_emb_file_name, key_list,
-      key_file_id_list, key_offset_list,
-      num_of_keys);
-  delete[] key_list;
-  delete[] key_file_id_list;
-  delete[] key_offset_list;
-}
-
 Status MoveMatchingFiles(
     Env* env,
     const tstring& pattern,
diff --git a/tensorflow/core/kernels/kv_variable_save_restore_ops.cc b/tensorflow/core/kernels/kv_variable_save_restore_ops.cc
index e0b9a1c95c1..fa7e043ffd3 100644
--- a/tensorflow/core/kernels/kv_variable_save_restore_ops.cc
+++ b/tensorflow/core/kernels/kv_variable_save_restore_ops.cc
@@ -258,10 +258,8 @@ class KvResourceImportV2Op: public AsyncOpKernel {
                    << s.ToString();
       }
 
-      EVRestoreDynamically(
-          ev, name_string, partition_id_, partition_num_, context, &reader,
-          "-partition_offset", "-keys", "-values", "-versions", "-freqs",
-          reset_version_);
+      ev->Restore(name_string, file_name_string, partition_id_, partition_num_,
+                  false, &reader, reset_version_);
       ev->SetInitialized();
       done();
     };
@@ -374,42 +372,20 @@ class KvResourceImportV3Op: public AsyncOpKernel {
       if (!s.ok()) {
         LOG(FATAL) << "Restore EV failure, create BundleReader error:"
                    << s.ToString();
+        done();
       }
 
-      std::string name_string_temp(name_string);
-      std::string new_str = "_";
-      int64 pos = name_string_temp.find("/");
-      while (pos != std::string::npos) {
-        name_string_temp.replace(pos, 1, new_str.data(), 1);
-        pos = name_string_temp.find("/");
-      }
-      std::string ssd_record_file_name =
-          file_name_string + "-" + name_string_temp + "-ssd_record";
-      //TODO: support change the partition number
-      if (Env::Default()->FileExists(ssd_record_file_name + ".index").ok()) {
-        std::string ssd_emb_file_name =
-            file_name_string + "-" + name_string_temp + "-emb_files";
-        if (ev->IsUsePersistentStorage()) {
-          RestoreSsdRecord(ev, ssd_record_file_name, ssd_emb_file_name);
-        } else {
-          LoadSsdData(ev, ssd_record_file_name, ssd_emb_file_name);
-        }
-      }
-
       if (ev->IsSingleHbm()) {
 #if GOOGLE_CUDA
         se::cuda::ScopedActivateExecutorContext scoped_activation{
             context->op_device_context()->stream()->parent()};
         const Eigen::GpuDevice& device = context->eigen_gpu_device();
-        EVRestoreDynamically(
-            ev, name_string, partition_id_, partition_num_, context, &reader,
-            "-partition_offset", "-keys", "-values", "-versions", "-freqs",
-            reset_version_, &device);
+        ev->Restore(name_string, file_name_string, partition_id_, partition_num_,
+                    false, &reader, reset_version_, &device);
 #endif
       }
else { - EVRestoreDynamically( - ev, name_string, partition_id_, partition_num_, context, &reader, - "-partition_offset", "-keys", "-values", "-versions", "-freqs", - reset_version_, nullptr); + ev->Restore(name_string, file_name_string, partition_id_, partition_num_, + false, &reader, reset_version_, nullptr); } ev->SetInitialized(); done(); @@ -495,10 +471,8 @@ class KvResourceIncrImportOp: public AsyncOpKernel { << "partition_num:" << partition_num_; - EVRestoreDynamically( - ev, name_string, partition_id_, partition_num_, context, &reader, - "-incr_partition_offset", "-sparse_incr_keys", "-sparse_incr_values", - "-sparse_incr_versions", "-sparse_incr_freqs"); + ev->Restore(name_string, file_name_string, partition_id_, partition_num_, + true, &reader); ev->SetInitialized(); done(); } diff --git a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py index 0a62f93af54..240938e8675 100644 --- a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py @@ -860,8 +860,210 @@ def testSaveV3(self): for name, shape in checkpoint_utils.list_variables(model_path): ckpt_value = checkpoint_utils.load_variable(model_path, name) print(name, shape, ckpt_value) - with self.test_session() as sess: - saver.restore(sess, model_path) + with ops.Graph().as_default() as g, ops.device('/gpu:0'): + emb_var_restore = variable_scope.get_embedding_variable("emb_var", 10) + emb1 = embedding_ops.embedding_lookup(emb_var_restore, math_ops.cast([1,2,3], dtypes.int64)) + saver = saver_module.Saver([emb_var_restore],sharded=True) + graph = ops.get_default_graph() + with self.test_session(graph = graph) as sess: + saver.restore(sess, model_path) + result = sess.run([emb1]) + print(result) + + def testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm(self): + print("testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm") + checkpoint_directory = self.get_temp_dir() + with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + + var2 = variable_scope.get_embedding_variable("var_2", + embedding_dim = 3, + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=2), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM, + storage_path='/tmp/leveldb/'))) + + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + emb2 = embedding_ops.embedding_lookup(var2, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + fun = math_ops.multiply(emb, 0.0, name='multiply') + fun1 = math_ops.multiply(emb2, 0.0, name='multiply_1') + loss = math_ops.reduce_sum(fun + fun1, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver(sharded=True) + init = variables.global_variables_initializer() + graph = ops.get_default_graph() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + sess.run(train_op) + emb_ori = sess.run(emb) + emb_ori_2 = sess.run(emb2) + save_path = saver.save(sess, 
os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) + print(save_path) + for name, shape in checkpoint_utils.list_variables(checkpoint_directory): + print('loading... ', name, shape) + + with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + + var2 = variable_scope.get_embedding_variable("var_2", + embedding_dim = 3, + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=2), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + emb2 = embedding_ops.embedding_lookup(var2, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + saver = saver_module.Saver([var, var2],sharded=True) + graph = ops.get_default_graph() + with self.test_session(graph = graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) + emb_val = sess.run(emb) + emb_val_2 = sess.run(emb2) + self.assertAllEqual(emb_ori, emb_val) + self.assertAllEqual(emb_ori_2, emb_val_2) + + with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=2), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + + var2 = variable_scope.get_embedding_variable("var_2", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + emb2 = embedding_ops.embedding_lookup(var2, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + saver = saver_module.Saver([var, var2],sharded=True) + graph = ops.get_default_graph() + with self.test_session(graph = graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) + emb_val = sess.run(emb) + emb_val_2 = sess.run(emb2) + self.assertAllEqual(emb_ori, emb_val) + self.assertAllEqual(emb_ori_2, emb_val_2) + + def testEmbeddingVariableForSaveAndRestoreForSingleTierHbm(self): + print("testEmbeddingVariableForSaveAndRestoreForSingleTierHbm") + checkpoint_directory = self.get_temp_dir() + with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM))) + + var2 = variable_scope.get_embedding_variable("var_2", + embedding_dim = 3, + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=3), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM))) + + emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + fun = math_ops.multiply(emb, 0.0, name='multiply') + fun1 = math_ops.multiply(emb2, 0.0, name='multiply_1') + loss = math_ops.reduce_sum(fun + fun1, name='reduce_sum') + gs = 
training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver(sharded=True) + init = variables.global_variables_initializer() + graph = ops.get_default_graph() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + sess.run(train_op) + emb_ori = sess.run(emb) + emb_ori_2 = sess.run(emb2) + save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) + print(save_path) + for name, shape in checkpoint_utils.list_variables(checkpoint_directory): + print('loading... ', name, shape) + + with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=3), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM))) + + var2 = variable_scope.get_embedding_variable("var_2", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM))) + + emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + saver = saver_module.Saver([var, var2],sharded=True) + graph = ops.get_default_graph() + with self.test_session(graph = graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) + emb_val = sess.run(emb) + emb_val_2 = sess.run(emb2) + self.assertAllEqual(emb_ori, emb_val) + self.assertAllEqual(emb_ori_2, emb_val_2) + + + with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM))) + + var2 = variable_scope.get_embedding_variable("var_2", + embedding_dim = 3, + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=2), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM))) + + emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + saver = saver_module.Saver([var,var2],sharded=True) + graph = ops.get_default_graph() + with self.test_session(graph = graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) + emb_val = sess.run(emb) + emb_val_2 = sess.run(emb2) + self.assertAllEqual(emb_ori, emb_val) + self.assertAllEqual(emb_ori_2, emb_val_2) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index 1175342f410..fae4ef91380 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -614,36 +614,6 @@ def testCategoricalColumnWithEmbeddingVariableFunction(self): for i in range(ids[col_name].shape.as_list()[0]): self.assertAllEqual(val_list[0][i], val_list[1][i]) - def testEmbeddingVariableForSaveAndRestore(self): - 
print("testEmbeddingVariableForSaveAndRestore") - checkpoint_directory = self.get_temp_dir() - with ops.device("/cpu:0"): - var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4), - ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.DRAM_LEVELDB, - storage_path='/tmp/leveldb/'))) - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - saver = saver_module.Saver(sharded=True) - init = variables.global_variables_initializer() - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - emb_ori = sess.run(emb) - emb_ori = sess.run(emb) - emb_ori = sess.run(emb) - emb_ori = sess.run(emb) - save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) - print(save_path) - for name, shape in checkpoint_utils.list_variables(checkpoint_directory): - print('loading... ', name, shape) - - with self.test_session() as sess: - saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) - self.assertAllEqual(emb_ori, sess.run(emb)) - def testEmbeddingVariableForL2FeatureEvictionFromContribFeatureColumn(self): print("testEmbeddingVariableForL2FeatureEvictionFromContribFeatureColumn") checkpoint_directory = self.get_temp_dir() @@ -2413,24 +2383,133 @@ def testEmbeddingVariableForMultiTierInference(self): sess.run(emb, feed_dict={ids:[3, 3]}) result = sess.run(tires) self.assertAllEqual(result, [0, 1, 0, 1]) + print(result) del os.environ["TF_SSDHASH_ASYNC_COMPACTION"] del os.environ["TF_RECORD_FREQ"] - def testEmbeddingVariableCustomDimForSaveAndRestore(self): - print("testEmbeddingVariableCustomForSaveAndRestore") + def testEmbeddingVariableForSaveAndRestoreForSingleTier(self): + print("testEmbeddingVariableForSaveAndRestoreForSingleTier") + checkpoint_directory = self.get_temp_dir() + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=2), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM))) + + var_2 = variable_scope.get_embedding_variable("var_2", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM))) + + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + emb_1 = embedding_ops.embedding_lookup(var_2, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + fun = math_ops.multiply(emb, 0.0, name='multiply') + fun1 = math_ops.multiply(emb_1, 0.0, name='multiply_1') + loss = math_ops.reduce_sum(fun + fun1, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver(sharded=True) + init = variables.global_variables_initializer() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + 
sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + sess.run(train_op) + emb_ori = sess.run(emb) + emb_ori_2 = sess.run(emb_1) + save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) + print(save_path) + for name, shape in checkpoint_utils.list_variables(checkpoint_directory): + print('loading... ', name, shape) + + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=2), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM))) + + var_2 = variable_scope.get_embedding_variable("var_2", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM))) + + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + emb_1 = embedding_ops.embedding_lookup(var_2, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + saver = saver_module.Saver([var,var_2], sharded=True) + graph = ops.get_default_graph() + with self.test_session(graph=graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) + ret = sess.run(emb) + ret_1 = sess.run(emb_1) + self.assertAllEqual(emb_ori, ret) + self.assertAllEqual(emb_ori_2, ret_1) + + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM))) + + var_2 = variable_scope.get_embedding_variable("var_2", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=2), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM))) + + emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + emb_1 = embedding_ops.embedding_lookup(var_2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + saver = saver_module.Saver([var,var_2], sharded=True) + graph = ops.get_default_graph() + with self.test_session(graph=graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) + ret = sess.run(emb) + ret_1 = sess.run(emb_1) + self.assertAllEqual(emb_ori, ret) + self.assertAllEqual(emb_ori_2, ret_1) + + def testEmbeddingVariableSaveAndRestoreForMultiTierWithoutHbm(self): + print("testEmbeddingVariableSaveAndRestoreForMultiTierWithoutHbm") checkpoint_directory = self.get_temp_dir() os.environ["TF_EV_RESTORE_CUSTOM_DIM"] = "True" with ops.Graph().as_default() as g, ops.device('/cpu:0'): var = variable_scope.get_embedding_variable("var_1", embedding_dim = 3, - ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.DRAM_LEVELDB, - storage_path='/tmp/leveldb/'))) + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM_LEVELDB, + storage_path='/tmp/leveldb/'))) var2 = 
variable_scope.get_embedding_variable("var_2", embedding_dim = 3, partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4), - ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.DRAM_LEVELDB, - storage_path='/tmp/leveldb/'))) + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM_LEVELDB, + storage_path='/tmp/leveldb/'))) emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) @@ -2459,14 +2538,18 @@ def testEmbeddingVariableCustomDimForSaveAndRestore(self): with ops.Graph().as_default() as g, ops.device('/cpu:0'): var = variable_scope.get_embedding_variable("var_1", embedding_dim = 6, - ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.DRAM_LEVELDB, - storage_path='/tmp/leveldb/'))) + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM_LEVELDB, + storage_path='/tmp/leveldb/'))) var2 = variable_scope.get_embedding_variable("var_2", embedding_dim = 5, partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4), - ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.DRAM_LEVELDB, - storage_path='/tmp/leveldb/'))) + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM_LEVELDB, + storage_path='/tmp/leveldb/'))) emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) saver = saver_module.Saver([var,var2],sharded=True) @@ -2481,14 +2564,18 @@ def testEmbeddingVariableCustomDimForSaveAndRestore(self): with ops.Graph().as_default() as g, ops.device('/cpu:0'): var = variable_scope.get_embedding_variable("var_1", embedding_dim = 2, - ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.DRAM_LEVELDB, - storage_path='/tmp/leveldb/'))) + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM_LEVELDB, + storage_path='/tmp/leveldb/'))) var2 = variable_scope.get_embedding_variable("var_2", embedding_dim = 2, partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4), - ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.DRAM_LEVELDB, - storage_path='/tmp/leveldb/'))) + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM_LEVELDB, + storage_path='/tmp/leveldb/'))) emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) saver = saver_module.Saver([var,var2],sharded=True) @@ -2501,6 +2588,58 @@ def testEmbeddingVariableCustomDimForSaveAndRestore(self): self.assertAllEqual(emb_ori_2[:,0:2], emb_val_2) del os.environ["TF_EV_RESTORE_CUSTOM_DIM"] + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = 
variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM_LEVELDB, + storage_path='/tmp/leveldb/'))) + + var2 = variable_scope.get_embedding_variable("var_2", + embedding_dim = 3, + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM_LEVELDB, + storage_path='/tmp/leveldb/'))) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + saver = saver_module.Saver([var,var2],sharded=True) + graph = ops.get_default_graph() + with self.test_session(graph = graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) + emb_val = sess.run(emb) + emb_val_2 = sess.run(emb2) + self.assertAllEqual(emb_ori, emb_val) + self.assertAllEqual(emb_ori_2, emb_val_2) + + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM_LEVELDB, + storage_path='/tmp/leveldb/'))) + + var2 = variable_scope.get_embedding_variable("var_2", + embedding_dim = 3, + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4), + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.DRAM_LEVELDB, + storage_path='/tmp/leveldb/'))) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) + saver = saver_module.Saver([var,var2],sharded=True) + graph = ops.get_default_graph() + with self.test_session(graph = graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) + emb_val = sess.run(emb) + emb_val_2 = sess.run(emb2) + self.assertAllEqual(emb_ori, emb_val) + self.assertAllEqual(emb_ori_2, emb_val_2) + def testCPUFbjOpt(self): print("testCPUFbjOpt") os.environ["TF_EMBEDDING_FBJ_OPT"] = "True" From 616e9e45510a557b2c74caba8e9c0e3cae163dfa Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Fri, 14 Jul 2023 13:49:23 +0800 Subject: [PATCH 39/91] [ModelZoo] Update modelzoo README. 
(#916) Models: bst/dbmtl/dcn/deepfm/dien/din/dlrm/dssm/esmm/mmoe/ple/simple_multitask/wide_and_deep Signed-off-by: candy.dc --- modelzoo/bst/README.md | 42 +++++++++++++-------------- modelzoo/dbmtl/README.md | 43 +++++++++++++-------------- modelzoo/dcn/README.md | 39 ++++++++++++------------- modelzoo/deepfm/README.md | 39 ++++++++++++------------- modelzoo/dien/README.md | 39 ++++++++++++------------- modelzoo/din/README.md | 39 ++++++++++++------------- modelzoo/dlrm/README.md | 39 ++++++++++++------------- modelzoo/dssm/README.md | 40 ++++++++++++------------- modelzoo/esmm/README.md | 45 +++++++++++++++-------------- modelzoo/mmoe/README.md | 43 +++++++++++++-------------- modelzoo/ple/README.md | 43 +++++++++++++-------------- modelzoo/simple_multitask/README.md | 45 +++++++++++++++-------------- modelzoo/wide_and_deep/README.md | 39 ++++++++++++------------- 13 files changed, 267 insertions(+), 268 deletions(-) diff --git a/modelzoo/bst/README.md b/modelzoo/bst/README.md index 4a87f8a2055..05b05e4e006 100644 --- a/modelzoo/bst/README.md +++ b/modelzoo/bst/README.md @@ -127,7 +127,7 @@ input: - `--data_location`: Full path of train & eval data, default to `./data`. - `--steps`: Set the number of steps on train dataset. Default will be set to 100 epoch. - `--no_eval`: Do not evaluate trained model by eval dataset. - - `--batch_size`: Batch size to train. Default to 512. + - `--batch_size`: Batch size to train. Default to 2048. - `--output_dir`: Full path to output directory for logs and saved model, default to `./result`. - `--checkpoint`: Full path to checkpoints input/output directory, default to `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMPS)` - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to 0. @@ -157,21 +157,20 @@ input: ## Benchmark ### Stand-alone Training #### Test Environment -The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7). +The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.g8i.4xlarge**](https://help.aliyun.com/document_detail/25378.html#g8i). - Hardware - - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz - - CPU(s): 8 + - Model name: Intel(R) Xeon(R) Platinum 8475B + - CPU(s): 16 - Socket(s): 1 - - Core(s) per socket: 4 + - Core(s) per socket: 8 - Thread(s) per core: 2 - - Memory: 32G + - Memory: 64G - Software - - kernel: 4.18.0-348.2.1.el8_5.x86_64 - - OS: CentOS Linux release 8.5.2111 - - GCC: 8.5.0 - - Docker: 20.10.12 - - Python: 3.6.8 + - kernel: Linux version 5.15.0-58-generic (buildd@lcy02-amd64-101)(AMX patched) + - OS: Ubuntu 22.04.2 LTS + - GCC: 11.3.0 + - Docker: 20.10.21 #### Performance Result @@ -182,33 +181,34 @@ The benchmark is performed on the [Alibaba Cloud ECS general purpose instance fa DType Accuracy AUC - Globalsetp/Sec + Throughput BST Community TensorFlow FP32 - - - + 0.912500 + 0.499316 + 16924.47(baseline) DeepRec w/ oneDNN FP32 - - - + 0.894900 + 0.499316 + 22143.04(1.30x) DeepRec w/ oneDNN FP32+BF16 - - - + 0.909099 + 0.499316 + 28686.70(1.69x) - Community TensorFlow version is v1.15.5. +- Due to the small size of the dataset, the results did not converge, leading to limited reference value for ACC and AUC. 
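The tables in these READMEs now report Throughput rather than Globalsetp/Sec. A minimal sketch of how the two quantities relate and how the bracketed speedup factors are derived, assuming Throughput means samples per second under the scripts' default batch size of 2048 (the patch itself does not spell out the formula):

```python
# Sanity-check sketch for the benchmark tables above; not part of the modelzoo.
# Assumption: "Throughput" is samples/sec, i.e. global steps/sec * batch size,
# and the bracketed factor is throughput relative to the FP32 baseline.

BATCH_SIZE = 2048  # the scripts' new default

def steps_per_sec(samples_per_sec, batch_size=BATCH_SIZE):
    return samples_per_sec / batch_size

def speedup(samples_per_sec, baseline_samples_per_sec):
    return samples_per_sec / baseline_samples_per_sec

baseline = 16924.47  # BST, Community TensorFlow FP32
for name, tput in [("oneDNN FP32", 22143.04), ("oneDNN FP32+BF16", 28686.70)]:
    print("%s: %.2f global steps/sec, %.2fx" % (name, steps_per_sec(tput),
                                                speedup(tput, baseline)))
# Prints roughly 10.81 and 14.01 steps/sec, and 1.31x / 1.69x, in line with
# the 1.30x / 1.69x factors in the BST table above.
```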
### Distributed Training #### Test Environment diff --git a/modelzoo/dbmtl/README.md b/modelzoo/dbmtl/README.md index 1903d47e079..471fa01372d 100644 --- a/modelzoo/dbmtl/README.md +++ b/modelzoo/dbmtl/README.md @@ -121,7 +121,7 @@ Context │ │────►│ │ - `--data_location`: Full path of train & eval data. Default is `./data`. - `--steps`: Set the number of steps on train dataset. When default(`0`) is used, the number of steps is computed based on dataset size and number of epochs equals 1000. - `--no_eval`: Do not evaluate trained model by eval dataset. - - `--batch_size`: Batch size to train. Default is `512`. + - `--batch_size`: Batch size to train. Default is `2048`. - `--output_dir`: Full path to output directory for logs and saved model. Default is `./result`. - `--checkpoint`: Full path to checkpoints output directory. Default is `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMP)` - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to `None`. @@ -151,20 +151,20 @@ Context │ │────►│ │ ### Stand-alone Training #### Test Environment -The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7). -- Hardware - - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz - - CPU(s): 8 +The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.g8i.4xlarge**](https://help.aliyun.com/document_detail/25378.html#g8i). +- Hardware + - Model name: Intel(R) Xeon(R) Platinum 8475B + - CPU(s): 16 - Socket(s): 1 - - Core(s) per socket: 4 + - Core(s) per socket: 8 - Thread(s) per core: 2 - - Memory: 32G + - Memory: 64G - Software - - kernel: 4.18.0-305.12.1.el8_4.x86_64 - - OS: CentOS Linux release 8.4.2105 - - Docker: 20.10.12 - - Python: 3.6.12 + - kernel: Linux version 5.15.0-58-generic (buildd@lcy02-amd64-101)(AMX patched) + - OS: Ubuntu 22.04.2 LTS + - GCC: 11.3.0 + - Docker: 20.10.21 #### Performance Result @@ -175,33 +175,34 @@ The benchmark is performed on the [Alibaba Cloud ECS general purpose instance fa DType Accuracy AUC - Globalsetp/Sec + Throughput DBMTL Community TensorFlow FP32 - - - + 0.973150 + 0.753008 + 63220.87(baseline) DeepRec w/ oneDNN FP32 - - - + 0.973150 + 0.753070 + 77383.57(1.22x) DeepRec w/ oneDNN FP32+BF16 - - - + 0.973150 + 0.753070 + 137581.54(2.17x) - Community TensorFlow version is v1.15.5. +- Due to the small size of the dataset, the results did not converge, leading to limited reference value for ACC and AUC. ## Dataset Train & eval dataset using ***Taobao dataset***. diff --git a/modelzoo/dcn/README.md b/modelzoo/dcn/README.md index e46ca2ef734..f7cc1ffbc58 100644 --- a/modelzoo/dcn/README.md +++ b/modelzoo/dcn/README.md @@ -98,7 +98,7 @@ The following is a brief directory structure and description for this example: - `--data_location`: Full path of train & eval data, default to `./data`. - `--steps`: Set the number of steps on train dataset. Default will be set to 1 epoch. - `--no_eval`: Do not evaluate trained model by eval dataset. - - `--batch_size`: Batch size to train. Default to 512. + - `--batch_size`: Batch size to train. Default to 2048. - `--output_dir`: Full path to output directory for logs and saved model, default to `./result`. 
- `--checkpoint`: Full path to checkpoints input/output directory, default to `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMPS)` - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to 0. @@ -128,21 +128,20 @@ The following is a brief directory structure and description for this example: ## Benchmark ### Stand-alone Training #### Test Environment -The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7). +The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.g8i.4xlarge**](https://help.aliyun.com/document_detail/25378.html#g8i). - Hardware - - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz - - CPU(s): 8 + - Model name: Intel(R) Xeon(R) Platinum 8475B + - CPU(s): 16 - Socket(s): 1 - - Core(s) per socket: 4 + - Core(s) per socket: 8 - Thread(s) per core: 2 - - Memory: 32G + - Memory: 64G - Software - - kernel: 4.18.0-348.2.1.el8_5.x86_64 - - OS: CentOS Linux release 8.5.2111 - - GCC: 8.5.0 - - Docker: 20.10.12 - - Python: 3.6.8 + - kernel: Linux version 5.15.0-58-generic (buildd@lcy02-amd64-101)(AMX patched) + - OS: Ubuntu 22.04.2 LTS + - GCC: 11.3.0 + - Docker: 20.10.21 #### Performance Result @@ -159,23 +158,23 @@ The benchmark is performed on the [Alibaba Cloud ECS general purpose instance fa DCN Community TensorFlow FP32 - 0.775859 - 0.768275 - + 0.776260 + 0.769636 + 24524.91(baseline) DeepRec w/ oneDNN FP32 - - - + 0.775738 + 0.769095 + 31917.35(1.30x) DeepRec w/ oneDNN FP32+BF16 - - - + 0.775738 + 0.768651 + 55753.15(2.27x) diff --git a/modelzoo/deepfm/README.md b/modelzoo/deepfm/README.md index 49eb0c0dab8..986c4ecdd89 100644 --- a/modelzoo/deepfm/README.md +++ b/modelzoo/deepfm/README.md @@ -123,7 +123,7 @@ input: | | - `--data_location`: Full path of train & eval data, default to `./data`. - `--steps`: Set the number of steps on train dataset. Default will be set to 1 epoch. - `--no_eval`: Do not evaluate trained model by eval dataset. - - `--batch_size`: Batch size to train. Default to 512. + - `--batch_size`: Batch size to train. Default to 2048. - `--output_dir`: Full path to output directory for logs and saved model, default to `./result`. - `--checkpoint`: Full path to checkpoints input/output directory, default to `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMPS)` - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to 0. @@ -153,21 +153,20 @@ input: | | ## Benchmark ### Stand-alone Training #### Test Environment -The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7). +The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.g8i.4xlarge**](https://help.aliyun.com/document_detail/25378.html#g8i). 
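All of these standalone benchmarks run on the same 16-vCPU (8 physical cores) g8i instance, detailed below. When reproducing them, the TF 1.15 session thread pools are typically sized to the core count; the patch does not record the exact values used, so the settings in this sketch are illustrative only:

```python
import tensorflow as tf  # TF 1.15 API, as used by DeepRec

# Illustrative thread-pool sizing for an 8-physical-core host such as
# ecs.g8i.4xlarge; the benchmarks' actual settings are not given here.
session_config = tf.ConfigProto(
    intra_op_parallelism_threads=8,  # threads available to a single op
    inter_op_parallelism_threads=2)  # ops that may run concurrently

with tf.Session(config=session_config) as sess:
    pass  # build and run the model under this config
```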
- Hardware - - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz - - CPU(s): 8 + - Model name: Intel(R) Xeon(R) Platinum 8475B + - CPU(s): 16 - Socket(s): 1 - - Core(s) per socket: 4 + - Core(s) per socket: 8 - Thread(s) per core: 2 - - Memory: 32G + - Memory: 64G - Software - - kernel: 4.18.0-348.2.1.el8_5.x86_64 - - OS: CentOS Linux release 8.5.2111 - - GCC: 8.5.0 - - Docker: 20.10.12 - - Python: 3.6.8 + - kernel: Linux version 5.15.0-58-generic (buildd@lcy02-amd64-101)(AMX patched) + - OS: Ubuntu 22.04.2 LTS + - GCC: 11.3.0 + - Docker: 20.10.21 #### Performance Result @@ -184,23 +183,23 @@ The benchmark is performed on the [Alibaba Cloud ECS general purpose instance fa DeepFM Community TensorFlow FP32 - 0.784695 - 0.781548 - 18848.64(baseline) + 0.782777 + 0.776113 + 61230.80(baseline) DeepRec w/ oneDNN FP32 - 0.782755 - 0.777158 - 31260.00(1.65x) + 0.780460 + 0.773281 + 74380.35(1.22x) DeepRec w/ oneDNN FP32+BF16 - 0.782659 - 0.776537 - 34627.46(1.84x) + 0.780460 + 0.775249 + 95107.32(1.55x) diff --git a/modelzoo/dien/README.md b/modelzoo/dien/README.md index 58df07cd66b..aaaf66f5bee 100644 --- a/modelzoo/dien/README.md +++ b/modelzoo/dien/README.md @@ -108,7 +108,7 @@ The following is a brief directory structure and description for this example: - `--data_location`: Full path of train & eval data, default to `./data`. - `--steps`: Set the number of steps on train dataset. Default will be set to 1 epoch. - `--no_eval`: Do not evaluate trained model by eval dataset. - - `--batch_size`: Batch size to train. Default to 512. + - `--batch_size`: Batch size to train. Default to 2048. - `--output_dir`: Full path to output directory for logs and saved model, default to `./result`. - `--checkpoint`: Full path to checkpoints input/output directory, default to `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMPS)` - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to 0. @@ -138,21 +138,20 @@ The following is a brief directory structure and description for this example: ## Benchmark ### Stand-alone Training #### Test Environment -The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7). +The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.g8i.4xlarge**](https://help.aliyun.com/document_detail/25378.html#g8i). 
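Each model's flag list above moves the default `--batch_size` from 512 to 2048. A hypothetical argparse mirror of the shared flags, to make the new defaults concrete (the real train.py scripts define more options and may differ in detail); the hardware the benchmarks ran on follows below:

```python
import argparse

# Hypothetical mirror of the common modelzoo flags; names and defaults follow
# the README lists above, anything beyond that is an assumption.
parser = argparse.ArgumentParser()
parser.add_argument("--data_location", default="./data",
                    help="Full path of train & eval data")
parser.add_argument("--steps", type=int, default=0,
                    help="0 derives the step count from the dataset/epochs")
parser.add_argument("--no_eval", action="store_true",
                    help="Do not evaluate the trained model")
parser.add_argument("--batch_size", type=int, default=2048,  # previously 512
                    help="Training batch size")
parser.add_argument("--output_dir", default="./result",
                    help="Output directory for logs and saved model")
args = parser.parse_args()
```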
- Hardware - - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz - - CPU(s): 8 + - Model name: Intel(R) Xeon(R) Platinum 8475B + - CPU(s): 16 - Socket(s): 1 - - Core(s) per socket: 4 + - Core(s) per socket: 8 - Thread(s) per core: 2 - - Memory: 32G + - Memory: 64G - Software - - kernel: 4.18.0-348.2.1.el8_5.x86_64 - - OS: CentOS Linux release 8.5.2111 - - GCC: 8.5.0 - - Docker: 20.10.12 - - Python: 3.6.8 + - kernel: Linux version 5.15.0-58-generic (buildd@lcy02-amd64-101)(AMX patched) + - OS: Ubuntu 22.04.2 LTS + - GCC: 11.3.0 + - Docker: 20.10.21 #### Performance Result @@ -169,23 +168,23 @@ The benchmark is performed on the [Alibaba Cloud ECS general purpose instance fa DIEN Community TensorFlow FP32 - 0.681824 - 0.757496 - 2822.78(baseline) + 0.575529 + 0.597272 + 6327.50(baseline) DeepRec w/ oneDNN FP32 - 0.692499 - 0.767193 - 3834.05(1.36x) + 0.543935 + 0.5972728 + 10094.21(1.60x) DeepRec w/ oneDNN FP32+BF16 - 0.693011 - 0.768412 - 3862.06(1.37x) + 0.551233 + 0.597272 + 11565.63(1.83x) diff --git a/modelzoo/din/README.md b/modelzoo/din/README.md index 6c2ee620eb4..5a7767d22e9 100644 --- a/modelzoo/din/README.md +++ b/modelzoo/din/README.md @@ -105,7 +105,7 @@ The following is a brief directory structure and description for this example: - `--data_location`: Full path of train & eval data, default to `./data`. - `--steps`: Set the number of steps on train dataset. Default will be set to 1 epoch. - `--no_eval`: Do not evaluate trained model by eval dataset. - - `--batch_size`: Batch size to train. Default to 512. + - `--batch_size`: Batch size to train. Default to 2048. - `--output_dir`: Full path to output directory for logs and saved model, default to `./result`. - `--checkpoint`: Full path to checkpoints input/output directory, default to `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMPS)` - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to 0. @@ -135,21 +135,20 @@ The following is a brief directory structure and description for this example: ## Benchmark ### Stand-alone Training #### Test Environment -The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7). +The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.g8i.4xlarge**](https://help.aliyun.com/document_detail/25378.html#g8i). 
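The software stacks below all note an AMX-patched 5.15 kernel. A small Linux-only helper to confirm that a host actually advertises AMX (an aside for anyone reproducing the numbers, not part of the patch):

```python
def amx_cpu_flags(cpuinfo_path="/proc/cpuinfo"):
    """Return the AMX-related CPU flags the kernel advertises (Linux only)."""
    with open(cpuinfo_path) as f:
        for line in f:
            if line.startswith("flags"):
                flags = line.split(":", 1)[1].split()
                return sorted(flag for flag in flags if flag.startswith("amx"))
    return []

# An AMX-enabled Sapphire Rapids host (e.g. Platinum 8475B) typically shows
# ['amx_bf16', 'amx_int8', 'amx_tile']; an empty list means no AMX support.
print(amx_cpu_flags())
```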
- Hardware - - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz - - CPU(s): 8 + - Model name: Intel(R) Xeon(R) Platinum 8475B + - CPU(s): 16 - Socket(s): 1 - - Core(s) per socket: 4 + - Core(s) per socket: 8 - Thread(s) per core: 2 - - Memory: 32G + - Memory: 64G - Software - - kernel: 4.18.0-348.2.1.el8_5.x86_64 - - OS: CentOS Linux release 8.5.2111 - - GCC: 8.5.0 - - Docker: 20.10.12 - - Python: 3.6.8 + - kernel: Linux version 5.15.0-58-generic (buildd@lcy02-amd64-101)(AMX patched) + - OS: Ubuntu 22.04.2 LTS + - GCC: 11.3.0 + - Docker: 20.10.21 #### Performance Result @@ -166,23 +165,23 @@ The benchmark is performed on the [Alibaba Cloud ECS general purpose instance fa DIN Community TensorFlow FP32 - 0.68156 - 0.757246 - 10474.23(baseline) + 0.580060 + 0.611078 + 18522.65(baseline) DeepRec w/ oneDNN FP32 - 0.692483 - 0.767444 - 22299.68(2.13x) + 0.546231 + 0.582456 + 59160.78(3.19x) DeepRec w/ oneDNN FP32+BF16 - 0.694471 - 0.770365 - + 0.552499 + 0.583775 + 59651.75(3.22x) diff --git a/modelzoo/dlrm/README.md b/modelzoo/dlrm/README.md index 5c1ef2f3b69..61baa14912c 100755 --- a/modelzoo/dlrm/README.md +++ b/modelzoo/dlrm/README.md @@ -115,7 +115,7 @@ The triangles represent mlp network. The inputs consists of dense features and s - `--data_location`: Full path of train & eval data, default to `./data`. - `--steps`: Set the number of steps on train dataset. Default will be set to 1 epoch. - `--no_eval`: Do not evaluate trained model by eval dataset. - - `--batch_size`: Batch size to train. Default to 512. + - `--batch_size`: Batch size to train. Default to 2048. - `--output_dir`: Full path to output directory for logs and saved model, default to `./result`. - `--checkpoint`: Full path to checkpoints input/output directory, default to `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMPS)` - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to 0. @@ -146,21 +146,20 @@ The triangles represent mlp network. The inputs consists of dense features and s ## Benchmark ### Stand-alone Training #### Test Environment -The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7). +The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.g8i.4xlarge**](https://help.aliyun.com/document_detail/25378.html#g8i). 
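The FP32+BF16 rows in these tables keep the model parameters in float32 and run the heavy dense math in bfloat16, which AVX512-BF16/AMX accelerates. A generic TF 1.15 sketch of that pattern (the idea only, not DeepRec's actual modelzoo wiring):

```python
import tensorflow as tf  # TF 1.15 API

# Generic FP32+BF16 pattern: variables stay float32 for stable updates, the
# matmul runs in bfloat16, and the result is cast back before the loss.
x = tf.placeholder(tf.float32, [None, 64])
w = tf.get_variable("w", shape=[64, 32], dtype=tf.float32)
y = tf.cast(
    tf.matmul(tf.cast(x, tf.bfloat16), tf.cast(w, tf.bfloat16)),
    tf.float32)
```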
- Hardware - - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz - - CPU(s): 8 + - Model name: Intel(R) Xeon(R) Platinum 8475B + - CPU(s): 16 - Socket(s): 1 - - Core(s) per socket: 4 + - Core(s) per socket: 8 - Thread(s) per core: 2 - - Memory: 32G + - Memory: 64G - Software - - kernel: 4.18.0-348.2.1.el8_5.x86_64 - - OS: CentOS Linux release 8.5.2111 - - GCC: 8.5.0 - - Docker: 20.10.12 - - Python: 3.6.8 + - kernel: Linux version 5.15.0-58-generic (buildd@lcy02-amd64-101)(AMX patched) + - OS: Ubuntu 22.04.2 LTS + - GCC: 11.3.0 + - Docker: 20.10.21 #### Performance Result @@ -177,23 +176,23 @@ The benchmark is performed on the [Alibaba Cloud ECS general purpose instance fa DLRM Community TensorFlow FP32 - 0.782930 - 0.780513 - 34617.94(baseline) + 0.785739 + 0.785776 + 110607.49(baseline) DeepRec w/ oneDNN FP32 - 0.783274 - 0.780988 - 59067.85(1.71x) + 0.784370 + 0.785100 + 129766.90(1.17x) DeepRec w/ oneDNN FP32+BF16 - 0.783428 - 0.780825 - 60907.11(1.76x) + 0.783970 + 0.785587 + 141266.06(1.28x) diff --git a/modelzoo/dssm/README.md b/modelzoo/dssm/README.md index 14b920948aa..4eb5ee84863 100644 --- a/modelzoo/dssm/README.md +++ b/modelzoo/dssm/README.md @@ -120,7 +120,7 @@ input: - `--data_location`: Full path of train & eval data, default to `./data`. - `--steps`: Set the number of steps on train dataset. Default will be set to 1000 epoch. - `--no_eval`: Do not evaluate trained model by eval dataset. - - `--batch_size`: Batch size to train. Default to 4096. + - `--batch_size`: Batch size to train. Default to 2048. - `--output_dir`: Full path to output directory for logs and saved model, default to `./result`. - `--checkpoint`: Full path to checkpoints input/output directory, default to `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMPS)` - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to 0. @@ -149,21 +149,20 @@ input: ## Benchmark ### Stand-alone Training #### Test Environment -The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7). +The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.g8i.4xlarge**](https://help.aliyun.com/document_detail/25378.html#g8i). - Hardware - - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz - - CPU(s): 8 + - Model name: Intel(R) Xeon(R) Platinum 8475B + - CPU(s): 16 - Socket(s): 1 - - Core(s) per socket: 4 + - Core(s) per socket: 8 - Thread(s) per core: 2 - - Memory: 32G + - Memory: 64G - Software - - kernel: 4.18.0-348.2.1.el8_5.x86_64 - - OS: CentOS Linux release 8.5.2111 - - GCC: 8.5.0 - - Docker: 20.10.12 - - Python: 3.6.8 + - kernel: Linux version 5.15.0-58-generic (buildd@lcy02-amd64-101)(AMX patched) + - OS: Ubuntu 22.04.2 LTS + - GCC: 11.3.0 + - Docker: 20.10.21 #### Performance Result @@ -180,27 +179,28 @@ The benchmark is performed on the [Alibaba Cloud ECS general purpose instance fa DSSM Community TensorFlow FP32 - 0.913399 - 0.503494 - 75906.95(baseline) + 0.894500 + 0.499083 + 98357.70(baseline) DeepRec w/ oneDNN FP32 - 0.9079 - 0.492405 - 115607.72(1.52x) + 0.922299 + 0.503223 + 182120.67(1.85x) DeepRec w/ oneDNN FP32+BF16 - 0.9265 - 0.484328 - 129099.08(1.70x) + 0.918200 + 0.498067 + 191525.84(1.95x) - Community TensorFlow version is v1.15.5. 
+- Due to the small size of the dataset, the results did not converge, leading to limited reference value for ACC and AUC. ### Distributed Training #### Test Environment diff --git a/modelzoo/esmm/README.md b/modelzoo/esmm/README.md index cab8689ed4c..e0ff1df421d 100644 --- a/modelzoo/esmm/README.md +++ b/modelzoo/esmm/README.md @@ -130,7 +130,7 @@ input: - `--data_location`: Full path of train & eval data. Default is `./data`. - `--steps`: Set the number of steps on train dataset. When default(`0`) is used, the number of steps is computed based on dataset size and number of epochs equals 1000. - `--no_eval`: Do not evaluate trained model by eval dataset. - - `--batch_size`: Batch size to train. Default is `512`. + - `--batch_size`: Batch size to train. Default is `2048`. - `--output_dir`: Full path to output directory for logs and saved model. Default is `./result`. - `--checkpoint`: Full path to checkpoints output directory. Default is `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMP)` - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to `None`. @@ -163,20 +163,20 @@ input: ### Stand-alone Training #### Test Environment -The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7). -- Hardware - - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz - - CPU(s): 8 +The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.g8i.4xlarge**](https://help.aliyun.com/document_detail/25378.html#g8i). +- Hardware + - Model name: Intel(R) Xeon(R) Platinum 8475B + - CPU(s): 16 - Socket(s): 1 - - Core(s) per socket: 4 + - Core(s) per socket: 8 - Thread(s) per core: 2 - - Memory: 32G + - Memory: 64G - Software - - kernel: 4.18.0-305.12.1.el8_4.x86_64 - - OS: CentOS Linux release 8.4.2105 - - Docker: 20.10.12 - - Python: 3.6.12 + - kernel: Linux version 5.15.0-58-generic (buildd@lcy02-amd64-101)(AMX patched) + - OS: Ubuntu 22.04.2 LTS + - GCC: 11.3.0 + - Docker: 20.10.21 #### Performance Result @@ -187,33 +187,34 @@ The benchmark is performed on the [Alibaba Cloud ECS general purpose instance fa DType Accuracy AUC - Globalsteps/sec + Throughput ESMM Community TensorFlow FP32 - - - + 0.99959 + 0.5 + 90890.15(baseline) DeepRec w/ oneDNN FP32 - - - + 0.99959 + 0.5 + 170830.50(1.88x) DeepRec w/ oneDNN FP32+BF16 - - - + 0.99959 + 0.5 + 202675.93(2.23x) -Community TensorFlow version is v1.15.5. +- Community TensorFlow version is v1.15.5. +- Due to the small size of the dataset, the results did not converge, leading to limited reference value for ACC and AUC. ### Distributed Training #### Test Environment diff --git a/modelzoo/mmoe/README.md b/modelzoo/mmoe/README.md index 608cb718534..f1f3b0d61dd 100644 --- a/modelzoo/mmoe/README.md +++ b/modelzoo/mmoe/README.md @@ -137,7 +137,7 @@ model: ┌────┴────┐ - `--data_location`: Full path of train & eval data. Default is `./data`. - `--steps`: Set the number of steps on train dataset. When default(`0`) is used, the number of steps is computed based on dataset size and number of epochs equals 1000. - `--no_eval`: Do not evaluate trained model by eval dataset. - - `--batch_size`: Batch size to train. Default is `512`. + - `--batch_size`: Batch size to train. Default is `2048`. - `--output_dir`: Full path to output directory for logs and saved model. Default is `./result`. 
- `--checkpoint`: Full path to checkpoints output directory. Default is `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMP)` - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to `None`. @@ -167,20 +167,20 @@ model: ┌────┴────┐ ### Stand-alone Training #### Test Environment -The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7). -- Hardware - - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz - - CPU(s): 8 +The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.g8i.4xlarge**](https://help.aliyun.com/document_detail/25378.html#g8i). +- Hardware + - Model name: Intel(R) Xeon(R) Platinum 8475B + - CPU(s): 16 - Socket(s): 1 - - Core(s) per socket: 4 + - Core(s) per socket: 8 - Thread(s) per core: 2 - - Memory: 32G + - Memory: 64G - Software - - kernel: 4.18.0-305.12.1.el8_4.x86_64 - - OS: CentOS Linux release 8.4.2105 - - Docker: 20.10.12 - - Python: 3.6.12 + - kernel: Linux version 5.15.0-58-generic (buildd@lcy02-amd64-101)(AMX patched) + - OS: Ubuntu 22.04.2 LTS + - GCC: 11.3.0 + - Docker: 20.10.21 #### Performance Result @@ -190,33 +190,34 @@ The benchmark is performed on the [Alibaba Cloud ECS general purpose instance fa - + - - - + + + - - - + + + - - - + + +
DType Accuracy AUC Globalsetp/Sec Throughput
MMOE Community TensorFlow FP32 0.973150 0.747064 67189.94(baseline)
DeepRec w/ oneDNN FP32 0.973150 0.753070 105387.94(1.57x)
DeepRec w/ oneDNN FP32+BF16 0.973150 0.5 142645.17(2.12x)
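The multipliers in parentheses in these tables are simply each row's throughput divided by the community TensorFlow baseline of the same table. A minimal sketch of that arithmetic (illustrative only, not part of the modelzoo scripts), using the MMOE numbers above:

```python
# Speedup factors are throughput ratios against the community TensorFlow
# baseline; the values below are taken from the MMOE table above.
baseline = 67189.94
for name, throughput in [("DeepRec w/ oneDNN FP32", 105387.94),
                         ("DeepRec w/ oneDNN FP32+BF16", 142645.17)]:
    print(f"{name}: {throughput / baseline:.2f}x")
# DeepRec w/ oneDNN FP32: 1.57x
# DeepRec w/ oneDNN FP32+BF16: 2.12x
```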
- Community TensorFlow version is v1.15.5. +- Due to the small size of the dataset, the results did not converge, leading to limited reference value for ACC and AUC. ## Dataset Train & eval dataset using ***Taobao dataset***. diff --git a/modelzoo/ple/README.md b/modelzoo/ple/README.md index 5f011bcc726..dcfdb00782b 100644 --- a/modelzoo/ple/README.md +++ b/modelzoo/ple/README.md @@ -92,7 +92,7 @@ The following is a brief directory structure and description for this example: - `--data_location`: Full path of train & eval data. Default is `./data`. - `--steps`: Set the number of steps on train dataset. When default(`0`) is used, the number of steps is computed based on dataset size and number of epochs equals 1000. - `--no_eval`: Do not evaluate trained model by eval dataset. - - `--batch_size`: Batch size to train. Default is `512`. + - `--batch_size`: Batch size to train. Default is `2048`. - `--output_dir`: Full path to output directory for logs and saved model. Default is `./result`. - `--checkpoint`: Full path to checkpoints output directory. Default is `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMP)` - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to `None`. @@ -122,20 +122,20 @@ The following is a brief directory structure and description for this example: ### Stand-alone Training #### Test Environment -The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7). -- Hardware - - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz - - CPU(s): 8 +The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.g8i.4xlarge**](https://help.aliyun.com/document_detail/25378.html#g8i). +- Hardware + - Model name: Intel(R) Xeon(R) Platinum 8475B + - CPU(s): 16 - Socket(s): 1 - - Core(s) per socket: 4 + - Core(s) per socket: 8 - Thread(s) per core: 2 - - Memory: 32G + - Memory: 64G - Software - - kernel: 4.18.0-305.12.1.el8_4.x86_64 - - OS: CentOS Linux release 8.4.2105 - - Docker: 20.10.12 - - Python: 3.6.12 + - kernel: Linux version 5.15.0-58-generic (buildd@lcy02-amd64-101)(AMX patched) + - OS: Ubuntu 22.04.2 LTS + - GCC: 11.3.0 + - Docker: 20.10.21 #### Performance Result @@ -145,33 +145,34 @@ The benchmark is performed on the [Alibaba Cloud ECS general purpose instance fa - + - - - + + + - - - + + + - - - + + +
DType Accuracy AUC Globalsetp/Sec Throughput
PLE Community TensorFlow FP32 1.000000 0.498449 21182.44(baseline)
DeepRec w/ oneDNN FP32 1.000000 0.497499 28608.60(1.35x)
DeepRec w/ oneDNN FP32+BF16 0.998046 0.499049 33542.94(1.58x)
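Note that the rename of the last column from global-steps-per-second to Throughput goes together with the batch-size change (`--batch_size` now defaults to `2048`). Assuming the Throughput column reports samples per second at the new default batch size (an assumption; the tables do not state the unit), the old steps-per-second figure can be recovered by dividing by the batch size, e.g. for the PLE baseline above:

```python
# Hypothetical conversion, assuming the Throughput column is samples/sec
# and the benchmark ran with the new default batch size of 2048.
batch_size = 2048
ple_baseline_throughput = 21182.44  # PLE community TensorFlow row above
print(f"{ple_baseline_throughput / batch_size:.1f} global steps/sec")  # ~10.3
```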
- Community TensorFlow version is v1.15.5. +- Due to the small size of the dataset, the results did not converge, leading to limited reference value for ACC and AUC. ## Dataset Train & eval dataset using ***Taobao dataset***. diff --git a/modelzoo/simple_multitask/README.md b/modelzoo/simple_multitask/README.md index c7c570f835b..f09be3d779f 100644 --- a/modelzoo/simple_multitask/README.md +++ b/modelzoo/simple_multitask/README.md @@ -115,7 +115,7 @@ input: - `--data_location`: Full path of train & eval data. Default is `./data`. - `--steps`: Set the number of steps on train dataset. When default(`0`) is used, the number of steps is computed based on dataset size and number of epochs equals 1000. - `--no_eval`: Do not evaluate trained model by eval dataset. - - `--batch_size`: Batch size to train. Default is `512`. + - `--batch_size`: Batch size to train. Default is `2048`. - `--output_dir`: Full path to output directory for logs and saved model. Default is `./result`. - `--checkpoint`: Full path to checkpoints output directory. Default is `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMP)` - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to `None`. @@ -147,20 +147,20 @@ input: ### Stand-alone Training #### Test Environment -The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7). -- Hardware - - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz - - CPU(s): 8 +The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.g8i.4xlarge**](https://help.aliyun.com/document_detail/25378.html#g8i). +- Hardware + - Model name: Intel(R) Xeon(R) Platinum 8475B + - CPU(s): 16 - Socket(s): 1 - - Core(s) per socket: 4 + - Core(s) per socket: 8 - Thread(s) per core: 2 - - Memory: 32G + - Memory: 64G - Software - - kernel: 4.18.0-305.12.1.el8_4.x86_64 - - OS: CentOS Linux release 8.4.2105 - - Docker: 20.10.12 - - Python: 3.6.12 + - kernel: Linux version 5.15.0-58-generic (buildd@lcy02-amd64-101)(AMX patched) + - OS: Ubuntu 22.04.2 LTS + - GCC: 11.3.0 + - Docker: 20.10.21 #### Performance Result @@ -171,33 +171,34 @@ The benchmark is performed on the [Alibaba Cloud ECS general purpose instance fa DType Accuracy AUC - Globalsteps/sec + Throughput SimpleMultiTask Community TensorFlow FP32 - - - + 0.973150 + 0.5 + 109859.81(baseline) DeepRec w/ oneDNN FP32 - - - + 0.973150 + 0.5 + 216999.25(1.98x) DeepRec w/ oneDNN FP32+BF16 - - - + 0.973150 + 0.5 + 282453.56(2.57x) -Community TensorFlow version is v1.15.5. +- Community TensorFlow version is v1.15.5. +- Due to the small size of the dataset, the results did not converge, leading to limited reference value for ACC and AUC. ### Distributed Training #### Test Environment diff --git a/modelzoo/wide_and_deep/README.md b/modelzoo/wide_and_deep/README.md index c4bf2f740b1..122a79aaa25 100755 --- a/modelzoo/wide_and_deep/README.md +++ b/modelzoo/wide_and_deep/README.md @@ -135,7 +135,7 @@ input: | - `--data_location`: Full path of train & eval data, default to `./data`. - `--steps`: Set the number of steps on train dataset. Default will be set to 1 epoch. - `--no_eval`: Do not evaluate trained model by eval dataset. - - `--batch_size`: Batch size to train. Default to 512. + - `--batch_size`: Batch size to train. Default to 2048. 
- `--output_dir`: Full path to output directory for logs and saved model, default to `./result`. - `--checkpoint`: Full path to checkpoints input/output directory, default to `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMPS)` - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to 0. @@ -166,21 +166,20 @@ input: | ## Benchmark ### Stand-alone Training #### Test Environment -The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7). +The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.g8i.4xlarge**](https://help.aliyun.com/document_detail/25378.html#g8i). - Hardware - - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz - - CPU(s): 8 + - Model name: Intel(R) Xeon(R) Platinum 8475B + - CPU(s): 16 - Socket(s): 1 - - Core(s) per socket: 4 + - Core(s) per socket: 8 - Thread(s) per core: 2 - - Memory: 32G + - Memory: 64G - Software - - kernel: 4.18.0-348.2.1.el8_5.x86_64 - - OS: CentOS Linux release 8.5.2111 - - GCC: 8.5.0 - - Docker: 20.10.12 - - Python: 3.6.8 + - kernel: Linux version 5.15.0-58-generic (buildd@lcy02-amd64-101)(AMX patched) + - OS: Ubuntu 22.04.2 LTS + - GCC: 11.3.0 + - Docker: 20.10.21 #### Performance Result @@ -197,23 +196,23 @@ The benchmark is performed on the [Alibaba Cloud ECS general purpose instance fa WDL Community TensorFlow FP32 - 0.770078 - 0.749524 - 8786.04(baseline) + 0.777847 + 0.773350 + 32605.25(baseline) DeepRec w/ oneDNN FP32 - 0.771792 - 0.759774 - 16393.65(1.86x) + 0.778171 + 0.774059 + 38533.30(1.18x) DeepRec w/ oneDNN FP32+BF16 - 0.774918 - 0.767013 - 22788.93(2.59x) + 0.778171 + 0.774755 + 82485.07(2.53x) From 14c25d8596c6934081d7062cabb6df66a8c8a842 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Mon, 17 Jul 2023 16:23:47 +0800 Subject: [PATCH 40/91] [Embedding] Add memory and performance tests of EmbeddingVariable. 
(#913) Signed-off-by: lixy9474 --- .../core/framework/embedding/embedding_var.h | 7 +- tensorflow/core/kernels/BUILD | 29 +- .../kernels/embedding_variable_memory_test.cc | 76 +++++ ...=> embedding_variable_performance_test.cc} | 265 ++++++++++-------- .../core/kernels/embedding_variable_test.h | 144 ++++++++++ 5 files changed, 402 insertions(+), 119 deletions(-) create mode 100644 tensorflow/core/kernels/embedding_variable_memory_test.cc rename tensorflow/core/kernels/{embedding_variable_memory_performance_test.cc => embedding_variable_performance_test.cc} (68%) create mode 100644 tensorflow/core/kernels/embedding_variable_test.h diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 204c758e3ba..534ebf68950 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -141,8 +141,8 @@ class EmbeddingVar : public ResourceBase { memcpy(default_value_, &default_tensor_flat(0), default_tensor.TotalBytes()); - default_value_no_permission_ = TypedAllocator::Allocate(alloc_, - value_len_, AllocationAttributes()); + default_value_no_permission_ = TypedAllocator::Allocate( + default_value_alloc_, value_len_, AllocationAttributes()); for (int i = 0; i < value_len_; ++i) { default_value_no_permission_[i] = static_cast( emb_config_.default_value_no_permission); @@ -755,7 +755,8 @@ class EmbeddingVar : public ResourceBase { TypedAllocator::Deallocate(default_value_alloc_, default_value_, value_len_ * emb_config_.default_value_dim); if (default_value_no_permission_) { - TypedAllocator::Deallocate(alloc_, default_value_no_permission_, + TypedAllocator::Deallocate(default_value_alloc_, + default_value_no_permission_, value_len_); } if (filter_) { diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 647d730bbe0..23d12c295ca 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -461,8 +461,33 @@ tf_cuda_cc_test( ) tf_cuda_cc_test( - name = "embedding_variable_memory_performance_test", - srcs = ["embedding_variable_memory_performance_test.cc"], + name = "embedding_variable_performance_test", + srcs = ["embedding_variable_performance_test.cc", + "embedding_variable_test.h"], + extra_copts = ["-fexceptions", "-g"], + deps = [ + ":io", + ":ops_testutil", + ":ops_util", + "//tensorflow/core:all_kernels", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:direct_session_internal", + "//tensorflow/core/util/tensor_bundle", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:testlib", + "//third_party/eigen3", + "//tensorflow/core:test_main", + ], +) + +tf_cuda_cc_test( + name = "embedding_variable_memory_test", + srcs = ["embedding_variable_memory_test.cc", + "embedding_variable_test.h"], extra_copts = ["-fexceptions", "-g"], deps = [ ":io", diff --git a/tensorflow/core/kernels/embedding_variable_memory_test.cc b/tensorflow/core/kernels/embedding_variable_memory_test.cc new file mode 100644 index 00000000000..7ec6b1cf109 --- /dev/null +++ b/tensorflow/core/kernels/embedding_variable_memory_test.cc @@ -0,0 +1,76 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+======================================================================*/
+#include "tensorflow/core/kernels/embedding_variable_test.h"
+
+namespace tensorflow {
+namespace embedding {
+float PerfMemory(Tensor& default_value,
+                 const std::vector<int64>& id_list,
+                 int value_size, int64 default_value_dim,
+                 int64 filter_freq = 0) {
+  auto ev = CreateEmbeddingVar(value_size, default_value,
+                               default_value_dim, filter_freq);
+  ValuePtr<float>* value_ptr = nullptr;
+  bool is_filter = false;
+  double start_mem, end_mem;
+  start_mem = getResident() * getpagesize();
+  for (int i = 0; i < id_list.size(); i++) {
+    ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false);
+    if (is_filter)
+      ev->flat(value_ptr, id_list[i]);
+  }
+  end_mem = getResident() * getpagesize();
+  double used_mb = (end_mem - start_mem)/1000000;
+  LOG(INFO)<<"[TestMemory]Use Memory: "<<used_mb<<"MB";
+  ev->Unref();
+  return used_mb;
+}
+
+TEST(EmbeddingVariableMemoryTest, TestMemory) {
+  int value_size = 32;
+  int64 default_value_dim = 4096;
+  int64 filter_freq = 2;
+  Tensor default_value(
+      DT_FLOAT, TensorShape({default_value_dim, value_size}));
+  auto default_value_matrix = default_value.matrix<float>();
+  for (int i = 0; i < default_value_dim; i++) {
+    for (int j = 0 ; j < value_size; j++) {
+      default_value_matrix(i, j) = i * value_size + j;
+    }
+  }
+
+  int num_of_ids = 1000000;
+  std::vector<int64> id_list(num_of_ids);
+  for (int i = 0; i < num_of_ids; i++) {
+    id_list[i] = i;
+  }
+  float used_mb = PerfMemory(default_value, id_list,
+                             value_size, default_value_dim);
+  float theoretical_mb =
+      50 + num_of_ids * (32 + 32 + value_size * sizeof(float))/ 1000000;
+  EXPECT_TRUE((used_mb > theoretical_mb * 0.99) &&
+              (used_mb < theoretical_mb * 1.01));
+
+  for (int i = 0; i < num_of_ids / 2; i++) {
+    id_list.emplace_back(i);
+  }
+  used_mb = PerfMemory(default_value, id_list, value_size,
+                       default_value_dim, filter_freq);
+  theoretical_mb =
+      50 + num_of_ids * (32 + 32 + 16 + value_size * sizeof(float)/2)/ 1000000;
+  EXPECT_TRUE((used_mb > theoretical_mb * 0.99) &&
+              (used_mb < theoretical_mb * 1.01));
+}
+} //namespace embedding
+} //namespace tensorflow
diff --git a/tensorflow/core/kernels/embedding_variable_memory_performance_test.cc b/tensorflow/core/kernels/embedding_variable_performance_test.cc
similarity index 68%
rename from tensorflow/core/kernels/embedding_variable_memory_performance_test.cc
rename to tensorflow/core/kernels/embedding_variable_performance_test.cc
index 6aa30b3ab40..ee04b4468f6 100644
--- a/tensorflow/core/kernels/embedding_variable_memory_performance_test.cc
+++ b/tensorflow/core/kernels/embedding_variable_performance_test.cc
@@ -12,120 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
======================================================================*/ -#include - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/kernels/variable_ops.h" -#include "tensorflow/core/graph/node_builder.h" -#include "tensorflow/core/graph/testlib.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" -#include "tensorflow/core/platform/test_benchmark.h" -#include "tensorflow/core/public/session.h" - -#include "tensorflow/core/common_runtime/device.h" -#include "tensorflow/core/common_runtime/device_factory.h" -#include "tensorflow/core/framework/allocator.h" -#include "tensorflow/core/framework/fake_input.h" -#include "tensorflow/core/framework/node_def_builder.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/kernels/ops_util.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/util/tensor_slice_reader_cache.h" -#if GOOGLE_CUDA -#define EIGEN_USE_GPU -#include "tensorflow/core/common_runtime/gpu/gpu_device.h" -#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" -#endif //GOOGLE_CUDA - -#include -#include -#include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/cache.h" -#include "tensorflow/core/kernels/kv_variable_ops.h" -#ifdef TENSORFLOW_USE_JEMALLOC -#include "jemalloc/jemalloc.h" -#endif +#include "tensorflow/core/kernels/embedding_variable_test.h" namespace tensorflow { -struct ProcMemory { - long size; // total program size - long resident; // resident set size - long share; // shared pages - long trs; // text (code) - long lrs; // library - long drs; // data/stack - long dt; // dirty pages - - ProcMemory() : size(0), resident(0), share(0), - trs(0), lrs(0), drs(0), dt(0) {} -}; - -ProcMemory getProcMemory() { - ProcMemory m; - FILE* fp = fopen("/proc/self/statm", "r"); - if (fp == NULL) { - LOG(ERROR) << "Fail to open /proc/self/statm."; - return m; - } - - if (fscanf(fp, "%ld %ld %ld %ld %ld %ld %ld", - &m.size, &m.resident, &m.share, - &m.trs, &m.lrs, &m.drs, &m.dt) != 7) { - fclose(fp); - LOG(ERROR) << "Fail to fscanf /proc/self/statm."; - return m; - } - fclose(fp); - - return m; -} - -double getSize() { - ProcMemory m = getProcMemory(); - return m.size; -} - -double getResident() { - ProcMemory m = getProcMemory(); - return m.resident; -} namespace embedding { -EmbeddingVar* CreateEmbeddingVar( - int value_size, Tensor& default_value, - int64 default_value_dim) { - Allocator* allocator = ev_allocator(); - auto embedding_config = EmbeddingConfig( - 0, 0, 1, 0, "emb_var", 0, - 0, 999999, -1.0, "light", - 0, -1.0, DT_UINT64, default_value_dim, - 0.0, false, false, false); - auto storage = - embedding::StorageFactory::Create( - embedding::StorageConfig( - embedding::StorageType::DRAM, "", - {1024, 1024, 1024, 1024}, "light", - embedding_config), - allocator, - "emb_var"); - auto ev = new EmbeddingVar( - "emb_var", - storage, - embedding_config, - allocator); - ev->Init(default_value, default_value_dim); - return ev; -} - void GenerateSkewIds(int num_of_ids, float skew_factor, 
std::vector& hot_ids_list, std::vector& cold_ids_list) { @@ -213,7 +103,7 @@ void thread_lookup_or_create( double PerfLookupOrCreate( const std::vector>& input_batches, - int num_thread) { + int num_thread, int filter_freq = 0) { int value_size = 32; int64 default_value_dim = 4096; Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); @@ -223,7 +113,8 @@ double PerfLookupOrCreate( default_value_matrix(i, j) = i * value_size + j; } } - auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim); + auto ev = CreateEmbeddingVar(value_size, default_value, + default_value_dim, filter_freq); std::vector worker_threads(num_thread); double total_time = 0.0; timespec start, end; @@ -293,7 +184,7 @@ TEST(EmbeddingVariablePerformanceTest, TestLookupOrCreate) { LOG(INFO)<<"[TestLookupOrCreate] Finish generating skew input"; std::vector num_thread_vec({1, 2, 4, 8, 16}); for (auto num_thread: num_thread_vec) { - LOG(INFO)<<"[TestLookupOrCreate] Test LookupOrCreate With" + LOG(INFO)<<"[TestLookupOrCreate] Test LookupOrCreate With " <Unref(); } + +string Prefix(const string& prefix) { + return strings::StrCat(testing::TmpDir(), "/", prefix); +} + +void PerfSave(Tensor& default_value, + const std::vector& id_list, + int value_size, int64 default_value_dim, + int64 steps_to_live = 0, + float l2_weight_threshold = -1.0) { + auto ev = CreateEmbeddingVar( + value_size, default_value, + default_value_dim, 0, steps_to_live, + l2_weight_threshold); + ValuePtr* value_ptr = nullptr; + bool is_filter = false; + srand((unsigned)time(NULL)); + + for (int i = 0; i < id_list.size(); i++) { + ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); + ev->flat(value_ptr, id_list[i]); + int64 global_step = rand() % 100; + ev->UpdateVersion(value_ptr, global_step); + } + Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); + + BundleWriter writer(Env::Default(), Prefix("foo")); + timespec start, end; + double total_time = 0.0; + if (steps_to_live != 0 || l2_weight_threshold != -1.0) { + clock_gettime(CLOCK_MONOTONIC, &start); + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = 100; + ev->Shrink(shrink_args); + clock_gettime(CLOCK_MONOTONIC, &end); + total_time += (double)(end.tv_sec - start.tv_sec) * + 1000000000 + end.tv_nsec - start.tv_nsec; + } + clock_gettime(CLOCK_MONOTONIC, &start); + DumpEmbeddingValues(ev, "var", &writer, &part_offset_tensor); + clock_gettime(CLOCK_MONOTONIC, &end); + total_time += (double)(end.tv_sec - start.tv_sec) * + 1000000000 + end.tv_nsec - start.tv_nsec; + TF_ASSERT_OK(writer.Finish()); + LOG(INFO)<<"[TestSave]execution time: " + << total_time/1000000 <<"ms"; + ev->Unref(); +} + +TEST(EmbeddingVariablePerformanceTest, TestSave) { + int value_size = 32; + int64 default_value_dim = 4096; + Tensor default_value( + DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0 ; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + + int num_of_ids = 1000000; + srand((unsigned)time(NULL)); + std::vector id_list(num_of_ids); + for (int i = 0; i < num_of_ids; i++) { + id_list[i] = rand() % 50000000; + } + PerfSave(default_value, id_list, value_size, default_value_dim); +} + +TEST(EmbeddingVariablePerformanceTest, TestGlobalStepEviction) { + int value_size = 32; + int64 default_value_dim = 4096; + Tensor default_value( + DT_FLOAT, TensorShape({default_value_dim, 
value_size}));
+  auto default_value_matrix = default_value.matrix<float>();
+  for (int i = 0; i < default_value_dim; i++) {
+    for (int j = 0 ; j < value_size; j++) {
+      default_value_matrix(i, j) = i * value_size + j;
+    }
+  }
+
+  int num_of_ids = 1000000;
+  std::vector<int64> id_list(num_of_ids);
+  srand((unsigned)time(NULL));
+  for (int i = 0; i < num_of_ids; i++) {
+    id_list[i] = rand() % 50000000;
+  }
+  PerfSave(default_value, id_list, value_size, default_value_dim, 80);
+}
+
+TEST(EmbeddingVariablePerformanceTest, TestL2WeightEviction) {
+  int value_size = 32;
+  int64 default_value_dim = 4096;
+  Tensor default_value(
+      DT_FLOAT, TensorShape({default_value_dim, value_size}));
+  auto default_value_matrix = default_value.matrix<float>();
+  for (int i = 0; i < default_value_dim; i++) {
+    for (int j = 0 ; j < value_size; j++) {
+      default_value_matrix(i, j) = i * value_size + j;
+    }
+  }
+
+  int l2_weight_threshold_index = default_value_dim * 0.2;
+  float l2_weight_threshold = 0.0;
+  for (int64 j = 0; j < value_size; j++) {
+    l2_weight_threshold +=
+        pow(default_value_matrix(l2_weight_threshold_index, j), 2);
+  }
+  l2_weight_threshold *= 0.5;
+
+  int num_of_ids = 1000000;
+  std::vector<int64> id_list(num_of_ids);
+  srand((unsigned)time(NULL));
+  for (int i = 0; i < num_of_ids; i++) {
+    id_list[i] = rand() % 50000000;
+  }
+  PerfSave(default_value, id_list, value_size,
+           default_value_dim, 0, l2_weight_threshold);
+}
+
+TEST(EmbeddingVariablePerformanceTest, TestCounterFilterLookupOrCreate) {
+  int num_of_batch = 100;
+  int batch_size = 1024 * 128;
+  int num_of_ids = 5000000;
+  int64 filter_freq = 5;
+  std::vector<std::vector<int64>> input_batches(num_of_batch);
+  for (int i = 0; i < num_of_batch; i++) {
+    input_batches[i].resize(batch_size);
+  }
+  LOG(INFO)<<"[TestCounterFilterLookupOrCreate] Start generating skew input";
+  GenerateSkewInput(num_of_ids, 0.8, input_batches);
+  LOG(INFO)<<"[TestCounterFilterLookupOrCreate] Finish generating skew input";
+  std::vector<int> num_thread_vec({1, 2, 4, 8, 16});
+  for (auto num_thread: num_thread_vec) {
+    LOG(INFO)<<"[TestCounterFilterLookupOrCreate] Test LookupOrCreate With "
+             <<num_thread<<" threads";
+    PerfLookupOrCreate(input_batches, num_thread, filter_freq);
+  }
+}
+} //namespace embedding
+} //namespace tensorflow
diff --git a/tensorflow/core/kernels/embedding_variable_test.h b/tensorflow/core/kernels/embedding_variable_test.h
new file mode 100644
index 00000000000..b71ccccd06e
--- /dev/null
+++ b/tensorflow/core/kernels/embedding_variable_test.h
@@ -0,0 +1,144 @@
+/* Copyright 2022 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+======================================================================*/
+#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_
+#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_
+#include
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/session.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/tensor_slice_reader_cache.h"
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
+#include
"tensorflow/core/common_runtime/gpu/gpu_process_state.h" +#endif //GOOGLE_CUDA + +#include +#include +#include "tensorflow/core/framework/embedding/kv_interface.h" +#include "tensorflow/core/framework/embedding/cache.h" +#include "tensorflow/core/kernels/kv_variable_ops.h" +#ifdef TENSORFLOW_USE_JEMALLOC +#include "jemalloc/jemalloc.h" +#endif + +namespace tensorflow { +namespace embedding { +struct ProcMemory { + long size; // total program size + long resident; // resident set size + long share; // shared pages + long trs; // text (code) + long lrs; // library + long drs; // data/stack + long dt; // dirty pages + + ProcMemory() : size(0), resident(0), share(0), + trs(0), lrs(0), drs(0), dt(0) {} +}; + +ProcMemory getProcMemory() { + ProcMemory m; + FILE* fp = fopen("/proc/self/statm", "r"); + if (fp == NULL) { + LOG(ERROR) << "Fail to open /proc/self/statm."; + return m; + } + + if (fscanf(fp, "%ld %ld %ld %ld %ld %ld %ld", + &m.size, &m.resident, &m.share, + &m.trs, &m.lrs, &m.drs, &m.dt) != 7) { + fclose(fp); + LOG(ERROR) << "Fail to fscanf /proc/self/statm."; + return m; + } + fclose(fp); + + return m; +} + +double getSize() { + ProcMemory m = getProcMemory(); + return m.size; +} + +double getResident() { + ProcMemory m = getProcMemory(); + return m.resident; +} + +EmbeddingVar* CreateEmbeddingVar( + int value_size, Tensor& default_value, + int64 default_value_dim, int64 filter_freq = 0, + int64 steps_to_live = 0, + float l2_weight_threshold=-1.0) { + std::string layout_type = "light"; + if (filter_freq != 0) { + layout_type = "normal"; + } + + if (steps_to_live != 0) { + if (layout_type == "light") { + layout_type = "normal_contiguous"; + } + } + auto embedding_config = EmbeddingConfig( + 0, 0, 1, 0, "emb_var", steps_to_live, + filter_freq, 999999, l2_weight_threshold, layout_type, + 0, -1.0, DT_UINT64, default_value_dim, + 0.0, false, false, false); + auto storage = + embedding::StorageFactory::Create( + embedding::StorageConfig( + embedding::StorageType::DRAM, "", + {1024, 1024, 1024, 1024}, layout_type, + embedding_config), + cpu_allocator(), + "emb_var"); + auto ev = new EmbeddingVar( + "emb_var", + storage, + embedding_config, + cpu_allocator()); + ev->Init(default_value, default_value_dim); + return ev; +} +} //namespace embedding +} //namespace tensorflow +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ From f7bc9014bca2b976939c3482fdd19411e94de479 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Thu, 20 Jul 2023 02:50:22 -0700 Subject: [PATCH 41/91] [Distributed] Integrate HybridBackend in collective training mode. 
(#912) Signed-off-by: JunqiHu --- tensorflow/python/BUILD | 1 + tensorflow/python/distribute/BUILD | 16 + tensorflow/python/distribute/__init__.py | 1 + .../group_embedding_collective_strategy.py | 138 ++ tensorflow/python/distribute/hvd_strategy.py | 1452 +++++++++++++++++ tensorflow/python/distribute/launch.py | 319 ++++ .../python/framework/group_embedding_types.py | 9 +- tensorflow/python/ops/embedding_ops.py | 28 +- 8 files changed, 1959 insertions(+), 5 deletions(-) create mode 100644 tensorflow/python/distribute/group_embedding_collective_strategy.py create mode 100644 tensorflow/python/distribute/hvd_strategy.py create mode 100644 tensorflow/python/distribute/launch.py diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index edaf20749d9..68649078f5c 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -170,6 +170,7 @@ py_library( "//tensorflow/python/compiler", "//tensorflow/python/data", "//tensorflow/python/distribute", + "//tensorflow/python/distribute:deeprec_collective", "//tensorflow/python/distribute:combinations", "//tensorflow/python/distribute:distribute_config", "//tensorflow/python/distribute:estimator_training", diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 51f0e4cc174..a5357ce21b7 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -10,6 +10,22 @@ package( exports_files(["LICENSE"]) +py_library( + name = "deeprec_collective", + srcs = [ + "group_embedding_collective_strategy.py", + "launch.py", + "hvd_strategy.py", + ], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:nccl_ops", + ], +) + py_library( name = "distribute_test_lib_pip", deps = [ diff --git a/tensorflow/python/distribute/__init__.py b/tensorflow/python/distribute/__init__.py index f9d0a95ea58..7ff6c680ca1 100644 --- a/tensorflow/python/distribute/__init__.py +++ b/tensorflow/python/distribute/__init__.py @@ -25,6 +25,7 @@ from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.distribute import mirrored_strategy from tensorflow.python.distribute import one_device_strategy +from tensorflow.python.distribute import launch from tensorflow.python.distribute.experimental import collective_all_reduce_strategy from tensorflow.python.distribute.experimental import parameter_server_strategy # pylint: enable=unused-import diff --git a/tensorflow/python/distribute/group_embedding_collective_strategy.py b/tensorflow/python/distribute/group_embedding_collective_strategy.py new file mode 100644 index 00000000000..8fa94ea9f7e --- /dev/null +++ b/tensorflow/python/distribute/group_embedding_collective_strategy.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python + +# Copyright 2023 Alibaba Group Holding Limited. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================
+
+
+from tensorflow.python.framework.group_embedding_types import (
+    DistStrategy,
+    set_group_lookup_strategy,
+)
+
+import os
+import contextlib
+from tensorflow_estimator.python.estimator import estimator as _estimator_lib
+
+
+class CollectiveStrategy:
+    r"""
+    A thin interface over all kinds of synchronized training strategies.
+    """
+
+    def __init__(self):
+        self._hvd = None
+        self._hb = None
+        strategy = os.getenv("COLLECTIVE_STRATEGY", "sok")
+        if strategy == DistStrategy.SOK.value:
+            try:
+                import horovod.tensorflow as hvd
+                hvd.init()
+                from sparse_operation_kit import experiment as sok
+                sok.init()
+            except:
+                raise ImportError(
+                    "While param `strategy` in enable_distributed_strategy is given `sok`,"
+                    " the sok module failed to initialize, please double check"
+                )
+
+            self._sok = sok
+            self._hvd = hvd
+            set_group_lookup_strategy(strategy)
+        elif strategy == DistStrategy.HB.value:
+            try:
+                import hybridbackend.tensorflow as hb
+            except:
+                raise ImportError(
+                    "While param `strategy` in enable_distributed_strategy is given `hb`,"
+                    " the hb module failed to initialize, please double check"
+                )
+            self._hb = hb
+            set_group_lookup_strategy(strategy)
+        else:
+            raise ValueError(
+                "accepted `COLLECTIVE_STRATEGY` is sok or hb, while given %s" % strategy
+            )
+
+    @contextlib.contextmanager
+    def scope(self, *args, **kwargs):
+        if self._hvd:
+            from tensorflow.python.distribute import hvd_strategy
+            with hvd_strategy.scope() as context:
+                yield context
+        elif self._hb:
+            with self._hb.scope() as context:
+                yield context
+
+    @contextlib.contextmanager
+    def embedding_scope(self, **kwargs):
+        if self._hvd:
+            from tensorflow.python.distribute import hvd_strategy
+            with hvd_strategy.embedding_scope() as context:
+                yield context
+        elif self._hb:
+            with self._hb.embedding_scope() as context:
+                yield context
+
+    def world_size(self):
+        if self._hvd:
+            return self._hvd.size()
+        elif self._hb:
+            return self._hb.context.world_size
+
+    def rank(self):
+        if self._hvd:
+            return self._hvd.rank()
+        elif self._hb:
+            return self._hb.context.rank
+
+    def estimator(self, model_fn, **kwargs):
+        if self._hvd:
+            from tensorflow.python.distribute.hvd_strategy import wraps_estimator
+            _estimator = wraps_estimator(_estimator_lib.Estimator)
+        elif self._hb:
+            _estimator = self._hb.estimator.Estimator
+
+        return _estimator(model_fn, **kwargs)
+
+    def export_saved_model(
+        self,
+        savedmodel_dir,
+        checkpoint_dir=None,
+        signature_def_fn=None,
+        assets_extra=None,
+        as_text=False,
+        clear_devices=True,
+        strip_default_attrs=True,
+    ):
+        if self._hvd:
+            from tensorflow.python.distribute import hvd_strategy
+            hvd_strategy.export(
+                savedmodel_dir,
+                checkpoint_dir,
+                signature_def_fn,
+                assets_extra,
+                as_text,
+                clear_devices,
+                strip_default_attrs,
+            )
+        elif self._hb:
+            self._hb.train.export(
+                savedmodel_dir,
+                checkpoint_dir,
+                signature_def_fn,
+                assets_extra,
+                as_text,
+                clear_devices,
+                strip_default_attrs,
+            )
diff --git a/tensorflow/python/distribute/hvd_strategy.py b/tensorflow/python/distribute/hvd_strategy.py
new file mode 100644
index 00000000000..8a3ae9c3f43
--- /dev/null
+++ b/tensorflow/python/distribute/hvd_strategy.py
@@ -0,0 +1,1452 @@
+#!/usr/bin/env python
+
+# Copyright 2023 Alibaba Group Holding Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import uuid +import re +import abc +import os +import contextlib +import threading +import random as rn +import json +import collections +import time +import six +import numpy as np +import horovod.tensorflow as hvd + +from tensorflow._api.v1 import train as train_v1 +from tensorflow.core.protobuf import config_pb2 +from tensorflow.core.framework import summary_pb2 +from tensorflow.python.client import device_lib +from tensorflow.python.distribute import device_util, estimator_training, \ + multi_worker_util, device_util, \ + estimator_training +from tensorflow.python.eager import context as _context +from tensorflow.python.estimator.model_fn import ModeKeys +from tensorflow.python.framework import ops, constant_op, dtypes, ops, random_seed, \ + tensor_util +from tensorflow.python.util import nest, compat +from tensorflow.python.training import saver, server_lib, training, checkpoint_utils, \ + checkpoint_management, optimizer, \ + session_run_hook, basic_session_run_hooks, \ + training_util +from tensorflow.python.training import monitored_session as _monitored_session +from tensorflow.python.platform import gfile +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.ops import array_ops, data_flow_ops, embedding_ops, \ + gen_io_ops, math_ops, \ + string_ops, control_flow_ops +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.summary import summary as core_summary +from tensorflow.python.saved_model import utils_impl as saved_model_utils +from tensorflow.python.saved_model import builder, constants +from tensorflow.python.saved_model.model_utils.export_utils import \ + (EXPORT_TAG_MAP, get_timestamped_export_dir, SIGNATURE_KEY_MAP) + +try: + from tensorflow.python.training.saving.saveable_object_util import \ + op_list_to_dict +except ImportError: + op_list_to_dict = saver.BaseSaverBuilder.OpListToDict + +from tensorflow_estimator.python.estimator import run_config as run_config_lib +from tensorflow_estimator.python.estimator import estimator as _estimator_lib +from tensorflow_estimator.python.estimator.training import \ + _is_google_env, _MAX_DELAY_SECS, _TrainingExecutor +from tensorflow_estimator.python.estimator.export import export_lib +from tensorflow.python.keras.backend import reset_uids as reset_keras_uids + + +##################### HVDSTRATEGY COMMON CODE ########################## + +class HvdContext(object): + + _instance = None + + @classmethod + def get(cls): + r'''Get singleton. + ''' + if cls._instance is None: + cls._instance = cls() + return cls._instance + + @classmethod + @contextlib.contextmanager + def scope(cls, **kwargs): + r'''Update params in context. + ''' + prev_kwargs = {} + try: + c = cls.get() + yield c + finally: + del prev_kwargs + + @classmethod + def get_tf_config(cls): + r'''Get configuration from TF_CONFIG environment variable. 
+ ''' + tf_config = json.loads(os.getenv('TF_CONFIG', '{}')) + if not tf_config: + return None + task = tf_config['task'] + cluster = tf_config['cluster'] + task_type = task['type'] + task_id = int(task['index']) + tf_config_type = collections.namedtuple( + 'TfConfig', ['task_type', 'task_id', 'cluster']) + return tf_config_type(task_type, task_id, cluster) + + @property + def cluster_spec(self): + r'''cluster spec. + ''' + return self._cluster_spec + + @property + def task_type(self): + r'''job name of current server. `localhost` by default. + ''' + return self._task_type + + @property + def task_id(self): + r'''task index of current server. 0 by default. + ''' + return self._task_id + + @property + def is_chief(self): + r'''True if current server is chief worker. + ''' + return self._is_chief + + @property + def has_gpu(self): + r'''True if current server has GPU. + ''' + return self._num_gpus > 0 + + @property + def world_size(self): + r'''Number of devices. + ''' + return self._world_size + + @property + def rank(self): + r'''Global index of default local device. + ''' + return self._rank + + @property + def num_gpus(self): + r'''Number of GPUs. + ''' + return self._num_gpus + + def _update(self, task_type=None, task_id=None, cluster_spec=None, + num_gpus=None): + r'''Update parameters from cluster_spec. + + If task_type, task_id or cluster_spec is None, these arguments will not be + changed. + + Args: + task_type: (Optional.) name of current job. `localhost` by default. + task_id: (Optional.) index of current task. 0 by default. + cluster_spec: (Optional.) ClusterSpec object. + ''' + tf_config = None + try: + tf_config = self.get_tf_config() + except: # pylint: disable=bare-except + pass + if tf_config: + self._task_type = tf_config.task_type + self._task_id = tf_config.task_id + self._cluster_spec = server_lib.ClusterSpec(tf_config.cluster) + else: + self._task_type = 'localhost' + self._task_id = 0 + self._cluster_spec = None + if task_type: + self._task_type = task_type + if self._task_type not in ('localhost', 'chief', 'worker'): + logging.info('No valid configuration for non-worker roles') + return + + if task_id: + self._task_id = task_id + if cluster_spec: + self._cluster_spec = cluster_spec + if self._cluster_spec: + self._cluster_spec = multi_worker_util.normalize_cluster_spec( + self._cluster_spec) + self._is_chief = False + try: + self._is_chief = multi_worker_util.is_chief( + self._cluster_spec, self._task_type, self._task_id) + except: # pylint: disable=bare-except + pass + if num_gpus: + self._num_gpus = num_gpus + elif not self._num_gpus: + num_gpus = 0 + num_gpus_config = config_pb2.ConfigProto() + num_gpus_config.inter_op_parallelism_threads = 1 + num_gpus_config.intra_op_parallelism_threads = 1 + num_gpus_config.gpu_options.allow_growth = True + for device in device_lib.list_local_devices(num_gpus_config): + if device.device_type == 'GPU': + num_gpus += 1 + self._num_gpus = num_gpus + self._default_device = ( + f'/job:{self._task_type}/replica:0/task:{self._task_id}') + self._local_cpu_device = device_util.canonicalize( + '/device:CPU:0', default=self._default_device) + if self._num_gpus == 0: + self._local_devices = [self._local_cpu_device] + else: + self._local_devices = [ + device_util.canonicalize( + f'/device:GPU:{d}', default=self._default_device) + for d in range(self._num_gpus)] + + local_world_size_str = os.getenv('LOCAL_WORLD_SIZE', '') + if not local_world_size_str: + self._local_world_size = len( + self._local_devices) # pylint: disable=protected-access + 
else: + self._local_world_size = int(local_world_size_str) + + if not self._cluster_spec: + self._devices = list(self._local_devices) + return + task_indices = [] + try: + task_defs = dict( + enumerate(self._cluster_spec.job_tasks(self._task_type))) + task_indices = sorted(task_defs) + except: # pylint: disable=bare-except + pass + worker_indices = [] + try: + worker_defs = dict( + enumerate(self._cluster_spec.job_tasks('worker'))) + worker_indices = sorted(worker_defs) + except: # pylint: disable=bare-except + pass + chief_indices = [] + try: + chief_defs = dict(enumerate(self._cluster_spec.job_tasks('chief'))) + chief_indices = sorted(chief_defs) + except: # pylint: disable=bare-except + pass + self._cpu_devices = [ + device_util.resolve( + f'/job:{self._task_type}/task:{t}/device:CPU:0') + for t in task_indices] + if self._num_gpus == 0: + self._devices = self._cpu_devices + if self._task_type == 'worker': + chief_devices = [ + device_util.resolve(f'/job:chief/task:{t}/device:CPU:0') + for t in chief_indices] + self._devices = chief_devices + self._devices + elif self._task_type == 'chief': + self._devices += [ + device_util.resolve(f'/job:worker/task:{t}/device:CPU:0') + for t in worker_indices] + return + self._devices = [ + device_util.resolve( + f'/job:{self._task_type}/task:{t}/device:GPU:{g}') + for t in task_indices for g in range(self._num_gpus)] + if self._task_type == 'worker': + chief_devices = [ + device_util.resolve(f'/job:chief/task:{t}/device:GPU:{g}') + for t in chief_indices for g in range(self._num_gpus)] + self._devices = chief_devices + self._devices + elif self._task_type == 'chief': + self._devices += [ + device_util.resolve(f'/job:worker/task:{t}/device:GPU:{g}') + for t in worker_indices for g in range(self._num_gpus)] + + def __init__(self): + r'''Construct a server specification. + ''' + self._task_type = 'localhost' + self._rank = hvd.local_rank() + self._world_size = hvd.size() + self._task_id = 0 + self._cluster_spec = None + self._is_chief = True + visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', '') + if visible_devices: + self._num_gpus = len(visible_devices.split(',')) + else: + self._num_gpus = 1 + self._update() + self._saving_listener_registry = {} + + +class GraphRewriting(object): # pylint: disable=useless-object-inheritance + r'''Python API rewriting. + ''' + _lock = threading.Lock() + _stack_depth = 0 + _registry = {} + _registry_keys = [] + + @classmethod + def register(cls, rewriting): + r'''Register implementation. + + Args: + rewriting: Implementation class to register. + ''' + if not issubclass(rewriting, cls): + raise ValueError( + f'{rewriting} must be a subclass of GraphRewriting') + if rewriting.__name__ not in cls._registry: + cls._registry_keys.append(rewriting.__name__) + cls._registry[rewriting.__name__] = rewriting() + + @classmethod + @contextlib.contextmanager + def scope(cls, **kwargs): + r'''Context manager that patches Python APIs. 
+ ''' + seed = kwargs.pop('seed', None) + if seed is not None: + rn.seed(seed) + np.random.seed(seed) + random_seed.set_random_seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + os.environ['TF_CUDNN_DETERMINISTIC'] = '1' + + with HvdContext.scope() as ctx: + try: + with cls._lock: + cls._stack_depth += 1 + if cls._stack_depth <= 1: + for name in cls._registry_keys: + cls._registry[name].begin() + yield ctx + + finally: + with cls._lock: + if cls._stack_depth <= 1: + for name in reversed(cls._registry_keys): + cls._registry[name].end() + cls._stack_depth -= 1 + + @abc.abstractmethod + def begin(self): + r'''Rewrites API. + ''' + + @abc.abstractmethod + def end(self): + r'''Revert API rewriting. + ''' + + +##################### Optimizer ########################## + +def wraps_optimizer(cls): + r'''Decorator to create horovod optimizer class. + + Args: + optimizer_type: The actual optimizer type that will be used to compute and + apply the gradients. Must be one of the Optimizer classes. + aggregation: Aggregate gradients inside `compute_gradients` or + `apply_gradients`. + + Returns: + HvdOptimizer + ''' + class HvdOptimizer(cls, optimizer.Optimizer): + def __init__(self, *args, **kwargs): + kwargs["learning_rate"] = kwargs.get("learning_rate", 0.001) *\ + HvdContext.get().world_size + super(HvdOptimizer, self).__init__(*args, **kwargs) + + def compute_gradients(self, loss, **kwargs): + loss = hvd.allreduce(loss, op=hvd.Sum) + return super().compute_gradients(loss, **kwargs) + + if isinstance(cls, HvdOptimizer): + return cls + else: + def horovod_optimizer(*args, **kwargs): + return HvdOptimizer(*args, **kwargs) + return horovod_optimizer + + +class OptimizerRewriting(GraphRewriting): + r'''Rewriting optimizers. + ''' + + def __init__(self): + super().__init__() + self._prev_optimizers = {} + + def begin(self): + r'''Rewrites API. + ''' + for k, c in training.__dict__.items(): + if (isinstance(c, type) + and issubclass(c, training.Optimizer) + and c not in ( + training.Optimizer, + training.SyncReplicasOptimizer)): + self._prev_optimizers[k] = c + wrapped = wraps_optimizer(c) + setattr(training, k, wrapped) + setattr(train_v1, k, wrapped) + + def end(self): + r'''Revert API rewriting. + ''' + for c, opt in self._prev_optimizers.items(): + setattr(training, c, opt) + setattr(train_v1, c, opt) + + +GraphRewriting.register(OptimizerRewriting) + +##################### MonitoredTrainingSession ########################## + + +def wraps_session_config(session_config, *args, **kwargs): + r'''Wraps ConfigProto for distributed training. + ''' + if not session_config: + kwargs.setdefault('allow_soft_placement', True) + session_config = config_pb2.ConfigProto(*args, **kwargs) + session_config.gpu_options.allow_growth = True + session_config.gpu_options.force_gpu_compatible = True + # Horovod: pin GPU to be used to process local rank (one GPU per process) + session_config.gpu_options.visible_device_list = str(HvdContext.get().rank) + return session_config + +def wraps_monitored_training_session(fn): + r'''Decorator to create wrapped monitored training session. + ''' + if hasattr(fn, 'wrapped_fn'): + return fn + + def HorovodMonitoredTrainingSession(*args, **kwargs): # pylint: disable=invalid-name + r'''Creates a `MonitoredSession` for training. 
+ ''' + checkpoint_dir = kwargs.get('checkpoint_dir', None) + if HvdContext.get().rank != 0: + checkpoint_dir = None + summary_dir = kwargs.get('summary_dir', None) + summary_dir = summary_dir or checkpoint_dir + scaffold = kwargs.pop('scaffold', _monitored_session.Scaffold()) + kwargs['scaffold'] = scaffold + hooks = kwargs.pop('hooks', []) + hooks.append(hvd.BroadcastGlobalVariablesHook(0)) + + chief_only_hooks = kwargs.pop('chief_only_hooks', []) + chief_only_hooks = list(chief_only_hooks) + kwargs['hooks'] = hooks + kwargs['chief_only_hooks'] = chief_only_hooks + kwargs['config'] = wraps_session_config(kwargs.pop('config', None)) + kwargs['is_chief'] = True + args = list(args) + if args: + master = args[0] + if not master: + master = '' + args[0] = master + else: + master = kwargs.pop('master', None) + if not master: + master = '' + kwargs['master'] = master + + prev_monitored_session = _monitored_session.MonitoredSession + sess = fn(*args, **kwargs) + _monitored_session.MonitoredSession = prev_monitored_session + return sess + + HorovodMonitoredTrainingSession.wrapped_fn = fn + return HorovodMonitoredTrainingSession + + +class SessionRewriting(GraphRewriting): + r'''Rewriting monitored training session. + ''' + + def __init__(self): + super().__init__() + self._prev_sess_fn = None + + def begin(self): + r'''Rewrites API. + ''' + self._prev_sess_fn = _monitored_session.MonitoredTrainingSession + _monitored_session.MonitoredTrainingSession = ( + wraps_monitored_training_session( + _monitored_session.MonitoredTrainingSession)) + training.MonitoredTrainingSession = ( + _monitored_session.MonitoredTrainingSession) + train_v1.MonitoredTrainingSession = ( + _monitored_session.MonitoredTrainingSession) + + def end(self): + r'''Revert API rewriting. + ''' + train_v1.MonitoredTrainingSession = self._prev_sess_fn + training.MonitoredTrainingSession = self._prev_sess_fn + _monitored_session.MonitoredTrainingSession = self._prev_sess_fn + + +GraphRewriting.register(SessionRewriting) + +##################### Saver ############################## + +class CollectiveSaverBase(object): # pylint: disable=useless-object-inheritance + r'''Base class of sharded savers. + ''' + +def wraps_saver(cls): + r'''Wraps a saver to support hybrid parallelism. + ''' + if issubclass(cls, CollectiveSaverBase): + return cls + + class CollectiveSaver(cls, CollectiveSaverBase): + r'''SaverBuilder with support for hybrid parallelism. + ''' + + def __init__(self, *args, **kwargs): + self._rank = HvdContext.get().rank + self._world_size = HvdContext.get().world_size + kwargs['sharded'] = True + kwargs['allow_empty'] = True + with ops.device('/cpu:0'): + super().__init__(*args, **kwargs) + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def _build(self, *args, **kwargs): + r'''Builds saver_def. + ''' + if self._world_size <= 1: + super()._build(*args, **kwargs) + return + + if self._builder is None: + super()._build(*args, **kwargs) + + def save(self, *args, **kwargs): + r'''Saves sharded variables. 
+ ''' + if self._world_size <= 1: + super().save(*args, **kwargs) + return + + write_meta_graph = ( + kwargs.pop('write_meta_graph', True) + and self._rank == 0) + kwargs['write_meta_graph'] = write_meta_graph + write_state = kwargs.pop('write_state', True) and self._rank == 0 + kwargs['write_state'] = write_state + super().save(*args, **kwargs) + + def export_meta_graph(self, filename=None, **kwargs): + if self._rank == 0: + return super().export_meta_graph(filename=filename, **kwargs) + return None + + return CollectiveSaver + + +Saver = wraps_saver(saver.Saver) + + +def replace_default_saver(): + rank = HvdContext.get().rank + savers = ops.get_collection_ref(ops.GraphKeys.SAVERS) + + if not savers: + default_saver = Saver() + ops.add_to_collection(ops.GraphKeys.SAVERS, default_saver) + return + if len(savers) > 1: + raise ValueError( + f'Multiple items found in collection SAVERS: {savers}') + + default_saver = savers[0] + if isinstance(default_saver, CollectiveSaverBase): + return + + if not default_saver._sharded: # pylint: disable=protected-access + raise ValueError('Default saver must be sharded') + if default_saver._builder is None: + def _wraps_build(build_fn): + r'''Decorator to wrap saver build. + ''' + + def wrapped_build(self, *args, **kwargs): + r'''Builds saver_def. + ''' + build_fn(self, *args, **kwargs) + return wrapped_build + default_saver._build = _wraps_build( + default_saver._build) # pylint: disable=protected-access + + def _wraps_save(save_fn): + def wrapped_save(self, *args, **kwargs): + r'''Saves sharded variables. + ''' + write_meta_graph = kwargs.pop( + 'write_meta_graph', True) and rank == 0 + kwargs['write_meta_graph'] = write_meta_graph + write_state = kwargs.pop('write_state', True) and rank == 0 + kwargs['write_state'] = write_state + save_fn(self, *args, **kwargs) + return wrapped_save + default_saver.save = _wraps_save(default_saver.save) + + +class DefaultSaverRewriting(GraphRewriting): + r'''A SessionRunHook replaces default saver. + ''' + + def begin(self): + r''' initialize replica variables and enable synchronous dataset wrapper + ''' + replace_default_saver() + + +GraphRewriting.register(DefaultSaverRewriting) + + +##################### Estimator ########################## + +def export_all( + export_dir_base, + checkpoint_path, + signature_defs_and_main_op_fn, + assets_extra=None, + as_text=False, + clear_devices=True, + strip_default_attrs=True, + modes=None, + **kwargs): + r'''Build a SavedModel from variables in checkpoint. + + Args: + export_dir_base: A string containing a directory to write the exported + graph and checkpoints. + checkpoint_path: A path to a checkpoint. + signature_defs_and_main_op_fn: Function returns signature defs and main_op. + assets_extra: A dict specifying how to populate the assets.extra directory + within the exported SavedModel. Each key should give the destination + path (including the filename) relative to the assets.extra directory. + The corresponding value gives the full path of the source file to be + copied. For example, the simple case of copying a single file without + renaming it is specified as + `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`. + as_text: Whether or not to write the SavedModel proto in text format. + clear_devices: Whether or not to clear the device field. + strip_default_attrs: Whether or not to remove default-valued attributes + from the NodeDefs. + modes: List contains PREDICT, TRAIN or TEST. + + Returns: + Export directory if it's chief. 
+ ''' + if HvdContext.get().rank != 0: + return None + + export_dir = get_timestamped_export_dir(export_dir_base) + with ops.Graph().as_default(): + with HvdContext.scope(): + # Build graph. + signature_def_map = signature_defs_and_main_op_fn() + main_op = None + if isinstance(signature_def_map, (tuple, list)): + if len(signature_def_map) > 1: + main_op = signature_def_map[1] + signature_def_map = signature_def_map[0] + if not main_op: + main_op = _monitored_session.Scaffold.default_local_init_op() + if modes is None: + modes = [ModeKeys.PREDICT, ModeKeys.TRAIN, ModeKeys.EVAL] + modes = [ + m for m in modes + if SIGNATURE_KEY_MAP[m] in signature_def_map] + signature_def_map = { + k: signature_def_map[k] for k in signature_def_map + if k in [SIGNATURE_KEY_MAP[m] for m in modes]} + signature_tags = [EXPORT_TAG_MAP[m][0] for m in modes] + + b = builder.SavedModelBuilder(export_dir, **kwargs) + b._has_saved_variables = True # pylint: disable=protected-access + + # Copy variables. + saved_model_utils.get_or_create_variables_dir(export_dir) + export_checkpoint_path = saved_model_utils.get_variables_path( + export_dir) + checkpoint_files = [ + *gfile.Glob(f'{checkpoint_path}.index'), + *gfile.Glob(f'{checkpoint_path}.data-?????-of-?????')] + for f in checkpoint_files: + export_ckpt = re.sub( + compat.as_text(checkpoint_path), + compat.as_text(export_checkpoint_path), + f) + gfile.Copy(f, export_ckpt) + + # Add MetaGraph. + b.add_meta_graph( + tags=signature_tags, + signature_def_map=signature_def_map, + assets_collection=ops.get_collection( + ops.GraphKeys.ASSET_FILEPATHS), + clear_devices=clear_devices, + main_op=main_op, + strip_default_attrs=strip_default_attrs) + + # Save model. + b.save(as_text=as_text) + + # Save extras. + if assets_extra: + assets_extra_path = os.path.join( + export_dir, constants.EXTRA_ASSETS_DIRECTORY) + for dst, src in assets_extra.items(): + target = os.path.join( + assets_extra_path, compat.as_bytes(dst)) + gfile.MakeDirs(os.path.dirname(target)) + gfile.Copy(src, target) + +def wraps_model_fn(model_fn, model_dir, config): + r'''Decorator to set params in a model function. + ''' + def wrapped_model_fn(features, labels, mode, params): + r'''Wrapped model function. + ''' + with scope(): + estimator_spec = model_fn(features, labels, mode, params) + if estimator_spec.scaffold.saver is None: + estimator_spec.scaffold._saver = Saver( # pylint: disable=protected-access + max_to_keep=config.keep_checkpoint_max, + keep_checkpoint_every_n_hours=config.keep_checkpoint_every_n_hours, + defer_build=True, + save_relative_paths=True) + training_hooks = list(estimator_spec.training_hooks) or [] + training_chief_hooks = list( + estimator_spec.training_chief_hooks) or [] + estimator_spec = estimator_spec._replace( # pylint: disable=protected-access + training_hooks=training_hooks, + training_chief_hooks=training_chief_hooks) + return estimator_spec + return wrapped_model_fn + + +def start_std_server(config): + r'''Creates, starts, and returns a server_lib.Server. + ''' + logging.info('Start Tensorflow server.') + return server_lib.Server(config.cluster_spec, + job_name=config.task_type, + task_index=config.task_id, + config=wraps_session_config(config.session_config), + start=True, + protocol=config.protocol) + + +class ReuseVariables(object): # pylint: disable=useless-object-inheritance + r'''Variable reusing context. 
+ ''' + + def __call__(self, reuse): + reset_keras_uids() + varscope = ops.get_default_graph().get_collection_ref(('__varscope',)) + if varscope: + varscope[0].variable_scopes_count.clear() + vs.get_variable_scope()._reuse = reuse # pylint: disable=protected-access + + +@contextlib.contextmanager +def reuse_variables(reuse=None): + r'''Context manager that reuses variables. + ''' + try: + fn = ReuseVariables() + prev_reuse = vs.get_variable_scope()._reuse # pylint: disable=protected-access + if reuse is not None: + fn(reuse) + yield fn + finally: + vs.get_variable_scope()._reuse = prev_reuse # pylint: disable=protected-access + + +EvaluationSpec = collections.namedtuple( + 'EvaluationSpec', ['name', 'hooks', 'update_op', 'eval_dict']) + + +class EvaluationHook(session_run_hook.SessionRunHook): + r'''Hook to make evaluation along with training. + ''' + + def __init__( + self, fn, + steps=100, + every_n_iter=1000, + summary_dir=None, + history=None): + r'''Evaluates specific function. + + Args: + fn: Function returns update_op, metric ops and hooks. + steps: Number of steps for which to evaluate model. + every_n_iter: `int`, runs the evaluator once every N training iteration. + summary_dir: Directory for summaries. + history: History of eval metrics. history should support `append` method. + Raises: + ValueError: if `every_n_iter` is non-positive or it's not a single machine + training + ''' + if every_n_iter is None or every_n_iter <= 0: + raise ValueError(f'invalid every_n_iter={every_n_iter}.') + self._fn = fn + self._steps = steps + self._every_n_iter = every_n_iter + self._summary_dir = summary_dir + self._history = history + + def begin(self): + r'''Preprocess global step and evaluation's hooks. + ''' + self._evaluation_specs = ops.get_collection_ref( + EvaluationSpec.__name__) + if len(self._evaluation_specs) > 0: + raise ValueError('Only one evaluation spec allowed in a graph') + + self._timer = None + self._iter_count = 0 + self._hooks = [] + + self._timer = basic_session_run_hooks.SecondOrStepTimer( + every_steps=self._every_n_iter) + self._timer.reset() + + with reuse_variables(vs.AUTO_REUSE): + with scope(): + with ops.name_scope(ModeKeys.EVAL): + eval_spec = self._fn() + if isinstance(eval_spec, dict): + eval_dict = {} + update_ops = [] + for metric_name, metric_val_and_update in eval_spec.items(): + if not isinstance(metric_name, six.string_types): + raise ValueError( + f'Metric name {metric_name} should be a str') + if (not isinstance(metric_val_and_update, (tuple, list)) + or len(metric_val_and_update) != 2): + raise ValueError( + f'{metric_val_and_update} should be a tuple ' + 'of (metric, update_op)') + eval_dict[metric_name] = metric_val_and_update[0] + update_ops.append(metric_val_and_update[1]) + update_op = control_flow_ops.group(update_ops) + eval_spec = EvaluationSpec( + name=EvaluationSpec.__name__, + hooks=None, + update_op=update_op, + eval_dict=eval_dict) + if not isinstance(eval_spec, EvaluationSpec): + raise ValueError( + 'eval_fn should return a dict or a EvaluationSpec') + self._evaluation_specs.append(eval_spec) + if eval_spec.hooks: + self._hooks.extend(eval_spec.hooks) + eval_dict = dict(eval_spec.eval_dict) + if ops.GraphKeys.GLOBAL_STEP not in eval_dict: + global_step_tensor = training_util.get_global_step( + ops.get_default_graph()) + eval_dict[ops.GraphKeys.GLOBAL_STEP] = global_step_tensor + for h in self._hooks: + h.begin() + self._update_op = eval_spec.update_op + self._metrics = eval_dict + + def after_create_session(self, session, coord): # pylint: 
disable=unused-argument + r'''Call evaluation's hooks. + ''' + if ops.get_collection(ops.GraphKeys.SAVEABLE_OBJECTS): + raise ValueError( + 'EvaluationHook does not support saveables other than global ' + 'variables.') + for h in self._hooks: + h.after_create_session(session, coord) + + def after_run(self, run_context, run_values): # pylint: disable=unused-argument + r'''Runs evaluator after session run. + ''' + self._iter_count += 1 + if self._timer.should_trigger_for_step(self._iter_count): + ctx_stop_requested = run_context.stop_requested + run_context._stop_requested = False # pylint: disable=protected-access + self._evaluate(run_context) + run_context._stop_requested = ctx_stop_requested # pylint: disable=protected-access + + def _call_before_run_hooks( + self, run_context, fetch_dict, user_feed_dict=None): + r'''Call hooks.before_run and handle requests from hooks. + ''' + hook_feeds = {} + for hook in self._hooks: + request = hook.before_run(run_context) + if request is not None: + if request.fetches is not None: + fetch_dict[hook] = request.fetches + if request.feed_dict: + hook_feeds.update(request.feed_dict) + + if not hook_feeds: + return user_feed_dict + + if not user_feed_dict: + return hook_feeds + + hook_feeds.update(user_feed_dict) + return hook_feeds + + def _run(self, run_context, fetches): + r'''Run the evaluation. + ''' + if isinstance(fetches, dict): + actual_fetches = fetches + else: + actual_fetches = {fetches: fetches} + eval_metrics = self._call_before_run_hooks( + run_context, actual_fetches) + eval_results = run_context.session.run( + actual_fetches, feed_dict=eval_metrics) + for hook in self._hooks: + hook.after_run( + run_context, + session_run_hook.SessionRunValues( + results=eval_results.pop(hook, None), + options=config_pb2.RunOptions(), + run_metadata=config_pb2.RunMetadata())) + return eval_results + + def _write_dict_to_summary(self, dictionary): + r'''Write evaluation results to summary directory. 
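+
+    Scalar values, serialized `Summary` bytes and `np.ndarray` values are
+    written; other value types are skipped with a warning.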
+ ''' + current_global_step = dictionary[ops.GraphKeys.GLOBAL_STEP] + prev_np_printoptions = np.get_printoptions() + np.set_printoptions(suppress=True) + stats = ', '.join( + f'{k} = {v}' + for k, v in sorted(six.iteritems(dictionary)) + if not ( + isinstance(v, six.binary_type) + or k == ops.GraphKeys.GLOBAL_STEP)) + np.set_printoptions(**prev_np_printoptions) + logging.info('Saving metrics for step %d: %s', + current_global_step, stats) + + summary_dir = self._summary_dir + if HvdContext.get().world_size > 1: + summary_dir = os.path.join(summary_dir, f'{HvdContext.get().rank}') + summary_writer = core_summary.FileWriterCache.get(summary_dir) + summary_proto = summary_pb2.Summary() + + for key in dictionary: + if dictionary[key] is None: + continue + if key == 'global_step': + continue + if isinstance(dictionary[key], (np.float32, float)): + summary_proto.value.add( + tag=key, simple_value=float(dictionary[key])) + elif isinstance(dictionary[key], (np.int64, np.int32, int)): + summary_proto.value.add( + tag=key, simple_value=int(dictionary[key])) + elif isinstance(dictionary[key], six.binary_type): + try: + summ = summary_pb2.Summary.FromString(dictionary[key]) + for i, _ in enumerate(summ.value): + summ.value[i].tag = f'{key}/{i}' + summary_proto.value.extend(summ.value) + except message.DecodeError: + logging.warning( + 'Skipping summary for %s, cannot parse string to Summary.', key) + continue + elif isinstance(dictionary[key], np.ndarray): + value = summary_proto.value.add() + value.tag = key + value.node_name = key + tensor_proto = tensor_util.make_tensor_proto(dictionary[key]) + value.tensor.CopyFrom(tensor_proto) + # pylint: disable=line-too-long + logging.info( + 'Summary for np.ndarray is not visible in Tensorboard by default. ' + 'Consider using a Tensorboard plugin for visualization (see ' + 'https://github.com/tensorflow/tensorboard-plugin-example/blob/master/README.md' + ' for more information).') + # pylint: enable=line-too-long + else: + logging.warning( + 'Skipping summary for %s, must be a float, np.float32, np.int64, ' + 'np.int32 or int or np.ndarray or a serialized string of Summary.', + key) + summary_writer.add_summary(summary_proto, current_global_step) + summary_writer.flush() + + def _evaluate(self, run_context): + r'''Evaluate on run context. + ''' + for _ in range(self._steps): + if not run_context.stop_requested: + self._run(run_context, self._update_op) + metric_values = self._run(run_context, self._metrics) + if metric_values is not None: + if self._history is not None: + self._history.append(metric_values) + self._write_dict_to_summary(metric_values) + self._timer.update_last_triggered_step(self._iter_count) + +class HorovodEstimatorBase(object): # pylint: disable=useless-object-inheritance + r'''Base class of estimator wrapper. + ''' + +def wraps_estimator(cls): + r'''Estimator decorator to train and evaluate in parallel. + ''' + if issubclass(cls, HorovodEstimatorBase): + return cls + + class HorovodEstimator(cls, HorovodEstimatorBase): + r'''Class to train and evaluate TensorFlow models. + ''' + + def __init__(self, model_fn, **kwargs): + r'''Constructs a wrapped `Estimator` instance. + + Args: + model_fn: Model function. See + `tensorflow_estimator/python/estimator/estimator.py#L145` + for more information. + kwargs: Estimator arguments. 
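+        Additionally, `train_drop_remainder`, `eval_drop_remainder` and
+        `predict_drop_remainder` (all default `True`) are popped from
+        `kwargs` before it is forwarded to the parent class.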
+ ''' + model_dir = kwargs.get('model_dir', None) + self._train_drop_remainder = kwargs.pop( + 'train_drop_remainder', True) + self._eval_drop_remainder = kwargs.pop('eval_drop_remainder', True) + self._predict_drop_remainder = kwargs.pop( + 'predict_drop_remainder', True) + + super().__init__( + wraps_model_fn(model_fn, model_dir, kwargs['config']), + **kwargs) + + def _assert_members_are_not_overridden(self): + r'''disable the overridden check here. + ''' + + def train( + self, input_fn, hooks=None, max_steps=None, saving_listeners=None): + r'''support sync_dataset in training. + ''' + if saving_listeners is None: + saving_listeners = [] + with scope(): + return super().train( + input_fn, hooks=hooks, max_steps=max_steps, + saving_listeners=saving_listeners) + + def evaluate(self, + input_fn, + steps=None, + hooks=None, + checkpoint_path=None, + name=None): + r'''support standalone evaluation. + ''' + _estimator_lib._estimator_api_gauge.get_cell( + 'evaluate').set(True) # pylint: disable=protected-access + if self.config.cluster_spec: + if estimator_training.should_run_distribute_coordinator(self.config): + raise ValueError( + 'Running `evaluate` with Distribute Coordinator ' + 'not supported.') + if not _is_google_env(): + start_std_server(self.config) + + start_delay_secs = 0 + if self.config.task_type == run_config_lib.TaskType.WORKER: + max_delay_secs = _MAX_DELAY_SECS + if self.config.experimental_max_worker_delay_secs is not None: + max_delay_secs = int( + self.config.experimental_max_worker_delay_secs) + start_delay_secs = min( + max_delay_secs, + (self.config.task_id + 1) * _DELAY_SECS_PER_WORKER) + + if start_delay_secs > 0: + logging.info( + f'Waiting {start_delay_secs} secs before starting evaluation.') + time.sleep(start_delay_secs) + + return self._actual_eval( + input_fn, + strategy=self._eval_distribution, + steps=steps, + hooks=hooks, + checkpoint_path=checkpoint_path, + name=name) + + def _actual_eval( + self, input_fn, strategy=None, steps=None, hooks=None, + checkpoint_path=None, name=None): + if strategy: + raise ValueError('DistributionStrategy not supported') + + with _context.graph_mode(), HvdContext.scope(): + hooks = _estimator_lib._check_hooks_type( + hooks) # pylint: disable=protected-access + hooks.extend(self._convert_eval_steps_to_hooks( + steps)) # pylint: disable=protected-access + if not checkpoint_path: + latest_path = checkpoint_management.latest_checkpoint( + self._model_dir) # pylint: disable=protected-access + if not latest_path: + raise ValueError( + f'Could not find trained model in model_dir: {self._model_dir}.') # pylint: disable=protected-access + checkpoint_path = latest_path + + with ops.Graph().as_default() as g, g.device(self._device_fn): # pylint: disable=protected-access + with ops.name_scope(ModeKeys.EVAL), reuse_variables(vs.AUTO_REUSE): + (scaffold, update_op, eval_dict, all_hooks) = ( + self._evaluate_build_graph( # pylint: disable=protected-access + input_fn, + hooks, checkpoint_path)) + return self._evaluate_run( # pylint: disable=protected-access + checkpoint_path=checkpoint_path, + scaffold=scaffold, + update_op=update_op, + eval_dict=eval_dict, + all_hooks=all_hooks, + output_dir=self.eval_dir(name)) + + def train_and_evaluate( + self, train_spec, eval_spec, + eval_every_n_iter=None, + eval_history=None): + r'''Train and evaluate the `estimator`. + + Args: + eval_every_n_iter: `int`, runs parallel evaluation once every + N training iteration. If None, disable the evaluation. + eval_history: History of eval metrics. 
eval_history should support + `append` method. + ''' + train_hooks = [] + if eval_every_n_iter is not None: + def _eval_fn(): + with scope(): + (_, evaluation_hooks, input_hooks, update_op, eval_dict) = ( + self._call_model_fn_eval( # pylint: disable=protected-access + eval_spec.input_fn, self.config)) + hooks = list(evaluation_hooks) or [] + hooks.extend(list(input_hooks) or []) + return EvaluationSpec( + name=EvaluationSpec.__name__, + hooks=hooks, + update_op=update_op, + eval_dict=eval_dict) + eval_hook = EvaluationHook( + _eval_fn, + steps=eval_spec.steps, + every_n_iter=eval_every_n_iter, + summary_dir=self.eval_dir(), + history=eval_history) + train_hooks.append(eval_hook) + + if self.config.cluster_spec: + if estimator_training.should_run_distribute_coordinator(self.config): + raise ValueError( + 'Running `train_and_evaluate` with Distribute Coordinator ' + 'not supported.') + + executor = _TrainingExecutor( + estimator=self, + train_spec=train_spec, + eval_spec=eval_spec, + train_hooks=train_hooks) + return executor.run() + + return self.train( + train_spec.input_fn, + hooks=tuple(train_spec.hooks) + tuple(train_hooks), + max_steps=train_spec.max_steps) + + def export_saved_model( + self, export_dir_base, serving_input_receiver_fn, + assets_extra=None, + as_text=False, + checkpoint_path=None, + experimental_mode=ModeKeys.PREDICT, + **kwargs): + r'''Exports inference graph as a `SavedModel` into the given dir. + ''' + if not serving_input_receiver_fn: + raise ValueError('An input_receiver_fn must be defined.') + + input_receiver_fn_map = { + experimental_mode: serving_input_receiver_fn} + + return self._export_all_saved_models( + export_dir_base, + input_receiver_fn_map, + assets_extra=assets_extra, + as_text=as_text, + checkpoint_path=checkpoint_path, + strip_default_attrs=True, + **kwargs) + + def experimental_export_all_saved_models( + self, export_dir_base, input_receiver_fn_map, + assets_extra=None, + as_text=False, + checkpoint_path=None, + **kwargs): + r'''Exports a `SavedModel` with `tf.MetaGraphDefs` for each requested + mode. + ''' + return self._export_all_saved_models( + export_dir_base, input_receiver_fn_map, + assets_extra=assets_extra, + as_text=as_text, + checkpoint_path=checkpoint_path, + strip_default_attrs=True, + **kwargs) + + def _export_all_saved_models( + self, + export_dir_base, + input_receiver_fn_map, + assets_extra=None, + as_text=False, + checkpoint_path=None, + strip_default_attrs=True, + **kwargs): + r'''Exports multiple modes in the model function to a SavedModel. 
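+
+      Note that only the PREDICT mode is supported here, and non-chief ranks
+      return `None`.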
+ ''' + if (input_receiver_fn_map.get(ModeKeys.TRAIN) + or input_receiver_fn_map.get(ModeKeys.EVAL) + or not input_receiver_fn_map.get(ModeKeys.PREDICT)): + raise ValueError('Only PREDICT mode is supported.') + mode = ModeKeys.PREDICT + + if HvdContext.get().rank != 0: + return None + + if not checkpoint_path: + checkpoint_path = checkpoint_management.latest_checkpoint( + self._model_dir) + if not checkpoint_path: + if self._warm_start_settings: + checkpoint_path = self._warm_start_settings.ckpt_to_initialize_from + if gfile.IsDirectory(checkpoint_path): + checkpoint_path = checkpoint_management.latest_checkpoint( + checkpoint_path) + else: + raise ValueError( + f'Couldn\'t find trained model at {self._model_dir}.') + + def _fn(): + random_seed.set_random_seed(self._config.tf_random_seed) + + input_receiver_fn = input_receiver_fn_map[mode] + input_receiver = input_receiver_fn() + estimator_spec = self._call_model_fn( + features=input_receiver.features, + labels=getattr(input_receiver, 'labels', None), + mode=mode, + config=self.config) + export_outputs = export_lib.export_outputs_for_mode( + mode=estimator_spec.mode, + serving_export_outputs=estimator_spec.export_outputs, + predictions=estimator_spec.predictions, + loss=estimator_spec.loss, + metrics=estimator_spec.eval_metric_ops) + signature_def_map = export_lib.build_all_signature_defs( + input_receiver.receiver_tensors, + export_outputs, + getattr(input_receiver, + 'receiver_tensors_alternatives', None), + serving_only=(mode == ModeKeys.PREDICT)) + main_op = None + if estimator_spec.scaffold.local_init_op is not None: + main_op = estimator_spec.scaffold.local_init_op + return signature_def_map, main_op + + return export_all( + export_dir_base, + checkpoint_path, + _fn, + assets_extra=assets_extra, + as_text=as_text, + clear_devices=True, + strip_default_attrs=strip_default_attrs, + modes=[mode], + **kwargs) + + def _actual_predict( + self, input_fn, + predict_keys=None, + hooks=None, + checkpoint_path=None, + yield_single_examples=True): + r'''Predict method of estimator in HB. + ''' + with _context.graph_mode(), HvdContext.scope(): + hooks = _estimator_lib._check_hooks_type( + hooks) # pylint: disable=protected-access + # Check that model has been trained. + if not checkpoint_path: + checkpoint_path = checkpoint_management.latest_checkpoint( + self._model_dir) + if not checkpoint_path: + logging.info( + f'Could not find trained model in model_dir: {self._model_dir},' + f'running initialization to predict.') + with ops.Graph().as_default() as g, g.device(self._device_fn): + with ops.name_scope(ModeKeys.PREDICT): + random_seed.set_random_seed( + self._config.tf_random_seed) + self._create_and_assert_global_step(g) + features, input_hooks = self._get_features_from_input_fn( + input_fn, ModeKeys.PREDICT) + estimator_spec = self._call_model_fn( + features, None, ModeKeys.PREDICT, self.config) + + # Call to warm_start has to be after model_fn is called. 
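+          # (warm-start settings reference variables that model_fn has just
+          # created).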
+          self._maybe_warm_start(checkpoint_path)
+
+          predictions = self._extract_keys(estimator_spec.predictions,
+                                           predict_keys)
+          all_hooks = list(input_hooks)
+          all_hooks.extend(hooks)
+          all_hooks.extend(
+            list(estimator_spec.prediction_hooks or []))
+          with _monitored_session.MonitoredSession(
+              session_creator=_monitored_session.ChiefSessionCreator(
+                checkpoint_filename_with_path=checkpoint_path,
+                master=self._config.master,
+                scaffold=estimator_spec.scaffold,
+                config=self._session_config),
+              hooks=all_hooks) as mon_sess:
+            while not mon_sess.should_stop():
+              preds_evaluated = mon_sess.run(predictions)
+              if not yield_single_examples:
+                yield preds_evaluated
+              elif not isinstance(predictions, dict):
+                for pred in preds_evaluated:
+                  yield pred
+              else:
+                for i in range(self._extract_batch_length(preds_evaluated)):
+                  yield {
+                    key: value[i]
+                    for key, value in six.iteritems(preds_evaluated)
+                  }
+
+    def predict(
+        self, input_fn,
+        predict_keys=None,
+        hooks=None,
+        checkpoint_path=None,
+        yield_single_examples=True):
+      r'''Predict method of estimator in HB.
+      '''
+      _estimator_lib._estimator_api_gauge.get_cell(
+        'predict').set(True)  # pylint: disable=protected-access
+      if self.config.cluster_spec:
+        if estimator_training.should_run_distribute_coordinator(self.config):
+          raise ValueError(
+            'Running `predict` with Distribute Coordinator '
+            'not supported.')
+        if not _is_google_env():
+          start_std_server(self.config)
+
+      return self._actual_predict(
+        input_fn,
+        predict_keys=predict_keys,
+        hooks=hooks,
+        checkpoint_path=checkpoint_path,
+        yield_single_examples=yield_single_examples)
+
+  return HorovodEstimator
+
+
+##################### public interface ##########################
+
+
+@contextlib.contextmanager
+def scope(**kwargs):
+  with GraphRewriting.scope(**kwargs) as ctx:
+    yield ctx
+
+
+@contextlib.contextmanager
+def embedding_scope(**kwargs):
+  with GraphRewriting.scope(sharded=True, **kwargs) as ctx:
+    yield ctx
+
+
+def export(export_dir_base,
+           checkpoint_path,
+           signature_def_fn,
+           assets_extra=None,
+           as_text=False,
+           clear_devices=True,
+           strip_default_attrs=True,
+           mode=ModeKeys.PREDICT):
+  r'''Build a single-mode SavedModel from variables in checkpoint.
+  '''
+  return export_all(export_dir_base,
+                    checkpoint_path,
+                    lambda: {SIGNATURE_KEY_MAP[mode]: signature_def_fn()},
+                    assets_extra=assets_extra,
+                    as_text=as_text,
+                    clear_devices=clear_devices,
+                    strip_default_attrs=strip_default_attrs,
+                    modes=[mode])
\ No newline at end of file
diff --git a/tensorflow/python/distribute/launch.py b/tensorflow/python/distribute/launch.py
new file mode 100644
index 00000000000..60dc93550e5
--- /dev/null
+++ b/tensorflow/python/distribute/launch.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python
+
+# Copyright 2023 Alibaba Group Holding Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import json
+import logging
+import multiprocessing as mp
+import os
+import subprocess
+import sys
+import time
+import signal
+
+
+def sigintHandler(signum, frame):
+    print("exiting process")
+    sys.exit(-1)
+
+
+def _query_visible_devices():
+    r"""Query visible devices."""
+    visible_devices_str = os.getenv("CUDA_VISIBLE_DEVICES", "")
+    if not visible_devices_str:
+        visible_devices_str = os.getenv("NVIDIA_VISIBLE_DEVICES", "")
+    if not visible_devices_str or visible_devices_str == "void":
+        return []
+    if visible_devices_str != "all":
+        try:
+            return visible_devices_str.split(",")
+        except:  # pylint: disable=bare-except
+            logging.exception("Parse NVIDIA_VISIBLE_DEVICES failed:")
+            return []
+    query_devices_command = (
+        "nvidia-smi --query-gpu=uuid --format=csv,noheader 2>/dev/null"
+    )
+    try:
+        with subprocess.Popen(
+            query_devices_command,
+            shell=True,
+            stderr=subprocess.STDOUT,
+            stdout=subprocess.PIPE,
+        ) as proc:
+            # Decode the UUIDs and drop trailing newlines so that the values
+            # can be placed into environment variables directly.
+            return [
+                d.decode().strip()
+                for d in iter(proc.stdout.readline, b"")
+                if d.strip()]
+    except (OSError, ValueError):
+        return []
+
+
+def launch(command):
+    r"""Run command in subprocess."""
+    visible_devices = _query_visible_devices()
+    local_world_size_str = str(len(visible_devices))
+    strategy = os.getenv("COLLECTIVE_STRATEGY", "hb")
+
+    signal.signal(signal.SIGINT, sigintHandler)
+    signal.signal(signal.SIGHUP, sigintHandler)
+    signal.signal(signal.SIGTERM, sigintHandler)
+
+    if strategy == "hb":
+        port = int(os.getenv("HB_RUN_BASE_PORT", "20001"))
+        device_to_ports = []
+        for d in visible_devices:
+            device_to_ports.append([d, port])
+            port += 1
+
+        if len(device_to_ports) < 1:
+            os.environ["CUDA_VISIBLE_DEVICES"] = ""
+            os.environ["HB_OP_OPTIMIZATION_DISABLED"] = "1"
+            if callable(command):
+                command()
+                return
+            subprocess.check_call(command)
+            return
+
+        if len(device_to_ports) == 1:
+            os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+            if callable(command):
+                command()
+                return
+            subprocess.check_call(command)
+            return
+
+        tf_config = json.loads(os.getenv("TF_CONFIG", "{}"))
+        if tf_config:
+            task = tf_config["task"]
+            task_type = task["type"]
+            task_id = int(task["index"])
+            cluster = tf_config["cluster"]
+        else:
+            task_type = "chief"
+            task_id = 0
+            cluster = {"chief": ["127.0.0.1:20000"]}
+
+        workers = []
+        if "chief" in cluster:
+            workers.extend(cluster["chief"])
+        if "worker" in cluster:
+            workers.extend(cluster["worker"])
+        worker_hosts = [w.split(":")[0] for w in workers]
+        new_workers = [
+            f"{h}:{p}" for h in worker_hosts for _, p in device_to_ports]
+        new_cluster = cluster.copy()
+        if "chief" in cluster:
+            new_cluster["chief"] = [new_workers[0]]
+            if len(new_workers) > 1:
+                new_cluster["worker"] = new_workers[1:]
+        else:
+            new_cluster["worker"] = new_workers
+
+        if task_type not in ("chief", "worker"):
+            new_tf_config = {}
+            new_tf_config["cluster"] = new_cluster
+            new_tf_config["task"] = {}
+            new_tf_config["task"]["type"] = task_type
+            new_tf_config["task"]["index"] = task_id
+            os.environ["TF_CONFIG"] = json.dumps(new_tf_config)
+            os.environ["TF_TASK_TYPE"] = str(task_type)
+            os.environ["TF_TASK_INDEX"] = str(task_id)
+            os.environ["CUDA_VISIBLE_DEVICES"] = ""
+            os.environ["HB_OP_OPTIMIZATION_DISABLED"] = "1"
+            if callable(command):
+                command()
+                return
+            subprocess.check_call(command)
+            return
+
+        cpu_count = os.cpu_count()
+        interop_threads = os.getenv("TF_NUM_INTEROP_THREADS", cpu_count)
+        interop_threads_gpu = None
+        if interop_threads:
+            interop_threads_gpu = int(
+                int(interop_threads) / len(device_to_ports))
+            interop_threads_gpu = max(interop_threads_gpu, 4)
+        intraop_threads = os.getenv("TF_NUM_INTRAOP_THREADS", cpu_count)
+        intraop_threads_gpu = None
+        if intraop_threads:
+            intraop_threads_gpu = int(
+                int(intraop_threads) / len(device_to_ports))
+            intraop_threads_gpu = max(intraop_threads_gpu, 1)
+        gpu_procs = {}
+        gpu_envs = {}
+        local_host = cluster[task_type][task_id].split(":")[0]
+        for device, port in device_to_ports:
+            gpu_addr = f"{local_host}:{port}"
+            gpu_index = new_workers.index(gpu_addr)
+            gpu_tf_config = {}
+            gpu_tf_config["cluster"] = new_cluster
+            gpu_tf_config["task"] = {}
+            if "chief" in cluster:
+                if gpu_index == 0:
+                    gpu_tf_config["task"]["type"] = "chief"
+                    gpu_tf_config["task"]["index"] = 0
+                else:
+                    gpu_tf_config["task"]["type"] = "worker"
+                    gpu_tf_config["task"]["index"] = gpu_index - 1
+            else:
+                gpu_tf_config["task"]["type"] = "worker"
+                gpu_tf_config["task"]["index"] = gpu_index
+            gpu_env = os.environ.copy()
+            gpu_env["TF_CONFIG"] = json.dumps(gpu_tf_config)
+            gpu_env["TF_TASK_TYPE"] = gpu_tf_config["task"]["type"]
+            gpu_env["TF_TASK_INDEX"] = str(gpu_tf_config["task"]["index"])
+            gpu_env["CUDA_VISIBLE_DEVICES"] = device
+            gpu_env["LOCAL_WORLD_SIZE"] = local_world_size_str
+            if interop_threads_gpu:
+                gpu_env["TF_NUM_INTEROP_THREADS"] = str(interop_threads_gpu)
+            if intraop_threads_gpu:
+                gpu_env["TF_NUM_INTRAOP_THREADS"] = str(intraop_threads_gpu)
+            gpu_envs[device] = gpu_env
+
+        if callable(command):
+            procs = {}
+            for device, _ in device_to_ports:
+
+                def _target(env):
+                    # Replace the contents of os.environ instead of rebinding
+                    # it, so the per-GPU settings are also visible to native
+                    # code and to subprocesses.
+                    os.environ.clear()
+                    os.environ.update(env)
+                    command()
+
+                proc = mp.Process(target=_target, args=(gpu_envs[device],))
+                proc.start()
+                procs[device] = proc
+            done_procs = []
+            for device, proc in procs.items():
+                proc.join()
+                done_procs.append(device)
+                if proc.exitcode is not None and proc.exitcode != 0:
+                    for term_gid, term_proc in procs.items():
+                        if term_gid not in done_procs:
+                            term_proc.terminate()
+                            done_procs.append(term_gid)
+                    if proc.exitcode < 0:
+                        sys.exit(
+                            f"Process {proc.pid} killed by "
+                            f"{signal.Signals(-proc.exitcode).name}"
+                        )
+                    else:
+                        sys.exit(
+                            f"Process {proc.pid} exits unexpectedly: {proc.exitcode}"
+                        )
+            return
+
+        for device, _ in device_to_ports:
+            gpu_proc = subprocess.Popen(  # pylint: disable=consider-using-with
+                command, env=gpu_envs[device], stdout=sys.stdout, stderr=sys.stderr
+            )
+            gpu_procs[gpu_proc.pid] = gpu_proc
+        while True:
+            if len(gpu_procs) < 1:
+                break
+            done_pids = []
+            for pid, proc in gpu_procs.items():
+                proc.poll()
+                if proc.returncode is not None:
+                    if proc.returncode == 0:
+                        done_pids.append(pid)
+                    else:
+                        sys.exit(proc.returncode)
+            for pid in done_pids:
+                del gpu_procs[pid]
+            time.sleep(1)
+
+    elif strategy == "sok":
+
+        def func_horovod_command(rank_size_str):
+            horovod_command = ["horovodrun", "-np", rank_size_str]
+            subprocess_command = []
+            # The command may arrive as a list (e.g. ["python", "main.py"])
+            # or as a single string.
+            if isinstance(command, list):
+                for cmd in command:
+                    subprocess_command.extend(cmd.split(" "))
+            elif isinstance(command, str):
+                subprocess_command.append(command)
+            horovod_command.extend(subprocess_command)
+            return horovod_command
+
+        port = int(os.getenv("HB_RUN_BASE_PORT", "20001"))
+        device_to_ports = []
+        for d in visible_devices:
+            device_to_ports.append([d, port])
+            port += 1
+
+        if len(device_to_ports) < 1:
+            logging.error("SOK mode currently does not support CPU-only execution.")
+            return
+
+        if len(device_to_ports) == 1:
+            os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+            horovod_command = func_horovod_command(local_world_size_str)
+            proc = subprocess.Popen(
+                horovod_command, stdout=sys.stdout, stderr=sys.stderr)
+            try:
+                signal.pause()
+            finally:
+                proc.terminate()
+                if proc.poll() is None:
+                    proc.kill()
+            return
+
+        cpu_count = os.cpu_count()
+        interop_threads = os.getenv("TF_NUM_INTEROP_THREADS", cpu_count)
+        interop_threads_gpu = None
+        if interop_threads:
+            interop_threads_gpu = int(
+                int(interop_threads) / len(device_to_ports))
+            interop_threads_gpu = max(interop_threads_gpu, 4)
+        intraop_threads = os.getenv("TF_NUM_INTRAOP_THREADS", cpu_count)
+        intraop_threads_gpu = None
+        if intraop_threads:
+            intraop_threads_gpu = int(
+                int(intraop_threads) / len(device_to_ports))
+            intraop_threads_gpu = max(intraop_threads_gpu, 1)
+
+        envs = os.environ.copy()
+        envs["TF_CONFIG"] = "{}"
+        envs["TF_NUM_INTEROP_THREADS"] = str(interop_threads_gpu)
+        envs["TF_NUM_INTRAOP_THREADS"] = str(intraop_threads_gpu)
+
+        horovod_command = func_horovod_command(local_world_size_str)
+        proc = subprocess.Popen(
+            horovod_command, stdout=sys.stdout, stderr=sys.stderr)
+
+        # Wait in the parent process until the child process finishes.
+        pid, status = os.wait()
+
+        if os.WIFEXITED(status):
+            print("subprocess exited normally with exit code:",
+                  os.WEXITSTATUS(status))
+        elif os.WIFSIGNALED(status):
+            print("subprocess was terminated by signal:", os.WTERMSIG(status))
+
+    else:
+        logging.error("ENV `COLLECTIVE_STRATEGY` is unrecognized.")
+        return
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("command", nargs="?", help="Command to launch script")
+    parser.add_argument(
+        "args", nargs=argparse.REMAINDER, help="Arguments of the command"
+    )
+    args = parser.parse_args()
+    launch([args.command] + args.args)
diff --git a/tensorflow/python/framework/group_embedding_types.py b/tensorflow/python/framework/group_embedding_types.py
index bc38eeac1ab..450c8cfa0d0 100644
--- a/tensorflow/python/framework/group_embedding_types.py
+++ b/tensorflow/python/framework/group_embedding_types.py
@@ -22,7 +22,8 @@
 @unique
 class DistStrategy(Enum):
-  COLLECTIVE = "collective"
+  SOK = "sok"
+  HB = "hb"
   DISTRIBUTED = "ps"
   LOCALIZED = "localized"
   UNKNOWN = "unknown"
@@ -31,8 +32,10 @@ class DistStrategy(Enum):
 def set_group_lookup_strategy(strategy):
   def str_to_strategy(strategy):
-    if strategy == "collective":
-      return DistStrategy.COLLECTIVE
+    if strategy == "sok":
+      return DistStrategy.SOK
+    elif strategy == "hb":
+      return DistStrategy.HB
     elif strategy == "ps":
       return DistStrategy.DISTRIBUTED
     elif strategy == "localized":
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 8c98d2b59f3..cb2b7bb8154 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -1664,8 +1664,17 @@ def group_embedding_lookup_sparse(params,
   )
 
   strategy = get_group_lookup_strategy()
-  if strategy == DistStrategy.COLLECTIVE:
-    for (index, param) in enumerate(params):
+  if strategy == DistStrategy.SOK:
+    import horovod.tensorflow as hvd
+    should_shard = False
+    if len(params) > hvd.size():
+      should_shard = True
+    global_size = hvd.size()
+    if should_shard:
+      for (index, param) in enumerate(params):
+        param.target_gpu = index % global_size
+    else:
+      for (index, param) in enumerate(params):
       param.target_gpu = -1
 
   try:
@@ -1676,6 +1685,21 @@
     with ops.name_scope(name, 'group_embedding_lookup',
                         params + sp_ids) as name_scope:
       emb_vec = sok.lookup_sparse(params, sp_ids, combiners=combiners)
+  elif strategy == DistStrategy.HB:
+    emb_vec
= [] + with ops.name_scope(name, 'group_embedding_lookup', params + + sp_ids) as name_scope: + for idx, embedding in enumerate(params): + if not ignore_weights: + sp_weight = sp_weights[idx] + else: + sp_weight = None + emb_vec.append(embedding_lookup_sparse(embedding, + sp_ids[idx], + sp_weight, + combiner=combiners[idx])) + + elif strategy == DistStrategy.LOCALIZED: emb_vec = [None for _ in range(len(params))] From e2037deae15703b43968ac0ab27519d96d78e0cf Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Thu, 20 Jul 2023 02:51:31 -0700 Subject: [PATCH 42/91] [Modelzoo & Docs] Add examples and docs to demonstrate Collective Training. (#914) - Add example model to demonstrate usage of Collective Training function. - Add user documentation about Collective Training Interface. Signed-off-by: JunqiHu --- ...evel-py3.8-cu116-ubuntu20.04-hybridbackend | 136 +++ docs/docs_en/Collective-Training.md | 176 ++++ docs/docs_zh/Collective-Training.md | 173 ++++ .../group_embedding/dcnv2/result/README.md | 2 - modelzoo/features/group_embedding/script.sh | 7 - .../dcnv2/README.md | 0 .../dcnv2/data/README.md | 0 .../dcnv2/train.py | 330 +++---- .../grouped_embedding/deepfm/README.md | 286 ++++++ .../grouped_embedding/deepfm/data/README.md | 10 + .../grouped_embedding/deepfm/train.py | 872 ++++++++++++++++++ modelzoo/features/grouped_embedding/script.sh | 21 + 12 files changed, 1820 insertions(+), 193 deletions(-) create mode 100644 cibuild/dockerfiles/Dockerfile.devel-py3.8-cu116-ubuntu20.04-hybridbackend create mode 100644 docs/docs_en/Collective-Training.md create mode 100644 docs/docs_zh/Collective-Training.md delete mode 100644 modelzoo/features/group_embedding/dcnv2/result/README.md delete mode 100644 modelzoo/features/group_embedding/script.sh rename modelzoo/features/{group_embedding => grouped_embedding}/dcnv2/README.md (100%) rename modelzoo/features/{group_embedding => grouped_embedding}/dcnv2/data/README.md (100%) rename modelzoo/features/{group_embedding => grouped_embedding}/dcnv2/train.py (68%) create mode 100644 modelzoo/features/grouped_embedding/deepfm/README.md create mode 100644 modelzoo/features/grouped_embedding/deepfm/data/README.md create mode 100644 modelzoo/features/grouped_embedding/deepfm/train.py create mode 100644 modelzoo/features/grouped_embedding/script.sh diff --git a/cibuild/dockerfiles/Dockerfile.devel-py3.8-cu116-ubuntu20.04-hybridbackend b/cibuild/dockerfiles/Dockerfile.devel-py3.8-cu116-ubuntu20.04-hybridbackend new file mode 100644 index 00000000000..4e7f951cec1 --- /dev/null +++ b/cibuild/dockerfiles/Dockerfile.devel-py3.8-cu116-ubuntu20.04-hybridbackend @@ -0,0 +1,136 @@ +FROM alideeprec/deeprec-build:deeprec-dev-gpu-py38-cu116-ubuntu20.04 + +RUN apt-get update && \ + apt-get install -y \ + --allow-unauthenticated \ + --no-install-recommends \ + pkg-config \ + libssl-dev \ + libcurl4-openssl-dev \ + zlib1g-dev \ + libhdf5-dev \ + wget \ + curl \ + inetutils-ping \ + net-tools \ + unzip \ + git \ + vim \ + cmake \ + clang-format-7 \ + openssh-server openssh-client \ + openmpi-bin openmpi-common libopenmpi-dev libgtk2.0-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN wget -nv -O /opt/openmpi-4.1.1.tar.gz \ + https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-4.1.1.tar.gz && \ + cd /opt/ && tar -xvzf ./openmpi-4.1.1.tar.gz && \ + cd openmpi-4.1.1 && ./configure && make && make install + +RUN git clone https://github.com/DeepRec-AI/HybridBackend.git /opt/HybridBackend + +ENV 
HYBRIDBACKEND_USE_CXX11_ABI=0 \ + HYBRIDBACKEND_WITH_ARROW_HDFS=ON \ + HYBRIDBACKEND_WITH_ARROW_S3=ON \ + TMP=/tmp + +RUN cd /opt/HybridBackend/build/arrow && \ + ARROW_USE_CXX11_ABI=${HYBRIDBACKEND_USE_CXX11_ABI} \ + ARROW_HDFS=${HYBRIDBACKEND_WITH_ARROW_HDFS} \ + ARROW_S3=${HYBRIDBACKEND_WITH_ARROW_S3} \ + ./build.sh /opt/arrow + +RUN pip install -U --no-cache-dir \ + Cython \ + nvidia-pyindex \ + pybind11 \ + tqdm && \ + pip install -U --no-cache-dir \ + nvidia-nsys-cli + +ARG TF_REPO=https://github.com/DeepRec-AI/DeepRec.git +ARG TF_TAG=main + +RUN git clone ${TF_REPO} -b ${TF_TAG} /opt/DeepRec + +RUN wget -nv -O /opt/DeepRec/install_bazel.sh \ + http://pythonrun.oss-cn-zhangjiakou.aliyuncs.com/bazel-0.26.1-installer-linux-x86_64.sh && \ + chmod 777 /opt/DeepRec/install_bazel.sh && /opt/DeepRec/install_bazel.sh + + +ENV TF_NEED_CUDA=1 \ + TF_CUDA_PATHS=/usr,/usr/local/cuda \ + TF_CUDA_VERSION=11.6 \ + TF_CUBLAS_VERSION=11 \ + TF_CUDNN_VERSION=8 \ + TF_NCCL_VERSION=2 \ + TF_CUDA_CLANG=0 \ + TF_DOWNLOAD_CLANG=0 \ + TF_NEED_TENSORRT=0 \ + TF_CUDA_COMPUTE_CAPABILITIES="7.0,8.0" \ + TF_ENABLE_XLA=1 \ + TF_NEED_MPI=0 \ + CC_OPT_FLAGS="-march=skylake -Wno-sign-compare" \ + CXX_OPT_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" + +RUN cd /opt/DeepRec && \ + yes "" | bash ./configure || true + +RUN --mount=type=cache,target=/var/cache/bazel.tensorflow \ + cd /opt/DeepRec && \ + bazel build \ + --disk_cache=/var/cache/bazel.tensorflow \ + --config=nogcp \ + --config=cuda \ + --config=xla \ + --verbose_failures \ + --cxxopt="${CXX_OPT_FLAGS}" \ + --host_cxxopt="${CXX_OPT_FLAGS}" \ + --define tensorflow_mkldnn_contraction_kernel=0 \ + //tensorflow/tools/pip_package:build_pip_package + +RUN mkdir -p /src/dist && \ + cd /opt/DeepRec && \ + ./bazel-bin/tensorflow/tools/pip_package/build_pip_package \ + /src/dist --gpu --project_name tensorflow + +RUN pip install --no-cache-dir --user \ + /src/dist/tensorflow-*.whl && \ + rm -f /src/dist/tensorflow-*.whl + +RUN mkdir -p \ + $(pip show tensorflow | grep Location | cut -d " " -f 2)/tensorflow_core/include/third_party/gpus/cuda/ && \ + ln -sf /usr/local/cuda/include \ + $(pip show tensorflow | grep Location | cut -d " " -f 2)/tensorflow_core/include/third_party/gpus/cuda/include + +RUN cd /opt/DeepRec/ && \ + cp tensorflow/core/kernels/gpu_device_array* \ + $(pip show tensorflow | grep Location | cut -d " " -f 2)/tensorflow_core/include/tensorflow/core/kernels + +RUN cd /opt/DeepRec && \ + bazel build --disk_cache=/var/cache/bazel.tensorflow \ + -j 16 -c opt --config=opt //tensorflow/tools/pip_package:build_sok && \ + ./bazel-bin/tensorflow/tools/pip_package/build_sok + +ENV ARROW_INCLUDE=/opt/arrow/include \ + ARROW_LIB=/opt/arrow/lib \ + ZSTD_LIB=/opt/arrow/lib + +# Configure HybridBackend +ENV HYBRIDBACKEND_WITH_CUDA=ON \ + HYBRIDBACKEND_WITH_NCCL=ON \ + HYBRIDBACKEND_WITH_ARROW_ZEROCOPY=ON \ + HYBRIDBACKEND_WITH_TENSORFLOW_HALF=OFF \ + HYBRIDBACKEND_WITH_TENSORFLOW_DISTRO=99881015 \ + HYBRIDBACKEND_USE_CXX11_ABI=0 \ + HYBRIDBACKEND_USE_RUFF=1 \ + HYBRIDBACKEND_WHEEL_ALIAS=-deeprec-cu116 \ + TF_DISABLE_EV_ALLOCATOR=true + +RUN cd /opt/HybridBackend && make -j32 + +RUN pip install --no-cache-dir --user \ + /opt/HybridBackend/build/wheel/hybridbackend_deeprec*.whl + +RUN rm -rf /opt/DeepRec /opt/HybridBackend && /opt/openmpi-4.1.1.tar.gz diff --git a/docs/docs_en/Collective-Training.md b/docs/docs_en/Collective-Training.md new file mode 100644 index 00000000000..dff578f6c8d --- /dev/null +++ b/docs/docs_en/Collective-Training.md @@ -0,0 +1,176 @@ +# Collective 
Training
+
+## Background
+
+Sparse recommendation models such as DLRM have a large number of parameters and heavy GEMM operations. The asynchronous training paradigm of PS makes it difficult to fully utilize the GPUs in a cluster to accelerate the whole training/inference process. We therefore try to place all parameters on the workers, but the memory consumed by the parameters (Embedding) is too large to fit on a single GPU, so the parameters need to be sharded across all GPUs. Native TensorFlow does not support model-parallel (MP) training, while the community offers several excellent TensorFlow-based plug-ins, such as HybridBackend (hereinafter referred to as HB) and SparseOperationKit (hereinafter referred to as SOK). DeepRec provides a unified synchronous training interface `CollectiveStrategy` for users to choose from, so that different synchronous training frameworks can be used with very little code change.
+
+## Interface Introduction
+
+1. Currently the interface supports HB and SOK; users choose between them through the environment variable `COLLECTIVE_STRATEGY`, which can be set to `hb` or `sok`, corresponding to HB and SOK respectively. Unlike a normal TensorFlow job, a synchronous training job has to be pulled up by an additional launcher module, started as follows:
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1 COLLECTIVE_STRATEGY=hb python3 -m tensorflow.python.distribute.launch 
+```
+If `CUDA_VISIBLE_DEVICES` is not set, the launcher by default spawns one training subprocess per GPU available in the current environment.
+
+2. In the user script, a `CollectiveStrategy` needs to be initialized to build the model.
+
+```python
+class CollectiveStrategy:
+    def scope(self, *args, **kwargs):
+        pass
+    def embedding_scope(self, **kwargs):
+        pass
+    def world_size(self):
+        pass
+    def rank(self):
+        pass
+    def estimator(self):
+        pass
+    def export_saved_model(self):
+        pass
+```
+
+Follow the steps below to use synchronous training:
+- Wrap the entire model definition in strategy.scope().
+- Use embedding_scope() where model parallelism is required (the embedding layer).
+- Use export_saved_model when exporting the model.
+- (Optional) The strategy also provides an estimator interface.
+
+## Example
+
+**MonitoredTrainingSession**
+
+The following example shows how to construct the graph with tf.train.MonitoredTrainingSession.
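+
+For instance, assuming the script below were saved as `train.py` (a hypothetical file name), it could be launched on two GPUs with:
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1 COLLECTIVE_STRATEGY=hb python3 -m tensorflow.python.distribute.launch python3 train.py
+```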
+ +```python +import tensorflow as tf +from tensorflow.python.distribute.group_embedding_collective_strategy import CollectiveStrategy + +#STEP1: initialize a collective strategy +strategy = CollectiveStrategy() +#STEP2: define the data parallel scope +with strategy.scope(), tf.Graph().as_default(): + #STEP3: define the model parallel scope + with strategy.embedding_scope(): + var = tf.get_variable( + 'var_1', + shape=(1000, 3), + initializer=tf.ones_initializer(tf.float32), + partitioner=tf.fixed_size_partitioner(num_shards=strategy.world_size()) + ) + emb = tf.nn.embedding_lookup( + var, tf.cast([0, 1, 2, 5, 6, 7], tf.int64)) + fun = tf.multiply(emb, 2.0, name='multiply') + loss = tf.reduce_sum(fun, name='reduce_sum') + opt = tf.train.FtrlOptimizer( + 0.1, + l1_regularization_strength=2.0, + l2_regularization_strength=0.00001) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + with tf.train.MonitoredTrainingSession('') as sess: + emb_result, loss_result, _ = sess.run([emb, loss, train_op]) + print (emb_result, loss_result) +``` + +**Estimator** + +The following example guides users how to construct Graph through tf.estimator.Estimator. +```python +import tensorflow as tf +import tensorflow_datasets as tfds +from tensorflow.python.distribute.group_embedding_collective_strategy import CollectiveStrategy + +#STEP1: initialize a collective strategy +strategy = CollectiveStrategy() +#STEP2: define the data parallel scope +with strategy.scope(), tf.Graph().as_default(): + def input_fn(): + ratings = tfds.load("movie_lens/100k-ratings", split="train") + ratings = ratings.map( + lambda x: { + "movie_id": tf.strings.to_number(x["movie_id"], tf.int64), + "user_id": tf.strings.to_number(x["user_id"], tf.int64), + "user_rating": x["user_rating"] + }) + shuffled = ratings.shuffle(1_000_000, + seed=2021, + reshuffle_each_iteration=False) + dataset = shuffled.batch(256) + return dataset + + def input_receiver(): + r'''Prediction input receiver. + ''' + inputs = { + "movie_id": tf.placeholder(dtype=tf.int64, shape=[None]), + "user_id": tf.placeholder(dtype=tf.int64, shape=[None]), + "user_rating": tf.placeholder(dtype=tf.float32, shape=[None]) + } + return tf.estimator.export.ServingInputReceiver(inputs, inputs) + + def model_fn(features, labels, mode, params): + r'''Model function for estimator. 
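+
+    Args:
+      features: dict of input tensors produced by input_fn.
+      labels: label tensor, used in TRAIN mode.
+      mode: a tf.estimator.ModeKeys value.
+      params: unused extra parameters.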
+    '''
+    del params
+    movie_id = features["movie_id"]
+    user_id = features["user_id"]
+    rating = features["user_rating"]
+
+    embedding_columns = [
+        tf.feature_column.embedding_column(
+            tf.feature_column.categorical_column_with_embedding(
+                "movie_id", dtype=tf.int64),
+            dimension=16,
+            initializer=tf.random_uniform_initializer(-1e-3, 1e-3)),
+        tf.feature_column.embedding_column(
+            tf.feature_column.categorical_column_with_embedding(
+                "user_id", dtype=tf.int64),
+            dimension=16,
+            initializer=tf.random_uniform_initializer(-1e-3, 1e-3))
+    ]
+    #STEP3: define the model parallel scope
+    with strategy.embedding_scope():
+        with tf.variable_scope(
+                'embedding',
+                partitioner=tf.fixed_size_partitioner(
+                    strategy.world_size())):
+            deep_features = [
+                tf.feature_column.input_layer(features, [c])
+                for c in embedding_columns]
+            emb = tf.concat(deep_features, axis=-1)
+            logits = tf.multiply(emb, 2.0, name='multiply')
+
+    if mode == tf.estimator.ModeKeys.TRAIN:
+        labels = tf.reshape(tf.to_float(labels), shape=[-1, 1])
+        loss = tf.reduce_mean(tf.keras.losses.binary_crossentropy(labels, logits))
+        step = tf.train.get_or_create_global_step()
+        opt = tf.train.AdagradOptimizer(learning_rate=0.01)
+        train_op = opt.minimize(loss, global_step=step)
+        return tf.estimator.EstimatorSpec(
+            mode=mode,
+            loss=loss,
+            train_op=train_op,
+            training_chief_hooks=[])
+
+    return None
+  estimator = strategy.estimator(model_fn=model_fn,
+                                 model_dir="./",
+                                 config=None)
+  estimator.train_and_evaluate(
+      tf.estimator.TrainSpec(
+          input_fn=input_fn,
+          max_steps=50),
+      tf.estimator.EvalSpec(
+          input_fn=input_fn))
+  estimator.export_saved_model("./", input_receiver)
+```
+
+## Appendix
+
+- Currently DeepRec provides a corresponding GPU image (alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04-hybridbackend); users can also refer to the [Dockerfile](../../cibuild/dockerfiles/Dockerfile.devel-py3.8-cu116-ubuntu20.04-hybridbackend).
+- We also provide more detailed demos of the two usage methods above, see: [ModelZoo](../../modelzoo/features/grouped_embedding)
+
+- If further optimization is required, HB and SOK expose more fine-tuning parameters; please refer to:
+[SOK](./SOK.md) and [HB](https://github.com/DeepRec-AI/HybridBackend)
diff --git a/docs/docs_zh/Collective-Training.md b/docs/docs_zh/Collective-Training.md
new file mode 100644
index 00000000000..488cb12d601
--- /dev/null
+++ b/docs/docs_zh/Collective-Training.md
@@ -0,0 +1,173 @@
+# Collective Training
+
+## 背景
+对于像DLRM类似的稀疏的推荐模型，通常有大量的参数以及复杂的矩阵运算。PS的异步训练范式，难以充分利用集群中的GPU来加速整个训练/推理过程。我们开始尝试将所有的参数放置在worker上，但是大规模参数(Embedding)占据了大量的显存导致无法储存在单个GPU上，我们需要对参数做并行切分存放。原生Tensorflow不支持模型并行(MP)，社区已有许多优秀的基于Tensorflow实现的addons，比如HybridBackend(以下简称HB)、SparseOperationKit(以下简称SOK)等等。DeepRec没有重复开发，而是提供了一个统一的同步训练的接口`CollectiveStrategy`供用户自行选择使用。用户可以以很少的代码改动来使用不同的同步训练框架。
+
+## 接口介绍
+
+1. 目前接口支持HB和SOK，用户可以通过环境变量`COLLECTIVE_STRATEGY`来选择。`COLLECTIVE_STRATEGY`可以配置hb、sok，分别对应HB和SOK方式。与正常启动Tensorflow任务的区别在于，用户使用同步训练的时候需要通过额外的模块拉起，需要通过以下方式启动：
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1 COLLECTIVE_STRATEGY=hb python3 -m tensorflow.python.distribute.launch
+```
+如果环境变量没有配置`CUDA_VISIBLE_DEVICES`，进程会默认拉起当前环境GPU数目的训练子进程。
+
+2. 
用户脚本中则需要初始化一个`CollectiveStrategy`,来完成模型的构建。 +```python +class CollectiveStrategy: + def scope(self, *args, **kwargs): + pass + def embedding_scope(self, **kwargs): + pass + def world_size(self): + pass + def rank(self): + pass + def estimator(self): + pass + def export_saved_model(self): + pass +``` + +使用同步该接口有以下几个步骤 +- 在整个模型定义前用strategy.scope()标记 +- 在需要模型并行的地方(embedding层)使用embedding_scope()标记 +- 在导出的时候使用export_saved_model +- (Optional)strategy还提供estimator接口给用户使用。 + +## 使用示例 + +**MonitoredTrainingSession** + +下面例子指导用户如何通过tf.train.MonitoredTrainingSession构图 +```python +import tensorflow as tf +from tensorflow.python.distribute.group_embedding_collective_strategy import CollectiveStrategy + +#STEP1: initialize a collective strategy +strategy = CollectiveStrategy() +#STEP2: define the data parallel scope +with strategy.scope(), tf.Graph().as_default(): + #STEP3: define the model parallel scope + with strategy.embedding_scope(): + var = tf.get_variable( + 'var_1', + shape=(1000, 3), + initializer=tf.ones_initializer(tf.float32), + partitioner=tf.fixed_size_partitioner(num_shards=strategy.world_size()) + ) + emb = tf.nn.embedding_lookup( + var, tf.cast([0, 1, 2, 5, 6, 7], tf.int64)) + fun = tf.multiply(emb, 2.0, name='multiply') + loss = tf.reduce_sum(fun, name='reduce_sum') + opt = tf.train.FtrlOptimizer( + 0.1, + l1_regularization_strength=2.0, + l2_regularization_strength=0.00001) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + with tf.train.MonitoredTrainingSession('') as sess: + emb_result, loss_result, _ = sess.run([emb, loss, train_op]) + print (emb_result, loss_result) +``` + +**Estimator** + +下面例子指导用户如何通过tf.estimator.Estimator构图 +```python +import tensorflow as tf +import tensorflow_datasets as tfds +from tensorflow.python.distribute.group_embedding_collective_strategy import CollectiveStrategy + +#STEP1: initialize a collective strategy +strategy = CollectiveStrategy() +#STEP2: define the data parallel scope +with strategy.scope(), tf.Graph().as_default(): + def input_fn(): + ratings = tfds.load("movie_lens/100k-ratings", split="train") + ratings = ratings.map( + lambda x: { + "movie_id": tf.strings.to_number(x["movie_id"], tf.int64), + "user_id": tf.strings.to_number(x["user_id"], tf.int64), + "user_rating": x["user_rating"] + }) + shuffled = ratings.shuffle(1_000_000, + seed=2021, + reshuffle_each_iteration=False) + dataset = shuffled.batch(256) + return dataset + + def input_receiver(): + r'''Prediction input receiver. + ''' + inputs = { + "movie_id": tf.placeholder(dtype=tf.int64, shape=[None]), + "user_id": tf.placeholder(dtype=tf.int64, shape=[None]), + "user_rating": tf.placeholder(dtype=tf.float32, shape=[None]) + } + return tf.estimator.export.ServingInputReceiver(inputs, inputs) + + def model_fn(features, labels, mode, params): + r'''Model function for estimator. 
+ ''' + del params + movie_id = features["movie_id"] + user_id = features["user_id"] + rating = features["user_rating"] + + embedding_columns = [ + tf.feature_column.embedding_column( + tf.feature_column.categorical_column_with_embedding( + "movie_id", dtype=tf.int64), + dimension=16, + initializer=tf.random_uniform_initializer(-1e-3, 1e-3)), + tf.feature_column.embedding_column( + tf.feature_column.categorical_column_with_embedding( + "user_id", dtype=tf.int64), + dimension=16, + initializer=tf.random_uniform_initializer(-1e-3, 1e-3)) + ] + #STEP3: define the model parallel scope + with strategy.embedding_scope(): + with tf.variable_scope( + 'embedding', + partitioner=tf.fixed_size_partitioner( + strategy.world_size)): + deep_features = [ + tf.feature_column.input_layer(features, [c]) + for c in embedding_columns] + emb = tf.concat(deep_features, axis=-1) + logits = tf.multiply(emb, 2.0, name='multiply') + + if mode == tf.estimator.ModeKeys.TRAIN: + labels = tf.reshape(tf.to_float(labels), shape=[-1, 1]) + loss = tf.reduce_mean(tf.keras.losses.binary_crossentropy(labels, logits)) + step = tf.train.get_or_create_global_step() + opt = tf.train.AdagradOptimizer(learning_rate=self._args.lr) + train_op = opt.minimize(loss, global_step=step) + return tf.estimator.EstimatorSpec( + mode=mode, + loss=loss, + train_op=train_op, + training_chief_hooks=[]) + + return None + estimator = strategy.estimator(model_fn=model_fn, + model_dir="./", + config=None) + estimator.train_and_evaluate( + tf.estimator.TrainSpec( + input_fn=input_fn, + max_steps=50), + tf.estimator.EvalSpec( + input_fn=input_fn)) + estimator.export_saved_model("./", input_receiver) +``` + +## 附录 + +- 目前DeepRec提供了相应的GPU镜像给用户使用(alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04-hybridbackend),用户也可以参考Dockerfile(../../cibuild/dockerfiles/Dockerfile.devel-py3.8-cu116-ubuntu20.04-hybridbackend) +- Modelzoo中关于上述两个使用方法还提供了更详细的demo,参见[ModelZoo](../../modelzoo/features/grouped_embedding) + +- 如果需要做进一步的优化,关于HB和SOK自身还有更多的微调参数,可以参考: +[SOK](./SOK.md) 和 [HB](https://github.com/DeepRec-AI/HybridBackend) \ No newline at end of file diff --git a/modelzoo/features/group_embedding/dcnv2/result/README.md b/modelzoo/features/group_embedding/dcnv2/result/README.md deleted file mode 100644 index ccec44eb9a5..00000000000 --- a/modelzoo/features/group_embedding/dcnv2/result/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# Result -Checkpoint & timeline file are default saved in this folder. 
diff --git a/modelzoo/features/group_embedding/script.sh b/modelzoo/features/group_embedding/script.sh deleted file mode 100644 index dcf5270da88..00000000000 --- a/modelzoo/features/group_embedding/script.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -#use_feature_column -horovodrun -np 4 -H localhost:4 python train.py --steps 10000 --use_feature_columns - -#embedding_variable -horovodrun -np 4 -H localhost:4 python train.py --steps 10000 \ No newline at end of file diff --git a/modelzoo/features/group_embedding/dcnv2/README.md b/modelzoo/features/grouped_embedding/dcnv2/README.md similarity index 100% rename from modelzoo/features/group_embedding/dcnv2/README.md rename to modelzoo/features/grouped_embedding/dcnv2/README.md diff --git a/modelzoo/features/group_embedding/dcnv2/data/README.md b/modelzoo/features/grouped_embedding/dcnv2/data/README.md similarity index 100% rename from modelzoo/features/group_embedding/dcnv2/data/README.md rename to modelzoo/features/grouped_embedding/dcnv2/data/README.md diff --git a/modelzoo/features/group_embedding/dcnv2/train.py b/modelzoo/features/grouped_embedding/dcnv2/train.py similarity index 68% rename from modelzoo/features/group_embedding/dcnv2/train.py rename to modelzoo/features/grouped_embedding/dcnv2/train.py index 478930442f8..eebe7fec4ea 100644 --- a/modelzoo/features/group_embedding/dcnv2/train.py +++ b/modelzoo/features/grouped_embedding/dcnv2/train.py @@ -28,6 +28,8 @@ import numpy as np +import pyarrow.parquet as pq + from ast import arg import time @@ -58,13 +60,15 @@ from tensorflow.python.framework import ops +import horovod.tensorflow as hvd + os.environ["TF_GPU_THREAD_MODE"] = "global" os.environ["TF_GPU_THREAD_COUNT"] = "16" -import horovod.tensorflow as hvd +group_embedding_type = os.getenv("COLLECTIVE_STRATEGY", "sok") +assert group_embedding_type in ["sok", "hb"] -#Enable group_embedding_lookup -tf.config.experimental.enable_distributed_strategy(strategy="collective") +from tensorflow.python.distribute.group_embedding_collective_strategy import CollectiveStrategy # Set to INFO for tracking training, default is WARN. ERROR for least messages tf.logging.set_verbosity(tf.logging.INFO) @@ -141,6 +145,17 @@ } +def build_placeholders(): + r'''Build input placeholders. + ''' + inputs = {} + for f in CONTINUOUS_COLUMNS: + inputs[f] = tf.placeholder(dtype=tf.float32, shape=[None]) + for f in CATEGORICAL_COLUMNS: + inputs[f] = tf.placeholder(dtype=tf.string, shape=[None]) + return inputs + + def transform_numeric(feature): r'''Transform numeric features. 
@@ -181,6 +196,7 @@ def minmaxscaler(col): return numeric_list + def transform_feature_column(): feature_columns = [] @@ -202,7 +218,6 @@ def minmaxscaler(col): return minmaxscaler - for column_name in CONTINUOUS_COLUMNS: normalizer_fn = None @@ -223,11 +238,11 @@ def minmaxscaler(col): filter_option=None) column = tf.feature_column.categorical_column_with_embedding( - key = column_name, + key=column_name, dtype=tf.int64, - ev_option = ev_opt + ev_option=ev_opt ) - + with tf.device("/gpu:0"): weight = tf.feature_column.embedding_column( categorical_column=column, @@ -239,6 +254,7 @@ def minmaxscaler(col): return feature_columns + def transform_features(sparse_features, dense_features): features = {} @@ -249,23 +265,23 @@ def transform_features(sparse_features, dense_features): numeric = dense_features[i] features[column_name] = numeric - + max_value = np.iinfo(dtypes.int64.as_numpy_dtype).max for i, column_name in enumerate(CATEGORICAL_COLUMNS): - category = tf.strings.to_hash_bucket_fast(sparse_features[i], max_value) - # ragged_tensor = tf.RaggedTensor.from_row_lengths( - # values=category, row_lengths=tf.ones_like(category)) + category = tf.strings.to_hash_bucket_fast( + sparse_features[i], max_value) sparse_tensor = fc._to_sparse_input_and_drop_ignore_values( - category) + category) sparse_tensor = tf.sparse.reshape(sparse_tensor, (-1, 1)) features[column_name] = sparse_tensor - + return features + def transform_categorical(feature): max_value = np.iinfo(dtypes.int64.as_numpy_dtype).max @@ -287,33 +303,31 @@ def transform_categorical(feature): ), embedding_dim=EMBEDDING_DIMENSIONS[column_name], ev_option=ev_opt) - category = tf.strings.to_hash_bucket_fast(feature[i], max_value) - i = CATEGORICAL_COLUMNS.index(column_name) - target_gpu = i % hvd.size() - - target_gpu = -1 - - embedding_weights.target_gpu = target_gpu - + variables.append(embedding_weights) - ragged_tensor = tf.RaggedTensor.from_row_lengths( - values=category, row_lengths=tf.ones_like(category)) + # Different type of sparse input for sok and hb. 
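+        # SOK consumes RaggedTensor ids, while HB consumes SparseTensor ids.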
+ if group_embedding_type == "sok": + ragged_tensor = tf.RaggedTensor.from_row_lengths( + values=category, row_lengths=tf.ones_like(category)) + indices.append(ragged_tensor) + else: + sparse_tensor = fc._to_sparse_input_and_drop_ignore_values( + category) - indices.append(ragged_tensor) - - variables.append(embedding_weights) + sparse_tensor = tf.sparse.reshape(sparse_tensor, (-1, 1)) + indices.append(sparse_tensor) combiners = ['sum' for _ in range(len(CATEGORICAL_COLUMNS))] - deep_features = tf.nn.group_embedding_lookup_sparse(variables, indices, combiners) - + deep_features = tf.nn.group_embedding_lookup_sparse( + variables, indices, combiners) return deep_features @@ -339,7 +353,7 @@ def stacked_dcn_v2(features, mlp_dims): cross_input_shape = [-1, sum([f.shape[-1] for f in features])] cross_input = tf.reshape(cross_input, cross_input_shape) - + cross_input_sq = tf.layers.dense( cross_input, cross_input.shape[-1], @@ -351,9 +365,9 @@ def stacked_dcn_v2(features, mlp_dims): cross_output = tf.reshape(cross_output, [-1, cross_input.shape[1]]) - if args.use_feature_columns: - cross_output_dim = (len(CATEGORICAL_COLUMNS+CONTINUOUS_COLUMNS) * (len(CATEGORICAL_COLUMNS+CONTINUOUS_COLUMNS) + 1)) / 2 + cross_output_dim = (len(CATEGORICAL_COLUMNS+CONTINUOUS_COLUMNS) + * (len(CATEGORICAL_COLUMNS+CONTINUOUS_COLUMNS) + 1)) / 2 else: cross_output_dim = (len(features) * (len(features) + 1)) / 2 @@ -396,219 +410,152 @@ def stacked_dcn_v2(features, mlp_dims): def build_model_input(filename, batch_size, num_epochs): - def parse_csv(value): - + def parse_parquet(value): tf.logging.info('Parsing {}'.format(filename)) + labels = value.pop(LABEL_COLUMN[0]) + dense_feature = [value[name] for name in CONTINUOUS_COLUMNS] - cont_defaults = [[0.0] for i in range(1, 14)] - - cate_defaults = [[' '] for i in range(1, 27)] - - label_defaults = [[0]] - - column_headers = TRAIN_DATA_COLUMNS - - record_defaults = label_defaults + cont_defaults + cate_defaults - - columns = tf.io.decode_csv(value, record_defaults=record_defaults) - - all_columns = collections.OrderedDict(zip(column_headers, columns)) - - labels = all_columns.pop(LABEL_COLUMN[0]) - - dense_feature = [all_columns[name] for name in CONTINUOUS_COLUMNS] - - sparse_feature = [all_columns[name] for name in CATEGORICAL_COLUMNS] - + sparse_feature = [value[name] for name in CATEGORICAL_COLUMNS] return dense_feature, sparse_feature, labels '''Work Queue Feature''' - if args.workqueue and not args.tf: - from tensorflow.python.ops.work_queue import WorkQueue - - work_queue = WorkQueue([filename]) - + work_queue = WorkQueue([filename], num_epochs=num_epochs) # For multiple files: - # work_queue = WorkQueue([filename, filename1,filename2,filename3]) - files = work_queue.input_dataset() - else: - files = filename - # Extract lines from input files using the Dataset API. 
- - dataset = tf.data.TextLineDataset(files) - - dataset = dataset.shuffle(buffer_size=20000, - seed=args.seed) # fix seed for reproducing - - dataset = dataset.repeat(num_epochs) - - dataset = dataset.batch(batch_size) - - dataset = dataset.map(parse_csv, num_parallel_calls=28) + from tensorflow.python.data.experimental.ops import parquet_dataset_ops + dataset = parquet_dataset_ops.ParquetDataset(files, batch_size=batch_size) + dataset = dataset.map(parse_parquet, num_parallel_calls=28) dataset = dataset.prefetch(2) - return dataset -def main(): - - # check dataset and count data set size - - print("Checking dataset...") - - train_file = args.data_location + '/train.csv' - - if (not os.path.exists(train_file)): - - print("Dataset does not exist in the given data_location.") - - sys.exit() - - no_of_training_examples = sum(1 for line in open(train_file)) - - print("Numbers of training dataset is {}".format(no_of_training_examples)) - - # set batch size, eporch & steps +def model_fn(strategy, sparse_feature, dense_feature): + with strategy.embedding_scope(): + if args.use_feature_columns: - assert args.batch_size % hvd.size() == 0 + feature_columns = transform_feature_column() - batch_size = int(args.batch_size / hvd.size()) + features = transform_features(sparse_feature, dense_feature) - if args.steps == 0: + input_features = tf.feature_column.input_layer( + features, feature_columns) + else: - no_of_epochs = 1 + deep_features = transform_categorical(sparse_feature) - train_steps = math.ceil( - (float(no_of_epochs) * no_of_training_examples) / batch_size) + wide_features = transform_numeric(dense_feature) - else: + input_features = deep_features + wide_features - no_of_epochs = math.ceil( - (float(batch_size) * args.steps) / no_of_training_examples) + logits = stacked_dcn_v2(features=input_features, + mlp_dims=[1024, 512, 256, 1]) + return logits - train_steps = args.steps - print("The training steps is {}".format(train_steps)) +def main(strategy): # set fixed random seed - tf.set_random_seed(args.seed) - # create data pipline of train & test dataset - - with tf.device('/cpu:0'): - - train_dataset = build_model_input(train_file, batch_size, no_of_epochs) - - iterator = tf.data.Iterator.from_structure(train_dataset.output_types, - train_dataset.output_shapes) - - next_element = iterator.get_next() - - train_init_op = iterator.make_initializer(train_dataset) - - dense_feature, sparse_feature, labels = next_element[0], next_element[ - 1], next_element[2] - - input_features = None - - if args.use_feature_columns: - - feature_columns = transform_feature_column() - - features = transform_features(sparse_feature, dense_feature) - - input_features = tf.feature_column.input_layer(features, feature_columns) - else: - - deep_features = transform_categorical(sparse_feature) - - wide_features = transform_numeric(dense_feature) + # assert args.steps % no_of_training_examples == 0 - input_features = deep_features + wide_features + if args.mode == "train": + print("Checking dataset...") - logits = stacked_dcn_v2(features=input_features, - mlp_dims=[1024, 512, 256, 1]) + train_file = args.data_location + '/train.parquet' - labels = tf.reshape(labels, (-1, 1)) + if (not os.path.exists(train_file)): - loss = tf.reduce_mean(tf.keras.losses.binary_crossentropy(labels, logits)) + print("Dataset does not exist in the given data_location.") - loss = hvd.allreduce(loss, op=hvd.Sum) + sys.exit() - step = tf.train.get_or_create_global_step() + batch_size = int(args.batch_size / 4) - opt = 
tf.train.AdagradOptimizer(learning_rate=0.01) + no_of_training_examples = pq.read_table(train_file).num_rows - train_op = opt.minimize(loss, global_step=step) + no_of_epochs = math.ceil( + (float(batch_size) * args.steps) / no_of_training_examples) - # Session config + train_steps = args.steps - sess_config = tf.ConfigProto() + print("Numbers of training dataset is {}".format(no_of_training_examples)) - sess_config.gpu_options.visible_device_list = str(hvd.local_rank()) + print("The training steps is {}".format(train_steps)) - sess_config.gpu_options.allow_growth = True + # create data pipline of train & test dataset - # # Session hooks + with tf.device('/cpu:0'): - hooks = [] + train_dataset = build_model_input( + train_file, batch_size, no_of_epochs) - # if args.smartstaged and not args.tf: + iterator = tf.data.make_one_shot_iterator(train_dataset) + next_element = iterator.get_next() + dense_feature, sparse_feature, labels = next_element[0], \ + next_element[1], \ + next_element[2] - # '''Smart staged Feature''' + logits = model_fn(strategy, sparse_feature, dense_feature) + labels = tf.reshape(labels, (-1, 1)) - # next_element = tf.staged(next_element, num_threads=4, capacity=40) + loss = tf.reduce_mean( + tf.keras.losses.binary_crossentropy(labels, logits)) - # sess_config.graph_options.optimizer_options.do_smart_stage = True + step = tf.train.get_or_create_global_step() - # hooks.append(tf.make_prefetch_hook()) + opt = tf.train.AdagradOptimizer(learning_rate=0.01) - # if args.op_fusion and not args.tf: + train_op = opt.minimize(loss, global_step=step) - # '''Auto Graph Fusion''' + hooks = [] - # sess_config.graph_options.optimizer_options.do_op_fusion = True + options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - # if args.micro_batch and not args.tf: + run_metadata = tf.RunMetadata() - # '''Auto Mirco Batch''' + stop_hook = tf.train.StopAtStepHook(last_step=train_steps) - # sess_config.graph_options.optimizer_options.micro_batch_num = args.micro_batch - scaffold = tf.train.Scaffold(local_init_op=tf.group( - tf.local_variables_initializer(), train_init_op)) + hooks.append(stop_hook) - options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() + log_hook = tf.train.LoggingTensorHook({ + 'steps': step, + 'loss': loss, + }, every_n_iter=500) + hooks.append(log_hook) - stop_hook = tf.train.StopAtStepHook(last_step=train_steps) + with tf.train.MonitoredTrainingSession(hooks=hooks, + checkpoint_dir=args.checkpoint, + config=None) as sess: - hooks.append(stop_hook) + while not sess.should_stop(): + sess.run([loss, train_op]) - log_hook = tf.train.LoggingTensorHook({ - 'steps': step, - 'loss': loss, - }, every_n_iter=500) - hooks.append(log_hook) + print("Training completed.") - with tf.train.MonitoredTrainingSession(master = '', - hooks=hooks, - checkpoint_dir=checkpoint_dir, - scaffold=scaffold, - config=sess_config) as sess: + elif args.mode == "export": + def on_export(): + sparse_feature, dense_feature = [], [] + inputs = build_placeholders() + for fname, feat in inputs.items(): + if fname in CONTINUOUS_COLUMNS: + dense_feature.append(feat) + elif fname in CATEGORICAL_COLUMNS: + sparse_feature.append(feat) + logits = model_fn(strategy, sparse_feature, dense_feature) + return tf.saved_model.predict_signature_def(inputs, {'score': logits}) - while not sess.should_stop(): - sess.run([loss, train_op]) - - print("Training completed.") + strategy.export_saved_model("./saved_model", + tf.train.latest_checkpoint( + args.checkpoint), + on_export) def 
boolean_string(string): @@ -629,6 +576,12 @@ def get_arg_parser(): parser = argparse.ArgumentParser() + parser.add_argument('--checkpoint', + help='Full path to checkpoints input/output. \ + Default to ./result/$MODEL_TIMESTAMP', + required=False, + default='./') + parser.add_argument('--data_location', help='Full path of train data', required=False, @@ -654,7 +607,13 @@ def get_arg_parser(): type=boolean_string, default=False) - parser.add_argument('--use_feature_columns', action='store_true') + parser.add_argument('--mode', + help='Mode.', + type=str, + choices=["train", "export"], + default="train") + + parser.add_argument('--use_feature_columns', action='store_true') return parser @@ -703,4 +662,7 @@ def set_env_for_DeepRec(): set_env_for_DeepRec() - main() + strategy = CollectiveStrategy() + + with strategy.scope(): + main(strategy) diff --git a/modelzoo/features/grouped_embedding/deepfm/README.md b/modelzoo/features/grouped_embedding/deepfm/README.md new file mode 100644 index 00000000000..49eb0c0dab8 --- /dev/null +++ b/modelzoo/features/grouped_embedding/deepfm/README.md @@ -0,0 +1,286 @@ +# DeepFM + +The following is a brief directory structure and description for this example: +``` +├── data # Data set directory +│ └── README.md # Documentation describing how to prepare dataset +├── distribute_k8s # Distributed training related files +│ ├── distribute_k8s_BF16.yaml # k8s yaml to crate a training job with BF16 feature +│ ├── distribute_k8s_FP32.yaml # k8s yaml to crate a training job +│ └── launch.py # Script to set env for distributed training +├── README.md # Documentation +├── result # Output directory +│ └── README.md # Documentation describing output directory +└── train.py # Training script +``` + +## Content +- [DeepFM](#deepfm) + - [Content](#content) + - [Model Structure](#model-structure) + - [Usage](#usage) + - [Stand-alone Training](#stand-alone-training) + - [Distribute Training](#distribute-training) + - [Benchmark](#benchmark) + - [Stand-alone Training](#stand-alone-training-1) + - [Test Environment](#test-environment) + - [Performance Result](#performance-result) + - [Distributed Training](#distributed-training) + - [Test Environment](#test-environment-1) + - [Performance Result](#performance-result-1) + - [Dataset](#dataset) + - [Prepare](#prepare) + - [Fields](#fields) + - [Processing](#processing) + +## Model Structure +[DeepFM](https://arxiv.org/abs/1703.04247) is a CRT recommender model proposed in 2017 which combines the power of factorization machines for recommendation and deep learning for feature learning in a new neural network architecture. Compared to WDL model, wide and deep part of DeepFM share input so that feature engineering besides raw features is not needed. +The model's output is the probability of a click calculated by the output of FM and DNN model. +``` +output: + probability of a click +model: + /|\ + | + _____________________> ADD <______________________ + / \ + ________|________ ________|________ + | | | | + | | | | + | | | | + | FM | | DNN | + | | | | + | | | | + |_________________| |_________________| + | | + |_______________________________________________________| + ____|_____ + / \ + / |_Emb_|____|__| + | | +input: | | + [dense features, sparse features] +``` + +## Usage + +### Stand-alone Training +1. Please prepare the data set and DeepRec env. + 1. Manually + - Follow [dataset preparation](#prepare) to prepare data set. 
+     - Download code by `git clone https://github.com/alibaba/DeepRec`
+     - Follow [How to Build](https://github.com/alibaba/DeepRec#how-to-build) to build the DeepRec whl package and install it by `pip install $DEEPREC_WHL`.
+   2. *Docker (Recommended)*
+      ```
+      docker pull alideeprec/deeprec-release-modelzoo:latest
+      docker run -it alideeprec/deeprec-release-modelzoo:latest /bin/bash
+
+      # In docker container
+      cd /root/modelzoo/deepfm
+      ```
+
+2. Training.
+   ```
+   python train.py
+
+   # Memory acceleration with jemalloc.
+   # The required ENV `MALLOC_CONF` is already set in the code.
+   LD_PRELOAD=./libjemalloc.so.2.5.1 python train.py
+   ```
+   Use argument `--bf16` to enable the DeepRec BF16 feature.
+   ```
+   python train.py --bf16
+
+   # Memory acceleration with jemalloc.
+   # The required ENV `MALLOC_CONF` is already set in the code.
+   LD_PRELOAD=./libjemalloc.so.2.5.1 python train.py --bf16
+   ```
+   In the community TensorFlow environment, use argument `--tf` to disable all DeepRec features.
+   ```
+   python train.py --tf
+   ```
+   Use arguments to set up a custom configuration:
+   - DeepRec Features:
+     - `export START_STATISTIC_STEP` and `export STOP_STATISTIC_STEP`: Set ENV to configure CPU memory optimization. This is already set to 100 & 110 in the code by default.
+     - `--bf16`: Enable the DeepRec BF16 feature. Use FP32 by default.
+     - `--emb_fusion`: Whether to enable embedding fusion. Default to True.
+     - `--op_fusion`: Whether to enable the Auto Graph Fusion feature. Default to True.
+     - `--optimizer`: Choose the optimizer for the deep model from ['adam', 'adamasync', 'adagraddecay', 'adagrad']. Use adamasync by default.
+     - `--smartstaged`: Whether to enable the smart staged feature of DeepRec. Default to True.
+     - `--micro_batch`: Set num for Auto Micro Batch. Default 0 to close. (Not really enabled)
+     - `--ev`: Whether to enable DeepRec EmbeddingVariable. Default to False.
+     - `--group_embedding`: Use GroupEmbedding features.
+     - `--adaptive_emb`: Whether to enable Adaptive Embedding. Default to False.
+     - `--ev_elimination`: Set Feature Elimination of the EmbeddingVariable feature. Options [None, 'l2', 'gstep'], default to None.
+     - `--ev_filter`: Set Feature Filter of the EmbeddingVariable feature. Options [None, 'counter', 'cbf'], default to None.
+     - `--dynamic_ev`: Whether to enable Dynamic-dimension Embedding Variable. Default to False. (Not really enabled)
+     - `--incremental_ckpt`: Set the interval for saving incremental checkpoints. Default 0 to close.
+     - `--workqueue`: Whether to enable Work Queue. Default to False.
+     - `--protocol`: Set the protocol ['grpc', 'grpc++', 'star_server'] used when starting servers in distributed training. Default to grpc.
+     - `--parquet_dataset`: Whether to enable ParquetDataset. Default is `True`.
+     - `--parquet_dataset_shuffle`: Whether to enable the shuffle operation for ParquetDataset. Default to `False`.
+   - Basic Settings:
+     - `--data_location`: Full path of train & eval data, default to `./data`.
+     - `--steps`: Set the number of steps on the train dataset. Default will be set to 1 epoch.
+     - `--no_eval`: Do not evaluate the trained model on the eval dataset.
+     - `--batch_size`: Batch size to train. Default to 512.
+     - `--output_dir`: Full path to the output directory for logs and saved model, default to `./result`.
+     - `--checkpoint`: Full path to checkpoints input/output directory, default to `$(OUTPUT_DIR)/model_$(MODEL_NAME)_$(TIMESTAMPS)`
+     - `--save_steps`: Set the number of steps on saving checkpoints, zero to close. Default will be set to 0.
+     - `--seed`: Set the random seed for tensorflow.
+     - `--timeline`: Save steps of profile hooks to record timeline, zero to close, default to 0.
+     - `--keep_checkpoint_max`: Maximum number of recent checkpoints to keep. Default to 1.
+     - `--learning_rate`: Learning rate for the deep network. Default to 0.001.
+     - `--inter`: Set inter op parallelism threads. Default to 0.
+     - `--intra`: Set intra op parallelism threads. Default to 0.
+     - `--input_layer_partitioner`: Slice size of input layer partitioner (units MB).
+     - `--dense_layer_partitioner`: Slice size of dense layer partitioner (units kB).
+     - `--tf`: Use TF 1.15.5 API and disable DeepRec features.
+
+
+### Distribute Training
+1. Prepare a K8S cluster. [Alibaba Cloud ACK Service (Alibaba Cloud Container Service for Kubernetes)](https://cn.aliyun.com/product/kubernetes) can quickly create a Kubernetes cluster.
+2. Prepare a shared storage volume. For Alibaba Cloud ACK, [OSS (Object Storage Service)](https://cn.aliyun.com/product/oss) can be used as a shared storage volume.
+3. Create a PVC (PersistentVolumeClaim) named `deeprec` for the storage volume in the cluster.
+4. Prepare a docker image. `alideeprec/deeprec-release-modelzoo:latest` is recommended.
+5. Create a k8s job from the `.yaml` file to run distributed training.
+   ```
+   kubectl create -f $YAML_FILE
+   ```
+6. Show the training log by `kubectl logs -f trainer-worker-0`.
+
+
+## Benchmark
+### Stand-alone Training
+#### Test Environment
+The benchmark is performed on the [Alibaba Cloud ECS general purpose instance family with high clock speeds - **ecs.hfg7.2xlarge**](https://help.aliyun.com/document_detail/25378.html?spm=5176.2020520101.vmBInfo.instanceType.4a944df5PvCcED#hfg7).
+- Hardware
+  - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz
+  - CPU(s): 8
+  - Socket(s): 1
+  - Core(s) per socket: 4
+  - Thread(s) per core: 2
+  - Memory: 32G
+
+- Software
+  - kernel: 4.18.0-348.2.1.el8_5.x86_64
+  - OS: CentOS Linux release 8.5.2111
+  - GCC: 8.5.0
+  - Docker: 20.10.12
+  - Python: 3.6.8
+
+#### Performance Result
+
+| Model  | Framework            | DType     | Accuracy | AUC      | Throughput          |
+| ------ | -------------------- | --------- | -------- | -------- | ------------------- |
+| DeepFM | Community TensorFlow | FP32      | 0.784695 | 0.781548 | 18848.64 (baseline) |
+| DeepFM | DeepRec w/ oneDNN    | FP32      | 0.782755 | 0.777158 | 31260.00 (1.65x)    |
+| DeepFM | DeepRec w/ oneDNN    | FP32+BF16 | 0.782659 | 0.776537 | 34627.46 (1.84x)    |
+
+- Community TensorFlow version is v1.15.5.
+
+### Distributed Training
+#### Test Environment
+The benchmark is performed on the [Alibaba Cloud ACK Service (Alibaba Cloud Container Service for Kubernetes)](https://cn.aliyun.com/product/kubernetes); the K8S cluster is composed of the following ten machines.
+
+- Hardware
+  - Model name: Intel(R) Xeon(R) Platinum 8369HC CPU @ 3.30GHz
+  - CPU(s): 8
+  - Socket(s): 1
+  - Core(s) per socket: 4
+  - Thread(s) per core: 2
+  - Memory: 32G
+
+#### Performance Result
+
+| Model  | Framework            | Protocol | DType     | Throughput |
+| ------ | -------------------- | -------- | --------- | ---------- |
+| DeepFM | Community TensorFlow | GRPC     | FP32      |            |
+| DeepFM | DeepRec w/ oneDNN    | GRPC     | FP32      |            |
+| DeepFM | DeepRec w/ oneDNN    | GRPC     | FP32+BF16 |            |
+
+- Community TensorFlow version is v1.15.5.
+
+## Dataset
+The train & eval datasets use the ***Kaggle Display Advertising Challenge Dataset (Criteo Dataset)***.
+### Prepare
+We provide the dataset in two formats:
+1. **CSV Format**
+Put data files **train.csv & eval.csv** into ./data/
+For details of data download, see [Data Preparation](data/README.md).
+2. **Parquet Format**
+Put data files **train.parquet & eval.parquet** into ./data/
+These files are available at [Criteo Parquet Dataset](https://deeprec-dataset.oss-cn-beijing.aliyuncs.com/parquet_dataset/criteo_categorical_string.tar.gz).
+
+### Fields
+Total 40 columns:
+**[0]:Label** - Target variable that indicates if an ad was clicked or not (1 or 0)
+**[1-13]:I1-I13** - A total of 13 columns of integer continuous features (mostly count features)
+**[14-39]:C1-C26** - A total of 26 columns of categorical features. The values have been hashed onto 32 bits for anonymization purposes.
+
+The integer columns' distributions are as follows:
+| Column | 1    | 2     | 3     | 4   | 5       | 6      | 7     | 8    | 9     | 10  | 11  | 12   | 13   |
+| ------ | ---- | ----- | ----- | --- | ------- | ------ | ----- | ---- | ----- | --- | --- | ---- | ---- |
+| Min    | 0    | -3    | 0     | 0   | 0       | 0      | 0     | 0    | 0     | 0   | 0   | 0    | 0    |
+| Max    | 1539 | 22066 | 65535 | 561 | 2655388 | 233523 | 26279 | 5106 | 24376 | 9   | 181 | 1807 | 6879 |
+
+The categorical columns' numbers of distinct values are as follows:
+| column | C1   | C2  | C3      | C4     | C5  | C6  | C7    | C8  | C9  | C10   | C11  | C12     | C13  | C14 | C15   | C16     | C17 | C18  | C19  | C20 | C21     | C22 | C23 | C24    | C25 | C26   |
+| ------ | ---- | --- | ------- | ------ | --- | --- | ----- | --- | --- | ----- | ---- | ------- | ---- | --- | ----- | ------- | --- | ---- | ---- | --- | ------- | --- | --- | ------ | --- | ----- |
+| nums   | 1396 | 553 | 2594031 | 698469 | 290 | 23  | 12048 | 608 | 3   | 65156 | 5309 | 2186509 | 3128 | 26  | 12750 | 1537323 | 10  | 5002 | 2118 | 4   | 1902327 | 17  | 15  | 135790 | 94  | 84305 |
+
+### Processing
+- Integer columns **I[1-13]** are processed with the `tf.feature_column.numeric_column()` function, and the data is normalized.
+  To save time, the data required for normalization has been calculated in advance.
+- Categorical columns **C[1-26]** are processed with the `tf.feature_column.embedding_column()` function after using the `tf.feature_column.categorical_column_with_hash_bucket()` function, as sketched below.
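For reference, a minimal sketch of the processing described above, using the TF 1.15 feature-column API. The hash bucket size, the normalization constant, and the helper name `make_criteo_columns` are illustrative placeholders, not the exact values used in the patched train.py:

```python
import tensorflow as tf

CONTINUOUS_COLUMNS = ['I' + str(i) for i in range(1, 14)]   # I1-I13
CATEGORICAL_COLUMNS = ['C' + str(i) for i in range(1, 27)]  # C1-C26

def make_criteo_columns(hash_bucket_size=10000, embedding_dim=16):
    # Numeric columns: normalized with precomputed statistics; a single
    # placeholder divisor stands in for the real per-column min-max values.
    columns = [
        tf.feature_column.numeric_column(
            name, normalizer_fn=lambda x: x / 10000.0)
        for name in CONTINUOUS_COLUMNS
    ]
    # Categorical columns: hash the string ids into a fixed bucket space,
    # then learn a dense embedding on top of the hashed ids.
    for name in CATEGORICAL_COLUMNS:
        hashed = tf.feature_column.categorical_column_with_hash_bucket(
            name, hash_bucket_size=hash_bucket_size, dtype=tf.string)
        columns.append(tf.feature_column.embedding_column(
            hashed, dimension=embedding_dim, combiner='mean'))
    return columns

# The columns are then consumed through an input layer, e.g.:
# dense = tf.feature_column.input_layer(features, make_criteo_columns())
```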
diff --git a/modelzoo/features/grouped_embedding/deepfm/data/README.md b/modelzoo/features/grouped_embedding/deepfm/data/README.md new file mode 100644 index 00000000000..058f8afce71 --- /dev/null +++ b/modelzoo/features/grouped_embedding/deepfm/data/README.md @@ -0,0 +1,10 @@ +# Dataset +## Prepare dataset +Put data file **train.csv & eval.csv** into ./data/ + +Download Kaggle Display Advertising Challenge Dataset from http://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/ + +The evaluation dataset for accuracy measurement is not available in the above link can be downloaded from https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/eval.csv + +Download the train dataset(in csv format) from https://storage.googleapis.com/dataset-uploader/criteo-kaggle/large_version/train.csv + diff --git a/modelzoo/features/grouped_embedding/deepfm/train.py b/modelzoo/features/grouped_embedding/deepfm/train.py new file mode 100644 index 00000000000..670e6c0bd06 --- /dev/null +++ b/modelzoo/features/grouped_embedding/deepfm/train.py @@ -0,0 +1,872 @@ +# Copyright (c) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import time +import contextlib +import argparse +import tensorflow as tf +import os +import sys +import math +import collections +from tensorflow.python.client import timeline +import json + +from tensorflow.python.ops import partitioned_variables + +group_embedding_type = os.getenv("COLLECTIVE_STRATEGY", "sok") + +assert group_embedding_type in ["sok", "hb", "none"] + +# Set to INFO for tracking training, default is WARN. ERROR for least messages +tf.logging.set_verbosity(tf.logging.INFO) +print("Using TensorFlow version %s" % (tf.__version__)) + +# Definition of some constants +CONTINUOUS_COLUMNS = ['I' + str(i) for i in range(1, 14)] # 1-13 inclusive +CATEGORICAL_COLUMNS = ['C' + str(i) for i in range(1, 27)] # 1-26 inclusive +LABEL_COLUMN = ['clicked'] +TRAIN_DATA_COLUMNS = LABEL_COLUMN + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS +FEATURE_COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS +HASH_BUCKET_SIZES = { + 'C1': 2500, + 'C2': 2000, + 'C3': 5000000, + 'C4': 1500000, + 'C5': 1000, + 'C6': 100, + 'C7': 20000, + 'C8': 4000, + 'C9': 20, + 'C10': 100000, + 'C11': 10000, + 'C12': 5000000, + 'C13': 40000, + 'C14': 100, + 'C15': 100, + 'C16': 3000000, + 'C17': 50, + 'C18': 10000, + 'C19': 4000, + 'C20': 20, + 'C21': 4000000, + 'C22': 100, + 'C23': 100, + 'C24': 250000, + 'C25': 400, + 'C26': 100000 +} + + +def build_receive_fn(): + r'''Build input placeholders. 
+ ''' + inputs = {} + for f in CONTINUOUS_COLUMNS: + inputs[f] = tf.placeholder( + dtype=tf.float32, shape=[None]) + for f in CATEGORICAL_COLUMNS: + inputs[f] = tf.placeholder( + dtype=tf.string, shape=[None]) + return tf.estimator.export.ServingInputReceiver(inputs, inputs) + + +class DeepFM(): + def __init__(self, + wide_column=None, + fm_column=None, + deep_column=None, + dnn_hidden_units=[1024, 256, 32], + final_hidden_units=[128, 64], + optimizer_type='adam', + learning_rate=0.001, + use_bn=True, + bf16=False, + stock_tf=None, + adaptive_emb=False, + input_layer_partitioner=None, + dense_layer_partitioner=None, + strategy=None): + + self._wide_column = wide_column + self._deep_column = deep_column + self._fm_column = fm_column + if not wide_column or not fm_column or not deep_column: + raise ValueError( + 'Wide column, FM column or Deep column is not defined.') + + self.tf = stock_tf + self.bf16 = False if self.tf else bf16 + self.is_training = True + self.use_bn = use_bn + self._adaptive_emb = adaptive_emb + + self._dnn_hidden_units = dnn_hidden_units + self._final_hidden_units = final_hidden_units + self._optimizer_type = optimizer_type + self._learning_rate = learning_rate + self._input_layer_partitioner = input_layer_partitioner + self._dense_layer_partitioner = dense_layer_partitioner + self._strategy = strategy + + # used to add summary in tensorboard + def _add_layer_summary(self, value, tag): + tf.summary.scalar('%s/fraction_of_zero_values' % tag, + tf.nn.zero_fraction(value)) + tf.summary.histogram('%s/activation' % tag, value) + + def _dnn(self, dnn_input, dnn_hidden_units=None, layer_name=''): + for layer_id, num_hidden_units in enumerate(dnn_hidden_units): + with tf.variable_scope(layer_name + '_%d' % layer_id, + partitioner=self._dense_layer_partitioner, + reuse=tf.AUTO_REUSE) as dnn_layer_scope: + dnn_input = tf.layers.dense( + dnn_input, + num_hidden_units, + activation=tf.nn.relu, + name=dnn_layer_scope) + if self.use_bn: + dnn_input = tf.layers.batch_normalization( + dnn_input, training=self.is_training, trainable=True) + # self._add_layer_summary(dnn_input, dnn_layer_scope.name) + + return dnn_input + + def _create_model(self): + # input features + if self._strategy is not None: + with self._strategy.embedding_scope(), tf.variable_scope('input_layer', + partitioner=self._input_layer_partitioner, + reuse=tf.AUTO_REUSE): + fm_cols = {} + if self._adaptive_emb and not self.tf: + '''Adaptive Embedding Feature Part 1 of 2''' + adaptive_mask_tensors = {} + for col in CATEGORICAL_COLUMNS: + adaptive_mask_tensors[col] = tf.ones([args.batch_size], + tf.int32) + dnn_input = tf.feature_column.input_layer( + features=self._feature, + feature_columns=self._deep_column, + adaptive_mask_tensors=adaptive_mask_tensors) + wide_input = tf.feature_column.input_layer( + self._feature, self._wide_column, cols_to_output_tensors=fm_cols, adaptive_mask_tensors=adaptive_mask_tensors) + else: + dnn_input = tf.feature_column.input_layer(self._feature, + self._deep_column) + wide_input = tf.feature_column.input_layer( + self._feature, self._wide_column, cols_to_output_tensors=fm_cols) + + fm_input = tf.stack([fm_cols[cols] + for cols in self._fm_column], 1) + else: + with tf.variable_scope('input_layer', + partitioner=self._input_layer_partitioner, + reuse=tf.AUTO_REUSE): + fm_cols = {} + if self._adaptive_emb and not self.tf: + '''Adaptive Embedding Feature Part 1 of 2''' + adaptive_mask_tensors = {} + for col in CATEGORICAL_COLUMNS: + adaptive_mask_tensors[col] = tf.ones([args.batch_size], + 
tf.int32) + dnn_input = tf.feature_column.input_layer( + features=self._feature, + feature_columns=self._deep_column, + adaptive_mask_tensors=adaptive_mask_tensors) + wide_input = tf.feature_column.input_layer( + self._feature, self._wide_column, cols_to_output_tensors=fm_cols, + adaptive_mask_tensors=adaptive_mask_tensors) + else: + dnn_input = tf.feature_column.input_layer(self._feature, + self._deep_column) + wide_input = tf.feature_column.input_layer( + self._feature, self._wide_column, cols_to_output_tensors=fm_cols) + + fm_input = tf.stack([fm_cols[cols] + for cols in self._fm_column], 1) + + if self.bf16: + wide_input = tf.cast(wide_input, dtype=tf.bfloat16) + fm_input = tf.cast(fm_input, dtype=tf.bfloat16) + dnn_input = tf.cast(dnn_input, dtype=tf.bfloat16) + + # DNN part + dnn_scope = tf.variable_scope('dnn') + with dnn_scope.keep_weights(dtype=tf.float32) if self.bf16 \ + else dnn_scope: + dnn_output = self._dnn(dnn_input, self._dnn_hidden_units, + 'dnn_layer') + + # linear / fisrt order part + with tf.variable_scope('linear', reuse=tf.AUTO_REUSE) as linear: + linear_output = tf.reduce_sum(wide_input, axis=1, keepdims=True) + + # FM second order part + with tf.variable_scope('fm', reuse=tf.AUTO_REUSE) as fm: + sum_square = tf.square(tf.reduce_sum(fm_input, axis=1)) + square_sum = tf.reduce_sum(tf.square(fm_input), axis=1) + fm_output = 0.5 * tf.subtract(sum_square, square_sum) + + # Final dnn layer + all_input = tf.concat([dnn_output, linear_output, fm_output], 1) + final_dnn_scope = tf.variable_scope('final_dnn') + with final_dnn_scope.keep_weights(dtype=tf.float32) if self.bf16 \ + else final_dnn_scope: + dnn_logits = self._dnn( + all_input, self._final_hidden_units, 'final_dnn') + + if self.bf16: + dnn_logits = tf.cast(dnn_logits, dtype=tf.float32) + + self._logits = tf.layers.dense(dnn_logits, 1) + self.probability = tf.math.sigmoid(self._logits) + self.output = tf.round(self.probability) + + # compute loss + def _create_loss(self): + loss_func = tf.losses.mean_squared_error + predict = tf.squeeze(self.probability) + self.loss = tf.math.reduce_mean(loss_func(self._label, predict)) + tf.summary.scalar('loss', self.loss) + + # define optimizer and generate train_op + def _create_optimizer(self): + self.global_step = tf.train.get_or_create_global_step() + if self.tf or self._optimizer_type == 'adam': + optimizer = tf.train.AdamOptimizer( + learning_rate=self._learning_rate, + beta1=0.9, + beta2=0.999, + epsilon=1e-8) + elif self._optimizer_type == 'adagrad': + optimizer = tf.train.AdagradOptimizer( + learning_rate=self._learning_rate, + initial_accumulator_value=1e-8) + elif self._optimizer_type == 'adamasync': + optimizer = tf.train.AdamAsyncOptimizer( + learning_rate=self._learning_rate, + beta1=0.9, + beta2=0.999, + epsilon=1e-8) + elif self._optimizer_type == 'adagraddecay': + optimizer = tf.train.AdagradDecayOptimizer( + learning_rate=self._learning_rate, + global_step=self.global_step) + else: + raise ValueError('Optimizer type error.') + + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(update_ops): + print(optimizer, " =====") + self.train_op = optimizer.minimize( + self.loss, global_step=self.global_step) + + # compute acc & auc + def _create_metrics(self): + self.acc, self.acc_op = tf.metrics.accuracy(labels=self._label, + predictions=self.output) + self.auc, self.auc_op = tf.metrics.auc(labels=self._label, + predictions=self.probability, + num_thresholds=1000) + tf.summary.scalar('eval_acc', self.acc) + 
tf.summary.scalar('eval_auc', self.auc) + + def _call(self, features, labels, mode, config): + self._feature = features + self._label = labels + self._create_model() + + if mode == tf.estimator.ModeKeys.TRAIN: + self.is_training = True + with tf.name_scope('head'): + self._create_loss() + self._create_optimizer() + self._create_metrics() + + chief_only_hooks = [] + if args.profile_every_n_iter is not None: + chief_only_hooks.append( + tf.train.ProfilerHook( + save_steps=args.profile_every_n_iter, + output_dir=args.output_dir)) + return tf.estimator.EstimatorSpec( + mode=tf.estimator.ModeKeys.TRAIN, + loss=self.loss, + train_op=self.train_op, + training_chief_hooks=chief_only_hooks) + + if mode == tf.estimator.ModeKeys.EVAL: + self.is_training = False + return tf.estimator.EstimatorSpec( + mode=tf.estimator.ModeKeys.EVAL, + loss=self.loss) + + if mode == tf.estimator.ModeKeys.PREDICT: + self.is_training = False + return tf.estimator.EstimatorSpec( + mode=tf.estimator.ModeKeys.PREDICT, + predictions={'score': self.probability}) + + return None + + +# generate dataset pipline +def build_model_input(filename, batch_size, num_epochs): + def parse_csv(value): + tf.logging.info('Parsing {}'.format(filename)) + cont_defaults = [[0.0] for i in range(1, 14)] + cate_defaults = [[' '] for i in range(1, 27)] + label_defaults = [[0]] + column_headers = TRAIN_DATA_COLUMNS + record_defaults = label_defaults + cont_defaults + cate_defaults + columns = tf.io.decode_csv(value, record_defaults=record_defaults) + all_columns = collections.OrderedDict(zip(column_headers, columns)) + labels = all_columns.pop(LABEL_COLUMN[0]) + features = all_columns + return features, labels + + def parse_parquet(value): + tf.logging.info('Parsing {}'.format(filename)) + labels = value.pop(LABEL_COLUMN[0]) + features = value + return features, labels + + '''Work Queue Feature''' + if args.workqueue and not args.tf: + from tensorflow.python.ops.work_queue import WorkQueue + work_queue = WorkQueue([filename], num_epochs=num_epochs) + # For multiple files: + # work_queue = WorkQueue([filename, filename1,filename2,filename3]) + files = work_queue.input_dataset() + else: + files = filename + # Extract lines from input files using the Dataset API. 
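+    # Two pipelines follow: ParquetDataset yields batched feature dicts
+    # directly (batch_size is fixed at construction), while the CSV path
+    # reads text lines, then shuffles, repeats and batches before parsing.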
+ if args.parquet_dataset and not args.tf: + from tensorflow.python.data.experimental.ops import parquet_dataset_ops + dataset = parquet_dataset_ops.ParquetDataset( + files, batch_size=batch_size) + if args.parquet_dataset_shuffle: + dataset = dataset.shuffle(buffer_size=20000, + seed=args.seed) # fix seed for reproducing + if not args.workqueue: + dataset = dataset.repeat(num_epochs) + dataset = dataset.map(parse_parquet, num_parallel_calls=28) + else: + dataset = tf.data.TextLineDataset(files) + dataset = dataset.shuffle(buffer_size=20000, + seed=args.seed) # fix seed for reproducing + if not args.workqueue: + dataset = dataset.repeat(num_epochs) + dataset = dataset.batch(batch_size) + dataset = dataset.map(parse_csv, num_parallel_calls=28) + return dataset + + +def build_feature_columns(): + wide_column = [] + deep_column = [] + fm_column = [] + if group_embedding_type and not args.tf: + with tf.feature_column.group_embedding_column_scope(name="categorical"): + for column_name in FEATURE_COLUMNS: + if column_name in CATEGORICAL_COLUMNS: + categorical_column = tf.feature_column.categorical_column_with_hash_bucket( + column_name, + hash_bucket_size=10000, + dtype=tf.string) + + if not args.tf: + '''Feature Elimination of EmbeddingVariable Feature''' + if args.ev_elimination == 'gstep': + # Feature elimination based on global steps + evict_opt = tf.GlobalStepEvict(steps_to_live=4000) + elif args.ev_elimination == 'l2': + # Feature elimination based on l2 weight + evict_opt = tf.L2WeightEvict( + l2_weight_threshold=1.0) + else: + evict_opt = None + '''Feature Filter of EmbeddingVariable Feature''' + if args.ev_filter == 'cbf': + # CBF-based feature filter + filter_option = tf.CBFFilter( + filter_freq=3, + max_element_size=2**30, + false_positive_probability=0.01, + counter_type=tf.int64) + elif args.ev_filter == 'counter': + # Counter-based feature filter + filter_option = tf.CounterFilter(filter_freq=3) + else: + filter_option = None + ev_opt = tf.EmbeddingVariableOption( + evict_option=evict_opt, filter_option=filter_option) + + if args.ev: + '''Embedding Variable Feature''' + categorical_column = tf.feature_column.categorical_column_with_embedding( + column_name, dtype=tf.string, ev_option=ev_opt) + elif args.adaptive_emb: + ''' Adaptive Embedding Feature Part 2 of 2 + Expcet the follow code, a dict, 'adaptive_mask_tensors', is need as the input of + 'tf.feature_column.input_layer(adaptive_mask_tensors=adaptive_mask_tensors)'. + For column 'COL_NAME',the value of adaptive_mask_tensors['$COL_NAME'] is a int32 + tensor with shape [batch_size]. + ''' + categorical_column = tf.feature_column.categorical_column_with_adaptive_embedding( + column_name, + hash_bucket_size=HASH_BUCKET_SIZES[column_name], + dtype=tf.string, + ev_option=ev_opt) + elif args.dynamic_ev: + '''Dynamic-dimension Embedding Variable''' + print( + "Dynamic-dimension Embedding Variable is not really enabled in model." 
+ ) + sys.exit() + + if args.tf or not args.emb_fusion: + embedding_column = tf.feature_column.embedding_column( + categorical_column, + dimension=16, + combiner='mean') + else: + '''Embedding Fusion Feature''' + embedding_column = tf.feature_column.embedding_column( + categorical_column, + dimension=16, + combiner='mean', + do_fusion=args.emb_fusion) + + wide_column.append(embedding_column) + deep_column.append(embedding_column) + fm_column.append(embedding_column) + else: + column = tf.feature_column.numeric_column( + column_name, shape=(1, )) + wide_column.append(column) + deep_column.append(column) + return wide_column, fm_column, deep_column + + +def main(tf_config=None, server=None): + + if args.incremental_ckpt and not args.tf: + print("Incremental_Checkpoint is not really enabled.") + print("Please see the comments in the code.") + sys.exit() + + # check dataset + print('Checking dataset') + train_file = args.data_location + test_file = args.data_location + if args.parquet_dataset and not args.tf: + train_file += '/train.parquet' + test_file += '/eval.parquet' + else: + train_file += '/train.csv' + test_file += '/eval.csv' + if (not os.path.exists(train_file)) or (not os.path.exists(test_file)): + print("Dataset does not exist in the given data_location.") + sys.exit() + no_of_training_examples = 0 + no_of_test_examples = 0 + if args.parquet_dataset and not args.tf: + import pyarrow.parquet as pq + no_of_training_examples = pq.read_table(train_file).num_rows + no_of_test_examples = pq.read_table(test_file).num_rows + else: + no_of_training_examples = sum(1 for line in open(train_file)) + no_of_test_examples = sum(1 for line in open(test_file)) + print("Numbers of training dataset is {}".format(no_of_training_examples)) + print("Numbers of test dataset is {}".format(no_of_test_examples)) + + # set batch size, eporch & steps + batch_size = math.ceil( + args.batch_size / args.micro_batch + ) if args.micro_batch and not args.tf else args.batch_size + + if args.steps == 0: + no_of_epochs = 1 + train_steps = math.ceil( + (float(no_of_epochs) * no_of_training_examples) / batch_size) + else: + no_of_epochs = math.ceil( + (float(batch_size) * args.steps) / no_of_training_examples) + train_steps = args.steps + test_steps = math.ceil(float(no_of_test_examples) / batch_size) + print("The training steps is {}".format(train_steps)) + print("The testing steps is {}".format(test_steps)) + + # set fixed random seed + tf.set_random_seed(args.seed) + + # set directory path + model_dir = os.path.join(args.output_dir, + 'model_DeepFM_' + str(int(time.time()))) + checkpoint_dir = args.checkpoint if args.checkpoint else model_dir + print("Saving model checkpoints to " + checkpoint_dir) + + # create data pipline of train & test dataset + with tf.device("/cpu:0"): + train_dataset = build_model_input(train_file, batch_size, no_of_epochs) + test_dataset = build_model_input(test_file, batch_size, 1) + wide_column, fm_column, deep_column = build_feature_columns() + + # create variable partitioner for distributed training + num_ps_replicas = len(tf_config['ps_hosts']) if tf_config else 0 + input_layer_partitioner = partitioned_variables.min_max_variable_partitioner( + max_partitions=num_ps_replicas, + min_slice_size=args.input_layer_partitioner << + 20) if args.input_layer_partitioner else None + dense_layer_partitioner = partitioned_variables.min_max_variable_partitioner( + max_partitions=num_ps_replicas, + min_slice_size=args.dense_layer_partitioner << + 10) if args.dense_layer_partitioner else None + + # 
Session config + sess_config = tf.ConfigProto() + if tf_config: + sess_config.device_filters.append("/job:ps") + sess_config.inter_op_parallelism_threads = args.inter + sess_config.intra_op_parallelism_threads = args.intra + sess_config.allow_soft_placement = True + + # Session hooks + hooks = [] + + if args.smartstaged and not args.tf: + '''Smart staged Feature''' + next_element = tf.staged(next_element, num_threads=4, capacity=40) + sess_config.graph_options.optimizer_options.do_smart_stage = True + hooks.append(tf.make_prefetch_hook()) + if args.op_fusion and not args.tf: + '''Auto Graph Fusion''' + sess_config.graph_options.optimizer_options.do_op_fusion = True + if args.micro_batch and not args.tf: + '''Auto Mirco Batch''' + sess_config.graph_options.optimizer_options.micro_batch_num = args.micro_batch + + # save_steps = args.save_steps if args.save_steps or args.no_eval else steps + + def model_main(strategy): + # create model + model = DeepFM(wide_column=wide_column, + fm_column=fm_column, + deep_column=deep_column, + optimizer_type=args.optimizer, + learning_rate=args.learning_rate, + bf16=args.bf16, + stock_tf=args.tf, + adaptive_emb=args.adaptive_emb, + input_layer_partitioner=input_layer_partitioner, + dense_layer_partitioner=dense_layer_partitioner, + strategy=strategy) + + + run_config = tf.estimator.RunConfig(session_config=sess_config) + estimator = strategy.estimator( + model_fn=model._call, model_dir=checkpoint_dir, config=run_config) + + if args.mode == "evaluate": + estimator.evaluate(input_fn=lambda: build_model_input(test_file, batch_size, 1), + steps=test_steps, + hooks=hooks) + + elif args.mode == "predict": + pred_result = estimator.predict(input_fn=lambda: build_model_input(test_file, batch_size, 1), + predict_keys=['score'], + hooks=hooks, + yield_single_examples=False) + print(next(pred_result)) + + elif args.mode == "train": + stop_hook = tf.train.StopAtStepHook(last_step=train_steps) + hooks.append(stop_hook) + if args.timeline > 0: + hooks.append( + tf.train.ProfilerHook(save_steps=args.timeline, + output_dir=checkpoint_dir)) + estimator.train_and_evaluate( + tf.estimator.TrainSpec( + input_fn=lambda: build_model_input( + train_file, batch_size, no_of_epochs), + max_steps=train_steps, + hooks=hooks), + tf.estimator.EvalSpec( + input_fn=lambda: build_model_input( + test_file, batch_size, 1), + hooks=hooks)) + else: + estimator.export_saved_model( + checkpoint_dir, + build_receive_fn) + + if group_embedding_type in ["sok", "hb"]: + from tensorflow.python.distribute.group_embedding_collective_strategy import CollectiveStrategy + strategy = CollectiveStrategy() + if args.smartstaged and not args.tf: + os.environ["TF_GPU_THREAD_COUNT"] = "16" + with strategy.scope(): + model_main(strategy) + else: + strategy = contextlib.nullcontext() + model_main(strategy) + + +def boolean_string(string): + low_string = string.lower() + if low_string not in {'false', 'true'}: + raise ValueError('Not a valid boolean string') + return low_string == 'true' + + +# Get parse +def get_arg_parser(): + parser = argparse.ArgumentParser() + parser.add_argument('--data_location', + help='Full path of train data', + required=False, + default='./data') + parser.add_argument('--steps', + help='set the number of steps on train dataset', + type=int, + default=0) + parser.add_argument('--batch_size', + help='Batch size to train. Default is 512', + type=int, + default=2048) + parser.add_argument('--output_dir', + help='Full path to model output directory. \ + Default to ./result. 
Covered by --checkpoint. ', + required=False, + default='./result') + parser.add_argument('--checkpoint', + help='Full path to checkpoints input/output. \ + Default to ./result/$MODEL_TIMESTAMP', + required=False) + parser.add_argument('--save_steps', + help='set the number of steps on saving checkpoints', + type=int, + default=0) + parser.add_argument('--seed', + help='set the random seed for tensorflow', + type=int, + default=2021) + parser.add_argument('--optimizer', + type=str, + choices=['adam', 'adamasync', + 'adagraddecay', 'adagrad'], + default='adamasync') + parser.add_argument('--learning_rate', + help='Learning rate for deep model', + type=float, + default=0.001) + parser.add_argument('--keep_checkpoint_max', + help='Maximum number of recent checkpoint to keep', + type=int, + default=1) + parser.add_argument('--timeline', + help='number of steps on saving timeline. Default 0', + type=int, + default=0) + parser.add_argument('--protocol', + type=str, + choices=['grpc', 'grpc++', 'star_server'], + default='grpc') + parser.add_argument('--inter', + help='set inter op parallelism threads.', + type=int, + default=0) + parser.add_argument('--intra', + help='set inter op parallelism threads.', + type=int, + default=0) + parser.add_argument('--input_layer_partitioner', + help='slice size of input layer partitioner, units MB. Default 8MB', + type=int, + default=8) + parser.add_argument('--dense_layer_partitioner', + help='slice size of dense layer partitioner, units KB. Default 16KB', + type=int, + default=16) + parser.add_argument('--bf16', + help='enable DeepRec BF16 in deep model. Default FP32', + action='store_true') + parser.add_argument('--mode', + help='Which mode to use', + type=str, + choices=["train", "predict", "evaluate", "export"], + default="train") + parser.add_argument('--tf', + help='Use TF 1.15.5 API and disable DeepRec feature to run a baseline.', + action='store_true') + parser.add_argument('--smartstaged', + help='Whether to enable smart staged feature of DeepRec, Default to True.', + type=boolean_string, + default=False) + parser.add_argument('--emb_fusion', + help='Whether to enable embedding fusion, Default to True.', + type=boolean_string, + default=True) + parser.add_argument('--ev', + help='Whether to enable DeepRec EmbeddingVariable. Default False.', + type=boolean_string, + default=False) + parser.add_argument('--ev_elimination', + help='Feature Elimination of EmbeddingVariable Feature. Default closed.', + type=str, + choices=[None, 'l2', 'gstep'], + default=None) + parser.add_argument('--ev_filter', + help='Feature Filter of EmbeddingVariable Feature. Default closed.', + type=str, + choices=[None, 'counter', 'cbf'], + default=None) + parser.add_argument('--op_fusion', + help='Whether to enable Auto graph fusion feature. Default to True', + type=boolean_string, + default=True) + parser.add_argument('--micro_batch', + help='Set num for Auto Mirco Batch. Default close.', + type=int, + default=0) # TODO enable + parser.add_argument('--adaptive_emb', + help='Whether to enable Adaptive Embedding. Default to False.', + type=boolean_string, + default=False) + parser.add_argument('--dynamic_ev', + help='Whether to enable Dynamic-dimension Embedding Variable. Default to False.', + type=boolean_string, + default=False) # TODO enable + parser.add_argument('--incremental_ckpt', + help='Set time of save Incremental Checkpoint. 
Default 0 to close.', + type=boolean_string, + default=0) + parser.add_argument('--eval_every_n_iter', + help='Eval every n iten', + type=int, + default=10) + parser.add_argument('--workqueue', + help='Whether to enable Work Queue. Default to False.', + type=boolean_string, + default=False) + parser.add_argument("--parquet_dataset", + help='Whether to enable Parquet DataSet. Defualt to True.', + type=boolean_string, + default=False) + parser.add_argument("--parquet_dataset_shuffle", + help='Whether to enable shuffle operation for Parquet Dataset. Default to False.', + type=boolean_string, + default=True) + parser.add_argument('--profile_every_n_iter', + help='profiler every n iter during training', + type=int, + required=False) + + return parser + + +# Parse distributed training configuration and generate cluster information +def generate_cluster_info(TF_CONFIG): + print(TF_CONFIG) + tf_config = json.loads(TF_CONFIG) + cluster_config = tf_config.get('cluster') + ps_hosts = [] + worker_hosts = [] + chief_hosts = [] + for key, value in cluster_config.items(): + if 'ps' == key: + ps_hosts = value + elif 'worker' == key: + worker_hosts = value + elif 'chief' == key: + chief_hosts = value + if chief_hosts: + worker_hosts = chief_hosts + worker_hosts + + if not ps_hosts or not worker_hosts: + print('TF_CONFIG ERROR') + sys.exit() + task_config = tf_config.get('task') + task_type = task_config.get('type') + task_index = task_config.get('index') + (1 if task_type == 'worker' + and chief_hosts else 0) + + if task_type == 'chief': + task_type = 'worker' + + is_chief = True if task_index == 0 else False + cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts}) + server = tf.distribute.Server(cluster, + job_name=task_type, + task_index=task_index, + protocol=args.protocol) + if task_type == 'ps': + server.join() + elif task_type == 'worker': + tf_config = { + 'ps_hosts': ps_hosts, + 'worker_hosts': worker_hosts, + 'type': task_type, + 'index': task_index, + 'is_chief': is_chief + } + tf_device = tf.device( + tf.train.replica_device_setter( + worker_device='/job:worker/task:%d' % task_index, + cluster=cluster)) + return tf_config, server, tf_device + else: + print("Task type or index error.") + sys.exit() + + +# Some DeepRec's features are enabled by ENV. +# This func is used to set ENV and enable these features. +# A triple quotes comment is used to introduce these features and play an emphasizing role. +def set_env_for_DeepRec(): + ''' + Set some ENV for these DeepRec's features enabled by ENV. + More Detail information is shown in https://deeprec.readthedocs.io/zh/latest/index.html. + START_STATISTIC_STEP & STOP_STATISTIC_STEP: On CPU platform, DeepRec supports memory optimization + in both stand-alone and distributed trainging. It's default to open, and the + default start and stop steps of collection is 1000 and 1100. Reduce the initial + cold start time by the following settings. + MALLOC_CONF: On CPU platform, DeepRec can use memory optimization with the jemalloc library. 
+ Please preload libjemalloc.so by `LD_PRELOAD=./libjemalloc.so.2 python ...` + ''' + os.environ['START_STATISTIC_STEP'] = '100' + os.environ['STOP_STATISTIC_STEP'] = '110' + os.environ['MALLOC_CONF'] = \ + 'background_thread:true,metadata_thp:auto,dirty_decay_ms:20000,muzzy_decay_ms:20000' + + +if __name__ == '__main__': + parser = get_arg_parser() + args = parser.parse_args() + + if not args.tf: + set_env_for_DeepRec() + + TF_CONFIG = os.getenv('TF_CONFIG') + main() + # if not TF_CONFIG: + # main() + # else: + # tf_config, server, tf_device = generate_cluster_info(TF_CONFIG) + # with tf_device: + # main(tf_config, server) diff --git a/modelzoo/features/grouped_embedding/script.sh b/modelzoo/features/grouped_embedding/script.sh new file mode 100644 index 00000000000..1301ad278e7 --- /dev/null +++ b/modelzoo/features/grouped_embedding/script.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +#This folders contains estimator and MonitoredTrainingSession examples to use collective training respectively. + +echo "=====================================================================" +echo "Running dcnv2 with SOK" +echo "=====================================================================" + +cd dcnv2 + +COLLECTIVE_STRATEGY=sok python3 -m tensorflow.python.distribute.launch python3 train.py --data_location ./data --steps 500 --mode train + +echo "=====================================================================" +echo "Running deepfm with HB" +echo "=====================================================================" + +cd ../deepfm + +COLLECTIVE_STRATEGY=hb python3 -m tensorflow.python.distribute.launch python3 train.py --data_location ./data --steps 500 --mode train + +echo "Finish!" \ No newline at end of file From 4af2db0c0664e97a98a5d9eb450407f16f1d0148 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Wed, 26 Jul 2023 11:23:05 +0800 Subject: [PATCH 43/91] [CheckpointSaver] Add saving listeners support for increment checkpoint saver. 
(#915) Signed-off-by: chenbangduo.cbd --- .../python/training/basic_session_run_hooks.py | 16 +++++++++++++--- tensorflow/python/training/monitored_session.py | 7 ++++++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py index eff7c25b390..1d1df91fbfc 100644 --- a/tensorflow/python/training/basic_session_run_hooks.py +++ b/tensorflow/python/training/basic_session_run_hooks.py @@ -610,9 +610,7 @@ def after_run(self, run_context, run_values): global_step = run_context.session.run(self._global_step_tensor) if self._incremental_timer.should_trigger_for_step(global_step): self._incremental_timer.update_last_triggered_step(global_step) - logging.info("Start Save incremental checkpoints for %d into %s.", global_step, self._incremental_save_path) - self._get_incr_saver().incremental_save(run_context.session, self._incremental_save_path, global_step=global_step) - logging.info("Finish Save incremental checkpoints for %d into %s.", global_step, self._incremental_save_path) + self._incr_save(run_context.session, global_step) def end(self, session): @@ -666,6 +664,18 @@ def _get_saver(self): self._saver = savers[0] return savers[0] + def _incr_save(self, session, step): + logging.info("Saving incremental checkpoints for %d into %s.", step, + self._incremental_save_path) + for l in self._listeners: + l.before_save(session, step) + + self._get_incr_saver().incremental_save(session, + self._incremental_save_path, + global_step=step) + for l in self._listeners: + l.after_save(session, step) + def _get_incr_saver(self): if self._scaffold is not None: return self._scaffold._incr_saver diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py index b8efd1ee2d7..09c05a02627 100644 --- a/tensorflow/python/training/monitored_session.py +++ b/tensorflow/python/training/monitored_session.py @@ -491,7 +491,8 @@ def MonitoredTrainingSession( save_checkpoint_steps=USE_DEFAULT, summary_dir=None, save_incremental_checkpoint_secs=None, - target_nodes_or_tensors=None): + target_nodes_or_tensors=None, + saving_listeners=None): """Creates a `MonitoredSession` for training. @@ -548,6 +549,9 @@ def MonitoredTrainingSession( summaries. If None, checkpoint_dir is used instead. target_nodes_or_tensors: list of tf.Tensor or tf.Operation indicates targets, which determine graph transformation of 'smart-stage' + saving_listeners: List of `CheckpointSaverListener` subclass instances. Used + for callbacks that run immediately before or after this hook saves the + checkpoint. Returns: A `MonitoredSession` object. @@ -648,6 +652,7 @@ def MonitoredTrainingSession( save_steps=save_checkpoint_steps, save_secs=save_checkpoint_secs, scaffold=scaffold, + listeners=saving_listeners, incremental_save_secs=save_incremental_checkpoint_secs)) if hooks: From 2065fc0412148f2198f8d60d158ae19fb4c6f9c4 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Mon, 31 Jul 2023 11:00:41 +0800 Subject: [PATCH 44/91] [Docs] Update the download link of the library that Processor depends on. 
(#919) Signed-off-by: chenbangduo.cbd --- docs/docs_en/Processor.md | 2 +- docs/docs_zh/Processor.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/docs_en/Processor.md b/docs/docs_en/Processor.md index 595f697f7f8..b2cc9cf4423 100644 --- a/docs/docs_en/Processor.md +++ b/docs/docs_en/Processor.md @@ -31,7 +31,7 @@ First, directly [dlopen](https://linux.die.net/man/3/dlopen) in the user framewo Second, use the header files "**serving/processor/serving/processor.h**" and "**libserving_processor.so**". -**Attention**: If you are not using DeepRec docker, then some additional .so dependencies may be required, including: libiomp5.so,libmklml_intel.so,libstdc++.so.6. +**Attention**: If you are not using DeepRec docker, then some additional .so dependencies may be required, including: libiomp5.so,libmklml_intel.so,libstdc++.so.6. You can download them directly from [here](https://deeprec-dataset.oss-cn-beijing.aliyuncs.com/library/serving_processor_so.tar.gz). #### C API Processor provides the following C API interfaces, and users need to call the following interfaces in their Serving framework. diff --git a/docs/docs_zh/Processor.md b/docs/docs_zh/Processor.md index d5c006605b1..4d21e858771 100644 --- a/docs/docs_zh/Processor.md +++ b/docs/docs_zh/Processor.md @@ -30,7 +30,7 @@ Processor的产出是一个独立的so,用户可以很方便的对接到自己 第二,可以结合头文件“**serving/processor/serving/processor.h**”使用,头文件中将Processor相关的API暴露了,通过头文件和“**libserving_processor.so**”来调用serving API也比较方便。 -**需要注意**:如果不是使用DeepRec docker,那么可能需要一些额外的so依赖,包括:libiomp5.so,libmklml_intel.so,libstdc++.so.6,用户可以[直接下载](http://tfsmoke1.cn-hangzhou.oss.aliyun-inc.com/deeprec/serving_processor_so.tar.gz),然后在执行时候Preload这些so。 +**需要注意**:如果不是使用DeepRec docker,那么可能需要一些额外的so依赖,包括:libiomp5.so,libmklml_intel.so,libstdc++.so.6,用户可以[直接下载](https://deeprec-dataset.oss-cn-beijing.aliyuncs.com/library/serving_processor_so.tar.gz),然后在执行时候Preload这些so。 #### API接口 Processor提供以下几组C API接口,用户在自己的Serving框架中需要调用下列接口。 From 4cd9ed895a3b8f64909b65c04d06ea4e95f761ad Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Mon, 7 Aug 2023 11:38:43 +0800 Subject: [PATCH 45/91] [Embedding] Refactor the code of Save Op for EmbeddingVariable. 
(#900) Signed-off-by: lixy9474 --- .../core/framework/embedding/config.proto | 10 + .../embedding/dram_leveldb_storage.h | 86 ++-- .../framework/embedding/dram_pmem_storage.h | 82 ++-- .../framework/embedding/dram_ssd_storage.h | 75 +--- .../framework/embedding/embedding_config.h | 10 + .../core/framework/embedding/embedding_var.h | 32 +- .../embedding/embedding_var_ckpt_data.h | 236 ++++++++++ .../embedding/embedding_var_dump_iterator.h | 95 ++++ .../embedding/embedding_var_restore.h | 2 - .../embedding/globalstep_shrink_policy.h | 12 +- .../framework/embedding/gpu_hash_map_kv.h | 2 - .../embedding/hbm_dram_ssd_storage.h | 84 ++-- .../framework/embedding/hbm_dram_storage.h | 98 ++-- .../embedding/hbm_storage_iterator.h | 315 +++---------- .../core/framework/embedding/kv_interface.h | 20 +- .../embedding/l2weight_shrink_policy.h | 14 +- .../core/framework/embedding/leveldb_kv.h | 114 +++-- .../framework/embedding/multi_tier_storage.h | 69 +-- .../core/framework/embedding/shrink_policy.h | 8 +- .../framework/embedding/single_tier_storage.h | 195 ++++---- .../core/framework/embedding/ssd_hash_kv.h | 37 +- .../embedding/ssd_record_descriptor.h | 148 ++++++ tensorflow/core/framework/embedding/storage.h | 117 ++++- tensorflow/core/kernels/BUILD | 6 +- .../kernels/embedding_variable_ops_test.cc | 161 +------ .../embedding_variable_performance_test.cc | 13 +- tensorflow/core/kernels/kv_variable_ops.h | 420 ------------------ ...tore_ops.cc => kv_variable_restore_ops.cc} | 80 ---- tensorflow/core/kernels/save_restore_tensor.h | 54 +-- .../core/kernels/save_restore_v2_ops.cc | 18 +- .../python/ops/embedding_variable_ops_test.py | 132 ++++-- 31 files changed, 1191 insertions(+), 1554 deletions(-) create mode 100644 tensorflow/core/framework/embedding/embedding_var_ckpt_data.h create mode 100644 tensorflow/core/framework/embedding/embedding_var_dump_iterator.h create mode 100644 tensorflow/core/framework/embedding/ssd_record_descriptor.h rename tensorflow/core/kernels/{kv_variable_save_restore_ops.cc => kv_variable_restore_ops.cc} (86%) diff --git a/tensorflow/core/framework/embedding/config.proto b/tensorflow/core/framework/embedding/config.proto index 1eef3edccc2..3d5fae9f6ad 100644 --- a/tensorflow/core/framework/embedding/config.proto +++ b/tensorflow/core/framework/embedding/config.proto @@ -46,3 +46,13 @@ enum EmbeddingVariableType { IMMUTABLE = 0; MUTABLE = 1; } + +enum ValuePtrStatus { + OK = 0; + IS_DELETED = 1; +} + +enum ValuePosition { + IN_DRAM = 0; + NOT_IN_DRAM = 1; +} diff --git a/tensorflow/core/framework/embedding/dram_leveldb_storage.h b/tensorflow/core/framework/embedding/dram_leveldb_storage.h index c6c64e14865..fdb6697d541 100644 --- a/tensorflow/core/framework/embedding/dram_leveldb_storage.h +++ b/tensorflow/core/framework/embedding/dram_leveldb_storage.h @@ -111,14 +111,6 @@ class DramLevelDBStore : public MultiTierStorage { return false; } - void iterator_mutex_lock() override { - leveldb_->get_mutex()->lock(); - } - - void iterator_mutex_unlock() override { - leveldb_->get_mutex()->unlock(); - } - int64 Size() const override { int64 total_size = dram_->Size(); total_size += leveldb_->Size(); @@ -145,46 +137,58 @@ class DramLevelDBStore : public MultiTierStorage { return -1; } - Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { - { - mutex_lock l(*(dram_->get_mutex())); - TF_CHECK_OK(dram_->GetSnapshot(key_list, value_ptr_list)); + Status Save( + const string& tensor_name, + const string& prefix, + BundleWriter* writer, + const 
EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, + int64 value_len, + V* default_value) override { + std::vector key_list, tmp_leveldb_key_list; + std::vector*> value_ptr_list, tmp_leveldb_value_list; + TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); + + TF_CHECK_OK(leveldb_->GetSnapshot( + &tmp_leveldb_key_list, &tmp_leveldb_value_list)); + + for (int64 i = 0; i < tmp_leveldb_value_list.size(); i++) { + tmp_leveldb_value_list[i]->SetPtr((V*)ValuePosition::NOT_IN_DRAM); + tmp_leveldb_value_list[i]->SetInitialized(emb_config.primary_emb_index); } - { - mutex_lock l(*(leveldb_->get_mutex())); - TF_CHECK_OK(leveldb_->GetSnapshot(key_list, value_ptr_list)); + + std::vector leveldb_key_list; + for (int64 i = 0; i < tmp_leveldb_key_list.size(); i++) { + Status s = dram_->Contains(tmp_leveldb_key_list[i]); + if (!s.ok()) { + key_list.emplace_back(tmp_leveldb_key_list[i]); + leveldb_key_list.emplace_back(tmp_leveldb_key_list[i]); + value_ptr_list.emplace_back(tmp_leveldb_value_list[i]); + } } - return Status::OK(); - } - Status Shrink(const ShrinkArgs& shrink_args) override { - dram_->Shrink(shrink_args); - leveldb_->Shrink(shrink_args); - return Status::OK(); - } + ValueIterator* value_iter = + leveldb_->GetValueIterator( + leveldb_key_list, emb_config.emb_index, value_len); - int64 GetSnapshot(std::vector* key_list, - std::vector* value_list, - std::vector* version_list, - std::vector* freq_list, - const EmbeddingConfig& emb_config, - FilterPolicy>* filter, - embedding::Iterator** it) override { - { - mutex_lock l(*(dram_->get_mutex())); - std::vector*> value_ptr_list; - std::vector key_list_tmp; - TF_CHECK_OK(dram_->GetSnapshot(&key_list_tmp, &value_ptr_list)); - MultiTierStorage::SetListsForCheckpoint( - key_list_tmp, value_ptr_list, emb_config, - key_list, value_list, version_list, freq_list); - } { mutex_lock l(*(leveldb_->get_mutex())); - *it = leveldb_->GetIterator(); + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, + emb_config, + value_len, default_value, + key_list, + value_ptr_list, + value_iter))); } - return key_list->size(); + + for (auto it: tmp_leveldb_value_list) { + delete it; + } + + delete value_iter; + + return Status::OK(); } Status Eviction(K* evict_ids, int64 evict_size) override { diff --git a/tensorflow/core/framework/embedding/dram_pmem_storage.h b/tensorflow/core/framework/embedding/dram_pmem_storage.h index 47b6115e801..fd19f75ab4c 100644 --- a/tensorflow/core/framework/embedding/dram_pmem_storage.h +++ b/tensorflow/core/framework/embedding/dram_pmem_storage.h @@ -150,59 +150,41 @@ class DramPmemStorage : public MultiTierStorage { return -1; } - Status GetSnapshot(std::vector* key_list, - std::vector* >* value_ptr_list) override { - { - mutex_lock l(*(dram_->get_mutex())); - TF_CHECK_OK(dram_->GetSnapshot(key_list, value_ptr_list)); - } - { - mutex_lock l(*(pmem_->get_mutex())); - TF_CHECK_OK(pmem_->GetSnapshot(key_list, value_ptr_list)); + Status Save( + const string& tensor_name, + const string& prefix, + BundleWriter* writer, + const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, + int64 value_len, + V* default_value) override { + std::vector key_list, tmp_pmem_key_list; + std::vector*> value_ptr_list, tmp_pmem_value_list; + + TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); + dram_->Shrink(key_list, value_ptr_list, shrink_args, value_len); + + TF_CHECK_OK(pmem_->GetSnapshot(&tmp_pmem_key_list, + &tmp_pmem_value_list)); + pmem_->Shrink(tmp_pmem_key_list, tmp_pmem_value_list, + shrink_args, value_len); + + for 
(int64 i = 0; i < tmp_pmem_key_list.size(); i++) { + Status s = dram_->Contains(tmp_pmem_key_list[i]); + if (!s.ok()) { + key_list.emplace_back(tmp_pmem_key_list[i]); + value_ptr_list.emplace_back(tmp_pmem_value_list[i]); + } } - return Status::OK(); - } - Status Shrink(const ShrinkArgs& shrink_args) override { - dram_->Shrink(shrink_args); - pmem_->Shrink(shrink_args); - return Status::OK(); - } + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, + emb_config, + value_len, default_value, + key_list, + value_ptr_list))); - void iterator_mutex_lock() override { - return; - } - - void iterator_mutex_unlock() override { - return; - } - - int64 GetSnapshot(std::vector* key_list, - std::vector* value_list, - std::vector* version_list, - std::vector* freq_list, - const EmbeddingConfig& emb_config, - FilterPolicy>* filter, - embedding::Iterator** it) override { - { - mutex_lock l(*(dram_->get_mutex())); - std::vector*> value_ptr_list; - std::vector key_list_tmp; - TF_CHECK_OK(dram_->GetSnapshot(&key_list_tmp, &value_ptr_list)); - MultiTierStorage::SetListsForCheckpoint( - key_list_tmp, value_ptr_list, emb_config, - key_list, value_list, version_list, freq_list); - } - { - mutex_lock l(*(pmem_->get_mutex())); - std::vector*> value_ptr_list; - std::vector key_list_tmp; - TF_CHECK_OK(pmem_->GetSnapshot(&key_list_tmp, &value_ptr_list)); - MultiTierStorage::SetListsForCheckpoint( - key_list_tmp, value_ptr_list, emb_config, - key_list, value_list, version_list, freq_list); - } - return key_list->size(); + return Status::OK(); } Status Eviction(K* evict_ids, int64 evict_size) override { diff --git a/tensorflow/core/framework/embedding/dram_ssd_storage.h b/tensorflow/core/framework/embedding/dram_ssd_storage.h index 675395c667d..4243cc14eb3 100644 --- a/tensorflow/core/framework/embedding/dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/dram_ssd_storage.h @@ -144,70 +144,21 @@ class DramSsdHashStorage : public MultiTierStorage { return true; } - Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { - { - mutex_lock l(*(dram_->get_mutex())); - TF_CHECK_OK(dram_->GetSnapshot(key_list, value_ptr_list)); - } - { - mutex_lock l(*(ssd_hash_->get_mutex())); - TF_CHECK_OK(ssd_hash_->GetSnapshot(key_list, value_ptr_list)); - } - return Status::OK(); - } - - Status Shrink(const ShrinkArgs& shrink_args) override { - dram_->Shrink(shrink_args); - ssd_hash_->Shrink(shrink_args); - return Status::OK(); - } - - int64 GetSnapshot(std::vector* key_list, - std::vector* value_list, - std::vector* version_list, - std::vector* freq_list, + Status Save( + const string& tensor_name, + const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, - FilterPolicy>* filter, - embedding::Iterator** it) override { - { - mutex_lock l(*(dram_->get_mutex())); - std::vector*> value_ptr_list; - std::vector key_list_tmp; - TF_CHECK_OK(dram_->GetSnapshot(&key_list_tmp, &value_ptr_list)); - MultiTierStorage::SetListsForCheckpoint( - key_list_tmp, value_ptr_list, emb_config, - key_list, value_list, version_list, freq_list); - } - { - mutex_lock l(*(ssd_hash_->get_mutex())); - *it = ssd_hash_->GetIterator(); - } - return key_list->size(); - } + ShrinkArgs& shrink_args, + int64 value_len, + V* default_value) override { + dram_->Save(tensor_name, prefix, writer, emb_config, + shrink_args, value_len, default_value); - int64 GetSnapshotWithoutFetchPersistentEmb( - std::vector* key_list, - std::vector* value_list, - std::vector* version_list, - std::vector* freq_list, - 
const EmbeddingConfig& emb_config, - SsdRecordDescriptor* ssd_rec_desc) override { - { - mutex_lock l(*(dram_->get_mutex())); - std::vector*> value_ptr_list; - std::vector temp_key_list; - TF_CHECK_OK(dram_->GetSnapshot(&temp_key_list, &value_ptr_list)); - MultiTierStorage::SetListsForCheckpoint( - temp_key_list, value_ptr_list, emb_config, - key_list, value_list, version_list, - freq_list); - } - { - mutex_lock l(*(ssd_hash_->get_mutex())); - ssd_hash_->SetSsdRecordDescriptor(ssd_rec_desc); - } - return key_list->size() + ssd_rec_desc->key_list.size(); + ssd_hash_->Save(tensor_name, prefix, writer, emb_config, + shrink_args, value_len, default_value); + + return Status::OK(); } Status RestoreSSD(int64 emb_index, int64 emb_slot_num, int64 value_len, diff --git a/tensorflow/core/framework/embedding/embedding_config.h b/tensorflow/core/framework/embedding/embedding_config.h index 3aaa259c3f2..0a50b492159 100644 --- a/tensorflow/core/framework/embedding/embedding_config.h +++ b/tensorflow/core/framework/embedding/embedding_config.h @@ -101,6 +101,16 @@ struct EmbeddingConfig { return emb_index == primary_emb_index; } + bool is_save_freq() const { + return filter_freq != 0 || + record_freq || + normal_fix_flag == 1; + } + + bool is_save_version() const { + return steps_to_live != 0 || record_version; + } + int64 total_num(int alloc_len) { return block_num * (1 + (1 - normal_fix_flag) * slot_num) * diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 534ebf68950..9a5b5cf9a19 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -582,30 +582,14 @@ class EmbeddingVar : public ResourceBase { emb_config_, device, reader, this, filter_); } - int64 GetSnapshot(std::vector* key_list, - std::vector* value_list, - std::vector* version_list, - std::vector* freq_list, - embedding::Iterator** it = nullptr) { - // for Interface Compatible - // TODO Multi-tiered Embedding should use iterator in 'GetSnapshot' caller - embedding::Iterator* _it = nullptr; - it = (it == nullptr) ? &_it : it; - return storage_->GetSnapshot( - key_list, value_list, version_list, - freq_list, emb_config_, filter_, it); - } - - int64 GetSnapshotWithoutFetchPersistentEmb( - std::vector* key_list, - std::vector* value_list, - std::vector* version_list, - std::vector* freq_list, - SsdRecordDescriptor* ssd_rec_desc) { - return storage_-> - GetSnapshotWithoutFetchPersistentEmb( - key_list, value_list, version_list, - freq_list, emb_config_, ssd_rec_desc); + Status Save(const string& tensor_name, + const string& prefix, + BundleWriter* writer, + embedding::ShrinkArgs& shrink_args) { + return storage_->Save(tensor_name, prefix, + writer, emb_config_, + shrink_args, value_len_, + default_value_); } mutex* mu() { diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h new file mode 100644 index 00000000000..aa1a08cbcfd --- /dev/null +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h @@ -0,0 +1,236 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
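The `is_save_freq`/`is_save_version` predicates added to `EmbeddingConfig` a few hunks above centralize the rules for exporting frequencies and versions. A direct Python transliteration of that logic, with a toy config object for illustration:

```python
from types import SimpleNamespace

# Mirrors EmbeddingConfig::is_save_freq / is_save_version from this patch.
def is_save_freq(cfg):
    # Frequencies are saved if a frequency filter is configured, frequency
    # recording is enabled, or the normal fixed-length layout is used.
    return cfg.filter_freq != 0 or cfg.record_freq or cfg.normal_fix_flag == 1

def is_save_version(cfg):
    # Versions are saved if features expire (steps_to_live) or version
    # recording is enabled.
    return cfg.steps_to_live != 0 or cfg.record_version

cfg = SimpleNamespace(filter_freq=0, record_freq=True,
                      normal_fix_flag=0, steps_to_live=0, record_version=False)
assert is_save_freq(cfg) and not is_save_version(cfg)
```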
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/framework/embedding/embedding_config.h" +#include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" +namespace tensorflow { +namespace embedding { + +template +class EmbeddingVarCkptData { + public: + void Emplace(K key, ValuePtr* value_ptr, + const EmbeddingConfig& emb_config, + V* default_value, int64 value_offset, + bool is_save_freq, + bool is_save_version, + bool save_unfiltered_features) { + if((int64)value_ptr == ValuePtrStatus::IS_DELETED) + return; + + V* primary_val = value_ptr->GetValue(0, 0); + bool is_not_admit = + primary_val == nullptr + && emb_config.filter_freq != 0; + + if (!is_not_admit) { + key_vec_.emplace_back(key); + + if (primary_val == nullptr) { + value_ptr_vec_.emplace_back(default_value); + } else if ( + (int64)primary_val == ValuePosition::NOT_IN_DRAM) { + value_ptr_vec_.emplace_back((V*)ValuePosition::NOT_IN_DRAM); + } else { + V* val = value_ptr->GetValue(emb_config.emb_index, + value_offset); + value_ptr_vec_.emplace_back(val); + } + + + if(is_save_version) { + int64 dump_version = value_ptr->GetStep(); + version_vec_.emplace_back(dump_version); + } + + if(is_save_freq) { + int64 dump_freq = value_ptr->GetFreq(); + freq_vec_.emplace_back(dump_freq); + } + } else { + if (!save_unfiltered_features) + return; + + key_filter_vec_.emplace_back(key); + + if(is_save_version) { + int64 dump_version = value_ptr->GetStep(); + version_filter_vec_.emplace_back(dump_version); + } + + int64 dump_freq = value_ptr->GetFreq(); + freq_filter_vec_.emplace_back(dump_freq); + } + } + + void Emplace(K key, V* value_ptr) { + key_vec_.emplace_back(key); + value_ptr_vec_.emplace_back(value_ptr); + } + + void SetWithPartition( + std::vector>& ev_ckpt_data_parts) { + part_offset_.resize(kSavedPartitionNum + 1); + part_filter_offset_.resize(kSavedPartitionNum + 1); + part_offset_[0] = 0; + part_filter_offset_[0] = 0; + for (int i = 0; i < kSavedPartitionNum; i++) { + part_offset_[i + 1] = + part_offset_[i] + ev_ckpt_data_parts[i].key_vec_.size(); + + part_filter_offset_[i + 1] = + part_filter_offset_[i] + + ev_ckpt_data_parts[i].key_filter_vec_.size(); + + for (int64 j = 0; j < ev_ckpt_data_parts[i].key_vec_.size(); j++) { + key_vec_.emplace_back(ev_ckpt_data_parts[i].key_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].value_ptr_vec_.size(); j++) { + value_ptr_vec_.emplace_back(ev_ckpt_data_parts[i].value_ptr_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].version_vec_.size(); j++) { + version_vec_.emplace_back(ev_ckpt_data_parts[i].version_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_vec_.size(); j++) { + freq_vec_.emplace_back(ev_ckpt_data_parts[i].freq_vec_[j]); + } + + for (int64 j = 0; j < 
ev_ckpt_data_parts[i].key_filter_vec_.size(); j++) { + key_filter_vec_.emplace_back(ev_ckpt_data_parts[i].key_filter_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].version_filter_vec_.size(); j++) { + version_filter_vec_.emplace_back(ev_ckpt_data_parts[i].version_filter_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_filter_vec_.size(); j++) { + freq_filter_vec_.emplace_back(ev_ckpt_data_parts[i].freq_filter_vec_[j]); + } + } + } + + Status ExportToCkpt(const string& tensor_name, + BundleWriter* writer, + int64 value_len, + ValueIterator* value_iter = nullptr) { + size_t bytes_limit = 8 << 20; + std::unique_ptr dump_buffer(new char[bytes_limit]); + + EVVectorDataDumpIterator key_dump_iter(key_vec_); + Status s = SaveTensorWithFixedBuffer( + tensor_name + "-keys", writer, dump_buffer.get(), + bytes_limit, &key_dump_iter, + TensorShape({key_vec_.size()})); + if (!s.ok()) + return s; + + EV2dVectorDataDumpIterator value_dump_iter( + value_ptr_vec_, value_len, value_iter); + s = SaveTensorWithFixedBuffer( + tensor_name + "-values", writer, dump_buffer.get(), + bytes_limit, &value_dump_iter, + TensorShape({value_ptr_vec_.size(), value_len})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator version_dump_iter(version_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-versions", writer, dump_buffer.get(), + bytes_limit, &version_dump_iter, + TensorShape({version_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator freq_dump_iter(freq_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-freqs", writer, dump_buffer.get(), + bytes_limit, &freq_dump_iter, + TensorShape({freq_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator filtered_key_dump_iter(key_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-keys_filtered", writer, dump_buffer.get(), + bytes_limit, &filtered_key_dump_iter, + TensorShape({key_filter_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + filtered_version_dump_iter(version_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-versions_filtered", + writer, dump_buffer.get(), + bytes_limit, &filtered_version_dump_iter, + TensorShape({version_filter_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + filtered_freq_dump_iter(freq_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-freqs_filtered", + writer, dump_buffer.get(), + bytes_limit, &filtered_freq_dump_iter, + TensorShape({freq_filter_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + part_offset_dump_iter(part_offset_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-partition_offset", + writer, dump_buffer.get(), + bytes_limit, &part_offset_dump_iter, + TensorShape({part_offset_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + part_filter_offset_dump_iter(part_filter_offset_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-partition_filter_offset", + writer, dump_buffer.get(), + bytes_limit, &part_filter_offset_dump_iter, + TensorShape({part_filter_offset_.size()})); + if (!s.ok()) + return s; + + return Status::OK(); + } + + private: + std::vector key_vec_; + std::vector value_ptr_vec_; + std::vector version_vec_; + std::vector freq_vec_; + std::vector key_filter_vec_; + std::vector version_filter_vec_; + std::vector freq_filter_vec_; + std::vector part_offset_; + std::vector part_filter_offset_; + const int kSavedPartitionNum = 1000; +}; +} //namespace embedding +} //namespace tensorflow +#endif 
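`SetWithPartition` above flattens the per-partition checkpoint data and records prefix-sum offsets so that restore can address each of the `kSavedPartitionNum` partitions individually. A self-contained sketch of the same bucketing (the constant is taken from the class; everything else is illustrative):

```python
K_SAVED_PARTITION_NUM = 1000  # kSavedPartitionNum in the C++ class

def bucket_with_offsets(keys):
    # Group keys by key % partition count, as the checkpoint layout expects.
    parts = [[] for _ in range(K_SAVED_PARTITION_NUM)]
    for k in keys:
        parts[k % K_SAVED_PARTITION_NUM].append(k)
    # part_offset[i] .. part_offset[i+1] delimits partition i in the flat list.
    merged, part_offset = [], [0] * (K_SAVED_PARTITION_NUM + 1)
    for i, part in enumerate(parts):
        part_offset[i + 1] = part_offset[i] + len(part)
        merged.extend(part)
    return merged, part_offset

keys, offsets = bucket_with_offsets([3, 17, 1017, 2003])
print(keys[offsets[3]:offsets[4]])   # -> [3, 2003], the keys in partition 3
```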
//TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ diff --git a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h new file mode 100644 index 00000000000..71ba054b873 --- /dev/null +++ b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h @@ -0,0 +1,95 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +namespace tensorflow { +namespace embedding { +template +class EVVectorDataDumpIterator: public DumpIterator { + public: + EVVectorDataDumpIterator(const std::vector& item_list) + : curr_iter_(item_list.begin()), + end_iter_(item_list.end()) {} + + bool HasNext() const { + return curr_iter_ != end_iter_; + } + + T Next() { + T val = *curr_iter_; + curr_iter_++; + return val; + } + + private: + typename std::vector::const_iterator curr_iter_; + typename std::vector::const_iterator end_iter_; +}; + +template +class EV2dVectorDataDumpIterator: public DumpIterator { + public: + EV2dVectorDataDumpIterator( + std::vector& valueptr_list, + int64 value_len, + ValueIterator* val_iter) + : curr_iter_(valueptr_list.begin()), + end_iter_(valueptr_list.end()), + val_iter_(val_iter), + value_len_(value_len), + col_idx_(0) { + if (!valueptr_list.empty()) { + if ((int64)*curr_iter_ == ValuePosition::NOT_IN_DRAM) { + curr_ptr_ = val_iter_->Next(); + } else { + curr_ptr_ = *curr_iter_; + } + } + } + + bool HasNext() const { + return curr_iter_ != end_iter_; + } + + T Next() { + T val = curr_ptr_[col_idx_++]; + if (col_idx_ >= value_len_) { + curr_iter_++; + col_idx_ = 0; + if (curr_iter_ != end_iter_) { + if ((int64)*curr_iter_ == ValuePosition::NOT_IN_DRAM) { + curr_ptr_ = val_iter_->Next(); + } else { + curr_ptr_ = *curr_iter_; + } + } + } + return val; + } + + private: + typename std::vector::const_iterator curr_iter_; + typename std::vector::const_iterator end_iter_; + ValueIterator* val_iter_; + int64 value_len_; + int64 col_idx_; + T* curr_ptr_ = nullptr; +}; +} //namespace embedding +} //namespace tensorflow +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ diff --git a/tensorflow/core/framework/embedding/embedding_var_restore.h b/tensorflow/core/framework/embedding/embedding_var_restore.h index 821ef7485e8..ec97566fbec 100644 --- a/tensorflow/core/framework/embedding/embedding_var_restore.h +++ b/tensorflow/core/framework/embedding/embedding_var_restore.h @@ -40,9 +40,7 @@ using GPUDevice = Eigen::GpuDevice; template class EmbeddingVar; - namespace { - const int kSavedPartitionNum = 1000; const size_t kBufferSize = 8 << 20; constexpr char kPartStr[] = 
"part_"; diff --git a/tensorflow/core/framework/embedding/globalstep_shrink_policy.h b/tensorflow/core/framework/embedding/globalstep_shrink_policy.h index 17551a6c387..a2af6a2430a 100644 --- a/tensorflow/core/framework/embedding/globalstep_shrink_policy.h +++ b/tensorflow/core/framework/embedding/globalstep_shrink_policy.h @@ -35,19 +35,18 @@ class GlobalStepShrinkPolicy : public ShrinkPolicy { TF_DISALLOW_COPY_AND_ASSIGN(GlobalStepShrinkPolicy); - void Shrink(const ShrinkArgs& shrink_args) override { + void Shrink(std::vector& key_list, + std::vector*>& value_list, + const ShrinkArgs& shrink_args) override { ShrinkPolicy::ReleaseValuePtrs(); - std::vector key_list; - std::vector*> value_list; - kv_->GetSnapshot(&key_list, &value_list); FilterToDelete(shrink_args.global_step, key_list, value_list); } private: void FilterToDelete(int64 global_step, - const std::vector& key_list, - const std::vector*>& value_list) { + std::vector& key_list, + std::vector*>& value_list) { for (int64 i = 0; i < key_list.size(); ++i) { int64 version = value_list[i]->GetStep(); if (version == -1) { @@ -56,6 +55,7 @@ class GlobalStepShrinkPolicy : public ShrinkPolicy { if (global_step - version > steps_to_live_) { kv_->Remove(key_list[i]); ShrinkPolicy::EmplacePointer(value_list[i]); + value_list[i] = (ValuePtr*)ValuePtrStatus::IS_DELETED; } } } diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h index 56542237a3e..1dd90d63a6e 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h @@ -256,8 +256,6 @@ class GPUHashMapKV : public KVInterface { std::string DebugString() const override { return std::string(); } - Iterator* GetIterator() override { return nullptr; } - GPUHashTable* HashTable() override { return hash_table_; } Status BatchLookup(const Eigen::GpuDevice& device, const K* keys, diff --git a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h index 72a3ef4483c..581f1f1cfaf 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h @@ -302,45 +302,67 @@ class HbmDramSsdStorage : public MultiTierStorage { return false; } - void iterator_mutex_lock() override { - ssd_->get_mutex()->lock(); - } + Status Save( + const string& tensor_name, + const string& prefix, + BundleWriter* writer, + const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, + int64 value_len, + V* default_value) override { + std::vector key_list, tmp_dram_key_list; + std::vector*> value_ptr_list, tmp_dram_value_list; + TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); + hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); + + HbmValueIterator hbm_value_iter( + key_list, value_ptr_list, + emb_config.emb_index, Storage::alloc_len_, + gpu_alloc_); + + std::vector*> tmp_hbm_value_ptrs(value_ptr_list.size()); + for (int64 i = 0; i < value_ptr_list.size(); i++) { + ValuePtr* value_ptr = hbm_->CreateValuePtr(value_len); + memcpy((char *)value_ptr->GetPtr(), + (char *)value_ptr_list[i]->GetPtr(), + sizeof(FixedLengthHeader)); + value_ptr->SetPtr((V*)ValuePosition::NOT_IN_DRAM); + value_ptr->SetInitialized(emb_config.primary_emb_index); + tmp_hbm_value_ptrs[i] = value_ptr; + value_ptr_list[i] = value_ptr; + } - void iterator_mutex_unlock() override { - ssd_->get_mutex()->unlock(); - } + TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, + 
&tmp_dram_value_list)); + dram_->Shrink(tmp_dram_key_list, tmp_dram_value_list, + shrink_args, value_len); - Status GetSnapshot(std::vector* key_list, - std::vector* >* value_ptr_list) override { - { - mutex_lock l(*(hbm_->get_mutex())); - TF_CHECK_OK(hbm_->GetSnapshot(key_list, value_ptr_list)); + for (int64 i = 0; i < tmp_dram_key_list.size(); i++) { + Status s = hbm_->Contains(tmp_dram_key_list[i]); + if (!s.ok()) { + key_list.emplace_back(tmp_dram_key_list[i]); + value_ptr_list.emplace_back(tmp_dram_value_list[i]); + } } + { - mutex_lock l(*(dram_->get_mutex())); - TF_CHECK_OK(dram_->GetSnapshot(key_list, value_ptr_list)); + mutex_lock l(*(hbm_->get_mutex())); + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, + emb_config, + value_len, default_value, + key_list, + value_ptr_list, + &hbm_value_iter))); } - { - mutex_lock l(*(ssd_->get_mutex())); - TF_CHECK_OK(ssd_->GetSnapshot(key_list, value_ptr_list)); + + for (auto it: tmp_hbm_value_ptrs) { + delete it; } - return Status::OK(); - } - int64 GetSnapshot(std::vector* key_list, - std::vector* value_list, - std::vector* version_list, - std::vector* freq_list, - const EmbeddingConfig& emb_config, - FilterPolicy>* filter, - embedding::Iterator** it) override { - LOG(FATAL)<<"HbmDramSsdStorage dosen't support GetSnaoshot."; - } + ssd_->Save(tensor_name, prefix, writer, emb_config, + shrink_args, value_len, default_value); - Status Shrink(const ShrinkArgs& shrink_args) override { - hbm_->Shrink(shrink_args); - dram_->Shrink(shrink_args); - ssd_->Shrink(shrink_args); return Status::OK(); } diff --git a/tensorflow/core/framework/embedding/hbm_dram_storage.h b/tensorflow/core/framework/embedding/hbm_dram_storage.h index ce8e9a91643..518c39287e0 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_storage.h @@ -261,63 +261,63 @@ class HbmDramStorage : public MultiTierStorage { return false; } - void iterator_mutex_lock() override { - return; - } + Status Save( + const string& tensor_name, + const string& prefix, + BundleWriter* writer, + const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, + int64 value_len, + V* default_value) override { + std::vector key_list, tmp_dram_key_list; + std::vector*> value_ptr_list, tmp_dram_value_list; + TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); + hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); + + HbmValueIterator hbm_value_iter( + key_list, value_ptr_list, + emb_config.emb_index, Storage::alloc_len_, + gpu_alloc_); + + std::vector*> tmp_hbm_value_ptrs(value_ptr_list.size()); + for (int64 i = 0; i < value_ptr_list.size(); i++) { + ValuePtr* value_ptr = hbm_->CreateValuePtr(value_len); + memcpy((char *)value_ptr->GetPtr(), + (char *)value_ptr_list[i]->GetPtr(), + sizeof(FixedLengthHeader)); + value_ptr->SetPtr((V*)ValuePosition::NOT_IN_DRAM); + value_ptr->SetInitialized(emb_config.primary_emb_index); + tmp_hbm_value_ptrs[i] = value_ptr; + value_ptr_list[i] = value_ptr; + } - void iterator_mutex_unlock() override { - return; - } + TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, + &tmp_dram_value_list)); + dram_->Shrink(tmp_dram_key_list, tmp_dram_value_list, + shrink_args, value_len); - Status GetSnapshot(std::vector* key_list, - std::vector* >* value_ptr_list) override { - { - mutex_lock l(*(hbm_->get_mutex())); - TF_CHECK_OK(hbm_->GetSnapshot(key_list, value_ptr_list)); - } - { - mutex_lock l(*(dram_->get_mutex())); - TF_CHECK_OK(dram_->GetSnapshot(key_list, value_ptr_list)); + for (int64 i 
= 0; i < tmp_dram_key_list.size(); i++) { + Status s = hbm_->Contains(tmp_dram_key_list[i]); + if (!s.ok()) { + key_list.emplace_back(tmp_dram_key_list[i]); + value_ptr_list.emplace_back(tmp_dram_value_list[i]); + } } - return Status::OK(); - } - int64 GetSnapshot(std::vector* key_list, - std::vector* value_list, - std::vector* version_list, - std::vector* freq_list, - const EmbeddingConfig& emb_config, - FilterPolicy>* filter, - embedding::Iterator** it) override { - std::vector*> hbm_value_ptr_list, dram_value_ptr_list; - std::vector temp_hbm_key_list, temp_dram_key_list; - // Get Snapshot of HBM storage { mutex_lock l(*(hbm_->get_mutex())); - TF_CHECK_OK(hbm_->GetSnapshot(&temp_hbm_key_list, - &hbm_value_ptr_list)); + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, + emb_config, + value_len, default_value, + key_list, + value_ptr_list, + &hbm_value_iter))); } - // Get Snapshot of DRAM storage. - { - mutex_lock l(*(dram_->get_mutex())); - TF_CHECK_OK(dram_->GetSnapshot(&temp_dram_key_list, - &dram_value_ptr_list)); - } - *it = new HbmDramIterator(temp_hbm_key_list, - temp_dram_key_list, - hbm_value_ptr_list, - dram_value_ptr_list, - Storage::alloc_len_, - gpu_alloc_, - emb_config.emb_index); - // This return value is not the exact number of IDs - // because the two tables intersect. - return temp_hbm_key_list.size() + temp_dram_key_list.size(); - } - Status Shrink(const ShrinkArgs& shrink_args) override { - hbm_->Shrink(shrink_args); - dram_->Shrink(shrink_args); + for (auto it: tmp_hbm_value_ptrs) { + delete it; + } return Status::OK(); } diff --git a/tensorflow/core/framework/embedding/hbm_storage_iterator.h b/tensorflow/core/framework/embedding/hbm_storage_iterator.h index 4831b940bb8..36d331e74aa 100644 --- a/tensorflow/core/framework/embedding/hbm_storage_iterator.h +++ b/tensorflow/core/framework/embedding/hbm_storage_iterator.h @@ -23,267 +23,77 @@ template class ValuePtr; namespace embedding { -class Iterator; - -namespace { - const int kSavedPartitionNum = 1000; -} - -template -class PartitionedCheckpointData { +template +class HbmValueIterator: public ValueIterator { public: - PartitionedCheckpointData() { - key_list_parts.resize(kSavedPartitionNum); - value_list_parts.resize(kSavedPartitionNum); - version_list_parts.resize(kSavedPartitionNum); - freq_list_parts.resize(kSavedPartitionNum); - key_filter_list_parts.resize(kSavedPartitionNum); - version_filter_list_parts.resize(kSavedPartitionNum); - freq_filter_list_parts.resize(kSavedPartitionNum); - } - - ~PartitionedCheckpointData() { - } - - void EmplaceToPartList(K key, ValuePtr* value_ptr, bool is_on_hbm, - int64 emb_index, int64 emb_offset) { - int64 part_id = key % kSavedPartitionNum; - V* val = value_ptr->GetValue(emb_index, emb_offset); - V* primary_val = value_ptr->GetValue(0, 0); - - int64 freq = value_ptr->GetFreq(); - int64 version = value_ptr->GetStep(); - if (primary_val == nullptr) { - // id is filtered by feature filter. 
- key_filter_list_parts[part_id].emplace_back(key); - freq_filter_list_parts[part_id].emplace_back(freq); - version_filter_list_parts[part_id].emplace_back(version); - } else { - if (val != nullptr) { - key_list_parts[part_id].emplace_back(key); - freq_list_parts[part_id].emplace_back(freq); - version_list_parts[part_id].emplace_back(version); - value_list_parts[part_id].emplace_back( - std::pair(val, is_on_hbm)); + HbmValueIterator( + const std::vector& key_list, + const std::vector*>& value_ptr_list, + int64 emb_index, + int64 value_len, + Allocator* alloc) + : value_len_(value_len), + alloc_(alloc) { + int64 emb_offset = value_len_ * emb_index; + std::vector> value_parts_vec(kSavedPartitionNum); + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + value_parts_vec[part_id].emplace_back( + value_ptr_list[i]->GetValue(emb_index, emb_offset)); + break; + } } } - } - - void GenerateKeyList(std::vector* output_key_list) { - MergePartList(key_list_parts, output_key_list); - } - void GenerateFilteredKeyList(std::vector* output_filter_key_list) { - MergePartList(key_filter_list_parts, output_filter_key_list); - } - - void GenerateValueList( - std::vector>* output_value_list, - std::vector* hbm_ptr_list) { - for (int i = 0; i < kSavedPartitionNum; i++) { - for (int j = 0; j < value_list_parts[i].size(); j++) { - output_value_list->emplace_back(value_list_parts[i][j]); - if (value_list_parts[i][j].second) - hbm_ptr_list->emplace_back(value_list_parts[i][j].first); - } - } - } - - void GenerateFreqList(std::vector* output_freq_list) { - MergePartList(freq_list_parts, output_freq_list); - } - - void GenerateFilteredFreqList( - std::vector* output_filter_freq_list) { - MergePartList(freq_filter_list_parts, output_filter_freq_list); - } - - void GenerateVersionList( - std::vector* output_version_list) { - MergePartList(version_list_parts, output_version_list); - } - - void GenerateFilteredVersionList( - std::vector* output_filter_version_list) { - MergePartList(version_filter_list_parts, - output_filter_version_list); - } - - void GeneratePartOffset(std::vector* part_offset) { - for (int64 i = 0; i < kSavedPartitionNum; i++) { - (*part_offset)[i + 1] = (*part_offset)[i] + key_list_parts[i].size(); - } - } - - void GeneratePartFilterOffset(std::vector* part_filter_offset) { for (int64 i = 0; i < kSavedPartitionNum; i++) { - (*part_filter_offset)[i + 1] = (*part_filter_offset)[i] - + key_filter_list_parts[i].size(); + values_.splice(values_.end(), value_parts_vec[i]); } - } - private: - template - void MergePartList( - const std::vector>& part_list, - std::vector *output_list) { - for (int i = 0; i < kSavedPartitionNum; i++) { - for (int j = 0; j < part_list[i].size(); j++) { - output_list->emplace_back(part_list[i][j]); - } - } - } - - std::vector> key_list_parts; - std::vector>> value_list_parts; - std::vector> version_list_parts; - std::vector> freq_list_parts; - std::vector> key_filter_list_parts; - std::vector> version_filter_list_parts; - std::vector> freq_filter_list_parts; -}; - -template -class HbmDramIterator: public Iterator { - public: - HbmDramIterator( - const std::vector& hbm_key_list, - const std::vector& dram_key_list, - const std::vector*>& hbm_value_ptr_list, - const std::vector*>& dram_value_ptr_list, - int64 value_len, - Allocator* alloc, - int64 emb_index): - value_len_(value_len), - alloc_(alloc), - cursor_(0), - hbm_ptr_cursor_(0), - fill_buffer_st_(0), - 
fill_buffer_ed_(0), - emb_index_(emb_index) { - part_offset_.resize(kSavedPartitionNum + 1); - part_offset_[0] = 0; - part_filter_offset_.resize(kSavedPartitionNum + 1); - part_filter_offset_[0] = 0; - emb_offset_ = value_len_ * emb_index_; - std::set hbm_keys; + values_iter_ = values_.begin(); - PartitionedCheckpointData ckpt_data; - for (int64 i = 0; i < hbm_key_list.size(); i++) { - ckpt_data.EmplaceToPartList( - hbm_key_list[i], hbm_value_ptr_list[i], true, - emb_index_, emb_offset_); - hbm_keys.insert(hbm_key_list[i]); - } - for (int64 i = 0; i < dram_key_list.size(); i++) { - if (hbm_keys.find(dram_key_list[i]) == hbm_keys.end()) { - ckpt_data.EmplaceToPartList( - dram_key_list[i], dram_value_ptr_list[i], false, - emb_index_, emb_offset_); - } - } - - ckpt_data.GenerateKeyList(&key_list_); - ckpt_data.GenerateValueList(&value_list_, &hbm_ptr_list_); - ckpt_data.GenerateFreqList(&freq_list_); - ckpt_data.GenerateVersionList(&version_list_); - ckpt_data.GeneratePartOffset(&part_offset_); - - ckpt_data.GenerateFilteredKeyList(&filtered_key_list_); - ckpt_data.GenerateFilteredFreqList(&filtered_freq_list_); - ckpt_data.GenerateFilteredVersionList(&filtered_version_list_); - ckpt_data.GeneratePartFilterOffset(&part_filter_offset_); - - dev_addr_list_ = (V**)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, - buffer_capacity_ / value_len_ * sizeof(V*)); - dev_embedding_buffer_ = (V*)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, + num_of_embs_ = buffer_capacity_ / value_len_; + dev_addr_list_ = (V**)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, + num_of_embs_ * sizeof(V*)); + dev_embedding_buffer_ = (V*)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, buffer_capacity_ * sizeof(V)); - local_addr_list_ = new V*[buffer_capacity_ / value_len_]; + + FillEmbeddingBuffer(); } - ~HbmDramIterator() { + ~HbmValueIterator() { alloc_->DeallocateRaw(dev_addr_list_); alloc_->DeallocateRaw(dev_embedding_buffer_); - delete[] local_addr_list_; - } - - virtual bool Valid() { - return !(cursor_ == current_key_list_->size()); - } - - virtual void SeekToFirst() { - cursor_ = 0; - hbm_ptr_cursor_ = 0; - fill_buffer_st_ = 0; - fill_buffer_ed_ = 0; - } - - virtual void SwitchToFilteredFeatures() { - current_key_list_ = &filtered_key_list_; - current_freq_list_ = &filtered_freq_list_; - current_version_list_ = &filtered_version_list_; - } - - virtual void SwitchToAdmitFeatures() { - current_key_list_ = &key_list_; - current_freq_list_ = &freq_list_; - current_version_list_ = &version_list_; - } - - virtual void Next() { - cursor_++; - } - - virtual void Key(char* val, int64 dim) { - *((int64*)val) = (*current_key_list_)[cursor_]; - } - - virtual void Value(char* val, int64 dim, int64 value_offset) { - if (value_list_[cursor_].second) { - if (hbm_ptr_cursor_ == fill_buffer_ed_) { - FillEmbeddingBuffer(); - } - memcpy(val, - embedding_buffer_ + - (hbm_ptr_cursor_ - fill_buffer_st_) * value_len_, - dim); - hbm_ptr_cursor_++; - } else { - memcpy(val, value_list_[cursor_].first, dim); - } - } - - virtual void Freq(char* val, int64 dim) { - *((int64*)val) = (*current_freq_list_)[cursor_]; - } - - virtual void Version(char* val, int64 dim) { - *((int64*)val) = (*current_version_list_)[cursor_]; } - virtual void SetPartOffset(int32* part_offset_ptr) { - for (int64 i = 0; i < kSavedPartitionNum + 1; i++) { - part_offset_ptr[i] = part_offset_[i]; + V* Next() { + if (buffer_cursor_ == num_of_embs_) { + FillEmbeddingBuffer(); + buffer_cursor_ = 0; } - } - virtual void SetPartFilterOffset(int32* 
part_offset_ptr) { - for (int64 i = 0; i < kSavedPartitionNum + 1; i++) { - part_offset_ptr[i] = part_filter_offset_[i]; - } + V* val = embedding_buffer_ + value_len_ * buffer_cursor_; + counter_++; + values_iter_++; + buffer_cursor_++; + return val; } private: void FillEmbeddingBuffer() { int64 total_num = std::min( - buffer_capacity_ / value_len_, - (int64)(hbm_ptr_list_.size() - hbm_ptr_cursor_)); - fill_buffer_st_ = hbm_ptr_cursor_; + num_of_embs_, + (int64)(values_.size() - counter_)); + std::vector local_addr_list(total_num); + auto iter = values_iter_; for (int64 i = 0; i < total_num; i++) { - local_addr_list_[i] = hbm_ptr_list_[fill_buffer_st_ + i]; + local_addr_list[i] = *iter; + iter++; } cudaMemcpy(dev_addr_list_, - local_addr_list_, + local_addr_list.data(), sizeof(V*) * total_num, cudaMemcpyHostToDevice); int block_dim = 128; @@ -301,36 +111,19 @@ class HbmDramIterator: public Iterator { dev_embedding_buffer_, sizeof(V) * total_num * value_len_, cudaMemcpyDeviceToHost); - fill_buffer_ed_ = fill_buffer_st_ + total_num; } - - std::vector key_list_; - std::vector> value_list_; - std::vector freq_list_; - std::vector version_list_; - std::vector part_offset_; - std::vector filtered_key_list_; - std::vector filtered_freq_list_; - std::vector filtered_version_list_; - std::vector part_filter_offset_; - std::vector hbm_ptr_list_; - + private: + std::list values_; + typename std::list::iterator values_iter_; const static int64 buffer_capacity_ = 1024 * 1024 * 1; V embedding_buffer_[buffer_capacity_]; + int64 counter_ = 0; + int64 buffer_cursor_ = 0; + int64 value_len_; + int64 num_of_embs_ = 0; + Allocator* alloc_; V** dev_addr_list_; V* dev_embedding_buffer_; - V** local_addr_list_; - Allocator* alloc_; - int64 value_len_; - int64 cursor_; - int64 hbm_ptr_cursor_; - int64 fill_buffer_st_; - int64 fill_buffer_ed_; - int64 emb_index_; - int64 emb_offset_; - std::vector* current_key_list_; - std::vector* current_freq_list_; - std::vector* current_version_list_; }; } // embedding diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index 40108a140cc..71667cf0917 100644 --- a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -30,21 +30,11 @@ template class GPUHashTable; namespace embedding { -class Iterator { + +template +class ValueIterator { public: - Iterator() {}; - virtual ~Iterator() {}; - virtual bool Valid() {return true;}; - virtual void SeekToFirst() {}; - virtual void SwitchToFilteredFeatures() {}; - virtual void SwitchToAdmitFeatures() {}; - virtual void Next() {}; - virtual void Key(char* val, int64 dim) {}; - virtual void Freq(char* val, int64 dim) {}; - virtual void Version(char* val, int64 dim) {}; - virtual void Value(char* val, int64 dim, int64 value_offset) {}; - virtual void SetPartOffset(int32* part_offet_ptr) {}; - virtual void SetPartFilterOffset(int32* part_offet_ptr) {}; + virtual V* Next() = 0; }; template @@ -98,8 +88,6 @@ class KVInterface { virtual std::string DebugString() const = 0; - virtual Iterator* GetIterator() { return nullptr; } - virtual Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, int32 default_v_num, size_t n, const Eigen::GpuDevice& device) { diff --git a/tensorflow/core/framework/embedding/l2weight_shrink_policy.h b/tensorflow/core/framework/embedding/l2weight_shrink_policy.h index 3185cd539ab..2af6b58f94b 100644 --- a/tensorflow/core/framework/embedding/l2weight_shrink_policy.h +++ 
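With the old random-access `Iterator` removed, the tiers now expose the forward-only `ValueIterator` shown above, whose implementations (`HbmValueIterator`, `DBValueIterator`) refill a bounded staging buffer on demand instead of materializing every value at once. A schematic sketch of that pattern, with an invented fetch callback standing in for the cudaMemcpy or LevelDB lookup:

```python
class BufferedValueIterator:
    """Forward-only iterator that stages values in fixed-size batches."""

    def __init__(self, fetch_batch, total, batch_size):
        self._fetch_batch = fetch_batch  # (start, count) -> list of values
        self._total = total
        self._batch_size = batch_size
        self._buffer = []
        self._served = 0

    def next(self):
        if not self._buffer:
            count = min(self._batch_size, self._total - self._served)
            self._buffer = list(self._fetch_batch(self._served, count))
        self._served += 1
        return self._buffer.pop(0)

# Toy backing store standing in for HBM or LevelDB.
store = [float(i) for i in range(10)]
it = BufferedValueIterator(lambda s, n: store[s:s + n], len(store), 4)
print([it.next() for _ in range(10)])  # values streamed 4 at a time
```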
b/tensorflow/core/framework/embedding/l2weight_shrink_policy.h @@ -38,20 +38,19 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { ShrinkPolicy(alloc) {} TF_DISALLOW_COPY_AND_ASSIGN(L2WeightShrinkPolicy); - - void Shrink(const ShrinkArgs& shrink_args) override { + + void Shrink(std::vector& key_list, + std::vector*>& value_list, + const ShrinkArgs& shrink_args) override { ShrinkPolicy::ReleaseValuePtrs(); - std::vector key_list; - std::vector*> value_list; - kv_->GetSnapshot(&key_list, &value_list); FilterToDelete(shrink_args.value_len, key_list, value_list); } private: void FilterToDelete(int64 value_len, - const std::vector& key_list, - const std::vector*>& value_list) { + std::vector& key_list, + std::vector*>& value_list) { for (int64 i = 0; i < key_list.size(); ++i) { V* val = value_list[i]->GetValue(index_, offset_); if (val != nullptr) { @@ -62,6 +61,7 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { l2_weight *= (V)0.5; if (l2_weight < (V)l2_weight_threshold_) { kv_->Remove(key_list[i]); ShrinkPolicy::EmplacePointer(value_list[i]); + value_list[i] = (ValuePtr*)ValuePtrStatus::IS_DELETED; } } diff --git a/tensorflow/core/framework/embedding/leveldb_kv.h index d6dc09b49b4..8ea1fa63fc2 100644 --- a/tensorflow/core/framework/embedding/leveldb_kv.h +++ b/tensorflow/core/framework/embedding/leveldb_kv.h @@ -73,45 +73,6 @@ class SizeCounter { int num_parts_; }; -class DBIterator : public Iterator { - public: - DBIterator(leveldb::Iterator* it):it_(it) {} - virtual ~DBIterator() { - delete it_; - }; - virtual bool Valid() { - return it_->Valid(); - } - virtual void SeekToFirst() { - return it_->SeekToFirst(); - } - virtual void Next() { - return it_->Next(); - } - virtual void Key(char* val, int64 dim) { - memcpy(val, it_->key().ToString().data(), dim); - } - virtual void Value(char* val, int64 dim, int64 value_offset) { - memcpy(val, - it_->value().ToString().data() + - value_offset + sizeof(FixedLengthHeader), dim); - } - virtual void Freq(char* val, int64 dim) { - memcpy(val, - it_->value().ToString().data(), sizeof(FixedLengthHeader)); - *((int64*)val) = - reinterpret_cast(val)->GetFreqCounter(); - } - virtual void Version(char* val, int64 dim) { - memcpy(val, - it_->value().ToString().data(), sizeof(FixedLengthHeader)); - *((int64*)val) = - reinterpret_cast(val)->GetGlobalStep(); - } - private: - leveldb::Iterator* it_; -}; - template class LevelDBKV : public KVInterface { public: @@ -216,14 +177,22 @@ class LevelDBKV : public KVInterface { Status GetSnapshot(std::vector* key_list, std::vector*>* value_ptr_list) override { - return Status::OK(); - } - - Iterator* GetIterator() override { ReadOptions options; options.snapshot = db_->GetSnapshot(); leveldb::Iterator* it = db_->NewIterator(options); - return new DBIterator(it); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + K key; + memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); + key_list->emplace_back(key); + ValuePtr* value_ptr = + new NormalGPUValuePtr(ev_allocator(), 1); + memcpy((char *)value_ptr->GetPtr(), + it->value().ToString().data(), + sizeof(FixedLengthHeader)); + value_ptr_list->emplace_back(value_ptr); + } + delete it; + return Status::OK(); } int64 Size() const override { @@ -247,6 +216,63 @@ class LevelDBKV : public KVInterface { int total_dims_; }; +template +class DBValueIterator: public ValueIterator { + public: + DBValueIterator( + const std::vector& key_list, + int64 emb_index, + int64 value_len, + LevelDBKV* leveldb_kv) + : 
value_len_(value_len), + emb_index_(emb_index), + leveldb_kv_(leveldb_kv) { + int64 emb_offset = value_len_ * emb_index; + std::vector> keys_parts_vec(kSavedPartitionNum); + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + keys_parts_vec[part_id].emplace_back(key_list[i]); + break; + } + } + } + + for (int64 i = 0; i < kSavedPartitionNum; i++) { + keys_.splice(keys_.end(), keys_parts_vec[i]); + } + + keys_iter_= keys_.begin(); + } + + ~DBValueIterator() { + delete value_ptr_; + } + + V* Next() { + if (value_ptr_ != nullptr) { + value_ptr_->Destroy(ev_allocator()); + delete value_ptr_; + } + K key = *(keys_iter_++); + + Status s = leveldb_kv_->Lookup(key, &value_ptr_); + if (!s.ok()) { + LOG(FATAL)<<"Not found value in LevelDB when Save."; + } + return value_ptr_->GetValue(emb_index_, value_len_ * emb_index_); + } + + private: + int64 value_len_; + int64 emb_index_; + LevelDBKV* leveldb_kv_; + std::list keys_; + typename std::list::const_iterator keys_iter_; + ValuePtr* value_ptr_ = nullptr; + int64 key_cursor_ = 0; +}; + } //namespace embedding } //namespace tensorflow diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index ac82f3911fb..ff18425ad9a 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -93,9 +93,9 @@ class MultiTierStorage : public Storage { return Status::OK(); } - embedding::Iterator* GetIterator() { - LOG(FATAL)<<"GetIterator isn't support by MultiTierStorage."; - return nullptr; + Status GetSnapshot(std::vector* key_list, + std::vector*>* value_ptr_list) override { + LOG(FATAL)<<"Can't get snapshot of MultiTierStorage."; } void CopyEmbeddingsFromCPUToGPU( @@ -110,74 +110,11 @@ class MultiTierStorage : public Storage { LOG(FATAL) << "Unsupport CopyEmbeddingsFromCPUToGPU in MultiTierStorage."; }; - void SetListsForCheckpoint( - const std::vector& input_key_list, - const std::vector*>& input_value_ptr_list, - const EmbeddingConfig& emb_config, - std::vector* output_key_list, - std::vector* output_value_list, - std::vector* output_version_list, - std::vector* output_freq_list) { - for (int64 i = 0; i < input_key_list.size(); ++i) { - output_key_list->emplace_back(input_key_list[i]); - - //NormalContiguousValuePtr is used, GetFreq() is valid. - int64 dump_freq = input_value_ptr_list[i]->GetFreq(); - output_freq_list->emplace_back(dump_freq); - - if (emb_config.steps_to_live != 0 || emb_config.record_version) { - int64 dump_version = input_value_ptr_list[i]->GetStep(); - output_version_list->emplace_back(dump_version); - } - - V* val = input_value_ptr_list[i]->GetValue(emb_config.emb_index, - Storage::GetOffset(emb_config.emb_index)); - V* primary_val = input_value_ptr_list[i]->GetValue( - emb_config.primary_emb_index, - Storage::GetOffset(emb_config.primary_emb_index)); - /* Classify features into 3 categories: - 1. filtered - 2. not involved in backward - 3. 
normal - */ - if (primary_val == nullptr) { - output_value_list->emplace_back(nullptr); - } else { - if (val == nullptr) { - output_value_list->emplace_back(reinterpret_cast(-1)); - } else { - output_value_list->emplace_back(val); - } - } - } - } - - virtual int64 GetSnapshotWithoutFetchPersistentEmb( - std::vector* key_list, - std::vector* value_list, - std::vector* version_list, - std::vector* freq_list, - const EmbeddingConfig& emb_config, - SsdRecordDescriptor* ssd_rec_desc) override { - LOG(FATAL)<<"The Storage dosen't use presisten memory" - <<" or this storage hasn't suppported" - <<" GetSnapshotWithoutFetchPersistentEmb yet"; - return -1; - } - Status Contains(K key) override { LOG(FATAL)<<"Contains is not support in MultiTierStorage."; return Status::OK(); } - void iterator_mutex_lock() override { - return; - } - - void iterator_mutex_unlock() override { - return; - } - bool IsMultiLevel() override { return true; } diff --git a/tensorflow/core/framework/embedding/shrink_policy.h b/tensorflow/core/framework/embedding/shrink_policy.h index 13cb51ff30d..ea063a113a3 100644 --- a/tensorflow/core/framework/embedding/shrink_policy.h +++ b/tensorflow/core/framework/embedding/shrink_policy.h @@ -45,7 +45,9 @@ class ShrinkPolicy { TF_DISALLOW_COPY_AND_ASSIGN(ShrinkPolicy); - virtual void Shrink(const ShrinkArgs& shrink_args) = 0; + virtual void Shrink(std::vector& key_list, + std::vector*>& value_list, + const ShrinkArgs& shrink_args) = 0; protected: void EmplacePointer(ValuePtr* value_ptr) { @@ -71,7 +73,9 @@ class NonShrinkPolicy: public ShrinkPolicy { NonShrinkPolicy(): ShrinkPolicy(nullptr) {} TF_DISALLOW_COPY_AND_ASSIGN(NonShrinkPolicy); - void Shrink(const ShrinkArgs& shrink_args) {} + void Shrink(std::vector& key_list, + std::vector*>& value_list, + const ShrinkArgs& shrink_args) override {} }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h index 54bf1f76c14..f9de65df588 100644 --- a/tensorflow/core/framework/embedding/single_tier_storage.h +++ b/tensorflow/core/framework/embedding/single_tier_storage.h @@ -235,73 +235,33 @@ class SingleTierStorage : public Storage { Status GetSnapshot(std::vector* key_list, std::vector*>* value_ptr_list) override { + mutex_lock l(Storage::mu_); return kv_->GetSnapshot(key_list, value_ptr_list); } - virtual int64 GetSnapshot(std::vector* key_list, - std::vector* value_list, - std::vector* version_list, - std::vector* freq_list, + Status Save( + const std::string& tensor_name, + const std::string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, - FilterPolicy>* filter, - embedding::Iterator** it) override { + ShrinkArgs& shrink_args, + int64 value_len, + V* default_value) override { std::vector*> value_ptr_list; std::vector key_list_tmp; - TF_CHECK_OK(kv_->GetSnapshot(&key_list_tmp, &value_ptr_list)); - if (key_list_tmp.empty()) { - *it = kv_->GetIterator(); - return 0; - } - for (int64 i = 0; i < key_list_tmp.size(); ++i) { - V* val = value_ptr_list[i]->GetValue(emb_config.emb_index, - Storage::GetOffset(emb_config.emb_index)); - V* primary_val = value_ptr_list[i]->GetValue( - emb_config.primary_emb_index, - Storage::GetOffset(emb_config.primary_emb_index)); - key_list->emplace_back(key_list_tmp[i]); - if (emb_config.filter_freq != 0 || emb_config.record_freq) { - int64 dump_freq = filter->GetFreq( - key_list_tmp[i], value_ptr_list[i]); - freq_list->emplace_back(dump_freq); - } - if (emb_config.steps_to_live != 0 || 
diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h
index 54bf1f76c14..f9de65df588 100644
--- a/tensorflow/core/framework/embedding/single_tier_storage.h
+++ b/tensorflow/core/framework/embedding/single_tier_storage.h
@@ -235,73 +235,33 @@ class SingleTierStorage : public Storage<K, V> {
 
   Status GetSnapshot(std::vector<K>* key_list,
                      std::vector<ValuePtr<V>*>* value_ptr_list) override {
+    mutex_lock l(Storage<K, V>::mu_);
     return kv_->GetSnapshot(key_list, value_ptr_list);
   }
 
-  virtual int64 GetSnapshot(std::vector<K>* key_list,
-                            std::vector<V*>* value_list,
-                            std::vector<int64>* version_list,
-                            std::vector<int64>* freq_list,
+  Status Save(
+      const std::string& tensor_name,
+      const std::string& prefix,
+      BundleWriter* writer,
       const EmbeddingConfig& emb_config,
-      FilterPolicy<K, V, EmbeddingVar<K, V>>* filter,
-      embedding::Iterator** it) override {
+      ShrinkArgs& shrink_args,
+      int64 value_len,
+      V* default_value) override {
     std::vector<ValuePtr<V>*> value_ptr_list;
     std::vector<K> key_list_tmp;
-    TF_CHECK_OK(kv_->GetSnapshot(&key_list_tmp, &value_ptr_list));
-    if (key_list_tmp.empty()) {
-      *it = kv_->GetIterator();
-      return 0;
-    }
-    for (int64 i = 0; i < key_list_tmp.size(); ++i) {
-      V* val = value_ptr_list[i]->GetValue(emb_config.emb_index,
-          Storage<K, V>::GetOffset(emb_config.emb_index));
-      V* primary_val = value_ptr_list[i]->GetValue(
-          emb_config.primary_emb_index,
-          Storage<K, V>::GetOffset(emb_config.primary_emb_index));
-      key_list->emplace_back(key_list_tmp[i]);
-      if (emb_config.filter_freq != 0 || emb_config.record_freq) {
-        int64 dump_freq = filter->GetFreq(
-            key_list_tmp[i], value_ptr_list[i]);
-        freq_list->emplace_back(dump_freq);
-      }
-      if (emb_config.steps_to_live != 0 ||
-          emb_config.record_version) {
-        int64 dump_version = value_ptr_list[i]->GetStep();
-        version_list->emplace_back(dump_version);
-      }
-      if (val != nullptr && primary_val != nullptr) {
-        value_list->emplace_back(val);
-      } else if (val == nullptr && primary_val != nullptr) {
-        // only forward, no backward
-        value_list->emplace_back(reinterpret_cast<V*>(-1));
-      } else {
-        // feature filtered
-        value_list->emplace_back(nullptr);
-      }
-    }
-    return key_list->size();
-  }
-
-  int64 GetSnapshotWithoutFetchPersistentEmb(
-      std::vector<K>* key_list,
-      std::vector<V*>* value_list,
-      std::vector<int64>* version_list,
-      std::vector<int64>* freq_list,
-      const EmbeddingConfig& emb_config,
-      SsdRecordDescriptor<K>* ssd_rec_desc) override {
-    LOG(FATAL)<<"The Storage dosen't use presisten memory"
-              <<" or this storage hasn't suppported "
-              <<" GetSnapshotWithoutFetchPersistentEmb yet";
-    return -1;
-  }
+    TF_CHECK_OK(kv_->GetSnapshot(
+        &key_list_tmp, &value_ptr_list));
 
-  virtual embedding::Iterator* GetIterator() override {
-    LOG(FATAL)<<"GetIterator isn't support by "<<typeid(this).name();
-    return nullptr;
-  }
-
-  Status Shrink(const ShrinkArgs& shrink_args) override {
-    mutex_lock l(Storage<K, V>::mu_);
-    shrink_policy_->Shrink(shrink_args);
+    TF_CHECK_OK((Storage<K, V>::SaveToCheckpoint(
+        tensor_name, writer,
+        emb_config,
+        value_len, default_value,
+        key_list_tmp,
+        value_ptr_list)));
     return Status::OK();
   }
 
@@ -331,12 +291,8 @@ class SingleTierStorage : public Storage<K, V> {
     return false;
   }
 
-  void iterator_mutex_lock() override {
-    return;
-  }
-
-  void iterator_mutex_unlock() override {
-    return;
+  bool IsUsePersistentStorage() override {
+    return false;
   }
 
   void Schedule(std::function<void()> fn) override {
@@ -366,6 +322,19 @@ class SingleTierStorage : public Storage<K, V> {
                  false/*to_dram*/, is_incr, restore_buff);
     return s;
   }
+
+  virtual void Shrink(std::vector<K>& key_list,
+                      std::vector<ValuePtr<V>*>& value_ptr_list,
+                      ShrinkArgs& shrink_args,
+                      int64 value_len) {
+    mutex_lock l(Storage<K, V>::mu_);
+    shrink_args.value_len = value_len;
+    shrink_policy_->Shrink(
+        key_list,
+        value_ptr_list,
+        shrink_args);
+  }
+
  protected:
   KVInterface<K, V>* kv_;
   ShrinkPolicy<K, V>* shrink_policy_;
@@ -409,6 +378,17 @@ class DramStorage : public SingleTierStorage<K, V> {
   void SetTotalDims(int64 total_dims) override {
     SingleTierStorage<K, V>::kv_->SetTotalDims(total_dims);
   }
+
+  void Shrink(std::vector<K>& key_list,
+              std::vector<ValuePtr<V>*>& value_ptr_list,
+              ShrinkArgs& shrink_args,
+              int64 value_len) override {
+    SingleTierStorage<K, V>::Shrink(
+        key_list,
+        value_ptr_list,
+        shrink_args,
+        value_len);
+  }
 };
 
 #if GOOGLE_CUDA
@@ -449,18 +429,33 @@ class HbmStorage : public SingleTierStorage<K, V> {
                    size_t n, const V* default_v) override {
     SingleTierStorage<K, V>::kv_->BatchLookup(device, keys, val, n, default_v);
   }
-
-  int64 GetSnapshot(std::vector<K>* key_list,
-                    std::vector<V*>* value_list,
-                    std::vector<int64>* version_list,
-                    std::vector<int64>* freq_list,
+
+  Status Save(
+      const string& tensor_name,
+      const string& prefix,
+      BundleWriter* writer,
       const EmbeddingConfig& emb_config,
-      FilterPolicy<K, V, EmbeddingVar<K, V>>* filter,
-      embedding::Iterator** it) override {
+      ShrinkArgs& shrink_args,
+      int64 value_len,
+      V* default_value) override {
+    std::vector<V*> value_ptr_list;
+    std::vector<K> key_list_tmp;
     GPUHashMapKV<K, V>* gpu_kv =
         dynamic_cast<GPUHashMapKV<K, V>*>(SingleTierStorage<K, V>::kv_);
-    gpu_kv->GetSnapshot(key_list, value_list, emb_config);
-    return key_list->size();
+    gpu_kv->GetSnapshot(&key_list_tmp, &value_ptr_list, emb_config);
+
+    TF_CHECK_OK((Storage<K, V>::SaveToCheckpoint(
+        tensor_name, writer,
+        value_len,
+        key_list_tmp,
+        value_ptr_list)));
+
+    if (value_ptr_list.size() > 0) {
+      TypedAllocator::Deallocate(
+          cpu_allocator(), value_ptr_list[0],
+          value_ptr_list.size() * value_len);
+    }
+    return Status::OK();
   }
 
   GPUHashTable<K, V>* HashTable() override {
@@ -532,6 +527,17 @@ class HbmStorageWithCpuKv: public SingleTierStorage<K, V> {
   friend class HbmDramSsdStorage;
 
  protected:
   void SetTotalDims(int64 total_dims) override {}
+
+  void Shrink(std::vector<K>& key_list,
+              std::vector<ValuePtr<V>*>& value_ptr_list,
+              ShrinkArgs& shrink_args,
+              int64 value_len) override {
+    SingleTierStorage<K, V>::Shrink(
+        key_list,
+        value_ptr_list,
+        shrink_args,
+        value_len);
+  }
 };
 #endif  // GOOGLE_CUDA
 
@@ -568,6 +574,17 @@ class PmemLibpmemStorage : public SingleTierStorage<K, V> {
  protected:
   friend class DramPmemStorage;
   void SetTotalDims(int64 total_dims) override {}
+
+  void Shrink(std::vector<K>& key_list,
+              std::vector<ValuePtr<V>*>& value_ptr_list,
+              ShrinkArgs& shrink_args,
+              int64 value_len) override {
+    SingleTierStorage<K, V>::Shrink(
+        key_list,
+        value_ptr_list,
+        shrink_args,
+        value_len);
+  }
 };
 
 template <typename K, typename V>
@@ -585,10 +602,13 @@ class LevelDBStore : public SingleTierStorage<K, V> {
     return SingleTierStorage<K, V>::kv_->Commit(keys, value_ptr);
   }
 
-  embedding::Iterator* GetIterator() override {
+  embedding::ValueIterator<V>* GetValueIterator(
+      const std::vector<K>& key_list,
+      int64 emb_index, int64 value_len) {
     LevelDBKV<K, V>* leveldb_kv =
         reinterpret_cast<LevelDBKV<K, V>*>(SingleTierStorage<K, V>::kv_);
-    return leveldb_kv->GetIterator();
+    return new DBValueIterator<K, V>(
+        key_list, emb_index, value_len, leveldb_kv);
   }
  public:
   friend class DramLevelDBStore;
@@ -614,10 +634,25 @@ class SsdHashStorage : public SingleTierStorage<K, V> {
     return SingleTierStorage<K, V>::kv_->Commit(keys, value_ptr);
   }
 
-  embedding::Iterator* GetIterator() override {
-    SSDHashKV<K, V>* ssd_kv =
-        reinterpret_cast<SSDHashKV<K, V>*>(SingleTierStorage<K, V>::kv_);
-    return ssd_kv->GetIterator();
+  Status Save(
+      const string& tensor_name,
+      const string& prefix,
+      BundleWriter* writer,
+      const EmbeddingConfig& emb_config,
+      ShrinkArgs& shrink_args,
+      int64 value_len,
+      V* default_value) override {
+    if (emb_config.is_primary()) {
+      SSDHashKV<K, V>* ssd_kv =
+          reinterpret_cast<SSDHashKV<K, V>*>(SingleTierStorage<K, V>::kv_);
+      SsdRecordDescriptor<K> ssd_rec_desc;
+      {
+        mutex_lock l(Storage<K, V>::mu_);
+        ssd_kv->SetSsdRecordDescriptor(&ssd_rec_desc);
+      }
+      ssd_rec_desc.GenerateCheckpoint(prefix, tensor_name);
+    }
+    return Status::OK();
   }
 
   void Import(K* key_list, int64* key_file_id_list,
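The `GetValueIterator` change in `LevelDBStore` above is easiest to read as a pull-based contract: instead of handing the saver a fully materialized value list, the DB-backed store hands it an iterator that fetches one value at a time, so embeddings never have to be resident in RAM all at once. The exact interface of `embedding::ValueIterator<V>`/`DBValueIterator` is not shown in this patch, so the following standalone sketch uses assumed, simplified stand-ins (a `std::map` in place of LevelDB, a single `Next()` method).

```cpp
#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

// Assumed shape of a pull-based value iterator; not the real DeepRec API.
template <class V>
struct ValueIterator {
  virtual ~ValueIterator() = default;
  virtual V* Next() = 0;  // next value buffer, in key order
};

template <class K, class V>
class MapValueIterator : public ValueIterator<V> {
 public:
  MapValueIterator(const std::map<K, std::vector<V>>* db,
                   const std::vector<K>& keys)
      : db_(db), keys_(keys) {}
  V* Next() override {
    // A real DBValueIterator would issue a point read here; the map stands
    // in for LevelDB.
    auto it = db_->find(keys_[idx_++]);
    return const_cast<V*>(it->second.data());
  }
 private:
  const std::map<K, std::vector<V>>* db_;
  std::vector<K> keys_;
  size_t idx_ = 0;
};

int main() {
  std::map<int, std::vector<float>> db = {{1, {0.1f, 0.2f}}, {2, {0.3f, 0.4f}}};
  std::vector<int> keys = {1, 2};
  MapValueIterator<int, float> iter(&db, keys);
  for (size_t i = 0; i < keys.size(); ++i) {
    float* v = iter.Next();  // the checkpoint writer would consume this
    std::cout << keys[i] << " -> " << v[0] << ", " << v[1] << "\n";
  }
}
```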
#include "sparsehash/dense_hash_map_lockless" #include "sparsehash/dense_hash_set_lockless" +#include "tensorflow/core/framework/embedding/ssd_record_descriptor.h" #include "tensorflow/core/framework/embedding/emb_file_creator.h" #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/framework/embedding/value_ptr.h" @@ -35,24 +36,6 @@ namespace tensorflow { template class ValuePtr; -template -struct SsdRecordDescriptor { - //prefix of embedding file - tstring file_prefix; - //keys in ssd storage - std::vector key_list; - //file ids of features - std::vector key_file_id_list; - //offsets in the file of features - std::vector key_offset_list; - //files in ssd storage - std::vector file_list; - //number of invalid records in the file - std::vector invalid_record_count_list; - //number of records in the file - std::vector record_count_list; -}; - namespace embedding { class EmbPosition { public: @@ -83,7 +66,7 @@ class EmbPosition { }; template -class SSDIterator : public Iterator { +class SSDIterator { public: SSDIterator(google::dense_hash_map_lockless* hash_map, const std::vector& emb_files, int64 value_len, @@ -271,19 +254,13 @@ class SSDHashKV : public KVInterface { done_ = true; } - Iterator* GetIterator() override { - return new SSDIterator(&hash_map_, emb_files_, val_len_, - write_buffer_); - } - void SetSsdRecordDescriptor(SsdRecordDescriptor* ssd_rec_desc) { mutex_lock l(compact_save_mu_); - auto ssd_iter = - reinterpret_cast*>(GetIterator()); - for (ssd_iter->SeekToFirst(); ssd_iter->Valid(); ssd_iter->Next()) { - ssd_rec_desc->key_list.emplace_back(ssd_iter->Key()); - ssd_rec_desc->key_file_id_list.emplace_back(ssd_iter->FileId()); - ssd_rec_desc->key_offset_list.emplace_back(ssd_iter->Offset()); + SSDIterator ssd_iter(&hash_map_, emb_files_, val_len_, write_buffer_); + for (ssd_iter.SeekToFirst(); ssd_iter.Valid(); ssd_iter.Next()) { + ssd_rec_desc->key_list.emplace_back(ssd_iter.Key()); + ssd_rec_desc->key_file_id_list.emplace_back(ssd_iter.FileId()); + ssd_rec_desc->key_offset_list.emplace_back(ssd_iter.Offset()); } ssd_rec_desc->file_prefix = path_; diff --git a/tensorflow/core/framework/embedding/ssd_record_descriptor.h b/tensorflow/core/framework/embedding/ssd_record_descriptor.h new file mode 100644 index 00000000000..9d015236934 --- /dev/null +++ b/tensorflow/core/framework/embedding/ssd_record_descriptor.h @@ -0,0 +1,148 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_RECORD_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_RECORD_DESCRIPTOR_H_ + +#include +#include +#include +#include + +#include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" +#include "tensorflow/core/framework/embedding/kv_interface.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +namespace embedding { + +template +class SsdRecordDescriptor { + public: + //prefix of embedding file + tstring file_prefix; + //keys in ssd storage + std::vector key_list; + //file ids of features + std::vector key_file_id_list; + //offsets in the file of features + std::vector key_offset_list; + //files in ssd storage + std::vector file_list; + //number of invalid records in the file + std::vector invalid_record_count_list; + //number of records in the file + std::vector record_count_list; + + void GenerateCheckpoint(const std::string& prefix, + const std::string& var_name) { + DumpSsdMeta(prefix, var_name); + CopyEmbeddingFilesToCkptDir(prefix, var_name); + } + + private: + template + void DumpSection(const std::vector& data_vec, + const std::string& section_str, + BundleWriter* writer, + std::vector& dump_buffer) { + EVVectorDataDumpIterator iter(data_vec); + SaveTensorWithFixedBuffer( + section_str, + writer, dump_buffer.data(), + dump_buffer.size(), &iter, + TensorShape({data_vec.size()})); + } + + void DumpSsdMeta(const std::string& prefix, + const std::string& var_name) { + std::fstream fs; + std::string var_name_temp(var_name); + std::string new_str = "_"; + int64 pos = var_name_temp.find("/"); + while (pos != std::string::npos) { + var_name_temp.replace(pos, 1, new_str.data(), 1); + pos = var_name_temp.find("/"); + } + + std::string ssd_record_path = + prefix + "-" + var_name_temp + "-ssd_record"; + BundleWriter ssd_record_writer(Env::Default(), + ssd_record_path); + size_t bytes_limit = 8 << 20; + std::vector dump_buffer(bytes_limit); + + DumpSection(key_list, "keys", + &ssd_record_writer, dump_buffer); + DumpSection(key_file_id_list, "keys_file_id", + &ssd_record_writer, dump_buffer); + DumpSection(key_offset_list, "keys_offset", + &ssd_record_writer, dump_buffer); + DumpSection(file_list, "files", + &ssd_record_writer, dump_buffer); + DumpSection(invalid_record_count_list, "invalid_record_count", + &ssd_record_writer, dump_buffer); + DumpSection(record_count_list, "record_count", + &ssd_record_writer, dump_buffer); + + ssd_record_writer.Finish(); + } + + void CopyEmbeddingFilesToCkptDir( + const std::string& prefix, + const std::string& var_name) { + std::string var_name_temp(var_name); + std::string new_str = "_"; + int64 pos = var_name_temp.find("/"); + while (pos != std::string::npos) { + var_name_temp.replace(pos, 1, new_str.data(), 1); + pos = var_name_temp.find("/"); + } + + std::string embedding_folder_path = + prefix + "-" + var_name_temp + "-emb_files/"; + Status s = Env::Default()->CreateDir(embedding_folder_path); + if (errors::IsAlreadyExists(s)) { + int64 undeleted_files, undeleted_dirs; + Env::Default()-> + DeleteRecursively(embedding_folder_path, + &undeleted_files, + &undeleted_dirs); + Env::Default()->CreateDir(embedding_folder_path); + } + + for (int64 i = 0; i < file_list.size(); i++) { + int64 file_id = file_list[i]; + std::stringstream old_ss; + old_ss << std::setw(4) << std::setfill('0') << file_id << 
".emb"; + std::string file_path = file_prefix + old_ss.str(); + std::string file_name = file_path.substr(file_path.rfind("/")); + std::stringstream new_ss; + new_ss << file_id << ".emb"; + std::string new_file_path = embedding_folder_path + new_ss.str(); + Status s = Env::Default()->CopyFile(file_path, new_file_path); + if (!s.ok()) { + LOG(FATAL)<<"Copy file "< struct EmbeddingVarContext; +namespace { + const int kSavedPartitionNum = 1000; +} namespace embedding { template @@ -97,22 +101,14 @@ class Storage { virtual int64 Size(int level) const = 0; virtual Status GetSnapshot(std::vector* key_list, std::vector*>* value_ptr_list) = 0; - virtual int64 GetSnapshot(std::vector* key_list, - std::vector* value_list, - std::vector* version_list, - std::vector* freq_list, + virtual Status Save( + const string& tensor_name, + const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, - FilterPolicy>* filter, - embedding::Iterator** it) = 0; - virtual int64 GetSnapshotWithoutFetchPersistentEmb( - std::vector* key_list, - std::vector* value_list, - std::vector* version_list, - std::vector* freq_list, - const EmbeddingConfig& emb_config, - SsdRecordDescriptor* ssd_rec_desc) = 0; - virtual embedding::Iterator* GetIterator() = 0; - virtual Status Shrink(const ShrinkArgs& shrink_args) = 0; + ShrinkArgs& shrink_args, + int64 value_len, + V* default_value) = 0; virtual Status BatchCommit(const std::vector& keys, const std::vector*>& value_ptrs) = 0; @@ -146,8 +142,6 @@ class Storage { virtual bool IsUseHbm() = 0; virtual bool IsSingleHbm() = 0; virtual bool IsUsePersistentStorage() { return false; }; - virtual void iterator_mutex_lock() = 0; - virtual void iterator_mutex_unlock() = 0; virtual void Schedule(std::function fn) = 0; virtual void CreateEmbeddingMemoryPool( Allocator* alloc, @@ -274,6 +268,95 @@ class Storage { return Status::OK(); } + private: + void GeneratePartitionedCkptData( + const std::vector& key_list, + const std::vector*>& value_ptr_list, + EmbeddingVarCkptData* partitioned_ckpt_data, + const EmbeddingConfig& emb_config, + V* default_value) { + std::vector> + ev_ckpt_data_parts(kSavedPartitionNum); + + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar( + "TF_EV_SAVE_FILTERED_FEATURES", true, &save_unfiltered_features)); + + bool is_save_freq = emb_config.is_save_freq(); + bool is_save_version = emb_config.is_save_version(); + + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + ev_ckpt_data_parts[part_id].Emplace( + key_list[i], value_ptr_list[i], + emb_config, default_value, + GetOffset(emb_config.emb_index), + is_save_freq, + is_save_version, + save_unfiltered_features); + break; + } + } + } + + partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts); + } + + void GeneratePartitionedCkptData( + const std::vector& key_list, + const std::vector& value_ptr_list, + EmbeddingVarCkptData* partitioned_ckpt_data) { + std::vector> + ev_ckpt_data_parts(kSavedPartitionNum); + + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + ev_ckpt_data_parts[part_id].Emplace( + key_list[i], value_ptr_list[i]); + break; + } + } + } + + partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts); + } + + protected: + Status SaveToCheckpoint( + const string& tensor_name, + BundleWriter* writer, + const EmbeddingConfig& emb_config, + 
diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h
--- a/tensorflow/core/framework/embedding/storage.h
+++ b/tensorflow/core/framework/embedding/storage.h
 template <typename V> struct EmbeddingVarContext;
+namespace {
+  const int kSavedPartitionNum = 1000;
+}
 namespace embedding {
 template <typename K, typename V>
@@ -97,22 +101,14 @@ class Storage {
   virtual int64 Size(int level) const = 0;
   virtual Status GetSnapshot(std::vector<K>* key_list,
       std::vector<ValuePtr<V>*>* value_ptr_list) = 0;
-  virtual int64 GetSnapshot(std::vector<K>* key_list,
-      std::vector<V*>* value_list,
-      std::vector<int64>* version_list,
-      std::vector<int64>* freq_list,
+  virtual Status Save(
+      const string& tensor_name,
+      const string& prefix,
+      BundleWriter* writer,
       const EmbeddingConfig& emb_config,
-      FilterPolicy<K, V, EmbeddingVar<K, V>>* filter,
-      embedding::Iterator** it) = 0;
-  virtual int64 GetSnapshotWithoutFetchPersistentEmb(
-      std::vector<K>* key_list,
-      std::vector<V*>* value_list,
-      std::vector<int64>* version_list,
-      std::vector<int64>* freq_list,
-      const EmbeddingConfig& emb_config,
-      SsdRecordDescriptor<K>* ssd_rec_desc) = 0;
-  virtual embedding::Iterator* GetIterator() = 0;
-  virtual Status Shrink(const ShrinkArgs& shrink_args) = 0;
+      ShrinkArgs& shrink_args,
+      int64 value_len,
+      V* default_value) = 0;
 
   virtual Status BatchCommit(const std::vector<K>& keys,
       const std::vector<ValuePtr<V>*>& value_ptrs) = 0;
@@ -146,8 +142,6 @@ class Storage {
   virtual bool IsUseHbm() = 0;
   virtual bool IsSingleHbm() = 0;
   virtual bool IsUsePersistentStorage() { return false; };
-  virtual void iterator_mutex_lock() = 0;
-  virtual void iterator_mutex_unlock() = 0;
   virtual void Schedule(std::function<void()> fn) = 0;
   virtual void CreateEmbeddingMemoryPool(
       Allocator* alloc,
@@ -274,6 +268,95 @@ class Storage {
     return Status::OK();
   }
 
+ private:
+  void GeneratePartitionedCkptData(
+      const std::vector<K>& key_list,
+      const std::vector<ValuePtr<V>*>& value_ptr_list,
+      EmbeddingVarCkptData<K, V>* partitioned_ckpt_data,
+      const EmbeddingConfig& emb_config,
+      V* default_value) {
+    std::vector<EmbeddingVarCkptData<K, V>>
+        ev_ckpt_data_parts(kSavedPartitionNum);
+
+    bool save_unfiltered_features = true;
+    TF_CHECK_OK(ReadBoolFromEnvVar(
+        "TF_EV_SAVE_FILTERED_FEATURES", true, &save_unfiltered_features));
+
+    bool is_save_freq = emb_config.is_save_freq();
+    bool is_save_version = emb_config.is_save_version();
+
+    for (int64 i = 0; i < key_list.size(); i++) {
+      for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) {
+        if (key_list[i] % kSavedPartitionNum == part_id) {
+          ev_ckpt_data_parts[part_id].Emplace(
+              key_list[i], value_ptr_list[i],
+              emb_config, default_value,
+              GetOffset(emb_config.emb_index),
+              is_save_freq,
+              is_save_version,
+              save_unfiltered_features);
+          break;
+        }
+      }
+    }
+
+    partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts);
+  }
+
+  void GeneratePartitionedCkptData(
+      const std::vector<K>& key_list,
+      const std::vector<V*>& value_ptr_list,
+      EmbeddingVarCkptData<K, V>* partitioned_ckpt_data) {
+    std::vector<EmbeddingVarCkptData<K, V>>
+        ev_ckpt_data_parts(kSavedPartitionNum);
+
+    for (int64 i = 0; i < key_list.size(); i++) {
+      for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) {
+        if (key_list[i] % kSavedPartitionNum == part_id) {
+          ev_ckpt_data_parts[part_id].Emplace(
+              key_list[i], value_ptr_list[i]);
+          break;
+        }
+      }
+    }
+
+    partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts);
+  }
+
+ protected:
+  Status SaveToCheckpoint(
+      const string& tensor_name,
+      BundleWriter* writer,
+      const EmbeddingConfig& emb_config,
+      int64 value_len,
+      V* default_value,
+      const std::vector<K>& key_list,
+      const std::vector<ValuePtr<V>*>& value_ptr_list,
+      ValueIterator<V>* value_iter = nullptr) {
+    EmbeddingVarCkptData<K, V> partitioned_ckpt_data;
+    GeneratePartitionedCkptData(key_list, value_ptr_list,
+                                &partitioned_ckpt_data, emb_config,
+                                default_value);
+    Status s =
+        partitioned_ckpt_data.ExportToCkpt(
+            tensor_name, writer, value_len, value_iter);
+    return s;
+  }
+
+  Status SaveToCheckpoint(
+      const string& tensor_name,
+      BundleWriter* writer,
+      int64 value_len,
+      const std::vector<K>& key_list,
+      const std::vector<V*>& value_ptr_list) {
+    EmbeddingVarCkptData<K, V> partitioned_ckpt_data;
+    GeneratePartitionedCkptData(key_list, value_ptr_list,
+                                &partitioned_ckpt_data);
+    Status s =
+        partitioned_ckpt_data.ExportToCkpt(tensor_name, writer, value_len);
+    return s;
+  }
+
  protected:
   int64 alloc_len_ = 0;
   int64 total_dims_ = 0;
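The mod-1000 bucketing in `GeneratePartitionedCkptData` above is what lets a checkpoint be restored under a different partition count: keys are grouped by `key % kSavedPartitionNum`, and the per-part sizes become a prefix-sum offset table (the `-partition_offset` tensor in the bundle). A minimal standalone sketch, using the same ids as the partition-offset Python test later in this series:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int kSavedPartitionNum = 1000;
  // Ids from the partition-offset test: 0,1,1000,1001,2,1002.
  std::vector<int64_t> keys = {0, 1, 1000, 1001, 2, 1002};

  // Bucket by key % kSavedPartitionNum, as GeneratePartitionedCkptData does.
  std::vector<std::vector<int64_t>> parts(kSavedPartitionNum);
  for (int64_t k : keys) parts[k % kSavedPartitionNum].push_back(k);

  // Prefix-sum of per-part sizes: this is the "-partition_offset" table.
  std::vector<int> part_offset(kSavedPartitionNum + 1, 0);
  for (int p = 0; p < kSavedPartitionNum; ++p)
    part_offset[p + 1] = part_offset[p] + parts[p].size();

  // Prints 0 2 4 6 6: matches the test's assertions on part_offset[0..3+].
  for (int p = 0; p <= 4; ++p) std::cout << part_offset[p] << " ";
  std::cout << "\n";
}
```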
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 23d12c295ca..fc1b2cd9c67 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2907,7 +2907,7 @@ tf_kernel_library(
     hdrs = ["kv_variable_ops.h"],
     srcs = ["kv_variable_ops.cc",
             "kv_variable_lookup_ops.cc",
-            "kv_variable_save_restore_ops.cc"],
+            "kv_variable_restore_ops.cc"],
     copts = tf_copts() + ["-g"],
     deps = [
         ":bounds_check",
@@ -5453,14 +5453,14 @@ tf_kernel_library(
          "group_embedding/group_embedding_lookup_sparse_backward_ops.cu.cc",],
    deps = ["//third_party/eigen3",
            "//tensorflow/core/kernels:gpu_device_array",
+           "//tensorflow/core/util/tensor_bundle",
            ":training_op_helpers",
            ":variable_ops",
+           ":save_restore_tensor",
            "//tensorflow/core:embedding_gpu",
            "@sparsehash_c11//:dense_hash_map",
            "@libcuckoo//:libcuckoo",
            ":unique_ali_op",
-           ":save_restore_tensor",
-           "//tensorflow/core/util/tensor_bundle",
            "@com_github_google_leveldb//:leveldb",] + DYNAMIC_DEPS + mkl_deps() +
           if_cuda(["@cub_archive//:cub",
            ":fused_embedding_common_cuh",
diff --git a/tensorflow/core/kernels/embedding_variable_ops_test.cc b/tensorflow/core/kernels/embedding_variable_ops_test.cc
index 408a2bfd16c..eff4b77c2dc 100644
--- a/tensorflow/core/kernels/embedding_variable_ops_test.cc
+++ b/tensorflow/core/kernels/embedding_variable_ops_test.cc
@@ -118,86 +118,6 @@ std::vector<string> AllTensorKeys(BundleReader* reader) {
   return ret;
 }
 
-TEST(TensorBundleTest, TestEVShrinkL2) {
-  int64 value_size = 3;
-  int64 insert_num = 5;
-  Tensor value(DT_FLOAT, TensorShape({value_size}));
-  test::FillValues<float>(&value, std::vector<float>(value_size, 1.0));
-  //float* fill_v = (float*)malloc(value_size * sizeof(float));
-  EmbeddingConfig emb_config =
-      EmbeddingConfig(0, 0, 1, 1, "", 0, 0, 99999, 14.0);
-  auto storage = embedding::StorageFactory::Create<int64, float>(
-      embedding::StorageConfig(
-          StorageType::DRAM,
-          "", {1024, 1024, 1024, 1024},
-          "light",
-          emb_config),
-      cpu_allocator(),
-      "name");
-  auto emb_var = new EmbeddingVar<int64, float>("name",
-      storage, emb_config,
-      cpu_allocator());
-  emb_var ->Init(value, 1);
-
-  for (int64 i=0; i < insert_num; ++i) {
-    ValuePtr<float>* value_ptr = nullptr;
-    Status s = emb_var->LookupOrCreateKey(i, &value_ptr);
-    typename TTypes<float>::Flat vflat = emb_var->flat(value_ptr, i);
-    vflat += vflat.constant((float)i);
-  }
-
-  int size = emb_var->Size();
-  embedding::ShrinkArgs shrink_args;
-  emb_var->Shrink(shrink_args);
-  LOG(INFO) << "Before shrink size:" << size;
-  LOG(INFO) << "After shrink size:" << emb_var->Size();
-
-  ASSERT_EQ(emb_var->Size(), 2);
-}
-
-TEST(TensorBundleTest, TestEVShrinkLockless) {
-
-  int64 value_size = 64;
-  int64 insert_num = 30;
-  Tensor value(DT_FLOAT, TensorShape({value_size}));
-  test::FillValues<float>(&value, std::vector<float>(value_size, 9.0));
-  float* fill_v = (float*)malloc(value_size * sizeof(float));
-
-  int steps_to_live = 5;
-  EmbeddingConfig emb_config = EmbeddingConfig(0, 0, 1, 1, "", steps_to_live);
-  auto storage = embedding::StorageFactory::Create<int64, float>(
-      embedding::StorageConfig(
-          StorageType::DRAM,
-          "", {1024, 1024, 1024, 1024},
-          "normal",
-          emb_config),
-      cpu_allocator(),
-      "name");
-  auto emb_var = new EmbeddingVar<int64, float>("name",
-      storage, emb_config,
-      cpu_allocator());
-  emb_var ->Init(value, 1);
-  LOG(INFO) << "size:" << emb_var->Size();
-
-  for (int64 i=0; i < insert_num; ++i) {
-    ValuePtr<float>* value_ptr = nullptr;
-    Status s = emb_var->LookupOrCreateKey(i, &value_ptr);
-    typename TTypes<float>::Flat vflat = emb_var->flat(value_ptr, i);
-    emb_var->UpdateVersion(value_ptr, i);
-  }
-
-  int size = emb_var->Size();
-  embedding::ShrinkArgs shrink_args;
-  shrink_args.global_step = insert_num;
-  emb_var->Shrink(shrink_args);
-
-  LOG(INFO) << "Before shrink size:" << size;
-  LOG(INFO) << "After shrink size: " << emb_var->Size();
-
-  ASSERT_EQ(size, insert_num);
-  ASSERT_EQ(emb_var->Size(), steps_to_live);
-}
-
 TEST(EmbeddingVariableTest, TestEmptyEV) {
   int64 value_size = 8;
   Tensor value(DT_FLOAT, TensorShape({value_size}));
@@ -213,7 +133,9 @@ TEST(EmbeddingVariableTest, TestEmptyEV) {
     Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1}));
 
     BundleWriter writer(Env::Default(), Prefix("foo"));
-    DumpEmbeddingValues(variable, "var/part_0", &writer, &part_offset_tensor);
+    embedding::ShrinkArgs shrink_args;
+    shrink_args.global_step = 1;
+    variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args);
     TF_ASSERT_OK(writer.Finish());
 
     {
@@ -288,7 +210,9 @@ TEST(EmbeddingVariableTest, TestEVExportSmallLockless) {
   LOG(INFO) << "size:" << variable->Size();
 
   BundleWriter writer(Env::Default(), Prefix("foo"));
-  DumpEmbeddingValues(variable, "var/part_0", &writer, &part_offset_tensor);
+  embedding::ShrinkArgs shrink_args;
+  shrink_args.global_step = 1;
+  variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args);
   TF_ASSERT_OK(writer.Finish());
 
   {
@@ -364,7 +288,9 @@ TEST(EmbeddingVariableTest, TestEVExportLargeLockless) {
   LOG(INFO) << "size:" << variable->Size();
 
   BundleWriter writer(Env::Default(), Prefix("foo"));
-  DumpEmbeddingValues(variable, "var/part_0", &writer, &part_offset_tensor);
+  embedding::ShrinkArgs shrink_args;
+  shrink_args.global_step = 1;
+  variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args);
   TF_ASSERT_OK(writer.Finish());
 
   {
@@ -444,15 +370,7 @@ TEST(EmbeddingVariableTest, TestMultiInsertion) {
     t.join();
   }
 
-  std::vector<int64> tot_key_list;
-  std::vector<float*> tot_valueptr_list;
-  std::vector<int64> tot_version_list;
-  std::vector<int64> tot_freq_list;
-  embedding::Iterator* it = nullptr;
-  int64 total_size = variable->GetSnapshot(&tot_key_list, &tot_valueptr_list, &tot_version_list, &tot_freq_list, &it);
-
   ASSERT_EQ(variable->Size(), 5);
-  ASSERT_EQ(variable->Size(), total_size);
 }
 
 void InsertAndLookup(EmbeddingVar<int64, float>* variable,
@@ -511,9 +429,7 @@ TEST(EmbeddingVariableTest, TestBloomFilter) {
   std::vector<int64> version_list;
   std::vector<int64> freq_list;
 
-  embedding::Iterator* it = nullptr;
-  var->GetSnapshot(&keylist, &valuelist, &version_list, &freq_list, &it);
-  ASSERT_EQ(var->Size(), keylist.size());
+  ASSERT_EQ(var->Size(), 1);
 }
 
 TEST(EmbeddingVariableTest, TestBloomCounterInt64) {
@@ -1093,63 +1009,6 @@ TEST(EmbeddingVariableTest, TestSizeDBKV) {
   LOG(INFO) << "2 size:" << hashmap->Size();
 }
 
-TEST(EmbeddingVariableTest, TestSSDIterator) {
-  std::string temp_dir = testing::TmpDir();
-  Allocator* alloc = ev_allocator();
-  auto hashmap = new SSDHashKV<int64, float>(temp_dir, alloc);
-  hashmap->SetTotalDims(126);
-  ASSERT_EQ(hashmap->Size(), 0);
-  std::vector<ValuePtr<float>*> value_ptrs;
-  for (int64 i = 0; i < 10; ++i) {
-    auto tmp = new NormalContiguousValuePtr<float>(alloc, 126);
-    tmp->SetValue((float)i, 126);
-    value_ptrs.emplace_back(tmp);
-  }
-  for (int64 i = 0; i < 10; i++) {
-    hashmap->Commit(i, value_ptrs[i]);
-  }
-  embedding::Iterator* it = hashmap->GetIterator();
-  int64 index = 0;
-  float val_p[126] = {0.0};
-  for (it->SeekToFirst(); it->Valid(); it->Next()) {
-    int64 key = -1;
-    it->Key((char*)&key, sizeof(int64));
-    it->Value((char*)val_p, 126 * sizeof(float), 0);
-    ASSERT_EQ(key, index);
-    for (int i = 0; i < 126; i++)
-      ASSERT_EQ(val_p[i], key);
-    index++;
-  }
-}
-
-TEST(EmbeddingVariableTest, TestLevelDBIterator) {
-  auto hashmap = new LevelDBKV<int64, float>(testing::TmpDir());
-  hashmap->SetTotalDims(126);
-  ASSERT_EQ(hashmap->Size(), 0);
-  std::vector<ValuePtr<float>*> value_ptrs;
-  for (int64 i = 0; i < 10; ++i) {
-    ValuePtr<float>* tmp =
-        new NormalContiguousValuePtr<float>(ev_allocator(), 126);
-    tmp->SetValue((float)i, 126);
-    value_ptrs.emplace_back(tmp);
-  }
-  for (int64 i = 0; i < 10; i++) {
-    hashmap->Commit(i, value_ptrs[i]);
-  }
-  embedding::Iterator* it = hashmap->GetIterator();
-  int64 index = 0;
-  float val_p[126] = {0.0};
-  for (it->SeekToFirst(); it->Valid(); it->Next()) {
-    int64 key = -1;
-    it->Key((char*)&key, sizeof(int64));
-    it->Value((char*)val_p, 126 * sizeof(float), 0);
-    ASSERT_EQ(key, index);
-    for (int i = 0; i < 126; i++)
-      ASSERT_EQ(val_p[i], key);
-    index++;
-  }
-}
-
 TEST(EmbeddingVariableTest, TestLRUCachePrefetch) {
   BatchCache<int64>* cache = new LRUCache<int64>();
   int num_ids = 5;
diff --git a/tensorflow/core/kernels/embedding_variable_performance_test.cc b/tensorflow/core/kernels/embedding_variable_performance_test.cc
index ee04b4468f6..9b01e35840b 100644
--- a/tensorflow/core/kernels/embedding_variable_performance_test.cc
+++ b/tensorflow/core/kernels/embedding_variable_performance_test.cc
@@ -354,17 +354,10 @@ void PerfSave(Tensor& default_value,
   BundleWriter writer(Env::Default(), Prefix("foo"));
   timespec start, end;
   double total_time = 0.0;
-  if (steps_to_live != 0 || l2_weight_threshold != -1.0) {
-    clock_gettime(CLOCK_MONOTONIC, &start);
-    embedding::ShrinkArgs shrink_args;
-    shrink_args.global_step = 100;
-    ev->Shrink(shrink_args);
-    clock_gettime(CLOCK_MONOTONIC, &end);
-    total_time += (double)(end.tv_sec - start.tv_sec) *
-        1000000000 + end.tv_nsec - start.tv_nsec;
-  }
+  embedding::ShrinkArgs shrink_args;
+  shrink_args.global_step = 100;
   clock_gettime(CLOCK_MONOTONIC, &start);
-  DumpEmbeddingValues(ev, "var", &writer, &part_offset_tensor);
+  ev->Save("var", Prefix("foo"), &writer, shrink_args);
   clock_gettime(CLOCK_MONOTONIC, &end);
   total_time += (double)(end.tv_sec - start.tv_sec) *
       1000000000 + end.tv_nsec - start.tv_nsec;
diff --git a/tensorflow/core/kernels/kv_variable_ops.h b/tensorflow/core/kernels/kv_variable_ops.h
index b6b29acbedc..8e3572443ba 100644
--- a/tensorflow/core/kernels/kv_variable_ops.h
+++ b/tensorflow/core/kernels/kv_variable_ops.h
@@ -167,426 +167,6 @@ Status GetInputEmbeddingVar(OpKernelContext* ctx, int input,
   }
 }
 
-template <class K>
-void DumpSsdIndexMeta(
-    SsdRecordDescriptor<K>& ssd_rec_desc,
-    const std::string& prefix,
-    const std::string& var_name) {
-  std::fstream fs;
-  std::string var_name_temp(var_name);
-  std::string new_str = "_";
-  int64 pos = var_name_temp.find("/");
-  while (pos != std::string::npos) {
-    var_name_temp.replace(pos, 1, new_str.data(), 1);
-    pos =var_name_temp.find("/");
-  }
-
-  std::string ssd_record_path =
-      prefix + "-" + var_name_temp + "-ssd_record";
-
-  BundleWriter ssd_record_writer(Env::Default(),
-                                 ssd_record_path);
-  typedef EVFreqDumpIterator<int64> Int64DataDumpIterator;
-  size_t bytes_limit = 8 << 20;
-  char* dump_buffer = new char[bytes_limit];
-
-  int64 num_of_keys = ssd_rec_desc.key_list.size();
-  EVKeyDumpIterator<K> keys_iter(ssd_rec_desc.key_list);
-  SaveTensorWithFixedBuffer(
-      "keys",
-      &ssd_record_writer, dump_buffer,
-      bytes_limit, &keys_iter,
-      TensorShape({num_of_keys}));
-
-  Int64DataDumpIterator key_file_id_iter(ssd_rec_desc.key_file_id_list);
-  SaveTensorWithFixedBuffer(
-      "keys_file_id",
-      &ssd_record_writer, dump_buffer,
-      bytes_limit, &key_file_id_iter,
-      TensorShape({num_of_keys}));
-
-  Int64DataDumpIterator key_offset_iter(ssd_rec_desc.key_offset_list);
-  SaveTensorWithFixedBuffer(
-      "keys_offset",
-      &ssd_record_writer, dump_buffer,
-      bytes_limit, &key_offset_iter,
-      TensorShape({num_of_keys}));
-
-  int64 num_of_files = ssd_rec_desc.file_list.size();
-  Int64DataDumpIterator files_iter(ssd_rec_desc.file_list);
-  SaveTensorWithFixedBuffer(
-      "files",
-      &ssd_record_writer, dump_buffer,
-      bytes_limit, &files_iter,
-      TensorShape({num_of_files}));
-
-  Int64DataDumpIterator
-      invalid_record_count_iter(ssd_rec_desc.invalid_record_count_list);
-  SaveTensorWithFixedBuffer(
-      "invalid_record_count",
-      &ssd_record_writer, dump_buffer,
-      bytes_limit, &invalid_record_count_iter,
-      TensorShape({num_of_files}));
-
-  Int64DataDumpIterator
-      record_count_iter(ssd_rec_desc.record_count_list);
-  SaveTensorWithFixedBuffer(
-      "record_count",
-      &ssd_record_writer, dump_buffer,
-      bytes_limit, &record_count_iter,
-      TensorShape({num_of_files}));
-
-  ssd_record_writer.Finish();
-  delete[] dump_buffer;
-}
-
-template <class K>
-void CopyEmbeddingfilesToCkptDir(
-    const SsdRecordDescriptor<K>& ssd_rec_desc,
-    const std::string& prefix,
-    const std::string& var_name) {
-  std::string var_name_temp(var_name);
-  std::string new_str = "_";
-  int64 pos = var_name_temp.find("/");
-  while (pos != std::string::npos) {
-    var_name_temp.replace(pos, 1, new_str.data(), 1);
-    pos =var_name_temp.find("/");
-  }
-
-  std::string embedding_folder_path =
-      prefix + "-" + var_name_temp + "-emb_files/";
-  Status s = Env::Default()->CreateDir(embedding_folder_path);
-  if (errors::IsAlreadyExists(s)) {
-    int64 undeleted_files, undeleted_dirs;
-    Env::Default()->
-        DeleteRecursively(embedding_folder_path,
-                          &undeleted_files,
-                          &undeleted_dirs);
-    Env::Default()->CreateDir(embedding_folder_path);
-  }
-
-  for (int64 i = 0; i < ssd_rec_desc.file_list.size(); i++) {
-    int64 file_id = ssd_rec_desc.file_list[i];
-    std::stringstream old_ss;
-    old_ss << std::setw(4) << std::setfill('0') << file_id << ".emb";
-    std::string file_path = ssd_rec_desc.file_prefix + old_ss.str();
-    std::string file_name = file_path.substr(file_path.rfind("/"));
-    std::stringstream new_ss;
-    new_ss << file_id << ".emb";
-    std::string new_file_path = embedding_folder_path + new_ss.str();
-    Status s = Env::Default()->CopyFile(file_path, new_file_path);
-    if (!s.ok()) {
-      LOG(FATAL)<<"Copy file "<<file_path<<" failed!";
-    }
-  }
-}
-
-template <typename K, typename V>
-Status DumpEmbeddingValues(EmbeddingVar<K, V>* ev,
-    const string& tensor_key, BundleWriter* writer,
-    Tensor* part_offset_tensor,
-    const std::string& prefix = "") {
-  std::vector<K> tot_key_list;
-  std::vector<V*> tot_valueptr_list;
-  std::vector<int64> tot_version_list;
-  std::vector<int64> tot_freq_list;
-  std::vector<K> tot_key_filter_list;
-  std::vector<int64> tot_freq_filter_list;
-  std::vector<int64> tot_version_filter_list;
-  embedding::Iterator* it = nullptr;
-  int64 num_of_keys = 0;
-  //For the time being, only ev which uses SSD for storage,
-  //ev->IsUsePersistentStorage() will get true.
-  if (ev->IsUsePersistentStorage()) {
-    SsdRecordDescriptor<K> ssd_rec_desc;
-    num_of_keys =
-        ev->GetSnapshotWithoutFetchPersistentEmb(
-            &tot_key_list,
-            &tot_valueptr_list,
-            &tot_version_list,
-            &tot_freq_list,
-            &ssd_rec_desc);
-    bool is_primary = (ev->GetEmbeddingIndex() == 0);
-    if (is_primary) {
-      DumpSsdIndexMeta(ssd_rec_desc, prefix, tensor_key);
-      CopyEmbeddingfilesToCkptDir(ssd_rec_desc, prefix, tensor_key);
-    }
-  } else {
-    num_of_keys = ev->GetSnapshot(
-        &tot_key_list,
-        &tot_valueptr_list,
-        &tot_version_list,
-        &tot_freq_list, &it);
-  }
-
-  VLOG(1) << "EV:" << tensor_key << ", save size:" << num_of_keys;
-  int64 iterator_size = 0;
-  int64 filter_iterator_size = 0;
-  if (it != nullptr) {
-    it->SwitchToAdmitFeatures();
-    ev->storage()->iterator_mutex_lock();
-    for (it->SeekToFirst(); it->Valid(); it->Next()) {
-      ++iterator_size;
-    }
-    it->SwitchToFilteredFeatures();
-    for (it->SeekToFirst(); it->Valid(); it->Next()) {
-      ++filter_iterator_size;
-    }
-  }
-
-  std::vector<std::vector<K> > key_list_parts;
-  std::vector<std::vector<V*> > valueptr_list_parts;
-  std::vector<std::vector<int64> > version_list_parts;
-  std::vector<std::vector<int64> > freq_list_parts;
-
-  std::vector<std::vector<K> > key_filter_list_parts;
-  std::vector<std::vector<int64> > version_filter_list_parts;
-  std::vector<std::vector<int64> > freq_filter_list_parts;
-
-  std::vector<K> partitioned_tot_key_list;
-  std::vector<V*> partitioned_tot_valueptr_list;
-  std::vector<int64> partitioned_tot_version_list;
-  std::vector<int64> partitioned_tot_freq_list;
-  std::vector<K> partitioned_tot_key_filter_list;
-  std::vector<int64> partitioned_tot_version_filter_list;
-  std::vector<int64> partitioned_tot_freq_filter_list;
-  std::vector<int64> part_filter_offset;
-
-  key_list_parts.resize(kSavedPartitionNum);
-  valueptr_list_parts.resize(kSavedPartitionNum);
-  version_list_parts.resize(kSavedPartitionNum);
-  freq_list_parts.resize(kSavedPartitionNum);
-  key_filter_list_parts.resize(kSavedPartitionNum);
-  version_filter_list_parts.resize(kSavedPartitionNum);
-  freq_filter_list_parts.resize(kSavedPartitionNum);
-  part_filter_offset.resize(kSavedPartitionNum + 1);
-  //partitioned_tot_key_list.resize(tot_key_list.size());
-  //partitioned_tot_valueptr_list.resize(tot_valueptr_list.size());
-
-  // save the ev with kSavedPartitionNum piece of tensor
-  // so that we can dynamically load ev with changed partition number
-  bool save_unfiltered_features = true;
-  TF_CHECK_OK(ReadBoolFromEnvVar(
-      "TF_EV_SAVE_FILTERED_FEATURES", true, &save_unfiltered_features));
-  int64 filter_freq = ev->MinFreq();
-  for (size_t i = 0; i < tot_key_list.size(); i++) {
-    for (int partid = 0; partid < kSavedPartitionNum; partid++) {
-      if (tot_key_list[i] % kSavedPartitionNum == partid) {
-        if (tot_valueptr_list[i] == reinterpret_cast<V*>(-1)) {
-          // only forward, no backward, bypass
-        } else if (tot_valueptr_list[i] == nullptr) {
-          if (filter_freq != 0) {
-            if (save_unfiltered_features) {
-              key_filter_list_parts[partid].push_back(tot_key_list[i]);
-            }
-          } else {
-            key_list_parts[partid].push_back(tot_key_list[i]);
-            valueptr_list_parts[partid].push_back(
-                ev->GetDefaultValue(tot_key_list[i]));
-          }
-        } else {
-          key_list_parts[partid].push_back(tot_key_list[i]);
-          valueptr_list_parts[partid].push_back(tot_valueptr_list[i]);
-        }
-        break;
-      }
-    }
-  }
-
-  for (size_t i = 0; i < tot_version_list.size(); i++) {
-    for (int partid = 0; partid < kSavedPartitionNum; partid++) {
-      if (tot_key_list[i] % kSavedPartitionNum == partid) {
-        if (tot_valueptr_list[i] == reinterpret_cast<V*>(-1)) {
-          // only forward, no backward, bypass
-        } else if (tot_valueptr_list[i] == nullptr) {
-          if (filter_freq != 0) {
-            if (save_unfiltered_features) {
-              version_filter_list_parts[partid].push_back(tot_version_list[i]);
-            }
-          } else {
-            version_list_parts[partid].push_back(tot_version_list[i]);
-          }
-        } else {
-          version_list_parts[partid].push_back(tot_version_list[i]);
-        }
-        break;
-      }
-    }
-  }
-
-  for (size_t i = 0; i < tot_freq_list.size(); i++) {
-    for (int partid = 0; partid < kSavedPartitionNum; partid++) {
-      if (tot_key_list[i] % kSavedPartitionNum == partid) {
-        if (tot_valueptr_list[i] == reinterpret_cast<V*>(-1)) {
-          // only forward, no backward, bypass
-        } else if (tot_valueptr_list[i] == nullptr) {
-          if (filter_freq != 0) {
-            if (save_unfiltered_features) {
-              freq_filter_list_parts[partid].push_back(tot_freq_list[i]);
-            }
-          } else {
-            freq_list_parts[partid].push_back(tot_freq_list[i]);
-          }
-        } else {
-          freq_list_parts[partid].push_back(tot_freq_list[i]);
-        }
-        break;
-      }
-    }
-  }
-
-  auto part_offset_flat = part_offset_tensor->flat<int32>();
-  part_offset_flat(0) = 0;
-  part_filter_offset[0] = 0;
-  int ptsize = 0;
-  for (int partid = 0; partid < kSavedPartitionNum; partid++) {
-    std::vector<K>& key_list = key_list_parts[partid];
-    std::vector<V*>& valueptr_list = valueptr_list_parts[partid];
-    std::vector<int64>& version_list = version_list_parts[partid];
-    std::vector<int64>& freq_list = freq_list_parts[partid];
-    std::vector<K>& key_filter_list = key_filter_list_parts[partid];
-    std::vector<int64>& version_filter_list =
-        version_filter_list_parts[partid];
-    std::vector<int64>& freq_filter_list = freq_filter_list_parts[partid];
-
-    ptsize += key_list.size();
-    for (int inpid = 0; inpid < key_list.size(); inpid++) {
-      partitioned_tot_key_list.push_back(key_list[inpid]);
-      partitioned_tot_valueptr_list.push_back(valueptr_list[inpid]);
-    }
-    for (int inpid = 0; inpid < version_list.size(); inpid++) {
-      partitioned_tot_version_list.push_back(version_list[inpid]);
-    }
-    for (int inpid = 0; inpid < freq_list.size(); inpid++) {
-      partitioned_tot_freq_list.push_back(freq_list[inpid]);
-    }
-    for (int inpid = 0; inpid < key_filter_list.size(); inpid++) {
-      partitioned_tot_key_filter_list.push_back(key_filter_list[inpid]);
-    }
-    for (int inpid = 0; inpid < version_filter_list.size(); inpid++) {
-      partitioned_tot_version_filter_list.push_back(version_filter_list[inpid]);
-    }
-    for (int inpid = 0; inpid < freq_filter_list.size(); inpid++) {
-      partitioned_tot_freq_filter_list.push_back(freq_filter_list[inpid]);
-    }
-
-    part_offset_flat(partid + 1) = part_offset_flat(partid) + key_list.size();
-    part_filter_offset[partid + 1] = part_filter_offset[partid] + key_filter_list.size();
-  }
-  // TODO: DB iterator not support partition_offset
-  if (it != nullptr) {
-    it->SetPartOffset((int32*)part_offset_tensor->data());
-  }
-  writer->Add(tensor_key + "-partition_offset", *part_offset_tensor);
-  for(int i = 0; i < kSavedPartitionNum + 1; i++) {
-    part_offset_flat(i) = part_filter_offset[i];
-  }
-  if (it != nullptr) {
-    it->SetPartFilterOffset((int32*)part_offset_tensor->data());
-  }
-  writer->Add(tensor_key + "-partition_filter_offset", *part_offset_tensor);
-
-  VLOG(1) << "EV before partition:" << tensor_key << ", keysize:" << tot_key_list.size()
-          << ", valueptr size:" << tot_valueptr_list.size();
-  VLOG(1) << "EV after partition:" << tensor_key << ", ptsize:" << ptsize
-          << ", keysize:"<< partitioned_tot_key_list.size()
-          <<", valueptr size:" << partitioned_tot_valueptr_list.size();
-
-  size_t bytes_limit = 8 << 20;
-  char* dump_buffer = (char*)malloc(sizeof(char) * bytes_limit);
-  Status st;
-  if (it != nullptr) {
-    it->SwitchToAdmitFeatures();
-  }
-  EVKeyDumpIterator<K> ev_key_dump_iter(partitioned_tot_key_list);
-  st = SaveTensorWithFixedBuffer(tensor_key + "-keys", writer, dump_buffer,
-      bytes_limit, &ev_key_dump_iter,
-      TensorShape({partitioned_tot_key_list.size() + iterator_size}),
-      it);
-  if (!st.ok()) {
-    free(dump_buffer);
-    return st;
-  }
-
-  EVValueDumpIterator<K, V> ev_value_dump_iter(ev, partitioned_tot_valueptr_list);
-  st = SaveTensorWithFixedBuffer(tensor_key + "-values", writer, dump_buffer,
-      bytes_limit, &ev_value_dump_iter,
-      TensorShape({partitioned_tot_key_list.size() + iterator_size, ev->ValueLen()}),
-      it, ev->storage()->GetOffset(ev->GetEmbeddingIndex()));
-  if (!st.ok()) {
-    free(dump_buffer);
-    return st;
-  }
-
-  EVVersionDumpIterator<int64> ev_version_dump_iter(partitioned_tot_version_list);
-  st = SaveTensorWithFixedBuffer(tensor_key + "-versions", writer, dump_buffer,
-      bytes_limit, &ev_version_dump_iter,
-      TensorShape({partitioned_tot_version_list.size() + iterator_size}),
-      it, -3);
-  if (!st.ok()) {
-    free(dump_buffer);
-    return st;
-  }
-
-  EVFreqDumpIterator<int64> ev_freq_dump_iter(partitioned_tot_freq_list);
-  st = SaveTensorWithFixedBuffer(tensor_key + "-freqs", writer, dump_buffer,
-      bytes_limit, &ev_freq_dump_iter,
-      TensorShape({partitioned_tot_freq_list.size() + iterator_size}),
-      it, -2);
-  if (!st.ok()) {
-    free(dump_buffer);
-    return st;
-  }
-  if (it != nullptr) {
-    it->SwitchToFilteredFeatures();
-  }
-  EVKeyDumpIterator<K> ev_key_filter_dump_iter(partitioned_tot_key_filter_list);
-  st = SaveTensorWithFixedBuffer(tensor_key + "-keys_filtered",
-      writer, dump_buffer, bytes_limit, &ev_key_filter_dump_iter,
-      TensorShape({partitioned_tot_key_filter_list.size()
-          + filter_iterator_size}), it);
-  if (!st.ok()) {
-    free(dump_buffer);
-    return st;
-  }
-
-  EVVersionDumpIterator<int64> ev_version_filter_dump_iter(
-      partitioned_tot_version_filter_list);
-  st = SaveTensorWithFixedBuffer(tensor_key + "-versions_filtered",
-      writer, dump_buffer, bytes_limit, &ev_version_filter_dump_iter,
-      TensorShape({partitioned_tot_version_filter_list.size()
-          + filter_iterator_size}), it, -3);
-  if (!st.ok()) {
-    free(dump_buffer);
-    return st;
-  }
-
-  EVFreqDumpIterator<int64> ev_freq_filter_dump_iter(
-      partitioned_tot_freq_filter_list);
-  st = SaveTensorWithFixedBuffer(tensor_key + "-freqs_filtered",
-      writer, dump_buffer, bytes_limit, &ev_freq_filter_dump_iter,
-      TensorShape({partitioned_tot_freq_filter_list.size()
-          + filter_iterator_size}), it, -2);
-  if (!st.ok()) {
-    free(dump_buffer);
-    return st;
-  }
-
-  free(dump_buffer);
-
-  if (it != nullptr) {
-    ev->storage()->iterator_mutex_unlock();
-    delete it;
-  }
-
-  if (ev->IsSingleHbm() && tot_valueptr_list.size() > 0) {
-    TypedAllocator::Deallocate(
-        cpu_allocator(), tot_valueptr_list[0],
-        tot_valueptr_list.size() * ev->ValueLen());
-  }
-  return Status::OK();
-}
-
 Status MoveMatchingFiles(
     Env* env,
     const tstring& pattern,
diff --git a/tensorflow/core/kernels/kv_variable_save_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc
similarity index 86%
rename from tensorflow/core/kernels/kv_variable_save_restore_ops.cc
rename to tensorflow/core/kernels/kv_variable_restore_ops.cc
index fa7e043ffd3..23a504eea5d 100644
--- a/tensorflow/core/kernels/kv_variable_save_restore_ops.cc
+++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc
@@ -508,86 +508,6 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU)
 #undef REGISTER_KERNELS_GPU
 #endif  // GOOGLE_CUDA
 
-#undef REGISTER_KERNELS_ALL
-#undef REGISTER_KERNELS
-
-// Op that outputs tensors of all keys and all values.
-template <typename TKey, typename TValue>
-class KvResourceExportOp : public OpKernel {
- public:
-  explicit KvResourceExportOp(OpKernelConstruction *ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext *ctx) override {
-    EmbeddingVar<TKey, TValue> *ev = nullptr;
-    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev));
-    core::ScopedUnref unref_me(ev);
-    std::vector<TKey> tot_key_list;
-    std::vector<TValue*> tot_valueptr_list;
-    std::vector<int64> tot_version_list;
-    std::vector<int64> tot_freq_list;
-    embedding::Iterator* it = nullptr;
-    int64 total_size = ev->GetSnapshot(
-        &tot_key_list, &tot_valueptr_list, &tot_version_list,
-        &tot_freq_list, &it);
-
-    // Create an output tensor
-    Tensor *keys_output_tensor = NULL;
-    Tensor *values_output_tensor = NULL;
-    Tensor *versions_output_tensor = NULL;
-    Tensor *freq_output_tensor = NULL;
-
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(
-        0, TensorShape({total_size}), &keys_output_tensor));
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(
-        1, TensorShape({total_size, ev->ValueLen()}),
-        &values_output_tensor));
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(
-        2, TensorShape({tot_version_list.size()}),
-        &versions_output_tensor));
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(
-        3, TensorShape({tot_freq_list.size()}),
-        &freq_output_tensor));
-
-    auto keys_output = keys_output_tensor->template flat<TKey>();
-    auto val_matrix = values_output_tensor->matrix<TValue>();
-    auto versions_output = versions_output_tensor->template flat<int64>();
-    auto freq_output = freq_output_tensor->template flat<int64>();
-
-    for(size_t i = 0; i < total_size; i++) {
-      keys_output(i) = tot_key_list[i];
-      TValue *value = tot_valueptr_list[i];
-      for(int64 m = 0; m < ev->ValueLen(); m++) {
-        val_matrix(i, m) = *(value + m);
-      }
-      if (tot_version_list.size() != 0) {
-        versions_output(i) = tot_version_list[i];
-      }
-      if (tot_freq_list.size() != 0) {
-        freq_output(i) = tot_freq_list[i];
-      }
-    }
-  }
-};
-
-#define REGISTER_KERNELS(dev, ktype, vtype)                      \
-  REGISTER_KERNEL_BUILDER(Name("KvResourceExport")               \
-                              .Device(DEVICE_##dev)              \
-                              .TypeConstraint<ktype>("Tkeys")    \
-                              .TypeConstraint<vtype>("Tvalues"), \
-                          KvResourceExportOp<ktype, vtype>);
-#define REGISTER_KERNELS_ALL(dev, type) \
-  REGISTER_KERNELS(dev, int32, type)    \
-  REGISTER_KERNELS(dev, int64, type)
-#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type)
-TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU)
-#undef REGISTER_KERNELS_CPU
-
-#if GOOGLE_CUDA
-#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type)
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU)
-#undef REGISTER_KERNELS_GPU
-#endif  // GOOGLE_CUDA
-
 #undef REGISTER_KERNELS_ALL
 #undef REGISTER_KERNELS
 } // namespace tensorflow
diff --git a/tensorflow/core/kernels/save_restore_tensor.h b/tensorflow/core/kernels/save_restore_tensor.h
index 55572eabfb2..4f69ebe3fb5 100644
--- a/tensorflow/core/kernels/save_restore_tensor.h
+++ b/tensorflow/core/kernels/save_restore_tensor.h
@@ -120,10 +120,6 @@ Status SaveTensorWithFixedBuffer(const string& tensor_name,
                                  size_t bytes_limit,
                                  DumpIterator<T>* dump_iter,
                                  const TensorShape& dump_tensor_shape,
-                                 embedding::Iterator* it = nullptr,
-                                 // -1: save key, x_offset: save embedding(primary or slot offset)
-                                 // -2: save frequency, -3: save version
-                                 int64 value_offset = -1,
                                  bool use_shape = true) {
   bool dump_happened = false;
   size_t bytes_written = 0;
@@ -149,55 +145,7 @@ Status SaveTensorWithFixedBuffer(const string& tensor_name,
     bytes_written += sizeof(T);
     total_bytes_written += sizeof(T);
   }
-  if (it != nullptr) {
-    int64 size = 0;
-    if (value_offset < 0) {
-      size = sizeof(T);
-    } else {
-      size = sizeof(T) * dump_tensor_shape.dim_size(1);
-    }
-    char val[size] = {0};
-    for (it->SeekToFirst(); it->Valid(); it->Next()) {
-      int64 dim = 0;
-      void* start = nullptr;
-      if (value_offset < 0) {
-        if (value_offset == -1){
-          it->Key(val, sizeof(T));
-        } else if (value_offset == -2) {
-          it->Freq(val, sizeof(T));
-        } else {
-          it->Version(val, sizeof(T));
-        }
-        if (bytes_written + sizeof(T) > bytes_limit) {
-          dump_happened = true;
-          writer->AppendSegmentData(dump_buffer, bytes_written);
-          bytes_written = 0;
-          buffer_idx = 0;
-        }
-        key_dump_buffer[buffer_idx] = *((T*)val);
-        buffer_idx++;
-        bytes_written += sizeof(T);
-        total_bytes_written += sizeof(T);
-
-      } else {
-        dim = dump_tensor_shape.dim_size(1);
-        it->Value(val, dim * sizeof(T), value_offset * sizeof(T));
-
-        for (int j = 0; j < dim; ++j) {
-          if (bytes_written + sizeof(T) > bytes_limit) {
-            dump_happened = true;
-            writer->AppendSegmentData(dump_buffer, bytes_written);
-            bytes_written = 0;
-            buffer_idx = 0;
-          }
-          key_dump_buffer[buffer_idx] = *((T*)val + j);
-          buffer_idx++;
-          bytes_written += sizeof(T);
-          total_bytes_written += sizeof(T);
-        }
-      }
-    }
-  }
+
   if (!dump_happened) {
     VLOG(1) << tensor_name << " only one buffer written, size:" << bytes_written;
     writer->AddCompeleteData(dump_buffer, bytes_written);
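The surviving core of `SaveTensorWithFixedBuffer` is the fixed-buffer streaming pattern: arbitrarily large key/value sequences are pushed through one 8 MB scratch buffer, and a segment is flushed to the bundle whenever the next element would overflow it. Here is that pattern in miniature as a standalone sketch; `ToyWriter` is an assumed stand-in for `BundleWriter`'s `AppendSegmentData`/`AddCompeleteData` pair.

```cpp
#include <cstring>
#include <iostream>
#include <vector>

// Stand-in for BundleWriter; counts flushed segments.
struct ToyWriter {
  size_t segments = 0;
  void AppendSegmentData(const char* /*buf*/, size_t n) {
    ++segments;
    std::cout << "flush segment of " << n << " bytes\n";
  }
};

int main() {
  const size_t bytes_limit = 16;  // tiny on purpose; the real code uses 8 MB
  std::vector<char> dump_buffer(bytes_limit);
  ToyWriter writer;

  size_t bytes_written = 0;
  for (int i = 0; i < 10; ++i) {  // 10 * sizeof(int) = 40 bytes total
    if (bytes_written + sizeof(int) > bytes_limit) {
      writer.AppendSegmentData(dump_buffer.data(), bytes_written);
      bytes_written = 0;  // reuse the same fixed buffer
    }
    std::memcpy(dump_buffer.data() + bytes_written, &i, sizeof(int));
    bytes_written += sizeof(int);
  }
  if (bytes_written > 0)
    writer.AppendSegmentData(dump_buffer.data(), bytes_written);  // final flush
  std::cout << "segments: " << writer.segments << "\n";           // prints 3
}
```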
diff --git a/tensorflow/core/kernels/save_restore_v2_ops.cc b/tensorflow/core/kernels/save_restore_v2_ops.cc
index 313f6b81825..ace7667864c 100644
--- a/tensorflow/core/kernels/save_restore_v2_ops.cc
+++ b/tensorflow/core/kernels/save_restore_v2_ops.cc
@@ -126,19 +126,14 @@ class SaveV2 : public OpKernel {
                    LookupResource(context, HandleFromInput(context, variable_index), &variable));
     const Tensor& global_step = context->input(3);
-    Tensor part_offset_tensor;
-    context->allocate_temp(DT_INT32,
-                           TensorShape({kSavedPartitionNum + 1}),
-                           &part_offset_tensor);
     TGlobalStep global_step_scalar = global_step.scalar<TGlobalStep>()();
     core::ScopedUnref s(variable);
     embedding::ShrinkArgs shrink_args;
     shrink_args.global_step = global_step_scalar;
-    OP_REQUIRES_OK(context, variable->Shrink(shrink_args));
     const Tensor& prefix = context->input(0);
     const string& prefix_string = prefix.scalar<tstring>()();
-    OP_REQUIRES_OK(context, DumpEmbeddingValues(variable, tensor_name,
-        &writer, &part_offset_tensor, prefix_string));
+    OP_REQUIRES_OK(context, variable->Save(tensor_name,
+        prefix_string, &writer, shrink_args));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -304,19 +299,14 @@ class SaveV3 : public OpKernel {
                EmbeddingVar<TKey, TValue>* variable,
                const string& tensor_name, BundleWriter& writer) {
     const Tensor& global_step = context->input(5);
-    Tensor part_offset_tensor;
-    context->allocate_temp(DT_INT32,
-                           TensorShape({kSavedPartitionNum + 1}),
-                           &part_offset_tensor);
     TGlobalStep global_step_scalar = global_step.scalar<TGlobalStep>()();
     core::ScopedUnref s(variable);
     embedding::ShrinkArgs shrink_args;
     shrink_args.global_step = global_step_scalar;
-    OP_REQUIRES_OK(context, variable->Shrink(shrink_args));
     const Tensor& prefix = context->input(0);
     const string& prefix_string = prefix.scalar<tstring>()();
-    OP_REQUIRES_OK(context, DumpEmbeddingValues(variable, tensor_name,
-        &writer, &part_offset_tensor, prefix_string));
+    OP_REQUIRES_OK(context, variable->Save(tensor_name,
+        prefix_string, &writer, shrink_args));
   }
 
   void Compute(OpKernelContext* context) override {
diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py
index fae4ef91380..d3e453df9d1 100644
--- a/tensorflow/python/ops/embedding_variable_ops_test.py
+++ b/tensorflow/python/ops/embedding_variable_ops_test.py
@@ -453,39 +453,6 @@ def testEmbeddingVariableForLookupInt32(self):
       save_path = saver.save(sess, model_path, global_step=12345)
       saver.restore(sess, save_path)
 
-  def testEmbeddingVariableForExport(self):
-    print("testEmbeddingVariableForExport")
-    with ops.device('/cpu:0'):
-      ev_config = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=1))
-      var = variable_scope.get_embedding_variable("var_1", embedding_dim=3,
-          initializer=init_ops.ones_initializer(dtypes.float32), steps_to_live=10000, ev_option=ev_config)
-    emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64))
-    fun = math_ops.multiply(emb, 0.0, name='multiply')
-    loss = math_ops.reduce_sum(fun, name='reduce_sum')
-    opt = adam.AdamOptimizer(0.01)
-    g_v = opt.compute_gradients(loss)
-    gs = training_util.get_or_create_global_step()
-    train_op = opt.apply_gradients(g_v, gs)
-    init = variables.global_variables_initializer()
-    keys, values, versions, freqs = var.export()
-    with self.test_session() as sess:
-      sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS))
-      sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS))
-      sess.run([init])
-      sess.run(train_op)
-      sess.run(train_op)
-      fetches = sess.run([keys, values, versions, freqs])
-      print(fetches)
-      self.assertAllEqual([0, 1, 2, 5, 6, 7], fetches[0])
-      self.assertAllEqual([[1., 1., 1.],
-                           [1., 1., 1.],
-                           [1., 1., 1.],
-                           [1., 1., 1.],
-                           [1., 1., 1.],
-                           [1., 1., 1.]], fetches[1])
-      self.assertAllEqual([1, 1, 1, 1, 1, 1], fetches[2])
-      self.assertAllEqual([1, 1, 1, 1, 1, 1], fetches[3])
-
   def testEmbeddingVariableForGetShape(self):
     print("testEmbeddingVariableForGetShape")
     with ops.device("/cpu:0"):
@@ -614,6 +581,35 @@ def testCategoricalColumnWithEmbeddingVariableFunction(self):
       for i in range(ids[col_name].shape.as_list()[0]):
         self.assertAllEqual(val_list[0][i], val_list[1][i])
 
+  def testEmbeddingVariableForPartitionOffset(self):
+    print("testEmbeddingVariableForPartitionOffset")
+    checkpoint_directory = self.get_temp_dir()
+    with ops.device("/cpu:0"):
+      var = variable_scope.get_embedding_variable("var_1", embedding_dim = 3)
+    emb = embedding_ops.embedding_lookup(var, math_ops.cast([0, 1, 1000, 1001, 2, 1002], dtypes.int64))
+    fun = math_ops.multiply(emb, 2.0, name='multiply')
+    loss = math_ops.reduce_sum(fun, name='reduce_sum')
+    opt = adagrad.AdagradOptimizer(0.1)
+    g_v = opt.compute_gradients(loss)
+    train_op = opt.apply_gradients(g_v)
+    saver = saver_module.Saver(sharded=True)
+    init = variables.global_variables_initializer()
+    with self.test_session() as sess:
+      sess.run([init])
+      sess.run(train_op)
+      model_path = os.path.join(checkpoint_directory, "model.ckpt")
+      saver.save(sess, model_path)
+
+    for name, shape in checkpoint_utils.list_variables(model_path):
+      if "partition_offset" in name:
+        self.assertEqual(shape[0], 1001)
+        part_offset = checkpoint_utils.load_variable(model_path, name)
+        self.assertEqual(part_offset[0], 0)
+        self.assertEqual(part_offset[1], 2)
+        self.assertEqual(part_offset[2], 4)
+        for i in range(3, len(part_offset)):
+          self.assertEqual(part_offset[i], 6)
+
   def testEmbeddingVariableForL2FeatureEvictionFromContribFeatureColumn(self):
     print("testEmbeddingVariableForL2FeatureEvictionFromContribFeatureColumn")
     checkpoint_directory = self.get_temp_dir()
@@ -1829,6 +1825,76 @@ def runTestAdagrad(self, var, g):
 
       del os.environ["TF_SSDHASH_ASYNC_COMPACTION"]
 
+  def testEmbeddingVariableForDramAndLevelDBSaveCkpt(self):
+    print("testEmbeddingVariableForDramAndLevelDBSaveCkpt")
+    checkpoint_directory = self.get_temp_dir()
+    def runTestAdagrad(self, var, g):
+      ids = array_ops.placeholder(dtypes.int64, name="ids")
+      emb = embedding_ops.embedding_lookup(var, ids)
+      fun = math_ops.multiply(emb, 2.0, name='multiply')
+      loss = math_ops.reduce_sum(fun, name='reduce_sum')
+      gs = training_util.get_or_create_global_step()
+      opt = adagrad.AdagradOptimizer(0.1)
+      g_v = opt.compute_gradients(loss)
+      train_op = opt.apply_gradients(g_v, global_step=gs)
+      saver = saver_module.Saver()
+      init = variables.global_variables_initializer()
+      model_path = os.path.join(checkpoint_directory,
+                                "model1.ckpt")
+      tiers = kv_variable_ops.lookup_tier(emb_var,
+          math_ops.cast([0,1,2,3,4,5,6,7,8,9,10,11], dtypes.int64))
+      with self.test_session(graph=g) as sess:
+        sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS))
+        sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS))
+        sess.run([init])
+        sess.run([train_op], {ids:[0,1,2,3,4,5]})
+        sess.run([train_op], {ids:[6,7,8,9,10,11]})
+        sess.run([train_op], {ids:[0,1,2,3,4,5]})
+        result = sess.run(tiers)
+        for i in range(0, 12):
+          if i in range(0, 6):
+            self.assertEqual(result[i], 0)
+          else:
+            self.assertEqual(result[i], 1)
+        saver.save(sess, model_path)
+      for name, shape in checkpoint_utils.list_variables(model_path):
+        if name == "var_1-keys" or name == "var_1/Adagrad-keys":
+          self.assertEqual(shape[0], 12)
+          keys = checkpoint_utils.load_variable(model_path, name)
+          self.assertAllEqual(np.array([0,1,2,3,4,5,6,7,8,9,10,11]), keys)
+        if name == "var_1-freqs" or name == "var_1/Adagrad-freqs":
+          freqs = checkpoint_utils.load_variable(model_path, name)
+          self.assertAllEqual(np.array([2,2,2,2,2,2,1,1,1,1,1,1]), freqs)
+        if name == "var_1/Adagrad-values":
+          values = checkpoint_utils.load_variable(model_path, name)
+          for i in range(0, shape[0]):
+            for j in range(0, shape[1]):
+              if i < 6:
+                self.assertAlmostEqual(values[i][j], 8.1, delta=1e-05)
+              else:
+                self.assertAlmostEqual(values[i][j], 4.1, delta=1e-05)
+        if name == "var_1-values":
+          values = checkpoint_utils.load_variable(model_path, name)
+          for i in range(0, shape[0]):
+            for j in range(0, shape[1]):
+              if i < 6:
+                self.assertAlmostEqual(values[i][j], 0.8309542, delta=1e-05)
+              else:
+                self.assertAlmostEqual(values[i][j], 0.90122706, delta=1e-05)
+
+    with ops.Graph().as_default() as g, ops.device('/cpu:0'):
+      storage_option = variables.StorageOption(
+          storage_type=config_pb2.StorageType.DRAM_LEVELDB,
+          storage_path = checkpoint_directory,
+          storage_size=[1024 * 6])
+      ev_option = variables.EmbeddingVariableOption(
+          storage_option=storage_option)
+      emb_var = variable_scope.get_embedding_variable("var_1",
+          embedding_dim = 128,
+          initializer=init_ops.ones_initializer(dtypes.float32),
+          ev_option = ev_option)
+      runTestAdagrad(self, emb_var, g)
+
   @test_util.run_gpu_only
   def testEmbeddingVariableForHBMandDRAMSaveCkpt(self):
     print("testEmbeddingVariableForHBMandDRAMSaveCkpt")
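The constants asserted in the DRAM+LevelDB test above fall straight out of the Adagrad update rule: with `loss = reduce_sum(2 * emb)` the per-element gradient is 2.0, and `AdagradOptimizer(0.1)` uses TF's default initial accumulator value of 0.1. A standalone sketch that reproduces 4.1/8.1 (the slot values after one and two updates) and 0.90122706/0.8309542 (the weights):

```cpp
#include <cmath>
#include <iostream>

int main() {
  const float lr = 0.1f, grad = 2.0f;  // AdagradOptimizer(0.1); d/demb of 2*emb
  float accum = 0.1f;                  // TF's default initial_accumulator_value
  float w = 1.0f;                      // ones_initializer

  for (int step = 0; step < 2; ++step) {
    accum += grad * grad;               // 4.1 after one step, 8.1 after two
    w -= lr * grad / std::sqrt(accum);  // 0.90122706, then 0.8309542
    std::cout << "accum=" << accum << " w=" << w << "\n";
  }
}
```

Ids 0-5 are fed twice and 6-11 once, which is why the test expects the two-step constants for the first six rows and the one-step constants for the rest.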
From 8b02c47ab1c12e4924a486ececf5681312416e14 Mon Sep 17 00:00:00 2001
From: Chen Ding
Date: Tue, 8 Aug 2023 13:56:31 +0800
Subject: [PATCH 46/91] [Release] Update DeepRec release version to
 1.15.5+deeprec2306. (#921)

Signed-off-by: candy.dc
---
 tensorflow/tools/pip_package/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index be63cf7d2fd..d5fa79bf2b1 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -47,7 +47,7 @@
 # result for pip.
 # Also update tensorflow/tensorflow.bzl and
 # tensorflow/core/public/version.h
-_VERSION = '1.15.5+deeprec2304'
+_VERSION = '1.15.5+deeprec2306'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.9.0',

From 419162720dcfc543a84873b24772a262bc1de6b3 Mon Sep 17 00:00:00 2001
From: Chen Ding
Date: Tue, 8 Aug 2023 20:15:06 +0800
Subject: [PATCH 47/91] [Docs] Update deeprec2306 release images and notes in
 README.md & RELEASE.md. (#922)

Signed-off-by: candy.dc
---
 README.md                                     |  4 +-
 RELEASE.md                                    | 84 +++++++++++++++++++
 docs/docs_en/DeepRec-Compile-And-Install.md   |  4 +-
 docs/docs_en/Estimator-Compile-And-Install.md |  2 +-
 docs/docs_en/TFServing-Compile-And-Install.md |  2 +-
 docs/docs_zh/DeepRec-Compile-And-Install.md   |  4 +-
 docs/docs_zh/Estimator-Compile-And-Install.md |  2 +-
 docs/docs_zh/TFServing-Compile-And-Install.md |  2 +-
 8 files changed, 94 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 927afe31480..53cca5c5c83 100644
--- a/README.md
+++ b/README.md
@@ -95,13 +95,13 @@ $ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux
 #### Image for CPU
 
 ```
-alideeprec/deeprec-release:deeprec2304-cpu-py38-ubuntu20.04
+alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04
 ```
 
 #### Image for GPU CUDA11.6
 
 ```
-alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04
+alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04
 ```
 
 ***
diff --git a/RELEASE.md b/RELEASE.md
index d41d9e569ad..43e03bc2b49 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,87 @@
+# Release r1.15.5-deeprec2306
+
+## **Major Features and Improvements**
+
+### **Embedding**
+
+- Support StaticGPUHashMap to optimize EmbeddingVariable in inference.
+- Update logic of GroupEmbedding in feature_column API.
+- Refine APIs for forward-backward optimization.
+- Move insertions of new features into the backward process when using multi-tier storage.
+- Move insertion of new features into the backward ops.
+- Modify calculation logic of embedding lookup sparse combiner.
+- Add memory and performance tests of EmbeddingVariable.
+
+### **Graph & Grappler Optimization**
+
+- Support IteratorGetNext for SmartStage as a starting node for searching.
+- Reimplement PrefetchRunner in C++.
+
+### **Runtime Optimization**
+
+- Dispatch expensive ops via multiple threads in threadpool.
+- Enable multi-stream in session_group by default.
+- Support loading saved_model with device information when using session_group and multi_stream.
+- Make ARENA_ARRAY_SIZE configurable.
+- Optimize EV allocator performance.
+- Integrate HybridBackend in collective training mode.
+
+### **Ops & Hardware Acceleration**
+
+- Disable MatMul fused with LeakyRelu when MKL is disabled.
+
+### **Serving**
+
+- Clear virtual_device configurations before loading a new checkpoint.
+
+### **Environment & Build**
+
+- Update docker images in user documents.
+- Update DEFAULT_CUDA_VERSION and DEFAULT_CUDNN_VERSION in configure.py.
+- Move thirdparties from WORKSPACE to workspace.bzl.
+- Update urls corresponding to colm, ragel, aliyun-oss-sdk and uuid.
+- Update default TF_CUDA_COMPUTE_CAPABILITIES to 7.0,7.5,8.0,8.6.
+- Update SparseOperationKit to v23.5.01 and the docker file.
+
+### **BugFix**
+
+- Fix issue of missing params while constructing the ngScope.
+- Fix memory leak to avoid OOM.
+- Fix shape validation in API shared_embedding_columns.
+- Fix the device placement bug of stage_subgraph_on_cpu in distributed mode.
+- Fix hang issue when using both SOK and SmartStage simultaneously.
+- Fix bug: init global_step before saving variables.
+- Fix bug: reserve input nodes, clear saver devices on demand.
+- Fix memory leak when a graph node is invalid.
+
+### **ModelZoo**
+
+- Add examples and docs to demonstrate Collective Training.
+- Update documents and config files for modelzoo benchmark.
+- Update modelzoo README.
+
+### **Tool & Documents**
+
+- Update cases of configuring TF_CUDA_COMPUTE_CAPABILITIES for H100.
+- Update COMMITTERS.md.
+- Update device placement documents.
+- Update document for SmartStage.
+- Update session_group documents.
+- Update the download link of the library that Processor depends on.
+- Update sok to 1.20.
+
+More details of features: https://deeprec.readthedocs.io/zh/latest/
+
+## **Release Images**
+
+### **CPU Image**
+
+`alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04`
+
+### **GPU Image**
+
+`alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04`
+
 # Release r1.15.5-deeprec2304
 
 ## **Major Features and Improvements**
diff --git a/docs/docs_en/DeepRec-Compile-And-Install.md b/docs/docs_en/DeepRec-Compile-And-Install.md
index 0a170177353..83ba4854b9f 100644
--- a/docs/docs_en/DeepRec-Compile-And-Install.md
+++ b/docs/docs_en/DeepRec-Compile-And-Install.md
@@ -111,7 +111,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x
 x86_64:
 
 ```
-alideeprec/deeprec-release:deeprec2304-cpu-py38-ubuntu20.04
+alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04
 ```
 
 arm64:
@@ -122,5 +122,5 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64
 **GPU Image with CUDA 11.6**
 
 ```
-alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04
+alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04
 ```
diff --git a/docs/docs_en/Estimator-Compile-And-Install.md b/docs/docs_en/Estimator-Compile-And-Install.md
index cdc04044875..73b6a36f318 100644
--- a/docs/docs_en/Estimator-Compile-And-Install.md
+++ b/docs/docs_en/Estimator-Compile-And-Install.md
@@ -40,7 +40,7 @@ DeepRec provide new distributed protocols such as grpc++ and star_server, which
 
 Source Code: [https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator)
 
-Develop Branch:master, Latest Release Branch: deeprec2304
+Develop Branch:master, Latest Release Branch: deeprec2306
 
 ## Estimator Build
 
diff --git a/docs/docs_en/TFServing-Compile-And-Install.md b/docs/docs_en/TFServing-Compile-And-Install.md
index 8ced3628673..346a848ca74 100644
--- a/docs/docs_en/TFServing-Compile-And-Install.md
+++ b/docs/docs_en/TFServing-Compile-And-Install.md
@@ -39,7 +39,7 @@ We provide optimized TFServing which could highly improve performance in inferen
 
 Source Code: [https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving)
 
-Develop Branch: master, Latest Release Branch: deeprec2304
+Develop Branch: master, Latest Release Branch: deeprec2306
 
 ## TFServing Build
 
diff --git a/docs/docs_zh/DeepRec-Compile-And-Install.md b/docs/docs_zh/DeepRec-Compile-And-Install.md
index 20df07aa252..08d249f8eeb 100644
--- a/docs/docs_zh/DeepRec-Compile-And-Install.md
+++ b/docs/docs_zh/DeepRec-Compile-And-Install.md
 x86_64:

 ```
-alideeprec/deeprec-release:deeprec2304-cpu-py38-ubuntu20.04
+alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04
 ```

 arm64:
@@ -119,7 +119,7 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64

 **GPU CUDA11.6镜像**

 ```
-alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04
+alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04
 ```

 ## DeepRec Processor编译打包
diff --git a/docs/docs_zh/Estimator-Compile-And-Install.md b/docs/docs_zh/Estimator-Compile-And-Install.md
index 332b96e6086..e5455aae91a 100644
--- a/docs/docs_zh/Estimator-Compile-And-Install.md
+++ b/docs/docs_zh/Estimator-Compile-And-Install.md
@@ -40,7 +40,7 @@

 代码库:[https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator)

-开发分支:master,最新Release分支:deeprec2304
+开发分支:master,最新Release分支:deeprec2306

 ## Estimator编译
diff --git a/docs/docs_zh/TFServing-Compile-And-Install.md b/docs/docs_zh/TFServing-Compile-And-Install.md
index 27bfc864e4e..0c76400e6c6 100644
--- a/docs/docs_zh/TFServing-Compile-And-Install.md
+++ b/docs/docs_zh/TFServing-Compile-And-Install.md
@@ -39,7 +39,7 @@

 代码库:[https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving)

-开发分支:master,最新Release分支:deeprec2304
+开发分支:master,最新Release分支:deeprec2306

 ## TFServing编译&打包

From 4983e027e2eae258a82b34ee19b8ae2cb59e6c56 Mon Sep 17 00:00:00 2001
From: shijieliu
Date: Wed, 9 Aug 2023 11:26:59 +0800
Subject: [PATCH 48/91] [Distributed] Fix wgrad bug in Sparse Operation Kit.
 (#918)

Use new_git_repository to manage the sok dependency, and update sok to
fix the localized mode wgrad bug.

Signed-off-by: aleliu
---
 tensorflow/tools/pip_package/build_sok.sh |  3 +--
 tensorflow/workspace.bzl                  | 11 +++++------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/tensorflow/tools/pip_package/build_sok.sh b/tensorflow/tools/pip_package/build_sok.sh
index 2c99ceb5ac1..3860f5fdcff 100755
--- a/tensorflow/tools/pip_package/build_sok.sh
+++ b/tensorflow/tools/pip_package/build_sok.sh
@@ -16,5 +16,4 @@
 export MAKEFLAGS=-j$(nproc)
 export SOK_COMPILE_GPU_SM="70;75;80"
 cd ./bazel-DeepRec/external/hugectr/sparse_operation_kit
-"${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel
-pip install ./dist/merlin_sok-1.2.0-cp38-cp38-linux_x86_64.whl
+"${PYTHON_BIN_PATH:-python}" setup.py install
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 3495efd182d..540f733b2ea 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -1369,13 +1369,12 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""):
         ],
     )

-    http_archive(
+    new_git_repository(
         name = "hugectr", # Apache License 2.0
-        build_file = "//third_party:hugectr.BUILD",
-        strip_prefix = "HugeCTR-23.06.00",
-        urls = [
-            "https://github.com/NVIDIA-Merlin/HugeCTR/archive/refs/tags/v23.06.00.tar.gz",
-        ],
+        build_file = "//third_party:hugectr.BUILD",
+        commit = "869028c1c32bdcda2f18efc88d54f0527ed28d6d",
+        init_submodules = True,
+        remote = "https://github.com/NVIDIA-Merlin/HugeCTR.git",
     )

 def tf_bind():

From f09e5ec0c1a2424727f8ffc5eaf98b771c4b374e Mon Sep 17 00:00:00 2001
From: lixy9474
Date: Fri, 11 Aug 2023 14:02:44 +0800
Subject: [PATCH 49/91] [Embedding] Add GetSnapshot and Create API for
 EmbeddingVariable.
(#923) Signed-off-by: lixy9474 --- .../core/framework/embedding/embedding_var.h | 35 ++++++++++++ .../framework/embedding/eviction_manager.h | 5 +- .../kernels/embedding_variable_ops_test.cc | 54 +++++++++++++++++-- 3 files changed, 88 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 9a5b5cf9a19..b29493f2169 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -186,6 +186,13 @@ class EmbeddingVar : public ResourceBase { } } + Status Insert(K key, V* value) { + ValuePtr* value_ptr = nullptr; + CreateKey(key, &value_ptr, true); + LookupOrCreateEmb(value_ptr, value); + return Status::OK(); + } + Status LookupOrCreateKey(K key, ValuePtr** value_ptr) { Status s = storage_->GetOrCreate(key, value_ptr, emb_config_.total_num(storage_->GetAllocLen())); @@ -592,6 +599,34 @@ class EmbeddingVar : public ResourceBase { default_value_); } + void GetSnapshot(std::vector* key_list, + std::vector* value_list, + std::vector* version_list, + std::vector* freq_list) { + std::vector*> value_ptr_list; + storage_->GetSnapshot(key_list, &value_ptr_list); + bool is_save_freq = emb_config_.is_save_freq(); + bool is_save_version = emb_config_.is_save_version(); + for (int64 i = 0; i < key_list->size(); i++) { + V* val = value_ptr_list[i]->GetValue(emb_config_.emb_index, 0); + if (val != nullptr) { + value_list->emplace_back(val); + } else { + value_list->emplace_back(default_value_); + } + + if(is_save_version) { + int64 dump_version = value_ptr_list[i]->GetStep(); + version_list->emplace_back(dump_version); + } + + if(is_save_freq) { + int64 dump_freq = value_ptr_list[i]->GetFreq(); + freq_list->emplace_back(dump_freq); + } + } + } + mutex* mu() { return &mu_; } diff --git a/tensorflow/core/framework/embedding/eviction_manager.h b/tensorflow/core/framework/embedding/eviction_manager.h index b5a78765170..ca646c9b420 100644 --- a/tensorflow/core/framework/embedding/eviction_manager.h +++ b/tensorflow/core/framework/embedding/eviction_manager.h @@ -47,8 +47,7 @@ class EvictionManager { "EVICTION_MANAGER", 3, /*low_latency_hint=*/false)); } - ~EvictionManager() { - } + ~EvictionManager() {} TF_DISALLOW_COPY_AND_ASSIGN(EvictionManager); @@ -124,8 +123,8 @@ class EvictionManager { int64 num_of_threads_; int64 num_of_active_threads_; std::atomic_flag flag_ = ATOMIC_FLAG_INIT; - std::unique_ptr thread_pool_; std::map*, StorageItem*> storage_table_; + std::unique_ptr thread_pool_; mutex mu_; }; diff --git a/tensorflow/core/kernels/embedding_variable_ops_test.cc b/tensorflow/core/kernels/embedding_variable_ops_test.cc index eff4b77c2dc..4839c171708 100644 --- a/tensorflow/core/kernels/embedding_variable_ops_test.cc +++ b/tensorflow/core/kernels/embedding_variable_ops_test.cc @@ -1191,6 +1191,7 @@ TEST(EmbeddingVariableTest, TestLFUCache) { } TEST(EmbeddingVariableTest, TestCacheRestore) { + setenv("TF_SSDHASH_ASYNC_COMPACTION", "false", 1); int64 value_size = 4; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); @@ -1237,8 +1238,11 @@ TEST(EmbeddingVariableTest, TestCacheRestore) { LOG(INFO) << "size:" << variable->Size(); BundleWriter writer(Env::Default(), Prefix("foo")); - DumpEmbeddingValues(variable, "var/part_0", &writer, &part_offset_tensor); - TF_ASSERT_OK(writer.Finish()); + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = 1; + variable->Save("var/part_0", Prefix("foo"), 
&writer, shrink_args); + TF_ASSERT_OK(writer.Finish()); + variable->Unref(); auto imported_storage= embedding::StorageFactory::Create( embedding::StorageConfig(embedding::DRAM_SSDHASH, @@ -1258,6 +1262,7 @@ TEST(EmbeddingVariableTest, TestCacheRestore) { ASSERT_EQ(imported_storage->Size(0), ev_size - cache_size); ASSERT_EQ(imported_storage->Size(1), 2); + delete imported_storage; } void t1_gpu(KVInterface* hashmap) { @@ -1703,7 +1708,50 @@ TEST(EmbeddingVariableTest, TestLookupRemoveConcurrency) { for (auto &t : insert_threads) { t.join(); } - } +} + +TEST(EmbeddingVariableTest, TestInsertAndGetSnapshot) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + auto emb_config = EmbeddingConfig( + /*emb_index = */0, /*primary_emb_index = */0, + /*block_num = */1, /*slot_num = */0, + /*name = */"", /*steps_to_live = */0, + /*filter_freq = */0, /*max_freq = */999999, + /*l2_weight_threshold = */-1.0, /*layout = */"normal", + /*max_element_size = */0, /*false_positive_probability = */-1.0, + /*counter_type = */DT_UINT64); + auto storage = embedding::StorageFactory::Create( + embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); + auto var = new EmbeddingVar("EmbeddingVar", + storage, + emb_config, + cpu_allocator()); + var->Init(value, 1); + float* set_value = (float*)malloc(value_size * sizeof(float)); + //Insertion + for (int i = 0; i < 100; i++) { + for (int j = 0; j < value_size; j++) { + set_value[j] = i + j; + } + var->Insert(i, set_value); + } + free(set_value); + //GetSnapshot + std::vector key_list; + std::vector value_ptr_list; + std::vector version_list; + std::vector freq_list; + var->GetSnapshot(&key_list, &value_ptr_list, + &version_list, &freq_list); + for (int i = 0; i < key_list.size(); i++) { + ASSERT_EQ(key_list[i], i); + for (int j = 0; j < value_size; j++) { + ASSERT_EQ(value_ptr_list[i][j], i + j); + } + } +} } // namespace } // namespace embedding From 8d8e16aae66add22cf8a4812d549c83f3569ef13 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Fri, 11 Aug 2023 18:00:40 +0800 Subject: [PATCH 50/91] [Embedding] Fix set initialized flag too early in restore subgraph. 
(#920)

Signed-off-by: lixy9474
---
 .../core/framework/embedding/config.proto     |  4 ++
 .../framework/embedding/multi_tier_storage.h  | 10 +--
 tensorflow/core/framework/variable.proto      |  2 +
 tensorflow/core/kernels/kv_variable_ops.cc    | 28 ++++----
 .../python/ops/embedding_variable_ops_test.py | 65 +++++++++++++++++++
 tensorflow/python/ops/kv_variable_ops.py      | 52 +++++++++++++++
 tensorflow/python/training/optimizer.py       |  3 +-
 .../training/saving/saveable_object_util.py   |  2 +-
 tensorflow/python/training/slot_creator.py    | 18 +++--
 9 files changed, 158 insertions(+), 26 deletions(-)

diff --git a/tensorflow/core/framework/embedding/config.proto b/tensorflow/core/framework/embedding/config.proto
index 3d5fae9f6ad..a8535347020 100644
--- a/tensorflow/core/framework/embedding/config.proto
+++ b/tensorflow/core/framework/embedding/config.proto
@@ -56,3 +56,7 @@ enum ValuePosition {
   IN_DRAM = 0;
   NOT_IN_DRAM = 1;
 }
+
+enum IsSetInitialized {
+  NOT_SET_INITAILIZED = 0;
+}
diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h
index ff18425ad9a..8239d109e64 100644
--- a/tensorflow/core/framework/embedding/multi_tier_storage.h
+++ b/tensorflow/core/framework/embedding/multi_tier_storage.h
@@ -81,10 +81,12 @@ class MultiTierStorage : public Storage<K, V> {
   }

   void InitCache(embedding::CacheStrategy cache_strategy) override {
-    cache_ = CacheFactory::Create(cache_strategy, name_);
-    eviction_manager_ = EvictionManagerCreator::Create();
-    eviction_manager_->AddStorage(this);
-    cache_thread_pool_ = CacheThreadPoolCreator::Create();
+    if (cache_ == nullptr) {
+      cache_ = CacheFactory::Create(cache_strategy, name_);
+      eviction_manager_ = EvictionManagerCreator::Create();
+      eviction_manager_->AddStorage(this);
+      cache_thread_pool_ = CacheThreadPoolCreator::Create();
+    }
   }

   Status BatchCommit(const std::vector<K>& keys,
diff --git a/tensorflow/core/framework/variable.proto b/tensorflow/core/framework/variable.proto
index 79ccd107628..5f9e0f16b5d 100644
--- a/tensorflow/core/framework/variable.proto
+++ b/tensorflow/core/framework/variable.proto
@@ -74,6 +74,8 @@ message VariableDef {

   // EmebddingVariable
   bool is_embedding_var = 91;
+
+  string initialize_op_for_restore = 92;
 }

 message SaveSliceInfoDef {
diff --git a/tensorflow/core/kernels/kv_variable_ops.cc b/tensorflow/core/kernels/kv_variable_ops.cc
index 20ea6d3cb61..8a01a7bf2cd 100644
--- a/tensorflow/core/kernels/kv_variable_ops.cc
+++ b/tensorflow/core/kernels/kv_variable_ops.cc
@@ -43,11 +43,6 @@ limitations under the License.

 namespace tensorflow {

-namespace {
-const int64 kEmbeddingVarUseDB = -214;
-const int64 kInitializableEmbeddingVarUseDB = -215;
-}
-
 Status MoveMatchingFiles(
     Env* env,
     const tstring& pattern,
@@ -207,6 +202,15 @@ class InitializeKvVariableOp : public OpKernel {
         (embedding_var_type ==
             embedding::EmbeddingVariableType::IMMUTABLE);

+    // initial_num_buckets is otherwise unused by this op, so it is
+    // repurposed to set is_set_initialized_.
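+    // With IsSetInitialized::NOT_SET_INITAILIZED, the kernel leaves
+    // is_set_initialized_ false and Compute() skips ev->SetInitialized(),
+    // so the init op built for the restore path can run without marking the
+    // variable initialized before its values have actually been restored.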
+    int64 initial_num_buckets = 0;
+    OP_REQUIRES_OK(c, c->GetAttr("initial_num_buckets", &initial_num_buckets));
+    is_set_initialized_ = true;
+    if (initial_num_buckets ==
+        embedding::IsSetInitialized::NOT_SET_INITAILIZED) {
+      is_set_initialized_ = false;
+    }
+
     int64 storage_type = 0;
     OP_REQUIRES_OK(c, c->GetAttr("storage_type", &storage_type));
     storage_type_ = static_cast<embedding::StorageType>(storage_type);
@@ -263,15 +267,10 @@ class InitializeKvVariableOp : public OpKernel {
           " should be DRAM when layout is 'compact'."));
     }

-    if (steps_to_live_ == kEmbeddingVarUseDB ||
-        steps_to_live_ == kInitializableEmbeddingVarUseDB) {
-      LOG(INFO) << "hashmap use db";
-      //use_db_ = true;
-    } else {
-      OP_REQUIRES(c, steps_to_live_ >= 0,
-          errors::InvalidArgument(
+    OP_REQUIRES(c, steps_to_live_ >= 0,
+        errors::InvalidArgument(
             "steps_to_live must >= 0, ", std::to_string(steps_to_live_)));
-    }
+
     OP_REQUIRES_OK(c, c->GetAttr("ht_type", &ht_type_));
     if (embedding::StorageType::LEVELDB == storage_type_) {
       ht_type_ = "leveldb_kv";
@@ -406,7 +405,7 @@ class InitializeKvVariableOp : public OpKernel {
       core::ScopedUnref unref_me(primary_variable);
     }
     core::ScopedUnref unref_me(ev);
-    if (steps_to_live_ != kEmbeddingVarUseDB) {
+    if (is_set_initialized_) {
       ev->SetInitialized();
     }
   }
@@ -436,6 +435,7 @@ class InitializeKvVariableOp : public OpKernel {
   bool record_freq_;
   bool record_version_;
   bool is_inference_;
+  bool is_set_initialized_;
 };

 #define REGISTER_KERNELS(ktype, vtype)                         \
diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py
index d3e453df9d1..25a0cb6ff11 100644
--- a/tensorflow/python/ops/embedding_variable_ops_test.py
+++ b/tensorflow/python/ops/embedding_variable_ops_test.py
@@ -2751,5 +2751,70 @@ def testCPUFbjOptWithBloomFilter(self):
       self.assertNotEqual(val, 1.0)
     del os.environ["TF_EMBEDDING_FBJ_OPT"]

+  def testSetInitializedWithoutRestore(self):
+    print("testSetInitializedWithoutRestore")
+    with ops.device("/cpu:0"):
+      var = variable_scope.get_embedding_variable("var_1",
+                                                  embedding_dim = 3)
+      emb = embedding_ops.embedding_lookup(var, math_ops.cast([1], dtypes.int64))
+      fun = math_ops.multiply(emb, 2.0, name='multiply')
+      loss = math_ops.reduce_sum(fun, name='reduce_sum')
+      gs = training_util.get_or_create_global_step()
+      opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs)
+      g_v = opt.compute_gradients(loss)
+      train_op = opt.apply_gradients(g_v)
+      init = variables.global_variables_initializer()
+      saver = saver_module.Saver()
+      with self.test_session() as sess:
+        result = sess.run(var._is_initialized_op)
+        self.assertEqual(False, result)
+        sess.run([init])
+        result = sess.run(var._is_initialized_op)
+        self.assertEqual(True, result)
+
+  def testSetInitializedWithRestore(self):
+    print("testSetInitializedWithRestore")
+    checkpoint_directory = self.get_temp_dir()
+    ckpt_path = os.path.join(checkpoint_directory, "model.ckpt")
+    with ops.Graph().as_default() as g, ops.device('/cpu:0'):
+      var = variable_scope.get_embedding_variable("var_1",
+                                                  embedding_dim = 3)
+      emb = embedding_ops.embedding_lookup(var, math_ops.cast([1, 2, 3], dtypes.int64))
+      fun = math_ops.multiply(emb, 2.0, name='multiply')
+      loss = math_ops.reduce_sum(fun, name='reduce_sum')
+      gs = training_util.get_or_create_global_step()
+      opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs)
+      g_v = opt.compute_gradients(loss)
+      train_op = opt.apply_gradients(g_v)
+      saver = saver_module.Saver()
+      init = variables.global_variables_initializer()
+      with self.test_session(graph=g) as sess:
+        sess.run([init])
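+        # Run one training step and write a checkpoint; the second graph
+        # built below restores it to verify the initialized flag.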
sess.run(train_op) + saver.save(sess, ckpt_path) + + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1, 2, 3], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + result = sess.run(var._is_initialized_op) + self.assertEqual(False, result) + sess.run([var._initializer_for_restore]) + result = sess.run(var._is_initialized_op) + self.assertEqual(False, result) + + saver.restore(sess, ckpt_path) + result = sess.run(var._is_initialized_op) + self.assertEqual(True, result) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index e6140c9c149..701c03f6975 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -434,6 +434,8 @@ def is_multi_tier(storage_type): with ops.control_dependencies(set_attr_ops + [self._init_op]): self._initializer_op = control_flow_ops.no_op() + self.create_init_op_for_restore(name, initial_value, invalid_key, rank) + self._graph_element = self._handle self._cached_value = None if not context.executing_eagerly(): @@ -444,6 +446,49 @@ def is_multi_tier(storage_type): def export(self): return gen_kv_variable_ops.kv_resource_export(self._handle, Tkeys=self._invalid_key_type) + + def create_init_op_for_restore(self, name, initial_value, invalid_key, rank): + with ops.control_dependencies(None if self._is_primary else [self._primary._init_op_for_restore]): + self._initializer_for_restore = gen_kv_variable_ops.initialize_kv_variable_v2_op( + self._handle, + self._primary._handle, + variables._try_guard_against_uninitialized_dependencies(name, initial_value), + ops.convert_to_tensor(invalid_key), + initial_num_buckets=config_pb2.IsSetInitialized.NOT_SET_INITAILIZED, + slot_num = self._slot_num, + shape=initial_value.get_shape()[rank:], + steps_to_live=self._steps_to_live, + emb_index=self._emb_index, block_num=self.block_num, + slot_index=self._slot_index, + ht_type=self._ht_type, + ht_partition_num=self._ht_partition_num, + filter_freq = self._filter_freq, + l2_weight_threshold = self._l2_weight_threshold, + max_element_size = self._max_element_size, + false_positive_probability = self._false_positive_probability, + counter_type = self._counter_type, + max_freq = 99999, + layout = self._layout, + storage_type = self._storage_type, + storage_path = self._storage_path, + storage_size = self._storage_size, + default_value_dim = self._default_value_dim, + default_value_no_permission = self._default_value_no_permission, + record_freq = self._record_freq, + record_version = self._record_version, + embedding_variable_type=config_pb2.EmbeddingVariableType.IMMUTABLE) + set_attr_ops = [] + if self._is_primary and self._is_multi_tier: + with ops.control_dependencies([self._initializer_for_restore]): + set_cache_op = gen_kv_variable_ops.kv_resource_init_cache_strategy_op( + self._handle, + cache_strategy=self._storage_cache_strategy, + Tkeys=self._invalid_key_type, + dtype=self._dtype) + set_attr_ops.append(set_cache_op) + with 
ops.control_dependencies(set_attr_ops + [self._initializer_for_restore]): + self._init_op_for_restore = control_flow_ops.no_op() + def need_counts(self): return (self._record_freq or (self._filter_freq > 0) or self._is_multi_tier) @property @@ -482,6 +527,11 @@ def _init_from_proto(self, variable_def, import_scope=None): cache_op = op elif self._initializer_op.type == "InitializeKvVariableOp": init_op = self._initializer_op + + self._init_op_for_restore = g.as_graph_element( + ops.prepend_name_scope( + variable_def.initialize_op_for_restore, + import_scope=import_scope)) self._trainable = getattr(variable_def, "trainable", True) if variable_def.snapshot_name: self._cached_value = g.as_graph_element( @@ -842,6 +892,8 @@ def to_proto(self, export_scope=None): if self._save_slice_info: var_def.save_slice_info_def.MergeFrom( self._save_slice_info.to_proto(export_scope=export_scope)) + var_def.initialize_op_for_restore = ops.strip_name_scope( + self._init_op_for_restore.name, export_scope) return var_def else: return None diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 2b765814c0d..578d682cc11 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -243,8 +243,7 @@ def _get_processor(v): if v.op.type == "KvVarHandleOp": from tensorflow.core.framework import attr_value_pb2 from tensorflow.core.framework.embedding import config_pb2 - v._init_op._set_attr("embedding_variable_type", - attr_value_pb2.AttrValue(i=config_pb2.EmbeddingVariableType.MUTABLE)) + slot_creator._set_init_op_embedding_type_attr(v, config_pb2.EmbeddingVariableType.MUTABLE) return _DenseResourceVariableProcessor(v) if isinstance(v, variables.Variable): return _RefVariableProcessor(v) diff --git a/tensorflow/python/training/saving/saveable_object_util.py b/tensorflow/python/training/saving/saveable_object_util.py index cd3cba52676..0d8bfe87022 100644 --- a/tensorflow/python/training/saving/saveable_object_util.py +++ b/tensorflow/python/training/saving/saveable_object_util.py @@ -195,7 +195,7 @@ def restore(self, restored_tensors, unused_restored_shapes): if self.var._init_data_source is not None: return self.var.recover_from_init_data_source(self.var._init_data_source, self.partition_id, self.partition_num) else: - with ops.control_dependencies([self.var._initializer_op]): + with ops.control_dependencies([self.var._init_op_for_restore]): rank = self.op.initial_value.get_shape().rank - 1 restore_op = gen_kv_variable_ops.kv_resource_import_v3( restored_tensors[0], diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py index 90a820d82f6..6a359321c20 100644 --- a/tensorflow/python/training/slot_creator.py +++ b/tensorflow/python/training/slot_creator.py @@ -94,8 +94,7 @@ def _create_slot_var(primary, val, scope, validate_shape, shape, dtype, slot_con validate_shape=validate_shape, steps_to_live=primary._steps_to_live, ht_partition_num=primary._ht_partition_num) - slot._init_op._set_attr("embedding_variable_type", - attr_value_pb2.AttrValue(i=config_pb2.EmbeddingVariableType.MUTABLE)) + _set_init_op_embedding_type_attr(slot, config_pb2.EmbeddingVariableType.MUTABLE) else: filter_strategy = None if primary._filter_freq != 0: @@ -107,7 +106,7 @@ def _create_slot_var(primary, val, scope, validate_shape, shape, dtype, slot_con else: filter_strategy = variables.CounterFilter(filter_freq=primary._filter_freq) if slot_config.slot_type is config_pb2.SlotType.EMBEDDING_VARIABLE: - 
primary._init_op._set_attr("slot_num", attr_value_pb2.AttrValue(i=slot_config.slot_num)) + _set_init_op_slot_num_attr(primary, slot_config.slot_num) primary._slot_num = slot_config.slot_num emb_index = primary._emb_index if primary.block_num > 1: @@ -132,8 +131,7 @@ def _create_slot_var(primary, val, scope, validate_shape, shape, dtype, slot_con l2_weight_threshold=primary._l2_weight_threshold, filter_strategy=filter_strategy) ) - slot._init_op._set_attr("embedding_variable_type", - attr_value_pb2.AttrValue(i=config_pb2.EmbeddingVariableType.MUTABLE)) + _set_init_op_embedding_type_attr(slot, config_pb2.EmbeddingVariableType.MUTABLE) else: slot = variable_scope.get_variable( scope, @@ -300,3 +298,13 @@ def create_zeros_slot(primary, name, dtype=None, colocate_with_primary=True, slo return create_slot(primary, val, name, colocate_with_primary=colocate_with_primary, slot_config=slot_config) + +def _set_init_op_embedding_type_attr(var, embedding_type): + var._init_op._set_attr("embedding_variable_type", + attr_value_pb2.AttrValue(i=embedding_type)) + var._initializer_for_restore._set_attr("embedding_variable_type", + attr_value_pb2.AttrValue(i=embedding_type)) + +def _set_init_op_slot_num_attr(var, slot_num): + var._init_op._set_attr("slot_num", attr_value_pb2.AttrValue(i=slot_num)) + var._initializer_for_restore._set_attr("slot_num", attr_value_pb2.AttrValue(i=slot_num)) From 821d5e8d39156d477bacd2ede9f68f76ede0f77d Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Tue, 19 Sep 2023 09:56:20 +0800 Subject: [PATCH 51/91] [Embedding] Remove the dependency on private header file in EmbeddingVariable. (#927) Signed-off-by: lixy9474 --- tensorflow/core/BUILD | 5 +- .../framework/embedding/embedding_config.h | 3 + .../core/framework/embedding/embedding_var.h | 1 - .../embedding/embedding_var_ckpt_data.cc | 262 +++++++ .../embedding/embedding_var_ckpt_data.h | 190 +---- .../embedding/embedding_var_dump_iterator.h | 7 +- .../embedding/embedding_var_restore.cc | 647 ++++++++++++++++++ .../embedding/embedding_var_restore.h | 534 +-------------- .../core/framework/embedding/kv_interface.h | 8 +- .../embedding/ssd_record_descriptor.cc | 88 +++ .../embedding/ssd_record_descriptor.h | 49 +- tensorflow/core/framework/embedding/storage.h | 4 +- tensorflow/core/kernels/BUILD | 5 +- 13 files changed, 1041 insertions(+), 762 deletions(-) create mode 100644 tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc create mode 100644 tensorflow/core/framework/embedding/embedding_var_restore.cc create mode 100644 tensorflow/core/framework/embedding/ssd_record_descriptor.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 8ae5b4f156c..95bbbab5624 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -3026,7 +3026,10 @@ tf_cuda_library( "framework/embedding/gpu_hash_table.cu.cc", "framework/embedding/gpu_hash_table.h", "framework/embedding/embedding_var.cu.cc", - "framework/embedding/multi_tier_storage.cu.cc" + "framework/embedding/multi_tier_storage.cu.cc", + "framework/embedding/embedding_var_ckpt_data.cc", + "framework/embedding/embedding_var_restore.cc", + "framework/embedding/ssd_record_descriptor.cc" ], ) + select({ "//tensorflow:windows": [], diff --git a/tensorflow/core/framework/embedding/embedding_config.h b/tensorflow/core/framework/embedding/embedding_config.h index 0a50b492159..d47d07d4205 100644 --- a/tensorflow/core/framework/embedding/embedding_config.h +++ b/tensorflow/core/framework/embedding/embedding_config.h @@ -3,6 +3,9 @@ #include #include 
"tensorflow/core/framework/embedding/config.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/default/logging.h" namespace tensorflow { struct EmbeddingConfig { diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index b29493f2169..28ce5094d87 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -37,7 +37,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/storage.h" #include "tensorflow/core/framework/embedding/storage_factory.h" #include "tensorflow/core/framework/typed_allocator.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" namespace tensorflow { using CPUDevice = Eigen::ThreadPoolDevice; diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc new file mode 100644 index 00000000000..c1b43a608b5 --- /dev/null +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc @@ -0,0 +1,262 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#include "tensorflow/core/framework/embedding/embedding_var_ckpt_data.h" +#include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { +namespace embedding { +template +void EmbeddingVarCkptData::Emplace( + K key, ValuePtr* value_ptr, + const EmbeddingConfig& emb_config, + V* default_value, int64 value_offset, + bool is_save_freq, + bool is_save_version, + bool save_unfiltered_features) { + if((int64)value_ptr == ValuePtrStatus::IS_DELETED) + return; + + V* primary_val = value_ptr->GetValue(0, 0); + bool is_not_admit = + primary_val == nullptr + && emb_config.filter_freq != 0; + + if (!is_not_admit) { + key_vec_.emplace_back(key); + + if (primary_val == nullptr) { + value_ptr_vec_.emplace_back(default_value); + } else if ( + (int64)primary_val == ValuePosition::NOT_IN_DRAM) { + value_ptr_vec_.emplace_back((V*)ValuePosition::NOT_IN_DRAM); + } else { + V* val = value_ptr->GetValue(emb_config.emb_index, + value_offset); + value_ptr_vec_.emplace_back(val); + } + + + if(is_save_version) { + int64 dump_version = value_ptr->GetStep(); + version_vec_.emplace_back(dump_version); + } + + if(is_save_freq) { + int64 dump_freq = value_ptr->GetFreq(); + freq_vec_.emplace_back(dump_freq); + } + } else { + if (!save_unfiltered_features) + return; + + key_filter_vec_.emplace_back(key); + + if(is_save_version) { + int64 dump_version = value_ptr->GetStep(); + version_filter_vec_.emplace_back(dump_version); + } + + int64 dump_freq = value_ptr->GetFreq(); + freq_filter_vec_.emplace_back(dump_freq); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + 
template void EmbeddingVarCkptData::Emplace( \ + ktype, ValuePtr*, const EmbeddingConfig&, \ + vtype*, int64, bool, bool, bool); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + + +template +void EmbeddingVarCkptData::Emplace(K key, V* value_ptr) { + key_vec_.emplace_back(key); + value_ptr_vec_.emplace_back(value_ptr); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::Emplace( \ + ktype, vtype*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void EmbeddingVarCkptData::SetWithPartition( + std::vector>& ev_ckpt_data_parts) { + part_offset_.resize(kSavedPartitionNum + 1); + part_filter_offset_.resize(kSavedPartitionNum + 1); + part_offset_[0] = 0; + part_filter_offset_[0] = 0; + for (int i = 0; i < kSavedPartitionNum; i++) { + part_offset_[i + 1] = + part_offset_[i] + ev_ckpt_data_parts[i].key_vec_.size(); + + part_filter_offset_[i + 1] = + part_filter_offset_[i] + + ev_ckpt_data_parts[i].key_filter_vec_.size(); + + for (int64 j = 0; j < ev_ckpt_data_parts[i].key_vec_.size(); j++) { + key_vec_.emplace_back(ev_ckpt_data_parts[i].key_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].value_ptr_vec_.size(); j++) { + value_ptr_vec_.emplace_back(ev_ckpt_data_parts[i].value_ptr_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].version_vec_.size(); j++) { + version_vec_.emplace_back(ev_ckpt_data_parts[i].version_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_vec_.size(); j++) { + freq_vec_.emplace_back(ev_ckpt_data_parts[i].freq_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].key_filter_vec_.size(); j++) { + key_filter_vec_.emplace_back(ev_ckpt_data_parts[i].key_filter_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].version_filter_vec_.size(); j++) { + version_filter_vec_.emplace_back(ev_ckpt_data_parts[i].version_filter_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_filter_vec_.size(); j++) { + freq_filter_vec_.emplace_back(ev_ckpt_data_parts[i].freq_filter_vec_[j]); + } + } +} + +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::SetWithPartition( \ + std::vector>&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status EmbeddingVarCkptData::ExportToCkpt( + const string& tensor_name, + BundleWriter* writer, + int64 value_len, + ValueIterator* value_iter) { + size_t bytes_limit = 8 << 20; + std::unique_ptr dump_buffer(new char[bytes_limit]); + + EVVectorDataDumpIterator key_dump_iter(key_vec_); + Status s = SaveTensorWithFixedBuffer( + tensor_name + "-keys", writer, dump_buffer.get(), + bytes_limit, &key_dump_iter, + TensorShape({key_vec_.size()})); + if (!s.ok()) + return s; + + EV2dVectorDataDumpIterator value_dump_iter( + value_ptr_vec_, value_len, value_iter); + s = SaveTensorWithFixedBuffer( + tensor_name + "-values", writer, dump_buffer.get(), + bytes_limit, &value_dump_iter, + TensorShape({value_ptr_vec_.size(), value_len})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator 
version_dump_iter(version_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-versions", writer, dump_buffer.get(), + bytes_limit, &version_dump_iter, + TensorShape({version_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator freq_dump_iter(freq_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-freqs", writer, dump_buffer.get(), + bytes_limit, &freq_dump_iter, + TensorShape({freq_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator filtered_key_dump_iter(key_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-keys_filtered", writer, dump_buffer.get(), + bytes_limit, &filtered_key_dump_iter, + TensorShape({key_filter_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + filtered_version_dump_iter(version_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-versions_filtered", + writer, dump_buffer.get(), + bytes_limit, &filtered_version_dump_iter, + TensorShape({version_filter_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + filtered_freq_dump_iter(freq_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-freqs_filtered", + writer, dump_buffer.get(), + bytes_limit, &filtered_freq_dump_iter, + TensorShape({freq_filter_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + part_offset_dump_iter(part_offset_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-partition_offset", + writer, dump_buffer.get(), + bytes_limit, &part_offset_dump_iter, + TensorShape({part_offset_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + part_filter_offset_dump_iter(part_filter_offset_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-partition_filter_offset", + writer, dump_buffer.get(), + bytes_limit, &part_filter_offset_dump_iter, + TensorShape({part_filter_offset_.size()})); + if (!s.ok()) + return s; + + return Status::OK(); +} + +#define REGISTER_KERNELS(ktype, vtype) \ + template Status EmbeddingVarCkptData::ExportToCkpt( \ + const string&, BundleWriter*, int64, ValueIterator*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS +}// namespace embedding +}// namespace tensorflow \ No newline at end of file diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h index aa1a08cbcfd..6d7b09e70b0 100644 --- a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h @@ -15,11 +15,11 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" -#include "tensorflow/core/kernels/save_restore_tensor.h" #include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" namespace tensorflow { +class BundleWriter; + namespace embedding { template @@ -30,195 +30,17 @@ class EmbeddingVarCkptData { V* default_value, int64 value_offset, bool is_save_freq, bool is_save_version, - bool save_unfiltered_features) { - if((int64)value_ptr == ValuePtrStatus::IS_DELETED) - return; - - V* primary_val = value_ptr->GetValue(0, 0); - bool is_not_admit = - primary_val == nullptr - && emb_config.filter_freq != 0; - - if (!is_not_admit) { - key_vec_.emplace_back(key); - - if (primary_val == nullptr) { - value_ptr_vec_.emplace_back(default_value); - } else if ( - (int64)primary_val == ValuePosition::NOT_IN_DRAM) { - value_ptr_vec_.emplace_back((V*)ValuePosition::NOT_IN_DRAM); - } else { - V* val = value_ptr->GetValue(emb_config.emb_index, - value_offset); - value_ptr_vec_.emplace_back(val); - } - - - if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); - version_vec_.emplace_back(dump_version); - } - - if(is_save_freq) { - int64 dump_freq = value_ptr->GetFreq(); - freq_vec_.emplace_back(dump_freq); - } - } else { - if (!save_unfiltered_features) - return; - - key_filter_vec_.emplace_back(key); + bool save_unfiltered_features); - if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); - version_filter_vec_.emplace_back(dump_version); - } - - int64 dump_freq = value_ptr->GetFreq(); - freq_filter_vec_.emplace_back(dump_freq); - } - } - - void Emplace(K key, V* value_ptr) { - key_vec_.emplace_back(key); - value_ptr_vec_.emplace_back(value_ptr); - } + void Emplace(K key, V* value_ptr); void SetWithPartition( - std::vector>& ev_ckpt_data_parts) { - part_offset_.resize(kSavedPartitionNum + 1); - part_filter_offset_.resize(kSavedPartitionNum + 1); - part_offset_[0] = 0; - part_filter_offset_[0] = 0; - for (int i = 0; i < kSavedPartitionNum; i++) { - part_offset_[i + 1] = - part_offset_[i] + ev_ckpt_data_parts[i].key_vec_.size(); - - part_filter_offset_[i + 1] = - part_filter_offset_[i] + - ev_ckpt_data_parts[i].key_filter_vec_.size(); - - for (int64 j = 0; j < ev_ckpt_data_parts[i].key_vec_.size(); j++) { - key_vec_.emplace_back(ev_ckpt_data_parts[i].key_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].value_ptr_vec_.size(); j++) { - value_ptr_vec_.emplace_back(ev_ckpt_data_parts[i].value_ptr_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].version_vec_.size(); j++) { - version_vec_.emplace_back(ev_ckpt_data_parts[i].version_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_vec_.size(); j++) { - freq_vec_.emplace_back(ev_ckpt_data_parts[i].freq_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].key_filter_vec_.size(); j++) { - key_filter_vec_.emplace_back(ev_ckpt_data_parts[i].key_filter_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].version_filter_vec_.size(); j++) { - version_filter_vec_.emplace_back(ev_ckpt_data_parts[i].version_filter_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_filter_vec_.size(); j++) { - freq_filter_vec_.emplace_back(ev_ckpt_data_parts[i].freq_filter_vec_[j]); - } - } - } + std::vector>& ev_ckpt_data_parts); 
Status ExportToCkpt(const string& tensor_name, BundleWriter* writer, int64 value_len, - ValueIterator* value_iter = nullptr) { - size_t bytes_limit = 8 << 20; - std::unique_ptr dump_buffer(new char[bytes_limit]); - - EVVectorDataDumpIterator key_dump_iter(key_vec_); - Status s = SaveTensorWithFixedBuffer( - tensor_name + "-keys", writer, dump_buffer.get(), - bytes_limit, &key_dump_iter, - TensorShape({key_vec_.size()})); - if (!s.ok()) - return s; - - EV2dVectorDataDumpIterator value_dump_iter( - value_ptr_vec_, value_len, value_iter); - s = SaveTensorWithFixedBuffer( - tensor_name + "-values", writer, dump_buffer.get(), - bytes_limit, &value_dump_iter, - TensorShape({value_ptr_vec_.size(), value_len})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator version_dump_iter(version_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-versions", writer, dump_buffer.get(), - bytes_limit, &version_dump_iter, - TensorShape({version_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator freq_dump_iter(freq_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-freqs", writer, dump_buffer.get(), - bytes_limit, &freq_dump_iter, - TensorShape({freq_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator filtered_key_dump_iter(key_filter_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-keys_filtered", writer, dump_buffer.get(), - bytes_limit, &filtered_key_dump_iter, - TensorShape({key_filter_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator - filtered_version_dump_iter(version_filter_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-versions_filtered", - writer, dump_buffer.get(), - bytes_limit, &filtered_version_dump_iter, - TensorShape({version_filter_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator - filtered_freq_dump_iter(freq_filter_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-freqs_filtered", - writer, dump_buffer.get(), - bytes_limit, &filtered_freq_dump_iter, - TensorShape({freq_filter_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator - part_offset_dump_iter(part_offset_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-partition_offset", - writer, dump_buffer.get(), - bytes_limit, &part_offset_dump_iter, - TensorShape({part_offset_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator - part_filter_offset_dump_iter(part_filter_offset_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-partition_filter_offset", - writer, dump_buffer.get(), - bytes_limit, &part_filter_offset_dump_iter, - TensorShape({part_filter_offset_.size()})); - if (!s.ok()) - return s; - - return Status::OK(); - } - + ValueIterator* value_iter = nullptr); private: std::vector key_vec_; std::vector value_ptr_vec_; diff --git a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h index 71ba054b873..84c823a90dc 100644 --- a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h +++ b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h @@ -15,9 +15,12 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" -#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/framework/embedding/embedding_config.h" +#include "tensorflow/core/framework/embedding/kv_interface.h" namespace tensorflow { +template +class DumpIterator; + namespace embedding { template class EVVectorDataDumpIterator: public DumpIterator { diff --git a/tensorflow/core/framework/embedding/embedding_var_restore.cc b/tensorflow/core/framework/embedding/embedding_var_restore.cc new file mode 100644 index 00000000000..11c13008995 --- /dev/null +++ b/tensorflow/core/framework/embedding/embedding_var_restore.cc @@ -0,0 +1,647 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/embedding/embedding_var_restore.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" + +namespace tensorflow { +template +int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer) { + TensorShape shape; + Status st; + st = reader->LookupTensorShape(record_key, &shape); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + st = reader->LookupHeader(record_key, sizeof(K) * shape.dim_size(0)); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + size_t bytes_read = 0; + *buffer = new K[shape.dim_size(0)]; + st = reader->LookupSegment(record_key, sizeof(K) * shape.dim_size(0), + (char*)*buffer, bytes_read); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + return shape.dim_size(0); +} +#define REGISTER_KERNELS(ktype) \ + template int64 ReadRecord(BundleReader*, const string&, ktype**); +REGISTER_KERNELS(int32); +REGISTER_KERNELS(int64); +#undef REGISTER_KERNELS + +template +void CheckpointLoader::RestoreSSD() { + std::string name_string_temp(restore_args_.m_name_string); + std::string new_str = "_"; + int64 pos = name_string_temp.find("/"); + while (pos != std::string::npos) { + name_string_temp.replace(pos, 1, new_str.data(), 1); + pos = name_string_temp.find("/"); + } + std::string ssd_record_file_name = 
restore_args_.m_file_name_string + "-" + + name_string_temp + "-ssd_record"; + if (Env::Default()->FileExists(ssd_record_file_name + ".index").ok()) { + std::string ssd_emb_file_name = restore_args_.m_file_name_string + "-" + + name_string_temp + "-emb_files"; + BundleReader ssd_record_reader(Env::Default(), ssd_record_file_name); + RestoreSSDBuffer ssd_buffer(&ssd_record_reader); + VLOG(1) << "Loading SSD record... " << ssd_record_file_name; + storage_->RestoreSSD(ev_->GetEmbeddingIndex(), + ev_->GetEmbeddingSlotNum(), ev_->ValueLen(), + ssd_emb_file_name, ev_, ssd_buffer); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::RestoreSSD(); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void CheckpointLoader::RestoreInternal( + const std::string& name_string, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + RestoreBuffer& restore_buff) { + Status s = EVInitTensorNameAndShape(name_string); + if (!s.ok()) { + LOG(ERROR) << "EVInitTensorNameAndShape fail:" << s.ToString(); + return; + } + + Tensor part_offset_tensor; + Tensor part_filter_offset_tensor; + if (!restore_args_.m_is_oldform) { + /****** InitPartOffsetTensor ******/ + TensorShape part_offset_shape, part_filter_offset_shape; + DataType part_offset_type, part_filter_offset_type; + string offset_tensor_name; + if (!restore_args_.m_is_incr) { + offset_tensor_name = name_string + kPartOffsetTensorSuffsix; + } else { + offset_tensor_name = name_string + kIncrPartOffsetTensorSuffsix; + } + + string offset_filter_tensor_name = + name_string + kPartFilterOffsetTensorSuffsix; + Status s = reader_->LookupDtypeAndShape( + offset_tensor_name, &part_offset_type, &part_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail:" << s.error_message(); + } + s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, + &part_filter_offset_type, + &part_filter_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } + part_offset_tensor = + Tensor(cpu_allocator(), part_offset_type, part_offset_shape); + part_filter_offset_tensor = Tensor( + cpu_allocator(), part_filter_offset_type, part_filter_offset_shape); + s = reader_->Lookup(offset_tensor_name, &part_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail:" << s.error_message(); + } + + s = reader_->Lookup(offset_filter_tensor_name, + &part_filter_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } + } + auto part_offset_flat = part_offset_tensor.flat(); + auto part_filter_offset_flat = part_filter_offset_tensor.flat(); + + if (restore_args_.m_is_oldform) { + VLOG(1) << "old form, EV name:" << name_string + << ", partition_id:" << restore_args_.m_partition_id + << ", new partition num:" << restore_args_.m_partition_num; + int64 new_dim = ev_->ValueLen(); + TensorShape key_shape; + Status st = + reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); + if (!st.ok()) { + LOG(ERROR) << "EVRestoreFeaturesOld fail: " << st.error_message(); + } + int tot_key_num = key_shape.dim_size(0); + Status s = EVRestoreFeatures(tot_key_num, 0, 0, 0, 0, restore_buff, + new_dim, emb_config, device); + if (!s.ok()) { + LOG(ERROR) << "EVRestoreFeaturesOld fail: " << s.error_message(); + } + } else { + int64 new_dim = ev_->ValueLen(); + VLOG(1) << "new form 
checkpoint... :" << name_string + << " , partition_id:" << restore_args_.m_partition_id + << " , partition_num:" << restore_args_.m_partition_num; + for (size_t i = 0; i < restore_args_.m_loaded_parts.size(); i++) { + int subpart_id = restore_args_.m_loaded_parts[i]; + size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; + size_t value_unit_bytes_new = sizeof(V) * new_dim; + int subpart_offset = part_offset_flat(subpart_id); + int tot_key_num = part_offset_flat(subpart_id + 1) - subpart_offset; + int64 key_part_offset = subpart_offset * sizeof(K); + int64 value_part_offset = + subpart_offset * sizeof(V) * restore_args_.m_old_dim; + int64 version_part_offset = subpart_offset * sizeof(int64); + int64 freq_part_offset = subpart_offset * sizeof(int64); + VLOG(1) << "dynamically load ev : " << name_string + << ", subpartid:" << subpart_id; + + EVRestoreFeatures(tot_key_num, key_part_offset, value_part_offset, + version_part_offset, freq_part_offset, restore_buff, + new_dim, emb_config, device); + + if (restore_args_.m_has_filter) { + Status s = EVRestoreFilteredFeatures( + subpart_id, new_dim, restore_buff, part_filter_offset_flat, + emb_config, device); + if (!s.ok()) { + LOG(ERROR) << "EVRestoreFilteredFeatures fail: " << s.error_message(); + } + } + } + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::RestoreInternal( \ + const std::string&, const EmbeddingConfig&, \ + const Eigen::GpuDevice*, RestoreBuffer&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +bool CheckpointLoader::IsOldCheckpoint( + const std::string& curr_partid_str, + const std::string& kPartOffsetTensorSuffsix) { + if (restore_args_.m_name_string.find(kPartStr) == std::string::npos) { + string tensor_name = restore_args_.m_name_string; + TensorShape part_offset_shape; + DataType part_offset_type; + Status st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (st.ok()) return false; + + string part_id = std::to_string(0); + tensor_name = restore_args_.m_name_string + "/" + kPartStr + part_id; + + Status form_st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (form_st.ok()) return false; + } else { + string part_id = std::to_string(0); + size_t part_pos = restore_args_.m_name_string.find(kPartStr); + size_t part_size = strlen(kPartStr); + size_t cur_part_size = curr_partid_str.size(); + + string pre_subname = restore_args_.m_name_string.substr(0, part_pos); + string post_subname = restore_args_.m_name_string.substr( + part_pos + part_size + cur_part_size); + string tensor_name = pre_subname + kPartStr + part_id + post_subname; + + TensorShape part_offset_shape; + DataType part_offset_type; + Status form_st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (form_st.ok()) return false; + pre_subname = + restore_args_.m_name_string.substr(0, part_pos - 1); /* var1*/ + post_subname = restore_args_.m_name_string.substr(part_pos + part_size + + cur_part_size); + tensor_name = pre_subname + post_subname; + + Status st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (st.ok()) return false; + } + + return true; +} +#define 
REGISTER_KERNELS(ktype, vtype) \ + template bool CheckpointLoader::IsOldCheckpoint( \ + const std::string&, const std::string&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + + +template +void CheckpointLoader::InitPartNumAndLoadedParts( + std::vector& tensor_name_vec) { + std::string tmp_key_suffix; + std::string tmp_kPartOffsetTensorSuffsix; + if (!restore_args_.m_is_incr) { + tmp_key_suffix = kKeySuffix; + tmp_kPartOffsetTensorSuffsix = kPartOffsetTensorSuffsix; + } else { + tmp_key_suffix = kIncrKeySuffix; + tmp_kPartOffsetTensorSuffsix = kIncrPartOffsetTensorSuffsix; + } + + restore_args_.m_loaded_parts.reserve(kSavedPartitionNum); + int orig_partnum = 0; + const string& curr_partid_str = std::to_string(restore_args_.m_partition_id); + size_t part_pos = restore_args_.m_name_string.find(kPartStr); + + if (IsOldCheckpoint(curr_partid_str, tmp_kPartOffsetTensorSuffsix)) { + restore_args_.m_is_oldform = true; + } + + if (part_pos == std::string::npos) { + for (;; orig_partnum++) { + string part_id = std::to_string(orig_partnum); + string tensor_name = + restore_args_.m_name_string + "/" + kPartStr + part_id; + string tensor_key = tensor_name + tmp_key_suffix; + TensorShape key_shape; + Status st = reader_->LookupTensorShape(tensor_key, &key_shape); + if (!st.ok()) { + break; + } + tensor_name_vec.emplace_back(tensor_name); + } + if (orig_partnum == 0) { + tensor_name_vec.emplace_back(restore_args_.m_name_string); + } + for (int i = 0; i < kSavedPartitionNum; ++i) { + restore_args_.m_loaded_parts.push_back(i); + } + } else { + for (;; orig_partnum++) { + string part_id = std::to_string(orig_partnum); + string pre_subname = restore_args_.m_name_string.substr(0, part_pos); + string post_subname = restore_args_.m_name_string.substr( + part_pos + strlen(kPartStr) + curr_partid_str.size()); + string tensor_name = pre_subname + kPartStr + part_id + post_subname; + string tensor_key = tensor_name + tmp_key_suffix; + TensorShape key_shape; + Status st = reader_->LookupTensorShape(tensor_key, &key_shape); + if (!st.ok()) { + break; + } + tensor_name_vec.emplace_back(tensor_name); + } + if (orig_partnum == 0) { + string pre_subname = restore_args_.m_name_string.substr(0, part_pos - 1); + string post_subname = restore_args_.m_name_string.substr( + part_pos + strlen(kPartStr) + curr_partid_str.size()); + string tmp_name = pre_subname + post_subname; + tensor_name_vec.emplace_back(tmp_name); + } + for (int i = 0; i < kSavedPartitionNum; i++) { + if (i % restore_args_.m_partition_num == restore_args_.m_partition_id) { + restore_args_.m_loaded_parts.push_back(i); + } + } + } + for (auto& tensor_name : tensor_name_vec) { + VLOG(1) << "**** " << restore_args_.m_name_string << " " << tensor_name + << " ****"; + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::InitPartNumAndLoadedParts(\ + std::vector&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status CheckpointLoader::EVInitTensorNameAndShape( + const std::string& tensor_name) { + if (!restore_args_.m_is_incr) { + restore_args_.m_tensor_key = tensor_name + kKeySuffix; + restore_args_.m_tensor_value = tensor_name + kValueSuffix; + restore_args_.m_tensor_version = 
tensor_name + kVersionSuffix; + restore_args_.m_tensor_freq = tensor_name + kFreqSuffix; + } else { + restore_args_.m_tensor_key = tensor_name + kIncrKeySuffix; + restore_args_.m_tensor_value = tensor_name + kIncrValueSuffix; + restore_args_.m_tensor_version = tensor_name + kIncrVersionSuffix; + restore_args_.m_tensor_freq = tensor_name + kIncrFreqSuffix; + } + + TensorShape key_shape, value_shape, version_shape, freq_shape; + + Status st = + reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_value, &value_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_version, + &version_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_key, + sizeof(K) * key_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_value, + sizeof(V) * value_shape.dim_size(0) * + value_shape.dim_size(1)); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_version, + sizeof(int64) * version_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_freq, &freq_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + freq_shape = version_shape; + } else { + return st; + } + } + st = reader_->LookupHeader(restore_args_.m_tensor_freq, + sizeof(int64) * freq_shape.dim_size(0)); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + restore_args_.m_has_freq = false; + } else { + return st; + } + } + restore_args_.m_old_dim = value_shape.dim_size(1); + + if (!restore_args_.m_is_oldform) { + TensorShape key_filter_shape, version_filter_shape, freq_filter_shape; + st = reader_->LookupTensorShape(restore_args_.m_tensor_key + "_filtered", + &key_filter_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + key_filter_shape = key_shape; + restore_args_.m_has_filter = false; + } else { + return st; + } + } + st = reader_->LookupTensorShape( + restore_args_.m_tensor_version + "_filtered", &version_filter_shape); + if ((!st.ok()) && (st.code() != error::NOT_FOUND)) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_key + "_filtered", + sizeof(K) * key_filter_shape.dim_size(0)); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + restore_args_.m_has_filter = false; + } else { + return st; + } + } + st = reader_->LookupHeader(restore_args_.m_tensor_version + "_filtered", + sizeof(K) * version_filter_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_freq + "_filtered", + &freq_filter_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + freq_filter_shape = freq_shape; + } else { + return st; + } + } + + st = reader_->LookupHeader(restore_args_.m_tensor_freq + "_filtered", + sizeof(K) * freq_filter_shape.dim_size(0)); + if (!st.ok() && st.code() != error::NOT_FOUND) { + return st; + } + } + return st; +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVInitTensorNameAndShape(\ + const std::string&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status CheckpointLoader::EVRestoreFeatures( + int tot_key_num, int64 key_part_offset, + int64 value_part_offset, int64 
version_part_offset, + int64 freq_part_offset, RestoreBuffer& restore_buff, + int64 new_dim, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device) { + size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; + size_t value_unit_bytes_new = sizeof(V) * new_dim; + int64 tot_key_bytes_read(0); + int64 tot_value_bytes_read(0); + int64 tot_version_bytes_read(0); + int64 tot_freq_bytes_read(0); + size_t key_bytes_read = 0; + size_t value_bytes_read = 0; + size_t version_bytes_read = 0; + size_t freq_bytes_read = 0; + + while (tot_key_num > 0) { + size_t read_key_num = std::min( + std::min(kBufferSize / sizeof(K), kBufferSize / value_unit_bytes), + kBufferSize / sizeof(int64)); + read_key_num = std::min(read_key_num, kBufferSize / value_unit_bytes_new); + read_key_num = std::min((int)read_key_num, tot_key_num); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_key, key_part_offset + tot_key_bytes_read, + read_key_num * sizeof(K), restore_buff.key_buffer, key_bytes_read); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_value, value_part_offset + tot_value_bytes_read, + read_key_num * value_unit_bytes, restore_buff.value_buffer, + value_bytes_read); + if (!restore_args_.m_reset_version) { + reader_->LookupSegmentOffset( + restore_args_.m_tensor_version, + version_part_offset + tot_version_bytes_read, + read_key_num * sizeof(int64), restore_buff.version_buffer, + version_bytes_read); + if (version_bytes_read == 0) { + memset(restore_buff.version_buffer, -1, sizeof(int64) * read_key_num); + } + } else { + int64* version_tmp = (int64*)restore_buff.version_buffer; + memset(version_tmp, 0, read_key_num * sizeof(int64)); + } + + if (restore_args_.m_has_freq) { + reader_->LookupSegmentOffset( + restore_args_.m_tensor_freq, freq_part_offset + tot_freq_bytes_read, + read_key_num * sizeof(int64), restore_buff.freq_buffer, + freq_bytes_read); + if (freq_bytes_read == 0) { + int64* freq_tmp = (int64*)restore_buff.freq_buffer; + for (int64 i = 0; i < read_key_num; i++) { + freq_tmp[i] = (ev_->MinFreq() == 0) ? 1 : ev_->MinFreq(); + } + } + } else { + int64* freq_tmp = (int64*)restore_buff.freq_buffer; + for (int64 i = 0; i < read_key_num; i++) { + freq_tmp[i] = (ev_->MinFreq() == 0) ? 
1 : ev_->MinFreq(); + } + } + if (key_bytes_read > 0) { + read_key_num = key_bytes_read / sizeof(K); + Status st = RestoreCustomDim(new_dim, read_key_num, value_unit_bytes, + value_bytes_read, value_unit_bytes_new, + restore_buff); + if (!st.ok()) { + LOG(FATAL) << "EV Restore fail:" << st.ToString(); + } + + st = storage_->RestoreFeatures( + read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, + restore_args_.m_partition_num, new_dim, false, restore_args_.m_is_incr, + emb_config, device, + filter_, restore_buff); + if (!st.ok()) { + LOG(FATAL) << "EV Restore fail:" << st.ToString(); + } + } + + tot_key_num -= read_key_num; + tot_key_bytes_read += key_bytes_read; + tot_value_bytes_read += value_bytes_read; + tot_version_bytes_read += version_bytes_read; + tot_freq_bytes_read += freq_bytes_read; + } + + return Status::OK(); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVRestoreFeatures( \ + int, int64, int64, int64, int64, RestoreBuffer&, \ + int64, const EmbeddingConfig&, const Eigen::GpuDevice*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status CheckpointLoader::EVRestoreFilteredFeatures( + int64 subpart_id, int64 value_len, RestoreBuffer& restore_buff, + typename TTypes::Flat part_filter_offset_flat, + const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device) { + int subpart_filter_offset = part_filter_offset_flat(subpart_id); + int tot_key_filter_num = + part_filter_offset_flat(subpart_id + 1) - subpart_filter_offset; + int64 key_filter_part_offset = subpart_filter_offset * sizeof(K); + int64 version_filter_part_offset = subpart_filter_offset * sizeof(int64); + int64 freq_filter_part_offset = subpart_filter_offset * sizeof(int64); + + VLOG(1) << "key_filter_num: " << tot_key_filter_num + << ", subpart_filter_offset: " << subpart_filter_offset; + + size_t key_filter_bytes_read = 0; + size_t version_filter_bytes_read = 0; + size_t freq_filter_bytes_read = 0; + + while (tot_key_filter_num > 0) { + size_t read_key_num = + std::min(kBufferSize / sizeof(K), kBufferSize / sizeof(int64)); + read_key_num = std::min((int)read_key_num, tot_key_filter_num); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_key + "_filtered", + key_filter_part_offset + key_filter_bytes_read, + read_key_num * sizeof(K), restore_buff.key_buffer, + key_filter_bytes_read); + if (!restore_args_.m_reset_version) { + reader_->LookupSegmentOffset( + restore_args_.m_tensor_version + "_filtered", + version_filter_part_offset + version_filter_bytes_read, + read_key_num * sizeof(int64), restore_buff.version_buffer, + version_filter_bytes_read); + } else { + int64* version_tmp = (int64*)restore_buff.version_buffer; + memset(version_tmp, 0, read_key_num * sizeof(int64)); + } + reader_->LookupSegmentOffset( + restore_args_.m_tensor_freq + "_filtered", + freq_filter_part_offset + freq_filter_bytes_read, + read_key_num * sizeof(int64), restore_buff.freq_buffer, + freq_filter_bytes_read); + if (key_filter_bytes_read > 0) { + read_key_num = key_filter_bytes_read / sizeof(K); + VLOG(2) << "restore, read_key_num:" << read_key_num; + Status st = storage_->RestoreFeatures( + read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, + restore_args_.m_partition_num, value_len, true, restore_args_.m_is_incr, + emb_config, device, + filter_, restore_buff); + if (!st.ok()) return st; + 
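+        // Advance by the number of keys actually read: read_key_num was
+        // recomputed from key_filter_bytes_read above, so a short read
+        // from LookupSegmentOffset shrinks this pass instead of skipping
+        // unread filtered keys.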
tot_key_filter_num -= read_key_num; + } + } + return Status::OK(); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVRestoreFilteredFeatures( \ + int64, int64, RestoreBuffer&, typename TTypes::Flat, \ + const EmbeddingConfig&, const Eigen::GpuDevice*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +}// namespace tensorflow \ No newline at end of file diff --git a/tensorflow/core/framework/embedding/embedding_var_restore.h b/tensorflow/core/framework/embedding/embedding_var_restore.h index ec97566fbec..3016ba9eeb8 100644 --- a/tensorflow/core/framework/embedding/embedding_var_restore.h +++ b/tensorflow/core/framework/embedding/embedding_var_restore.h @@ -16,23 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ -#include "tensorflow/core/framework/allocator.h" -#include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/embedding/embedding_var.h" #include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/filter_policy.h" #include "tensorflow/core/framework/embedding/storage.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/kernels/save_restore_tensor.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/random/philox_random.h" -#include "tensorflow/core/lib/random/random.h" -#include "tensorflow/core/lib/random/random_distributions.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" +#include "tensorflow/core/util/env_var.h" namespace tensorflow { using GPUDevice = Eigen::GpuDevice; @@ -60,26 +48,7 @@ namespace { } // namespace template -int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer) { - TensorShape shape; - Status st; - st = reader->LookupTensorShape(record_key, &shape); - if (!st.ok()) { - LOG(FATAL) << "Restore record " << record_key << " failed"; - } - st = reader->LookupHeader(record_key, sizeof(K) * shape.dim_size(0)); - if (!st.ok()) { - LOG(FATAL) << "Restore record " << record_key << " failed"; - } - size_t bytes_read = 0; - *buffer = new K[shape.dim_size(0)]; - st = reader->LookupSegment(record_key, sizeof(K) * shape.dim_size(0), - (char*)*buffer, bytes_read); - if (!st.ok()) { - LOG(FATAL) << "Restore record " << record_key << " failed"; - } - return shape.dim_size(0); -} +int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer); template struct RestoreSSDBuffer { @@ -178,513 +147,28 @@ class CheckpointLoader { void RestoreInternal(const std::string& name_string, const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device, - RestoreBuffer& restore_buff) { - Status s = EVInitTensorNameAndShape(name_string); - if (!s.ok()) { - LOG(ERROR) << "EVInitTensorNameAndShape fail:" << s.ToString(); - return; - } - - Tensor part_offset_tensor; - Tensor part_filter_offset_tensor; - if (!restore_args_.m_is_oldform) { - /****** InitPartOffsetTensor ******/ - TensorShape part_offset_shape, part_filter_offset_shape; - DataType part_offset_type, part_filter_offset_type; - string 
offset_tensor_name; - if (!restore_args_.m_is_incr) { - offset_tensor_name = name_string + kPartOffsetTensorSuffsix; - } else { - offset_tensor_name = name_string + kIncrPartOffsetTensorSuffsix; - } - - string offset_filter_tensor_name = - name_string + kPartFilterOffsetTensorSuffsix; - Status s = reader_->LookupDtypeAndShape( - offset_tensor_name, &part_offset_type, &part_offset_shape); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail:" << s.error_message(); - } - s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, - &part_filter_offset_type, - &part_filter_offset_shape); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail: " << s.error_message(); - } - part_offset_tensor = - Tensor(cpu_allocator(), part_offset_type, part_offset_shape); - part_filter_offset_tensor = Tensor( - cpu_allocator(), part_filter_offset_type, part_filter_offset_shape); - s = reader_->Lookup(offset_tensor_name, &part_offset_tensor); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail:" << s.error_message(); - } - - s = reader_->Lookup(offset_filter_tensor_name, - &part_filter_offset_tensor); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail: " << s.error_message(); - } - } - auto part_offset_flat = part_offset_tensor.flat(); - auto part_filter_offset_flat = part_filter_offset_tensor.flat(); - - if (restore_args_.m_is_oldform) { - VLOG(1) << "old form, EV name:" << name_string - << ", partition_id:" << restore_args_.m_partition_id - << ", new partition num:" << restore_args_.m_partition_num; - int64 new_dim = ev_->ValueLen(); - TensorShape key_shape; - Status st = - reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); - if (!st.ok()) { - } - int tot_key_num = key_shape.dim_size(0); - Status s = EVRestoreFeatures(tot_key_num, 0, 0, 0, 0, restore_buff, - new_dim, emb_config, device); - if (!s.ok()) { - LOG(ERROR) << "EVRestoreFeaturesOld fail: " << s.error_message(); - } - } else { - int64 new_dim = ev_->ValueLen(); - VLOG(1) << "new form checkpoint... 
:" << name_string - << " , partition_id:" << restore_args_.m_partition_id - << " , partition_num:" << restore_args_.m_partition_num; - for (size_t i = 0; i < restore_args_.m_loaded_parts.size(); i++) { - int subpart_id = restore_args_.m_loaded_parts[i]; - size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; - size_t value_unit_bytes_new = sizeof(V) * new_dim; - int subpart_offset = part_offset_flat(subpart_id); - int tot_key_num = part_offset_flat(subpart_id + 1) - subpart_offset; - int64 key_part_offset = subpart_offset * sizeof(K); - int64 value_part_offset = - subpart_offset * sizeof(V) * restore_args_.m_old_dim; - int64 version_part_offset = subpart_offset * sizeof(int64); - int64 freq_part_offset = subpart_offset * sizeof(int64); - VLOG(1) << "dynamically load ev : " << name_string - << ", subpartid:" << subpart_id; - - EVRestoreFeatures(tot_key_num, key_part_offset, value_part_offset, - version_part_offset, freq_part_offset, restore_buff, - new_dim, emb_config, device); - - if (restore_args_.m_has_filter) { - Status s = EVRestoreFilteredFeatures( - subpart_id, new_dim, restore_buff, part_filter_offset_flat, - emb_config, device); - if (!s.ok()) { - LOG(ERROR) << "EVRestoreFilteredFeatures fail: " << s.error_message(); - } - } - } - } - } + RestoreBuffer& restore_buff); private: - void RestoreSSD() { - std::string name_string_temp(restore_args_.m_name_string); - std::string new_str = "_"; - int64 pos = name_string_temp.find("/"); - while (pos != std::string::npos) { - name_string_temp.replace(pos, 1, new_str.data(), 1); - pos = name_string_temp.find("/"); - } - std::string ssd_record_file_name = restore_args_.m_file_name_string + "-" + - name_string_temp + "-ssd_record"; - if (Env::Default()->FileExists(ssd_record_file_name + ".index").ok()) { - std::string ssd_emb_file_name = restore_args_.m_file_name_string + "-" + - name_string_temp + "-emb_files"; - BundleReader ssd_record_reader(Env::Default(), ssd_record_file_name); - RestoreSSDBuffer ssd_buffer(&ssd_record_reader); - VLOG(1) << "Loading SSD record... 
" << ssd_record_file_name; - storage_->RestoreSSD(ev_->GetEmbeddingIndex(), - ev_->GetEmbeddingSlotNum(), ev_->ValueLen(), - ssd_emb_file_name, ev_, ssd_buffer); - } - } + void RestoreSSD(); bool IsOldCheckpoint(const std::string& curr_partid_str, - const std::string& kPartOffsetTensorSuffsix) { - if (restore_args_.m_name_string.find(kPartStr) == std::string::npos) { - string tensor_name = restore_args_.m_name_string; - TensorShape part_offset_shape; - DataType part_offset_type; - Status st = - reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, - &part_offset_type, &part_offset_shape); - if (st.ok()) return false; - - string part_id = std::to_string(0); - tensor_name = restore_args_.m_name_string + "/" + kPartStr + part_id; - - Status form_st = - reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, - &part_offset_type, &part_offset_shape); - if (form_st.ok()) return false; - } else { - string part_id = std::to_string(0); - size_t part_pos = restore_args_.m_name_string.find(kPartStr); - size_t part_size = strlen(kPartStr); - size_t cur_part_size = curr_partid_str.size(); - - string pre_subname = restore_args_.m_name_string.substr(0, part_pos); - string post_subname = restore_args_.m_name_string.substr( - part_pos + part_size + cur_part_size); - string tensor_name = pre_subname + kPartStr + part_id + post_subname; - - TensorShape part_offset_shape; - DataType part_offset_type; - Status form_st = - reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, - &part_offset_type, &part_offset_shape); - if (form_st.ok()) return false; - pre_subname = - restore_args_.m_name_string.substr(0, part_pos - 1); /* var1*/ - post_subname = restore_args_.m_name_string.substr(part_pos + part_size + - cur_part_size); - tensor_name = pre_subname + post_subname; - - Status st = - reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, - &part_offset_type, &part_offset_shape); - if (st.ok()) return false; - } - - return true; - } - - void InitPartNumAndLoadedParts(std::vector& tensor_name_vec) { - std::string tmp_key_suffix; - std::string tmp_kPartOffsetTensorSuffsix; - if (!restore_args_.m_is_incr) { - tmp_key_suffix = kKeySuffix; - tmp_kPartOffsetTensorSuffsix = kPartOffsetTensorSuffsix; - } else { - tmp_key_suffix = kIncrKeySuffix; - tmp_kPartOffsetTensorSuffsix = kIncrPartOffsetTensorSuffsix; - } - - restore_args_.m_loaded_parts.reserve(kSavedPartitionNum); - int orig_partnum = 0; - const string& curr_partid_str = std::to_string(restore_args_.m_partition_id); - size_t part_pos = restore_args_.m_name_string.find(kPartStr); - - if (IsOldCheckpoint(curr_partid_str, tmp_kPartOffsetTensorSuffsix)) { - restore_args_.m_is_oldform = true; - } - - if (part_pos == std::string::npos) { - for (;; orig_partnum++) { - string part_id = std::to_string(orig_partnum); - string tensor_name = - restore_args_.m_name_string + "/" + kPartStr + part_id; - string tensor_key = tensor_name + tmp_key_suffix; - TensorShape key_shape; - Status st = reader_->LookupTensorShape(tensor_key, &key_shape); - if (!st.ok()) { - break; - } - tensor_name_vec.emplace_back(tensor_name); - } - if (orig_partnum == 0) { - tensor_name_vec.emplace_back(restore_args_.m_name_string); - } - for (int i = 0; i < kSavedPartitionNum; ++i) { - restore_args_.m_loaded_parts.push_back(i); - } - } else { - for (;; orig_partnum++) { - string part_id = std::to_string(orig_partnum); - string pre_subname = restore_args_.m_name_string.substr(0, part_pos); - string post_subname = restore_args_.m_name_string.substr( 
- part_pos + strlen(kPartStr) + curr_partid_str.size()); - string tensor_name = pre_subname + kPartStr + part_id + post_subname; - string tensor_key = tensor_name + tmp_key_suffix; - TensorShape key_shape; - Status st = reader_->LookupTensorShape(tensor_key, &key_shape); - if (!st.ok()) { - break; - } - tensor_name_vec.emplace_back(tensor_name); - } - if (orig_partnum == 0) { - string pre_subname = restore_args_.m_name_string.substr(0, part_pos - 1); - string post_subname = restore_args_.m_name_string.substr( - part_pos + strlen(kPartStr) + curr_partid_str.size()); - string tmp_name = pre_subname + post_subname; - tensor_name_vec.emplace_back(tmp_name); - } - for (int i = 0; i < kSavedPartitionNum; i++) { - if (i % restore_args_.m_partition_num == restore_args_.m_partition_id) { - restore_args_.m_loaded_parts.push_back(i); - } - } - } - for (auto& tensor_name : tensor_name_vec) { - VLOG(1) << "**** " << restore_args_.m_name_string << " " << tensor_name - << " ****"; - } - } + const std::string& kPartOffsetTensorSuffsix); - Status EVInitTensorNameAndShape(const std::string& tensor_name) { - if (!restore_args_.m_is_incr) { - restore_args_.m_tensor_key = tensor_name + kKeySuffix; - restore_args_.m_tensor_value = tensor_name + kValueSuffix; - restore_args_.m_tensor_version = tensor_name + kVersionSuffix; - restore_args_.m_tensor_freq = tensor_name + kFreqSuffix; - } else { - restore_args_.m_tensor_key = tensor_name + kIncrKeySuffix; - restore_args_.m_tensor_value = tensor_name + kIncrValueSuffix; - restore_args_.m_tensor_version = tensor_name + kIncrVersionSuffix; - restore_args_.m_tensor_freq = tensor_name + kIncrFreqSuffix; - } + void InitPartNumAndLoadedParts(std::vector& tensor_name_vec); - TensorShape key_shape, value_shape, version_shape, freq_shape; - - Status st = - reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); - if (!st.ok()) { - return st; - } - st = reader_->LookupTensorShape(restore_args_.m_tensor_value, &value_shape); - if (!st.ok()) { - return st; - } - st = reader_->LookupTensorShape(restore_args_.m_tensor_version, - &version_shape); - if (!st.ok()) { - return st; - } - st = reader_->LookupHeader(restore_args_.m_tensor_key, - sizeof(K) * key_shape.dim_size(0)); - if (!st.ok()) { - return st; - } - st = reader_->LookupHeader(restore_args_.m_tensor_value, - sizeof(V) * value_shape.dim_size(0) * - value_shape.dim_size(1)); - if (!st.ok()) { - return st; - } - st = reader_->LookupHeader(restore_args_.m_tensor_version, - sizeof(int64) * version_shape.dim_size(0)); - if (!st.ok()) { - return st; - } - st = reader_->LookupTensorShape(restore_args_.m_tensor_freq, &freq_shape); - if (!st.ok()) { - if (st.code() == error::NOT_FOUND) { - freq_shape = version_shape; - } else { - return st; - } - } - st = reader_->LookupHeader(restore_args_.m_tensor_freq, - sizeof(int64) * freq_shape.dim_size(0)); - if (!st.ok()) { - if (st.code() == error::NOT_FOUND) { - restore_args_.m_has_freq = false; - } else { - return st; - } - } - restore_args_.m_old_dim = value_shape.dim_size(1); - - if (!restore_args_.m_is_oldform) { - TensorShape key_filter_shape, version_filter_shape, freq_filter_shape; - st = reader_->LookupTensorShape(restore_args_.m_tensor_key + "_filtered", - &key_filter_shape); - if (!st.ok()) { - if (st.code() == error::NOT_FOUND) { - key_filter_shape = key_shape; - restore_args_.m_has_filter = false; - } else { - return st; - } - } - st = reader_->LookupTensorShape( - restore_args_.m_tensor_version + "_filtered", &version_filter_shape); - if ((!st.ok()) && (st.code() 
!= error::NOT_FOUND)) { - return st; - } - st = reader_->LookupHeader(restore_args_.m_tensor_key + "_filtered", - sizeof(K) * key_filter_shape.dim_size(0)); - if (!st.ok()) { - if (st.code() == error::NOT_FOUND) { - restore_args_.m_has_filter = false; - } else { - return st; - } - } - st = reader_->LookupHeader(restore_args_.m_tensor_version + "_filtered", - sizeof(K) * version_filter_shape.dim_size(0)); - if (!st.ok()) { - return st; - } - st = reader_->LookupTensorShape(restore_args_.m_tensor_freq + "_filtered", - &freq_filter_shape); - if (!st.ok()) { - if (st.code() == error::NOT_FOUND) { - freq_filter_shape = freq_shape; - } else { - return st; - } - } - - st = reader_->LookupHeader(restore_args_.m_tensor_freq + "_filtered", - sizeof(K) * freq_filter_shape.dim_size(0)); - if (!st.ok() && st.code() != error::NOT_FOUND) { - return st; - } - } - return st; - } + Status EVInitTensorNameAndShape(const std::string& tensor_name); Status EVRestoreFeatures(int tot_key_num, int64 key_part_offset, int64 value_part_offset, int64 version_part_offset, int64 freq_part_offset, RestoreBuffer& restore_buff, int64 new_dim, const EmbeddingConfig& emb_config, - const Eigen::GpuDevice* device) { - size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; - size_t value_unit_bytes_new = sizeof(V) * new_dim; - int64 tot_key_bytes_read(0); - int64 tot_value_bytes_read(0); - int64 tot_version_bytes_read(0); - int64 tot_freq_bytes_read(0); - size_t key_bytes_read = 0; - size_t value_bytes_read = 0; - size_t version_bytes_read = 0; - size_t freq_bytes_read = 0; - - while (tot_key_num > 0) { - size_t read_key_num = std::min( - std::min(kBufferSize / sizeof(K), kBufferSize / value_unit_bytes), - kBufferSize / sizeof(int64)); - read_key_num = std::min(read_key_num, kBufferSize / value_unit_bytes_new); - read_key_num = std::min((int)read_key_num, tot_key_num); - reader_->LookupSegmentOffset( - restore_args_.m_tensor_key, key_part_offset + tot_key_bytes_read, - read_key_num * sizeof(K), restore_buff.key_buffer, key_bytes_read); - reader_->LookupSegmentOffset( - restore_args_.m_tensor_value, value_part_offset + tot_value_bytes_read, - read_key_num * value_unit_bytes, restore_buff.value_buffer, - value_bytes_read); - if (!restore_args_.m_reset_version) { - reader_->LookupSegmentOffset( - restore_args_.m_tensor_version, - version_part_offset + tot_version_bytes_read, - read_key_num * sizeof(int64), restore_buff.version_buffer, - version_bytes_read); - if (version_bytes_read == 0) { - memset(restore_buff.version_buffer, -1, sizeof(int64) * read_key_num); - } - } else { - int64* version_tmp = (int64*)restore_buff.version_buffer; - memset(version_tmp, 0, read_key_num * sizeof(int64)); - } - - if (restore_args_.m_has_freq) { - reader_->LookupSegmentOffset( - restore_args_.m_tensor_freq, freq_part_offset + tot_freq_bytes_read, - read_key_num * sizeof(int64), restore_buff.freq_buffer, - freq_bytes_read); - if (freq_bytes_read == 0) { - int64* freq_tmp = (int64*)restore_buff.freq_buffer; - for (int64 i = 0; i < read_key_num; i++) { - freq_tmp[i] = (ev_->MinFreq() == 0) ? 1 : ev_->MinFreq(); - } - } - } else { - int64* freq_tmp = (int64*)restore_buff.freq_buffer; - for (int64 i = 0; i < read_key_num; i++) { - freq_tmp[i] = (ev_->MinFreq() == 0) ? 
1 : ev_->MinFreq(); - } - } - if (key_bytes_read > 0) { - read_key_num = key_bytes_read / sizeof(K); - Status st = RestoreCustomDim(new_dim, read_key_num, value_unit_bytes, - value_bytes_read, value_unit_bytes_new, - restore_buff); - if (!st.ok()) { - LOG(FATAL) << "EV Restore fail:" << st.ToString(); - } - - st = storage_->RestoreFeatures( - read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, - restore_args_.m_partition_num, new_dim, false, restore_args_.m_is_incr, - emb_config, device, - filter_, restore_buff); - if (!st.ok()) { - LOG(FATAL) << "EV Restore fail:" << st.ToString(); - } - } - - tot_key_num -= read_key_num; - tot_key_bytes_read += key_bytes_read; - tot_value_bytes_read += value_bytes_read; - tot_version_bytes_read += version_bytes_read; - tot_freq_bytes_read += freq_bytes_read; - } - - return Status::OK(); - } + const Eigen::GpuDevice* device); Status EVRestoreFilteredFeatures( int64 subpart_id, int64 value_len, RestoreBuffer& restore_buff, typename TTypes::Flat part_filter_offset_flat, - const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device) { - int subpart_filter_offset = part_filter_offset_flat(subpart_id); - int tot_key_filter_num = - part_filter_offset_flat(subpart_id + 1) - subpart_filter_offset; - int64 key_filter_part_offset = subpart_filter_offset * sizeof(K); - int64 version_filter_part_offset = subpart_filter_offset * sizeof(int64); - int64 freq_filter_part_offset = subpart_filter_offset * sizeof(int64); - - VLOG(1) << "key_filter_num: " << tot_key_filter_num - << ", subpart_filter_offset: " << subpart_filter_offset; - - size_t key_filter_bytes_read = 0; - size_t version_filter_bytes_read = 0; - size_t freq_filter_bytes_read = 0; - - while (tot_key_filter_num > 0) { - size_t read_key_num = - std::min(kBufferSize / sizeof(K), kBufferSize / sizeof(int64)); - read_key_num = std::min((int)read_key_num, tot_key_filter_num); - reader_->LookupSegmentOffset( - restore_args_.m_tensor_key + "_filtered", - key_filter_part_offset + key_filter_bytes_read, - read_key_num * sizeof(K), restore_buff.key_buffer, - key_filter_bytes_read); - if (!restore_args_.m_reset_version) { - reader_->LookupSegmentOffset( - restore_args_.m_tensor_version + "_filtered", - version_filter_part_offset + version_filter_bytes_read, - read_key_num * sizeof(int64), restore_buff.version_buffer, - version_filter_bytes_read); - } else { - int64* version_tmp = (int64*)restore_buff.version_buffer; - memset(version_tmp, 0, read_key_num * sizeof(int64)); - } - reader_->LookupSegmentOffset( - restore_args_.m_tensor_freq + "_filtered", - freq_filter_part_offset + freq_filter_bytes_read, - read_key_num * sizeof(int64), restore_buff.freq_buffer, - freq_filter_bytes_read); - if (key_filter_bytes_read > 0) { - read_key_num = key_filter_bytes_read / sizeof(K); - VLOG(2) << "restore, read_key_num:" << read_key_num; - Status st = storage_->RestoreFeatures( - read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, - restore_args_.m_partition_num, value_len, true, restore_args_.m_is_incr, - emb_config, device, - filter_, restore_buff); - if (!st.ok()) return st; - tot_key_filter_num -= read_key_num; - } - } - return Status::OK(); - } + const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device); Status RestoreCustomDim(int new_dim, int read_key_num, size_t value_unit_bytes, size_t value_bytes_read, diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index 71667cf0917..5d1f20b581a 100644 --- 
a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ +#include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { @@ -29,6 +30,7 @@ class ValuePtr; template class GPUHashTable; +using GPUDevice = Eigen::GpuDevice; namespace embedding { template @@ -90,15 +92,15 @@ class KVInterface { virtual Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, int32 default_v_num, - size_t n, const Eigen::GpuDevice& device) { + size_t n, const GPUDevice& device) { return Status::OK(); } virtual Status BatchLookupOrCreateKeys(const K* keys, size_t n, - int32* item_idxs, const Eigen::GpuDevice& device) { + int32* item_idxs, const GPUDevice& device) { return Status::OK(); } - virtual Status BatchLookup(const Eigen::GpuDevice& device, + virtual Status BatchLookup(const GPUDevice& device, const K* keys, V* val, size_t n, const V* default_v) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookup in KVInterface."); diff --git a/tensorflow/core/framework/embedding/ssd_record_descriptor.cc b/tensorflow/core/framework/embedding/ssd_record_descriptor.cc new file mode 100644 index 00000000000..b224b24e856 --- /dev/null +++ b/tensorflow/core/framework/embedding/ssd_record_descriptor.cc @@ -0,0 +1,88 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ +#include "tensorflow/core/framework/embedding/ssd_record_descriptor.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/path.h" + +namespace tensorflow { +namespace embedding { +template +template +void SsdRecordDescriptor::DumpSection( + const std::vector& data_vec, + const std::string& section_str, + BundleWriter* writer, + std::vector& dump_buffer) { + EVVectorDataDumpIterator iter(data_vec); + SaveTensorWithFixedBuffer( + section_str, + writer, dump_buffer.data(), + dump_buffer.size(), &iter, + TensorShape({data_vec.size()})); +} +#define REGISTER_KERNELS(ktype, ttype) \ + template void SsdRecordDescriptor::DumpSection( \ + const std::vector&, const std::string&, \ + BundleWriter*, std::vector&); +REGISTER_KERNELS(int32, int32); +REGISTER_KERNELS(int32, int64); +REGISTER_KERNELS(int64, int32); +REGISTER_KERNELS(int64, int64); +#undef REGISTER_KERNELS + +template +void SsdRecordDescriptor::DumpSsdMeta( + const std::string& prefix, + const std::string& var_name) { + std::fstream fs; + std::string var_name_temp(var_name); + std::string new_str = "_"; + int64 pos = var_name_temp.find("/"); + while (pos != std::string::npos) { + var_name_temp.replace(pos, 1, new_str.data(), 1); + pos = var_name_temp.find("/"); + } + + std::string ssd_record_path = + prefix + "-" + var_name_temp + "-ssd_record"; + BundleWriter ssd_record_writer(Env::Default(), + ssd_record_path); + size_t bytes_limit = 8 << 20; + std::vector dump_buffer(bytes_limit); + + DumpSection(key_list, "keys", + &ssd_record_writer, dump_buffer); + DumpSection(key_file_id_list, "keys_file_id", + &ssd_record_writer, dump_buffer); + DumpSection(key_offset_list, "keys_offset", + &ssd_record_writer, dump_buffer); + DumpSection(file_list, "files", + &ssd_record_writer, dump_buffer); + DumpSection(invalid_record_count_list, "invalid_record_count", + &ssd_record_writer, dump_buffer); + DumpSection(record_count_list, "record_count", + &ssd_record_writer, dump_buffer); + + ssd_record_writer.Finish(); +} +#define REGISTER_KERNELS(ktype) \ + template void SsdRecordDescriptor::DumpSsdMeta( \ + const std::string&, const std::string&); +REGISTER_KERNELS(int32); +REGISTER_KERNELS(int64); +#undef REGISTER_KERNELS +}//namespace embedding +}//namespace tensorflow diff --git a/tensorflow/core/framework/embedding/ssd_record_descriptor.h b/tensorflow/core/framework/embedding/ssd_record_descriptor.h index 9d015236934..aeb8d324759 100644 --- a/tensorflow/core/framework/embedding/ssd_record_descriptor.h +++ b/tensorflow/core/framework/embedding/ssd_record_descriptor.h @@ -20,14 +20,13 @@ limitations under the License. 
#include #include #include - +#include #include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/platform/env.h" namespace tensorflow { +class BundleWriter; namespace embedding { template @@ -59,48 +58,10 @@ class SsdRecordDescriptor { void DumpSection(const std::vector& data_vec, const std::string& section_str, BundleWriter* writer, - std::vector& dump_buffer) { - EVVectorDataDumpIterator iter(data_vec); - SaveTensorWithFixedBuffer( - section_str, - writer, dump_buffer.data(), - dump_buffer.size(), &iter, - TensorShape({data_vec.size()})); - } + std::vector& dump_buffer); void DumpSsdMeta(const std::string& prefix, - const std::string& var_name) { - std::fstream fs; - std::string var_name_temp(var_name); - std::string new_str = "_"; - int64 pos = var_name_temp.find("/"); - while (pos != std::string::npos) { - var_name_temp.replace(pos, 1, new_str.data(), 1); - pos = var_name_temp.find("/"); - } - - std::string ssd_record_path = - prefix + "-" + var_name_temp + "-ssd_record"; - BundleWriter ssd_record_writer(Env::Default(), - ssd_record_path); - size_t bytes_limit = 8 << 20; - std::vector dump_buffer(bytes_limit); - - DumpSection(key_list, "keys", - &ssd_record_writer, dump_buffer); - DumpSection(key_file_id_list, "keys_file_id", - &ssd_record_writer, dump_buffer); - DumpSection(key_offset_list, "keys_offset", - &ssd_record_writer, dump_buffer); - DumpSection(file_list, "files", - &ssd_record_writer, dump_buffer); - DumpSection(invalid_record_count_list, "invalid_record_count", - &ssd_record_writer, dump_buffer); - DumpSection(record_count_list, "record_count", - &ssd_record_writer, dump_buffer); - - ssd_record_writer.Finish(); - } + const std::string& var_name); void CopyEmbeddingFilesToCkptDir( const std::string& prefix, diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index d212e5b9c77..bb949183492 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -26,7 +26,6 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/storage_config.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" #include "tensorflow/core/util/work_sharder.h" #include "tensorflow/core/framework/device_base.h" #if GOOGLE_CUDA @@ -53,6 +52,9 @@ struct SsdRecordDescriptor; template class GPUHashTable; +class BundleWriter; +class BundleReader; + template struct EmbeddingVarContext; namespace { diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index fc1b2cd9c67..115e3c4bae6 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -2907,7 +2907,10 @@ tf_kernel_library( hdrs = ["kv_variable_ops.h"], srcs = ["kv_variable_ops.cc", "kv_variable_lookup_ops.cc", - "kv_variable_restore_ops.cc"], + "kv_variable_restore_ops.cc", + "//tensorflow/core:framework/embedding/embedding_var_ckpt_data.cc", + "//tensorflow/core:framework/embedding/embedding_var_restore.cc", + "//tensorflow/core:framework/embedding/ssd_record_descriptor.cc"], copts = tf_copts() + ["-g"], deps = [ ":bounds_check", From fe194b0718f9cc4f30a31e721780da2a956b6df8 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Tue, 19 Sep 2023 09:57:00 +0800 Subject: [PATCH 52/91] [Embedding] Fix incorrect frequency in shared-embedding. (#931) Signed-off-by: lixy9474 --- .../python/ops/embedding_variable_ops_test.py | 74 +++++++++++++++++++ tensorflow/python/ops/kv_variable_ops.py | 4 +- .../python/training/gradient_descent.py | 15 +++- tensorflow/python/training/optimizer.py | 30 +++++++- 4 files changed, 115 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index 25a0cb6ff11..c6cdf951a1e 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -2816,5 +2816,79 @@ def testSetInitializedWithRestore(self): result = sess.run(var._is_initialized_op) self.assertEqual(True, result) + def testCountsTensor(self): + os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + sp1 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + sp2 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([3,3,3,4,4,1], dtypes.int64), + dense_shape=[6, 1]) + emb1 = embedding_ops.embedding_lookup_sparse(var, sp1, None) + emb2 = embedding_ops.embedding_lookup_sparse(var, sp2, None) + emb = emb1 + emb2 + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + for name, shape in checkpoint_utils.list_variables(ckpt_path): + if name == "var_1-freqs": + value = checkpoint_utils.load_variable(ckpt_path, name) + self.assertAllEqual(value, [3, 3, 1, 3, 2]) + + def testCountsTensorWithGradientDescent(self): + 
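+    # Same frequency bookkeeping as testCountsTensor above, exercised through
+    # GradientDescentOptimizer instead of AdagradDecay: across
+    # sp1 = [0,0,0,1,1,2] and sp2 = [3,3,3,4,4,1], ids 0..4 occur
+    # 3, 3, 1, 3 and 2 times in total, so "var_1-freqs" in the checkpoint
+    # is expected to hold [3, 3, 1, 3, 2].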
os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + sp1 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + sp2 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([3,3,3,4,4,1], dtypes.int64), + dense_shape=[6, 1]) + emb1 = embedding_ops.embedding_lookup_sparse(var, sp1, None) + emb2 = embedding_ops.embedding_lookup_sparse(var, sp2, None) + emb = emb1 + emb2 + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = gradient_descent.GradientDescentOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + for name, shape in checkpoint_utils.list_variables(ckpt_path): + if name == "var_1-freqs": + value = checkpoint_utils.load_variable(ckpt_path, name) + self.assertAllEqual(value, [3, 3, 1, 3, 2]) + + del os.environ["TF_RECORD_FREQ"] + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 701c03f6975..96329ca345b 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -368,7 +368,7 @@ def _init_from_args(self, self._dtype = initial_value.dtype.base_dtype self._constraint = constraint self._gather_op = None - self._counts_tensor = None + self._counts_tensor = {} if self._is_primary: self._slot_num = 0 else: @@ -850,7 +850,7 @@ def sparse_read(self, indices, name=None, ev_init_value=None, counts=None): default_value, counts, is_inference=True, name=name) - self._counts_tensor = counts + self._counts_tensor[indices] = counts else: value = gen_kv_variable_ops.kv_resource_gather(self._handle, indices, diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py index 32a12a0554f..799e3c5f5bd 100644 --- a/tensorflow/python/training/gradient_descent.py +++ b/tensorflow/python/training/gradient_descent.py @@ -71,12 +71,23 @@ def _resource_apply_dense(self, grad, handle): def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): if isinstance(handle, kv_variable_ops.EmbeddingVariable): global_step = training_util.get_or_create_global_step() - if handle.need_counts() and handle._counts_tensor is not None: + if handle.need_counts() and len(handle._counts_tensor.keys()) != 0: + if indices.op.type == "ConcatV2": + total_counts = [] + for tensor in indices.op.inputs: + if tensor.op.type == "Reshape": + indices_tensor = tensor.op.inputs[0] + total_counts.append(handle._counts_tensor[indices_tensor]) + from tensorflow.python.ops import array_ops + counts_tensor = array_ops.concat(total_counts, 0) + elif indices.op.type == "Reshape": + indices_tensor = indices.op.inputs[0] + counts_tensor = handle._counts_tensor[indices_tensor] return training_ops.kv_resource_sparse_apply_gradient_descent_with_counts( handle.handle, math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype), grad, indices, 
global_step,
-          handle._counts_tensor, use_locking=self._use_locking)
+          counts_tensor, use_locking=self._use_locking)
     else:
       return training_ops.kv_resource_sparse_apply_gradient_descent(
           handle.handle, math_ops.cast(self._learning_rate_tensor,
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 578d682cc11..7523604ccf9 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -93,6 +93,18 @@ def _deduplicate_indexed_slices_with_counts(values, indices):
       array_ops.shape(unique_indices)[0])
   return (summed_values, unique_indices, indices_counts)
 
+def _deduplicate_indexed_slices_with_counts_reduction(values, indices, counts):
+  """Sums `values` associated with any non-unique `indices`,
+  and sums the corresponding entries of `counts` in the same way."""
+  unique_indices, new_index_positions = array_ops.unique(indices)
+  summed_values = math_ops.unsorted_segment_sum(
+      values, new_index_positions,
+      array_ops.shape(unique_indices)[0])
+  summed_counts = math_ops.unsorted_segment_sum(
+      counts, new_index_positions,
+      array_ops.shape(unique_indices)[0])
+  return (summed_values, unique_indices, summed_counts)
+
 def _var_key(var):
   # TODO(ashankar): Consolidate handling for eager and graph
   if hasattr(var, "op"):
@@ -1088,14 +1100,24 @@ def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices):
     """
     from tensorflow.python.ops import kv_variable_ops
     if isinstance(handle, kv_variable_ops.EmbeddingVariable) and handle.need_counts():
-      if handle._counts_tensor is None:
+      if len(handle._counts_tensor.keys()) == 0:
         summed_grad, unique_indices, indices_counts = \
           _deduplicate_indexed_slices_with_counts(
             values=grad, indices=indices)
       else:
-        summed_grad, unique_indices = _deduplicate_indexed_slices(
-          values=grad, indices=indices)
-        indices_counts = handle._counts_tensor
+        if indices.op.type == "ConcatV2":
+          total_counts = []
+          for tensor in indices.op.inputs:
+            if tensor.op.type == "Reshape":
+              indices_tensor = tensor.op.inputs[0]
+              total_counts.append(handle._counts_tensor[indices_tensor])
+          counts_tensor = array_ops.concat(total_counts, 0)
+        elif indices.op.type == "Reshape":
+          indices_tensor = indices.op.inputs[0]
+          counts_tensor = handle._counts_tensor[indices_tensor]
+        summed_grad, unique_indices, indices_counts = \
+          _deduplicate_indexed_slices_with_counts_reduction(
+            grad, indices, counts_tensor)
       return self._resource_apply_sparse(
           summed_grad, handle, unique_indices, indices_counts)
     else:
From 29ecde4f6418cd3beca400a31e87e1e53d9567dc Mon Sep 17 00:00:00 2001
From: lixy9474
Date: Wed, 20 Sep 2023 10:45:48 +0800
Subject: [PATCH 53/91] [Embedding] Fix missing return value of RestoreSSD of
 DramSSDHashStorage. 
(#926) Signed-off-by: lixy9474 --- tensorflow/core/framework/embedding/dram_ssd_storage.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/framework/embedding/dram_ssd_storage.h b/tensorflow/core/framework/embedding/dram_ssd_storage.h index 4243cc14eb3..356a61d865f 100644 --- a/tensorflow/core/framework/embedding/dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/dram_ssd_storage.h @@ -181,7 +181,9 @@ class DramSsdHashStorage : public MultiTierStorage { restore_buff.key_offset_list_buf, restore_buff.num_of_keys, file_id_map); + return Status::OK(); } + Status Eviction(K* evict_ids, int64 evict_size) override { ValuePtr* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { From 06f81cc7c26972d8d0851a652dc212976f54f592 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Tue, 17 Oct 2023 15:49:38 +0800 Subject: [PATCH 54/91] [Embedding] Refactor the data structure of EmbeddingVariable. (#924) Signed-off-by: lixy9474 --- .../framework/embedding/bloom_filter_policy.h | 77 ++- .../core/framework/embedding/config.proto | 6 +- .../counter_filter_descriptor_impl.h | 272 ++++++++ .../embedding/counter_filter_policy.h | 104 ++- .../framework/embedding/cpu_hash_map_kv.h | 91 ++- .../framework/embedding/dense_hash_map_kv.h | 15 +- .../embedding/dram_leveldb_storage.h | 75 +- .../framework/embedding/dram_pmem_storage.h | 88 +-- .../framework/embedding/dram_ssd_storage.h | 62 +- .../dynamic_dim_feature_descriptor_impl.h | 214 ++++++ .../framework/embedding/embedding_config.h | 17 +- .../embedding/embedding_memory_pool.h | 12 +- .../framework/embedding/embedding_var.cu.cc | 144 ---- .../core/framework/embedding/embedding_var.h | 345 +++------- .../embedding/embedding_var_ckpt_data.cc | 38 +- .../embedding/embedding_var_ckpt_data.h | 10 +- .../embedding/embedding_var_dump_iterator.h | 4 +- .../framework/embedding/feature_descriptor.h | 200 ++++++ .../embedding/feature_descriptor_impl.h | 317 +++++++++ .../core/framework/embedding/filter_factory.h | 12 +- .../core/framework/embedding/filter_policy.h | 48 +- .../embedding/globalstep_shrink_policy.h | 18 +- .../framework/embedding/gpu_hash_map_kv.h | 20 +- .../embedding/hbm_dram_ssd_storage.h | 458 ++++--------- .../framework/embedding/hbm_dram_storage.h | 411 ++++------- .../hbm_multi_tier_feature_descriptor.h | 122 ++++ .../embedding/hbm_storage_iterator.h | 7 +- .../core/framework/embedding/kv_interface.h | 29 +- .../embedding/l2weight_shrink_policy.h | 19 +- .../core/framework/embedding/layout_creator.h | 104 --- .../core/framework/embedding/leveldb_kv.h | 79 ++- .../embedding/lockless_hash_map_cpu.h | 243 ------- .../embedding/multi_tier_storage.cu.cc | 77 ++- .../framework/embedding/multi_tier_storage.h | 136 ++-- .../embedding/normal_feature_descriptor.h | 134 ++++ .../embedding/nullable_filter_policy.h | 99 ++- .../core/framework/embedding/shrink_policy.h | 21 +- .../framework/embedding/single_tier_storage.h | 237 +++---- .../core/framework/embedding/ssd_hash_kv.h | 112 +-- tensorflow/core/framework/embedding/storage.h | 170 +++-- .../core/framework/embedding/storage_config.h | 30 +- .../framework/embedding/storage_factory.h | 42 +- .../core/framework/embedding/value_ptr.h | 647 ------------------ tensorflow/core/kernels/BUILD | 5 +- .../kernels/embedding_variable_memory_test.cc | 20 +- .../kernels/embedding_variable_ops_test.cc | 632 ++++------------- .../embedding_variable_performance_test.cc | 25 +- .../core/kernels/embedding_variable_test.h | 43 +- .../group_embedding_lookup_ops_test.cc | 4 +- 
.../core/kernels/incr_save_restore_ops.h | 4 +- .../core/kernels/kv_variable_lookup_ops.cc | 4 +- tensorflow/core/kernels/kv_variable_ops.cc | 129 ++-- tensorflow/core/kernels/kv_variable_ops.h | 1 + .../core/kernels/kv_variable_restore_ops.cc | 72 +- tensorflow/core/kernels/save_restore_tensor.h | 1 - .../core/kernels/training_ali_op_helpers.h | 53 +- tensorflow/core/kernels/training_ali_ops.cc | 59 +- tensorflow/python/framework/ops.py | 2 + .../ops/embedding_variable_ops_gpu_test.py | 164 ++--- .../python/ops/embedding_variable_ops_test.py | 197 +++--- tensorflow/python/ops/kv_variable_ops.py | 14 + .../training/saving/saveable_object_util.py | 3 +- 62 files changed, 3060 insertions(+), 3738 deletions(-) create mode 100644 tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h create mode 100644 tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h create mode 100644 tensorflow/core/framework/embedding/feature_descriptor.h create mode 100644 tensorflow/core/framework/embedding/feature_descriptor_impl.h create mode 100644 tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h delete mode 100644 tensorflow/core/framework/embedding/layout_creator.h delete mode 100644 tensorflow/core/framework/embedding/lockless_hash_map_cpu.h create mode 100644 tensorflow/core/framework/embedding/normal_feature_descriptor.h delete mode 100644 tensorflow/core/framework/embedding/value_ptr.h diff --git a/tensorflow/core/framework/embedding/bloom_filter_policy.h b/tensorflow/core/framework/embedding/bloom_filter_policy.h index 29b85e5bb4e..781511578af 100644 --- a/tensorflow/core/framework/embedding/bloom_filter_policy.h +++ b/tensorflow/core/framework/embedding/bloom_filter_policy.h @@ -35,9 +35,10 @@ class BloomFilterPolicy : public FilterPolicy { using FilterPolicy::config_; public: - BloomFilterPolicy(const EmbeddingConfig& config, EV* ev) : - FilterPolicy(config, ev) { - + BloomFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), + FilterPolicy(config, ev) { switch (config_.counter_type){ case DT_UINT64: VLOG(2) << "The type of bloom counter is uint64"; @@ -64,10 +65,10 @@ class BloomFilterPolicy : public FilterPolicy { Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); if (s.ok()) { - V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); @@ -81,17 +82,17 @@ class BloomFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; if (value_ptr != nullptr) { embedding_ptr[i] = - ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else 
{ embedding_ptr[i] = default_value_no_permission; } @@ -109,13 +110,13 @@ class BloomFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> lookup_or_create_ids(num_worker_threads); std::vector> lookup_or_create_cursor(num_worker_threads); - std::vector*>> + std::vector> lookup_or_create_ptrs(num_worker_threads); IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); std::vector> @@ -147,7 +148,7 @@ class BloomFilterPolicy : public FilterPolicy { 1000, do_work); std::vector total_ids(num_of_keys); - std::vector*> total_ptrs(num_of_keys); + std::vector total_ptrs(num_of_keys); std::vector total_cursors(num_of_keys); int num_of_admit_id = 0; for (int i = 0; i < num_worker_threads; i++) { @@ -157,7 +158,7 @@ class BloomFilterPolicy : public FilterPolicy { sizeof(K) * lookup_or_create_ids[i].size()); memcpy(total_ptrs.data() + num_of_admit_id, lookup_or_create_ptrs[i].data(), - sizeof(ValuePtr*) * lookup_or_create_ptrs[i].size()); + sizeof(void*) * lookup_or_create_ptrs[i].size()); memcpy(total_cursors.data() + num_of_admit_id, lookup_or_create_cursor[i].data(), sizeof(int) * lookup_or_create_cursor[i].size()); @@ -174,11 +175,12 @@ class BloomFilterPolicy : public FilterPolicy { #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { if (GetBloomFreq(key) >= config_.filter_freq) { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - V* mem_val = ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { AddFreq(key, count); @@ -186,19 +188,27 @@ class BloomFilterPolicy : public FilterPolicy { } } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { - *val = nullptr; - if ((GetFreq(key, *val) + count) >= config_.filter_freq) { + *value_ptr = nullptr; + if ((GetFreq(key, *value_ptr) + count) >= config_.filter_freq) { + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + ev_->storage()->Insert(key, value_ptr); + s = Status::OK(); + } *is_filter = true; - return ev_->LookupOrCreateKey(key, val); + feat_desc_->AddFreq(*value_ptr, count); + } else { + *is_filter = false; + AddFreq(key, count); } - *is_filter = false; - AddFreq(key, count); return Status::OK(); } - int64 GetFreq(K key, ValuePtr*) override { + int64 GetFreq(K key, void* val) override { return GetBloomFreq(key); } @@ -210,7 +220,7 @@ class BloomFilterPolicy : public FilterPolicy { return bloom_counter_; } - bool is_admit(K key, ValuePtr* value_ptr) override { + bool is_admit(K key, void* value_ptr) override { if (value_ptr == nullptr) { return false; } else { @@ -326,8 +336,12 @@ class BloomFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; int64 new_freq = freq_buff[i]; + int64 import_version = -1; + if (config_.steps_to_live != 0 || 
config_.record_version) { + import_version = version_buff[i]; + } if (!is_filter) { if (freq_buff[i] >= config_.filter_freq) { SetBloomFreq(key_buff[i], freq_buff[i]); @@ -339,17 +353,9 @@ class BloomFilterPolicy : public FilterPolicy { SetBloomFreq(key_buff[i], freq_buff[i]); } if (new_freq >= config_.filter_freq){ - ev_->CreateKey(key_buff[i], &value_ptr, to_dram); - if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); - } - if (!is_filter){ - ev_->LookupOrCreateEmb(value_ptr, - value_buff + i * ev_->ValueLen()); - } else { - ev_->LookupOrCreateEmb(value_ptr, - ev_->GetDefaultValue(key_buff[i])); - } + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + new_freq, import_version, config_.emb_index); } } return Status::OK(); @@ -449,6 +455,7 @@ class BloomFilterPolicy : public FilterPolicy { } private: void* bloom_counter_; + embedding::FeatureDescriptor* feat_desc_; std::vector seeds_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/config.proto b/tensorflow/core/framework/embedding/config.proto index a8535347020..424fc5e1a38 100644 --- a/tensorflow/core/framework/embedding/config.proto +++ b/tensorflow/core/framework/embedding/config.proto @@ -50,11 +50,7 @@ enum EmbeddingVariableType { enum ValuePtrStatus { OK = 0; IS_DELETED = 1; -} - -enum ValuePosition { - IN_DRAM = 0; - NOT_IN_DRAM = 1; + NOT_IN_DRAM = 2; } enum IsSetInitialized { diff --git a/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h b/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h new file mode 100644 index 00000000000..e51166a2895 --- /dev/null +++ b/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h @@ -0,0 +1,272 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl: public FeatureDescriptorImpl { + public: + CounterFilterDescriptorImpl( + Allocator* alloc, + int64 slot_num, + bool need_record_freq, + bool need_record_version, + int64 filter_freq, + StorageType storage_type) + : filter_freq_(filter_freq), + is_record_freq_(need_record_freq), + FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version) { + if (filter_freq >= (1L << version_offset_bits_)) { + LOG(FATAL)<<"Filter frequency threshold shouldn't be bigger than 2^16."; + } + + if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { +#if GOOGLE_CUDA + feat_desc_impl_.reset( + new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); +#endif //GOOGLE_CUDA + } else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } + } + + CounterFilterDescriptorImpl(CounterFilterDescriptorImpl* feat_desc_impl) + : filter_freq_(feat_desc_impl->filter_freq_), + FeatureDescriptorImpl(feat_desc_impl) { +#if GOOGLE_CUDA + if (typeid(*(feat_desc_impl->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl*)){ + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); + } else { +#endif //GOOGLE_CUDA + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); +#if GOOGLE_CUDA + } +#endif //GOOGLE_CUDA + } + + ~CounterFilterDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return feat_desc_impl_->InitSlotInfo( + emb_index, embedding_dim, default_value); + } + + bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) override { + return feat_desc_impl_->InitSlotInfo(feat_desc_impl); + } + + V* GetEmbedding(void* val, int emb_index) override { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + bool IsAdmit(void* val) override { + return (GetFlag(val) == 0); + } + + void* Admit(void* val) override { + if (!IsAdmit(val)) { + return feat_desc_impl_->Allocate(); + } else { + LOG(FATAL)<<"Only an unadmitted feature can be admitted."; + return nullptr; + } + } + + void* Allocate() override { + uint64* val = (uint64*)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + uint64 flag = 1L << flag_offset_bits_; + uint64 version = (0xffffffffffffffff << version_offset_bits_); + uint64 freq = 0; + *val = version + freq; + val = (uint64*)((uint64)val | flag); + return (void*)val; + } + + void* Allocate(int64 freq) override { + if (freq < filter_freq_) { + return Allocate(); + } else { + return feat_desc_impl_->Allocate(); + } + } + + void Deallocate(void* val) override { + if (IsAdmit(val)) { + feat_desc_impl_->Deallocate(val); + } else { + void* tmp = GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + + void Deallocate(const std::vector& vals) override { + for (auto val: vals) { + if (IsAdmit(val)) { +
feat_desc_impl_->Deallocate(val); + } else { + void* tmp = GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + } + + void AddFreq(void* val, int64 count) override { + uint64* tmp = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + __sync_fetch_and_add(tmp, count); + } else { + feat_desc_impl_->AddFreq(val, count); + } + } + + void SetAllocator(Allocator* alloc) override { + feat_desc_impl_->SetAllocator(alloc); + } + + void SetValue(void* val, int64 emb_index, V* value) { + if (IsAdmit(val)) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + } + + void SetDefaultValue(void* val, int64 key) override { + feat_desc_impl_->SetDefaultValue(val, key); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + feat_desc_impl_->SetDefaultValues( + keys, init_cursor, + value_ptrs, compute_stream, + event_mgr, gpu_device); + } +#endif + + int64 GetFreq(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + return *((uint64*)tmp) & + ((1L << version_offset_bits_) - 1); + } else { + if (is_record_freq_) { + return feat_desc_impl_->GetFreq(val); + } else { + return filter_freq_; + } + } + } + + int64 GetVersion(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + int64 version = *(uint64*)tmp >> version_offset_bits_; + if (version == 0xffffffffffff) { + version = -1; + } + return version; + } else { + return feat_desc_impl_->GetVersion(val); + } + } + + void UpdateVersion(void* val, int64 version) override { + if (!IsAdmit(val)) { + void* tmp_ptr = GetPtr(val); + uint64 tmp_val = 0; + uint64 result = 0; + do { + tmp_val = *(uint64*)tmp_ptr; + version = version << version_offset_bits_; + uint64 freq = tmp_val & ((1L << version_offset_bits_) - 1); + result = version + freq; + } while(!__sync_bool_compare_and_swap((uint64*)tmp_ptr, tmp_val, result)); + } else { + feat_desc_impl_->UpdateVersion(val, version); + } + } + + void SetFreq(void* val, int64 freq) override { + uint64* tmp_ptr = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + uint64 tmp = *tmp_ptr; + tmp = ~((1L << version_offset_bits_) - 1) & tmp; + tmp += freq; + __sync_bool_compare_and_swap(tmp_ptr, *tmp_ptr, tmp); + } else { + feat_desc_impl_->SetFreq(val, freq); + } + } + + int data_bytes() override { + return alloc_bytes_; + } + private: + uint64 GetFlag(void* val) { + return (uint64)val >> flag_offset_bits_; + } + + void* GetPtr(void* val) { + return (void*)((uint64)val & ((1L << flag_offset_bits_) - 1)); + } + + int64 filter_freq_; + int alloc_bytes_ = 8; + Allocator* alloc_ = ev_allocator(); + const int freq_offset_bits_ = 0; + const int version_offset_bits_ = 16; + const int flag_offset_bits_ = 48; + std::unique_ptr> feat_desc_impl_; + bool is_record_freq_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/counter_filter_policy.h b/tensorflow/core/framework/embedding/counter_filter_policy.h index c9f19f34cd2..19cd90ad01c 100644 --- a/tensorflow/core/framework/embedding/counter_filter_policy.h +++ b/tensorflow/core/framework/embedding/counter_filter_policy.h @@ -25,18 +25,19 @@ template class CounterFilterPolicy : public FilterPolicy { using FilterPolicy::ev_; using FilterPolicy::config_; - using FilterPolicy::LookupOrCreateEmbInternal; public: - CounterFilterPolicy(const EmbeddingConfig& config, EV* ev) 
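The CounterFilterDescriptorImpl just defined is the heart of this patch's move from ValuePtr objects to opaque void* handles: a not-yet-admitted feature never gets a full embedding record, only an 8-byte counter whose handle is tagged in the pointer itself (flag at bit 48, matching flag_offset_bits_, with version above bit 16 and frequency in the low 16 bits of the payload). Below is a hedged sketch of that pack/unpack arithmetic using the same bit offsets; the helper names are mine, plain malloc replaces the Allocator, and it assumes user-space pointers fit in 48 bits as on typical x86-64.

```
// Tagged-handle sketch: bit 48 of the handle marks "not yet admitted";
// the 8-byte payload packs version (bits 16..63) and freq (bits 0..15).
// Assumes 48-bit user-space pointers (typical x86-64); names are invented.
#include <cstdint>
#include <cstdlib>
#include <iostream>

constexpr int kVersionOffsetBits = 16;  // freq lives below this bit
constexpr int kFlagOffsetBits = 48;     // admit flag lives in pointer bit 48

uint64_t* UntaggedPtr(void* handle) {
  return reinterpret_cast<uint64_t*>(
      reinterpret_cast<uint64_t>(handle) & ((1ULL << kFlagOffsetBits) - 1));
}

bool IsAdmitted(void* handle) {
  return (reinterpret_cast<uint64_t>(handle) >> kFlagOffsetBits) == 0;
}

void* AllocateUnadmitted() {
  uint64_t* payload = static_cast<uint64_t*>(std::malloc(sizeof(uint64_t)));
  // All-ones version field encodes "-1, never updated"; freq starts at 0.
  *payload = ~0ULL << kVersionOffsetBits;
  return reinterpret_cast<void*>(reinterpret_cast<uint64_t>(payload) |
                                 (1ULL << kFlagOffsetBits));
}

int64_t GetFreq(void* handle) {
  return *UntaggedPtr(handle) & ((1ULL << kVersionOffsetBits) - 1);
}

void AddFreq(void* handle, uint64_t count) {
  // The real code uses __sync_fetch_and_add; a plain add keeps this short.
  *UntaggedPtr(handle) += count;
}

int main() {
  void* h = AllocateUnadmitted();
  AddFreq(h, 3);
  std::cout << IsAdmitted(h) << " " << GetFreq(h) << "\n";  // prints 0 3
  std::free(UntaggedPtr(h));
}
```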
: - FilterPolicy(config, ev) {} + CounterFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), + FilterPolicy(config, ev) {} Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); - if (s.ok() && GetFreq(key, value_ptr) >= config_.filter_freq) { - V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + if (s.ok() && feat_desc_->IsAdmit(value_ptr)) { + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); @@ -50,18 +51,18 @@ class CounterFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, keys, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; int64 freq = GetFreq(keys[i], value_ptr); - if (value_ptr != nullptr && freq >= config_.filter_freq) { + if (value_ptr != nullptr && feat_desc_->IsAdmit(value_ptr)) { embedding_ptr[i] = - ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission; } @@ -79,7 +80,7 @@ class CounterFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> @@ -90,36 +91,61 @@ class CounterFilterPolicy : public FilterPolicy { #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - if (GetFreq(key, *value_ptr) >= config_.filter_freq) { - V* mem_val = ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + if (is_filter) { + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); } } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { - Status s = ev_->LookupOrCreateKey(key, val); - *is_filter = (GetFreq(key, *val) + count) >= config_.filter_freq; + *is_filter = false; + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + if (count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + feat_desc_->Deallocate(*value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + ev_->storage()->Insert(key, value_ptr); + s = 
Status::OK(); + } else if (!feat_desc_->IsAdmit(*value_ptr)) { + int64 freq = feat_desc_->GetFreq(*value_ptr); + if (freq + count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetFreq(admit_value_ptr, freq); + feat_desc_->UpdateVersion( + admit_value_ptr, feat_desc_->GetVersion(*value_ptr)); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + ev_->storage()->UpdateValuePtr(key, admit_value_ptr, *value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + } else { + *is_filter = true; + } + feat_desc_->AddFreq(*value_ptr, count); return s; } - int64 GetFreq(K key, ValuePtr* value_ptr) override { - return value_ptr->GetFreq(); + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); } int64 GetFreq(K key) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); - return value_ptr->GetFreq(); - } - - bool is_admit(K key, ValuePtr* value_ptr) override { - return (GetFreq(key, value_ptr) >= config_.filter_freq); + return feat_desc_->GetFreq(value_ptr); } Status Restore(int64 key_num, int bucket_num, int64 partition_id, @@ -136,27 +162,33 @@ class CounterFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; - ev_->CreateKey(key_buff[i], &value_ptr, to_dram); + int64 import_freq = 0; + int64 import_version = -1; if (!is_filter) { if (freq_buff[i] >= config_.filter_freq) { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } else { - value_ptr->SetFreq(config_.filter_freq); + import_freq = config_.filter_freq; } } else { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); - } - if (value_ptr->GetFreq() >= config_.filter_freq) { - LookupOrCreateEmbInternal(is_filter, to_dram, i, value_len, - value_ptr, value_buff, key_buff); + import_version = version_buff[i]; } + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); } return Status::OK(); } + + bool is_admit(K key, void* value_ptr) override { + return feat_desc_->IsAdmit(value_ptr); + } + + private: + embedding::FeatureDescriptor* feat_desc_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h index 600f6c20e44..8476c399c40 100644 --- a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h @@ -21,25 +21,25 @@ limitations under the License. 
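The rewritten CounterFilterPolicy::LookupOrCreateKey above turns admission into a state transition: once an unadmitted handle's accumulated count crosses filter_freq, a full record is allocated via Admit, the saved freq and version are carried over, and the hash-map slot is swapped through the storage's UpdateValuePtr (which resolves races by compare-and-swap and deallocates the loser). The sketch below compresses that control flow into a single-threaded stand-in; the types are simplified placeholders, not the real FeatureDescriptor or storage interfaces.

```
// Control-flow sketch of threshold-based admission (simplified stubs).
#include <cstdint>
#include <iostream>
#include <unordered_map>

struct Record { int64_t freq = 0; bool admitted = false; };

class AdmissionMap {
 public:
  // Returns true ("is_filter") once the key owns a fully admitted record.
  bool LookupOrCreate(int64_t key, int64_t count, int64_t filter_freq) {
    Record& r = map_[key];            // creates an unadmitted stub on miss
    r.freq += count;
    if (!r.admitted && r.freq >= filter_freq) {
      // Real code: Admit() allocates the full record, copies freq/version,
      // then storage->UpdateValuePtr CAS-swaps it into the lockless map.
      r.admitted = true;
    }
    return r.admitted;
  }
 private:
  std::unordered_map<int64_t, Record> map_;
};

int main() {
  AdmissionMap m;
  std::cout << m.LookupOrCreate(7, 1, 3);   // 0: freq 1 < 3
  std::cout << m.LookupOrCreate(7, 2, 3);   // 1: freq 3 reaches threshold
  std::cout << "\n";
}
```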
#include "tensorflow/core/lib/core/status.h" namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class LocklessHashMap : public KVInterface { public: - LocklessHashMap() { + LocklessHashMap(FeatureDescriptor* feat_desc): feat_desc_(feat_desc) { hash_map_.max_load_factor(0.8); hash_map_.set_empty_key_and_value( LocklessHashMap::EMPTY_KEY_, nullptr); hash_map_.set_counternum(16); hash_map_.set_deleted_key(LocklessHashMap::DELETED_KEY_); + pthread_key_create(&key_, NULL); } - ~LocklessHashMap() override {} + ~LocklessHashMap() override { + pthread_key_delete(key_); + } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { auto iter = hash_map_.find_wait_free(key); if (iter.first == LocklessHashMap::EMPTY_KEY_) { return errors::NotFound( @@ -60,10 +60,10 @@ class LocklessHashMap : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { auto iter = hash_map_.insert_lockless( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); + std::move(std::pair(key, + const_cast(value_ptr)))); // insert fail, exist key if ((*(iter.first)).second != value_ptr){ return errors::AlreadyExists( @@ -88,14 +88,40 @@ class LocklessHashMap : public KVInterface { } } + Status Commit(K key, const void* value_ptr) override { + auto iter = hash_map_.insert_lockless(std::move( + std::pair(key, + const_cast(value_ptr)))); + if ((*(iter.first)).second != value_ptr) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap( + &((*(iter.first)).second), + (*(iter.first)).second, + value_ptr); + } + return Status::OK(); + } + Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { + for(int i = 0; i < keys.size(); ++i) { + auto iter = hash_map_.insert_lockless(std::move( + std::pair(keys[i], + const_cast(value_ptrs[i])))); + if ((*(iter.first)).second != value_ptrs[i]) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap( + &((*(iter.first)).second), + (*(iter.first)).second, + value_ptrs[i]); + } + } return Status::OK(); } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { - std::pair*> *hash_map_dump; + std::vector* value_ptr_list) override { + std::pair *hash_map_dump; int64 bucket_count; auto it = hash_map_.GetSnapshot(); hash_map_dump = it.first; @@ -120,11 +146,50 @@ class LocklessHashMap : public KVInterface { return ""; } + void UpdateValuePtr( + K key, void* new_value_ptr, + void* old_value_ptr) override { + auto iter = hash_map_.insert_lockless( + std::move(std::pair(key, old_value_ptr))); + bool flag = __sync_bool_compare_and_swap( + &((*(iter.first)).second), old_value_ptr, new_value_ptr); + if (flag) { + AppendToValuePtrQueue(old_value_ptr); + } else { + feat_desc_->Deallocate(new_value_ptr); + } + } + + private: + void AppendToValuePtrQueue(void* old_value_ptr) { + //A parameter that can be adjusted in the future + std::deque* value_ptr_queue = GetOutOfDateValuePtrQueue(); + if (value_ptr_queue->size() > CAP_INVALID_VALUEPTR) { + void* value_ptr = value_ptr_queue->front(); + feat_desc_->Deallocate(value_ptr); + value_ptr_queue->pop_front(); + } + value_ptr_queue->emplace_back(old_value_ptr); + } + + std::deque* GetOutOfDateValuePtrQueue() { + std::deque* value_ptr_queue = + static_cast*>(pthread_getspecific(key_)); + if (value_ptr_queue == nullptr) { + value_ptr_queue = new 
std::deque(); + pthread_setspecific(key_, value_ptr_queue); + } + return value_ptr_queue; + } + private: - typedef google::dense_hash_map_lockless*> LockLessHashMap; + typedef google::dense_hash_map_lockless LockLessHashMap; static const int EMPTY_KEY_; static const int DELETED_KEY_; LockLessHashMap hash_map_; + const int CAP_INVALID_VALUEPTR = 20000; + FeatureDescriptor* feat_desc_; + pthread_key_t key_; }; template const int LocklessHashMap::EMPTY_KEY_ = -1; diff --git a/tensorflow/core/framework/embedding/dense_hash_map_kv.h b/tensorflow/core/framework/embedding/dense_hash_map_kv.h index 92baf037721..ffaf2e335dc 100644 --- a/tensorflow/core/framework/embedding/dense_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/dense_hash_map_kv.h @@ -23,9 +23,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/kv_interface.h" namespace tensorflow { -template -class ValuePtr; - namespace embedding { template @@ -45,7 +42,7 @@ class DenseHashMap : public KVInterface { delete []hash_map_; } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { int64 l_id = std::abs(key)%partition_num_; spin_rd_lock l(hash_map_[l_id].mu); auto iter = hash_map_[l_id].hash_map.find(key); @@ -70,7 +67,7 @@ class DenseHashMap : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { int64 l_id = std::abs(key)%partition_num_; spin_wr_lock l(hash_map_[l_id].mu); auto iter = hash_map_[l_id].hash_map.find(key); @@ -80,8 +77,8 @@ class DenseHashMap : public KVInterface { "already exists Key: ", key, " in DenseHashMap."); } else { auto iter = hash_map_[l_id].hash_map.insert( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); + std::move(std::pair(key, + const_cast(value_ptr)))); return Status::OK(); } } @@ -109,7 +106,7 @@ class DenseHashMap : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector* >* value_ptr_list) override { + std::vector* value_ptr_list) override { dense_hash_map hash_map_dump[partition_num_]; for (int i = 0; i< partition_num_; i++) { spin_rd_lock l(hash_map_[i].mu); @@ -132,7 +129,7 @@ class DenseHashMap : public KVInterface { const int partition_num_ = 1000; struct dense_hash_map { mutable easy_spinrwlock_t mu = EASY_SPINRWLOCK_INITIALIZER; - google::dense_hash_map* > hash_map; + google::dense_hash_map hash_map; }; dense_hash_map* hash_map_; }; diff --git a/tensorflow/core/framework/embedding/dram_leveldb_storage.h b/tensorflow/core/framework/embedding/dram_leveldb_storage.h index fdb6697d541..2f9fbade6c5 100644 --- a/tensorflow/core/framework/embedding/dram_leveldb_storage.h +++ b/tensorflow/core/framework/embedding/dram_leveldb_storage.h @@ -21,9 +21,6 @@ limitations under the License. 
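The LocklessHashMap changes above never free a displaced value pointer immediately: Commit, BatchCommit, and UpdateValuePtr push the old pointer onto a per-thread deque kept in pthread thread-local storage, and only reclaim the oldest entry once the queue exceeds CAP_INVALID_VALUEPTR, so lock-free readers that raced on the old pointer get a grace period. A minimal sketch of that bounded, thread-local grace queue; the capacity and the deleter are placeholders for the descriptor's Deallocate.

```
// Thread-local deferred-reclamation sketch (placeholder capacity/deleter).
// Retired pointers rest in a per-thread deque before being freed, giving
// concurrent lock-free readers time to finish with the old value.
#include <cstdlib>
#include <deque>
#include <pthread.h>

class DeferredFreeList {
 public:
  DeferredFreeList() { pthread_key_create(&key_, nullptr); }
  ~DeferredFreeList() { pthread_key_delete(key_); }

  void Retire(void* old_ptr) {
    std::deque<void*>* q = LocalQueue();
    if (q->size() > kCap) {        // reclaim only the oldest entry
      std::free(q->front());
      q->pop_front();
    }
    q->emplace_back(old_ptr);
  }

 private:
  std::deque<void*>* LocalQueue() {
    auto* q = static_cast<std::deque<void*>*>(pthread_getspecific(key_));
    if (q == nullptr) {            // first Retire() on this thread
      q = new std::deque<void*>();
      pthread_setspecific(key_, q);
    }
    return q;
  }
  static constexpr size_t kCap = 20000;  // mirrors CAP_INVALID_VALUEPTR
  pthread_key_t key_;
};

int main() {
  DeferredFreeList dfl;
  dfl.Retire(std::malloc(8));  // freed later, once this thread's queue fills
}
```

Keeping the queue per-thread avoids any synchronization on the common retire path; the trade-off is that a thread that stops retiring keeps its queued pointers alive.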
#include "tensorflow/core/framework/embedding/single_tier_storage.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,11 +28,12 @@ namespace embedding { template class DramLevelDBStore : public MultiTierStorage { public: - DramLevelDBStore(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_ = new DramStorage(sc, alloc, lc, new LocklessHashMap()); - leveldb_ = new LevelDBStore(sc, alloc, lc); + DramLevelDBStore(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + leveldb_ = new LevelDBStore(sc, feat_desc); } ~DramLevelDBStore() override { @@ -46,7 +44,7 @@ class DramLevelDBStore : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(DramLevelDBStore); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -63,23 +61,22 @@ class DramLevelDBStore : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramLevelDBStore."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramLevelDBStore can not be called."; + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -93,7 +90,7 @@ class DramLevelDBStore : public MultiTierStorage { leveldb_->DestroyValuePtr(*value_ptr); return dram_->Get(key, value_ptr); } - dram_->Insert(key, value_ptr, size); + dram_->CreateAndInsert(key, value_ptr); return Status::OK(); } @@ -146,15 +143,15 @@ class DramLevelDBStore : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_leveldb_key_list; - std::vector*> value_ptr_list, tmp_leveldb_value_list; + std::vector value_ptr_list, tmp_leveldb_value_list; TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); TF_CHECK_OK(leveldb_->GetSnapshot( &tmp_leveldb_key_list, &tmp_leveldb_value_list)); for (int64 i = 0; i < tmp_leveldb_value_list.size(); i++) { - tmp_leveldb_value_list[i]->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - tmp_leveldb_value_list[i]->SetInitialized(emb_config.primary_emb_index); + tmp_leveldb_value_list[i] = + (void*)((int64)tmp_leveldb_value_list[i] | (1L << kDramFlagOffset)); } std::vector leveldb_key_list; @@ -173,26 +170,34 @@ class DramLevelDBStore : public MultiTierStorage { { mutex_lock l(*(leveldb_->get_mutex())); + std::vector*> feat_desc_list(2); + FeatureDescriptor hbm_feat_desc( + 1, 1, ev_allocator()/*useless*/, + StorageType::HBM_DRAM, + true, true, + {false, 0}); + feat_desc_list[0] = dram_feat_desc_; 
+ feat_desc_list[1] = &hbm_feat_desc; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, value_iter))); } for (auto it: tmp_leveldb_value_list) { - delete it; + cpu_allocator()->DeallocateRaw((void*)((int64)it & 0xffffffffffff)); } - delete value_iter; return Status::OK(); } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); @@ -206,8 +211,8 @@ class DramLevelDBStore : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(leveldb_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); @@ -218,14 +223,20 @@ class DramLevelDBStore : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + protected: - void SetTotalDims(int64 total_dims) override { - leveldb_->SetTotalDims(total_dims); + int total_dim() override { + return dram_feat_desc_->total_dim(); } private: DramStorage* dram_; LevelDBStore* leveldb_; + FeatureDescriptor* dram_feat_desc_ = nullptr; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/dram_pmem_storage.h b/tensorflow/core/framework/embedding/dram_pmem_storage.h index fd19f75ab4c..e58d9450d96 100644 --- a/tensorflow/core/framework/embedding/dram_pmem_storage.h +++ b/tensorflow/core/framework/embedding/dram_pmem_storage.h @@ -15,14 +15,12 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ +#include "tensorflow/core/framework/embedding/cpu_hash_map_kv.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" -#include "tensorflow/core/framework/embedding/cpu_hash_map_kv.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,36 +29,36 @@ namespace embedding { template class DramPmemStorage : public MultiTierStorage { public: - DramPmemStorage(const StorageConfig& sc, Allocator* dram_alloc, - Allocator* pmem_alloc, LayoutCreator* lc, + DramPmemStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_ = new DramStorage(sc, dram_alloc, lc, new LocklessHashMap()); - pmem_ = new PmemLibpmemStorage(sc, pmem_alloc, lc); - value_ptr_size_ = - const_cast(sc.embedding_config).total_num( - Storage::GetAllocLen()); + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + pmem_feat_desc_ = new FeatureDescriptor(feat_desc); + pmem_feat_desc_->SetAllocator(experimental_pmem_allocator(sc.path, sc.size[0])); + + pmem_ = new PmemLibpmemStorage(sc, pmem_feat_desc_); } ~DramPmemStorage() override { MultiTierStorage::DeleteFromEvictionManager(); delete dram_; delete pmem_; + delete pmem_feat_desc_; } TF_DISALLOW_COPY_AND_ASSIGN(DramPmemStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; } s = pmem_->Get(key, value_ptr); + void* new_value_ptr = dram_->CreateValuePtr(); if (s.ok()) { - ValuePtr* new_value_ptr = dram_->CreateValuePtr(value_ptr_size_); - memcpy(new_value_ptr->GetPtr(), (*value_ptr)->GetPtr(), - sizeof(FixedLengthHeader) + sizeof(V) * value_ptr_size_); - *value_ptr = new_value_ptr; + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); s = dram_->TryInsert(key, *value_ptr); if (s.ok()) { return s; @@ -71,19 +69,19 @@ class DramPmemStorage : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramPmemStorage."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramPmemStorage can not be called."; + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } bool IsUseHbm() override { @@ -94,18 +92,16 @@ class DramPmemStorage : public MultiTierStorage { return false; } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; } s = pmem_->Get(key, value_ptr); - ValuePtr* new_value_ptr = 
dram_->CreateValuePtr(size); + void* new_value_ptr = dram_->CreateValuePtr(); if (s.ok()) { - memcpy(new_value_ptr->GetPtr(), (*value_ptr)->GetPtr(), - sizeof(FixedLengthHeader) + sizeof(V) * size); + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); } *value_ptr = new_value_ptr; @@ -159,7 +155,7 @@ class DramPmemStorage : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_pmem_key_list; - std::vector*> value_ptr_list, tmp_pmem_value_list; + std::vector value_ptr_list, tmp_pmem_value_list; TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); dram_->Shrink(key_list, value_ptr_list, shrink_args, value_len); @@ -182,13 +178,14 @@ class DramPmemStorage : public MultiTierStorage { emb_config, value_len, default_value, key_list, - value_ptr_list))); + value_ptr_list, + pmem_feat_desc_))); return Status::OK(); } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); @@ -202,8 +199,8 @@ class DramPmemStorage : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(pmem_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); @@ -214,13 +211,26 @@ class DramPmemStorage : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + pmem_feat_desc_->InitSlotInfo(dram_feat_desc_); + MultiTierStorage::Init(); + } + protected: - void SetTotalDims(int64 total_dims) override {} + int total_dim() override { + return pmem_feat_desc_->total_dim(); + } private: DramStorage* dram_; PmemLibpmemStorage* pmem_; - int64 value_ptr_size_; + FeatureDescriptor* dram_feat_desc_ = nullptr; + FeatureDescriptor* pmem_feat_desc_ = nullptr; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/dram_ssd_storage.h b/tensorflow/core/framework/embedding/dram_ssd_storage.h index 356a61d865f..ddd2d782e03 100644 --- a/tensorflow/core/framework/embedding/dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/dram_ssd_storage.h @@ -21,9 +21,6 @@ limitations under the License. 
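Eviction in these DRAM-plus-X storages (the LevelDB and PMEM variants above, and the SSD variant below) is the mirror image of the read path: each evicted id is looked up in DRAM, committed into the colder tier, then removed from DRAM; the delayed-destroy variant additionally holds both tier mutexes and drains the invalid-pointer queue first. A short sketch of the basic loop, with stub tiers in place of the concrete storages and their Status-returning Commit/Remove.

```
// Eviction-loop sketch (stub tiers; real code checks Status and calls
// Commit/Remove on the concrete single-tier storages).
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

using Tier = std::unordered_map<int64_t, std::string>;  // hypothetical stand-in

void Evict(Tier& dram, Tier& lower, const int64_t* ids, int64_t n) {
  for (int64_t i = 0; i < n; ++i) {
    auto it = dram.find(ids[i]);
    if (it == dram.end()) continue;   // already evicted; nothing to commit
    lower[ids[i]] = it->second;       // Commit into the colder tier
    dram.erase(it);                   // then drop the DRAM copy
  }
}

int main() {
  Tier dram{{1, "v1"}, {2, "v2"}}, cold;
  int64_t ids[] = {1};
  Evict(dram, cold, ids, 1);
  std::cout << dram.size() << " " << cold.size() << "\n";  // 1 1
}
```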
#include "tensorflow/core/framework/embedding/single_tier_storage.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,11 +28,12 @@ namespace embedding { template class DramSsdHashStorage : public MultiTierStorage { public: - DramSsdHashStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_= new DramStorage(sc, alloc, lc, new LocklessHashMap()); - ssd_hash_ = new SsdHashStorage(sc, alloc, lc); + DramSsdHashStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_= new DramStorage(sc, feat_desc); + ssd_hash_ = new SsdHashStorage(sc, feat_desc); } ~DramSsdHashStorage() override { @@ -46,7 +44,7 @@ class DramSsdHashStorage : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(DramSsdHashStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -64,24 +62,22 @@ class DramSsdHashStorage : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramSsdHashStorage."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramSsdStorage can not be called."; + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -96,7 +92,7 @@ class DramSsdHashStorage : public MultiTierStorage { ssd_hash_->DestroyValuePtr(*value_ptr); return dram_->Get(key, value_ptr); } - dram_->Insert(key, value_ptr, size); + dram_->CreateAndInsert(key, value_ptr); return Status::OK(); } @@ -164,7 +160,6 @@ class DramSsdHashStorage : public MultiTierStorage { Status RestoreSSD(int64 emb_index, int64 emb_slot_num, int64 value_len, const std::string& ssd_emb_file_name, EmbeddingVar* ev, RestoreSSDBuffer& restore_buff) override { - int64 alloc_len = Storage::ComputeAllocLen(value_len); std::map file_id_map; for (int64 i = 0; i < restore_buff.num_of_files; i++) { file_id_map[restore_buff.file_list_buf[i]] = i; @@ -185,7 +180,7 @@ class DramSsdHashStorage : public MultiTierStorage { } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); @@ -199,8 +194,8 @@ class DramSsdHashStorage : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(ssd_hash_->get_mutex())); - 
MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); @@ -211,14 +206,25 @@ class DramSsdHashStorage : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + ssd_hash_->Init(); + MultiTierStorage::Init(); + } + protected: - void SetTotalDims(int64 total_dims) override { - ssd_hash_->SetTotalDims(total_dims); + int total_dim() override { + return dram_feat_desc_->total_dim(); } private: DramStorage* dram_ = nullptr; SsdHashStorage* ssd_hash_ = nullptr; + FeatureDescriptor* dram_feat_desc_; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h b/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h new file mode 100644 index 00000000000..c1fa878788b --- /dev/null +++ b/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h @@ -0,0 +1,214 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#include +#include +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +constexpr int COLUMN_BITSET_BYTES = 5; +constexpr int COLUMN_BITSET_SIZE = COLUMN_BITSET_BYTES * 8; + +struct MetaHeader { + volatile unsigned char embed_num; + unsigned char value_type; + unsigned char header_size; + unsigned char column_bitset[COLUMN_BITSET_BYTES]; + + static const int kEmbeddingNumStartIndex = 0; + static const int kValueTypeStartIndex = + kEmbeddingNumStartIndex + sizeof(char); + static const int kHeaderSizeStartIndex = + kValueTypeStartIndex + sizeof(char); + static const int kColumnBitsetIndex = + kHeaderSizeStartIndex + sizeof(char); + + inline unsigned int GetEmbeddingNum() { + return (unsigned int) embed_num; + } + + inline void SetEmbeddingNum(size_t s) { + embed_num = (unsigned char)s; + } + + inline std::bitset GetColumnBitset() { + unsigned long meta = ((unsigned long*)this)[0]; + std::bitset bs(meta >> (8 * kColumnBitsetIndex)); + return bs; + } + + inline void SetColumnBitset(const std::bitset& bs, + unsigned int embnum) { + ((unsigned long*)(this))[0] = + (bs.to_ulong() << (8 * kColumnBitsetIndex)) | + (header_size << (8 * kHeaderSizeStartIndex)) | + (value_type << (8 * kValueTypeStartIndex)) | + (embnum << (8 * kEmbeddingNumStartIndex)); + } + + inline unsigned int GetHeaderSize() { + return (unsigned int) header_size; + } + + inline void SetHeaderSize(size_t size) { + header_size = (unsigned char)size; + } +}; + +template +class DynmaicDimDescriptorImpl: public FeatureDescriptorImpl { +using FeatureDescriptorImpl::slot_infos_; + public: + DynmaicDimDescriptorImpl( + Allocator* alloc, + int64 slot_num) + : alloc_bytes_(sizeof(std::atomic_flag) + + sizeof(MetaHeader) + + sizeof(V*) * slot_num), + header_offset_bytes_(sizeof(V*) * slot_num), + flag_offset_bytes_(sizeof(MetaHeader) + + sizeof(V*) * slot_num), + FeatureDescriptorImpl(slot_num, + false, + false) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&alloc_bytes_); + } + ~DynmaicDimDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + } + + V* GetEmbedding(void* val, int emb_index) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->embed_num; + auto metadata = meta->GetColumnBitset(); + + if (!metadata.test(emb_index)) { + std::atomic_flag* flag= (std::atomic_flag*)(val + flag_offset_bytes_); + while(flag->test_and_set(std::memory_order_acquire)); + metadata = meta->GetColumnBitset(); + if (metadata.test(emb_index)) { + flag->clear(std::memory_order_release); + return ((V**)val)[emb_index]; + } + embnum++ ; + int64 alloc_value_len = slot_infos_[emb_index].embedding_dim; + V* tensor_val = (V*)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * alloc_value_len); + V* default_v = (V*)slot_infos_[emb_index].default_value; + memcpy(tensor_val, default_v, + sizeof(V) * slot_infos_[emb_index].default_value_len); + ((V**)val)[emb_index] = tensor_val; + + metadata.set(emb_index); + // NOTE:if we use ((unsigned long*)((char*)ptr_ + 1))[0] = metadata.to_ulong(); + // the ptr_ will be occaionally 
modified from 0x7f18700912a0 to 0x700912a0 + // must use ((V**)ptr_ + 1 + 1)[emb_index] = tensor_val; to avoid + //LOG(INFO)<<"emb_num: "<SetColumnBitset(metadata, embnum); + flag->clear(std::memory_order_release); + return tensor_val; + } else { + return ((V**)val)[emb_index]; + } + } + + bool IsAdmit(void* val) override { + return true; + } + + void* Admit(void* val) override {} + + void* Allocate() override { + void* val = alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + memset(val, 0, alloc_bytes_); + new ((char*)val + header_offset_bytes_) MetaHeader(); + return val; + } + + void Deallocate(void* val) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->GetEmbeddingNum(); + //LOG(INFO)<<"emb_num in deallocate: "<GetColumnBitset(); + for (int i = 0; i< embnum; i++) { + if (metadata.test(i)) { + V* val_ptr = ((V**)((int64*)val + meta->GetHeaderSize()))[i]; + if (val_ptr != nullptr) { + alloc_->DeallocateRaw(val_ptr); + } + } + } + } + + void Deallocate(const std::vector& vals) override { + for (auto val: vals) { + Deallocate(val); + } + } + + void AddFreq(void* val, int64 count) override {} + + void SetAllocator(Allocator* alloc) override { + alloc_ = alloc; + } + + void SetDefaultValue(void* val, int64 key) override {} + + void SetValue(void* val, int64 emb_index, V* value) override { + V* val_ptr = GetEmbedding(val, emb_index); + memcpy(val_ptr, value, + sizeof(V) * FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + + int64 GetFreq(void* val) override {} + + int64 GetVersion(void* val) override {} + + void UpdateVersion(void* val, int64 version) override {} + + void SetFreq(void* val, int64 freq) override {} + + int data_bytes() override { + return alloc_bytes_; + } + private: + int alloc_bytes_ = 0; + int header_offset_bytes_ = 0; + int flag_offset_bytes_ = 0; + Allocator* alloc_ = ev_allocator(); +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/embedding_config.h b/tensorflow/core/framework/embedding/embedding_config.h index d47d07d4205..a39d2dca303 100644 --- a/tensorflow/core/framework/embedding/embedding_config.h +++ b/tensorflow/core/framework/embedding/embedding_config.h @@ -23,7 +23,6 @@ struct EmbeddingConfig { DataType counter_type; int64 default_value_dim; float default_value_no_permission; - int normal_fix_flag; bool record_freq; bool record_version; bool is_inference; @@ -37,7 +36,6 @@ struct EmbeddingConfig { int64 filter_freq = 0, int64 max_freq = 999999, float l2_weight_threshold = -1.0, - const std::string& layout = "normal", int64 max_element_size = 0, float false_positive_probability = -1.0, DataType counter_type = DT_UINT64, @@ -58,7 +56,6 @@ struct EmbeddingConfig { counter_type(counter_type), default_value_dim(default_value_dim), default_value_no_permission(default_value_no_permission), - normal_fix_flag(0), record_freq(record_freq), record_version(record_version), is_inference(is_inference) { @@ -70,10 +67,6 @@ struct EmbeddingConfig { kHashFunc = 0; num_counter = 0; } - if (layout == "normal_contiguous" || - layout == "normal_contiguous_gpu") { - normal_fix_flag = 1; - } } int64 calc_num_counter(int64 
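The dynamic-dim descriptor above packs all per-feature metadata into a single machine word: byte 0 holds embed_num, byte 1 the value type, byte 2 the header size, and the top five bytes a 40-bit column bitset recording which slots have been lazily materialized, with an adjacent std::atomic_flag serializing first-touch allocation. The sketch below isolates just the bitset bookkeeping; the field widths follow COLUMN_BITSET_BYTES = 5 and kColumnBitsetIndex = 3 from the code, everything else is simplified.

```
// Packed-metadata-word sketch: byte 0 = embed_num, byte 1 = value type,
// byte 2 = header size, bytes 3..7 = 40-bit "slot materialized" bitset.
#include <bitset>
#include <cstdint>
#include <iostream>

constexpr int kBitsetBytes = 5;                  // COLUMN_BITSET_BYTES
constexpr int kBitsetBits = kBitsetBytes * 8;    // 40 usable slot bits

uint64_t SetSlotBit(uint64_t meta, int emb_index) {
  std::bitset<kBitsetBits> bs(meta >> 24);       // bitset starts at byte 3
  bs.set(emb_index);
  return (bs.to_ullong() << 24) | (meta & 0xFFFFFF);  // keep low 3 bytes
}

bool TestSlotBit(uint64_t meta, int emb_index) {
  return std::bitset<kBitsetBits>(meta >> 24).test(emb_index);
}

int main() {
  uint64_t meta = 0;                             // nothing materialized yet
  meta = SetSlotBit(meta, /*emb_index=*/2);
  std::cout << TestSlotBit(meta, 2) << TestSlotBit(meta, 3) << "\n";  // 10
}
```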
max_element_size, @@ -105,21 +98,13 @@ struct EmbeddingConfig { } bool is_save_freq() const { - return filter_freq != 0 || - record_freq || - normal_fix_flag == 1; + return filter_freq != 0 || record_freq; } bool is_save_version() const { return steps_to_live != 0 || record_version; } - int64 total_num(int alloc_len) { - return block_num * - (1 + (1 - normal_fix_flag) * slot_num) * - (1 + normal_fix_flag * (alloc_len * (slot_num + 1) - 1)); - } - int64 get_filter_freq() { return filter_freq; } diff --git a/tensorflow/core/framework/embedding/embedding_memory_pool.h b/tensorflow/core/framework/embedding/embedding_memory_pool.h index 27b31ce1ed7..ef175151b00 100644 --- a/tensorflow/core/framework/embedding/embedding_memory_pool.h +++ b/tensorflow/core/framework/embedding/embedding_memory_pool.h @@ -18,9 +18,6 @@ limitations under the License. #include namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class EmbeddingMemoryPool { @@ -50,7 +47,7 @@ class EmbeddingMemoryPool { return ptr; } - void Deallocate(std::vector*> value_ptrs) { + void Deallocate(std::vector value_ptrs) { int64 prev_size = value_ptrs_queue_.size(); for (auto it : value_ptrs) { value_ptrs_queue_.emplace_back(it); @@ -59,9 +56,8 @@ class EmbeddingMemoryPool { int64 n = value_ptrs_queue_.size() - embs_per_block_; n = std::min(prev_size, n); for (int64 i = 0; i < n; i++) { - ValuePtr* val = value_ptrs_queue_.front(); - free_ptr_queue_.emplace_back(val->GetValue(0, 0)); - delete val; + void* val = value_ptrs_queue_.front(); + free_ptr_queue_.emplace_back((V*)val); value_ptrs_queue_.pop_front(); } } @@ -88,7 +84,7 @@ class EmbeddingMemoryPool { int64 embs_per_block_; Allocator* alloc_; std::deque free_ptr_queue_; - std::deque*> value_ptrs_queue_; + std::deque value_ptrs_queue_; std::vector block_list_; }; } //embedding diff --git a/tensorflow/core/framework/embedding/embedding_var.cu.cc b/tensorflow/core/framework/embedding/embedding_var.cu.cc index 0c0be83ec1d..f7162fd2c22 100644 --- a/tensorflow/core/framework/embedding/embedding_var.cu.cc +++ b/tensorflow/core/framework/embedding/embedding_var.cu.cc @@ -42,71 +42,6 @@ void SyncWithEventMgr(se::Stream* stream, while(!is_kernel_finish) {} } -template -void EmbeddingVar::SetDefaultValueOfNewFeatures( - const K* keys, int64 size, const std::list& init_cursor, - V** memcpy_address, se::Stream* compute_stream, EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device) { - if (init_cursor.size() > 0) { - int64 total = init_cursor.size(); - V** value_address = nullptr; - value_address = TypedAllocator::Allocate(cpu_allocator(), total * 2, - AllocationAttributes()); - V** default_value_address = value_address + total; - V** dev_value_address = nullptr; - dev_value_address = - TypedAllocator::Allocate(alloc_, total * 2, AllocationAttributes()); - V** dev_default_value_address = dev_value_address + total; - int64 i = 0; - auto it = init_cursor.cbegin(); - for (; it != init_cursor.cend(); ++it, ++i) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_address[i] = - *((V**)((char*)(value_ptr->GetPtr()) + sizeof(FixedLengthHeader))) + - storage_->GetOffset(emb_config_.emb_index); - default_value_address[i] = - default_value_ + - (keys[i] % emb_config_.default_value_dim) % value_len_; - } - DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * 2 * sizeof(V*)); - compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, - total * 2 * sizeof(V*)); - int block_dim = 128; - TF_CHECK_OK(GpuLaunchKernel( - embedding::CopyEmbedding, - 
(total * value_len_ + block_dim - 1) / block_dim, - block_dim, 0, gpu_device.stream(), dev_default_value_address, - dev_value_address, value_len_, total)); - SyncWithEventMgr(compute_stream, event_mgr); - // Set init meta of ValuePtrs - for (auto it = init_cursor.cbegin(); it != init_cursor.cend(); ++it) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_ptr->SetInitialized(emb_config_.emb_index); - memcpy_address[*it] = value_ptr->GetValue( - emb_config_.emb_index, - storage_->GetOffset(emb_config_.emb_index)); - } - TypedAllocator::Deallocate(alloc_, dev_value_address, total * 2); - TypedAllocator::Deallocate(cpu_allocator(), value_address, total * 2); - } -} - -#define REGISTER_KERNELS(ktype, vtype) \ - template void EmbeddingVar::SetDefaultValueOfNewFeatures( \ - const ktype*, int64, const std::list&, vtype**, \ - se::Stream*, EventMgr*, const Eigen::GpuDevice& gpu_device); -#define REGISTER_KERNELS_ALL(type) \ - REGISTER_KERNELS(int32, type); \ - REGISTER_KERNELS(int64, type) -#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) -TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) -#undef REGISTER_KERNELS_CPU - -#undef REGISTER_KERNELS_ALL -#undef REGISTER_KERNELS - template void EmbeddingVar::CopyEmbeddingsToBuffer( V* val_base, int64 size, V** memcpy_address, @@ -136,85 +71,6 @@ void EmbeddingVar::CopyEmbeddingsToBuffer( TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) #undef REGISTER_KERNELS_CPU -#undef REGISTER_KERNELS_ALL -#undef REGISTER_KERNELS - -template -void EmbeddingVar::CopyEmbeddingsFromCPUToGPU( - const K* keys, const std::list& copyback_cursor, V** memcpy_address, - se::Stream* compute_stream, EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device, - const DeviceBase::CpuWorkerThreads* worker_threads, - int64* output_value_ptrs) { - if (copyback_cursor.size() > 0) { - int64 total = copyback_cursor.size(); - size_t value_len = emb_config_.total_num(storage_->GetAllocLen()); - V* memcpy_buffer_gpu = nullptr; - ValuePtr** gpu_value_ptrs = new ValuePtr*[total]; - memcpy_buffer_gpu = (V*)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, - total * value_len * sizeof(V)); - storage_->CopyEmbeddingsFromCPUToGPU( - total, keys, copyback_cursor, memcpy_address, value_len, gpu_value_ptrs, - memcpy_buffer_gpu, compute_stream, event_mgr, worker_threads); - - V** value_address = (V**)cpu_allocator()->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V*) * total); - V** dev_value_address = (V**)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, - sizeof(V*) * total); - std::vector copyback_keys(total); - int64 i = 0; - auto it = copyback_cursor.cbegin(); - for (; it != copyback_cursor.cend(); ++it, ++i) { - bool init; - // Get the curosr - int64 cursor = *it & 0x0fffffffffffffff; - gpu_value_ptrs[i]->SetInitialized(emb_config_.emb_index); - memcpy_address[cursor] = LookupOrCreateEmb(gpu_value_ptrs[i], init); - value_address[i] = memcpy_address[cursor]; - copyback_keys[i] = keys[cursor]; - } - DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * sizeof(V*)); - compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, total * sizeof(V*)); - - int block_dim = 128; - TF_CHECK_OK(GpuLaunchKernel( - embedding::BatchUnpack, (total + block_dim - 1) / block_dim * value_len, - block_dim, 0, gpu_device.stream(), dev_value_address, memcpy_buffer_gpu, - value_len, total)); - - auto do_insert = [this, copyback_keys, gpu_value_ptrs, value_len]( - int64 start, int64 limit) { - for (int64 i = start; i < limit; i++) - storage_->Insert(copyback_keys[i], gpu_value_ptrs[i]); 
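Stepping back to the EmbeddingMemoryPool change shortly above the deleted GPU code: retired embeddings now enter the recycle deque as raw value pointers rather than ValuePtr objects, and the pool only reclaims entries beyond its per-block capacity, keeping a cushion of recently retired blocks alive for in-flight readers. A sketch of that bounded recycle queue, with plain new/delete in place of the pool's block allocator and an illustrative cushion size.

```
// Bounded recycle-queue sketch for embedding blocks (plain new[]/delete[]
// in place of the pool's block allocator; cushion size is illustrative).
#include <cstddef>
#include <deque>
#include <iostream>

class RecycleQueue {
 public:
  explicit RecycleQueue(size_t cushion) : cushion_(cushion) {}
  ~RecycleQueue() { while (!retired_.empty()) Reclaim(); }

  void Retire(float* block) {
    retired_.push_back(block);
    while (retired_.size() > cushion_) Reclaim();  // keep only the cushion
  }

 private:
  void Reclaim() { delete[] retired_.front(); retired_.pop_front(); }
  size_t cushion_;
  std::deque<float*> retired_;
};

int main() {
  RecycleQueue q(/*cushion=*/2);
  for (int i = 0; i < 4; ++i) q.Retire(new float[8]);  // oldest two reclaimed
  std::cout << "ok\n";
}
```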
- }; - Shard(worker_threads->num_threads, worker_threads->workers, - copyback_keys.size(), 100000, do_insert); - if (output_value_ptrs != nullptr) { - auto it = copyback_cursor.cbegin(); - for (int64 i = 0; it != copyback_cursor.cend(); ++it, ++i) { - int64 cursor = *it & 0x0fffffffffffffff; - output_value_ptrs[cursor] = (int64)gpu_value_ptrs[i]; - } - } - SyncWithEventMgr(compute_stream, event_mgr); - - alloc_->DeallocateRaw(dev_value_address); - alloc_->DeallocateRaw(memcpy_buffer_gpu); - cpu_allocator()->DeallocateRaw(value_address); - delete[] gpu_value_ptrs; - } -} -#define REGISTER_KERNELS(ktype, vtype) \ - template void EmbeddingVar::CopyEmbeddingsFromCPUToGPU( \ - const ktype*, const std::list&, vtype**, se::Stream*, EventMgr*, \ - const Eigen::GpuDevice&, const DeviceBase::CpuWorkerThreads*, int64*); -#define REGISTER_KERNELS_ALL(type) \ - REGISTER_KERNELS(int32, type); \ - REGISTER_KERNELS(int64, type) -#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) -TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) -#undef REGISTER_KERNELS_CPU - #undef REGISTER_KERNELS_ALL #undef REGISTER_KERNELS } // namespace tensorflow diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 28ce5094d87..487f595bf31 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -30,7 +30,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/embedding_var_context.h" #include "tensorflow/core/framework/embedding/embedding_var_restore.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/framework/embedding/filter_factory.h" #include "tensorflow/core/framework/embedding/gpu_hash_map_kv.h" #include "tensorflow/core/framework/embedding/embedding_config.h" @@ -57,7 +56,8 @@ class EmbeddingVar : public ResourceBase { EmbeddingVar(const string& name, embedding::Storage* storage, EmbeddingConfig emb_cfg, - Allocator* alloc): + Allocator* alloc, + embedding::FeatureDescriptor* feat_desc): name_(name), storage_(storage), default_value_(nullptr), @@ -65,27 +65,8 @@ class EmbeddingVar : public ResourceBase { value_len_(0), alloc_(alloc), default_value_alloc_(alloc), - emb_config_(emb_cfg) { - if (IsMultiLevel() || emb_config_.record_freq) { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) { - value_ptr->AddFreq(freq); - }; - } else if (emb_config_.is_counter_filter()) { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) { - if (value_ptr->GetFreq() < filter_freq) - value_ptr->AddFreq(freq); - }; - } else { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) {}; - } - if (emb_config_.steps_to_live != 0 || emb_config_.record_version) { - update_version_fn_ = [](ValuePtr* value_ptr, int64 gs) { - value_ptr->SetStep(gs); - }; - } else { - update_version_fn_ = [](ValuePtr* value_ptr, int64 gs) {}; - } - } + emb_config_(emb_cfg), + feat_desc_(feat_desc) {} Status Init(const Tensor& default_tensor, int64 default_value_dim) { if (storage_ == nullptr) { @@ -95,17 +76,11 @@ class EmbeddingVar : public ResourceBase { storage_type_ = storage_->GetStorageType(); filter_ = FilterFactory::CreateFilter>( - emb_config_, this, storage_); + emb_config_, this, storage_, feat_desc_); emb_config_.default_value_dim = default_value_dim; value_len_ = default_tensor.NumElements() / emb_config_.default_value_dim; - if 
(LayoutType::NORMAL_CONTIGUOUS == storage_->GetLayoutType() || - LayoutType::NORMAL_CONTIGUOUS_GPU == storage_->GetLayoutType() || - LayoutType::COMPACT == storage_->GetLayoutType()) { - storage_->SetAllocLen(value_len_, emb_config_.slot_num + 1); - } - if (storage_->IsUseHbm()) { #if GOOGLE_CUDA default_value_ = TypedAllocator::Allocate(alloc_, @@ -115,12 +90,6 @@ class EmbeddingVar : public ResourceBase { dev_addr_buffer_size_ = 0; cudaMemcpy(default_value_, &default_tensor_flat(0), default_tensor.TotalBytes(), cudaMemcpyDeviceToDevice); - storage_-> - CreateEmbeddingMemoryPool( - alloc_, - emb_config_.total_num( - storage_->GetAllocLen()), - 1024 * 1024 * 64); #endif // GOOGLE_CUDA } else if (storage_->IsSingleHbm()) { #if GOOGLE_CUDA @@ -147,6 +116,14 @@ class EmbeddingVar : public ResourceBase { emb_config_.default_value_no_permission); } } + bool is_all_slots_initialized = + feat_desc_->InitSlotInfo( + emb_config_.emb_index, value_len_, + std::pair( + default_value_, emb_config_.default_value_dim)); + if (is_all_slots_initialized) { + storage_->Init(); + } return Status::OK(); } @@ -159,57 +136,92 @@ class EmbeddingVar : public ResourceBase { return is_initialized_; } - Status LookupKey(K key, ValuePtr** value_ptr) { + Status LookupKey(K key, void** value_ptr) { return storage_->Get(key, value_ptr); } void BatchLookupKey(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys) { - storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys, - emb_config_.total_num(storage_->GetAllocLen())); + storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys); } - Status LookupOrCreateKey(K key, ValuePtr** value_ptr, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, bool indices_as_pointer, int64 count = 1) { if (indices_as_pointer) { - *value_ptr = (ValuePtr*)key; - *is_filter = (*value_ptr != nullptr); + *value_ptr = (void*)key; + *is_filter = filter_->is_admit(key, *value_ptr); return Status::OK(); } else { Status s = filter_->LookupOrCreateKey(key, value_ptr, is_filter, count); - add_freq_fn_(*value_ptr, count, emb_config_.filter_freq); return s; } } Status Insert(K key, V* value) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; CreateKey(key, &value_ptr, true); - LookupOrCreateEmb(value_ptr, value); + feat_desc_->SetValue(value_ptr, emb_config_.emb_index, value); return Status::OK(); } - Status LookupOrCreateKey(K key, ValuePtr** value_ptr) { - Status s = storage_->GetOrCreate(key, value_ptr, - emb_config_.total_num(storage_->GetAllocLen())); + Status LookupOrCreateKey(const EmbeddingVarContext& context, + const K* keys, + void** value_ptrs, + int64 num_of_keys, + int64* indices_counts, + bool indices_as_pointer = false) { + if (indices_as_pointer) { + auto lookup_key_and_set_version_fn = [keys, value_ptrs] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + value_ptrs[i] = (void*)keys[i]; + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. 
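// The unit_cost passed to Shard() below is an estimated cost in nanoseconds
// per element; the sharder uses it to decide how finely to split the range so
// each shard amortizes its scheduling overhead. A sketch of that kind of
// heuristic (an assumed model, not the exact logic in
// core/util/work_sharder.cc):
#include <algorithm>
#include <cstdint>

static int NumShards(int num_threads, int64_t total, int64_t cost_per_unit_ns) {
  const int64_t kMinShardCostNs = 10000;  // want >= ~10us of work per shard
  const int64_t useful = std::max<int64_t>(
      1, total * cost_per_unit_ns / kMinShardCostNs);
  return static_cast<int>(std::min<int64_t>(num_threads, useful));
}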
+ auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + lookup_key_and_set_version_fn); + } else { + filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); + } + + if (indices_counts != nullptr) { + auto add_freq_fn = [this, value_ptrs, indices_counts] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + feat_desc_->AddFreq(value_ptrs[i], indices_counts[i]); + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + add_freq_fn); + } + return Status::OK(); + } + + + Status LookupOrCreateKey(K key, void** value_ptr) { + Status s = storage_->GetOrCreate(key, value_ptr); TF_CHECK_OK(s); return s; } - void CreateKey(K key, ValuePtr** value_ptr, bool to_dram) { - storage_->Insert(key, value_ptr, - emb_config_.total_num(storage_->GetAllocLen()), to_dram); + void CreateKey(K key, void** value_ptr, bool to_dram) { + storage_->CreateAndInsert(key, value_ptr, to_dram); } - void UpdateVersion(ValuePtr* value_ptr, int64 gs) { - update_version_fn_(value_ptr, gs); + void UpdateVersion(void* value_ptr, int64 gs) { + feat_desc_->UpdateVersion(value_ptr, gs); } void BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { TF_CHECK_OK(storage_->BatchCommit(keys, value_ptrs)); } @@ -218,9 +230,9 @@ class EmbeddingVar : public ResourceBase { } int64 GetVersion(K key) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; TF_CHECK_OK(LookupOrCreateKey(key, &value_ptr)); - return value_ptr->GetStep(); + return feat_desc_->GetVersion(value_ptr); } int64 GetFreq(K key) { @@ -261,11 +273,11 @@ class EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { V* default_v = default_value + i * value_len_; - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; filter_->LookupOrCreate( keys[i], output + i * value_len_, default_v, &value_ptr, 1, default_value_no_permission_); - add_freq_fn_(value_ptr, 1, emb_config_.filter_freq); + feat_desc_->AddFreq(value_ptr, 1); } }; auto worker_threads = context.worker_threads; @@ -276,7 +288,7 @@ class EmbeddingVar : public ResourceBase { void GetOrCreateKey(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, int64 num_of_keys) { const K* keys = (K*)keys_tensor.data(); auto do_work = [this, keys, value_ptrs] (int64 start, int64 limit) { @@ -295,7 +307,7 @@ class EmbeddingVar : public ResourceBase { void GatherEmbeddings(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, V* output, int64 num_of_keys) { const K* keys = (K*)keys_tensor.data(); @@ -303,13 +315,10 @@ class EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); - add_freq_fn_(value_ptrs[i], 1, emb_config_.filter_freq); V* value = nullptr; if (is_admit) { - V* default_v = - default_value_ + - (keys[i] % emb_config_.default_value_dim) * value_len_; - value = LookupOrCreateEmb(value_ptrs[i], default_v); + value = feat_desc_->GetEmbedding( + value_ptrs[i], emb_config_.emb_index); } else { value = default_value_no_permission_; } @@ -341,8 +350,9 @@ class EmbeddingVar : public ResourceBase { void 
GetOrCreateKey(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, - int64 num_of_keys) { + void** value_ptrs, + int64 num_of_keys, + bool indices_as_pointer = false) { const K* keys = (K*)keys_tensor.data(); filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); storage_->AddToCachePrefetchList(keys_tensor); @@ -351,17 +361,17 @@ class EmbeddingVar : public ResourceBase { void BatchLookupOrCreateKey( const EmbeddingVarContext& context, const K* keys, - ValuePtr** value_ptrs, + void** value_ptrs, int64 num_of_keys, std::vector>& not_found_cursor_list) { storage_->BatchGetOrCreate(context, keys, value_ptrs, num_of_keys, - emb_config_.total_num(storage_->GetAllocLen()), + value_len_, not_found_cursor_list); } void GatherEmbeddings(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, V* output, int64 num_of_keys) { std::vector embedding_ptr(num_of_keys); @@ -370,12 +380,10 @@ class EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); - add_freq_fn_(value_ptrs[i], 1, emb_config_.filter_freq); + feat_desc_->AddFreq(value_ptrs[i], 1); if (is_admit) { - V* default_v = - default_value_ + - (keys[i] % emb_config_.default_value_dim) * value_len_; - embedding_ptr[i] = LookupOrCreateEmb(value_ptrs[i], default_v); + embedding_ptr[i] = feat_desc_->GetEmbedding( + value_ptrs[i], emb_config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission_; } @@ -394,72 +402,8 @@ class EmbeddingVar : public ResourceBase { storage_->AddToCache(keys_tensor); } - - void BatchLookupOrCreateEmb( - const EmbeddingVarContext& ctx, - V** var_ptr, - ValuePtr** value_ptrs, - const K* indices, - int64 num_of_keys, - IntraThreadCopyIdAllocator* thread_copy_id_alloc) { - int num_worker_threads = ctx.worker_threads->num_threads; - std::vector> init_cursor_list( - num_worker_threads + 1); - uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); - - auto do_work_get_ptrs = [this, value_ptrs, &init_cursor_list, - &thread_copy_id_alloc, main_thread_id, var_ptr] (int64 start, int64 limit) { - int copy_id = - thread_copy_id_alloc->GetCopyIdOfThread(main_thread_id); - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - var_ptr[i] = LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - if (is_need_set_default_value) { - init_cursor_list[copy_id].emplace_back(i); - } - } - }; - const int64 unit_cost = 1000; - auto worker_threads = ctx.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, - num_of_keys, unit_cost, do_work_get_ptrs); - - // Merge copies of init_cursor_list - for (int i = 1; i < (worker_threads->num_threads + 1); i++) { - if (init_cursor_list[i].size() > 0) { - init_cursor_list[0].splice(init_cursor_list[0].end(), - init_cursor_list[i]); - } - } - - auto stream = ctx.compute_stream; - auto event_mgr = ctx.event_mgr; - - SetDefaultValueOfNewFeatures( - indices, num_of_keys, - init_cursor_list[0], - var_ptr, stream, event_mgr, - ctx.gpu_device); - } #endif - void LookupOrCreate(K key, V* val, V* default_v, int count = 1) { - const V* default_value_ptr = - (default_v == nullptr) ? 
default_value_ : default_v; - ValuePtr* value_ptr = nullptr; - filter_->LookupOrCreate(key, val, default_value_ptr, &value_ptr, count, - default_value_no_permission_); - add_freq_fn_(value_ptr, count, emb_config_.filter_freq); - } - - void BatchInitEmb(int64 size, V** memcpy_address, V* default_value, - bool* init_flags, int64 value_len) { - filter_->BatchInitEmb(size, memcpy_address, default_value, - init_flags, value_len); - } - #if GOOGLE_CUDA void CopyEmbeddingsToBuffer( V* val_base, int64 size, @@ -467,73 +411,18 @@ class EmbeddingVar : public ResourceBase { se::Stream* compute_stream, EventMgr* event_mgr, const Eigen::GpuDevice& gpu_device); - - void SetDefaultValueOfNewFeatures( - const K* keys, int64 size, - const std::list& init_cursor, - V** memcpy_address, - se::Stream* compute_stream, - EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device); - - void CopyEmbeddingsFromCPUToGPU( - const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, - se::Stream* compute_stream, - EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device, - const DeviceBase::CpuWorkerThreads* worker_threads, - int64* output_value_ptrs = nullptr); - - void AllocateMemoryForNewFeatures( - V** memcpy_address, - const std::list& init_cursor) { - std::vector*> value_ptr_list; - for (auto it = init_cursor.cbegin(); - it != init_cursor.cend(); ++it) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_ptr_list.emplace_back(value_ptr); - } - storage_->AllocateMemoryForNewFeatures(value_ptr_list); - } #endif // GOOGLE_CUDA - V* LookupOrCreateEmb(ValuePtr* value_ptr, const V* default_v) { - return value_ptr->GetOrAllocate(alloc_, value_len_, default_v, - emb_config_.emb_index, storage_->GetOffset( - emb_config_.emb_index)); - } - - V* LookupOrCreateEmb(ValuePtr* value_ptr, const V* default_v, - Allocator* alloc) { - return value_ptr->GetOrAllocate(alloc, value_len_, default_v, - emb_config_.emb_index, storage_->GetOffset( - emb_config_.emb_index)); - } - - V* LookupOrCreateEmb(ValuePtr* value_ptr, bool &need_initialize) { - return value_ptr->GetOrAllocate(alloc_, value_len_, nullptr, - emb_config_.emb_index, - storage_->GetOffset(emb_config_.emb_index), - need_initialize); - } - - V* LookupPrimaryEmb(ValuePtr* value_ptr) { - V* primary_val = value_ptr->GetValue(emb_config_.primary_emb_index, - storage_->GetOffset(emb_config_.primary_emb_index)); - return primary_val; - } - - typename TTypes::Flat flat(ValuePtr* value_ptr, int64 index) { - V* default_v = - default_value_ + (index % emb_config_.default_value_dim) * value_len_; - V* val = LookupOrCreateEmb(value_ptr, default_v); + typename TTypes::Flat flat(void* value_ptr) { + V* val = feat_desc_->GetEmbedding(value_ptr, emb_config_.emb_index); Eigen::array dims({value_len_}); return typename TTypes::Flat(val, dims); } + V* GetValuePtr(void* ptr) { + return feat_desc_->GetEmbedding(ptr, emb_config_.emb_index); + } + int64 ValueLen() const { return value_len_; } @@ -602,25 +491,26 @@ class EmbeddingVar : public ResourceBase { std::vector* value_list, std::vector* version_list, std::vector* freq_list) { - std::vector*> value_ptr_list; + std::vector value_ptr_list; storage_->GetSnapshot(key_list, &value_ptr_list); bool is_save_freq = emb_config_.is_save_freq(); bool is_save_version = emb_config_.is_save_version(); for (int64 i = 0; i < key_list->size(); i++) { - V* val = value_ptr_list[i]->GetValue(emb_config_.emb_index, 0); - if (val != nullptr) { + if (feat_desc_->IsAdmit(value_ptr_list[i])) { + V* val = 
feat_desc_->GetEmbedding( + value_ptr_list[i], emb_config_.emb_index); value_list->emplace_back(val); } else { value_list->emplace_back(default_value_); } if(is_save_version) { - int64 dump_version = value_ptr_list[i]->GetStep(); + int64 dump_version = feat_desc_->GetVersion(value_ptr_list[i]); version_list->emplace_back(dump_version); } if(is_save_freq) { - int64 dump_freq = value_ptr_list[i]->GetFreq(); + int64 dump_freq = feat_desc_->GetFreq(value_ptr_list[i]); freq_list->emplace_back(dump_freq); } } @@ -634,6 +524,10 @@ class EmbeddingVar : public ResourceBase { return storage_; } + embedding::FeatureDescriptor* feature_descriptor() { + return feat_desc_; + } + Status Shrink(embedding::ShrinkArgs& shrink_args) { if (emb_config_.is_primary()) { shrink_args.value_len = value_len_; @@ -671,10 +565,6 @@ class EmbeddingVar : public ResourceBase { return alloc_; } - int64 GetAllocLen() { - return emb_config_.total_num(storage_->GetAllocLen()); - } - V** GetBuffer(int64 size) { if (dev_addr_buffer_size_ >= size) { return dev_addr_buffer_; @@ -756,16 +646,17 @@ class EmbeddingVar : public ResourceBase { return storage_->HashTable(); } - protected: FilterPolicy>* GetFilter() const { return filter_; } + protected: ~EmbeddingVar() override { // When dynamic dimension embedding is used, // there will be more than one primary slot if (emb_config_.is_primary() && emb_config_.primary_emb_index == 0) { delete storage_; + delete feat_desc_; } if (embedding::StorageType::HBM_DRAM == storage_type_) { alloc_->DeallocateRaw(dev_addr_buffer_); @@ -804,35 +695,6 @@ class EmbeddingVar : public ResourceBase { value_len_ * sizeof(V), do_work); } - V* GetAddressOfGpuValuePtr(ValuePtr* value_ptr, - int64 index, - bool copyback_flag, - std::list& init_cursor, - std::list& copyback_cursor) { - V* mem_addr = nullptr; - bool init_flag = false; - if (!copyback_flag) { - mem_addr = LookupOrCreateEmb(value_ptr, init_flag); - } else { - mem_addr = value_ptr->GetValue(0,0); - if (copyback_flag == - embedding::CopyBackFlag::COPYBACK_AND_DESTROY) { - delete value_ptr; - // If the 64th bit of cursor is set to 1, - // the corresponding valueptr need to be deleted later. 
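// A standalone sketch of the cursor-tagging trick the comment above
// describes: bit 63 marks a ValuePtr that must be destroyed after the copy,
// while the low bits keep the index. Helper names are illustrative; the
// surrounding code inlines the same shift and the 0x0fffffffffffffff mask.
#include <cstdint>

inline int64_t TagDestroy(int64_t index) {
  return static_cast<int64_t>(static_cast<uint64_t>(index) |
                              (uint64_t{1} << 63));
}
inline bool NeedsDestroy(int64_t cursor) {
  return (static_cast<uint64_t>(cursor) >> 63) & 0x1;
}
inline int64_t RawIndex(int64_t cursor) {
  return cursor & 0x0fffffffffffffff;  // clears the flag bits
}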
- int64 tmp = 1; - tmp = tmp << 63; - copyback_cursor.emplace_back(index | tmp); - } else { - copyback_cursor.emplace_back(index); - } - } - if (init_flag) { - init_cursor.emplace_back(index); - } - return mem_addr; - } - std::string name_; bool is_initialized_ = false; @@ -849,8 +711,7 @@ class EmbeddingVar : public ResourceBase { embedding::StorageType storage_type_; EmbeddingConfig emb_config_; FilterPolicy>* filter_; - std::function*, int64, int64)> add_freq_fn_; - std::function*, int64)> update_version_fn_; + embedding::FeatureDescriptor* feat_desc_; TF_DISALLOW_COPY_AND_ASSIGN(EmbeddingVar); }; diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc index c1b43a608b5..7dddf714b6b 100644 --- a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc @@ -21,42 +21,38 @@ namespace tensorflow { namespace embedding { template void EmbeddingVarCkptData::Emplace( - K key, ValuePtr* value_ptr, + K key, void* value_ptr, const EmbeddingConfig& emb_config, - V* default_value, int64 value_offset, + V* default_value, + FeatureDescriptor* feat_desc, bool is_save_freq, bool is_save_version, bool save_unfiltered_features) { if((int64)value_ptr == ValuePtrStatus::IS_DELETED) return; - V* primary_val = value_ptr->GetValue(0, 0); - bool is_not_admit = - primary_val == nullptr - && emb_config.filter_freq != 0; + bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); + bool is_admit = feat_desc->IsAdmit(value_ptr); - if (!is_not_admit) { + if (is_admit) { key_vec_.emplace_back(key); - if (primary_val == nullptr) { + if (!is_in_dram) { + value_ptr_vec_.emplace_back((V*)ValuePtrStatus::NOT_IN_DRAM); + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + } else if (feat_desc->GetEmbedding(value_ptr, 0) == nullptr) { value_ptr_vec_.emplace_back(default_value); - } else if ( - (int64)primary_val == ValuePosition::NOT_IN_DRAM) { - value_ptr_vec_.emplace_back((V*)ValuePosition::NOT_IN_DRAM); } else { - V* val = value_ptr->GetValue(emb_config.emb_index, - value_offset); + V* val = feat_desc->GetEmbedding(value_ptr, emb_config.emb_index); value_ptr_vec_.emplace_back(val); } - - if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); + int64 dump_version = feat_desc->GetVersion(value_ptr); version_vec_.emplace_back(dump_version); } if(is_save_freq) { - int64 dump_freq = value_ptr->GetFreq(); + int64 dump_freq = feat_desc->GetFreq(value_ptr); freq_vec_.emplace_back(dump_freq); } } else { @@ -66,18 +62,18 @@ void EmbeddingVarCkptData::Emplace( key_filter_vec_.emplace_back(key); if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); + int64 dump_version = feat_desc->GetVersion(value_ptr); version_filter_vec_.emplace_back(dump_version); } - int64 dump_freq = value_ptr->GetFreq(); + int64 dump_freq = feat_desc->GetFreq(value_ptr); freq_filter_vec_.emplace_back(dump_freq); } } #define REGISTER_KERNELS(ktype, vtype) \ template void EmbeddingVarCkptData::Emplace( \ - ktype, ValuePtr*, const EmbeddingConfig&, \ - vtype*, int64, bool, bool, bool); + ktype, void*, const EmbeddingConfig&, \ + vtype*, FeatureDescriptor*, bool, bool, bool); #define REGISTER_KERNELS_ALL_INDEX(type) \ REGISTER_KERNELS(int32, type) \ REGISTER_KERNELS(int64, type) diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h index 6d7b09e70b0..10bf0d0e43b 100644 --- 
a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h @@ -19,15 +19,19 @@ limitations under the License. #include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" namespace tensorflow { class BundleWriter; +namespace { + const int kSavedPartitionNum = 1000; + const int kDramFlagOffset = 49; +} namespace embedding { - template class EmbeddingVarCkptData { public: - void Emplace(K key, ValuePtr* value_ptr, + void Emplace(K key, void* value_ptr, const EmbeddingConfig& emb_config, - V* default_value, int64 value_offset, + V* default_value, + FeatureDescriptor* feat_desc, bool is_save_freq, bool is_save_version, bool save_unfiltered_features); diff --git a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h index 84c823a90dc..4c052b43c7e 100644 --- a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h +++ b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h @@ -57,7 +57,7 @@ class EV2dVectorDataDumpIterator: public DumpIterator { value_len_(value_len), col_idx_(0) { if (!valueptr_list.empty()) { - if ((int64)*curr_iter_ == ValuePosition::NOT_IN_DRAM) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { curr_ptr_ = val_iter_->Next(); } else { curr_ptr_ = *curr_iter_; @@ -75,7 +75,7 @@ class EV2dVectorDataDumpIterator: public DumpIterator { curr_iter_++; col_idx_ = 0; if (curr_iter_ != end_iter_) { - if ((int64)*curr_iter_ == ValuePosition::NOT_IN_DRAM) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { curr_ptr_ = val_iter_->Next(); } else { curr_ptr_ = *curr_iter_; diff --git a/tensorflow/core/framework/embedding/feature_descriptor.h b/tensorflow/core/framework/embedding/feature_descriptor.h new file mode 100644 index 00000000000..8808da353f4 --- /dev/null +++ b/tensorflow/core/framework/embedding/feature_descriptor.h @@ -0,0 +1,200 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/framework/embedding/config.pb.h" +#include "tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h" +#include "tensorflow/core/framework/embedding/normal_feature_descriptor.h" +#include + +namespace tensorflow { +namespace embedding { + +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl; + +template +class FeatureDescriptor { + public: + FeatureDescriptor( + int64 block_num, + int64 slot_num, + Allocator* alloc, + StorageType storage_type, + bool need_record_freq, + bool need_record_version, + const std::pair& filter_info) { + if (block_num > 1) { + feat_desc_impl_.reset( + new DynmaicDimDescriptorImpl( + alloc, block_num * slot_num)); + } else if (filter_info.first) { + feat_desc_impl_.reset( + new CounterFilterDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version, + filter_info.second, + storage_type)); + } else if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { + feat_desc_impl_.reset( + new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } + } + + FeatureDescriptor(FeatureDescriptor* feat_desc) { + if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(CounterFilterDescriptorImpl*)) { + feat_desc_impl_.reset( + new CounterFilterDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + else if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl)) { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + } + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + return feat_desc_impl_->InitSlotInfo( + emb_index, embedding_dim, default_value); + } + + bool InitSlotInfo(FeatureDescriptor* feat_desc) { + return feat_desc_impl_->InitSlotInfo(feat_desc->feat_desc_impl_.get()); + } + + V* GetEmbedding(void *val, int emb_index) { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + void* Allocate() { + return feat_desc_impl_->Allocate(); + } + + void* Allocate(int64 freq) { + return feat_desc_impl_->Allocate(freq); + } + + void Deallocate(void* val) { + feat_desc_impl_->Deallocate(val); + } + + void Deallocate(const std::vector& value_ptrs) { + feat_desc_impl_->Deallocate(value_ptrs); + } + + void SetDefaultValue(void* val, int64 index) { + feat_desc_impl_->SetDefaultValue(val, index); + } + + void SetValue(void* val, int64 emb_index, V* value) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** 
value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + reinterpret_cast*>(feat_desc_impl_.get())->SetDefaultValues( + keys, init_cursor, value_ptrs, + compute_stream, event_mgr, gpu_device); + } +#endif + + void SetAllocator(Allocator* alloc) { + feat_desc_impl_->SetAllocator(alloc); + } + + int data_bytes() { + return feat_desc_impl_->data_bytes(); + } + + int64 GetFreq(void* val) { + return feat_desc_impl_->GetFreq(val); + } + + int64 GetVersion(void* val) { + return feat_desc_impl_->GetVersion(val); + } + + void SetFreq(void* val, int64 freq) { + feat_desc_impl_->SetFreq(val, freq); + } + + void UpdateVersion(void* val, int64 version) { + feat_desc_impl_->UpdateVersion(val, version); + } + + void AddFreq(void* val, int64 freq) { + feat_desc_impl_->AddFreq(val, freq); + } + + int total_dim() { + return feat_desc_impl_->total_dim(); + } + + bool IsAdmit(void* val) { + return feat_desc_impl_->IsAdmit(val); + } + + void* Admit(void* val) { + return feat_desc_impl_->Admit(val); + } + + + protected: + std::unique_ptr> feat_desc_impl_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/feature_descriptor_impl.h b/tensorflow/core/framework/embedding/feature_descriptor_impl.h new file mode 100644 index 00000000000..6996d22f447 --- /dev/null +++ b/tensorflow/core/framework/embedding/feature_descriptor_impl.h @@ -0,0 +1,317 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#include "tensorflow/core/util/env_var.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { +namespace embedding { +struct SlotInfo { + int embedding_dim; + int embedding_offset; + void* default_value; + int64 default_value_dim; + int default_value_len; +}; + +class BaseFreqDescriptor { + public: + virtual int64 GetFreq(void* value_ptr) = 0; + virtual void AddFreq(void* value_ptr, int64 freq) {} + virtual void SetFreq(void* value_ptr, int64 freq) {} + virtual BaseFreqDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class FreqDescriptor: public BaseFreqDescriptor { + public: + explicit FreqDescriptor(int offset_byte) + : offset_byte_(offset_byte) {} + + int64 GetFreq(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void AddFreq(void* value_ptr, int64 freq) override { + __sync_fetch_and_add((int64*)(value_ptr + offset_byte_), freq); + } + + void SetFreq(void* value_ptr, int64 freq) override { + *(int64*)(value_ptr + offset_byte_) = freq; + } + + BaseFreqDescriptor* Clone() override { + return new FreqDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonFreqDescriptor: public BaseFreqDescriptor { + public: + int64 GetFreq(void* value_ptr) override { + LOG(FATAL)<<"Can not get freq from NonFreqCounter."; + } + + BaseFreqDescriptor* Clone() override { + return new NonFreqDescriptor(); + } +}; + +class BaseVersionDescriptor { + public: + virtual int64 GetVersion(void* value_ptr) = 0; + virtual void UpdateVersion(void* value_ptr, int64 version) {} + virtual BaseVersionDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class VersionDescriptor: public BaseVersionDescriptor { + public: + explicit VersionDescriptor(int offset_byte) + : offset_byte_(offset_byte) {} + + int64 GetVersion(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void UpdateVersion(void* value_ptr, int64 version) override { + *(int64*)(value_ptr + offset_byte_) = version; + } + + BaseVersionDescriptor* Clone() override { + return new VersionDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonVersionDescriptor: public BaseVersionDescriptor { + public: + int64 GetVersion(void* value_ptr) override { + LOG(FATAL)<<"Can not get version from NonFreqCounter."; + } + + BaseVersionDescriptor* Clone() override { + return new NonVersionDescriptor(); + } +}; + +template +class FeatureDescriptorImpl { + public: + FeatureDescriptorImpl(int64 slot_num, + bool need_record_freq, + bool need_record_version) { + slot_infos_.resize(slot_num); + for (int i = 0; i < slot_infos_.size(); i++) { + slot_infos_[i].embedding_offset = EMPTY_OFFSET_VALUE; + } + + if (!need_record_freq) { + freq_desc_.reset(new NonFreqDescriptor()); + } + if (!need_record_version) { + version_desc_.reset(new NonVersionDescriptor()); + } + } + + FeatureDescriptorImpl(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + freq_desc_.reset( 
+ feat_desc_impl->freq_desc_->Clone()); + version_desc_.reset( + feat_desc_impl->version_desc_->Clone()); + } + + virtual ~FeatureDescriptorImpl() {} + + virtual bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) = 0; + virtual bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + LOG(FATAL)<<"InitSlotInfo(feat_desc_impl) is not implemented."; + } + virtual V* GetEmbedding(void* val, int emb_index) = 0; + virtual void* Allocate() = 0; + virtual void* Allocate(int64 freq) {return Allocate();} + virtual void Deallocate(void* val) = 0; + virtual void Deallocate(const std::vector& val) = 0; + virtual void SetAllocator(Allocator* alloc) = 0; + virtual void SetDefaultValue(void* val, int64 key) = 0; + virtual void SetValue(void* val, int64 emb_index, V* value) {} + virtual bool IsAdmit(void* val) {return true;} + virtual void* Admit(void* val) {} +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + virtual int data_bytes() = 0; + + virtual int64 GetFreq(void* val) { + return freq_desc_->GetFreq(val); + } + + virtual int64 GetVersion(void* val) { + return version_desc_->GetVersion(val); + } + + virtual void SetFreq(void* val, int64 freq) { + freq_desc_->SetFreq(val, freq); + } + + virtual void UpdateVersion(void* val, int64 version) { + version_desc_->UpdateVersion(val, version); + } + + virtual void AddFreq(void* val, int64 freq) { + freq_desc_->AddFreq(val, freq); + } + + inline int total_dim() { + int64 slot_num = slot_infos_.size(); + return slot_infos_[slot_num - 1].embedding_offset + + slot_infos_[slot_num - 1].embedding_dim; + } + + protected: + bool SetEmbeddingInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + slot_infos_[emb_index].default_value = default_value.first; + slot_infos_[emb_index].default_value_dim = default_value.second; + slot_infos_[emb_index].default_value_len = embedding_dim; + + bool is_aligned = true; + TF_CHECK_OK(ReadBoolFromEnvVar("EV_DATA_ALIGNED", true, + &is_aligned)); + if (is_aligned) { + embedding_dim = ComputeAlignedDim(embedding_dim); + } + + //Avoid parallel consistency issue + __sync_bool_compare_and_swap( + &slot_infos_[emb_index].embedding_offset, + EMPTY_OFFSET_VALUE, embedding_dim); + slot_infos_[emb_index].embedding_dim = embedding_dim; + //Check whether all offsets are set + for (int i = 0; i < slot_infos_.size(); i++) { + if (slot_infos_[i].embedding_offset == EMPTY_OFFSET_VALUE) { + return false; + } + } + + ComputeEmbeddingOffsets(); + return true; + } + + void SetSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + } + + void ComputeAllocBytes(int* alloc_bytes) { + for(auto slot_info: slot_infos_) { + *alloc_bytes += slot_info.embedding_dim * sizeof(V); + } + } + + void CreateFreqAndVersionDescriptor(int* alloc_bytes) { + if (!freq_desc_) { + freq_desc_.reset(new FreqDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + if (!version_desc_) { + version_desc_.reset(new VersionDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + } + + void InitFreqAndVersion(void* val) { + freq_desc_->SetFreq(val, 0); + version_desc_->UpdateVersion(val, -1); + } + + void SetFreqAndVersionOffset(int* alloc_bytes) { + freq_desc_->SetOffset(alloc_bytes); + version_desc_->SetOffset(alloc_bytes); + } + + V* GetDefaultValuePtr(int64 emb_index, int64 key) { + V*
default_value_base = (V*)slot_infos_[emb_index].default_value; + int64 default_value_offset = + (key % slot_infos_[emb_index].default_value_dim) * + slot_infos_[emb_index].default_value_len; + return default_value_base + default_value_offset; + } + + void SetDefaultValue(void* val, int64 emb_index, int64 key) { + memcpy(val, + GetDefaultValuePtr(emb_index, key), + slot_infos_[emb_index].default_value_len * sizeof(V)); + } + + private: + int64 ComputeAlignedDim(int64 embedding_dim) { + int padding_bytes = + ALIGN_BYTES - embedding_dim * sizeof(V) % ALIGN_BYTES; + if (padding_bytes == ALIGN_BYTES) { + return embedding_dim; + } else { + return embedding_dim + padding_bytes / sizeof(V); + } + } + + void ComputeEmbeddingOffsets() { + for (int i = slot_infos_.size() - 1 ; i >= 0; i--) { + slot_infos_[i].embedding_offset = 0; + for (int j = 0; j < i; j++) { + slot_infos_[i].embedding_offset += slot_infos_[j].embedding_offset; + } + } + } + + protected: + const int EMPTY_OFFSET_VALUE= -1; + const int ALIGN_BYTES = 16; + std::vector slot_infos_; + std::unique_ptr freq_desc_; + std::unique_ptr version_desc_; +}; + +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ diff --git a/tensorflow/core/framework/embedding/filter_factory.h b/tensorflow/core/framework/embedding/filter_factory.h index 5bb92467a51..0127e2c882a 100644 --- a/tensorflow/core/framework/embedding/filter_factory.h +++ b/tensorflow/core/framework/embedding/filter_factory.h @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/filter_policy.h" #include "tensorflow/core/framework/embedding/nullable_filter_policy.h" - namespace tensorflow { namespace embedding{ template @@ -34,22 +33,23 @@ class FilterFactory { template static FilterPolicy* CreateFilter( const EmbeddingConfig& config, EV* ev, - embedding::Storage* storage) { + embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) { if (config.filter_freq > 0) { if (config.kHashFunc != 0) { return new BloomFilterPolicy( - config, ev); + config, ev, feat_desc); } else { return new CounterFilterPolicy( - config, ev); + config, ev, feat_desc); } } else { return new NullableFilterPolicy( - config, ev, storage); + config, ev, storage, feat_desc); } } }; -} // tensorflow +} //namespace tensorflow #endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_FACTORY_H_ diff --git a/tensorflow/core/framework/embedding/filter_policy.h b/tensorflow/core/framework/embedding/filter_policy.h index 559a6796246..256d3b044d4 100644 --- a/tensorflow/core/framework/embedding/filter_policy.h +++ b/tensorflow/core/framework/embedding/filter_policy.h @@ -18,6 +18,7 @@ limitations under the License. 
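// ComputeEmbeddingOffsets() above relies on a subtle invariant: after
// SetEmbeddingInfo(), each embedding_offset temporarily holds that slot's
// aligned dim, and iterating from the last slot downward rewrites them as
// exclusive prefix sums while entries j < i still hold raw dims. A sketch
// under that assumption:
#include <cstdint>
#include <vector>

static void DimsToExclusiveOffsets(std::vector<int64_t>& v) {
  // On entry v[i] is slot i's dim; on exit v[i] is slot i's start offset.
  for (int i = static_cast<int>(v.size()) - 1; i >= 0; --i) {
    int64_t sum = 0;
    for (int j = 0; j < i; ++j) sum += v[j];  // v[j] is still a dim here
    v[i] = sum;
  }
}
// e.g. dims {8, 4, 4} become offsets {0, 8, 12}; total_dim() then adds the
// last slot's own dim on top of the last offset.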
#include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/emb_file.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" namespace tensorflow { @@ -45,9 +46,6 @@ struct RestoreBuffer { template class RestoreSSDBuffer; -template -class ValuePtr; - template class FilterPolicy { public: @@ -55,7 +53,7 @@ class FilterPolicy { config_(config), ev_(ev) {} virtual void LookupOrCreate(K key, V* val, - const V* default_value_ptr, ValuePtr** value_ptr, + const V* default_value_ptr, void** value_ptr, int count, const V* default_value_no_permission) = 0; virtual Status Lookup(K key, V* val, const V* default_value_ptr, @@ -70,53 +68,25 @@ class FilterPolicy { virtual void BatchLookupOrCreateKey( const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) = 0; #endif //GOOGLE_CUDA - virtual Status LookupOrCreateKey(K key, ValuePtr** val, + virtual Status LookupOrCreateKey(K key, void** val, bool* is_filter, int64 count) = 0; + + virtual Status LookupKey(K key, void** val, + bool* is_filter, int64 count) {} - virtual int64 GetFreq(K key, ValuePtr* value_ptr) = 0; - + virtual int64 GetFreq(K key, void* value_ptr) = 0; virtual int64 GetFreq(K key) = 0; - virtual bool is_admit(K key, ValuePtr* value_ptr) = 0; + virtual bool is_admit(K key, void* value_ptr) = 0; virtual Status Restore(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool to_dram, bool is_incr, RestoreBuffer& restore_buff) = 0; - protected: - void LookupOrCreateEmbInternal(bool is_filter, bool to_dram, - int i, int value_len, - ValuePtr* value_ptr, - V* value_src, K* key_src) { - - if (!is_filter) { - ev_->LookupOrCreateEmb(value_ptr, value_src + i * ev_->ValueLen()); - return; - } else { - if (to_dram) { -#if GOOGLE_CUDA - std::vector default_value_host; - default_value_host.resize(config_.default_value_dim * value_len); - cudaMemcpy(default_value_host.data(), ev_->GetDefaultValuePtr(), - sizeof(V) * config_.default_value_dim * value_len, - cudaMemcpyDeviceToHost); - ev_->LookupOrCreateEmb(value_ptr, - default_value_host.data() + - (key_src[i] % config_.default_value_dim) - * ev_->ValueLen()); -#endif - return; - } else { - ev_->LookupOrCreateEmb(value_ptr, ev_->GetDefaultValue(key_src[i])); - return; - } - } - } - protected: EmbeddingConfig config_; EV* ev_; diff --git a/tensorflow/core/framework/embedding/globalstep_shrink_policy.h b/tensorflow/core/framework/embedding/globalstep_shrink_policy.h index a2af6a2430a..b0950eff22d 100644 --- a/tensorflow/core/framework/embedding/globalstep_shrink_policy.h +++ b/tensorflow/core/framework/embedding/globalstep_shrink_policy.h @@ -18,25 +18,21 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/shrink_policy.h" namespace tensorflow { - -template -class ValuePtr; - namespace embedding { template class GlobalStepShrinkPolicy : public ShrinkPolicy { public: GlobalStepShrinkPolicy(int64 steps_to_live, - Allocator* alloc, + FeatureDescriptor* feat_desc, KVInterface* kv) : steps_to_live_(steps_to_live), kv_(kv), - ShrinkPolicy(alloc) {} + ShrinkPolicy(feat_desc) {} TF_DISALLOW_COPY_AND_ASSIGN(GlobalStepShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override { ShrinkPolicy::ReleaseValuePtrs(); FilterToDelete(shrink_args.global_step, @@ -46,16 +42,16 @@ class GlobalStepShrinkPolicy : public ShrinkPolicy { private: void FilterToDelete(int64 global_step, std::vector& key_list, - std::vector*>& value_list) { + std::vector& value_list) { for (int64 i = 0; i < key_list.size(); ++i) { - int64 version = value_list[i]->GetStep(); + int64 version = ShrinkPolicy::feat_desc_->GetVersion(value_list[i]); if (version == -1) { - value_list[i]->SetStep(global_step); + ShrinkPolicy::feat_desc_->UpdateVersion(value_list[i], global_step); } else { if (global_step - version > steps_to_live_) { kv_->Remove(key_list[i]); ShrinkPolicy::EmplacePointer(value_list[i]); - value_list[i] = (ValuePtr*)ValuePtrStatus::IS_DELETED; + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; } } } diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h index 1dd90d63a6e..fc4a2506313 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h @@ -204,29 +204,29 @@ class GPUHashMapKV : public KVInterface { } Status BatchLookupOrCreate(const K* keys, size_t n, - ValuePtr** value_ptrs) override { + void** value_ptrs) override { return Status::OK(); } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { return Status::OK(); } Status Contains(K key) override { return Status::OK(); } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { return Status::OK(); } Status Remove(K key) override { return Status::OK(); } Status BatchLookup(const K* keys, size_t size, - ValuePtr** value_ptrs) override { + void** value_ptrs) override { return Status::OK(); } Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return Status::OK(); } @@ -235,22 +235,20 @@ class GPUHashMapKV : public KVInterface { } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return Status::OK(); } int64 Size() const override { return 0; } - void SetTotalDims(int total_dims) override {} + void FreeValuePtr(void* value_ptr) override {} - void FreeValuePtr(ValuePtr* value_ptr) override {} - - Status Commit(K key, const ValuePtr* value_ptr) override { + Status Commit(K key, const void* value_ptr) override { return Status::OK(); } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { return Status::OK(); } diff --git a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h index 581f1f1cfaf..1056f4bbd78 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h +++ 
b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h @@ -3,7 +3,6 @@ #if GOOGLE_CUDA #define EIGEN_USE_GPU -#include "tensorflow/core/framework/embedding/lockless_hash_map_cpu.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" @@ -14,9 +13,6 @@ namespace tensorflow { using se::DeviceMemoryBase; using se::Stream; -template -class ValuePtr; - template class CheckpointLoader; @@ -26,15 +22,17 @@ namespace embedding { template class HbmDramSsdStorage : public MultiTierStorage { public: - HbmDramSsdStorage(const StorageConfig& sc, Allocator* gpu_alloc, - Allocator* cpu_alloc, LayoutCreator* lc, const std::string& name) - : cpu_alloc_(cpu_alloc), gpu_alloc_(gpu_alloc), + HbmDramSsdStorage(const StorageConfig& sc, + Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name), dram_capacity_(-1) { - hbm_ = new HbmStorageWithCpuKv(sc, gpu_alloc_, lc); - dram_ = new DramStorage(sc, cpu_alloc_, lc, - new LocklessHashMapCPU(gpu_alloc_)); - ssd_ = new SsdHashStorage(sc, cpu_alloc_, lc); + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); + ssd_ = new SsdHashStorage(sc, dram_feat_desc_); } ~HbmDramSsdStorage() override { @@ -46,29 +44,20 @@ class HbmDramSsdStorage : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(HbmDramSsdStorage); - void SetAllocLen(int64 value_len, int slot_num) override { - while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. 
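// The 16-byte alignment requirement in the comment above is what
// ComputeAlignedDim() in feature_descriptor_impl.h now enforces: a slot's dim
// is padded so its byte size is a multiple of ALIGN_BYTES. A minimal sketch
// for float values:
#include <cstdint>

static int64_t AlignedDim(int64_t dim, int64_t elem_bytes = sizeof(float)) {
  const int64_t kAlignBytes = 16;
  const int64_t padding = kAlignBytes - (dim * elem_bytes) % kAlignBytes;
  return padding == kAlignBytes ? dim : dim + padding / elem_bytes;
}
// e.g. AlignedDim(9) == 12 (48 bytes) while AlignedDim(8) stays 8 (32 bytes).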
- Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + ssd_->Init(); - MultiTierStorage::cache_capacity_ = - Storage::storage_config_.size[0] - / (Storage::total_dims_ * sizeof(V)); + MultiTierStorage::cache_capacity_ = + Storage::storage_config_.size[0] + / (total_dim() * sizeof(V)); - dram_capacity_ = Storage::storage_config_.size[1] - / (Storage::total_dims_ * sizeof(V)); - MultiTierStorage::ready_eviction_ = true; - } - Storage::flag_.clear(std::memory_order_release); + dram_capacity_ = Storage::storage_config_.size[1] + / (total_dim() * sizeof(V)); + MultiTierStorage::ready_eviction_ = true; } - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = hbm_->Get(key, value_ptr); if (s.ok()) { return s; @@ -88,13 +77,12 @@ class HbmDramSsdStorage : public MultiTierStorage { void BatchGet(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) override { + void** value_ptr_list, + int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); - std::vector*>> + std::vector> ssd_value_ptr_list(num_worker_threads + 1); BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, @@ -102,20 +90,20 @@ class HbmDramSsdStorage : public MultiTierStorage { CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursor_list[0], - ssd_value_ptr_list[0], value_len); + ssd_value_ptr_list[0]); } void BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_fountd_cursor_list) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); - std::vector*>> + std::vector> ssd_value_ptr_list(num_worker_threads + 1); BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, @@ -124,70 +112,27 @@ class HbmDramSsdStorage : public MultiTierStorage { CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursor_list[0], - ssd_value_ptr_list[0], value_len); + ssd_value_ptr_list[0]); CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], value_len); } - void Insert(K key, ValuePtr* value_ptr) override { + void Insert(K key, void** value_ptr) override { hbm_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { if (to_dram) { - dram_->Insert(key, value_ptr, alloc_len); + dram_->Insert(key, value_ptr); } else { - hbm_->Insert(key, value_ptr, alloc_len); + hbm_->Insert(key, value_ptr); } } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(size); - { - mutex_lock l(memory_pool_mu_); - gpu_value_ptr->SetPtr(embedding_mem_pool_->Allocate()); - *value_ptr = gpu_value_ptr; - } - s = hbm_->TryInsert(key, *value_ptr); - // Insert Failed - if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate((*value_ptr)->GetValue(0, 0)); - } - delete *value_ptr; - return 
hbm_->Get(key, value_ptr); - } else { - return s; - } - } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - s = dram_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK; - return s; - } - s = ssd_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK_AND_DESTROY; - return s; - } - hbm_->Insert(key, value_ptr, size); - return Status::OK(); + Status GetOrCreate(K key, void** value_ptr) override { + LOG(FATAL)<<"Storage with HBM only supports batch APIs."; } void InitCache(embedding::CacheStrategy cache_strategy) override { @@ -195,66 +140,6 @@ class HbmDramSsdStorage : public MultiTierStorage { dram_cache_ = new LRUCache(); } - void CopyEmbeddingsFromCPUToGPU( - int total, const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, V* memcpy_buffer_gpu, - se::Stream* compute_stream, - EventMgr* event_mgr, - const DeviceBase::CpuWorkerThreads* worker_threads) override { - auto memcpy_buffer_cpu = TypedAllocator::Allocate(cpu_allocator(), - total * value_len, AllocationAttributes()); - int64* memory_index = new int64[total]; - int64 i = 0; - auto it = copyback_cursor.cbegin(); - { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursor.cend(); ++it, ++i) { - int64 j = *it & 0x0fffffffffffffff; - memory_index[i] = *it; - ValuePtr* gpu_value_ptr = - hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)memcpy_address[j] - sizeof(FixedLengthHeader), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - } - } - - auto do_work = [memory_index, memcpy_address, - memcpy_buffer_cpu, gpu_value_ptrs, - value_len, this] (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - int64 j = memory_index[i] & 0x0fffffffffffffff; - bool destroy_flag = (memory_index[i] >> 63) & 0x1; - memcpy(memcpy_buffer_cpu + i * value_len, - memcpy_address[j], value_len * sizeof(V)); - if (destroy_flag) { - ssd_->DestroyValuePtr(reinterpret_cast*>( - (char *)memcpy_address[j] - sizeof(FixedLengthHeader))); - } - } - }; - Shard(worker_threads->num_threads, worker_threads->workers, total, - 1000, do_work); - - DeviceMemoryBase gpu_dst_ptr( - memcpy_buffer_gpu, total * value_len * sizeof(V)); - compute_stream->ThenMemcpy( - &gpu_dst_ptr, memcpy_buffer_cpu, total * value_len * sizeof(V)); - SyncWithEventMgr(compute_stream, event_mgr); - TypedAllocator::Deallocate( - cpu_allocator(), memcpy_buffer_cpu, total * value_len); - delete[] memory_index; - } Status Remove(K key) override { hbm_->Remove(key); dram_->Remove(key); @@ -311,25 +196,23 @@ class HbmDramSsdStorage : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_dram_key_list; - std::vector*> value_ptr_list, tmp_dram_value_list; TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); HbmValueIterator hbm_value_iter( key_list, value_ptr_list, - emb_config.emb_index, Storage::alloc_len_, - gpu_alloc_); + emb_config.emb_index, value_len, + gpu_alloc_, hbm_feat_desc_); - std::vector*>
tmp_hbm_value_ptrs(value_ptr_list.size()); for (int64 i = 0; i < value_ptr_list.size(); i++) { - ValuePtr* value_ptr = hbm_->CreateValuePtr(value_len); - memcpy((char *)value_ptr->GetPtr(), - (char *)value_ptr_list[i]->GetPtr(), - sizeof(FixedLengthHeader)); - value_ptr->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - value_ptr->SetInitialized(emb_config.primary_emb_index); - tmp_hbm_value_ptrs[i] = value_ptr; - value_ptr_list[i] = value_ptr; + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq( + value_ptr, hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); } TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, @@ -347,17 +230,24 @@ class HbmDramSsdStorage : public MultiTierStorage { { mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, &hbm_value_iter))); } - for (auto it: tmp_hbm_value_ptrs) { - delete it; + for (auto value_ptr: value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); + } } ssd_->Save(tensor_name, prefix, writer, emb_config, @@ -368,7 +258,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status DramToSsdBatchCommit(std::shared_ptr> keys) { MultiTierStorage::ReleaseValuePtrs(dram_value_ptr_out_of_date_, - dram_->alloc_); + dram_feat_desc_); mutex_lock l(*(ssd_->get_mutex())); mutex_lock l1(*(dram_->get_mutex())); @@ -380,7 +270,7 @@ class HbmDramSsdStorage : public MultiTierStorage { k_size = std::min(k_size, DramEvictionSize); K dram_evic_ids[DramEvictionSize]; size_t true_size = dram_cache_->get_evic_ids(dram_evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < true_size; ++i) { if (dram_->Get(dram_evic_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_->Commit(dram_evic_ids[i], value_ptr)); @@ -408,22 +298,31 @@ class HbmDramSsdStorage : public MultiTierStorage { k_size = std::min(k_size, EvictionSize); size_t true_size = MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; std::shared_ptr> keys(new std::vector()); - std::vector*> value_ptrs; + std::vector hbm_value_ptrs; + std::vector dram_value_ptrs; for (int64 i = 0; i < true_size; ++i) { if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { keys->emplace_back(evic_ids[i]); - value_ptrs.emplace_back(value_ptr); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); } } - dram_->BatchCommit(*keys, value_ptrs); - { - //Mutex with main thread - mutex_lock l_mem(memory_pool_mu_); - embedding_mem_pool_->Deallocate(value_ptrs); - } + + CopyEmbeddingFromHbmToDram( + hbm_value_ptrs, + dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + dram_->BatchCommit(*keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); for (auto it : *keys) { TF_CHECK_OK(hbm_->Remove(it)); } @@ 
-435,58 +334,14 @@ class HbmDramSsdStorage : public MultiTierStorage { } } - void CreateEmbeddingMemoryPool( - Allocator* alloc, - int64 value_len, - int64 block_size) override { - embedding_mem_pool_ = - new EmbeddingMemoryPool(alloc, value_len, block_size); - } - - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for (auto it : value_ptr_list) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = it->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - //Mutex with other ImportOps - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < num_of_value_ptrs; i++) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = value_ptr_list[i]->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); } protected: - void SetTotalDims(int64 total_dims) override { - dram_->SetTotalDims(total_dims); - ssd_->SetTotalDims(total_dims); - } - - void CopyToGpuValuePtr( - ValuePtr* gpu_ptr, - ValuePtr* cpu_ptr, - int64 size) { - V* cpu_data_address = cpu_ptr->GetValue(0, 0); - V* gpu_data_address = gpu_ptr->GetValue(0, 0); - cudaMemcpy(gpu_data_address, cpu_data_address, - size * sizeof(V), cudaMemcpyHostToDevice); - memcpy(gpu_ptr->GetPtr(), - cpu_ptr->GetPtr(), - sizeof(FixedLengthHeader)); + int total_dim() override { + return hbm_feat_desc_->total_dim(); } void Restore(const std::string& name_string, @@ -539,6 +394,10 @@ class HbmDramSsdStorage : public MultiTierStorage { (int64*)restore_buff.freq_buffer); return s; } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override {} private: void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { V* memcpy_buffer_cpu = new V[size * value_len]; @@ -551,46 +410,30 @@ class HbmDramSsdStorage : public MultiTierStorage { (V*)gpu_alloc_->AllocateRaw( Allocator::kAllocatorAlignment, size * sizeof(V*)); - ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; - ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; - { - //Mutex with other Import Ops - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < size; i++) { - dram_->Get(ids[i], &cpu_value_ptrs[i]); - gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - gpu_value_ptrs[i]->SetPtr(val_ptr); - memcpy((char *)gpu_value_ptrs[i]->GetPtr(), - (char *)cpu_value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], &gpu_value_ptrs[i]); } } //Split from above for loop for minize the cost of mutex lock //TODO: Speed up with intra parallelism - std::vector*> invalid_value_ptrs; + for (int64 i = 0; i < size; i++) { memcpy(memcpy_buffer_cpu + i * value_len, - cpu_value_ptrs[i]->GetValue(emb_index, - Storage::GetOffset(emb_index)), - value_len * sizeof(V)); - Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); - if (!s.ok()) { - 
invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); - hbm_->Get(ids[i], &gpu_value_ptrs[i]); - } - gpu_value_ptrs[i]->SetInitialized(emb_index); - value_address[i] = gpu_value_ptrs[i]->GetValue( - emb_index, Storage::GetOffset(emb_index)); + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); } cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, size * value_len * sizeof(V), cudaMemcpyHostToDevice); cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), cudaMemcpyHostToDevice); - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate(invalid_value_ptrs); - } int block_dim = 128; void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, (void*)&value_len, (void*)&size}; @@ -611,10 +454,10 @@ class HbmDramSsdStorage : public MultiTierStorage { void BatchGetValuePtrs( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, std::vector>& copyback_cursor_list, - std::vector*>>& ssd_value_ptr_list, + std::vector>& ssd_value_ptr_list, std::vector>* not_found_cursor_list = nullptr) { int num_worker_threads = ctx.worker_threads->num_threads; IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); @@ -688,39 +531,32 @@ class HbmDramSsdStorage : public MultiTierStorage { void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursors, - std::list*>& ssd_value_ptrs, - int64 value_len) { + std::list& ssd_value_ptrs) { int64 total = copyback_cursors.size(); - std::vector*> gpu_value_ptrs(total); + std::vector gpu_value_ptrs(total); std::vector copyback_keys(total); std::vector memory_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = copyback_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursors.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)value_ptr_list[j]->GetPtr(), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - copyback_keys[i] = keys[*it]; - } + int64 i = 0; + auto it = copyback_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion(gpu_value_ptr, + dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; } MultiTierStorage::CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursors, - memory_index, gpu_value_ptrs, value_len); + memory_index, gpu_value_ptrs, hbm_feat_desc_->total_dim(), + hbm_feat_desc_, dram_feat_desc_); //Insert copyback ids to hbm hash table. 
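The `do_insert` lambda below follows an insert-or-adopt pattern: promotion into HBM is speculative, so concurrent lookups may race to create a value for the same key; the first `TryInsert` wins, and losers destroy their candidate and adopt the winner's pointer. A minimal self-contained sketch of that idea — `Store` and `InsertOrAdopt` are illustrative stand-ins, not DeepRec APIs:

```
#include <mutex>
#include <unordered_map>

// Hypothetical stand-in for HbmStorageWithCpuKv: TryInsert fails if the key
// is already present, mirroring hbm_->TryInsert() in this patch.
template <class K>
class Store {
 public:
  bool TryInsert(K key, void* ptr) {
    std::lock_guard<std::mutex> l(mu_);
    return map_.try_emplace(key, ptr).second;  // false if key already exists
  }
  void* Get(K key) {
    std::lock_guard<std::mutex> l(mu_);
    return map_.at(key);
  }
 private:
  std::mutex mu_;
  std::unordered_map<K, void*> map_;
};

// Insert-or-adopt: allocate speculatively, roll back if another thread won.
template <class K>
void* InsertOrAdopt(Store<K>& store, K key, void* candidate,
                    void (*destroy)(void*)) {
  if (store.TryInsert(key, candidate)) return candidate;
  destroy(candidate);     // losing thread frees its speculative allocation
  return store.Get(key);  // and reuses the winner's value pointer
}
```

The same shape recurs in `CreateValuePtrs` and `ImportToHbm` below.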
auto do_insert = [this, copyback_keys, gpu_value_ptrs, @@ -730,12 +566,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status s = hbm_->TryInsert( copyback_keys[i], gpu_value_ptrs[i]); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - gpu_value_ptrs[i]->GetValue(0, 0)); - } - delete gpu_value_ptrs[i]; + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); } } @@ -752,34 +583,31 @@ class HbmDramSsdStorage : public MultiTierStorage { void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& not_found_cursors, int64 value_len) { int64 total = not_found_cursors.size(); if (total > 0) { - std::vector*>> insert_pairs(total); + std::vector> insert_pairs(total); std::vector cursor_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = not_found_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != not_found_cursors.cend(); ++it, ++i) { - int64 j = *it; - cursor_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - value_ptr_list[j] = gpu_value_ptr; - insert_pairs[i].first = keys[j]; - insert_pairs[i].second = value_ptr_list[j]; - } + + int64 i = 0; + auto it = not_found_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; } + hbm_feat_desc_->SetDefaultValues( + keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, ctx.gpu_device); + //Insert copyback ids to hbm hash table. 
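The `SetDefaultValues` call above replaces per-feature initialization: the feature descriptor batches all not-found cursors and fills their embeddings with the slot default in one pass (on GPU, a single kernel launch). A CPU-side approximation of the idea, with `embedding_of` standing in for `FeatureDescriptor::GetEmbedding` — an assumption for illustration, not the real signature:

```
#include <cstdint>
#include <cstring>
#include <list>

// Batched default initialization: every cursor indexes a freshly created
// value_ptr whose embedding region is filled with the slot's default value.
void SetDefaultValuesCpu(const std::list<int64_t>& not_found_cursors,
                         void** value_ptrs,
                         float* (*embedding_of)(void*),
                         const float* default_value, int value_len) {
  for (int64_t cursor : not_found_cursors) {
    std::memcpy(embedding_of(value_ptrs[cursor]), default_value,
                value_len * sizeof(float));
  }
}
```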
auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index] (int64 start, int64 limit) { @@ -787,12 +615,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status s = hbm_->TryInsert( insert_pairs[i].first, insert_pairs[i].second); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - insert_pairs[i].second->GetValue(0, 0)); - } - delete insert_pairs[i].second; + hbm_->DestroyValuePtr(insert_pairs[i].second); hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); } } @@ -804,29 +627,28 @@ class HbmDramSsdStorage : public MultiTierStorage { } void AddCopyBackFlagToValuePtr( - ValuePtr** value_ptr, CopyBackFlag copyback_flag) { + void** value_ptr, CopyBackFlag copyback_flag) { int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; tmp = ((int64)*value_ptr) | tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } - void RemoveCopyBackFlagInValuePtr(ValuePtr** value_ptr) { + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; tmp = ((int64)*value_ptr) & tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } private: HbmStorageWithCpuKv* hbm_ = nullptr; DramStorage* dram_ = nullptr; SsdHashStorage* ssd_ = nullptr; - EmbeddingMemoryPool* embedding_mem_pool_; Allocator* gpu_alloc_; - Allocator* cpu_alloc_; BatchCache* dram_cache_; int64 dram_capacity_; - std::deque*> dram_value_ptr_out_of_date_; - mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ + std::deque dram_value_ptr_out_of_date_; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; const int copyback_flag_offset_bits_ = 60; }; } // embedding diff --git a/tensorflow/core/framework/embedding/hbm_dram_storage.h b/tensorflow/core/framework/embedding/hbm_dram_storage.h index 518c39287e0..d058d95f05b 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_storage.h @@ -17,7 +17,6 @@ limitations under the License. 
#if GOOGLE_CUDA #define EIGEN_USE_GPU -#include "tensorflow/core/framework/embedding/lockless_hash_map_cpu.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/framework/embedding/hbm_storage_iterator.h" @@ -29,9 +28,6 @@ namespace tensorflow { using se::DeviceMemoryBase; using se::Stream; -template -class ValuePtr; - template class CheckpointLoader; @@ -41,27 +37,27 @@ namespace embedding { template class HbmDramStorage : public MultiTierStorage { public: - HbmDramStorage(const StorageConfig& sc, Allocator* gpu_alloc, - Allocator* cpu_alloc, LayoutCreator* lc, - const std::string& name) - : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name) { - hbm_ = new HbmStorageWithCpuKv(sc, gpu_alloc, lc); - StorageConfig storage_config = StorageConfig(); - storage_config.layout_type = LayoutType::NORMAL_CONTIGUOUS; - dram_ = new DramStorage(sc, cpu_alloc, - LayoutCreatorFactory::Create(storage_config), - new LocklessHashMapCPU(gpu_alloc)); + HbmDramStorage(const StorageConfig& sc, + Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), + MultiTierStorage(sc, name) { + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); } ~HbmDramStorage() override { MultiTierStorage::DeleteFromEvictionManager(); delete hbm_; delete dram_; + delete dram_feat_desc_; } TF_DISALLOW_COPY_AND_ASSIGN(HbmDramStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = hbm_->Get(key, value_ptr); if (s.ok()) { return s; @@ -76,9 +72,8 @@ class HbmDramStorage : public MultiTierStorage { void BatchGet(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) override { + void** value_ptr_list, + int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); @@ -87,18 +82,17 @@ class HbmDramStorage : public MultiTierStorage { copyback_cursor_list); CopyEmbeddingsFromDramToHbm( - ctx, keys, value_ptr_list, copyback_cursor_list[0], - value_len); + ctx, keys, value_ptr_list, copyback_cursor_list[0]); } - void Insert(K key, ValuePtr* value_ptr) override { + void Insert(K key, void** value_ptr) override { hbm_->Insert(key, value_ptr); } void BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_fountd_cursor_list) override { @@ -110,115 +104,22 @@ class HbmDramStorage : public MultiTierStorage { copyback_cursor_list, ¬_fountd_cursor_list); CopyEmbeddingsFromDramToHbm( - ctx, keys, value_ptr_list, copyback_cursor_list[0], - value_len); - + ctx, keys, value_ptr_list, copyback_cursor_list[0]); CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], value_len); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) override { if (to_dram) { - dram_->Insert(key, value_ptr, alloc_len); + dram_->CreateAndInsert(key, value_ptr); } else { - hbm_->Insert(key, value_ptr, alloc_len); + hbm_->CreateAndInsert(key, value_ptr); } } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { - Status s = 
hbm_->Get(key, value_ptr);
-    if (s.ok()) {
-      return s;
-    }
-    ValuePtr<V>* gpu_value_ptr = hbm_->CreateValuePtr(size);
-    {
-      mutex_lock l(memory_pool_mu_);
-      gpu_value_ptr->SetPtr(embedding_mem_pool_->Allocate());
-      *value_ptr = gpu_value_ptr;
-    }
-    s = hbm_->TryInsert(key, *value_ptr);
-    if (s.ok()) {
-      return s;
-    }
-    // Insert Failed, key already exist
-    {
-      mutex_lock l(memory_pool_mu_);
-      embedding_mem_pool_->Deallocate((*value_ptr)->GetValue(0, 0));
-    }
-    delete *value_ptr;
-    return hbm_->Get(key, value_ptr);
-  }
-
-  Status GetOrCreate(K key, ValuePtr<V>** value_ptr,
-      size_t size, CopyBackFlag &need_copyback) override {
-    need_copyback = NOT_COPYBACK;
-    Status s = hbm_->Get(key, value_ptr);
-    if (s.ok()) {
-      return s;
-    }
-    s = dram_->Get(key, value_ptr);
-    if (s.ok()) {
-      need_copyback = COPYBACK;
-      return s;
-    }
-
-    hbm_->Insert(key, value_ptr, size);
-    return Status::OK();
-  }
-
-  void CopyEmbeddingsFromCPUToGPU(
-      int total, const K* keys,
-      const std::list<int64>& copyback_cursor,
-      V** memcpy_address, size_t value_len,
-      ValuePtr<V> **gpu_value_ptrs, V* memcpy_buffer_gpu,
-      se::Stream* compute_stream,
-      EventMgr* event_mgr,
-      const DeviceBase::CpuWorkerThreads* worker_threads) override {
-    auto memcpy_buffer_cpu = TypedAllocator::Allocate<V>(cpu_allocator(),
-        total * value_len, AllocationAttributes());
-    int64* memory_index = new int64[total];
-    int64 i = 0;
-    auto it = copyback_cursor.cbegin();
-    {
-      //Mutex with eviction thread
-      mutex_lock l(memory_pool_mu_);
-      for ( ; it != copyback_cursor.cend(); ++it, ++i) {
-        int64 j = *it;
-        memory_index[i] = j;
-        ValuePtr<V>* gpu_value_ptr = hbm_->CreateValuePtr(value_len);
-        V* val_ptr = embedding_mem_pool_->Allocate();
-        bool flag = gpu_value_ptr->SetPtr(val_ptr);
-        if (!flag) {
-          embedding_mem_pool_->Deallocate(val_ptr);
-        }
-        memcpy((char *)gpu_value_ptr->GetPtr(),
-               (char *)memcpy_address[j] - sizeof(FixedLengthHeader),
-               sizeof(FixedLengthHeader));
-        gpu_value_ptrs[i] = gpu_value_ptr;
-      }
-    }
-    //Split from above for loop for minize the cost of mutex lock
-    auto do_work = [memory_index, memcpy_address,
-                    memcpy_buffer_cpu, gpu_value_ptrs,
-                    value_len, this] (int64 start, int64 limit) {
-      for (int i = start; i < limit; i++) {
-        int j = memory_index[i];
-        memcpy(memcpy_buffer_cpu + i * value_len,
-               memcpy_address[j], value_len * sizeof(V));
-      }
-    };
-    Shard(worker_threads->num_threads, worker_threads->workers, total,
-          1000, do_work);
-    DeviceMemoryBase gpu_dst_ptr(
-        memcpy_buffer_gpu, total * value_len * sizeof(V));
-    compute_stream->ThenMemcpy(
-        &gpu_dst_ptr, memcpy_buffer_cpu, total * value_len * sizeof(V));
-    SyncWithEventMgr(compute_stream, event_mgr);
-    TypedAllocator::Deallocate(
-        cpu_allocator(), memcpy_buffer_cpu, total * value_len);
-    delete[] memory_index;
+  Status GetOrCreate(K key, void** value_ptr) override {
+    LOG(FATAL)<<"Storage with HBM only supports batch APIs.";
  }

  Status Remove(K key) override {
@@ -270,25 +171,23 @@
      int64 value_len, V* default_value) override {
    std::vector<K> key_list, tmp_dram_key_list;
-    std::vector<ValuePtr<V>*> value_ptr_list, tmp_dram_value_list;
+    std::vector<void*> value_ptr_list, tmp_dram_value_list;
    TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list));
    hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len);

    HbmValueIterator<K, V> hbm_value_iter(
        key_list, value_ptr_list,
-        emb_config.emb_index, Storage<K, V>::alloc_len_,
-        gpu_alloc_);
-
-    std::vector<ValuePtr<V>*> tmp_hbm_value_ptrs(value_ptr_list.size());
+        emb_config.emb_index, value_len,
+        gpu_alloc_, hbm_feat_desc_);
+
    for (int64 i
= 0; i < value_ptr_list.size(); i++) { - ValuePtr* value_ptr = hbm_->CreateValuePtr(value_len); - memcpy((char *)value_ptr->GetPtr(), - (char *)value_ptr_list[i]->GetPtr(), - sizeof(FixedLengthHeader)); - value_ptr->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - value_ptr->SetInitialized(emb_config.primary_emb_index); - tmp_hbm_value_ptrs[i] = value_ptr; - value_ptr_list[i] = value_ptr; + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq( + value_ptr, hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); } TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, @@ -306,54 +205,26 @@ class HbmDramStorage : public MultiTierStorage { { mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, &hbm_value_iter))); } - for (auto it: tmp_hbm_value_ptrs) { - delete it; - } - return Status::OK(); - } - - void CreateEmbeddingMemoryPool( - Allocator* alloc, - int64 value_len, - int64 block_size) override { - embedding_mem_pool_ = - new EmbeddingMemoryPool(alloc, value_len, block_size); - } - - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for (auto it : value_ptr_list) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = it->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - //Mutex with other ImportOps - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < num_of_value_ptrs; i++) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = value_ptr_list[i]->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); + for (auto value_ptr: value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); } } + return Status::OK(); } void BatchEviction() override { @@ -372,22 +243,31 @@ class HbmDramStorage : public MultiTierStorage { k_size = std::min(k_size, EvictionSize); size_t true_size = MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; std::vector keys; - std::vector*> value_ptrs; + std::vector hbm_value_ptrs; + std::vector dram_value_ptrs; for (int64 i = 0; i < true_size; ++i) { if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { keys.emplace_back(evic_ids[i]); - value_ptrs.emplace_back(value_ptr); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); } } - dram_->BatchCommit(keys, value_ptrs); - { - //Mutex with main thread - mutex_lock l_mem(memory_pool_mu_); - embedding_mem_pool_->Deallocate(value_ptrs); - } + + CopyEmbeddingFromHbmToDram( + hbm_value_ptrs, + dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + 
dram_->BatchCommit(keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); for (auto it : keys) { TF_CHECK_OK(hbm_->Remove(it)); } @@ -430,6 +310,16 @@ class HbmDramStorage : public MultiTierStorage { } } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + MultiTierStorage::Init(); + } + protected: Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, @@ -447,14 +337,14 @@ class HbmDramStorage : public MultiTierStorage { return s; } - void SetTotalDims(int64 total_dims) override { - dram_->SetTotalDims(total_dims); + int total_dim() override { + return hbm_feat_desc_->total_dim(); } private: void BatchGetValuePtrs( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, std::vector>& copyback_cursor_list, std::vector>* not_found_cursor_list = nullptr) { @@ -522,38 +412,31 @@ class HbmDramStorage : public MultiTierStorage { void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, - std::list& copyback_cursors, - int64 value_len) { + void** value_ptr_list, + std::list& copyback_cursors) { int64 total = copyback_cursors.size(); - std::vector*> gpu_value_ptrs(total); + std::vector gpu_value_ptrs(total); std::vector copyback_keys(total); std::vector memory_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = copyback_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursors.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)value_ptr_list[j]->GetPtr(), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - copyback_keys[i] = keys[*it]; - } + int64 i = 0; + auto it = copyback_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion(gpu_value_ptr, + dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; } MultiTierStorage::CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursors, - memory_index, gpu_value_ptrs, value_len); + memory_index, gpu_value_ptrs, hbm_feat_desc_->total_dim(), + hbm_feat_desc_, dram_feat_desc_); //Insert copyback ids to hbm hash table. 
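A note on the freq/version copying in the loop above: with per-tier `FeatureDescriptor`s replacing the old `FixedLengthHeader` memcpy, HBM and DRAM values may lay out their per-feature statistics differently, so migration goes through accessors rather than raw header bytes. A rough sketch of the idea — the `TierDesc` struct is hypothetical; the real descriptor API is richer:

```
#include <cstdint>

// Hypothetical per-tier layout descriptor: each tier knows where its own
// frequency and version counters live inside a value block.
struct TierDesc {
  int freq_offset;
  int version_offset;
  int64_t GetFreq(void* v) const {
    return *reinterpret_cast<int64_t*>(static_cast<char*>(v) + freq_offset);
  }
  void SetFreq(void* v, int64_t f) const {
    *reinterpret_cast<int64_t*>(static_cast<char*>(v) + freq_offset) = f;
  }
  int64_t GetVersion(void* v) const {
    return *reinterpret_cast<int64_t*>(static_cast<char*>(v) + version_offset);
  }
  void UpdateVersion(void* v, int64_t ver) const {
    *reinterpret_cast<int64_t*>(static_cast<char*>(v) + version_offset) = ver;
  }
};

// Migrating a feature between tiers copies statistics field-by-field,
// tolerating different offsets on each side.
inline void MigrateStats(const TierDesc& src, const TierDesc& dst,
                         void* src_ptr, void* dst_ptr) {
  dst.SetFreq(dst_ptr, src.GetFreq(src_ptr));
  dst.UpdateVersion(dst_ptr, src.GetVersion(src_ptr));
}
```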
auto do_insert = [this, copyback_keys, gpu_value_ptrs, @@ -563,12 +446,7 @@ class HbmDramStorage : public MultiTierStorage { Status s = hbm_->TryInsert( copyback_keys[i], gpu_value_ptrs[i]); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - gpu_value_ptrs[i]->GetValue(0, 0)); - } - delete gpu_value_ptrs[i]; + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); } } @@ -580,34 +458,29 @@ class HbmDramStorage : public MultiTierStorage { void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& not_found_cursors, int64 value_len) { int64 total = not_found_cursors.size(); if (total > 0) { - std::vector*>> insert_pairs(total); + std::vector> insert_pairs(total); std::vector cursor_index(total); - //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = not_found_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != not_found_cursors.cend(); ++it, ++i) { - int64 j = *it; - cursor_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - value_ptr_list[j] = gpu_value_ptr; - insert_pairs[i].first = keys[j]; - insert_pairs[i].second = value_ptr_list[j]; - } + //Create Hbm ValuePtrs. + int64 i = 0; + auto it = not_found_cursors.cbegin(); + for ( ; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; } + hbm_feat_desc_->SetDefaultValues( + keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, ctx.gpu_device); + //Insert copyback ids to hbm hash table. 
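Related context for the flag helpers further down (`AddCopyBackFlagToValuePtr`, `RemoveCopyBackFlagInValuePtr`, and the `kDramFlagOffset` trick in `Save`): status bits ride in the unused high bits of the value pointer and are masked off before the pointer is dereferenced. A self-contained sketch, assuming bit 60 as in `copyback_flag_offset_bits_`:

```
#include <cstdint>

constexpr int kFlagBit = 60;  // mirrors copyback_flag_offset_bits_ above

// Tag a pointer with a status flag in its high bits; user-space addresses
// leave these bits free on 64-bit platforms.
inline void* TagPtr(void* p, int64_t flag) {
  return reinterpret_cast<void*>(reinterpret_cast<int64_t>(p) |
                                 (flag << kFlagBit));
}

// Recover the raw pointer by masking the flag bits away.
inline void* UntagPtr(void* p) {
  return reinterpret_cast<void*>(reinterpret_cast<int64_t>(p) &
                                 ((int64_t{1} << kFlagBit) - 1));
}

// Read back the flag carried in the high bits.
inline int64_t PtrFlag(void* p) {
  return reinterpret_cast<int64_t>(p) >> kFlagBit;
}
```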
auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index] (int64 start, int64 limit) { @@ -615,12 +488,7 @@ class HbmDramStorage : public MultiTierStorage { Status s = hbm_->TryInsert( insert_pairs[i].first, insert_pairs[i].second); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - insert_pairs[i].second->GetValue(0, 0)); - } - delete insert_pairs[i].second; + hbm_->DestroyValuePtr(insert_pairs[i].second); hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); } } @@ -632,16 +500,22 @@ class HbmDramStorage : public MultiTierStorage { } void AddCopyBackFlagToValuePtr( - ValuePtr** value_ptr, CopyBackFlag copyback_flag) { + void** value_ptr, CopyBackFlag copyback_flag) { int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; tmp = ((int64)*value_ptr) | tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } - void RemoveCopyBackFlagInValuePtr(ValuePtr** value_ptr) { + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; tmp = ((int64)*value_ptr) & tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); + } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { @@ -655,45 +529,30 @@ class HbmDramStorage : public MultiTierStorage { (V*)gpu_alloc_->AllocateRaw( Allocator::kAllocatorAlignment, size * sizeof(V*)); - ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; - ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; - { - //Mutex with other Import Ops - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < size; i++) { - dram_->Get(ids[i], &cpu_value_ptrs[i]); - gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - gpu_value_ptrs[i]->SetPtr(val_ptr); - memcpy((char *)gpu_value_ptrs[i]->GetPtr(), - (char *)cpu_value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], &gpu_value_ptrs[i]); } } //Split from above for loop for minize the cost of mutex lock //TODO: Speed up with intra parallelism - std::vector*> invalid_value_ptrs; + for (int64 i = 0; i < size; i++) { memcpy(memcpy_buffer_cpu + i * value_len, - cpu_value_ptrs[i]->GetValue(emb_index, - Storage::GetOffset(emb_index)), value_len * sizeof(V)); - Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); - if (!s.ok()) { - invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); - hbm_->Get(ids[i], &gpu_value_ptrs[i]); - } - gpu_value_ptrs[i]->SetInitialized(emb_index); - value_address[i] = gpu_value_ptrs[i]->GetValue( - emb_index, Storage::GetOffset(emb_index)); + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); } cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, size * value_len * sizeof(V), cudaMemcpyHostToDevice); cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), cudaMemcpyHostToDevice); - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate(invalid_value_ptrs); - } 
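On the upload path in `ImportToHbm` here: restored embeddings are first gathered into one contiguous host buffer, shipped to the device with a single `cudaMemcpy`, then scattered to their per-feature addresses by the `BatchUnpack` kernel launched below. A host-side mirror of that scatter step, assuming float values (an illustrative sketch, not the kernel itself):

```
#include <cstdint>
#include <cstring>

// Host-side analogue of the BatchUnpack scatter: src is one packed buffer,
// dst_addrs are the final per-feature embedding addresses. On device this
// loop becomes a single kernel launch over n * value_len elements.
void ScatterEmbeddings(float** dst_addrs, const float* src,
                       int value_len, int64_t n) {
  for (int64_t i = 0; i < n; ++i) {
    std::memcpy(dst_addrs[i], src + i * value_len, value_len * sizeof(float));
  }
}
```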
int block_dim = 128; void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, (void*)&value_len, (void*)&size}; @@ -714,9 +573,9 @@ class HbmDramStorage : public MultiTierStorage { private: HbmStorageWithCpuKv* hbm_ = nullptr; DramStorage* dram_ = nullptr; - EmbeddingMemoryPool* embedding_mem_pool_ = nullptr; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; Allocator* gpu_alloc_; - mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ const int copyback_flag_offset_bits_ = 60; }; } // embedding diff --git a/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h b/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h new file mode 100644 index 00000000000..a3603a61550 --- /dev/null +++ b/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h @@ -0,0 +1,122 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/embedding_memory_pool.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +namespace embedding { +template +class NormalFeatureDescriptorImpl; + +template +class HbmMultiTierFeatureDescriptorImpl + : public FeatureDescriptorImpl { + public: + HbmMultiTierFeatureDescriptorImpl( + Allocator* alloc, int64 slot_num, + bool need_record_freq, + bool need_record_version) + : dram_alloc_bytes_(sizeof(V*)), + hbm_alloc_(alloc), + dram_alloc_(ev_allocator()), + FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&dram_alloc_bytes_); + } + + ~HbmMultiTierFeatureDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + bool is_compute_alloc_bytes = + FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + if (is_compute_alloc_bytes) { + FeatureDescriptorImpl::ComputeAllocBytes(&hbm_alloc_bytes_); + embedding_mem_pool_.reset( + new EmbeddingMemoryPool(hbm_alloc_, + hbm_alloc_bytes_ / sizeof(V), + 1024 * 1024 * 64)); + } + return is_compute_alloc_bytes; + } + + V* GetEmbedding(void *val, int emb_index) override { + return *((V**)val) + + FeatureDescriptorImpl::slot_infos_[emb_index].embedding_offset; + } + + void* Allocate() override { + void* val = dram_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, dram_alloc_bytes_); + mutex_lock l(memory_pool_mu_); + *((V**)val) = embedding_mem_pool_->Allocate(); + FeatureDescriptorImpl::InitFreqAndVersion(val); + return val; + } + + void Deallocate(void* val) override { + mutex_lock l(memory_pool_mu_); + 
embedding_mem_pool_->Deallocate(*((V**)val));
+    dram_alloc_->DeallocateRaw(val);
+  }
+
+  void Deallocate(const std::vector<void*>& value_ptrs) override {
+    mutex_lock l(memory_pool_mu_);
+    for (auto ptr: value_ptrs) {
+      embedding_mem_pool_->Deallocate(*((V**)ptr));
+      dram_alloc_->DeallocateRaw(ptr);
+    }
+  }
+  void SetDefaultValue(void* val, int64 key) override {
+    LOG(FATAL)<<"Can't call SetDefaultValue(void* val, int64 key)"
+              <<" in HbmMultiTierFeatureDescriptor.";
+  }
+
+  void SetAllocator(Allocator* alloc) override {
+    hbm_alloc_ = alloc;
+  }
+
+  template <class K>
+  void SetDefaultValues(
+      const K* keys,
+      const std::list<int64>& init_cursor,
+      void** value_ptrs,
+      se::Stream* compute_stream,
+      EventMgr* event_mgr,
+      const Eigen::GpuDevice& gpu_device);
+
+  int data_bytes() override {
+    return dram_alloc_bytes_;
+  }
+ public:
+  friend class NormalFeatureDescriptorImpl<V>;
+ protected:
+  int dram_alloc_bytes_;
+  int hbm_alloc_bytes_ = 0;
+  mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_
+  Allocator* hbm_alloc_;
+  Allocator* dram_alloc_;
+  std::unique_ptr<EmbeddingMemoryPool<V>> embedding_mem_pool_;
+};
+} //namespace embedding
+} //namespace tensorflow
+
+#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_
diff --git a/tensorflow/core/framework/embedding/hbm_storage_iterator.h b/tensorflow/core/framework/embedding/hbm_storage_iterator.h
index 36d331e74aa..31dc4459a13 100644
--- a/tensorflow/core/framework/embedding/hbm_storage_iterator.h
+++ b/tensorflow/core/framework/embedding/hbm_storage_iterator.h
@@ -28,10 +28,11 @@ class HbmValueIterator: public ValueIterator<V> {
 public:
  HbmValueIterator(
      const std::vector<K>& key_list,
-      const std::vector<ValuePtr<V>*>& value_ptr_list,
+      const std::vector<void*>& value_ptr_list,
      int64 emb_index, int64 value_len,
-      Allocator* alloc)
+      Allocator* alloc,
+      FeatureDescriptor<V>* feat_desc)
      : value_len_(value_len),
        alloc_(alloc) {
    int64 emb_offset = value_len_ * emb_index;
@@ -40,7 +41,7 @@ class HbmValueIterator: public ValueIterator<V> {
      for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) {
        if (key_list[i] % kSavedPartitionNum == part_id) {
          value_parts_vec[part_id].emplace_back(
-              value_ptr_list[i]->GetValue(emb_index, emb_offset));
+              feat_desc->GetEmbedding(value_ptr_list[i], emb_index));
          break;
        }
      }
diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h
index 5d1f20b581a..3659187c825 100644
--- a/tensorflow/core/framework/embedding/kv_interface.h
+++ b/tensorflow/core/framework/embedding/kv_interface.h
@@ -17,6 +17,7 @@ limitations under the License.
#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ #include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { @@ -24,9 +25,6 @@ namespace { const char* kInferenceMode = "INFERENCE_MODE"; } -template -class ValuePtr; - template class GPUHashTable; @@ -43,19 +41,19 @@ template class KVInterface { public: virtual ~KVInterface() {} - virtual Status Lookup(K key, ValuePtr** value_ptr) = 0; + virtual Status Lookup(K key, void** value_ptr) = 0; virtual Status Contains(K key) = 0; - virtual Status Insert(K key, const ValuePtr* value_ptr) = 0; + virtual Status Insert(K key, const void* value_ptr) = 0; virtual Status Remove(K key) = 0; virtual Status BatchLookup(const K* keys, size_t size, - ValuePtr** value_ptrs) { + void** value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookup in KVInterface."); } // KV Batch Insert virtual Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchInsert in KVInterface."); } @@ -66,27 +64,30 @@ class KVInterface { } virtual Status BatchLookupOrCreate(const K* keys, size_t size, - ValuePtr** value_ptrs) { + void** value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookupOrInsert in KVInterface."); } + virtual void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) { + LOG(FATAL)<<"Unimplemented for UpdateValuePtr in KVInterface."; + } + virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) = 0; + const std::vector& value_ptrs) = 0; // KV Size virtual int64 Size() const = 0; - virtual void SetTotalDims(int total_dims) {} - - virtual void FreeValuePtr(ValuePtr* value_ptr) {} + virtual void FreeValuePtr(void* value_ptr) {} - virtual Status Commit(K key, const ValuePtr* value_ptr) { + virtual Status Commit(K key, const void* value_ptr) { return Status::OK(); } virtual Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) = 0; + std::vector* value_ptr_list) = 0; virtual std::string DebugString() const = 0; diff --git a/tensorflow/core/framework/embedding/l2weight_shrink_policy.h b/tensorflow/core/framework/embedding/l2weight_shrink_policy.h index 2af6b58f94b..9b0ea8aba3f 100644 --- a/tensorflow/core/framework/embedding/l2weight_shrink_policy.h +++ b/tensorflow/core/framework/embedding/l2weight_shrink_policy.h @@ -19,28 +19,23 @@ limitations under the License. 
namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class L2WeightShrinkPolicy : public ShrinkPolicy { public: L2WeightShrinkPolicy(float l2_weight_threshold, int64 index, - int64 offset, - Allocator* alloc, + FeatureDescriptor* feat_desc, KVInterface* kv) : index_(index), - offset_(offset), kv_(kv), l2_weight_threshold_(l2_weight_threshold), - ShrinkPolicy(alloc) {} + ShrinkPolicy(feat_desc) {} TF_DISALLOW_COPY_AND_ASSIGN(L2WeightShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override { ShrinkPolicy::ReleaseValuePtrs(); FilterToDelete(shrink_args.value_len, @@ -50,9 +45,9 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { private: void FilterToDelete(int64 value_len, std::vector& key_list, - std::vector*>& value_list) { + std::vector& value_list) { for (int64 i = 0; i < key_list.size(); ++i) { - V* val = value_list[i]->GetValue(index_, offset_); + V* val = ShrinkPolicy::feat_desc_->GetEmbedding(value_list[i], index_); if (val != nullptr) { V l2_weight = (V)0.0; for (int64 j = 0; j < value_len; j++) { @@ -61,7 +56,7 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { l2_weight *= (V)0.5; if (l2_weight < (V)l2_weight_threshold_) { kv_->Remove(key_list[i]); - value_list[i] = (ValuePtr*)ValuePtrStatus::IS_DELETED; + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; ShrinkPolicy::EmplacePointer(value_list[i]); } } @@ -70,7 +65,7 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { private: int64 index_; - int64 offset_; + //int64 offset_; KVInterface* kv_; float l2_weight_threshold_; }; diff --git a/tensorflow/core/framework/embedding/layout_creator.h b/tensorflow/core/framework/embedding/layout_creator.h deleted file mode 100644 index 07d50451bf0..00000000000 --- a/tensorflow/core/framework/embedding/layout_creator.h +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright 2022 The DeepRec Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-======================================================================*/ -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ - -#include "tensorflow/core/framework/embedding/cache.h" -#include "tensorflow/core/framework/embedding/config.pb.h" -#include "tensorflow/core/framework/embedding/storage_config.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { -template -class ValuePtr; - -namespace embedding { -template -class LayoutCreator { - public: - virtual ValuePtr* Create(Allocator* alloc, size_t size) = 0; -}; - -template -class NormalLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalValuePtr(alloc, size); - } -}; - -template -class LightLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new LightValuePtr(alloc, size); - } -}; - -template -class NormalContiguousLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalContiguousValuePtr(alloc, size); - } -}; - -template -class NormalContiguousGPULayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalGPUValuePtr(alloc, size); - } -}; - -template -class CompactLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new CompactValuePtr(alloc, size); - } -}; - -class LayoutCreatorFactory { - public: - template - static LayoutCreator* Create(const StorageConfig& sc) { - switch (sc.layout_type) { - case LayoutType::NORMAL: - static NormalLayoutCreator normal_creator; - return &normal_creator; - case LayoutType::LIGHT: - static LightLayoutCreator light_creator; - return &light_creator; - case LayoutType::NORMAL_CONTIGUOUS: - static NormalContiguousLayoutCreator normal_contiguous_creator; - return &normal_contiguous_creator; - case LayoutType::NORMAL_CONTIGUOUS_GPU: - static NormalContiguousGPULayoutCreator - normal_contiguous_gpu_creator; - return &normal_contiguous_gpu_creator; - case LayoutType::COMPACT: - static CompactLayoutCreator compact_creator; - return &compact_creator; - default: - static NormalLayoutCreator default_creator; - return &default_creator; - } - } -}; -} // embedding -} // tensorflow - -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ diff --git a/tensorflow/core/framework/embedding/leveldb_kv.h b/tensorflow/core/framework/embedding/leveldb_kv.h index 8ea1fa63fc2..e488ab3776d 100644 --- a/tensorflow/core/framework/embedding/leveldb_kv.h +++ b/tensorflow/core/framework/embedding/leveldb_kv.h @@ -17,9 +17,7 @@ limitations under the License. 
#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LEVELDB_KV_H_ #include "tensorflow/core/lib/io/path.h" - #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/lib/core/status.h" #include "leveldb/db.h" @@ -35,9 +33,6 @@ using leveldb::WriteBatch; using leveldb::WriteOptions; namespace tensorflow { -template -class ValuePtr; - namespace embedding { template @@ -76,28 +71,21 @@ class SizeCounter { template class LevelDBKV : public KVInterface { public: - LevelDBKV(std::string path) { + LevelDBKV(std::string path, FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { path_ = io::JoinPath(path, "level_db_" + std::to_string(Env::Default()->NowMicros()));; options_.create_if_missing = true; leveldb::Status s = leveldb::DB::Open(options_, path_, &db_); CHECK(s.ok()); counter_ = new SizeCounter(8); - new_value_ptr_fn_ = [] (size_t size) { - return new NormalContiguousValuePtr(ev_allocator(), size); - }; - total_dims_ = 0; - } - - void SetTotalDims(int total_dims) { - total_dims_ = total_dims; } ~LevelDBKV() override { delete db_; } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { std::string val_str; leveldb::Slice db_key((char*)(&key), sizeof(void*)); leveldb::ReadOptions options; @@ -106,8 +94,8 @@ class LevelDBKV : public KVInterface { return errors::NotFound( "Unable to find Key: ", key, " in LevelDB."); } else { - ValuePtr* val = new_value_ptr_fn_(total_dims_); - memcpy((int64 *)(val->GetPtr()), &val_str[0], val_str.length()); + void* val = feat_desc_->Allocate(); + memcpy((int64 *)val, &val_str[0], val_str.length()); *value_ptr = val; return Status::OK(); } @@ -126,22 +114,22 @@ class LevelDBKV : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { counter_->add(key, 1); return Status::OK(); } Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return BatchCommit(keys, value_ptrs); } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { WriteBatch batch; for (int i = 0; i < keys.size(); i++) { - std::string value_res((char*)value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader) + total_dims_ * sizeof(V)); + std::string value_res((char*)value_ptrs[i], + feat_desc_->data_bytes()); leveldb::Slice db_key((char*)(&keys[i]), sizeof(void*)); batch.Put(db_key, value_res); delete value_ptrs[i]; @@ -150,9 +138,9 @@ class LevelDBKV : public KVInterface { return Status::OK(); } - Status Commit(K key, const ValuePtr* value_ptr) override { - std::string value_res((char*)value_ptr->GetPtr(), - sizeof(FixedLengthHeader) + total_dims_ * sizeof(V)); + Status Commit(K key, const void* value_ptr) override { + std::string value_res((char*)value_ptr, + feat_desc_->data_bytes()); leveldb::Slice db_key((char*)(&key), sizeof(void*)); leveldb::Status s = db_->Put(WriteOptions(), db_key, value_res); if (!s.ok()){ @@ -176,22 +164,32 @@ class LevelDBKV : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { ReadOptions options; options.snapshot = db_->GetSnapshot(); leveldb::Iterator* it = db_->NewIterator(options); + void* dram_value_ptr = feat_desc_->Allocate(); for (it->SeekToFirst(); it->Valid(); it->Next()) { K key; 
memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); key_list->emplace_back(key); - ValuePtr* value_ptr = - new NormalGPUValuePtr(ev_allocator(), 1); - memcpy((char *)value_ptr->GetPtr(), + FeatureDescriptor hbm_feat_desc( + 1, 1, ev_allocator()/*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc.data_bytes()); + memcpy(dram_value_ptr, it->value().ToString().data(), - sizeof(FixedLengthHeader)); + feat_desc_->data_bytes()); + hbm_feat_desc.SetFreq( + value_ptr, feat_desc_->GetFreq(dram_value_ptr)); + hbm_feat_desc.UpdateVersion( + value_ptr, feat_desc_->GetVersion(dram_value_ptr)); value_ptr_list->emplace_back(value_ptr); } delete it; + feat_desc_->Deallocate(dram_value_ptr); return Status::OK(); } @@ -199,8 +197,8 @@ class LevelDBKV : public KVInterface { return counter_->size(); } - void FreeValuePtr(ValuePtr* value_ptr) override { - delete value_ptr; + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); } std::string DebugString() const override{ @@ -212,8 +210,7 @@ class LevelDBKV : public KVInterface { SizeCounter* counter_; Options options_; std::string path_; - std::function*(size_t)> new_value_ptr_fn_; - int total_dims_; + FeatureDescriptor* feat_desc_; }; template @@ -223,10 +220,12 @@ class DBValueIterator: public ValueIterator { const std::vector& key_list, int64 emb_index, int64 value_len, - LevelDBKV* leveldb_kv) + LevelDBKV* leveldb_kv, + FeatureDescriptor* feat_desc) : value_len_(value_len), emb_index_(emb_index), - leveldb_kv_(leveldb_kv) { + leveldb_kv_(leveldb_kv), + feat_desc_(feat_desc) { int64 emb_offset = value_len_ * emb_index; std::vector> keys_parts_vec(kSavedPartitionNum); for (int64 i = 0; i < key_list.size(); i++) { @@ -251,8 +250,7 @@ class DBValueIterator: public ValueIterator { V* Next() { if (value_ptr_ != nullptr) { - value_ptr_->Destroy(ev_allocator()); - delete value_ptr_; + feat_desc_->Deallocate(value_ptr_); } K key = *(keys_iter_++); @@ -260,16 +258,17 @@ class DBValueIterator: public ValueIterator { if (!s.ok()) { LOG(FATAL)<<"Not found value in LevelDB when Save."; } - return value_ptr_->GetValue(emb_index_, value_len_ * emb_index_); + return feat_desc_->GetEmbedding(value_ptr_, emb_index_); } private: int64 value_len_; int64 emb_index_; LevelDBKV* leveldb_kv_; + FeatureDescriptor* feat_desc_; std::list keys_; typename std::list::const_iterator keys_iter_; - ValuePtr* value_ptr_ = nullptr; + void* value_ptr_ = nullptr; int64 key_cursor_ = 0; }; diff --git a/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h b/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h deleted file mode 100644 index 8dcea81d4a1..00000000000 --- a/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright 2022 The DeepRec Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-=======================================================================*/ - -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ -#if GOOGLE_CUDA -#define EIGEN_USE_GPU - -#include "sparsehash/dense_hash_map_lockless" -#include "tensorflow/core/framework/embedding/batch.h" -#include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/platform/stream_executor.h" - -namespace tensorflow { -using se::DeviceMemoryBase; -using se::Stream; - -namespace embedding { - -template -class LocklessHashMapCPU : public KVInterface { - public: - LocklessHashMapCPU(Allocator* gpu_alloc): gpu_alloc_(gpu_alloc) { - hash_map_.max_load_factor(0.8); - hash_map_.set_empty_key_and_value(EMPTY_KEY_, nullptr); - hash_map_.set_counternum(16); - hash_map_.set_deleted_key(DELETED_KEY_); - cudaEventCreate(&is_finish_); - } - - ~LocklessHashMapCPU() override { - cudaEventDestroy(is_finish_); - } - - Status Lookup(K key, ValuePtr** value_ptr) override { - auto iter = hash_map_.find_wait_free(key); - if (iter.first == EMPTY_KEY_) { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } else { - *value_ptr = iter.second; - return Status::OK(); - } - } - - Status Contains(K key) override { - auto iter = hash_map_.find_wait_free(key); - if (iter.first == EMPTY_KEY_) { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } else { - return Status::OK(); - } - } - - Status Insert(K key, const ValuePtr* value_ptr) override { - auto iter = hash_map_.insert_lockless( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); - // insert fail, exist key - if ((*(iter.first)).second != value_ptr){ - return errors::AlreadyExists( - "already exists Key: ", key, " in LocklessHashMap."); - } else { - return Status::OK(); - } - } - - // Other Method - int64 Size() const override { - return hash_map_.size_lockless(); - } - - // Remove KV - Status Remove(K key) override { - if (hash_map_.erase_lockless(key)) { - return Status::OK(); - } else { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } - } - - void SetTotalDims(int total_dims) override { - total_dims_ = total_dims; - } - - void AppendToValuePtrQueue(ValuePtr* old_value_ptr) { - //A parameter that can be adjusted in the future - if (value_ptr_out_of_date_.size() > CAP_INVALID_VALUEPTR) { - ValuePtr* value_ptr = value_ptr_out_of_date_.front(); - delete value_ptr; - value_ptr_out_of_date_.pop_front(); - } - value_ptr_out_of_date_.emplace_back(old_value_ptr); - } - - Status Commit(K key, const ValuePtr* value_ptr) override { - ValuePtr* cpu_value_ptr = - new NormalContiguousValuePtr(ev_allocator(), total_dims_); - cudaMemcpy((char *)cpu_value_ptr->GetPtr() + sizeof(FixedLengthHeader), - *(char **)((char*)value_ptr->GetPtr() + sizeof(FixedLengthHeader)), - total_dims_ * sizeof(V), - cudaMemcpyDeviceToHost); - memcpy((char *)cpu_value_ptr->GetPtr(), - (char*)value_ptr->GetPtr(), sizeof(FixedLengthHeader)); - auto iter = hash_map_.insert_lockless(std::move( - std::pair*>(key, - const_cast*>(cpu_value_ptr)))); - if ((*(iter.first)).second != cpu_value_ptr) { - AppendToValuePtrQueue((*(iter.first)).second); - (*(iter.first)).second = cpu_value_ptr; - } - return Status::OK(); - } - - Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { - 
int batch_size = keys.size(); - Allocator* cpu_alloc = cpu_allocator(); - V** value_address = (V **)cpu_alloc->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V*) * batch_size); - V** dev_value_address; - V* batch_data_place; - V* dev_batch_data_place; - dev_value_address = (V**)gpu_alloc_->AllocateRaw( - Allocator::kAllocatorAlignment, batch_size * sizeof(V*)); - dev_batch_data_place = (V*)gpu_alloc_->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dims_); - batch_data_place = (V *)cpu_alloc->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dims_); - - // Copy GPU addresses V* - for(int i = 0;i < batch_size;++i) { - value_address[i] = - *(V **)((char*)value_ptrs[i]->GetPtr() + sizeof(FixedLengthHeader)); - } - - cudaMemcpyAsync(dev_value_address, value_address, - sizeof(V*) * batch_size, - cudaMemcpyHostToDevice); - - // Launch Kernel,Copy data to continuous place - int block_dim = 128; - void* args[] = { (void*)&dev_value_address, - (void*)&dev_batch_data_place, (void*)&total_dims_, - (void*)&batch_size}; - - cudaLaunchKernel((void *)BatchCopy, - (batch_size * total_dims_ + block_dim - 1) / block_dim, - block_dim, args, 0, NULL); - - cudaMemcpyAsync(batch_data_place, dev_batch_data_place, - sizeof(V) * batch_size * total_dims_, - cudaMemcpyDeviceToHost); - - cudaEventRecord(is_finish_); - cudaEventSynchronize(is_finish_); - - // Copy data to ValuePtrs in memory;Insert it into hashmap - for(int i = 0; i < batch_size; ++i) { - ValuePtr* cpu_value_ptr = - new NormalContiguousValuePtr(ev_allocator(), total_dims_); - memcpy((char *)cpu_value_ptr->GetPtr() + sizeof(FixedLengthHeader), - &batch_data_place[i * total_dims_], total_dims_ * sizeof(V)); - memcpy((char *)cpu_value_ptr->GetPtr(), - (char *)value_ptrs[i]->GetPtr(), sizeof(FixedLengthHeader)); - auto iter = hash_map_.insert_lockless(std::move( - std::pair*>(keys[i], - const_cast*>(cpu_value_ptr)))); - if ((*(iter.first)).second != cpu_value_ptr) { - AppendToValuePtrQueue((*(iter.first)).second); - (*(iter.first)).second = cpu_value_ptr; - } - } - - gpu_alloc_->DeallocateRaw(dev_value_address); - gpu_alloc_->DeallocateRaw(dev_batch_data_place); - - cpu_alloc->DeallocateRaw(batch_data_place); - cpu_alloc->DeallocateRaw(value_address); - - return Status::OK(); - } - - Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { - std::pair*> *hash_map_dump; - int64 bucket_count; - auto it = hash_map_.GetSnapshot(); - hash_map_dump = it.first; - bucket_count = it.second; - for (int64 j = 0; j < bucket_count; j++) { - if (hash_map_dump[j].first != EMPTY_KEY_ && - hash_map_dump[j].first != DELETED_KEY_) { - key_list->emplace_back(hash_map_dump[j].first); - value_ptr_list->emplace_back(hash_map_dump[j].second); - } - } - free(hash_map_dump); - return Status::OK(); - } - - std::string DebugString() const override { - LOG(INFO) << "map info size:" << Size() - << "map info bucket_count:" << hash_map_.bucket_count() - << "map info load_factor:" << hash_map_.load_factor() - << "map info max_load_factor:" << hash_map_.max_load_factor() - << "map info min_load_factor:" << hash_map_.min_load_factor(); - return ""; - } - - private: - typedef google::dense_hash_map_lockless* > - LockLessHashMap; - static const int EMPTY_KEY_ = -1; - static const int DELETED_KEY_ = -2; - static constexpr int CAP_INVALID_VALUEPTR = 200000; - LockLessHashMap hash_map_; - std::deque*> value_ptr_out_of_date_; - int total_dims_; - Allocator* gpu_alloc_; - cudaEvent_t is_finish_; -}; -} 
// namespace embedding -} // namespace tensorflow - -#endif //GOOGLE_CUDA -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc b/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc index de275183d22..9745ab5fcc3 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc +++ b/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc @@ -15,8 +15,7 @@ limitations under the License. #if GOOGLE_CUDA #define EIGEN_USE_GPU #include "tensorflow/core/framework/embedding/multi_tier_storage.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" -#include "tensorflow/core/framework/embedding/batch.h" +#include "tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/op_kernel.h" @@ -44,11 +43,13 @@ template void MultiTierStorage::CopyEmbeddingsFromDramToHbm( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursor, const std::vector& memory_index, - const std::vector*>& gpu_value_ptrs, - int value_len) { + const std::vector& gpu_value_ptrs, + int value_len, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc) { if (copyback_cursor.size() > 0) { int total = copyback_cursor.size(); //Alocate memcpy buffer on CPU and GPU. @@ -64,11 +65,13 @@ void MultiTierStorage::CopyEmbeddingsFromDramToHbm( auto do_work = [memory_index, memcpy_buffer_cpu, value_ptr_list, gpu_value_ptrs, + dram_feat_desc, value_len, this] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { int j = memory_index[i]; memcpy(memcpy_buffer_cpu + i * value_len, - value_ptr_list[j]->GetValue(0, 0), value_len * sizeof(V)); + dram_feat_desc->GetEmbedding(value_ptr_list[j], 0), + value_len * sizeof(V)); value_ptr_list[j] = gpu_value_ptrs[i]; } }; @@ -96,8 +99,7 @@ void MultiTierStorage::CopyEmbeddingsFromDramToHbm( for (; it != copyback_cursor.cend(); ++it, ++i) { // Get the cursor int64 cursor = *it; - gpu_value_ptrs[i]->SetInitialized(0); - value_address[i] = gpu_value_ptrs[i]->GetValue(0, 0); + value_address[i] = hbm_feat_desc->GetEmbedding(gpu_value_ptrs[i], 0); } DeviceMemoryBase gpu_addr_dst_ptr(dev_value_address, total * sizeof(V*)); compute_stream->ThenMemcpy(&gpu_addr_dst_ptr, value_address, total * sizeof(V*)); @@ -119,16 +121,71 @@ void MultiTierStorage::CopyEmbeddingsFromDramToHbm( } #define REGISTER_KERNELS(ktype, vtype) \ template void MultiTierStorage::CopyEmbeddingsFromDramToHbm( \ - const EmbeddingVarContext&, const ktype*, ValuePtr**,\ + const EmbeddingVarContext&, const ktype*, void**,\ std::list&, const std::vector&,\ - const std::vector*>&, int); + const std::vector&, int, FeatureDescriptor*,\ + FeatureDescriptor*); #define REGISTER_KERNELS_ALL(type) \ REGISTER_KERNELS(int32, type); \ REGISTER_KERNELS(int64, type) #define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) #undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +template +void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( + const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + if (init_cursor.size() > 0) { + int64 total = init_cursor.size(); + TValue** value_address = nullptr; + value_address = TypedAllocator::Allocate(cpu_allocator(), 
total * 2, + AllocationAttributes()); + TValue** default_value_address = value_address + total; + TValue** dev_value_address = nullptr; + dev_value_address = + TypedAllocator::Allocate(hbm_alloc_, total * 2, AllocationAttributes()); + TValue** dev_default_value_address = dev_value_address + total; + for (int emb_index = 0; emb_index < FeatureDescriptorImpl::slot_infos_.size(); emb_index++) { + int64 i = 0; + auto it = init_cursor.cbegin(); + for (; it != init_cursor.cend(); ++it, ++i) { + value_address[i] = GetEmbedding(value_ptrs[*it], emb_index); + default_value_address[i] = + FeatureDescriptorImpl::GetDefaultValuePtr(emb_index, keys[i]); + } + DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * 2 * sizeof(TValue*)); + compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, + total * 2 * sizeof(TValue*)); + int block_dim = 128; + int value_len = FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len; + TF_CHECK_OK(GpuLaunchKernel( + embedding::CopyEmbedding, + (total * value_len + block_dim - 1) / block_dim, + block_dim, 0, gpu_device.stream(), dev_default_value_address, + dev_value_address, value_len, total)); + SyncWithEventMgr(compute_stream, event_mgr); + } + + TypedAllocator::Deallocate(hbm_alloc_, dev_value_address, total * 2); + TypedAllocator::Deallocate(cpu_allocator(), value_address, total * 2); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( \ + const ktype*, const std::list&, void**,\ + se::Stream*, EventMgr*, const Eigen::GpuDevice& gpu_device); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU #undef REGISTER_KERNELS_ALL #undef REGISTER_KERNELS } // namespace embedding diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index 8239d109e64..7955322aca6 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -31,10 +31,11 @@ limitations under the License. #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/core/status.h" -namespace tensorflow { -template -class ValuePtr; +#if GOOGLE_CUDA +#include "tensorflow/core/framework/embedding/batch.h" +#endif +namespace tensorflow { template class EmbeddingVar; @@ -54,22 +55,10 @@ class MultiTierStorage : public Storage { TF_DISALLOW_COPY_AND_ASSIGN(MultiTierStorage); - void SetAllocLen(int64 value_len, int slot_num) override { - while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. 
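`CopyEmbeddingsFromDramToHbm` and the new `SetDefaultValues` above share one batching idea: instead of issuing one small `cudaMemcpy` per row, the host gathers the per-row device addresses into a pointer array, ships that array to the GPU in a single copy, and one kernel dereferences it to move or fill every row at once. A self-contained sketch of the pattern, with a hypothetical `GatherRows` kernel standing in for DeepRec's `BatchCopy`/`CopyEmbedding` kernels and a synchronous wait in place of the event-based one:

```
#include <cuda_runtime.h>
#include <vector>

// Gather n scattered device rows of length dim into one contiguous buffer.
template <typename V>
__global__ void GatherRows(V** src_rows, V* dst, int dim, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n * dim) dst[i] = src_rows[i / dim][i % dim];
}

// One H2D pointer-array copy + one kernel + one D2H copy replaces
// n small memcpys, which is what makes the batched path cheap.
template <typename V>
void BatchCopyToHost(const std::vector<V*>& dev_rows, V* host_out, int dim) {
  int n = static_cast<int>(dev_rows.size());
  V** d_rows = nullptr;
  V* d_staging = nullptr;
  cudaMalloc(&d_rows, n * sizeof(V*));
  cudaMalloc(&d_staging, n * dim * sizeof(V));
  cudaMemcpyAsync(d_rows, dev_rows.data(), n * sizeof(V*),
                  cudaMemcpyHostToDevice);
  int block = 128;
  GatherRows<<<(n * dim + block - 1) / block, block>>>(d_rows, d_staging,
                                                       dim, n);
  cudaMemcpyAsync(host_out, d_staging, n * dim * sizeof(V),
                  cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();  // the patch waits on a cudaEvent_t instead
  cudaFree(d_rows);
  cudaFree(d_staging);
}
```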
- Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); - - cache_capacity_ = Storage::storage_config_.size[0] - / (Storage::total_dims_ * sizeof(V)); - ready_eviction_ = true; - } - Storage::flag_.clear(std::memory_order_release); + virtual void Init() override { + cache_capacity_ = Storage::storage_config_.size[0] + / (total_dim() * sizeof(V)); + ready_eviction_ = true; } int64 CacheSize() const override { @@ -90,13 +79,13 @@ class MultiTierStorage : public Storage { } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { LOG(FATAL)<<"BatchCommit isn't supported by MultiTierStorage."; return Status::OK(); } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { LOG(FATAL)<<"Can't get snapshot of MultiTierStorage."; } @@ -104,7 +93,7 @@ class MultiTierStorage : public Storage { int total, const K* keys, const std::list& copyback_cursor, V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -128,17 +117,6 @@ class MultiTierStorage : public Storage { return; } - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - return; - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - return; - } - void Schedule(std::function fn) override { cache_thread_pool_->Schedule(std::move(fn)); } @@ -223,50 +201,50 @@ class MultiTierStorage : public Storage { } return s; } - - virtual void SetTotalDims(int64 total_dims) = 0; + virtual int total_dim() = 0; void DeleteFromEvictionManager() { eviction_manager_->DeleteStorage(this); } - void ReleaseValuePtrs(std::deque*>& value_ptrs, - Allocator* allocator) { + void ReleaseValuePtrs(std::deque& value_ptrs, + FeatureDescriptor* feat_desc) { constexpr int CAP_INVALID_VALUEPTR = 64 * 1024; if (value_ptrs.size() > CAP_INVALID_VALUEPTR) { int64 num_of_deleted_value_ptrs = value_ptrs.size() - CAP_INVALID_VALUEPTR; for (int i = 0; i < num_of_deleted_value_ptrs; i++) { - ValuePtr* value_ptr = value_ptrs.front(); - value_ptr->Destroy(allocator); - delete value_ptr; + void* value_ptr = value_ptrs.front(); + feat_desc->Deallocate(value_ptr); value_ptrs.pop_front(); } } } - void ReleaseInvalidValuePtr(Allocator* allocator) { - ReleaseValuePtrs(value_ptr_out_of_date_, allocator); + void ReleaseInvalidValuePtr(FeatureDescriptor* feat_desc) { + ReleaseValuePtrs(value_ptr_out_of_date_, feat_desc); } - void KeepInvalidValuePtr(ValuePtr* value_ptr) { + void KeepInvalidValuePtr(void* value_ptr) { value_ptr_out_of_date_.emplace_back(value_ptr); } #if GOOGLE_CUDA void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& context, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursors, const std::vector& memory_index, - const std::vector*>& gpu_value_ptrs, - int value_len); + const std::vector& gpu_value_ptrs, + int value_len, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc); #endif //GOOGL_CUDA private: virtual Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) {} protected: - std::deque*> value_ptr_out_of_date_; + std::deque value_ptr_out_of_date_; BatchCache* cache_ = nullptr; EvictionManager* 
eviction_manager_;
@@ -281,6 +259,70 @@ class MultiTierStorage : public Storage<K, V> {
   std::string name_;
   std::vector<mutex> mu_list_;
 };
+
+#if GOOGLE_CUDA
+template <class V>
+void CopyEmbeddingFromHbmToDram(
+    const std::vector<void*>& hbm_value_ptrs,
+    const std::vector<void*>& dram_value_ptrs,
+    Allocator* gpu_alloc,
+    FeatureDescriptor<V>* hbm_feat_desc,
+    FeatureDescriptor<V>* dram_feat_desc) {
+  int batch_size = hbm_value_ptrs.size();
+  V** dev_value_address;
+
+  dev_value_address = (V**)gpu_alloc->AllocateRaw(
+      Allocator::kAllocatorAlignment, batch_size * sizeof(V*));
+  Allocator* cpu_alloc = ev_allocator();
+  V** value_address = (V**)cpu_alloc->AllocateRaw(
+      Allocator::kAllocatorAlignment, sizeof(V*) * batch_size);
+
+  V* batch_data_place;
+  V* dev_batch_data_place;
+  int total_dim = dram_feat_desc->total_dim();
+  dev_batch_data_place = (V*)gpu_alloc->AllocateRaw(
+      Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim);
+  batch_data_place = (V*)cpu_alloc->AllocateRaw(
+      Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim);
+  // Gather the GPU addresses of the HBM embeddings.
+  for (int i = 0; i < batch_size; ++i) {
+    value_address[i] = hbm_feat_desc->GetEmbedding(hbm_value_ptrs[i], 0);
+  }
+  cudaMemcpyAsync(dev_value_address, value_address,
+                  sizeof(V*) * batch_size,
+                  cudaMemcpyHostToDevice);
+
+  // Launch the kernel that copies the rows into a contiguous buffer.
+  int block_dim = 128;
+  void* args[] = { (void*)&dev_value_address,
+      (void*)&dev_batch_data_place, (void*)&total_dim,
+      (void*)&batch_size};
+
+  cudaLaunchKernel((void*)BatchCopy<V>,
+                   (batch_size * total_dim + block_dim - 1) / block_dim,
+                   block_dim, args, 0, NULL);
+
+  cudaMemcpyAsync(batch_data_place, dev_batch_data_place,
+                  sizeof(V) * batch_size * total_dim,
+                  cudaMemcpyDeviceToHost);
+
+  cudaEvent_t is_finish_;
+  cudaEventCreate(&is_finish_);
+  cudaEventRecord(is_finish_);
+  cudaEventSynchronize(is_finish_);
+  cudaEventDestroy(is_finish_);
+
+  for (int i = 0; i < batch_size; ++i) {
+    memcpy(dram_feat_desc->GetEmbedding(dram_value_ptrs[i], 0),
+           &batch_data_place[i * total_dim], total_dim * sizeof(V));
+  }
+
+  cpu_alloc->DeallocateRaw(value_address);
+  cpu_alloc->DeallocateRaw(batch_data_place);
+  gpu_alloc->DeallocateRaw(dev_value_address);
+  gpu_alloc->DeallocateRaw(dev_batch_data_place);
+}
+#endif  // GOOGLE_CUDA
 } // embedding
 } // tensorflow
diff --git a/tensorflow/core/framework/embedding/normal_feature_descriptor.h b/tensorflow/core/framework/embedding/normal_feature_descriptor.h
new file mode 100644
index 00000000000..817b33d058b
--- /dev/null
+++ b/tensorflow/core/framework/embedding/normal_feature_descriptor.h
@@ -0,0 +1,134 @@
+/* Copyright 2022 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+======================================================================*/
+#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_
+#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_
+#include
+#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h"
+
+namespace tensorflow {
+namespace embedding {
+#if GOOGLE_CUDA
+template <class V>
+class HbmMultiTierFeatureDescriptorImpl;
+#endif
+
+template <class V>
+class NormalFeatureDescriptorImpl: public FeatureDescriptorImpl<V> {
+ public:
+  NormalFeatureDescriptorImpl(Allocator* alloc, int64 slot_num,
+                              bool need_record_freq,
+                              bool need_record_version)
+      : alloc_bytes_(0),
+        alloc_(alloc),
+        FeatureDescriptorImpl<V>(slot_num,
+                                 need_record_freq,
+                                 need_record_version) {}
+
+  NormalFeatureDescriptorImpl(NormalFeatureDescriptorImpl<V>* feat_desc_impl)
+      : alloc_(feat_desc_impl->alloc_),
+        FeatureDescriptorImpl<V>(feat_desc_impl) {}
+
+  NormalFeatureDescriptorImpl(
+      HbmMultiTierFeatureDescriptorImpl<V>* feat_desc_impl)
+      : alloc_bytes_(0),
+        alloc_(feat_desc_impl->dram_alloc_),
+        FeatureDescriptorImpl<V>(feat_desc_impl) {}
+
+  ~NormalFeatureDescriptorImpl() {}
+
+  bool InitSlotInfo(int emb_index, int64 embedding_dim,
+                    const std::pair<V*, int64>& default_value) override {
+    bool is_compute_alloc_bytes = FeatureDescriptorImpl<V>::SetEmbeddingInfo(
+        emb_index, embedding_dim, default_value);
+    if (is_compute_alloc_bytes) {
+      FeatureDescriptorImpl<V>::ComputeAllocBytes(&alloc_bytes_);
+      FeatureDescriptorImpl<V>::CreateFreqAndVersionDescriptor(&alloc_bytes_);
+    }
+    return is_compute_alloc_bytes;
+  }
+
+  bool InitSlotInfo(FeatureDescriptorImpl<V>* feat_desc_impl) override {
+    FeatureDescriptorImpl<V>::SetSlotInfo(feat_desc_impl);
+    FeatureDescriptorImpl<V>::ComputeAllocBytes(&alloc_bytes_);
+    FeatureDescriptorImpl<V>::SetFreqAndVersionOffset(&alloc_bytes_);
+    return true;
+  }
+
+  V* GetEmbedding(void* val, int emb_index) override {
+    return reinterpret_cast<V*>(val) +
+        FeatureDescriptorImpl<V>::slot_infos_[emb_index].embedding_offset;
+  }
+
+  void* Allocate() override {
+    void* val = alloc_->AllocateRaw(
+        Allocator::kAllocatorAlignment, alloc_bytes_);
+    FeatureDescriptorImpl<V>::InitFreqAndVersion(val);
+    return val;
+  }
+
+  void Deallocate(void* val) override {
+    alloc_->DeallocateRaw(val);
+  }
+
+  void Deallocate(const std::vector<void*>& value_ptrs) override {
+    for (auto val : value_ptrs) {
+      Deallocate(val);
+    }
+  }
+
+  void SetValue(void* val, int64 emb_index, V* value) override {
+    V* val_ptr = GetEmbedding(val, emb_index);
+    memcpy(val_ptr, value,
+        sizeof(V) *
+            FeatureDescriptorImpl<V>::slot_infos_[emb_index].default_value_len);
+  }
+
+  void SetDefaultValue(void* val, int64 index) override {
+    for (int i = 0; i < FeatureDescriptorImpl<V>::slot_infos_.size(); i++) {
+      V* val_ptr = GetEmbedding(val, i);
+      FeatureDescriptorImpl<V>::SetDefaultValue((void*)val_ptr, i, index);
+    }
+  }
+
+#if GOOGLE_CUDA
+  template <class K>
+  void SetDefaultValues(
+      const K* keys,
+      const std::list<int64>& init_cursor,
+      void** value_ptrs,
+      se::Stream* compute_stream,
+      EventMgr* event_mgr,
+      const Eigen::GpuDevice& gpu_device) {
+    LOG(FATAL) << "Can't call SetDefaultValues(const K*, const std::list<int64>&,"
+               << " void**, se::Stream*, EventMgr*, const Eigen::GpuDevice&)"
+               << " in NormalFeatureDescriptorImpl.";
+  }
+#endif
+
+  void SetAllocator(Allocator* alloc) override {
+    alloc_ = alloc;
+  }
+
+  int data_bytes() override {
+    return alloc_bytes_;
+  }
+
+ private:
+  int alloc_bytes_;
+  Allocator* alloc_;
+};
+} //namespace embedding
+} //namespace tensorflow
+
+#endif
//TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/nullable_filter_policy.h b/tensorflow/core/framework/embedding/nullable_filter_policy.h index 0c5ce80886a..7e3ace0063d 100644 --- a/tensorflow/core/framework/embedding/nullable_filter_policy.h +++ b/tensorflow/core/framework/embedding/nullable_filter_policy.h @@ -30,19 +30,21 @@ template class NullableFilterPolicy : public FilterPolicy { using FilterPolicy::ev_; using FilterPolicy::config_; - using FilterPolicy::LookupOrCreateEmbInternal; public: NullableFilterPolicy(const EmbeddingConfig& config, - EV* ev, embedding::Storage* storage) : - FilterPolicy(config, ev), storage_(storage) {} + EV* ev, embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) + : storage_(storage), feat_desc_(feat_desc), + FilterPolicy(config, ev) {} Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); if (s.ok()) { - V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + V* mem_val = feat_desc_->GetEmbedding( + value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_ptr, @@ -57,17 +59,17 @@ class NullableFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, keys, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; if (value_ptr != nullptr) { embedding_ptr[i] = - ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_ptr; } @@ -85,65 +87,55 @@ class NullableFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs, + const K* keys, void** value_ptrs, int64 num_of_keys) { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> not_found_cursor_list(num_worker_threads + 1); ev_->BatchLookupOrCreateKey(ctx, keys, value_ptrs, num_of_keys, not_found_cursor_list); - std::vector var_ptrs(num_of_keys); - auto do_work = [this, value_ptrs, &var_ptrs] - (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - var_ptrs[i] = ev_->LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - } - }; - auto worker_threads = ctx.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, num_of_keys, - 1000, do_work); - - ev_->SetDefaultValueOfNewFeatures( - keys, num_of_keys, - not_found_cursor_list[0], - var_ptrs.data(), ctx.compute_stream, - ctx.event_mgr, ctx.gpu_device); } #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - V* mem_val = ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + 
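In the filter-policy hunks above, embeddings are now read through `FeatureDescriptor::GetEmbedding(value_ptr, emb_index)`: each key owns one flat allocation, every slot's embedding sits at a precomputed offset inside it, and a lookup is plain pointer arithmetic instead of the old per-slot `ValuePtr` dereference. A simplified sketch of such a flat layout, with assumed field names and the freq/version header reduced to two float slots:

```
#include <cstdlib>
#include <vector>

// Simplified flat-layout descriptor: one block per key holding
// [freq | version | slot0 embedding | slot1 embedding | ...].
struct FlatFeatureDesc {
  std::vector<int> slot_offset;  // offset of each slot, in floats
  int floats_per_entry = 0;
  int header_floats = 2;         // freq counter + version, simplified

  void AddSlot(int dim) {
    slot_offset.push_back(header_floats + floats_per_entry);
    floats_per_entry += dim;
  }
  void* Allocate() const {
    return std::calloc(header_floats + floats_per_entry, sizeof(float));
  }
  float* GetEmbedding(void* entry, int slot) const {
    return reinterpret_cast<float*>(entry) + slot_offset[slot];
  }
  float* Freq(void* entry) const { return reinterpret_cast<float*>(entry); }
  float* Version(void* entry) const {
    return reinterpret_cast<float*>(entry) + 1;
  }
};
```

With this layout, allocation is a single raw request per key no matter how many slots the variable carries, which is what `NormalFeatureDescriptorImpl::Allocate` above exploits.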
TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { *is_filter = true; - return ev_->LookupOrCreateKey(key, val); + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + storage_->Insert(key, value_ptr); + s = Status::OK(); + } + feat_desc_->AddFreq(*value_ptr, count); + return s; } - int64 GetFreq(K key, ValuePtr* value_ptr) override { - if (storage_->GetLayoutType() != LayoutType::LIGHT) { - return value_ptr->GetFreq(); - }else { - return 0; - } + Status LookupKey(K key, void** val, + bool* is_filter, int64 count) override { + *is_filter = true; + return ev_->LookupKey(key, val); + } + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); } int64 GetFreq(K key) override { - if (storage_->GetLayoutType() != LayoutType::LIGHT) { - ValuePtr* value_ptr = nullptr; - TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); - return value_ptr->GetFreq(); - }else { + if (!config_.is_save_freq()) return 0; - } + void* value_ptr = nullptr; + TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); + return feat_desc_->GetFreq(value_ptr); } Status Restore(int64 key_num, int bucket_num, int64 partition_id, @@ -161,27 +153,30 @@ class NullableFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; - ev_->CreateKey(key_buff[i], &value_ptr, to_dram); + int64 import_freq = 0; + int64 import_version = -1; + if (config_.filter_freq !=0 || ev_->IsMultiLevel() || config_.record_freq) { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); + import_version = version_buff[i]; } - LookupOrCreateEmbInternal(is_filter, to_dram, i, value_len, - value_ptr, value_buff, key_buff); + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); } return Status::OK(); } - bool is_admit(K key, ValuePtr* value_ptr) override { + bool is_admit(K key, void* value_ptr) override { return true; } private: embedding::Storage* storage_; + embedding::FeatureDescriptor* feat_desc_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/shrink_policy.h b/tensorflow/core/framework/embedding/shrink_policy.h index ea063a113a3..a8d0d9ada75 100644 --- a/tensorflow/core/framework/embedding/shrink_policy.h +++ b/tensorflow/core/framework/embedding/shrink_policy.h @@ -15,14 +15,11 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { - -template -class ValuePtr; - class Allocator; namespace embedding { @@ -40,31 +37,29 @@ struct ShrinkArgs { template class ShrinkPolicy { public: - ShrinkPolicy(Allocator* alloc): alloc_(alloc) {} + ShrinkPolicy(FeatureDescriptor* feat_desc): feat_desc_(feat_desc) {} virtual ~ShrinkPolicy() {} TF_DISALLOW_COPY_AND_ASSIGN(ShrinkPolicy); virtual void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) = 0; protected: - void EmplacePointer(ValuePtr* value_ptr) { + void EmplacePointer(void* value_ptr) { to_delete_.emplace_back(value_ptr); } void ReleaseValuePtrs() { for (auto it : to_delete_) { - it->Destroy(alloc_); - delete it; + feat_desc_->Deallocate(it); } to_delete_.clear(); } protected: - std::vector*> to_delete_; - private: - Allocator* alloc_; + std::vector to_delete_; + FeatureDescriptor* feat_desc_; }; template @@ -74,7 +69,7 @@ class NonShrinkPolicy: public ShrinkPolicy { TF_DISALLOW_COPY_AND_ASSIGN(NonShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override {} }; } // embedding diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h index f9de65df588..be08afd7f50 100644 --- a/tensorflow/core/framework/embedding/single_tier_storage.h +++ b/tensorflow/core/framework/embedding/single_tier_storage.h @@ -24,7 +24,6 @@ limitations under the License. #endif // GOOGLE_CUDA #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/framework/embedding/l2weight_shrink_policy.h" -#include "tensorflow/core/framework/embedding/layout_creator.h" #include "tensorflow/core/framework/embedding/leveldb_kv.h" #include "tensorflow/core/framework/embedding/ssd_hash_kv.h" #include "tensorflow/core/framework/embedding/storage_config.h" @@ -32,9 +31,6 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -62,24 +58,22 @@ class HbmDramSsdStorage; template class SingleTierStorage : public Storage { public: - SingleTierStorage(const StorageConfig& sc, Allocator* alloc, - KVInterface* kv, LayoutCreator* lc) - : kv_(kv), alloc_(alloc), layout_creator_(lc), + SingleTierStorage(const StorageConfig& sc, + KVInterface* kv, FeatureDescriptor* feat_desc) + : kv_(kv), feat_desc_(feat_desc), Storage(sc) { if (sc.embedding_config.steps_to_live != 0) { shrink_policy_ = new GlobalStepShrinkPolicy( sc.embedding_config.steps_to_live, - alloc_, + feat_desc_, kv_); } else if (sc.embedding_config.l2_weight_threshold != -1.0) { shrink_policy_ = new L2WeightShrinkPolicy( sc.embedding_config.l2_weight_threshold, sc.embedding_config.primary_emb_index, - Storage::GetOffset( - sc.embedding_config.primary_emb_index), - alloc_, + feat_desc_, kv_); } else { shrink_policy_ = new NonShrinkPolicy(); @@ -89,11 +83,10 @@ class SingleTierStorage : public Storage { ~SingleTierStorage() override { mutex_lock l(Storage::mu_); std::vector key_list; - std::vector*> value_ptr_list; + std::vector value_ptr_list; kv_->GetSnapshot(&key_list, &value_ptr_list); for (auto value_ptr : value_ptr_list) { - value_ptr->Destroy(alloc_); - delete value_ptr; + feat_desc_->Deallocate(value_ptr); } delete kv_; delete shrink_policy_; @@ -101,7 +94,7 @@ class SingleTierStorage : public Storage { TF_DISALLOW_COPY_AND_ASSIGN(SingleTierStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { return kv_->Lookup(key, value_ptr); } @@ -109,47 +102,45 @@ class SingleTierStorage : public Storage { return kv_->Contains(key); } - virtual void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) override { do { - *value_ptr = layout_creator_->Create(alloc_, alloc_len); + *value_ptr = feat_desc_->Allocate(); Status s = kv_->Insert(key, *value_ptr); if (s.ok()) { break; } else { - (*value_ptr)->Destroy(alloc_); - delete *value_ptr; + feat_desc_->Deallocate(*value_ptr); } } while (!(kv_->Lookup(key, value_ptr)).ok()); } - virtual void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in SingleTireStorage."; + virtual void Insert(K key, void** value_ptr) override { + do { + Status s = kv_->Insert(key, *value_ptr); + if (s.ok()) { + break; + } else { + feat_desc_->Deallocate(*value_ptr); + } + } while (!(kv_->Lookup(key, value_ptr)).ok()); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = kv_->Lookup(key, value_ptr); if (s.ok()) { return s; } - *value_ptr = layout_creator_->Create(alloc_, size); + *value_ptr = feat_desc_->Allocate(); s = kv_->Insert(key, *value_ptr); if (s.ok()) { return s; } // Insert Failed, key already exist - (*value_ptr)->Destroy(alloc_); - delete *value_ptr; + feat_desc_->Deallocate(*value_ptr); return kv_->Lookup(key, value_ptr); } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - return GetOrCreate(key, value_ptr, size); - } Status Remove(K key) override { return kv_->Remove(key); @@ -180,7 +171,7 @@ class SingleTierStorage : public Storage { int total, const K* keys, const std::list& copyback_cursor, V** memcpy_address, 
size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -198,13 +189,13 @@ class SingleTierStorage : public Storage { } virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { LOG(FATAL) << "Unsupport BatchCommit in Storage: " << typeid(this).name(); return Status::OK(); } - virtual Status Commit(K keys, const ValuePtr* value_ptr) { + virtual Status Commit(K keys, const void* value_ptr) { LOG(FATAL) << "Unsupport Commit in Storage: " << typeid(this).name(); return Status::OK(); @@ -222,19 +213,12 @@ class SingleTierStorage : public Storage { return; } - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - return; - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - return; - } + virtual void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override {} Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { mutex_lock l(Storage::mu_); return kv_->GetSnapshot(key_list, value_ptr_list); } @@ -247,7 +231,7 @@ class SingleTierStorage : public Storage { ShrinkArgs& shrink_args, int64 value_len, V* default_value) override { - std::vector*> value_ptr_list; + std::vector value_ptr_list; std::vector key_list_tmp; TF_CHECK_OK(kv_->GetSnapshot( &key_list_tmp, &value_ptr_list)); @@ -255,30 +239,16 @@ class SingleTierStorage : public Storage { if (emb_config.is_primary()) { Shrink(key_list_tmp, value_ptr_list, shrink_args, value_len); } - TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list_tmp, - value_ptr_list))); + value_ptr_list, + SingleTierStorage::feat_desc_))); return Status::OK(); } - void SetAllocLen(int64 value_len, int slot_num) override { - while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. 
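The `CreateAndInsert` and `Insert` loops added earlier in this file resolve insert races without a lock: every thread optimistically allocates an entry and attempts the insert, and a loser deallocates its block and retries until an entry for the key is visible. A minimal sketch of that claim-or-discard loop against an assumed first-writer-wins map interface:

```
#include <cstdint>
#include <cstdlib>

// Assumed interface: any concurrent map whose Insert is first-writer-wins
// (returns false when the key is already present) fits this pattern.
struct ConcurrentMap {
  virtual bool Lookup(int64_t key, void** entry) = 0;
  virtual bool Insert(int64_t key, void* entry) = 0;
  virtual ~ConcurrentMap() = default;
};

// Claim-or-discard: exactly one racing thread's Insert succeeds; every
// loser frees its tentative entry and re-reads the winner's. The loop
// also covers the winner's entry being removed before the re-read.
void* GetOrCreateEntry(ConcurrentMap& map, int64_t key, size_t entry_bytes) {
  void* entry = nullptr;
  while (true) {
    if (map.Lookup(key, &entry)) return entry;  // adopt the existing entry
    entry = std::calloc(1, entry_bytes);        // tentative allocation
    if (map.Insert(key, entry)) return entry;   // we won the race
    std::free(entry);                           // lost: discard and retry
  }
}
```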
- Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); - } - Storage::flag_.clear(std::memory_order_release); - } - bool IsMultiLevel() override { return false; } @@ -299,16 +269,22 @@ class SingleTierStorage : public Storage { LOG(FATAL) << "Unsupport Schedule in SingleTierStorage."; } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + kv_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + protected: - virtual void SetTotalDims(int64 total_dims) = 0; + virtual void* CreateValuePtr() { + return feat_desc_->Allocate(); + } - virtual ValuePtr* CreateValuePtr(int64 size) { - return layout_creator_->Create(alloc_, size); + virtual void DestroyValuePtr(void* value_ptr) { + feat_desc_->Deallocate(value_ptr); } - virtual void DestroyValuePtr(ValuePtr* value_ptr) { - value_ptr->Destroy(alloc_); - delete value_ptr; + FeatureDescriptor* feature_descriptor() { + return feat_desc_; } protected: virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, @@ -324,7 +300,7 @@ class SingleTierStorage : public Storage { } virtual void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) { mutex_lock l(Storage::mu_); @@ -339,31 +315,40 @@ class SingleTierStorage : public Storage { KVInterface* kv_; ShrinkPolicy* shrink_policy_; Allocator* alloc_; - LayoutCreator* layout_creator_; + FeatureDescriptor* feat_desc_; }; template class DramStorage : public SingleTierStorage { public: - DramStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, - KVInterface* kv) - : SingleTierStorage(sc, alloc, kv, lc) {} + DramStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LocklessHashMap(feat_desc), feat_desc) {} ~DramStorage() override {} Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { return SingleTierStorage::kv_->BatchCommit(keys, value_ptrs); } - Status TryInsert(K key, ValuePtr* value_ptr) { + Status TryInsert(K key, void* value_ptr) { return SingleTierStorage::kv_->Insert(key, value_ptr); } - Status Commit(K keys, const ValuePtr* value_ptr) override{ + Status Commit(K keys, const void* value_ptr) override{ return SingleTierStorage::kv_->Commit(keys, value_ptr); } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + void* value_ptr = SingleTierStorage::feat_desc_->Allocate(freq); + SingleTierStorage::Insert(key, &value_ptr); + SingleTierStorage::feat_desc_->SetValue(value_ptr, emb_index, value); + SingleTierStorage::feat_desc_->SetFreq(value_ptr, freq); + SingleTierStorage::feat_desc_->UpdateVersion(value_ptr, version); + } TF_DISALLOW_COPY_AND_ASSIGN(DramStorage); public: @@ -375,12 +360,8 @@ class DramStorage : public SingleTierStorage { friend class HbmDramSsdStorage; #endif protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); - } - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -395,9 +376,10 @@ class DramStorage : public SingleTierStorage { template class HbmStorage : public SingleTierStorage { public: - HbmStorage(const StorageConfig& sc, Allocator* 
alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new GPUHashMapKV(sc.embedding_config, alloc), lc) { + HbmStorage(const StorageConfig& sc, Allocator* gpu_allocator, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new GPUHashMapKV( + sc.embedding_config, gpu_allocator), feat_desc) { } ~HbmStorage() override {} @@ -488,48 +470,27 @@ class HbmStorage : public SingleTierStorage { gpu_kv->Import(key_import, value_import, device, emb_config); return Status::OK(); } - - void SetTotalDims(int64 total_dims) override {} }; template class HbmStorageWithCpuKv: public SingleTierStorage { public: - HbmStorageWithCpuKv(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + HbmStorageWithCpuKv(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~HbmStorageWithCpuKv() override {} - void Insert(K key, ValuePtr* value_ptr) override { - do { - Status s = SingleTierStorage::kv_->Insert(key, value_ptr); - if (s.ok()) { - break; - } else { - value_ptr->Destroy(SingleTierStorage::alloc_); - delete value_ptr; - } - } while (!(SingleTierStorage::kv_->Lookup(key, &value_ptr)).ok()); - } - - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - SingleTierStorage::Insert(key, value_ptr, alloc_len, to_dram); - } - - Status TryInsert(K key, ValuePtr* value_ptr) { + Status TryInsert(K key, void* value_ptr) { return SingleTierStorage::kv_->Insert(key, value_ptr); } public: friend class HbmDramStorage; friend class HbmDramSsdStorage; protected: - void SetTotalDims(int64 total_dims) override {} - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -544,28 +505,25 @@ class HbmStorageWithCpuKv: public SingleTierStorage { template class PmemMemkindStorage : public SingleTierStorage { public: - PmemMemkindStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + PmemMemkindStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~PmemMemkindStorage() override {} TF_DISALLOW_COPY_AND_ASSIGN(PmemMemkindStorage); - - protected: - void SetTotalDims(int64 total_dims) override {} }; template class PmemLibpmemStorage : public SingleTierStorage { public: - PmemLibpmemStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + PmemLibpmemStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~PmemLibpmemStorage() override {} - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -573,10 +531,8 @@ class PmemLibpmemStorage : public SingleTierStorage { protected: friend class DramPmemStorage; - void SetTotalDims(int64 total_dims) override {} - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -590,15 +546,15 @@ class PmemLibpmemStorage : public SingleTierStorage { template class LevelDBStore : public SingleTierStorage { public: - 
LevelDBStore(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LevelDBKV(sc.path), lc) { + LevelDBStore(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LevelDBKV(sc.path, feat_desc), feat_desc) { } ~LevelDBStore() override {} TF_DISALLOW_COPY_AND_ASSIGN(LevelDBStore); - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -608,29 +564,25 @@ class LevelDBStore : public SingleTierStorage { LevelDBKV* leveldb_kv = reinterpret_cast*>(SingleTierStorage::kv_); return new DBValueIterator( - key_list, emb_index, value_len, leveldb_kv); + key_list, emb_index, value_len, + leveldb_kv, SingleTierStorage::feat_desc_); } public: friend class DramLevelDBStore; - - protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); - } }; template class SsdHashStorage : public SingleTierStorage { public: - SsdHashStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new SSDHashKV(sc.path, alloc), lc) { + SsdHashStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new SSDHashKV(sc.path, feat_desc), feat_desc) { } ~SsdHashStorage() override {} TF_DISALLOW_COPY_AND_ASSIGN(SsdHashStorage); - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -691,8 +643,9 @@ class SsdHashStorage : public SingleTierStorage { #endif protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); + void Init() override { + dynamic_cast*>( + SingleTierStorage::kv_)->Init(); } }; } // embedding diff --git a/tensorflow/core/framework/embedding/ssd_hash_kv.h b/tensorflow/core/framework/embedding/ssd_hash_kv.h index 8040421233e..f51c6904a50 100644 --- a/tensorflow/core/framework/embedding/ssd_hash_kv.h +++ b/tensorflow/core/framework/embedding/ssd_hash_kv.h @@ -25,17 +25,12 @@ limitations under the License. 
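In the `SSDHashKV` constructor below, the sync-versus-async compaction mode is decided once, from the `TF_SSDHASH_ASYNC_COMPACTION` environment variable, by binding `compaction_fn_`, `check_buffer_fn_` and `save_kv_fn_` to the matching member functions; the hot commit path then calls through those `std::function`s without re-checking the mode. A minimal sketch of that bind-once dispatch (hypothetical class, two strategies, simplified env parsing):

```
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>

// Bind the strategy once at construction; hot paths then call through
// std::function members with no per-call branching on the mode flag.
class Compactor {
 public:
  Compactor() {
    const char* env = std::getenv("TF_SSDHASH_ASYNC_COMPACTION");
    bool async = (env == nullptr) || std::string(env) != "false";
    if (async) {
      compaction_fn_ = [] {};  // async mode: a background thread compacts
      save_fn_ = [this](int64_t key) { SaveAsync(key); };
    } else {
      compaction_fn_ = [this] { Compact(); };
      save_fn_ = [this](int64_t key) { SaveSync(key); };
    }
  }
  void Commit(int64_t key) {
    compaction_fn_();  // no-op in async mode
    save_fn_(key);
  }

 private:
  void Compact() { std::cout << "inline compaction\n"; }
  void SaveSync(int64_t key) { std::cout << "sync save " << key << "\n"; }
  void SaveAsync(int64_t key) { std::cout << "async save " << key << "\n"; }
  std::function<void()> compaction_fn_;
  std::function<void(int64_t)> save_fn_;
};
```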
#include "tensorflow/core/framework/embedding/ssd_record_descriptor.h" #include "tensorflow/core/framework/embedding/emb_file_creator.h" #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/util/env_var.h" namespace tensorflow { - -template -class ValuePtr; - namespace embedding { class EmbPosition { public: @@ -115,55 +110,6 @@ class SSDIterator { } } - virtual void Key(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - memcpy((char*)val, &((file_map_[f_id])[curr_vec_].first), dim); - } - - virtual void Value(char* val, int64 dim, int64 value_offset) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, dim, - posi->offset_ + value_offset + sizeof(FixedLengthHeader)); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_ + - value_offset + sizeof(FixedLengthHeader), dim); - } - } - - virtual void Freq(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, sizeof(FixedLengthHeader), - posi->offset_); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_, - sizeof(FixedLengthHeader)); - } - *((int64*)val) = - reinterpret_cast(val)->GetFreqCounter(); - } - - virtual void Version(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, sizeof(FixedLengthHeader), - posi->offset_); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_, - sizeof(FixedLengthHeader)); - } - *((int64*)val) = - reinterpret_cast(val)->GetGlobalStep(); - } - virtual K Key() { int64 f_id = file_id_vec_[curr_file_]; return (file_map_[f_id])[curr_vec_].first; @@ -192,8 +138,9 @@ class SSDIterator { template class SSDHashKV : public KVInterface { public: - explicit SSDHashKV(const std::string& path, Allocator* alloc) - : alloc_(alloc) { + explicit SSDHashKV(const std::string& path, + FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { path_ = io::JoinPath( path, "ssd_kv_" + std::to_string(Env::Default()->NowMicros()) + "_"); hash_map_.max_load_factor(0.8); @@ -205,9 +152,6 @@ class SSDHashKV : public KVInterface { evict_file_set_.set_counternum(16); evict_file_set_.set_deleted_key(DELETED_KEY); - new_value_ptr_fn_ = [this](size_t size) { - return new NormalContiguousValuePtr(alloc_, size); - }; is_async_compaction_ = true; TF_CHECK_OK(ReadBoolFromEnvVar("TF_SSDHASH_ASYNC_COMPACTION", true, &is_async_compaction_)); @@ -224,7 +168,7 @@ class SSDHashKV : public KVInterface { "Use Sync Compactor in SSDHashKV of Multi-tier Embedding Storage!"; compaction_fn_ = [this](){Compaction();}; check_buffer_fn_ = [this](){CheckBuffer();}; - save_kv_fn_ = [this](K key, const ValuePtr* value_ptr, + save_kv_fn_ = [this](K key, const void* value_ptr, bool is_compaction=false) { SaveKV(key, value_ptr, is_compaction); }; @@ -233,7 +177,7 @@ class SSDHashKV : public KVInterface { "Use Async Compactor in SSDHashKV of Multi-tier Embedding Storage!"; compaction_fn_ = [](){}; check_buffer_fn_ = [this](){CheckBufferAsync();}; - save_kv_fn_ = [this](K key, const 
ValuePtr* value_ptr, + save_kv_fn_ = [this](K key, const void* value_ptr, bool is_compaction=false) { SaveKVAsync(key, value_ptr, is_compaction); }; @@ -244,9 +188,8 @@ class SSDHashKV : public KVInterface { } } - void SetTotalDims(int total_dims) override { - total_dims_ = total_dims; - val_len_ = sizeof(FixedLengthHeader) + total_dims_ * sizeof(V); + void Init() { + val_len_ = feat_desc_->data_bytes(); max_app_count_ = BUFFER_SIZE / val_len_; write_buffer_ = new char[BUFFER_SIZE]; unsigned int max_key_count = 1 + int(BUFFER_SIZE / val_len_); @@ -334,18 +277,18 @@ class SSDHashKV : public KVInterface { return Status::OK(); } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { auto iter = hash_map_.find_wait_free(key); if (iter.first == EMPTY_KEY) { return errors::NotFound("Unable to find Key: ", key, " in SSDHashKV."); } else { - ValuePtr* val = new_value_ptr_fn_(total_dims_); + void* val = feat_desc_->Allocate(); EmbPosition* posi = iter.second; if (posi->flushed_) { - emb_files_[posi->version_]->Read((char*)(val->GetPtr()), + emb_files_[posi->version_]->Read((char*)val, val_len_, posi->offset_); } else { - memcpy((char*)val->GetPtr(), + memcpy((char*)val, write_buffer_ + posi->buffer_offset_, val_len_); } *value_ptr = val; @@ -363,17 +306,17 @@ class SSDHashKV : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { return Status::OK(); } Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return BatchCommit(keys, value_ptrs); } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { compaction_fn_(); __sync_fetch_and_add(&total_app_count_, keys.size()); for (int i = 0; i < keys.size(); i++) { @@ -384,7 +327,7 @@ class SSDHashKV : public KVInterface { return Status::OK(); } - Status Commit(K key, const ValuePtr* value_ptr) override { + Status Commit(K key, const void* value_ptr) override { compaction_fn_(); __sync_fetch_and_add(&total_app_count_, 1); check_buffer_fn_(); @@ -402,7 +345,7 @@ class SSDHashKV : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { return Status::OK(); } @@ -467,8 +410,8 @@ class SSDHashKV : public KVInterface { int64 Size() const override { return hash_map_.size_lockless(); } - void FreeValuePtr(ValuePtr* value_ptr) override { - delete value_ptr; + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); } private: @@ -555,10 +498,10 @@ class SSDHashKV : public KVInterface { } void AppendToWriteBuffer(size_t curr_buffer_offset, K key, - const ValuePtr* value_ptr) { + const void* value_ptr) { current_offset_ += val_len_; memcpy(write_buffer_ + curr_buffer_offset, - (char*)value_ptr->GetPtr(), val_len_); + (char*)value_ptr, val_len_); key_buffer_[buffer_cur_] = key; ++buffer_cur_; } @@ -582,7 +525,7 @@ class SSDHashKV : public KVInterface { return flag; } - void SaveKV(K key, const ValuePtr* value_ptr, + void SaveKV(K key, const void* value_ptr, bool is_compaction = false) { size_t curr_buffer_offset = buffer_cur_ * val_len_; EmbPosition* ep = new EmbPosition(current_offset_, current_version_, @@ -608,7 +551,7 @@ class SSDHashKV : public KVInterface { } } - void SaveKVAsync(K key, const ValuePtr* value_ptr, + void SaveKVAsync(K key, const 
void* value_ptr, bool is_compaction = false) { size_t curr_buffer_offset = buffer_cur_ * val_len_; EmbPosition* ep = new EmbPosition(current_offset_, evict_version_, @@ -681,21 +624,21 @@ class SSDHashKV : public KVInterface { } void MoveToNewFile() { - ValuePtr* val = new_value_ptr_fn_(total_dims_); + void* val = feat_desc_->Allocate(); for (auto it : evict_file_map_) { EmbFile* file = emb_files_[it.first]; total_app_count_ -= file->InvalidCount(); file->MapForRead(); for (auto it_vec : it.second) { EmbPosition* posi = it_vec.second; - file->ReadWithMemcpy((char*)(val->GetPtr()), val_len_, + file->ReadWithMemcpy((char*)val, val_len_, posi->offset_); CheckBuffer(); SaveKV(it_vec.first, val, true); } file->UnmapForRead(); } - delete val; + feat_desc_->Deallocate(val); } void MoveToNewFileAsync() { @@ -825,11 +768,10 @@ class SSDHashKV : public KVInterface { char* write_buffer_ = nullptr; K* key_buffer_ = nullptr; bool is_async_compaction_; - Allocator* alloc_ = nullptr; + FeatureDescriptor* feat_desc_; int total_dims_; std::string path_; - std::function*(size_t)> new_value_ptr_fn_; typedef google::dense_hash_map_lockless LockLessHashMap; LockLessHashMap hash_map_; @@ -857,7 +799,7 @@ class SSDHashKV : public KVInterface { std::function compaction_fn_; std::function check_buffer_fn_; - std::function*, bool)> save_kv_fn_; + std::function save_kv_fn_; EmbFileCreator* emb_file_creator_ = nullptr; }; template diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index bb949183492..1ffb435054b 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -40,9 +40,6 @@ using GPUDevice = Eigen::GpuDevice; template class CheckpointLoader; -template -class ValuePtr; - template class EmbeddingVar; @@ -57,9 +54,6 @@ class BundleReader; template struct EmbeddingVarContext; -namespace { - const int kSavedPartitionNum = 1000; -} namespace embedding { template @@ -67,42 +61,40 @@ class Storage { friend class CheckpointLoader; public: explicit Storage(const StorageConfig& storage_config) - : storage_config_(storage_config) {} + : storage_config_(storage_config) { + initialize_value_.resize(storage_config.embedding_config.slot_num + 1); + } virtual ~Storage() {} TF_DISALLOW_COPY_AND_ASSIGN(Storage); - virtual Status Get(K key, ValuePtr** value_ptr) = 0; + virtual Status Get(K key, void** value_ptr) = 0; #if GOOGLE_CUDA virtual void BatchGet(const EmbeddingVarContext& ctx, const K* key, - ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) {} + void** value_ptr_list, + int64 num_of_keys) {} virtual void BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* key, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_found_cursor_list) {} #endif //GOOGLE_CUDA virtual Status Contains(K key) = 0; - virtual void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) = 0; - virtual void Insert(K key, ValuePtr* value_ptr) = 0; - virtual void SetAllocLen(int64 value_len, int slot_num) = 0; + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) = 0; + virtual void Insert(K key, void** value_ptr) = 0; + virtual void Init() {} virtual void SetValueLen(int64 value_len) {} - virtual Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) = 0; - virtual Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) = 0; + virtual Status GetOrCreate(K key, void** 
value_ptr) = 0; virtual int LookupTier(K key) const = 0; virtual Status Remove(K key) = 0; virtual int64 Size() const = 0; virtual int64 Size(int level) const = 0; virtual Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) = 0; + std::vector* value_ptr_list) = 0; virtual Status Save( const string& tensor_name, const string& prefix, @@ -113,7 +105,7 @@ class Storage { V* default_value) = 0; virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) = 0; + const std::vector& value_ptrs) = 0; virtual Status Eviction(K* evict_ids, int64 evict_size) = 0; @@ -121,7 +113,7 @@ class Storage { int total, const K* keys, const std::list& copyback_cursor, V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -149,25 +141,11 @@ class Storage { Allocator* alloc, int64 value_len, int64 block_size) = 0; - virtual void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) = 0; - virtual void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, int64 num_of_value_ptrs) = 0; inline mutex* get_mutex() { return &mu_; } inline int64 GetAllocLen() { return alloc_len_; } inline int64 GetOffset(int64 index) { return alloc_len_ * index; } inline int64 GetTotalDims() { return total_dims_; } - inline int64 ComputeAllocLen(int64 value_len) { - if (LayoutType::COMPACT == storage_config_.layout_type) { - return value_len; - } else { - return (value_len * sizeof(V) % 16 == 0) - ? value_len - : value_len + (16 - (sizeof(V) * value_len) % 16) / sizeof(V); - } - } - inline LayoutType GetLayoutType() { return storage_config_.layout_type; } inline embedding::StorageType GetStorageType() { return storage_config_.type; } inline std::string GetStoragePath() { return storage_config_.path; } inline embedding::CacheStrategy @@ -183,7 +161,7 @@ class Storage { } inline void Insert(const std::vector& keys, - ValuePtr** value_ptrs) { + void** value_ptrs) { for (size_t i = 0; i < keys.size(); i++) { Insert(keys[i], value_ptrs[i]); } @@ -211,6 +189,13 @@ class Storage { reset_version, reader); restorer.RestoreCkpt(emb_config, device); }; + + virtual void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) = 0; + + virtual void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) = 0; protected: virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, @@ -227,12 +212,7 @@ class Storage { const std::string& ssd_emb_file_name, EmbeddingVar* ev, RestoreSSDBuffer& restore_buff) { - int64 alloc_len = Storage::ComputeAllocLen(value_len); - auto* alloc = ev->GetAllocator(); for (int64 i = 0; i < restore_buff.num_of_keys; i++) { - ValuePtr* value_ptr = nullptr; - ev->LookupOrCreateKey(restore_buff.key_list_buf[i], &value_ptr); - value_ptr->SetInitialized(emb_index); int64 file_id = restore_buff.key_file_id_list_buf[i]; int64 key_offset = restore_buff.key_offset_list_buf[i]; // Read data from embedding files on SSD. 
Data are stored in @@ -240,32 +220,29 @@ class Storage { std::stringstream ss; ss << ssd_emb_file_name << "/" << file_id << ".emb"; int fd = open(ss.str().data(), O_RDONLY); + EmbeddingConfig& emb_config = storage_config_.embedding_config; + FeatureDescriptor normal_feat_desc( + emb_config.block_num, emb_config.slot_num + 1, + ev_allocator(), StorageType::DRAM, true, + true, {false, 0}); + void* value_ptr = normal_feat_desc.Allocate(); char* file_addr = (char*)mmap(nullptr, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1) + + normal_feat_desc.data_bytes() + key_offset, PROT_READ, MAP_PRIVATE, fd, 0); - - NormalContiguousValuePtr tmp_value_ptr(alloc, - alloc_len * (emb_slot_num + 1)); - void* ptr = tmp_value_ptr.GetPtr(); - memcpy(ptr, file_addr + key_offset, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1)); + memcpy(value_ptr, file_addr + key_offset, + normal_feat_desc.data_bytes()); munmap(file_addr, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1) + + normal_feat_desc.data_bytes() + key_offset); close(fd); // Copy Data to ValuePtr, data of slots are set by primary here. - for (int j = 0; j < emb_slot_num + 1; j++) { - V* value = tmp_value_ptr.GetValue(j, alloc_len * j); - if (value != nullptr) { - value_ptr->GetOrAllocate(alloc, value_len, value, j, alloc_len * j); - } - } - value_ptr->SetFreq(tmp_value_ptr.GetFreq()); - value_ptr->SetStep(tmp_value_ptr.GetStep()); + int64 import_freq = normal_feat_desc.GetFreq(value_ptr); + int64 import_version = normal_feat_desc.GetVersion(value_ptr); + V* value = normal_feat_desc.GetEmbedding(value_ptr, emb_index); + Import(restore_buff.key_list_buf[i], value, + import_freq, import_version, emb_index); + normal_feat_desc.Deallocate(value_ptr); } return Status::OK(); } @@ -273,10 +250,11 @@ class Storage { private: void GeneratePartitionedCkptData( const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, EmbeddingVarCkptData* partitioned_ckpt_data, const EmbeddingConfig& emb_config, - V* default_value) { + V* default_value, + FeatureDescriptor* feat_desc) { std::vector> ev_ckpt_data_parts(kSavedPartitionNum); @@ -293,7 +271,43 @@ class Storage { ev_ckpt_data_parts[part_id].Emplace( key_list[i], value_ptr_list[i], emb_config, default_value, - GetOffset(emb_config.emb_index), + feat_desc, + is_save_freq, + is_save_version, + save_unfiltered_features); + break; + } + } + } + + partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts); + } + + void GeneratePartitionedCkptData( + const std::vector& key_list, + const std::vector& value_ptr_list, + EmbeddingVarCkptData* partitioned_ckpt_data, + const EmbeddingConfig& emb_config, + V* default_value, + const std::vector*>& feat_desc) { + std::vector> + ev_ckpt_data_parts(kSavedPartitionNum); + + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar( + "TF_EV_SAVE_FILTERED_FEATURES", true, &save_unfiltered_features)); + + bool is_save_freq = emb_config.is_save_freq(); + bool is_save_version = emb_config.is_save_version(); + + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + int feat_desc_type = (int64)value_ptr_list[i] >> kDramFlagOffset; + ev_ckpt_data_parts[part_id].Emplace( + key_list[i], value_ptr_list[i], + emb_config, default_value, + feat_desc[feat_desc_type], is_save_freq, is_save_version, save_unfiltered_features); @@ -333,12 +347,33 @@ class 
Storage { int64 value_len, V* default_value, const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, + FeatureDescriptor* feat_desc, + ValueIterator* value_iter = nullptr) { + EmbeddingVarCkptData partitioned_ckpt_data; + GeneratePartitionedCkptData(key_list, value_ptr_list, + &partitioned_ckpt_data, emb_config, + default_value, feat_desc); + Status s = + partitioned_ckpt_data.ExportToCkpt( + tensor_name, writer, value_len, value_iter); + return Status::OK(); + } + + Status SaveToCheckpoint( + const string& tensor_name, + BundleWriter* writer, + const EmbeddingConfig& emb_config, + int64 value_len, + V* default_value, + const std::vector& key_list, + const std::vector& value_ptr_list, + const std::vector*>& feat_desc, ValueIterator* value_iter = nullptr) { EmbeddingVarCkptData partitioned_ckpt_data; GeneratePartitionedCkptData(key_list, value_ptr_list, &partitioned_ckpt_data, emb_config, - default_value); + default_value, feat_desc); Status s = partitioned_ckpt_data.ExportToCkpt( tensor_name, writer, value_len, value_iter); @@ -366,6 +401,7 @@ class Storage { mutex mu_; std::atomic_flag flag_ = ATOMIC_FLAG_INIT; + std::vector initialize_value_; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/storage_config.h b/tensorflow/core/framework/embedding/storage_config.h index 85e44879dcb..23babc9ef08 100644 --- a/tensorflow/core/framework/embedding/storage_config.h +++ b/tensorflow/core/framework/embedding/storage_config.h @@ -17,13 +17,11 @@ limitations under the License. #include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/embedding_config.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" namespace tensorflow { namespace embedding { struct StorageConfig { StorageConfig() : type(StorageType::DEFAULT), path(""), - layout_type(LayoutType::NORMAL), cache_strategy(CacheStrategy::LFU) { size = {1<<30,1<<30,1<<30,1<<30}; } @@ -31,32 +29,14 @@ struct StorageConfig { StorageConfig(StorageType t, const std::string& p, const std::vector& s, - const std::string& layout, const EmbeddingConfig& ec, const CacheStrategy cache_strategy_ = CacheStrategy::LFU) - : type(t), - path(p), - embedding_config(ec), - cache_strategy(cache_strategy_) { - if ("normal" == layout) { - layout_type = LayoutType::NORMAL; - } else if ("light" == layout) { - layout_type = LayoutType::LIGHT; - } else if ("normal_contiguous" == layout){ - layout_type = LayoutType::NORMAL_CONTIGUOUS; - } else if ("normal_contiguous_gpu" == layout){ - layout_type = LayoutType::NORMAL_CONTIGUOUS_GPU; - } else if ("compact" == layout){ - layout_type = LayoutType::COMPACT; - } else { - LOG(WARNING) << "Unknown layout: " - << layout << ", use LayoutType::NORMAL by default."; - layout_type = LayoutType::NORMAL; - } - size = s; - } + : type(t), + path(p), + size(s), + embedding_config(ec), + cache_strategy(cache_strategy_) {} StorageType type; - LayoutType layout_type; std::string path; std::vector size; CacheStrategy cache_strategy; diff --git a/tensorflow/core/framework/embedding/storage_factory.h b/tensorflow/core/framework/embedding/storage_factory.h index 10d2d52b83f..c585b058470 100644 --- a/tensorflow/core/framework/embedding/storage_factory.h +++ b/tensorflow/core/framework/embedding/storage_factory.h @@ -16,7 +16,6 @@ limitations under the License. 
#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ #include "tensorflow/core/framework/embedding/config.pb.h" -#include "tensorflow/core/framework/embedding/layout_creator.h" #include "tensorflow/core/framework/embedding/dram_leveldb_storage.h" #include "tensorflow/core/framework/embedding/dram_pmem_storage.h" #include "tensorflow/core/framework/embedding/dram_ssd_storage.h" @@ -34,50 +33,41 @@ class StorageFactory { public: template static Storage* Create(const StorageConfig& sc, - Allocator* gpu_allocator, const string& name) { - auto layout_creator = LayoutCreatorFactory::Create(sc); - + Allocator* gpu_allocator, FeatureDescriptor* feat_desc, + const string& name) { switch (sc.type) { case StorageType::DRAM: - return new DramStorage(sc, ev_allocator(), - layout_creator, new LocklessHashMap()); + return new DramStorage(sc, feat_desc); case StorageType::PMEM_MEMKIND: - return new PmemMemkindStorage(sc, pmem_allocator(), - layout_creator); + feat_desc->SetAllocator(pmem_allocator()); + return new PmemMemkindStorage(sc, feat_desc); case StorageType::PMEM_LIBPMEM: - return new PmemLibpmemStorage(sc, - experimental_pmem_allocator(sc.path, sc.size[0]), - layout_creator); + feat_desc->SetAllocator( + experimental_pmem_allocator(sc.path, sc.size[0])); + return new PmemLibpmemStorage(sc, feat_desc); case StorageType::DRAM_PMEM: - return new DramPmemStorage(sc, ev_allocator(), - experimental_pmem_allocator(sc.path, sc.size[0]), - layout_creator, name); + return new DramPmemStorage(sc, + feat_desc, name); case StorageType::LEVELDB: case StorageType::DRAM_LEVELDB: - return new DramLevelDBStore(sc, ev_allocator(), - layout_creator, name); + return new DramLevelDBStore(sc, feat_desc, name); case StorageType::SSDHASH: case StorageType::DRAM_SSDHASH: - return new DramSsdHashStorage(sc, ev_allocator(), - layout_creator, name); + return new DramSsdHashStorage(sc, feat_desc, name); case StorageType::HBM: #if GOOGLE_CUDA - return new HbmStorage(sc, gpu_allocator, - layout_creator); + return new HbmStorage(sc, gpu_allocator, feat_desc); #endif // GOOGLE_CUDA case StorageType::HBM_DRAM: #if GOOGLE_CUDA - return new HbmDramStorage(sc, gpu_allocator, - ev_allocator(), layout_creator, name); + return new HbmDramStorage(sc, gpu_allocator, feat_desc, name); #endif // GOOGLE_CUDA case StorageType::HBM_DRAM_SSDHASH: #if GOOGLE_CUDA - return new HbmDramSsdStorage(sc, gpu_allocator, - ev_allocator(), layout_creator, name); + return new HbmDramSsdStorage(sc, gpu_allocator, feat_desc, name); #endif // GOOGLE_CUDA default: - return new DramStorage(sc, ev_allocator(), - layout_creator, new LocklessHashMap()); + return new DramStorage(sc, feat_desc); } } }; diff --git a/tensorflow/core/framework/embedding/value_ptr.h b/tensorflow/core/framework/embedding/value_ptr.h deleted file mode 100644 index ca7d234ed61..00000000000 --- a/tensorflow/core/framework/embedding/value_ptr.h +++ /dev/null @@ -1,647 +0,0 @@ -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ - -#include -#include -#include -#include - -#include "tensorflow/core/framework/typed_allocator.h" -#if GOOGLE_CUDA -#include -#endif // GOOGLE_CUDA - -namespace tensorflow { - -enum class LayoutType { - LIGHT, - NORMAL, - LEVELDB, - NORMAL_CONTIGUOUS, - NORMAL_CONTIGUOUS_GPU, - COMPACT, -}; - -namespace { -constexpr int COLUMN_BITSET_BYTES = 5; -constexpr int COLUMN_BITSET_SIZE = COLUMN_BITSET_BYTES * 8; - -struct MetaHeader { - unsigned char embed_num; - unsigned char value_type; - unsigned 
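After this rewrite every storage flavor is constructed from one externally supplied FeatureDescriptor, with the PMEM variants merely swapping its allocator first. A hedged call-site sketch, assuming the DeepRec tree; template arguments and the argument values are illustrative, the parameter roles are read off the test helper later in this patch:

```
// Build the descriptor once, then let the factory pick the storage class.
auto* feat_desc = new embedding::FeatureDescriptor<float>(
    /*block_num=*/1, /*slot_num=*/1, ev_allocator(),
    embedding::StorageType::DRAM,
    /*record_freq=*/false, /*save_version=*/false,
    /*filter_info=*/{false, 0});
auto* storage = embedding::StorageFactory::Create<int64, float>(
    embedding::StorageConfig(), cpu_allocator(), feat_desc, "EmbeddingVar");
```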
char header_size; - unsigned char column_bitset[COLUMN_BITSET_BYTES]; - - static const int kEmbeddingNumStartIndex = 0; - static const int kValueTypeStartIndex = - kEmbeddingNumStartIndex + sizeof(char); - static const int kHeaderSizeStartIndex = - kValueTypeStartIndex + sizeof(char); - static const int kColumnBitsetIndex = - kHeaderSizeStartIndex + sizeof(char); - - inline unsigned int GetEmbeddingNum() { - return (unsigned int) embed_num; - } - - inline void SetEmbeddingNum(size_t s) { - embed_num = (unsigned char)s; - } - - inline std::bitset GetColumnBitset() { - unsigned long meta = ((unsigned long*)this)[0]; - std::bitset bs(meta >> (8 * kColumnBitsetIndex)); - return bs; - } - - inline void SetColumnBitset(const std::bitset& bs, - unsigned int embnum) { - ((unsigned long*)(this))[0] = - (bs.to_ulong() << (8 * kColumnBitsetIndex)) | - (header_size << (8 * kHeaderSizeStartIndex)) | - (value_type << (8 * kValueTypeStartIndex)) | - (embnum << (8 * kEmbeddingNumStartIndex)); - } - - inline unsigned int GetHeaderSize() { - return (unsigned int) header_size; - } - - inline void SetHeaderSize(size_t size) { - header_size = (unsigned char)size; - } - - inline void SetLayoutType(LayoutType vt) { - value_type = (unsigned char)vt; - } - - inline LayoutType GetLayoutType() { - return (LayoutType)value_type; - } -}; - -struct LightHeader { -/*__________________________________________________________________________________________ - | | | | | embedding | slot | - | number of | valueptr | header | each bit a V* | V* | V* | - | embedding | type | size | 1 valid | actually pointer | actually pointer |... - | columns | | | 0 no-valid | by alloctor | by alloctor | - | (8 bits) | (8 bits) | (8 bits) | (40 bits) | (8 bytes) | (8 bytes) | - -------------------------------------------------------------------------------------------- -*/ - MetaHeader meta; - LightHeader() { - memset(this, 0, sizeof(LightHeader)); - meta.SetLayoutType(LayoutType::LIGHT); - meta.SetHeaderSize(sizeof(LightHeader) / sizeof(int64)); - } -}; - -struct NormalHeader { -/*_________________________________________________________________________________________________________________________ - | | | | | | | embedding | slot | - | number of | valueptr | header | each bit a V* | global step | freq counter | V* | V* | - | embedding | type | size | 1 valid | | | actually pointer | actually pointer |... 
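The deleted MetaHeader squeezes four fields into one 64-bit word: byte 0 holds the embedding count, byte 1 the layout type, byte 2 the header size, and bytes 3-7 a 40-bit per-slot validity bitset. A compilable re-creation of the Set/GetColumnBitset round trip, kept for reference now that the type is gone:

```
#include <bitset>
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t embed_num = 3, value_type = 1, header_size = 1;
  std::bitset<40> bs;              // COLUMN_BITSET_SIZE = 5 bytes * 8
  bs.set(0); bs.set(2);
  // SetColumnBitset: kColumnBitsetIndex = 3, so the bitset starts at bit 24.
  uint64_t word = (bs.to_ullong() << 24) | (header_size << 16) |
                  (value_type << 8) | embed_num;
  // GetColumnBitset: shift the bitset back down.
  std::bitset<40> out(word >> 24);
  assert(out.test(0) && out.test(2) && !out.test(1));
  assert((word & 0xff) == embed_num);
}
```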
- | columns | | | 0 no-valid | int64 | int64 | by alloctor | by alloctor | - | (8 bits) | (8 bits) | (8 bits) | (40 bits) | (8 bytes) | (8 bytes) | (8 bytes) | (8 bytes) | - -------------------------------------------------------------------------------------------------------------------------- - */ - MetaHeader meta; - int64 global_step; - int64 freq_counter; - - NormalHeader() { - memset(this, 0, sizeof(NormalHeader)); - meta.SetLayoutType(LayoutType::NORMAL); - meta.SetHeaderSize(sizeof(NormalHeader) / sizeof(int64)); - SetGlobalStep(-1); - } - - inline int64 GetGlobalStep() { - return global_step; - } - - inline void SetGlobalStep(int64 gs) { - global_step = gs; - } - - inline int64 GetFreqCounter() { - return freq_counter; - } - - inline void SetFreqCounter(int64 fc) { - freq_counter = fc; - } - - inline void AddFreq() { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + 1); - } - - inline void AddFreq(int64 count) { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + count); - } -}; - -struct FixedLengthHeader { -/*_________________________________________________________________________________ - | | | embeddings | - | slotflag + global step | freq counter | V | - | | | actually value | - | int64 | int64 | by alloctor | - | (8 bytes) | (8 bytes) | (4 * slot_num * emb_dim bytes) | - --------------------------------------------------------------------------------- -*/ - int64 global_step; - int64 freq_counter; - - FixedLengthHeader() { - memset(this, 0, sizeof(FixedLengthHeader)); - SetGlobalStep(-1); - } - - inline int64 GetGlobalStep() { - return global_step & 0x0000ffffffffffff; - } - - inline void SetGlobalStep(int64 gs) { - int64 temp = global_step; - temp &= 0xffff000000000000; - gs &= 0x0000ffffffffffff; - temp |= gs; - global_step = temp; - } - - inline void SetInitialized(int64 emb_index) { - int64 temp = 1; - temp = temp << (48 + emb_index); - global_step |= temp; - } - - inline int64 GetFreqCounter() { - return freq_counter; - } - - inline void SetFreqCounter(int64 fc) { - freq_counter = fc; - } - - inline void AddFreq() { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + 1); - } - - inline void AddFreq(int64 count) { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + count); - } -}; -} // namespace - -template -class ValuePtr { - public: - virtual ~ValuePtr() {} - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) = 0; - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) = 0; - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) = 0; - - virtual void Destroy(Allocator* allocator) = 0; - - virtual void* GetPtr() const = 0; - - // Global Step - virtual int64 GetStep() { - LOG(FATAL) << "Unsupport GlobalStep in subclass of ValuePtrBase"; - return 0; - } - - virtual void SetStep(int64 gs) {} - - // Frequency Counter - virtual int64 GetFreq() { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - return 0; - } - - virtual void SetFreq(int64 freq) {} - - virtual void AddFreq() { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - } - - virtual void AddFreq(int64 count) { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - } - - virtual void SetValue(V val, size_t size) { - LOG(FATAL) << "Unsupport SetValue in subclass of ValuePtrBase"; - 
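FixedLengthHeader above multiplexes a single int64: the low 48 bits carry the global step, the top 16 carry one "initialized" flag per embedding slot. A standalone check that SetGlobalStep preserves the flag bits, matching the masks in the deleted code:

```
#include <cassert>
#include <cstdint>

int main() {
  int64_t word = 0;
  constexpr int64_t kStepMask = 0x0000ffffffffffffLL;
  auto set_step = [&](int64_t gs) {
    word = (word & ~kStepMask) | (gs & kStepMask);
  };
  auto set_initialized = [&](int emb_index) {
    word |= int64_t{1} << (48 + emb_index);
  };
  set_step(-1);                       // the constructor's sentinel value
  set_initialized(2);                 // flag for slot 2 lives at bit 50
  set_step(41);
  assert((word & kStepMask) == 41);   // what GetGlobalStep() returns
  assert((word >> 50) & 1);           // slot-2 flag survived the update
}
```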
} - - virtual void SetInitialized(int64 emb_index) { - LOG(FATAL) << "Unsupport SetInitialized in subclass of ValuePtrBase"; - } - - virtual bool SetPtr(V* ptr) { - LOG(FATAL) << "Unsupport SetInitialized in subclass of ValuePtrBase"; - return false; - } - -}; - -template -class LooseValuePtr : public ValuePtr { - public: - virtual ~LooseValuePtr() {} - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) { - MetaHeader* meta = (MetaHeader*)ptr_; - unsigned int embnum = (unsigned int)meta->embed_num; - auto metadata = meta->GetColumnBitset(); - - if (!metadata.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - metadata = meta->GetColumnBitset(); - if (metadata.test(emb_index)) { - this->flag_.clear(std::memory_order_release); - return ((V**)((int64*)ptr_ + - (unsigned int)meta->header_size))[emb_index]; - } - embnum++ ; - int64 alloc_value_len = value_len; - V* tensor_val = (V*)allocator->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V) * alloc_value_len); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index] = tensor_val; - - metadata.set(emb_index); - // NOTE:if we use ((unsigned long*)((char*)ptr_ + 1))[0] = metadata.to_ulong(); - // the ptr_ will be occaionally modified from 0x7f18700912a0 to 0x700912a0 - // must use ((V**)ptr_ + 1 + 1)[emb_index] = tensor_val; to avoid - meta->SetColumnBitset(metadata, embnum); - this->flag_.clear(std::memory_order_release); - return tensor_val; - } else { - return ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index]; - } - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) { - return nullptr; - } - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) { - MetaHeader* meta = (MetaHeader*)ptr_; - auto metadata = meta->GetColumnBitset(); - if (metadata.test(emb_index)) { - return ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index]; - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - MetaHeader* meta = (MetaHeader*)ptr_; - unsigned int embnum = (unsigned int)meta->embed_num; - auto metadata = meta->GetColumnBitset(); - for (int i = 0; i< embnum; i++) { - if (metadata.test(i)) { - V* val = ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[i]; - if (val != nullptr) { - allocator->DeallocateRaw(val); - } - } - } - } - - virtual void* GetPtr() const { - return ptr_; - } - - protected: - void* ptr_; - std::atomic_flag flag_ = ATOMIC_FLAG_INIT; -}; - -template -class LightValuePtr : public LooseValuePtr { - public: - LightValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*)malloc( - sizeof(LightHeader) + sizeof(int64) * size); - memset(static_cast(this->ptr_) + sizeof(LightHeader), 0, sizeof(int64) * size); - new ((char*)this->ptr_) LightHeader(); - } - - ~LightValuePtr() { - free(this->ptr_); - } -}; - -template -class NormalValuePtr : public LooseValuePtr { - public: - NormalValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*) malloc(sizeof(NormalHeader) + sizeof(int64) * size); - memset(static_cast(this->ptr_) + sizeof(NormalHeader), 0, sizeof(int64) * size); - new ((char*)this->ptr_) NormalHeader(); - } - - ~NormalValuePtr() { - free(this->ptr_); - } - - int64 GetStep() { - return ((NormalHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - 
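LooseValuePtr::GetOrAllocate above implements allocate-once-per-slot with an atomic_flag spin lock plus a re-check of the validity bit inside the critical section. The shape of that pattern, reduced to one slot with an int payload; this is a sketch that keeps the patch's unlocked fast-path read for fidelity, where production code would prefer std::atomic<int*>:

```
#include <atomic>

struct OneSlot {
  std::atomic_flag lock = ATOMIC_FLAG_INIT;
  int* value = nullptr;  // stands in for the per-slot V* in the patch

  int* GetOrAllocate(int init) {
    if (value != nullptr) return value;                       // fast path
    while (lock.test_and_set(std::memory_order_acquire)) {}   // spin
    if (value == nullptr) {                                   // double check
      value = new int(init);                                  // allocate + publish
    }
    lock.clear(std::memory_order_release);
    return value;
  }
};
```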
((NormalHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((NormalHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((NormalHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - return ((NormalHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - return ((NormalHeader*)this->ptr_)->AddFreq(count); - } -}; - -template -class NormalContiguousValuePtr : public LooseValuePtr { - public: - NormalContiguousValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = allocator->AllocateRaw(Allocator::kAllocatorAlignment, - sizeof(FixedLengthHeader) + sizeof(V) * size); - memset(static_cast(this->ptr_) + sizeof(FixedLengthHeader), 0, sizeof(V) * size); - new ((char*)this->ptr_) FixedLengthHeader(); - } - - ~NormalContiguousValuePtr() { - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset); - } - V* tensor_val = - ((V*)this->ptr_ + sizeof(FixedLengthHeader) / sizeof(V) + offset); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - return tensor_val; - } else { - return (V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset; - } - } - - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset); - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - allocator->DeallocateRaw(this->ptr_); - } - - int64 GetStep() { - return ((FixedLengthHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((FixedLengthHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((FixedLengthHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((FixedLengthHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - ((FixedLengthHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - ((FixedLengthHeader*)this->ptr_)->AddFreq(count); - } - - void SetValue(V val, size_t size) { - for (int i = 0; i < size; ++i) { - *((V*)this->ptr_ + sizeof(FixedLengthHeader) / sizeof(V) + i) = val; - } - } -}; - -template -class NormalGPUValuePtr : public LooseValuePtr { - public: - NormalGPUValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*) malloc(sizeof(FixedLengthHeader) + sizeof(V *)); - *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) = nullptr; - new ((char*)this->ptr_) FixedLengthHeader(); - } - - ~NormalGPUValuePtr() { - free(this->ptr_); - } - -#if GOOGLE_CUDA - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - V* tensor_val = - *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - 
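NormalContiguousValuePtr above lays the header and all slot values in one allocation, so a slot's data sits at `base + sizeof(FixedLengthHeader)/sizeof(V) + offset`. A standalone sketch of that arithmetic; with V = float the header occupies the first four floats, which is also why the SSD tests later in this patch switch from reading `v[4+j]` to `v[j]` once headers move out of the raw value buffer:

```
#include <cstdint>

struct FixedLengthHeader { int64_t global_step, freq_counter; };  // 16 bytes

// Address of slot `emb_index`'s values inside one contiguous allocation,
// matching `(V*)ptr_ + sizeof(FixedLengthHeader)/sizeof(V) + offset`
// with offset = alloc_len * emb_index.
float* SlotValues(void* base, int emb_index, int alloc_len) {
  return reinterpret_cast<float*>(base) +
         sizeof(FixedLengthHeader) / sizeof(float) +  // skip 4 header floats
         alloc_len * emb_index;
}
```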
cudaMemcpy(tensor_val, default_v, value_len * sizeof(V), - cudaMemcpyDeviceToDevice); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - } - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } -#endif // GOOGLE_CUDA - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, - bool &need_initialize) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - need_initialize = 1; - this->flag_.clear(std::memory_order_release); - return reinterpret_cast(this); - } - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - return; - } - - int64 GetStep() { - return ((FixedLengthHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((FixedLengthHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((FixedLengthHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((FixedLengthHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - ((FixedLengthHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - ((FixedLengthHeader*)this->ptr_)->AddFreq(count); - } - - bool SetPtr(V* ptr) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - V* value_ptr = *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)); - if (value_ptr == nullptr) { - *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) = ptr; - this->flag_.clear(std::memory_order_release); - return true; - } else { - this->flag_.clear(std::memory_order_release); - return false; - } - } - - void SetInitialized(int64 emb_index) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - ((FixedLengthHeader*)this->ptr_)->SetInitialized(emb_index); - this->flag_.clear(std::memory_order_release); - } - -}; - -template -class CompactValuePtr : public ValuePtr { - public: - CompactValuePtr(Allocator* allocator, size_t size) { - memset(static_cast(this->ptr_), 0, sizeof(V) * size + sizeof(int64)); - } - - ~CompactValuePtr() { - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset); - } - V* tensor_val = - ((V*)this->ptr_ + sizeof(int64) / sizeof(V) + offset); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - return tensor_val; - } else { - return (V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset; - } - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) { - 
return nullptr; - } - - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset); - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - allocator->DeallocateRaw(this->ptr_); - } - - virtual void* GetPtr() const { - return (void*)ptr_; - } - - private: - char ptr_[23]; - std::atomic_flag flag_ = ATOMIC_FLAG_INIT; -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 115e3c4bae6..0c08c30c30a 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -439,7 +439,8 @@ tf_cc_test( tf_cuda_cc_test( name = "embedding_variable_ops_test", - srcs = ["embedding_variable_ops_test.cc"], + srcs = ["embedding_variable_ops_test.cc", + "embedding_variable_test.h"], extra_copts = ["-fexceptions", "-g"], deps = [ ":io", @@ -6497,7 +6498,7 @@ tf_kernel_library( "training_ali_ops_gpu.h", "training_ali_ops.h" ], - copts = tf_copts(), + copts = tf_copts() + ["-g"], deps = [ ":bounds_check", ":training_op_helpers", diff --git a/tensorflow/core/kernels/embedding_variable_memory_test.cc b/tensorflow/core/kernels/embedding_variable_memory_test.cc index 7ec6b1cf109..393e9a9754b 100644 --- a/tensorflow/core/kernels/embedding_variable_memory_test.cc +++ b/tensorflow/core/kernels/embedding_variable_memory_test.cc @@ -19,17 +19,22 @@ namespace embedding { float PerfMemory(Tensor& default_value, const std::vector& id_list, int value_size, int64 default_value_dim, - int64 filter_freq = 0) { + int64 filter_freq = 0, int64 steps_to_live = 0, + int64 record_freq = false) { auto ev = CreateEmbeddingVar(value_size, default_value, - default_value_dim, filter_freq); - ValuePtr* value_ptr = nullptr; + default_value_dim, filter_freq, + steps_to_live, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + record_freq); + void* value_ptr = nullptr; bool is_filter = false; double start_mem, end_mem; start_mem = getResident() * getpagesize(); for (int i = 0; i < id_list.size(); i++) { ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); if (is_filter) - ev->flat(value_ptr, id_list[i]); + ev->flat(value_ptr); } end_mem = getResident() * getpagesize(); double used_mb = (end_mem - start_mem)/1000000; @@ -58,7 +63,7 @@ TEST(EmbeddingVariabelMemoryTest, TestMemory) { float used_mb = PerfMemory(default_value, id_list, value_size, default_value_dim); float theoritical_mb = - 50 + num_of_ids * (32 + 32 + value_size * sizeof(float))/ 1000000; + 50 + num_of_ids * (value_size * sizeof(float)) / 1000000; EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && (used_mb < theoritical_mb * 1.01)); @@ -68,9 +73,10 @@ TEST(EmbeddingVariabelMemoryTest, TestMemory) { used_mb = PerfMemory(default_value, id_list, value_size, default_value_dim, filter_freq); theoritical_mb = - 50 + num_of_ids * (32 + 32 + 16 + value_size * sizeof(float)/2)/ 1000000; + 50 + num_of_ids * (8 + value_size * sizeof(float) / 2 + + 4/*memory for ids_list*/) / 1000000; EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && - (used_mb < theoritical_mb * 1.01)); + (used_mb < theoritical_mb * 1.02)); } } //namespace embedding } //namespace tensorflow diff --git a/tensorflow/core/kernels/embedding_variable_ops_test.cc b/tensorflow/core/kernels/embedding_variable_ops_test.cc index 4839c171708..e30381fef07 100644 --- 
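The revised expected-memory formulas in the test above are easier to follow as a worked instance. The halved payload term presumably reflects the frequency filter materializing roughly half of the ids; num_of_ids and value_size below are hypothetical, since the test's real constants are defined outside this hunk:

```
#include <cstdio>

int main() {
  const double num_of_ids = 1e6, value_size = 64;  // hypothetical values
  // Unfiltered: 50 MB baseline plus one float row per id.
  double full_mb = 50 + num_of_ids * (value_size * sizeof(float)) / 1e6;
  // Filtered: 8 bytes of filter state per id, roughly half the rows
  // materialized, plus 4 bytes of bookkeeping for the id list.
  double filtered_mb =
      50 + num_of_ids * (8 + value_size * sizeof(float) / 2 + 4) / 1e6;
  std::printf("full: %.1f MB, filtered: %.1f MB\n", full_mb, filtered_mb);
}
```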
a/tensorflow/core/kernels/embedding_variable_ops_test.cc +++ b/tensorflow/core/kernels/embedding_variable_ops_test.cc @@ -21,6 +21,7 @@ #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/embedding_variable_test.h" #include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/io/path.h" @@ -48,18 +49,6 @@ namespace { const int THREADNUM = 16; const int64 max = 2147483647; -template -class TestableEmbeddingVar : public EmbeddingVar { - public: - TestableEmbeddingVar(const string& name, - embedding::Storage* storage, - EmbeddingConfig emb_cfg = EmbeddingConfig(), - Allocator* alloc = nullptr) : EmbeddingVar( - name, storage, emb_cfg, alloc) {} - - using EmbeddingVar::GetFilter; -}; - struct ProcMemory { long size; // total program size long resident; // resident set size @@ -123,11 +112,7 @@ TEST(EmbeddingVariableTest, TestEmptyEV) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); { - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); LOG(INFO) << "size:" << variable->Size(); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); @@ -191,19 +176,14 @@ TEST(EmbeddingVariableTest, TestEVExportSmallLockless) { int64 value_size = 8; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddigVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(0, 0, 1, 1, "", 5), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); for (int64 i = 0; i < 5; i++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); vflat(i) = 5.0; } @@ -269,20 +249,15 @@ TEST(EmbeddingVariableTest, TestEVExportLargeLockless) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(0, 0, 1, 1, "", 5), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); int64 ev_size = 10048576; for (int64 i = 0; i < ev_size; i++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); } LOG(INFO) << "size:" << variable->Size(); @@ -344,9 +319,9 @@ TEST(EmbeddingVariableTest, TestEVExportLargeLockless) { void multi_insertion(EmbeddingVar* variable, 
int64 value_size){ for (long j = 0; j < 5; j++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(j, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, j); + typename TTypes::Flat vflat = variable->flat(value_ptr); } } @@ -355,12 +330,7 @@ TEST(EmbeddingVariableTest, TestMultiInsertion) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); std::vector insert_threads(THREADNUM); for (size_t i = 0 ; i < THREADNUM; i++) { @@ -375,54 +345,45 @@ TEST(EmbeddingVariableTest, TestMultiInsertion) { void InsertAndLookup(EmbeddingVar* variable, int64 *keys, long ReadLoops, int value_size){ - float *default_value_fake = (float *)malloc((value_size)*sizeof(float)); - for (int j = 0; j < value_size; j++) { - default_value_fake[j] = -1.0; - } for (long j = 0; j < ReadLoops; j++) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - float *default_value = (float *)malloc((value_size)*sizeof(float)); - for (int k = 0; k < value_size; k++) { - default_value[k] = (float)keys[j]; - } - variable->LookupOrCreate(keys[j], val, default_value); - variable->LookupOrCreate(keys[j], val, default_value_fake); - ASSERT_EQ(default_value[0] , val[0]); - free(val); - free(default_value); + void* val = nullptr; + void* val_1 = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(keys[j], &val, &is_filter, false); + variable->LookupOrCreateKey(keys[j], &val_1, &is_filter, false); + ASSERT_EQ(val, val_1); } - free(default_value_fake); } void MultiBloomFilter(EmbeddingVar* var, int value_size, int64 i) { for (long j = 0; j < 1; j++) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - var->LookupOrCreate(i+1, val, nullptr); + void* val = nullptr; + bool is_filter = true; + var->LookupOrCreateKey(i+1, &val, &is_filter, false); } } TEST(EmbeddingVariableTest, TestBloomFilter) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); - test::FillValues(&value, std::vector(value_size, 10.0)); - float* fill_v = (float*)malloc(value_size * sizeof(float)); - - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, "normal", 10, 0.01), - cpu_allocator()); - - var->Init(value, 1); - - float *val = (float *)malloc((value_size+1)*sizeof(float)); - float *default_value = (float *)malloc((value_size+1)*sizeof(float)); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(2, val, default_value); + std::vector default_value = + {0.0 ,1.0 ,2.0 ,3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; + test::FillValues(&value, default_value); + + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01); + + //float *val = (float *)malloc((value_size+1)*sizeof(float)); + void* val = nullptr; + bool is_filter = true; + 
var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(2, &val, &is_filter, false); std::vector keylist; std::vector valuelist; @@ -437,14 +398,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt64) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal", 10, 0.01, DT_UINT64), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT64); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -509,14 +467,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt32) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal", 10, 0.01, DT_UINT32), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT32); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -581,14 +536,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt16) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal_contiguous", 10, 0.01, DT_UINT16), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT16); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -654,14 +606,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt8) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal_contiguous", 10, 0.01, DT_UINT8), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT8); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -725,12 +674,7 @@ TEST(EmbeddingVariableTest, TestInsertAndLookup) { int64 value_size = 128; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), 
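The bloom-counter tests above all funnel through the widened CreateEmbeddingVar helper, whose trailing positional arguments are easy to misread. The same call annotated, with each role read off the helper's signature later in this patch:

```
auto var = CreateEmbeddingVar(
    value_size, value,
    /*default_value_dim=*/1,
    /*filter_freq=*/3,
    /*steps_to_live=*/5,
    /*l2_weight_threshold=*/-1.0,
    embedding::StorageType::DRAM,
    /*storage_size=*/{1024, 1024, 1024, 1024},
    /*record_freq=*/false,
    /*max_element_size=*/10,
    /*false_positive_probability=*/0.01,
    /*counter_type=*/DT_UINT64);
```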
cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); int64 InsertLoops = 1000; bool* flag = (bool *)malloc(sizeof(bool)*max); @@ -765,8 +709,9 @@ TEST(EmbeddingVariableTest, TestInsertAndLookup) { } void MultiFilter(EmbeddingVar* variable, int value_size) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - variable->LookupOrCreate(20, val, nullptr); + bool is_filter = true; + void* val; + variable->LookupOrCreateKey(20, &val, &is_filter, false); } TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { @@ -774,14 +719,8 @@ TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 7), - cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1, 7, 5); + float *val = (float *)malloc((value_size+1)*sizeof(float)); int thread_num = 5; std::vector insert_threads(thread_num); @@ -792,20 +731,16 @@ TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { t.join(); } - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; var->LookupOrCreateKey(20, &value_ptr); - ASSERT_EQ(value_ptr->GetFreq(), thread_num); + ASSERT_EQ(var->GetFreq(20), thread_num); } EmbeddingVar* InitEV_Lockless(int64 value_size) { Tensor value(DT_INT64, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); + auto variable = CreateEmbeddingVar(value_size, value, 1); - variable->Init(value, 1); return variable; } @@ -813,7 +748,7 @@ void MultiLookup(EmbeddingVar* variable, int64 InsertLoop, int thread_num, int i) { for (int64 j = i * InsertLoop/thread_num; j < (i+1)*InsertLoop/thread_num; j++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(j, &value_ptr); } } @@ -829,9 +764,9 @@ void BM_MULTIREAD_LOCKLESS(int iters, int thread_num) { float* fill_v = (float*)malloc(value_size * sizeof(float)); for (int64 i = 0; i < InsertLoop; i++){ - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); } testing::StartTiming(); @@ -848,58 +783,6 @@ void BM_MULTIREAD_LOCKLESS(int iters, int thread_num) { } -void hybrid_process(EmbeddingVar* variable, - int64* keys, int64 InsertLoop, int thread_num, - int64 i, int64 value_size) { - float *val = (float *)malloc(sizeof(float)*(value_size + 1)); - for (int64 j = i * InsertLoop/thread_num; - j < (i+1) * InsertLoop/thread_num; j++) { - variable->LookupOrCreate(keys[j], val, nullptr); - } -} - -void BM_HYBRID_LOCKLESS(int iters, int thread_num) { - testing::StopTiming(); - testing::UseRealTime(); - - int64 value_size = 128; - auto variable = InitEV_Lockless(value_size); - int64 InsertLoop = 1000000; - - srand((unsigned)time(NULL)); - int64 *keys = (int64 
*)malloc(sizeof(int64)*InsertLoop); - - for (int64 i = 0; i < InsertLoop; i++) { - keys[i] = rand() % 1000; - } - - testing::StartTiming(); - while (iters--) { - std::vector insert_threads(thread_num); - for (size_t i = 0 ; i < thread_num; i++) { - insert_threads[i] = std::thread(hybrid_process, - variable, keys, InsertLoop, thread_num, i, value_size); - } - for (auto &t : insert_threads) { - t.join(); - } - } -} - -BENCHMARK(BM_MULTIREAD_LOCKLESS) - ->Arg(1) - ->Arg(2) - ->Arg(4) - ->Arg(8) - ->Arg(16); - -BENCHMARK(BM_HYBRID_LOCKLESS) - ->Arg(1) - ->Arg(2) - ->Arg(4) - ->Arg(8) - ->Arg(16); - TEST(EmbeddingVariableTest, TestAllocate) { int value_len = 8; @@ -923,23 +806,13 @@ TEST(EmbeddingVariableTest, TestEVStorageType_DRAM) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(/*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */1, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); int64 ev_size = 100; for (int64 i = 0; i < ev_size; i++) { - variable->LookupOrCreate(i, fill_v, nullptr); + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(i, &val, &is_filter, false); } LOG(INFO) << "size:" << variable->Size(); @@ -947,59 +820,20 @@ TEST(EmbeddingVariableTest, TestEVStorageType_DRAM) { void t1(KVInterface* hashmap) { for (int i = 0; i< 100; ++i) { - hashmap->Insert(i, new NormalValuePtr(ev_allocator(), 100)); + hashmap->Insert(i, nullptr); } } TEST(EmbeddingVariableTest, TestRemoveLockless) { - KVInterface* hashmap = new LocklessHashMap(); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(t1, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} - -TEST(EmbeddingVariableTest, TestBatchCommitofDBKV) { - int64 value_size = 4; + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM, + false, false, {false, 0}); KVInterface* hashmap = - new LevelDBKV(testing::TmpDir()); - hashmap->SetTotalDims(value_size); - - for (int64 i = 0; i < 6; ++i) { - const ValuePtr* tmp = - new NormalContiguousValuePtr(ev_allocator(), value_size); - hashmap->Commit(i, tmp); - } - - for(int64 i = 0; i < 6; i++) { - ValuePtr* tmp = nullptr; - Status s = hashmap->Lookup(i, &tmp); - ASSERT_EQ(s.ok(), true); - } -} - -void InsertAndCommit(KVInterface* hashmap) { - for (int64 i = 0; i< 100; ++i) { - const ValuePtr* tmp = - new NormalContiguousValuePtr(ev_allocator(), 100); - hashmap->Insert(i, tmp); - hashmap->Commit(i, tmp); - } -} - -TEST(EmbeddingVariableTest, TestSizeDBKV) { - KVInterface* hashmap = - new LevelDBKV(testing::TmpDir()); - hashmap->SetTotalDims(100); + new LocklessHashMap(feat_desc); + feat_desc->InitSlotInfo(0, 100, {nullptr, 1}); 
ASSERT_EQ(hashmap->Size(), 0); LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(InsertAndCommit, hashmap); + auto t = std::thread(t1, hashmap); t.join(); LOG(INFO) << "hashmap size: " << hashmap->Size(); ASSERT_EQ(hashmap->Size(), 100); @@ -1190,213 +1024,6 @@ TEST(EmbeddingVariableTest, TestLFUCache) { } } -TEST(EmbeddingVariableTest, TestCacheRestore) { - setenv("TF_SSDHASH_ASYNC_COMPACTION", "false", 1); - int64 value_size = 4; - Tensor value(DT_FLOAT, TensorShape({value_size})); - test::FillValues(&value, std::vector(value_size, 9.0)); - float* fill_v = (float*)malloc(value_size * sizeof(float)); - std::vector size; - size.emplace_back(64); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal_contiguous", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage= embedding::StorageFactory::Create( - embedding::StorageConfig(embedding::DRAM_SSDHASH, - testing::TmpDir(), - size, "normal_contiguous", - emb_config), - cpu_allocator(), - "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, emb_config, cpu_allocator()); - variable->Init(value, 1); - variable->InitCache(CacheStrategy::LFU); - - Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); - - int64 ev_size = 7; - int64 cache_size = 3; - for (int64 i = 1; i < cache_size; i++) { - ValuePtr* value_ptr = nullptr; - variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); - value_ptr->AddFreq(2); - } - for (int64 i = cache_size; i < ev_size; i++) { - ValuePtr* value_ptr = nullptr; - variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); - value_ptr->AddFreq(1); - } - - LOG(INFO) << "size:" << variable->Size(); - - BundleWriter writer(Env::Default(), Prefix("foo")); - embedding::ShrinkArgs shrink_args; - shrink_args.global_step = 1; - variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args); - TF_ASSERT_OK(writer.Finish()); - variable->Unref(); - - auto imported_storage= embedding::StorageFactory::Create( - embedding::StorageConfig(embedding::DRAM_SSDHASH, - testing::TmpDir(), - size, "normal_contiguous", - emb_config), - cpu_allocator(), - "EmbeddingVar1"); - auto imported_variable = new EmbeddingVar("EmbeddingVar1", - imported_storage, emb_config, cpu_allocator()); - imported_variable->Init(value, 1); - imported_variable->InitCache(CacheStrategy::LFU); - - BundleReader reader(Env::Default(), Prefix("foo")); - std::string name_string("var"); - imported_variable->Restore(name_string, Prefix("foo"), 0, 1, false, &reader, false); - - ASSERT_EQ(imported_storage->Size(0), ev_size - cache_size); - ASSERT_EQ(imported_storage->Size(1), 2); - delete imported_storage; -} - -void t1_gpu(KVInterface* hashmap) { - for (int i = 0; i< 100; ++i) { - hashmap->Insert(i, new NormalGPUValuePtr(ev_allocator(), 100)); - } -} - -#if GOOGLE_CUDA -TEST(EmbeddingVariableTest,TestRemoveLocklessCPU) { - SessionOptions sops; - std::unique_ptr device = - DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0"); - Allocator* gpu_allocator = GPUProcessState::singleton()->GetGPUAllocator( - GPUOptions(), TfGpuId(0), 1 << 26); - KVInterface* hashmap = - new LocklessHashMapCPU(gpu_allocator); - 
ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(t1, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} -#endif // GOOGLE_CUDA - -/*void CommitGPU(KVInterface* hashmap) { - for (int64 i = 0; i< 100; ++i) { - ValuePtr* tmp= new NormalGPUValuePtr(ev_allocator(), 100); - hashmap->Commit(i, tmp); - } -} - -TEST(EmbeddingVariableTest, TestCommitHashMapCPU) { - KVInterface* hashmap = new LocklessHashMapCPU(); - hashmap->SetTotalDims(100); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(CommitGPU, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} - -TEST(EmbeddingVariableTest, TestGPUValuePtr) { - int ev_list_size = 32; - ValuePtr* ptr_ = new NormalGPUValuePtr(ev_allocator(), ev_list_size); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float host_data[ev_list_size]; - float initial_data[ev_list_size]; - for(int i = 0;i < ev_list_size;++i){ - initial_data[i] = 10; - } - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << initial_data[i]; - } - cudaMemcpy(address, initial_data, ev_list_size * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(host_data, address, ev_list_size * sizeof(float), cudaMemcpyDeviceToHost); - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << host_data[i]; - } -}//Forbidden, due to no gpu allocator at that time - -TEST(EmbeddingVariableTest, TestCommitValue) { - int ev_list_size = 32; - ValuePtr* ptr_ = new NormalGPUValuePtr(ev_allocator(),ev_list_size); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float initial_data[ev_list_size]; - for(int i = 0;i < ev_list_size;++i){ - initial_data[i] = 10; - } - cudaMemcpy(address, initial_data, ev_list_size * sizeof(float), cudaMemcpyHostToDevice); - KVInterface* hashmap = new LocklessHashMapCPU(); - hashmap->SetTotalDims(ev_list_size); - hashmap->Commit(1, ptr_); - ValuePtr* check; - hashmap->Lookup(1,&check); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - float* tmp = (float *)((char *)check->GetPtr() + sizeof(FixedLengthHeader)); - - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << tmp[i]; - //ASSERT_EQ(tmp[i], 10); - }// -} - -TEST(EmbeddingVariableTest, TestBatchCommitofLocklessHashMapCPU) { - KVInterface* hashmap = new LocklessHashMapCPU(); - const int EmbeddingSize = 16; - const int BatchSize = 16; - - hashmap->SetTotalDims(EmbeddingSize); - std::vector*> value_ptr_list; - std::vector key_list; - - for(int64 i = 0; i < BatchSize; i++) { - key_list.emplace_back(i); - ValuePtr* ptr_ = new NormalGPUValuePtr(EmbeddingSize); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float initial_data[EmbeddingSize]; - for(int j = 0;j < EmbeddingSize;++j){ - initial_data[j] = i; - //LOG(INFO) << "initial[" << i << "][" << j << "]=" << initial_data[j]; - } - cudaMemcpy(address, initial_data, EmbeddingSize * sizeof(float), cudaMemcpyHostToDevice); - value_ptr_list.emplace_back(ptr_); - }//initialize V on GPU - - timespec start,end; - 
clock_gettime(CLOCK_MONOTONIC, &start); - hashmap->BatchCommit(key_list, value_ptr_list); - clock_gettime(CLOCK_MONOTONIC, &end); - std::cout << "time: " << ((double)(end.tv_sec - start.tv_sec)*1000000000 + end.tv_nsec - start.tv_nsec)/1000000 << "ms" << std::endl; - - for(int64 i = 0; i < BatchSize; i++) { - ValuePtr* check; - hashmap->Lookup(i,&check); - float* tmp = (float *)((char *)check->GetPtr() + sizeof(FixedLengthHeader)); - for(int j = 0;j < EmbeddingSize;++j){ - LOG(INFO) << "batch[" << i << "][" << j << "]=" << tmp[j]; - //ASSERT_EQ(tmp[j], i); - } - }//compare value after BatchCommit -} -*/ - const int total_size = 1024 * 8; const int th_num = 1; const int malloc_size = total_size / th_num; @@ -1466,17 +1093,11 @@ TEST(EmbeddingVariableTest, TestCPUGPUMalloc) { auto mem_pool = new EmbeddingMemoryPool(gpu_allocator, 256, 1024); float* ptr_1 = mem_pool->Allocate(); float* ptr_2 = mem_pool->Allocate(); - ValuePtr* value_ptr1 = new NormalGPUValuePtr(gpu_allocator, 256); - ValuePtr* value_ptr2 = new NormalGPUValuePtr(gpu_allocator, 256); - value_ptr1->SetPtr(ptr_1); - value_ptr2->SetPtr(ptr_2); - value_ptr1->SetInitialized(0); - value_ptr2->SetInitialized(0); - std::vector*> value_ptrs; - value_ptrs.emplace_back(value_ptr1); + std::vector value_ptrs; + value_ptrs.emplace_back(ptr_1); mem_pool->Deallocate(value_ptrs); value_ptrs.clear(); - value_ptrs.emplace_back(value_ptr2); + value_ptrs.emplace_back(ptr_2); mem_pool->Deallocate(value_ptrs); float* ptr_3 = mem_pool->Allocate(); ASSERT_EQ(ptr_1, ptr_3); @@ -1539,16 +1160,16 @@ TEST(EmbeddingVariableTest, TestEVMallocFree) { void SingleCommit(KVInterface* hashmap, std::vector keys, int bias) { - std::vector*> value_ptrs; + std::vector value_ptrs; for (int64 i = 0; i < keys.size(); ++i) { - ValuePtr* tmp = - new NormalContiguousValuePtr(cpu_allocator(), 124); - tmp->SetValue(float(keys[i] + bias), 124); + void* tmp = cpu_allocator()->AllocateRaw(0, 124 * sizeof(float) + 16); + for (int j = 0; j < 124; j++) { + ((float*)tmp)[j] = keys[i] + bias; + } value_ptrs.push_back(tmp); } ASSERT_EQ(keys.size(), value_ptrs.size()); uint64 start = Env::Default()->NowNanos(); - for (int64 i = 0; i < keys.size(); i++) { hashmap->Commit(keys[i], value_ptrs[i]); } @@ -1558,9 +1179,13 @@ void SingleCommit(KVInterface* hashmap, void TestCompaction() { std::string temp_dir = testing::TmpDir(); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, + true, true, {false, 0}); auto hashmap = new SSDHashKV( - temp_dir, cpu_allocator()); - hashmap->SetTotalDims(124); + temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); ASSERT_EQ(hashmap->Size(), 0); std::vector ids; for (int i = 0; i < 262144; i++) { @@ -1576,12 +1201,12 @@ void TestCompaction() { t1.join(); ids.clear(); sleep(1); - ValuePtr* val = nullptr; + void* val = nullptr; for (int i = 131073; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i+3); + ASSERT_EQ(v[j], i+3); } } for (int i = 131073; i < 262144; i++) { @@ -1596,16 +1221,16 @@ void TestCompaction() { sleep(1); for (int i = 0; i < 131073; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i + 1); + ASSERT_EQ(v[j], i + 1); } } for (int i = 131073; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = 
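The compaction and file-read tests around here share one setup ritual for the reworked SSD store. Condensed below, assuming the DeepRec tree; the template arguments and the trailing 16 bytes of the value allocation are taken from the test code, not from the SSDHashKV implementation itself:

```
// Descriptor first, then the store, then Init().
auto* feat_desc = new embedding::FeatureDescriptor<float>(
    1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH,
    /*record_freq=*/true, /*save_version=*/true, {false, 0});
auto* kv = new SSDHashKV<int64, float>(testing::TmpDir(), feat_desc);
feat_desc->InitSlotInfo(0, /*value_len=*/124, {nullptr, 1});
kv->Init();  // must precede any Commit/Lookup

// Values are now raw buffers: the tests allocate 124 floats plus 16 bytes
// and write floats from offset 0 (hence v[j], no longer v[4+j]).
void* buf = cpu_allocator()->AllocateRaw(0, 124 * sizeof(float) + 16);
for (int j = 0; j < 124; ++j) reinterpret_cast<float*>(buf)[j] = 3.0f;
kv->Commit(/*key=*/42, buf);
void* out = nullptr;
kv->Lookup(42, &out);
```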
(float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i + 2); + ASSERT_EQ(v[j], i + 2); } } delete hashmap; @@ -1622,10 +1247,14 @@ TEST(KVInterfaceTest, TestSSDKVSyncCompaction) { } void TestReadEmbFile() { + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, + true, true, {false, 0}); std::string temp_dir = testing::TmpDir(); auto hashmap = new SSDHashKV( - temp_dir, cpu_allocator()); - hashmap->SetTotalDims(124); + temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); ASSERT_EQ(hashmap->Size(), 0); std::vector ids; for (int i = 0; i < 262145; i++) { @@ -1634,12 +1263,12 @@ void TestReadEmbFile() { SingleCommit(hashmap, ids, 3); sleep(1); ids.clear(); - ValuePtr* val = nullptr; + void* val = nullptr; for (int i = 0; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i+3); + ASSERT_EQ(v[j], i+3); } } delete hashmap; @@ -1666,9 +1295,10 @@ TEST(KVInterfaceTest, TestDirectIoFile) { void InsertKey(EmbeddingVar* variable, int value_size) { float *val = (float *)malloc((value_size+1)*sizeof(float)); for (int64 i = 0; i < 100000000; i++) { - variable->LookupOrCreate(20, val, nullptr); + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(20, &val, &is_filter, false); } - LOG(INFO)<<"Finish Insert"; } void RemoveKey(EmbeddingVar* variable) { @@ -1676,29 +1306,13 @@ void RemoveKey(EmbeddingVar* variable) { sleep(1); variable->storage()->Remove(20); } - LOG(INFO)<<"Remove thread finish"; } TEST(EmbeddingVariableTest, TestLookupRemoveConcurrency) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */2, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - emb_config, - cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1); int thread_num = 5; std::vector insert_threads(thread_num); for (size_t i = 0 ; i < thread_num - 1; i++) { @@ -1714,21 +1328,7 @@ TEST(EmbeddingVariableTest, TestInsertAndGetSnapshot) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - emb_config, - cpu_allocator()); - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1); float* set_value = (float*)malloc(value_size * sizeof(float)); //Insertion for (int i = 0; i < 100; 
i++) { diff --git a/tensorflow/core/kernels/embedding_variable_performance_test.cc b/tensorflow/core/kernels/embedding_variable_performance_test.cc index 9b01e35840b..16f4a894858 100644 --- a/tensorflow/core/kernels/embedding_variable_performance_test.cc +++ b/tensorflow/core/kernels/embedding_variable_performance_test.cc @@ -90,14 +90,21 @@ void GenerateSkewInput(int num_of_ids, float skew_factor, void thread_lookup_or_create( EmbeddingVar* ev, const int64* input_batch, + float* default_value, + int default_value_dim, float** outputs, int value_size, int start, int end) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = start; i < end; i++) { ev->LookupOrCreateKey(input_batch[i], &value_ptr, &is_filter, false); - auto val = ev->flat(value_ptr, input_batch[i]); - memcpy(outputs[i], &val(0), sizeof(float) * value_size); + if (is_filter) { + auto val = ev->flat(value_ptr); + memcpy(outputs[i], &val(0), sizeof(float) * value_size); + } else { + int default_value_index = input_batch[i] % default_value_dim; + memcpy(outputs[i], default_value + default_value_index * value_size, sizeof(float) * value_size); + } } } @@ -138,6 +145,8 @@ double PerfLookupOrCreate( for (int i = 0; i < num_thread; i++) { worker_threads[i] = std::thread(thread_lookup_or_create, ev, input_batches[k].data(), + default_value_matrix.data(), + default_value_dim, outputs.data(), value_size, thread_task_range[i].first, thread_task_range[i].second); @@ -201,11 +210,11 @@ void thread_lookup( const int64* input_batch, float** outputs, int value_size, int start, int end) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = start; i < end; i++) { ev->LookupKey(input_batch[i], &value_ptr); - auto val = ev->flat(value_ptr, input_batch[i]); + auto val = ev->flat(value_ptr); memcpy(outputs[i], &val(0), sizeof(float) * value_size); } } @@ -293,7 +302,7 @@ TEST(EmbeddingVariablePerformanceTest, TestLookup) { } } auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = 0; i < hot_ids_list.size(); i++) { ev->LookupOrCreateKey(hot_ids_list[i], &value_ptr, &is_filter, false); @@ -339,13 +348,13 @@ void PerfSave(Tensor& default_value, value_size, default_value, default_value_dim, 0, steps_to_live, l2_weight_threshold); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; srand((unsigned)time(NULL)); for (int i = 0; i < id_list.size(); i++) { ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); - ev->flat(value_ptr, id_list[i]); + ev->flat(value_ptr); int64 global_step = rand() % 100; ev->UpdateVersion(value_ptr, global_step); } diff --git a/tensorflow/core/kernels/embedding_variable_test.h b/tensorflow/core/kernels/embedding_variable_test.h index d06304fb78a..07c34764fb0 100644 --- a/tensorflow/core/kernels/embedding_variable_test.h +++ b/tensorflow/core/kernels/embedding_variable_test.h @@ -107,35 +107,42 @@ EmbeddingVar* CreateEmbeddingVar( int value_size, Tensor& default_value, int64 default_value_dim, int64 filter_freq = 0, int64 steps_to_live = 0, - float l2_weight_threshold=-1.0) { - std::string layout_type = "light"; - if (filter_freq != 0) { - layout_type = "normal"; - } - - if (steps_to_live != 0) { - if (layout_type == "light") { - layout_type = "normal_contiguous"; - } - } + float l2_weight_threshold=-1.0, + embedding::StorageType storage_type = 
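The reworked lookup path in the performance test above makes the caller responsible for filtered ids: LookupOrCreateKey reports admission through is_filter (apparently meaning "passed the frequency filter"), and only admitted keys own a real row. That per-id decision, isolated from the threading scaffolding; out, default_value, value_size and default_value_dim are caller-owned as in the hunk:

```
void* value_ptr = nullptr;
bool is_filter = false;
ev->LookupOrCreateKey(id, &value_ptr, &is_filter, false);
if (is_filter) {
  // Admitted id: a real embedding row exists behind value_ptr.
  auto val = ev->flat(value_ptr);
  memcpy(out, &val(0), sizeof(float) * value_size);
} else {
  // Filtered id: serve one of the shared default-value rows instead.
  int idx = id % default_value_dim;
  memcpy(out, default_value + idx * value_size,
         sizeof(float) * value_size);
}
```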
embedding::StorageType::DRAM, + std::vector storage_size = {1024*1024*1024, + 1024*1024*1024, + 1024*1024*1024, + 1024*1024*1024}, + bool record_freq = false, + int64 max_element_size = 0, + float false_positive_probability = -1.0, + DataType counter_type = DT_UINT64) { auto embedding_config = EmbeddingConfig( - 0, 0, 1, 0, "emb_var", steps_to_live, - filter_freq, 999999, l2_weight_threshold, layout_type, - 0, -1.0, DT_UINT64, default_value_dim, - 0.0, false, false, false); + 0, 0, 1, 0, "emb_var", steps_to_live, + filter_freq, 999999, l2_weight_threshold, + max_element_size, false_positive_probability, + counter_type, default_value_dim, + 0.0, record_freq, false, false); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), storage_type, + record_freq, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( - embedding::StorageType::DRAM, "", - {1024, 1024, 1024, 1024}, layout_type, + storage_type, "", + storage_size, embedding_config), cpu_allocator(), + feat_desc, "emb_var"); auto ev = new EmbeddingVar( "emb_var", storage, embedding_config, - cpu_allocator()); + cpu_allocator(), + feat_desc); ev->Init(default_value, default_value_dim); return ev; } diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc index 55dd40176a8..2f07e2ef537 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc @@ -774,7 +774,7 @@ class GroupEmbeddingVariableForWardOpTest : public OpsTestBase { embedding_var->Init(value, 1); for (int64 j = 0; j < nnz; ++j) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); typename TTypes::Flat vflat = embedding_var->flat(value_ptr); @@ -958,7 +958,7 @@ class GroupEmbeddingVariableBackWardOpTest : public OpsTestBase { embedding_var->Init(value, 1); for (int64 j = 0; j < nnz; ++j) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); typename TTypes::Flat vflat = embedding_var->flat(value_ptr); diff --git a/tensorflow/core/kernels/incr_save_restore_ops.h b/tensorflow/core/kernels/incr_save_restore_ops.h index 0582697ad16..d84838ae413 100644 --- a/tensorflow/core/kernels/incr_save_restore_ops.h +++ b/tensorflow/core/kernels/incr_save_restore_ops.h @@ -225,9 +225,9 @@ class IncrEVValueDumpIterator : public DumpIterator { keys_idx_++; col_idx_ = 0; } - ValuePtr* value_ptr = NULL; + void* value_ptr = NULL; TF_CHECK_OK(emb_var_->LookupOrCreateKey(*keys_iter_, &value_ptr)); - return emb_var_->flat(value_ptr, *keys_iter_)(col_idx_++); + return emb_var_->flat(value_ptr)(col_idx_++); } private: diff --git a/tensorflow/core/kernels/kv_variable_lookup_ops.cc b/tensorflow/core/kernels/kv_variable_lookup_ops.cc index c69aec8ebb9..7e40dfff7ac 100644 --- a/tensorflow/core/kernels/kv_variable_lookup_ops.cc +++ b/tensorflow/core/kernels/kv_variable_lookup_ops.cc @@ -121,7 +121,7 @@ class KvResourceLookupIDOp : public OpKernel { const int64 indices_size = static_cast(indices_flat.dimension(0)); EmbeddingVarContext ev_ctx(c); ev->GetOrCreateKey(ev_ctx, indices, - reinterpret_cast**>(out_base), + reinterpret_cast(out_base), indices_size); } } @@ -203,7 +203,7 @@ class 
KvResourceCollectEmbeddingOp : public OpKernel { const size_t slice_bytes = slice_elems * sizeof(TValue); EmbeddingVarContext ev_ctx(c); ev->GatherEmbeddings(ev_ctx, indices, - (ValuePtr**)pointer.data(), + (void**)pointer.data(), out_base, N); } } diff --git a/tensorflow/core/kernels/kv_variable_ops.cc b/tensorflow/core/kernels/kv_variable_ops.cc index 8a01a7bf2cd..5cd0ef140bd 100644 --- a/tensorflow/core/kernels/kv_variable_ops.cc +++ b/tensorflow/core/kernels/kv_variable_ops.cc @@ -214,16 +214,16 @@ class InitializeKvVariableOp : public OpKernel { int64 storage_type = 0; OP_REQUIRES_OK(c, c->GetAttr("storage_type", &storage_type)); storage_type_ = static_cast(storage_type); - auto device_type_str = c->device_type().type_string(); + device_type_str_ = c->device_type().type_string(); if (storage_type_ == embedding::DEFAULT) { - if (device_type_str == "CPU") { + if (device_type_str_ == "CPU") { storage_type_ = embedding::DRAM; } else { storage_type_ = embedding::HBM; } } - bool if_op_on_gpu = (device_type_str == "GPU"); + bool if_op_on_gpu = (device_type_str_ == "GPU"); bool if_embedding_on_hbm = (storage_type_ == embedding::HBM || storage_type_ == embedding::HBM_DRAM || storage_type_ == embedding::HBM_DRAM_SSDHASH); @@ -238,57 +238,14 @@ class InitializeKvVariableOp : public OpKernel { filter_freq_ = 0; } - OP_REQUIRES_OK(c, c->GetAttr("layout", &layout_)); - if (!layout_.empty()) { - // use layout by user configuration - } else if ((filter_freq_ != 0 && max_element_size_ == 0) - || steps_to_live_ != 0 || record_freq_ - || record_version_ || storage_type > 5) { - if (block_num_ > 1 || (filter_freq_ != 0 && storage_type <= 5)) { - layout_ = "normal"; - } else { - if (storage_type == embedding::HBM_DRAM || - storage_type == embedding::HBM_DRAM_SSDHASH) { - layout_ = "normal_contiguous_gpu"; - } else { - layout_ = "normal_contiguous"; - } - } - } else { - layout_ = "light"; - } - - CHECK(block_num_ == 1 || layout_ != "normal_contiguous"); - - if ("compact" == layout_) { - OP_REQUIRES(c, shape_.dim_size(0) == 1 && - storage_type_ == embedding::StorageType::DRAM, - errors::InvalidArgument("embedding_dim must be 1 and storage type" - " should be DRAM when layout is 'compact'.")); - } + record_freq_ |= (storage_type > 5); + record_version_ |= (storage_type > 5); OP_REQUIRES(c, steps_to_live_ >= 0, errors::InvalidArgument( "steps_to_live must >= 0, ", std::to_string(steps_to_live_))); OP_REQUIRES_OK(c, c->GetAttr("ht_type", &ht_type_)); - if (embedding::StorageType::LEVELDB == storage_type_) { - ht_type_ = "leveldb_kv"; - if (layout_ != "normal_contiguous") - LOG(WARNING) - << "layout must be NORAML_CONTIGUOUS when storage type is LEVELDB"; - layout_ = "normal_contiguous"; - } - - if (embedding::StorageType::PMEM_LIBPMEM == storage_type_ || - embedding::StorageType::PMEM_MEMKIND == storage_type_){ - if (layout_ != "normal_contiguous"){ - LOG(WARNING) - << "layout must be NORAML_CONTIGUOUS" - << " when storage type is PMEM_LIBPMEM or PMEM_MEMKIND"; - } - layout_ = "normal_contiguous"; - } OP_REQUIRES_OK(c, c->GetAttr("ht_partition_num", &ht_partition_num_)); } @@ -314,35 +271,43 @@ class InitializeKvVariableOp : public OpKernel { context, handle_self, &ev, [this, default_values, opname, context, handle_self](EmbeddingVar** ptr) { - Allocator* gpu_allocator = + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); auto embedding_config = EmbeddingConfig( emb_index_ + block_num_ * slot_index_, emb_index_, block_num_, slot_num_, opname + "-primary", steps_to_live_, 
filter_freq_, max_freq_, - l2_weight_threshold_, layout_, + l2_weight_threshold_, max_element_size_, false_positive_probability_, counter_type_, default_value_dim_, default_value_no_permission_, record_freq_, record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - gpu_allocator, + alloc_for_ev, + feat_desc, handle_self.name()); *ptr = new EmbeddingVar( handle_self.name(), storage, embedding_config, - gpu_allocator); - return Status::OK(); - })); - ev->Init(default_values, default_value_dim_); + alloc_for_ev, + feat_desc); + return (*ptr)->Init(default_values, default_value_dim_); + })); } else { EmbeddingVar* primary_variable = nullptr; OP_REQUIRES_OK( @@ -352,30 +317,38 @@ class InitializeKvVariableOp : public OpKernel { [this, default_values, opname, handle_primary, context](EmbeddingVar** ptr) { int64 primary_slot_index(0), primary_emb_index(0); - Allocator* gpu_allocator = context->device()->GetAllocator(AllocatorAttributes()); - //Allocator* gpu_allocator = context->get_allocator(AllocatorAttributes()); + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); auto embedding_config = EmbeddingConfig( primary_emb_index + block_num_ * primary_slot_index, primary_emb_index, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, + l2_weight_threshold_, max_element_size_, false_positive_probability_, counter_type_, 0, record_freq_, record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - gpu_allocator, + alloc_for_ev, + feat_desc, handle_primary.name()); *ptr = new EmbeddingVar( handle_primary.name(), storage, embedding_config, - gpu_allocator); + alloc_for_ev, + feat_desc); // default_values is slot value, should not to initialize primary value return Status::OK(); })); @@ -386,20 +359,26 @@ class InitializeKvVariableOp : public OpKernel { context, handle_self, &ev, [this, default_values, opname, primary_variable, handle_self, context](EmbeddingVar** ptr) { + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, + emb_index_, + block_num_, slot_num_, opname, + steps_to_live_, filter_freq_, + max_freq_, l2_weight_threshold_, + max_element_size_, + false_positive_probability_, + counter_type_, default_value_dim_, + default_value_no_permission_, + record_freq_, record_version_, + is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; *ptr = new EmbeddingVar(handle_self.name(), primary_variable->storage(), - EmbeddingConfig(emb_index_ + block_num_ * slot_index_, - emb_index_, - block_num_, slot_num_, opname, - steps_to_live_, filter_freq_, - max_freq_, l2_weight_threshold_, - layout_, max_element_size_, - false_positive_probability_, - counter_type_, default_value_dim_, - default_value_no_permission_, - record_freq_, record_version_, - is_inference_), - primary_variable->GetAllocator()); + embedding_config, + alloc_for_ev, + primary_variable->feature_descriptor()); return (*ptr)->Init(default_values, default_value_dim_); })); core::ScopedUnref unref_me(primary_variable); @@ -424,7 +403,6 @@ class InitializeKvVariableOp : public OpKernel { int64 filter_freq_; int64 max_freq_; float l2_weight_threshold_; - std::string layout_; int64 max_element_size_; float false_positive_probability_; embedding::StorageType storage_type_; @@ -436,6 +414,7 @@ class InitializeKvVariableOp : public OpKernel { bool record_version_; bool is_inference_; bool is_set_initialized_; + std::string device_type_str_; }; #define REGISTER_KERNELS(ktype, vtype) \ diff --git a/tensorflow/core/kernels/kv_variable_ops.h b/tensorflow/core/kernels/kv_variable_ops.h index 8e3572443ba..3202e6d12bf 100644 --- a/tensorflow/core/kernels/kv_variable_ops.h +++ b/tensorflow/core/kernels/kv_variable_ops.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/embedding/cache_factory.h" #include "tensorflow/core/framework/embedding/embedding_var.h" #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index 23a504eea5d..3b10c2521b9 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -120,20 +120,6 @@ class KvResourceImportV2Op: public AsyncOpKernel { OP_REQUIRES_OK(c, c->GetAttr("record_version", &record_version_)); OP_REQUIRES_OK(c, c->GetAttr("reset_version", &reset_version_)); - if ((filter_freq_ != 0 && max_element_size_ == 0) - || steps_to_live_ != -1 || record_freq_ - || record_version_ || storage_type > 5) { - if (block_num_ > 1 || (filter_freq_ != 0 && storage_type <= 5)) { - layout_ = "normal"; - } else { - layout_ = "normal_contiguous"; - } - } else { - layout_ = "light"; - } - - CHECK(block_num_ == 1 || layout_ != "normal_contiguous"); - TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_EV_ASYNC_RESTORE", true, &ev_async_restore_)); } @@ -170,24 +156,33 @@ class KvResourceImportV2Op: public AsyncOpKernel { block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, l2_weight_threshold_, - layout_, max_element_size_, + max_element_size_, false_positive_probability_, counter_type_, default_value_dim_, default_value_no_permission_, record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - allocator, + alloc_for_ev, + feat_desc, handle_self.name()); *ptr = new EmbeddingVar( handle_self.name(), storage, embedding_config, - allocator); + alloc_for_ev, + feat_desc); return Status::OK(); })); ev->Init(default_values, default_value_dim_); @@ -207,19 +202,27 @@ class KvResourceImportV2Op: public AsyncOpKernel { primary_emb_index, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, l2_weight_threshold_, - layout_, max_element_size_, + max_element_size_, false_positive_probability_, counter_type_, 0, record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - allocator, + alloc_for_ev, + feat_desc, handle_primary.name()); *ptr = new EmbeddingVar(handle_primary.name(), - storage, embedding_config, allocator); + storage, embedding_config, alloc_for_ev, feat_desc); // default_values is slot value, should not to initialize primary value return Status::OK(); })); @@ -232,17 +235,22 @@ class KvResourceImportV2Op: public AsyncOpKernel { handle_self, context](EmbeddingVar** ptr) { Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, + emb_index_, block_num_, slot_num_, opname, + steps_to_live_, filter_freq_, max_freq_, + l2_weight_threshold_, max_element_size_, + false_positive_probability_, + counter_type_, default_value_dim_, + default_value_no_permission_, + record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; *ptr = new EmbeddingVar(handle_self.name(), primary_variable->storage(), - EmbeddingConfig(emb_index_ + block_num_ * slot_index_, - emb_index_, block_num_, slot_num_, opname, - steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, max_element_size_, - false_positive_probability_, - counter_type_, default_value_dim_, - default_value_no_permission_, - record_freq_, record_version_), - allocator); + embedding_config, + alloc_for_ev, + primary_variable->feature_descriptor()); return (*ptr)->Init(default_values, default_value_dim_); })); core::ScopedUnref unref_me(primary_variable); @@ -290,7 +298,6 @@ class KvResourceImportV2Op: public AsyncOpKernel { int64 slot_num_; int64 filter_freq_; float l2_weight_threshold_; - std::string layout_; int64 max_freq_; embedding::StorageType storage_type_; std::string storage_path_; @@ -301,6 +308,7 @@ class KvResourceImportV2Op: public AsyncOpKernel { bool record_version_; bool reset_version_; bool ev_async_restore_; + std::string device_type_str_; }; #define REGISTER_KERNELS(dev, ktype, vtype) \ diff --git a/tensorflow/core/kernels/save_restore_tensor.h b/tensorflow/core/kernels/save_restore_tensor.h index 4f69ebe3fb5..da58e17e1bb 100644 --- a/tensorflow/core/kernels/save_restore_tensor.h +++ b/tensorflow/core/kernels/save_restore_tensor.h @@ -23,7 +23,6 @@ limitations under the License. #include "tensorflow/core/framework/hash_table/hash_table.h" #include "tensorflow/core/framework/hash_table/bloom_filter_strategy.h" #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/training_ali_op_helpers.h b/tensorflow/core/kernels/training_ali_op_helpers.h index e013a6a2bae..12948de24a4 100644 --- a/tensorflow/core/kernels/training_ali_op_helpers.h +++ b/tensorflow/core/kernels/training_ali_op_helpers.h @@ -121,55 +121,54 @@ EmbeddingVariableInputLockHolder MaybeLockEmbeddingVariableInputMutexesInO template void LookupKeyAndSetVersion( OpKernelContext* ctx, EmbeddingVar* var, - ValuePtr** value_ptrs, Tstep gs, const K* indices, + void** value_ptrs, Tstep gs, const K* indices, int64 task_size, bool indices_as_pointer, int counts_index) { + EmbeddingVarContext ev_ctx(ctx); int64* indices_counts = nullptr; std::function get_count_fn = 0; if (counts_index != -1) { const Tensor& counts_tensor = ctx->input(counts_index); indices_counts = (int64*)counts_tensor.data(); - get_count_fn = [](int64* counts, int64 index) { - return counts[index];}; - } else { - get_count_fn = [](int64* counts, int64 index) {return 1;}; } + var->LookupOrCreateKey(ev_ctx, indices, value_ptrs, + task_size, indices_counts, + indices_as_pointer); - auto lookup_key_and_set_version_fn = [var, value_ptrs, gs, - indices, indices_as_pointer, - indices_counts, get_count_fn] (int64 start, int64 limit) { - ValuePtr* value_ptr = nullptr; + auto update_version_fn = [var, value_ptrs, gs] + (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - bool is_filter = false; - int64 count = get_count_fn(indices_counts, i); - var->LookupOrCreateKey(indices[i], &value_ptr, - &is_filter, indices_as_pointer, count); - value_ptrs[i] = value_ptr; - var->UpdateVersion(value_ptr, gs); + var->UpdateVersion(value_ptrs[i], gs); } }; const int64 unit_cost = 1000; //very unreliable estimate for cost per step. 
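  // Keys for this batch were already looked up (or created) in the single
  // batched LookupOrCreateKey call above; the sharded closure below only
  // applies the remaining per-key UpdateVersion work.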
auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); Shard(worker_threads->num_threads, worker_threads->workers, task_size, unit_cost, - lookup_key_and_set_version_fn); + update_version_fn); } template -void LookupOrCreateEmbedding( +void LookupEmbedding( OpKernelContext* ctx, std::vector*, V**>>& vars, - ValuePtr** value_ptrs, + void** value_ptrs, const K* indices, - int64 num_of_keys, - IntraThreadCopyIdAllocator* thread_copy_id_alloc) { + int64 num_of_keys) { for (auto it: vars) { EmbeddingVar* var = it.first; V** var_ptr = it.second; - EmbeddingVarContext ev_ctx(ctx); - var->BatchLookupOrCreateEmb( - ev_ctx, var_ptr, value_ptrs, - indices, num_of_keys, thread_copy_id_alloc); + auto lookup_emb_fn = [var, var_ptr, value_ptrs] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + var_ptr[i] = var->GetValuePtr(value_ptrs[i]); + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + lookup_emb_fn); } } @@ -180,12 +179,12 @@ void GetEmbeddingPointers( const K* indices, Tstep gs, bool indices_as_pointer, int counts_index, int64 num_of_keys, IntraThreadCopyIdAllocator* thread_copy_id_alloc) { - std::vector*> value_ptrs(num_of_keys); + std::vector value_ptrs(num_of_keys); LookupKeyAndSetVersion(ctx, vars[0].first, value_ptrs.data(), gs, indices, num_of_keys, indices_as_pointer, counts_index); - LookupOrCreateEmbedding(ctx, vars, value_ptrs.data(), - indices, num_of_keys, thread_copy_id_alloc); + LookupEmbedding(ctx, vars, value_ptrs.data(), + indices, num_of_keys); } } // end namespace tensorflow diff --git a/tensorflow/core/kernels/training_ali_ops.cc b/tensorflow/core/kernels/training_ali_ops.cc index 839ce82feef..546b30e29dd 100644 --- a/tensorflow/core/kernels/training_ali_ops.cc +++ b/tensorflow/core/kernels/training_ali_ops.cc @@ -141,16 +141,16 @@ class KvSparseApplyAdagradOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const TKey index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto a = accum->flat(value_ptr, index); + auto a = accum->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); a += g.square(); v -= g.constant(lr_scalar) * g * a.rsqrt(); } @@ -542,15 +542,15 @@ class KvSparseApplyFtrlOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const TKey index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var_->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); if (is_filter) { - auto var = var_->flat(value_ptr, index); - auto accum = accum_->flat(value_ptr, index); - auto linear = linear_->flat(value_ptr, index); + auto var = var_->flat(value_ptr); + auto accum = accum_->flat(value_ptr); + auto linear = linear_->flat(value_ptr); auto grad = grad_flat.template chip<0>(i); // Use a macro to implement the computation here due to the templating of the @@ -1301,19 +1301,19 @@ class 
KvSparseApplyAdagradDecayOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto a = accum->flat(value_ptr, index); + auto a = accum->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); - auto accum_decay_power = accum_decay_power_var->flat(value_ptr, index); + auto v = var->flat(value_ptr); + auto accum_decay_power = accum_decay_power_var->flat(value_ptr); if (gs / decay_step_scalar > accum_decay_power(0)) { a *= a.constant(decay_rate_scalar); @@ -1505,19 +1505,18 @@ class KvSparseApplyAdamOp : public OpKernel { auto indices_vec = indices.vec(); int64 gs = global_step.scalar()(); - for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter =false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto var_i = var->flat(value_ptr, index); - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); m_a += (g - m_a) * (static_cast(1) - beta1_scalar); @@ -2412,15 +2411,15 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { Tstep gs = global_step.scalar()(); for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto v_ = v->flat(value_ptr, index); - auto m_ = m->flat(value_ptr, index); + auto v_ = v->flat(value_ptr); + auto m_ = m->flat(value_ptr); auto grad_ = grad_flat.template chip<0>(i); v_ = v_ * v_.constant(beta2_scalar) + @@ -2429,7 +2428,7 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { (v_ + v_.constant(epsilon_scalar)).rsqrt() * v_.constant(lr_scalar) * grad_; - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); v -= m_; } } @@ -2461,17 +2460,17 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto var_i = var->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); m_a = m_a * beta1_scalar + g * (static_cast(1) - beta1_scalar); v_a = v_a * beta2_scalar + g.square() * (static_cast(1) - beta2_scalar); @@ -2939,7 +2938,7 @@ class KvResourceSparseApplyGradientDescentOp : 
public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, @@ -2947,7 +2946,7 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { var->UpdateVersion(value_ptr, gs); if (is_filter) { auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); v -= g.constant(lr_scalar) * g; } } @@ -3136,16 +3135,16 @@ class KvSparseApplyAdamWOp : public OpKernel { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter =false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto var_i = var->flat(value_ptr, index); - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); // m_a = beta1 * m + (1 - beta1) * g m_a += (g - m_a) * (static_cast(1) - beta1_scalar); diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 2a56634206c..e89b095aff1 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -6132,6 +6132,8 @@ class GraphKeys(object): TRAINABLE_VARIABLES = "trainable_variables" # Indicate EmbeddingVariable in CollectionDef EMBEDDING_VARIABLES = "embedding_variables" + # Collection for dependencies of EmbeddingVariable's restore op + EMBEDDING_VARIABLE_RESTORE_DEPENDENCY = "embedding_variable_restore_dependency" # Key to collect summaries. SUMMARIES = "summaries" # Key to collect QueueRunners. 
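The EMBEDDING_VARIABLE_RESTORE_DEPENDENCY key added to GraphKeys above backs a single shared dict that maps each primary EmbeddingVariable handle to the restore-time init ops of every variable sharing that primary; the kv_variable_ops.py and saveable_object_util.py hunks further below populate and consume it. The following is a minimal sketch of that grouping pattern, where register_restore_dependency, restore_control_inputs, primary_handle and init_op are illustrative names standing in for the real handles and ops:

    from tensorflow.python.framework import ops

    RESTORE_DEP_KEY = "embedding_variable_restore_dependency"

    def register_restore_dependency(primary_handle, init_op):
      # The collection carries exactly one shared dict; create it lazily.
      deps = ops.get_collection(RESTORE_DEP_KEY)
      if not deps:
        ops.add_to_collection(RESTORE_DEP_KEY, {})
        deps = ops.get_collection(RESTORE_DEP_KEY)
      # Group this variable's restore init op under its primary handle.
      deps[0].setdefault(primary_handle, []).append(init_op)

    def restore_control_inputs(primary_handle):
      # Restore ops then wait on every init op registered for the primary.
      return ops.get_collection(RESTORE_DEP_KEY)[0][primary_handle]

With this grouping, a slot variable's restore depends on the whole group registered under its primary rather than only on its own _init_op_for_restore, which is what the saveable_object_util.py change switches control_dependencies to.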
diff --git a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py index 240938e8675..d47d94d0d99 100644 --- a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py @@ -47,69 +47,6 @@ class EmbeddingVariableGpuTest(test_util.TensorFlowTestCase): - def testDynamicDimensionEmbeddingVariable(self): - print("testDynamicDimensionEmbeddingVariable") - with ops.device('/gpu:0'): - def runTestAdagrad(self, var, g): - if isinstance(var, kv_variable_ops.EmbeddingVariable): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - else: - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64), blocknums=[2,2,2,2,2,2]) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad.AdagradOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session(graph=g) as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.device('/gpu:0'), ops.Graph().as_default() as g: - emb_var = variable_scope.get_embedding_variable("var_1", - initializer=init_ops.ones_initializer(dtypes.float32), - embedding_dim = 8, - ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.HBM)), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - emb1 = runTestAdagrad(self, emb_var, g) - with ops.device('/gpu:0'), ops.Graph().as_default() as g: - var = variable_scope.get_dynamic_dimension_embedding_variable("var_dist", - embedding_block_dimension=4, - embedding_block_num=2, - storage_type=config_pb2.StorageType.HBM, - initializer=init_ops.ones_initializer(dtypes.float32)) - emb2 = runTestAdagrad(self, var, g) - for i in range(0, 6): - for j in range(0, 8): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - def testDynamicEmbeddingVariableForInitFromProto(self): - print("testDynamicEmbeddingVariableForInitFromProto") - with ops.device('/gpu:0'): - embedding = variable_scope.get_dynamic_dimension_embedding_variable("var_dist", - embedding_block_dimension=4, - embedding_block_num=2, - storage_type=config_pb2.StorageType.HBM, - initializer=init_ops.ones_initializer(dtypes.float32)) - emb = embedding_ops.embedding_lookup(embedding, math_ops.cast([0,1,2,5,6,7], dtypes.int64), blocknums=[2,2,2,2,2,2]) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - graph = ops.get_default_graph() - meta_graph_def = saver_module.export_meta_graph() - ops.reset_default_graph() - with self.test_session() as sess: - res = saver_module.import_meta_graph(meta_graph_def) - def testEmbeddingVariableForInitFromProto(self): print("testEmbeddingVariableForInitFromProto") with ops.device('/gpu:0'): @@ -235,43 +172,6 @@ def 
testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): print(sess.run([emb, train_op,loss])) print(sess.run([emb, train_op,loss])) - def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): - print("testEmbeddingVariableForFeatureFilterFromContribFeatureColumn") - columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) - with ops.device("/gpu:0"): - W = feature_column.embedding_column(sparse_id_column=columns, - dimension=3, - initializer=init_ops.ones_initializer(dtypes.float32)) - ids={} - ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) - emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) - - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - - opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - emb1, top, l = sess.run([emb, train_op, loss]) - for val1 in emb1.tolist(): - for val in val1: - self.assertEqual(val, .0) - emb1, top, l = sess.run([emb, train_op, loss]) - for index, val1 in enumerate(emb1.tolist()): - if index < 7: - for val in val1: - self.assertNotEqual(val, 1.0) - else: - for val in val1: - self.assertEqual(val, .0) - def testEmbeddingVariableForSparseColumnEmbeddingCol(self): columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.HBM))) @@ -870,6 +770,66 @@ def testSaveV3(self): result = sess.run([emb1]) print(result) + def testEmbeddingVariableSaveAndRestoreOptimzierStatesForMultiTierWithHbm(self): + print("testEmbeddingVariableSaveAndRestoreOptimzierStatesForMultiTierWithHbm") + checkpoint_directory = self.get_temp_dir() + with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + fun = math_ops.multiply(emb, 1.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver(sharded=True) + init = variables.global_variables_initializer() + graph = ops.get_default_graph() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + sess.run(train_op) + emb_ori = sess.run(emb) + save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) + + with ops.Graph().as_default() as g, ops.device('/gpu:0'): 
+ var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + fun = math_ops.multiply(emb, 1.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver() + graph = ops.get_default_graph() + with self.test_session(graph = graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) + emb_val = sess.run(emb) + self.assertAllEqual(emb_ori, emb_val) + save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) + for name, shape in checkpoint_utils.list_variables(checkpoint_directory): + if "Adagrad-values" in name: + value = checkpoint_utils.load_variable(checkpoint_directory, name) + for i in range(0, shape[0]): + for j in range(0, shape[1]): + self.assertAlmostEqual(1.1, value[i][j]) + def testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm(self): print("testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm") checkpoint_directory = self.get_temp_dir() @@ -894,8 +854,8 @@ def testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm(self): emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 0.0, name='multiply') - fun1 = math_ops.multiply(emb2, 0.0, name='multiply_1') + fun = math_ops.multiply(emb, 1.0, name='multiply') + fun1 = math_ops.multiply(emb2, 1.0, name='multiply_1') loss = math_ops.reduce_sum(fun + fun1, name='reduce_sum') gs = training_util.get_or_create_global_step() opt = adagrad.AdagradOptimizer(0.1) diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index c6cdf951a1e..81b315e2e43 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -120,7 +120,7 @@ def _CounterFilterTestTemplate(self, optimizer): initializer=init_ops.ones_initializer(dtypes.float32), ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) - emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1], dtypes.int64)) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64)) fun = math_ops.multiply(emb, 2.0, name='multiply') loss = math_ops.reduce_sum(fun, name='reduce_sum') gs = training_util.get_or_create_global_step() @@ -133,11 +133,18 @@ def _CounterFilterTestTemplate(self, optimizer): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertEqual(val, .0) + + for val1 in emb1.tolist(): + for val in val1: + self.assertEqual(val, .0) emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertNotEqual(val, 1.0) + for index, val1 in enumerate(emb1.tolist()): + if index < 7: + for val in val1: + self.assertNotEqual(val, 1.0) + else: + for val in val1: + self.assertEqual(val, .0) def _RecordFreqTestTemplate(self, optimizer): checkpoint_directory = self.get_temp_dir() @@ -720,20 +727,11 @@ 
def testEmbeddingVariableForL2FeatureEviction(self): sess.run([init]) emb_ori = sess.run([emb, train_op]) save_path = saver.save(sess, os.path.join(checkpoint_directory, "model1.ckpt"), global_step=12345) - #for name, shape in checkpoint_utils.list_variables(checkpoint_directory): - # print('loading... ', name, shape) - with self.test_session() as sess: - saver.restore(sess, os.path.join(checkpoint_directory, "model1.ckpt-12345")) - emb_right = [[0.8282884, 0.8282884, 0.8282884], - [0.8282884, 0.8282884, 0.8282884], - [0.8282884, 0.8282884, 0.8282884], - [0.7927219, 0.7927219, 0.7927219], - [0.7927219, 0.7927219, 0.7927219], - [1.0, 1.0, 1.0]] - emb_ori = sess.run(emb) - for i in range(6): - for j in range(3): - self.assertAlmostEqual(emb_ori[i][j], emb_right[i][j]) + for name, shape in checkpoint_utils.list_variables(checkpoint_directory): + if name == "var_1-keys": + self.assertEqual(shape[0], 2) + keys = checkpoint_utils.load_variable(checkpoint_directory, name) + self.assertAllEqual(keys, [0, 1]) def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): columns_list=[] @@ -764,14 +762,15 @@ def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): print("testEmbeddingVariableForFeatureFilterFromContribFeatureColumn") - columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) - W = feature_column.embedding_column(sparse_id_column=columns, - dimension=3, - initializer=init_ops.ones_initializer(dtypes.float32)) - ids={} - ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) - emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) + with ops.device("/cpu:0"): + columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) + W = feature_column.embedding_column(sparse_id_column=columns, + dimension=3, + initializer=init_ops.ones_initializer(dtypes.float32)) + ids={} + ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) + emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) fun = math_ops.multiply(emb, 2.0, name='multiply') loss = math_ops.reduce_sum(fun, name='reduce_sum') @@ -786,6 +785,7 @@ def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) emb1, top, l = sess.run([emb, train_op, loss]) + for val1 in emb1.tolist(): for val in val1: self.assertEqual(val, .0) @@ -1328,66 +1328,6 @@ def testEmbeddingVariableForHTPartitionNum(self): print(sess.run([emb, train_op,loss])) print(sess.run([emb, train_op,loss])) - def testEmbeddingVariableForLayout(self): - print("testEmbeddingVariableForLayout") - def runTestAdagrad(self, var, g): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - 
opt = adagrad.AdagradOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session(graph=g) as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdagrad(self, emb_var, g) - emb2 = runTestAdagrad(self, var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1), - steps_to_live=5) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdagrad(self, emb_var, g) - emb2 = runTestAdagrad(self, var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1), - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=5))) - emb1 = runTestAdagrad(self, emb_var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], .0) - def testEVInitializerWithKeyFetch(self): print("testEVInitializerWithKeyFetch") with ops.Graph().as_default() as g, ops.device('/cpu:0'): @@ -2391,7 +2331,7 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self): "model1.ckpt") with self.test_session() as sess: sess.run([init]) - sess.run([emb, train_op]) + sess.run([train_op]) save_path = saver.save(sess, model_path) for name, shape in checkpoint_utils.list_variables(model_path): if name == "var_1-keys": @@ -2403,6 +2343,37 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self): name == "var_1-freqs_filtered": self.assertEqual(0, shape[0]) del os.environ["TF_EV_SAVE_FILTERED_FEATURES"] + + def testEmbeddingVariableForSaveUnfilterFeature(self): + checkpoint_directory = self.get_temp_dir() + with ops.device("/cpu:0"): + emb_var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) + emb = embedding_ops.embedding_lookup(emb_var, math_ops.cast([1, 1, 1, 2, 2, 3], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = 
adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + model_path = os.path.join(checkpoint_directory, + "model1.ckpt") + with self.test_session() as sess: + sess.run([init]) + sess.run([train_op]) + save_path = saver.save(sess, model_path) + for name, shape in checkpoint_utils.list_variables(model_path): + if name == "var_1-keys": + keys = checkpoint_utils.load_variable(model_path, name) + self.assertEqual(1, len(keys)) + self.assertEqual(1, keys[0]) + if name == "var_1-keys_filtered" or \ + name == "var_1-freqs_filtered": + self.assertEqual(2, shape[0]) def testEmbeddingVariableForMultiTierInference(self): print("testEmbeddingVariableForMultiTierInference") @@ -2716,7 +2687,55 @@ def testCPUFbjOpt(self): def testCPUFbjOptWithCounterFilter(self): print("testCPUFbjOpt") os.environ["TF_EMBEDDING_FBJ_OPT"] = "True" - self._CounterFilterTestTemplate("Adagrad") + with ops.device("/cpu:0"): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = self._CreateOptimizer("Adagrad") + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + emb1, top, l = sess.run([emb, train_op, loss]) + emb_list = emb1.tolist() + emb_right = [[.0, .0, .0], + [.0, .0, .0], + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + [.0, .0, .0], + [.0, .0, .0], + [1.0, 1.0, 1.0], + [.0, .0, .0], + [.0, .0, .0], + [.0, .0, .0]] + + for i in range(6): + for j in range(3): + self.assertAlmostEqual(emb_list[i][j], emb_right[i][j]) + + emb1= sess.run(emb) + emb_right = [[0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90122706, 0.90122706, 0.90122706], + [0.90122706, 0.90122706, 0.90122706], + [0.90122706, 0.90122706, 0.90122706], + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + [.0, .0, .0]] + for i in range(6): + for j in range(3): + self.assertAlmostEqual(emb1[i][j], emb_right[i][j]) del os.environ["TF_EMBEDDING_FBJ_OPT"] def testCPUFbjOptWithBloomFilter(self): diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 96329ca345b..1ef9550ef6d 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -373,6 +373,8 @@ def _init_from_args(self, self._slot_num = 0 else: self._slot_num = evconfig.slot_num + if self._is_primary: + self._import_dependency_ops = [] with ops.name_scope("IsInitialized"): self._is_initialized_op = ( gen_kv_variable_ops.kv_var_is_initialized_op(self._handle, @@ -488,6 +490,7 @@ def create_init_op_for_restore(self, name, initial_value, invalid_key, rank): set_attr_ops.append(set_cache_op) with ops.control_dependencies(set_attr_ops + 
[self._initializer_for_restore]): self._init_op_for_restore = control_flow_ops.no_op() + self.collect_restore_denpendencies() def need_counts(self): return (self._record_freq or (self._filter_freq > 0) or self._is_multi_tier) @@ -612,8 +615,19 @@ def _init_from_proto(self, variable_def, import_scope=None): else: self._is_primary = False + self.collect_restore_denpendencies() # LINT.ThenChange(//tensorflow/python/eager/graph_callable.py) + def collect_restore_denpendencies(self): + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY) + if len(restore_dependency) == 0: + ops.add_to_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY, {}) + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY) + dependency_dict = restore_dependency[0] + if not dependency_dict.__contains__(self._primary_handle): + dependency_dict[self._primary_handle] = [] + dependency_dict[self._primary_handle].append(self._init_op_for_restore) + def set_init_data_source_initializer(self, init_data_source): import pkgutil try: diff --git a/tensorflow/python/training/saving/saveable_object_util.py b/tensorflow/python/training/saving/saveable_object_util.py index 0d8bfe87022..650b1a5e272 100644 --- a/tensorflow/python/training/saving/saveable_object_util.py +++ b/tensorflow/python/training/saving/saveable_object_util.py @@ -195,7 +195,8 @@ def restore(self, restored_tensors, unused_restored_shapes): if self.var._init_data_source is not None: return self.var.recover_from_init_data_source(self.var._init_data_source, self.partition_id, self.partition_num) else: - with ops.control_dependencies([self.var._init_op_for_restore]): + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY)[0] + with ops.control_dependencies(restore_dependency[self.var._primary_handle]): rank = self.op.initial_value.get_shape().rank - 1 restore_op = gen_kv_variable_ops.kv_resource_import_v3( restored_tensors[0], From be62ec312595b51b74260f96a6c0872ce5f1540c Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Wed, 18 Oct 2023 10:11:16 +0800 Subject: [PATCH 55/91] [Graph] Fix hang bug for async embedding lookup. (#934) Skip edges to 'SaveV3' Op. 
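The fix has two parts in async_embedding_stage.py: 'SaveV3' joins 'SaveV2' in the saver-op skip list so that edges into saver ops are never treated as stage boundaries, and the stage pass becomes idempotent so running it again on an already-staged graph is a no-op. Below is a minimal sketch of the guard, assuming _insert_stage_nodes stands in for the real graph rewrite:

    class AsyncEmbeddingStage(object):
      def __init__(self):
        # Edges into saver ops must be skipped when staging the graph.
        self._saver_ops = ['SaveV2', 'SaveV3']
        self._is_staged = False

      def stage(self, graph):
        # Staging the same graph twice would insert duplicate stage
        # nodes, so second and later calls return immediately.
        if self._is_staged:
          return
        self._is_staged = True
        self._insert_stage_nodes(graph)  # real rewrite elided in this sketch

monitored_session.py applies the same idea one level up by caching the AsyncEmbeddingStage instance instead of constructing a fresh one on every call.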
Signed-off-by: chenbangduo.cbd --- tensorflow/python/training/async_embedding_stage.py | 7 ++++++- tensorflow/python/training/monitored_session.py | 10 ++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/training/async_embedding_stage.py b/tensorflow/python/training/async_embedding_stage.py index 32433387c1c..858025bdab7 100644 --- a/tensorflow/python/training/async_embedding_stage.py +++ b/tensorflow/python/training/async_embedding_stage.py @@ -49,13 +49,14 @@ def __init__(self, options, checkpoint_dir = None): self._checkpoint_dir = checkpoint_dir if checkpoint_dir else "" self._use_stage_subgraph_thread_pool = options.use_stage_subgraph_thread_pool self._stage_subgraph_thread_pool_id = options.stage_subgraph_thread_pool_id + self._is_staged = False self._control_flow_ops = ['Switch', '_SwitchN', 'Merge', '_XlaMerge', 'Enter', 'Exit'] self._variable_ops = ['Variable', 'VariableV2', 'VarHandleOp', 'KvVarHandleOp', 'HashTableV2'] self._variable_is_init_ops = ['IsVariableInitialized', 'VarIsInitializedOp', 'KvVarIsInitializedOp'] - self._saver_ops = ['SaveV2'] + self._saver_ops = ['SaveV2', 'SaveV3'] self._no_data_input_ops = self._variable_ops + ['Placeholder', 'PlaceholderV2', 'Const'] self._boundary_ops = set() for tensor in ops.get_collection(ops.GraphKeys.ASYNC_EMBEDDING_OUTPUT_TENSORS): @@ -74,6 +75,10 @@ def __init__(self, options, checkpoint_dir = None): def stage(self, graph): """ add async embedding stage node to graph """ + if self._is_staged: + return + self._is_staged = True + logging.info('async embedding stage begin') logging.info('async embedding thread num: ' + str(self._threads_num)) logging.info('async embedding capacity: ' + str(self._capacity)) diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py index 09c05a02627..6eb204785dd 100644 --- a/tensorflow/python/training/monitored_session.py +++ b/tensorflow/python/training/monitored_session.py @@ -185,6 +185,7 @@ def __init__(self, self._saver = saver self._incremental_save_restore = incremental_save_restore self._incr_saver = None + self._async_embedding_stage = None self._enable_async_embedding = False self._async_embedding_checkpoint_dir = None self._async_embedding_options = None @@ -247,10 +248,11 @@ def default_ready_for_local_init_op(): self._incr_saver = incr_saver._get_incremental_saver(self._incremental_save_restore, self._saver) if self._enable_async_embedding: - async_embedding_stage = async_embedding.AsyncEmbeddingStage( - self._async_embedding_options, - self._async_embedding_checkpoint_dir) - async_embedding_stage.stage(ops.get_default_graph()) + if self._async_embedding_stage is None: + self._async_embedding_stage = async_embedding.AsyncEmbeddingStage( + self._async_embedding_options, + self._async_embedding_checkpoint_dir) + self._async_embedding_stage.stage(ops.get_default_graph()) ops.get_default_graph().finalize() logging.info('Graph was finalized.') From 0e8127a2cc9b2529ec2ab2f6f361d6c536280d60 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Wed, 25 Oct 2023 05:10:05 -0700 Subject: [PATCH 56/91] [Distribute] Add elastic-grpc server. 
(#936) Signed-off-by: JunqiHu --- configure.py | 3 + tensorflow/BUILD | 6 + tensorflow/contrib/elastic_grpc_server/BUILD | 70 ++++ .../elastic_grpc_server_lib.cc | 317 ++++++++++++++++++ .../elastic_grpc_server_lib.h | 66 ++++ .../elastic_grpc_server_lib_test.cc | 77 +++++ .../elastic_grpc_server/elastic_service.cc | 157 +++++++++ .../elastic_grpc_server/elastic_service.h | 31 ++ tensorflow/core/BUILD | 23 ++ .../distributed_runtime/rpc/grpc_server_lib.h | 14 +- .../core/platform/default/build_config.bzl | 6 + .../platform/default/build_config_root.bzl | 8 + .../core/protobuf/elastic_training.proto | 76 +++++ tensorflow/python/BUILD | 3 +- 14 files changed, 849 insertions(+), 8 deletions(-) create mode 100644 tensorflow/contrib/elastic_grpc_server/BUILD create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib_test.cc create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_service.cc create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_service.h create mode 100644 tensorflow/core/protobuf/elastic_training.proto diff --git a/configure.py b/configure.py index 362479981b2..6aeaf7d12af 100644 --- a/configure.py +++ b/configure.py @@ -1433,6 +1433,9 @@ def main(): set_build_var(environ_cp, 'TF_NEED_STAR', 'STAR', 'with_star_support', True, 'star') + set_build_var(environ_cp, 'TF_NEED_ELASTIC', 'ELASTIC TRAINING', 'with_elastic_support', + True, 'elastic') + set_build_var(environ_cp, 'TF_ENABLE_PMEM', 'PMEM', 'with_pmem_support', False, 'pmem') diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 493247a2162..8b4190ea680 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -434,6 +434,12 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "with_elastic_support", + values = {"define": "with_elastic_support=true"}, + visibility = ["//visibility:public"], +) + config_setting( name = "with_pmem_support", values = {"define": "with_pmem_support=true"}, diff --git a/tensorflow/contrib/elastic_grpc_server/BUILD b/tensorflow/contrib/elastic_grpc_server/BUILD new file mode 100644 index 00000000000..ea4b87e3b58 --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/BUILD @@ -0,0 +1,70 @@ +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +package(default_visibility = [ + "//tensorflow:internal", +]) + +load( + "//tensorflow:tensorflow.bzl", "tf_cc_test", +) + +cc_library( + name = "elastic_grpc_server_lib", + srcs = select({"//tensorflow:with_elastic_support": ["elastic_service.cc", + "elastic_grpc_server_lib.cc"], + "//conditions:default": []}), + hdrs = ["elastic_service.h", + "elastic_grpc_server_lib.h"], + linkstatic = 1, # Seems to be needed since alwayslink is broken in bazel + deps = [ + "//tensorflow/core:elastic_service_proto_cc", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + "//tensorflow/core/distributed_runtime/rpc:async_service_interface", + "//tensorflow/core/distributed_runtime/rpc:grpc_channel", + "//tensorflow/core/distributed_runtime/rpc:grpc_master_service", + "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache", + "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service", + "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr", + "//tensorflow:grpc", + "//tensorflow:grpc++", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + 
"//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core/common_runtime/eager:context", + "//tensorflow/core/distributed_runtime:collective_param_resolver_distributed", + "//tensorflow/core/distributed_runtime:device_resolver_distributed", + "//tensorflow/core/distributed_runtime:graph_mgr", + "//tensorflow/core/distributed_runtime:local_master", + "//tensorflow/core/distributed_runtime:master", + "//tensorflow/core/distributed_runtime:master_env", + "//tensorflow/core/distributed_runtime:master_session", + "//tensorflow/core/distributed_runtime:rpc_collective_executor_mgr", + "//tensorflow/core/distributed_runtime:server_lib", + "//tensorflow/core/distributed_runtime:session_mgr", + "//tensorflow/core/distributed_runtime:worker_cache_wrapper", + "//tensorflow/core/distributed_runtime:worker_env", + "//tensorflow/core/distributed_runtime:worker_resource", + "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_service_impl", + ], + alwayslink = 1, +) + +tf_cc_test( + name = "elastic_grpc_test", + size = "small", + srcs = ["elastic_grpc_server_lib_test.cc"], + deps = [ + ":elastic_grpc_server_lib", + "//tensorflow/core/distributed_runtime/rpc:grpc_util", + "//tensorflow:grpc", + "//tensorflow:grpc++", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:worker_proto_cc", + ], + linkstatic = 1, +) diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc new file mode 100644 index 00000000000..d45d70d6c8c --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc @@ -0,0 +1,317 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/
+
+#include "tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h"
+
+#include
+#include
+#include
+#include
+
+#include "include/json/json.h"
+#include "grpc/support/alloc.h"
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/security/credentials.h"
+#include "grpcpp/server_builder.h"
+#include "tensorflow/core/util/env_var.h"
+
+#include "tensorflow/contrib/elastic_grpc_server/elastic_service.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/graph_mgr.h"
+#include "tensorflow/core/distributed_runtime/local_master.h"
+#include "tensorflow/core/distributed_runtime/master.h"
+#include "tensorflow/core/distributed_runtime/master_env.h"
+#include "tensorflow/core/distributed_runtime/master_session.h"
+#include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h"
+#include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/distributed_runtime/worker_cache_wrapper.h"
+#include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/distributed_runtime/worker_resource.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
+
+namespace tensorflow {
+
+namespace {
+
+// static utility function
+RendezvousMgrInterface* NewRpcRendezvousMgr(const WorkerEnv* env) {
+  return new RpcRendezvousMgr(env);
+}
+
+} // namespace
+
+ElasticGrpcServer::ElasticGrpcServer(const ServerDef& server_def, Env* env)
+    : GrpcServer(server_def, env) {}
+
+ElasticGrpcServer::~ElasticGrpcServer() {
+  delete elastic_service_;
+}
+
+Status ElasticGrpcServer::UpdateServerDef(const string& cluster_def_str, int& before_part_num, int& after_part_num) {
+  std::string tf_config;
+  ReadStringFromEnvVar("TF_CONFIG", "", &tf_config);
+  if (!tf_config.empty()) {
+    Json::Reader reader;
+    Json::Value tf_config_json;
+    if(!reader.parse(tf_config, tf_config_json)) {
+      return errors::Internal("PARSE TF_CONFIG ERROR");
+    }
+    if ((tf_config_json["cluster"].isNull()) ||
+        (tf_config_json["cluster"]["ps"].isNull())) {
+      return errors::Internal("PARSE PS FROM TF_CONFIG ERROR");
+    }
+
+    Json::Value cluster_json;
+    if (!reader.parse(cluster_def_str, cluster_json)) {
+      LOG(ERROR) << "cluster_def is not correct with " << cluster_def_str;
+      return errors::Internal("PARSE TF_CONFIG/cluster ERROR");
+    }
+
+    std::unordered_set<string> ps_addrs_vec;
+    after_part_num = cluster_json["cluster"]["ps"].size();
+    for (auto& value: cluster_json["cluster"]["ps"]) {
+      ps_addrs_vec.emplace(value.asString());
+    }
+
+    int job_size = server_def_.cluster().job_size();
+    for (int j = 0; j < job_size; ++j) {
+      auto* job = server_def_.mutable_cluster()->mutable_job(j);
+      if (job->name() == "ps") {
"ps") { + before_part_num = job->tasks_size(); + if (before_part_num == after_part_num) { + return Status::OK(); + } else if (after_part_num > before_part_num) { + int idx = before_part_num; + LOG(INFO) << "SCALING UP, partition_num is: " << after_part_num; + std::unordered_set target_string_set; + for (auto& value: tf_config_json["cluster"]["ps"]) { + target_string_set.emplace(value.asString()); + } + for (auto ps_addr: ps_addrs_vec) { + if (target_string_set.find(ps_addr) == target_string_set.end()) { + job->mutable_tasks()->insert({idx, ps_addr}); + tf_config_json["cluster"]["ps"].append(ps_addr); + } + } + break; + } else { + LOG(INFO) << "SCALING DOWN, partition_num is: " << after_part_num; + for (int i = 0; i < before_part_num; ++i) { + string tmp_string = tf_config_json["cluster"]["ps"][i].asString(); + if (ps_addrs_vec.find(tmp_string) == ps_addrs_vec.end()) { + Json::Value ps_addr; + tf_config_json["cluster"]["ps"].removeIndex(i, &ps_addr); + job->mutable_tasks()->erase(i); + } + } + } + } + } + Json::FastWriter writer; + std::string new_tf_config = writer.write(tf_config_json); + LOG(INFO) << "new TF_CONFIG " << new_tf_config; + setenv("TF_CONFIG", new_tf_config.c_str(), 1); + } + return Status::OK(); +} + +Status ElasticGrpcServer::Update(const string& cluster_def_str) { + int before_part_num, after_part_num; + Status s = UpdateServerDef(cluster_def_str, before_part_num, after_part_num); + if (!s.ok()) { + LOG(ERROR) << s.error_message(); + return Status::OK(); + } + + if (after_part_num == before_part_num) { + return Status::OK(); + } + + WorkerCacheInterface* worker_cache; + WorkerCacheFactoryOptions worker_cache_factory_options(server_def_); + TF_RETURN_IF_ERROR( + WorkerCacheFactory(worker_cache_factory_options, &worker_cache)); + CHECK_NE(nullptr, worker_cache); + ConfigProto config = server_def_.default_session_config(); + string unused; + string default_worker_name; + if (!DeviceNameUtils::SplitDeviceName(master_env()->local_devices[0]->name(), + &default_worker_name, &unused)) { + return errors::Internal("Could not parse worker name."); + } + std::unique_ptr dev_resolver( + new DeviceResolverDistributed(worker_env()->device_mgr, worker_cache, + default_worker_name)); + std::unique_ptr param_resolver( + new CollectiveParamResolverDistributed(config, worker_env()->device_mgr, + dev_resolver.get(), worker_cache, + default_worker_name)); + worker_env()->collective_executor_mgr = new RpcCollectiveExecutorMgr( + config, worker_env()->device_mgr, std::move(dev_resolver), + std::move(param_resolver), worker_cache, default_worker_name); + + if (worker_env()->session_mgr != nullptr) { + delete worker_env()->session_mgr; // Deletes graph_mgr's. + } + + // Set up worker environment. + worker_env()->session_mgr = new SessionMgr( + worker_env(), SessionMgr::WorkerNameFromServerDef(server_def_), + std::unique_ptr(worker_cache), + [this](const ServerDef& server_def, WorkerCacheInterface** worker_cache) { + WorkerCacheFactoryOptions options(server_def); + return WorkerCacheFactory(options, worker_cache); + }); + master_env()->worker_cache = worker_cache; + // Finish setting up master environment. 
+
+  StatsPublisherFactory stats_factory = opts_.stats_factory;
+  master_env()->master_session_factory =
+      [config, stats_factory](
+          SessionOptions options, const MasterEnv* env,
+          std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
+          std::unique_ptr<WorkerCacheInterface> worker_cache,
+          std::unique_ptr<DeviceSet> device_set,
+          std::vector<string> filtered_worker_list) {
+        options.config.MergeFrom(config);
+        return new MasterSession(options, env, std::move(remote_devs),
+                                 std::move(worker_cache), std::move(device_set),
+                                 std::move(filtered_worker_list),
+                                 stats_factory);
+      };
+  master_env()->worker_cache_factory =
+      [this](const WorkerCacheFactoryOptions& options,
+             WorkerCacheInterface** worker_cache) {
+        return WorkerCacheFactory(options, worker_cache);
+      };
+  return Status::OK();
+}
+
+void ElasticGrpcServer::MaybeMutateBuilder(::grpc::ServerBuilder* builder) {
+  elastic_service_ = NewElasticGrpcService(this, builder);
+}
+
+Status ElasticGrpcServer::Start() {
+  {
+    mutex_lock l(mu_);
+    switch (state_) {
+      case NEW: {
+        update_server_thread_.reset(
+            env_->StartThread(ThreadOptions(), "TF_elastic_service",
+                              [this] { elastic_service_->HandleRPCsLoop(); }));
+        LOG(INFO) << "Started server with target: " << target();
+        break;
+      }
+      case STARTED:
+        LOG(INFO) << "Server already started (target: " << target() << ")";
+        return Status::OK();
+      case STOPPED:
+        return errors::FailedPrecondition("Server has stopped.");
+      default:
+        LOG(FATAL);
+    }
+  }
+  return GrpcServer::Start();
+}
+
+Status ElasticGrpcServer::Join() {
+  GrpcServer::Join();
+  mutex_lock l(mu_);
+  switch (state_) {
+    case NEW:
+      LOG(FATAL) << "Server should already be closed";
+    case STARTED:
+    case STOPPED:
+      update_server_thread_.reset();
+      return Status::OK();
+    default:
+      LOG(FATAL);
+  }
+}
+
+/* static */
+Status ElasticGrpcServer::Create(const ServerDef& server_def, Env* env,
+                                 std::unique_ptr<ServerInterface>* out_server) {
+  std::unique_ptr<ElasticGrpcServer> ret(
+      new ElasticGrpcServer(server_def, env == nullptr ? Env::Default() : env));
+  ServiceInitFunction service_func = nullptr;
+  GrpcServerOptions options;
+  options.rendezvous_mgr_func = NewRpcRendezvousMgr;
+  Status s = ret->Init(options);
+  if (!s.ok()) {
+    LOG(ERROR) << s;
+    return s;
+  }
+  *out_server = std::move(ret);
+  return Status::OK();
+}
+
+/* static */
+Status ElasticGrpcServer::Create(const ServerDef& server_def, Env* env,
+                                 std::unique_ptr<ElasticGrpcServer>* out_server) {
+  std::unique_ptr<ElasticGrpcServer> ret(
+      new ElasticGrpcServer(server_def, env == nullptr ? Env::Default() : env));
+  GrpcServerOptions options;
+  options.rendezvous_mgr_func = NewRpcRendezvousMgr;
+  Status s = ret->Init(options);
+  if (!s.ok()) {
+    LOG(ERROR) << s;
+    return s;
+  }
+  *out_server = std::move(ret);
+  return Status::OK();
+}
+
+namespace {
+
+class ElasticGrpcServerFactory : public ServerFactory {
+ public:
+  bool AcceptsOptions(const ServerDef& server_def) override {
+    return server_def.protocol() == "elastic-grpc";
+  }
+
+  Status NewServer(const ServerDef& server_def,
+                   std::unique_ptr<ServerInterface>* out_server) override {
+    return ElasticGrpcServer::Create(server_def, Env::Default(), out_server);
+  }
+};
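`UpdateServerDef` above reconciles the incoming `cluster_def` with the `ps` list already recorded in `TF_CONFIG`: new addresses are appended on scale-up, vanished ones are dropped on scale-down, and the merged JSON is re-exported through the environment. A hedged Python sketch of that bookkeeping (a simplified model of the C++ logic, not the implementation itself; addresses are placeholders):

    import json

    def update_ps_list(tf_config_str, new_ps_addrs):
        cfg = json.loads(tf_config_str)
        old = cfg["cluster"]["ps"]
        wanted = set(new_ps_addrs)
        kept = [a for a in old if a in wanted]              # scale-down: drop vanished ps
        kept += [a for a in new_ps_addrs if a not in old]   # scale-up: append new ps
        cfg["cluster"]["ps"] = kept
        return json.dumps(cfg)

    before = json.dumps({"cluster": {"ps": ["ps0:2222", "ps1:2222"]}})
    print(update_ps_list(before, ["ps0:2222", "ps1:2222", "ps2:2222"]))

The factory above keys on `server_def.protocol() == "elastic-grpc"`, so a server built with that protocol string is routed to `ElasticGrpcServer` while plain `grpc` jobs keep the stock implementation.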
+// Registers a `ServerFactory` for `ElasticGrpcServer` instances.
+class ElasticGrpcServerRegistrar {
+ public:
+  ElasticGrpcServerRegistrar() {
+    gpr_allocation_functions alloc_fns;
+    memset(&alloc_fns, 0, sizeof(alloc_fns));
+    alloc_fns.malloc_fn = port::Malloc;
+    alloc_fns.realloc_fn = port::Realloc;
+    alloc_fns.free_fn = port::Free;
+    gpr_set_allocation_functions(alloc_fns);
+    ServerFactory::Register("ELASTIC_GRPC_SERVER", new ElasticGrpcServerFactory());
+  }
+};
+static ElasticGrpcServerRegistrar registrar;
+
+} // namespace
+} // namespace tensorflow
\ No newline at end of file
diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h
new file mode 100644
index 00000000000..8853ceb2819
--- /dev/null
+++ b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h
@@ -0,0 +1,66 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+=======================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_GRPC_SERVER_LIB_H_
+#define TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_GRPC_SERVER_LIB_H_
+
+#include
+
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/security/credentials.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/stats_publisher_interface.h"
+#include "tensorflow/core/distributed_runtime/master_env.h"
+#include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/distributed_runtime/session_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+class ElasticGrpcServer : public GrpcServer {
+ public:
+  ElasticGrpcServer(const ServerDef& server_def, Env* env);
+
+  virtual ~ElasticGrpcServer() override;
+
+  static Status Create(const ServerDef& server_def, Env* env,
+                       std::unique_ptr<ServerInterface>* out_server);
+  static Status Create(const ServerDef& server_def, Env* env,
+                       std::unique_ptr<ElasticGrpcServer>* out_server);
+
+  Status Update(const string& cluster_def_str);
+
+  void MaybeMutateBuilder(::grpc::ServerBuilder* builder) override;
+
+  Status Start() override;
+
+  Status Join() override;
+
+ private:
+  Status UpdateServerDef(const string& cluster_def_str, int& before_part_num, int& after_part_num);
+
+ private:
+  // Elastic gRPC service implementation, and RPC polling thread.
+  AsyncServiceInterface* elastic_service_ = nullptr;
+  std::unique_ptr<Thread> update_server_thread_ GUARDED_BY(mu_);
+
+  std::unique_ptr<::grpc::Server> server_ GUARDED_BY(mu_);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_GRPC_SERVER_LIB_H_
\ No newline at end of file
diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib_test.cc b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib_test.cc
new file mode 100644
index 00000000000..e2db870a74a
--- /dev/null
+++ b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib_test.cc
@@ -0,0 +1,77 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+=======================================================================*/
+
+#include "tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+#include "gtest/gtest.h"
+
+namespace tensorflow {
+
+class ElasticGrpcServerTest : public ::testing::Test {
+ protected:
+  Status FillServerDef(const string& job_spec, ServerDef* options) {
+    options->set_protocol("elastic-grpc");
+    options->set_job_name("chief");
+    options->set_task_index(0);
+
+    uint32 my_tasks_per_replica = 0;
+    for (const string& job_str : str_util::Split(job_spec, ',')) {
+      JobDef* job_def = options->mutable_cluster()->add_job();
+      // Split each entry in the flag into 2 pieces, separated by "|".
+      const std::vector<string> job_pieces = str_util::Split(job_str, '|');
+      CHECK_EQ(2, job_pieces.size()) << job_str;
+      job_def->set_name(job_pieces[0]);
+      // Does a bit more validation of the tasks_per_replica.
+      const StringPiece spec = job_pieces[1];
+      // job_str is of form <job-name>|<host-ports>.
+      const std::vector<string> host_ports = str_util::Split(spec, ';');
+      uint32 tasks_per_replica = host_ports.size();
+      for (size_t i = 0; i < host_ports.size(); ++i) {
+        (*job_def->mutable_tasks())[i] = host_ports[i];
+      }
+      if (job_def->name() == options->job_name()) {
+        my_tasks_per_replica = tasks_per_replica;
+      }
+      LOG(INFO) << "Peer " << job_def->name() << " " << tasks_per_replica << " {"
+                << absl::StrJoin(host_ports, ", ") << "}";
+    }
+    if (my_tasks_per_replica == 0) {
+      return errors::InvalidArgument("Invalid job specification");
+    }
+    return Status::OK();
+  }
+};
+
+// Test Update logic
+TEST_F(ElasticGrpcServerTest, UpdateServer) {
+  Status s;
+  std::unique_ptr<ElasticGrpcServer> grpc_server;
+  ServerDef server_def;
+  std::string job_spec = "worker|localhost:2222,ps|localhost:10086;localhost:10087;localhost:10088,chief|localhost:2220";
+  TF_ASSERT_OK(FillServerDef(job_spec, &server_def));
+  s = ElasticGrpcServer::Create(server_def, Env::Default(), &grpc_server);
+  if (!s.ok()) {
+    LOG(ERROR) << "Could not create server: " << s.error_message();
+  }
+  TF_ASSERT_OK(grpc_server->Start());
+  // TF_QCHECK_OK(grpc_server->Join());
+  LOG(INFO) << "SCALING DOWN";
+  std::string tf_config_str = "{\"cluster\": {\"worker\": [\"localhost:2222\"],\"ps\": [\"localhost:10086\", \"localhost:10087\"],\"chief\": [\"localhost:2220\"]}}";
+  grpc_server->Update(tf_config_str);
+  LOG(INFO) << "SCALING UP";
+  tf_config_str = "{\"cluster\": {\"worker\": [\"localhost:2222\"],\"ps\": [\"localhost:10086\", \"localhost:10087\", \"localhost:10088\"],\"chief\": [\"localhost:2220\"]}}";
+  grpc_server->Update(tf_config_str);
+  grpc_server.release();
+}
+
+}
\ No newline at end of file
diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_service.cc b/tensorflow/contrib/elastic_grpc_server/elastic_service.cc
new file mode 100644
index 00000000000..61aa6e662ec
--- /dev/null
+++ b/tensorflow/contrib/elastic_grpc_server/elastic_service.cc
@@ -0,0 +1,157 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+=======================================================================*/
+
+#include "tensorflow/contrib/elastic_grpc_server/elastic_service.h"
+
+#include "tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h"
+#include "tensorflow/core/protobuf/elastic_training.grpc.pb.h"
+#include "tensorflow/core/protobuf/elastic_training.pb.h"
+#include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
+
+
+#include
+#include
+#include
+#include
+#include "grpcpp/server_builder.h"
+
+using namespace des;
+
+using grpc::Server;
+using grpc::ServerAsyncResponseWriter;
+using grpc::ServerBuilder;
+using grpc::ServerCompletionQueue;
+using grpc::ServerContext;
+
+namespace tensorflow {
+
+class GrpcElasticService : public AsyncServiceInterface {
+ public:
+  GrpcElasticService(ElasticGrpcServer* elastic_grpc_server,
+                     ::grpc::ServerBuilder* builder):
+      elastic_grpc_server_(elastic_grpc_server), builder_(builder) {
+    builder_->RegisterService(&elastic_service_);
+    cq_ = builder_->AddCompletionQueue();
+  }
+
+  ~GrpcElasticService() override { }
+
+  void Shutdown() override {
+    cq_->Shutdown();
+  }
+
+  void HandleRPCsLoop() override {
+    new CallData(&elastic_service_, elastic_grpc_server_, cq_.get());
+    void* tag;
+    bool ok;
+    while (true) {
+      // Block waiting to read the next event from the completion queue. The
+      // event is uniquely identified by its tag, which in this case is the
+      // memory address of a CallData instance.
+      // The return value of Next should always be checked. This return value
+      // tells us whether there is any kind of event or cq_ is shutting down.
+      GPR_ASSERT(cq_->Next(&tag, &ok));
+      GPR_ASSERT(ok);
+      static_cast<CallData*>(tag)->Proceed();
+    }
+  }
+
+ private:
+  // Class encompassing the state and logic needed to serve a request.
+  class CallData {
+   public:
+    // Take in the "service" instance (in this case representing an asynchronous
+    // server) and the completion queue "cq" used for asynchronous communication
+    // with the gRPC runtime.
+    CallData(ElasticTrainingService::AsyncService* service, ElasticGrpcServer* elastic_grpc_server,
+             ServerCompletionQueue* cq)
+        : service_(service), elastic_grpc_server_(elastic_grpc_server),
+          cq_(cq), responder_(&ctx_), status_(CREATE) {
+      // Invoke the serving logic right away.
+      Proceed();
+    }
+
+    void Proceed() {
+      if (status_ == CREATE) {
+        // Make this instance progress to the PROCESS state.
+        status_ = PROCESS;
+
+        // As part of the initial CREATE state, we *request* that the system
+        // start processing UpdateServerDef requests. In this request, "this" acts as
+        // the tag uniquely identifying the request (so that different CallData
+        // instances can serve different requests concurrently), in this case
+        // the memory address of this CallData instance.
+        service_->RequestUpdateServerDef(&ctx_, &request_, &responder_,
+                                         cq_, cq_, this);
+      } else if (status_ == PROCESS) {
+        // Spawn a new CallData instance to serve new clients while we process
+        // the one for this CallData. The instance will deallocate itself as
+        // part of its FINISH state.
+        new CallData(service_, elastic_grpc_server_, cq_);
+
+        // The actual processing.
+        Status s = elastic_grpc_server_->Update(request_.cluster_def());
+        if (s.ok()) {
+          reply_.set_code(Code::OK);
+        } else {
+          reply_.set_code(Code::INTERNAL);
+          reply_.set_msg(s.ToString());
+          LOG(ERROR) << "error: " << s.ToString();
+        }
+
+        // And we are done! Let the gRPC runtime know we've finished, using the
+        // memory address of this instance as the uniquely identifying tag for
+        // the event.
+        status_ = FINISH;
+        responder_.Finish(reply_, ::grpc::Status::OK, this);
+      } else {
+        GPR_ASSERT(status_ == FINISH);
+        // Once in the FINISH state, deallocate ourselves (CallData).
+        delete this;
+      }
+    }
+   private:
+    ElasticGrpcServer* elastic_grpc_server_;
+    // The means of communication with the gRPC runtime for an asynchronous
+    // server.
+    ElasticTrainingService::AsyncService* service_;
+    // The producer-consumer queue for asynchronous server notifications.
+    ServerCompletionQueue* cq_;
+    // Context for the rpc, allowing to tweak aspects of it such as the use
+    // of compression, authentication, as well as to send metadata back to the
+    // client.
+    ServerContext ctx_;
+
+    // What we get from the client.
+    UpdateServerDefRequest request_;
+    // What we send back to the client.
+    UpdateServerDefResponse reply_;
+
+    // The means to get back to the client.
+    ServerAsyncResponseWriter<UpdateServerDefResponse> responder_;
+
+    // Let's implement a tiny state machine with the following states.
+    enum CallStatus { CREATE, PROCESS, FINISH };
+    CallStatus status_;  // The current serving state.
+  };
+
+  ElasticGrpcServer* elastic_grpc_server_;
+  ::grpc::ServerBuilder* builder_;
+  ElasticTrainingService::AsyncService elastic_service_;
+  std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
+};
+
+AsyncServiceInterface* NewElasticGrpcService(
+    ElasticGrpcServer* elastic_grpc_server, ::grpc::ServerBuilder* builder) {
+  return reinterpret_cast<AsyncServiceInterface*>(new GrpcElasticService(elastic_grpc_server, builder));
+}
+}
\ No newline at end of file
diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_service.h b/tensorflow/contrib/elastic_grpc_server/elastic_service.h
new file mode 100644
index 00000000000..9465a10c918
--- /dev/null
+++ b/tensorflow/contrib/elastic_grpc_server/elastic_service.h
@@ -0,0 +1,31 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+=======================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_SERVICE_H_
+#define TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_SERVICE_H_
+
+
+#include
+#include "grpcpp/server_builder.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+
+class AsyncServiceInterface;
+class ElasticGrpcServer;
+
+AsyncServiceInterface* NewElasticGrpcService(
+    ElasticGrpcServer* elastic_grpc_server, ::grpc::ServerBuilder* builder);
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_SERVICE_H_
\ No newline at end of file
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 95bbbab5624..0531200e7ab 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -139,6 +139,7 @@ load(
     "tf_lib_proto_parsing_deps",
     "tf_proto_library",
     "tf_proto_library_cc",
+    "tf_proto_library_py",
     "tf_protos_all",
     "tf_protos_all_impl",
     "tf_protos_grappler",
@@ -2475,6 +2476,28 @@ tf_proto_library_cc(
     ],
 )
 
+tf_proto_library_cc(
+    name = "elastic_service_proto",
+    srcs = ["protobuf/elastic_training.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    cc_grpc_version = 1,
+    cc_stubby_versions = ["2"],
+    protodeps = tf_additional_all_protos(),
+    visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+tf_proto_library_py(
+    name = "elastic_service_pb",
+    srcs = ["protobuf/elastic_training.proto"],
+    use_grpc_plugin = True,
+    visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
 LIB_INTERNAL_PRIVATE_HEADERS = [
     "framework/resource_handle.h",
     "//tensorflow/core/platform:legacy_lib_internal_headers",
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index 521c8f206f8..79d6b0cd65e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -127,14 +127,11 @@ class GrpcServer : public ServerInterface {
   const ServerDef& server_def() const { return server_def_; }
   GrpcWorker* worker_impl() const { return worker_impl_.get(); }
 
- private:
-  // The overall server configuration.
-  const ServerDef server_def_;
+ protected:
+  // The overall server configuration. It may be changed during scaling.
+  ServerDef server_def_;
   Env* env_;
-
-  // The port to which this server is bound.
-  int bound_port_ = 0;
+  GrpcServerOptions opts_;
 
   // Guards state transitions.
   mutex mu_;
@@ -151,6 +148,9 @@ class GrpcServer : public ServerInterface {
   enum State { NEW, STARTED, STOPPED };
   State state_ GUARDED_BY(mu_);
 
+ private:
+  // The port to which this server is bound.
+  int bound_port_ = 0;
   // Implementation of a TensorFlow master, and RPC polling thread.
   MasterEnv master_env_;
   std::unique_ptr<Master> master_impl_;
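The header change above relaxes `GrpcServer`'s members from `private` to `protected` precisely so that a subclass can reuse the base state machine and override a single hook (`MaybeMutateBuilder`) to register an extra gRPC service. A toy Python sketch of that template-method shape (illustrative classes, not the real C++ types):

    class BaseServer(object):
      def build(self):
        services = ["master", "worker"]   # stands in for ::grpc::ServerBuilder
        self.maybe_mutate_builder(services)
        return services

      def maybe_mutate_builder(self, services):
        pass                              # default server adds nothing extra

    class ElasticServer(BaseServer):
      def maybe_mutate_builder(self, services):
        services.append("elastic")        # subclass hooks in its RPC service

    print(ElasticServer().build())        # ['master', 'worker', 'elastic']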
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 406285e7f0f..75d3c671562 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -769,6 +769,12 @@ def tf_additional_star_lib_defines():
         "//conditions:default": [],
     })
 
+def tf_additional_elastic_server_lib_defines():
+    return select({
+        "//tensorflow:with_elastic_support": ["TENSORFLOW_USE_ELASTIC_SERVER"],
+        "//conditions:default": [],
+    })
+
 def tf_additional_api_compatible_defines():
     return select({
         "//tensorflow:with_api_compatible": ["TF_API_COMPATIBLE_1150"],
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index 71651faf0b1..38191dea3c4 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -77,6 +77,14 @@ def tf_additional_star_deps():
         "//conditions:default": [],
     })
 
+def tf_additional_elastic_deps():
+    return select({
+        str(Label("//tensorflow:with_elastic_support")): [
+            str(Label("//tensorflow/contrib/elastic_grpc_server:elastic_grpc_server_lib")),
+        ],
+        "//conditions:default": [],
+    })
+
 # Include specific extra dependencies when building statically, or
 # another set of dependencies otherwise. If "macos" is provided, that
 # dependency list is used when using the framework_shared_object config
diff --git a/tensorflow/core/protobuf/elastic_training.proto b/tensorflow/core/protobuf/elastic_training.proto
new file mode 100644
index 00000000000..ee0d0bd10e0
--- /dev/null
+++ b/tensorflow/core/protobuf/elastic_training.proto
@@ -0,0 +1,76 @@
+syntax = "proto3";
+
+package des;
+
+enum Code {
+  OK = 0;
+  CANCELLED = 1;
+  UNKNOWN = 2;
+  INVALID_ARGUMENT = 3;
+  DEADLINE_EXCEEDED = 4;
+  NOT_FOUND = 5;
+  ALREADY_EXISTS = 6;
+  PERMISSION_DENIED = 7;
+  RESOURCE_EXHAUSTED = 8;
+  FAILED_PRECONDITION = 9;
+  ABORTED = 10;
+  OUT_OF_RANGE = 11;
+  UNIMPLEMENTED = 12;
+  INTERNAL = 13;
+  UNAVAILABLE = 14;
+  DATA_LOSS = 15;
+  UNAUTHENTICATED = 16;
+  REQUEST_STOP = 17;
+}
+
+enum ElasticTrainingState {
+  READY = 0;
+  SCALING = 1;
+  All_SESSION_CLOSED = 2;
+}
+
+enum ScalingAction {
+  NONE = 0;
+  SCALING_UP = 1;
+  SCALING_DOWN = 2;
+}
+
+message IsReadyScalingRequest {
+  int32 task_index = 1;
+}
+
+message IsReadyScalingResponse {
+  Code code = 1;
+  string msg = 2;
+  ScalingAction scaling_action = 3;
+  int32 ps_num = 4;  // updated ps_num
+}
+
+message ReadyToUpdateRequest {};
+message ReadyToUpdateResponse {};
+
+message UpdateServerDefRequest {
+  string cluster_def = 1;  // serialized cluster_def
+}
+
+message UpdateServerDefResponse {
+  Code code = 1;
+  string msg = 2;
+}
+
+message FetchParamsRequest {
+  repeated string names = 1;  // vec of partitioned variables or ev
+}
+
+message FetchParamsResponse {
+  Code code = 1;
+  string msg = 2;
+  map<string, int32> param_partition_map = 3;  // per partition num of variable
+}
+
+service ElasticTrainingService {
+  rpc IsReadyScaling(IsReadyScalingRequest) returns (IsReadyScalingResponse);
+  rpc ReadyToUpdate(ReadyToUpdateRequest) returns (ReadyToUpdateResponse);
+  rpc UpdateServerDef(UpdateServerDefRequest) returns (UpdateServerDefResponse);
+  rpc FetchParamsMeta(FetchParamsRequest) returns (FetchParamsResponse);
+}
\ No newline at end of file
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 68649078f5c..a740e0916d9 100644
--- a/tensorflow/python/BUILD
+++
b/tensorflow/python/BUILD @@ -24,7 +24,7 @@ load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc") load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "cuda_py_tests") load("//tensorflow/core/platform:default/build_config.bzl", "pyx_library", "tf_additional_all_protos", "tf_additional_cupti_test_flags", "tf_additional_lib_deps", "tf_proto_library", "tf_proto_library_py", "tf_protos_grappler") # @unused -load("//tensorflow/core/platform:default/build_config_root.bzl", "if_static", "tf_additional_gdr_deps", "tf_additional_mpi_deps", "tf_additional_plugin_deps", "tf_additional_verbs_deps", "tf_additional_star_deps") +load("//tensorflow/core/platform:default/build_config_root.bzl", "if_static", "tf_additional_gdr_deps", "tf_additional_mpi_deps", "tf_additional_plugin_deps", "tf_additional_verbs_deps", "tf_additional_star_deps", "tf_additional_elastic_deps") load("//tensorflow/python:build_defs.bzl", "tf_gen_op_wrapper_private_py") load( "//third_party/ngraph:build_defs.bzl", @@ -5307,6 +5307,7 @@ tf_py_wrap_cc( tf_additional_verbs_deps() + tf_additional_mpi_deps() + tf_additional_gdr_deps() + + tf_additional_elastic_deps() + tf_additional_star_deps()) + if_ngraph([ "@ngraph_tf//:ngraph_tf", ]), From 2d31c8e37ea28d7c169879ebd9c3a89bd8d26cb5 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Thu, 26 Oct 2023 04:02:41 -0700 Subject: [PATCH 57/91] [Embedding] Add interface of EmbeddingVar for Elastic Training. (#933) Signed-off-by: JunqiHu --- configure.py | 2 +- tensorflow/contrib/elastic_grpc_server/BUILD | 3 +- tensorflow/core/BUILD | 5 +- .../framework/embedding/bloom_filter_policy.h | 2 +- .../embedding/counter_filter_policy.h | 2 +- .../framework/embedding/cpu_hash_map_kv.h | 22 +++++ .../framework/embedding/dense_hash_map_kv.h | 19 ++++ .../core/framework/embedding/embedding_var.h | 86 ++++++++++++++++++- .../embedding/embedding_var_ckpt_data.h | 1 - .../core/framework/embedding/filter_policy.h | 20 ++++- .../framework/embedding/gpu_hash_map_kv.h | 7 ++ .../core/framework/embedding/kv_interface.h | 5 ++ .../core/framework/embedding/leveldb_kv.h | 32 +++++++ .../framework/embedding/multi_tier_storage.h | 9 +- .../embedding/nullable_filter_policy.h | 2 +- .../framework/embedding/single_tier_storage.h | 13 ++- .../core/framework/embedding/ssd_hash_kv.h | 6 ++ tensorflow/core/framework/embedding/storage.h | 7 +- tensorflow/core/kernels/data/BUILD | 6 ++ tensorflow/core/kernels/data/iterator_ops.cc | 12 ++- tensorflow/python/ops/embedding_ops.py | 3 +- 21 files changed, 244 insertions(+), 20 deletions(-) diff --git a/configure.py b/configure.py index 6aeaf7d12af..4fb1c78c40b 100644 --- a/configure.py +++ b/configure.py @@ -1434,7 +1434,7 @@ def main(): True, 'star') set_build_var(environ_cp, 'TF_NEED_ELASTIC', 'ELASTIC TRAINING', 'with_elastic_support', - True, 'elastic') + False, 'elastic') set_build_var(environ_cp, 'TF_ENABLE_PMEM', 'PMEM', 'with_pmem_support', False, 'pmem') diff --git a/tensorflow/contrib/elastic_grpc_server/BUILD b/tensorflow/contrib/elastic_grpc_server/BUILD index ea4b87e3b58..16ec91f4435 100644 --- a/tensorflow/contrib/elastic_grpc_server/BUILD +++ b/tensorflow/contrib/elastic_grpc_server/BUILD @@ -56,7 +56,8 @@ cc_library( tf_cc_test( name = "elastic_grpc_test", size = "small", - srcs = ["elastic_grpc_server_lib_test.cc"], + srcs = select({"//tensorflow:with_elastic_support": ["elastic_grpc_server_lib_test.cc"], + "//conditions:default": []}), deps = [ ":elastic_grpc_server_lib", 
"//tensorflow/core/distributed_runtime/rpc:grpc_util", diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 0531200e7ab..ef1ebcb6dcf 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -128,6 +128,7 @@ load( "tf_additional_numa_deps", "tf_additional_numa_lib_defines", "tf_additional_star_lib_defines", + "tf_additional_elastic_server_lib_defines", "tf_additional_api_compatible_defines", "tf_additional_pmem_lib_defines", "tf_additional_test_deps", @@ -1441,6 +1442,7 @@ tf_cc_test( cc_library( name = "ops", visibility = ["//visibility:public"], + defines = tf_additional_elastic_server_lib_defines(), deps = [ ":array_ops_op_lib", ":parquet_ops_op_lib", @@ -2562,7 +2564,8 @@ LIB_INTERNAL_DEFINES = ( tf_additional_gdr_lib_defines() + tf_additional_numa_lib_defines() + tf_additional_star_lib_defines() + - tf_additional_pmem_lib_defines() + tf_additional_pmem_lib_defines() + + tf_additional_elastic_server_lib_defines() ) cc_library( diff --git a/tensorflow/core/framework/embedding/bloom_filter_policy.h b/tensorflow/core/framework/embedding/bloom_filter_policy.h index 781511578af..8019e70a312 100644 --- a/tensorflow/core/framework/embedding/bloom_filter_policy.h +++ b/tensorflow/core/framework/embedding/bloom_filter_policy.h @@ -333,7 +333,7 @@ class BloomFilterPolicy : public FilterPolicy { // this can describe by graph(Mod + DynamicPartition), // but memory waste and slow if (*(key_buff + i) % bucket_num % partition_num != partition_id) { - LOG(INFO) << "skip EV key:" << *(key_buff + i); + VLOG(1) << "skip EV key:" << *(key_buff + i); continue; } void* value_ptr = nullptr; diff --git a/tensorflow/core/framework/embedding/counter_filter_policy.h b/tensorflow/core/framework/embedding/counter_filter_policy.h index 19cd90ad01c..e53d574182c 100644 --- a/tensorflow/core/framework/embedding/counter_filter_policy.h +++ b/tensorflow/core/framework/embedding/counter_filter_policy.h @@ -159,7 +159,7 @@ class CounterFilterPolicy : public FilterPolicy { // this can describe by graph(Mod + DynamicPartition), // but memory waste and slow if (*(key_buff + i) % bucket_num % partition_num != partition_id) { - LOG(INFO) << "skip EV key:" << *(key_buff + i); + VLOG(1) << "skip EV key:" << *(key_buff + i); continue; } int64 import_freq = 0; diff --git a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h index 8476c399c40..750ba282285 100644 --- a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h @@ -137,6 +137,28 @@ class LocklessHashMap : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + std::pair *hash_map_dump; + int64 bucket_count; + auto it = hash_map_.GetSnapshot(); + hash_map_dump = it.first; + bucket_count = it.second; + for (int64 j = 0; j < bucket_count; j++) { + if (hash_map_dump[j].first != LocklessHashMap::EMPTY_KEY_ + && hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_ + && hash_map_dump[j].first % kSavedPartitionNum + % partition_nums != partition_id) { + key_list->emplace_back(hash_map_dump[j].first); + value_ptr_list->emplace_back(hash_map_dump[j].second); + } + } + + free(hash_map_dump); + return Status::OK(); + } + std::string DebugString() const override { LOG(INFO) << "map info size:" << Size() << "map info bucket_count:" << hash_map_.bucket_count() diff --git 
a/tensorflow/core/framework/embedding/dense_hash_map_kv.h b/tensorflow/core/framework/embedding/dense_hash_map_kv.h index ffaf2e335dc..8a27404b66f 100644 --- a/tensorflow/core/framework/embedding/dense_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/dense_hash_map_kv.h @@ -121,6 +121,25 @@ class DenseHashMap : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + dense_hash_map hash_map_dump[partition_num_]; + for (int i = 0; i< partition_num_; i++) { + spin_rd_lock l(hash_map_[i].mu); + hash_map_dump[i].hash_map = hash_map_[i].hash_map; + } + for (int i = 0; i< partition_num_; i++) { + for (const auto it : hash_map_dump[i].hash_map) { + if (it.first % kSavedPartitionNum % partition_nums != partition_id) { + key_list->push_back(it.first); + value_ptr_list->push_back(it.second); + } + } + } + return Status::OK(); + } + std::string DebugString() const override { return ""; } diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 487f595bf31..a66ec19fb97 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -435,6 +435,10 @@ class EmbeddingVar : public ResourceBase { return storage_->CacheSize(); } + int64 MemoryUsage() const { + return storage_->Size() * (sizeof(K) + feat_desc_->data_bytes()); + } + int64 MinFreq() { return emb_config_.filter_freq; } @@ -516,6 +520,85 @@ class EmbeddingVar : public ResourceBase { } } + Status GetShardedSnapshot(std::vector* key_list, + std::vector* value_ptr_list, + int partition_id, int partition_num) { + return storage_->GetShardedSnapshot(key_list, value_ptr_list, + partition_id, partition_num); + } + + void ExportAndRemove(K* key_list, V* value_list, + int64* version_list, int64* freq_list, + std::vector& tot_keys_list, + std::vector& tot_value_ptr_list) { + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar( + "TF_EV_SAVE_FILTERED_FEATURES", true, &save_unfiltered_features)); + + bool is_save_freq = emb_config_.is_save_freq(); + bool is_save_version = emb_config_.is_save_version(); + + for (int64 i = 0; i < tot_keys_list.size(); ++i) { + auto& value_ptr = tot_value_ptr_list[i]; + if((int64)value_ptr == embedding::ValuePtrStatus::IS_DELETED) + continue; + + bool is_admit = feat_desc_->IsAdmit(value_ptr); + bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); + + if (!is_admit) { + key_list[i] = tot_keys_list[i]; + + if (!is_in_dram) { + auto tmp_value = value_list + i * value_len_; + tmp_value = (V*)embedding::ValuePtrStatus::NOT_IN_DRAM; + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + } else if (feat_desc_->GetEmbedding(value_ptr, 0) == nullptr) { + memcpy(value_list + i * value_len_, default_value_, sizeof(V) * value_len_); + } else { + V* val = feat_desc_->GetEmbedding(value_ptr, emb_config_.emb_index); + memcpy(value_list + i * value_len_, val, sizeof(V) * value_len_); + } + + if(is_save_version) { + int64 dump_version = feat_desc_->GetVersion(value_ptr); + version_list[i] = dump_version; + } + + if(is_save_freq) { + int64 dump_freq = feat_desc_->GetFreq(value_ptr); + freq_list[i] = dump_freq; + } + } else { + if (!save_unfiltered_features) + return; + //TODO(JUNQI) : currently not export filtered keys + } + + if (emb_config_.is_primary()) { + Status s; + s = storage_->Remove(tot_keys_list[i]); + if (!s.ok()) { + LOG(ERROR) 
<< "Remove keys error: " << s.error_message(); + } + feat_desc_->Deallocate(value_ptr); + } + } + } + + Status RestoreFromKeysAndValues(int64 key_num, int partition_id, + int partition_num, const K* key_list, + const V* value_list, const int64* version_list, + const int64* freq_list, + const Eigen::GpuDevice* device = nullptr) { + RestoreBuffer restore_buff((char*)key_list, (char*)value_list, + (char*)version_list, (char*)freq_list); + return storage_->RestoreFeatures(key_num, kSavedPartitionNum, + partition_id, partition_num, + value_len_, false/* is_filter*/, false/* is_incr*/, + emb_config_, device, filter_, restore_buff); + } + mutex* mu() { return &mu_; } @@ -537,6 +620,8 @@ class EmbeddingVar : public ResourceBase { } } + string Name() {return name_; } + V* GetDefaultValuePtr() { return default_value_; } @@ -645,7 +730,6 @@ class EmbeddingVar : public ResourceBase { GPUHashTable* HashTable() { return storage_->HashTable(); } - FilterPolicy>* GetFilter() const { return filter_; } diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h index 10bf0d0e43b..13072f9cdd1 100644 --- a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h @@ -20,7 +20,6 @@ limitations under the License. namespace tensorflow { class BundleWriter; namespace { - const int kSavedPartitionNum = 1000; const int kDramFlagOffset = 49; } diff --git a/tensorflow/core/framework/embedding/filter_policy.h b/tensorflow/core/framework/embedding/filter_policy.h index 256d3b044d4..c994829bafc 100644 --- a/tensorflow/core/framework/embedding/filter_policy.h +++ b/tensorflow/core/framework/embedding/filter_policy.h @@ -27,19 +27,31 @@ struct RestoreBuffer { char* value_buffer = nullptr; char* version_buffer = nullptr; char* freq_buffer = nullptr; + bool should_release = false; explicit RestoreBuffer(size_t buffer_size) { key_buffer = new char[buffer_size]; value_buffer = new char[buffer_size]; version_buffer = new char[buffer_size]; freq_buffer = new char[buffer_size]; + should_release = true; + } + + explicit RestoreBuffer(char* i_key_buffer, char* i_value_buffer, + char* i_version_buffer, char* i_freq_buffer) { + key_buffer = i_key_buffer; + value_buffer = i_value_buffer; + version_buffer = i_version_buffer; + freq_buffer = i_freq_buffer; } ~RestoreBuffer() { - delete []key_buffer; - delete []value_buffer; - delete []version_buffer; - delete []freq_buffer; + if (should_release) { + delete []key_buffer; + delete []value_buffer; + delete []version_buffer; + delete []freq_buffer; + } } }; diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h index fc4a2506313..e73839e3f76 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h @@ -252,6 +252,13 @@ class GPUHashMapKV : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + LOG(INFO) << "GPUHashMapKV do not support GetShardedSnapshot"; + return Status::OK(); + } + std::string DebugString() const override { return std::string(); } GPUHashTable* HashTable() override { return hash_table_; } diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index 3659187c825..dc603680138 100644 --- 
a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -23,6 +23,7 @@ limitations under the License. namespace tensorflow { namespace { const char* kInferenceMode = "INFERENCE_MODE"; +const int kSavedPartitionNum = 1000; } template @@ -89,6 +90,10 @@ class KVInterface { virtual Status GetSnapshot(std::vector* key_list, std::vector* value_ptr_list) = 0; + virtual Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) = 0; + virtual std::string DebugString() const = 0; virtual Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, diff --git a/tensorflow/core/framework/embedding/leveldb_kv.h b/tensorflow/core/framework/embedding/leveldb_kv.h index e488ab3776d..47c8a39dfbd 100644 --- a/tensorflow/core/framework/embedding/leveldb_kv.h +++ b/tensorflow/core/framework/embedding/leveldb_kv.h @@ -193,6 +193,38 @@ class LevelDBKV : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + ReadOptions options; + options.snapshot = db_->GetSnapshot(); + leveldb::Iterator* it = db_->NewIterator(options); + void* dram_value_ptr = feat_desc_->Allocate(); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + K key; + memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); + if (key % kSavedPartitionNum % partition_nums == partition_id) continue; + key_list->emplace_back(key); + FeatureDescriptor hbm_feat_desc( + 1, 1, ev_allocator()/*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc.data_bytes()); + memcpy(dram_value_ptr, + it->value().ToString().data(), + feat_desc_->data_bytes()); + hbm_feat_desc.SetFreq( + value_ptr, feat_desc_->GetFreq(dram_value_ptr)); + hbm_feat_desc.UpdateVersion( + value_ptr, feat_desc_->GetVersion(dram_value_ptr)); + value_ptr_list->emplace_back(value_ptr); + } + delete it; + feat_desc_->Deallocate(dram_value_ptr); + return Status::OK(); + } + int64 Size() const override { return counter_->size(); } diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index 7955322aca6..f77fec8c85a 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -87,6 +87,14 @@ class MultiTierStorage : public Storage { Status GetSnapshot(std::vector* key_list, std::vector* value_ptr_list) override { LOG(FATAL)<<"Can't get snapshot of MultiTierStorage."; + return Status::OK(); + } + + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + LOG(FATAL)<<"Can't get sharded snapshot of MultiTierStorage."; + return Status::OK(); } void CopyEmbeddingsFromCPUToGPU( @@ -170,7 +178,6 @@ class MultiTierStorage : public Storage { }); } - protected: Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool is_incr, const EmbeddingConfig& emb_config, diff --git a/tensorflow/core/framework/embedding/nullable_filter_policy.h b/tensorflow/core/framework/embedding/nullable_filter_policy.h index 7e3ace0063d..55f718d7ca4 100644 --- a/tensorflow/core/framework/embedding/nullable_filter_policy.h +++ 
b/tensorflow/core/framework/embedding/nullable_filter_policy.h @@ -150,7 +150,7 @@ class NullableFilterPolicy : public FilterPolicy { // this can describe by graph(Mod + DynamicPartition), // but memory waste and slow if (*(key_buff + i) % bucket_num % partition_num != partition_id) { - LOG(INFO) << "skip EV key:" << *(key_buff + i); + VLOG(1) << "skip EV key:" << *(key_buff + i); continue; } int64 import_freq = 0; diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h index be08afd7f50..db96c807c5e 100644 --- a/tensorflow/core/framework/embedding/single_tier_storage.h +++ b/tensorflow/core/framework/embedding/single_tier_storage.h @@ -223,6 +223,14 @@ class SingleTierStorage : public Storage { return kv_->GetSnapshot(key_list, value_ptr_list); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + mutex_lock l(Storage::mu_); + return kv_->GetShardedSnapshot(key_list, value_ptr_list, + partition_id, partition_nums); + } + Status Save( const std::string& tensor_name, const std::string& prefix, @@ -286,7 +294,7 @@ class SingleTierStorage : public Storage { FeatureDescriptor* feature_descriptor() { return feat_desc_; } - protected: + virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool is_incr, const EmbeddingConfig& emb_config, @@ -298,7 +306,8 @@ class SingleTierStorage : public Storage { false/*to_dram*/, is_incr, restore_buff); return s; } - + + protected: virtual void Shrink(std::vector& key_list, std::vector& value_ptr_list, ShrinkArgs& shrink_args, diff --git a/tensorflow/core/framework/embedding/ssd_hash_kv.h b/tensorflow/core/framework/embedding/ssd_hash_kv.h index f51c6904a50..a56c9f73385 100644 --- a/tensorflow/core/framework/embedding/ssd_hash_kv.h +++ b/tensorflow/core/framework/embedding/ssd_hash_kv.h @@ -349,6 +349,12 @@ class SSDHashKV : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + return Status::OK(); + } + Status GetSnapshot( std::vector* key_list, std::vector* file_list) { diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index 1ffb435054b..a652de5fa5f 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -95,6 +95,9 @@ class Storage { virtual int64 Size(int level) const = 0; virtual Status GetSnapshot(std::vector* key_list, std::vector* value_ptr_list) = 0; + virtual Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) = 0; virtual Status Save( const string& tensor_name, const string& prefix, @@ -197,7 +200,6 @@ class Storage { int64 freq, int64 version, int emb_index) = 0; - protected: virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool is_incr, const EmbeddingConfig& emb_config, @@ -206,7 +208,8 @@ class Storage { RestoreBuffer& restore_buff) { return Status::OK(); } - + + protected: virtual Status RestoreSSD(int64 emb_index, int64 emb_slot_num, int64 value_len, const std::string& ssd_emb_file_name, diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 08445403b58..6878c5f8350 100644 --- 
a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -9,6 +9,11 @@ load( "transitive_hdrs", ) +load( + "//tensorflow/core/platform:default/build_config.bzl", + "tf_additional_elastic_server_lib_defines", +) + package( default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 @@ -1119,6 +1124,7 @@ tf_kernel_library( name = "iterator_ops", srcs = ["iterator_ops.cc"], hdrs = ["iterator_ops.h"], + defines = tf_additional_elastic_server_lib_defines(), deps = [ ":captured_function", ":dataset_utils", diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index 08d9d936537..ed6b40a38a0 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -308,7 +308,11 @@ void IteratorHandleOp::Compute(OpKernelContext* context) LOCKS_EXCLUDED(mu_) { } ResourceMgr* mgr = context->resource_manager(); - OP_REQUIRES_OK(context, cinfo_.Init(mgr, def())); +#ifdef TENSORFLOW_USE_ELASTIC_SERVER + OP_REQUIRES_OK(context, cinfo_.Init(mgr, def(), true)); +#else + OP_REQUIRES_OK(context, cinfo_.Init(mgr, def(), false)); +#endif IteratorResource* resource; OP_REQUIRES_OK( @@ -783,7 +787,11 @@ class OneShotIteratorOp : public AsyncOpKernel { Status TryInit(OpKernelContext* ctx, IteratorResource** iterator, ContainerInfo* cinfo) { - TF_RETURN_IF_ERROR(cinfo->Init(ctx->resource_manager(), def())); +#ifdef TENSORFLOW_USE_ELASTIC_SERVER + TF_RETURN_IF_ERROR(cinfo->Init(ctx->resource_manager(), def(), true)); +#else + TF_RETURN_IF_ERROR(cinfo->Init(ctx->resource_manager(), def(), false)); +#endif FunctionLibraryRuntime* flr; std::unique_ptr flib_def(nullptr); diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py index cb2b7bb8154..e239c9ba8d5 100644 --- a/tensorflow/python/ops/embedding_ops.py +++ b/tensorflow/python/ops/embedding_ops.py @@ -44,6 +44,7 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import tf_export +SAVED_PARTITIONED_NUM = 1000 def _clip(params, ids, max_norm): """Helper function for _embedding_lookup_and_transform. @@ -216,7 +217,7 @@ def _embedding_lookup_and_transform(params, if isinstance(params[0], kv_variable_ops.EmbeddingVariable): new_ids = flat_ids - p_assignments = flat_ids % 1000 % np + p_assignments = flat_ids % SAVED_PARTITIONED_NUM % np elif partition_strategy == "mod": p_assignments = flat_ids % np new_ids = flat_ids // np From 89c7d63f50ed335ea14eb17f295b315a59e9f843 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 1 Nov 2023 19:48:48 +0800 Subject: [PATCH 58/91] [Runtime] Update log level in direct_session. 
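The `embedding_ops.py` change above names the constant behind the long-standing `% 1000 % np` idiom. The point of the two-level modulus is that ids are first hashed into a fixed set of 1000 saved buckets, and only the bucket-to-ps mapping depends on the live ps count, so resharding moves whole buckets rather than individual keys. A small sketch:

    SAVED_PARTITIONED_NUM = 1000  # fixed bucket count, mirrors the constant above

    def partition_assignment(flat_id, num_ps):
        # The bucket is stable across scaling; only the second modulus changes.
        return flat_id % SAVED_PARTITIONED_NUM % num_ps

    for num_ps in (2, 3):
        print(num_ps, [partition_assignment(i, num_ps) for i in (7, 1007, 2007)])
    # ids 7, 1007 and 2007 share bucket 7, so they always land on the same ps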
(#935) Signed-off-by: candy.dc --- tensorflow/core/common_runtime/direct_session.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 9670e838f88..a3dd3eba2ed 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -2185,8 +2185,8 @@ Status DirectSession::GetOrCreateExecutors( auto insert_key_status = executors_.emplace(key, insert_result.first->second); *executors_and_keys = insert_result.first->second.get(); if (insert_key_status.second) { - LOG(INFO) << "Add new unsort key to executors_ map: " << executors_idx++ - << ", key: " << key << ", this: " << this; + VLOG(2) << "Add new unsort key to executors_ map: " << executors_idx++ + << ", key: " << key << ", this: " << this; } return Status::OK(); From c2e664aecaec18106350ec77dee946e45dbcf1fb Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 7 Nov 2023 19:10:14 -0800 Subject: [PATCH 59/91] [Embedding] Remove private header. (#943) Signed-off-by: JunqiHu --- tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h | 1 - tensorflow/core/framework/embedding/hbm_dram_storage.h | 1 - 2 files changed, 2 deletions(-) diff --git a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h index 1056f4bbd78..4bc3b7d3aa2 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h @@ -7,7 +7,6 @@ #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" #include "tensorflow/core/platform/stream_executor.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" namespace tensorflow { using se::DeviceMemoryBase; diff --git a/tensorflow/core/framework/embedding/hbm_dram_storage.h b/tensorflow/core/framework/embedding/hbm_dram_storage.h index d058d95f05b..15f6271fb4f 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_storage.h @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/hbm_storage_iterator.h" #include "tensorflow/core/framework/embedding/intra_thread_copy_id_allocator.h" #include "tensorflow/core/platform/stream_executor.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" namespace tensorflow { using se::DeviceMemoryBase; From fc4f9f5c48b3f84d1f945c6aa738253cac7acf95 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 7 Nov 2023 23:32:37 -0800 Subject: [PATCH 60/91] [Distributed] Fix ps address list sort by index. 
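The patch that follows swaps an unordered container for an ordered one when rebuilding the ps task map. The reason is determinism: task indices are assigned by iteration order, so every process must walk the addresses in the same order or workers end up with conflicting index-to-address maps. A minimal Python sketch of the fix:

    addrs = {"ps2:2222", "ps0:2222", "ps1:2222"}  # set iteration order is unspecified

    # Sorting first gives every process the same index -> address mapping.
    tasks = {i: a for i, a in enumerate(sorted(addrs))}
    print(tasks)  # {0: 'ps0:2222', 1: 'ps1:2222', 2: 'ps2:2222'}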
(#945) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- .../elastic_grpc_server_lib.cc | 17 +++++++++++------ .../elastic_grpc_server/elastic_service.cc | 2 +- tensorflow/core/protobuf/elastic_training.proto | 2 +- tensorflow/python/BUILD | 1 + 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc index d45d70d6c8c..66e237956e5 100644 --- a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc +++ b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include +#include #include "include/json/json.h" #include "grpc/support/alloc.h" #include "grpcpp/grpcpp.h" @@ -89,7 +90,7 @@ Status ElasticGrpcServer::UpdateServerDef(const string& cluster_def_str, int& be return errors::Internal("PARSE TF_CONFIG/cluster ERROR"); } - std::unordered_set ps_addrs_vec; + std::set ps_addrs_vec; //ordered after_part_num = cluster_json["cluster"]["ps"].size(); for (auto& value: cluster_json["cluster"]["ps"]) { ps_addrs_vec.emplace(value.asString()); @@ -111,21 +112,25 @@ Status ElasticGrpcServer::UpdateServerDef(const string& cluster_def_str, int& be } for (auto ps_addr: ps_addrs_vec) { if (target_string_set.find(ps_addr) == target_string_set.end()) { - job->mutable_tasks()->insert({idx, ps_addr}); + job->mutable_tasks()->insert({idx++, ps_addr}); tf_config_json["cluster"]["ps"].append(ps_addr); } } break; } else { LOG(INFO) << "SCALING DOWN, partition_num is: " << after_part_num; + google::protobuf::Map< google::protobuf::int32, std::string > tasks; + Json::Value arr_value(Json::arrayValue); + int idx = 0; for (int i = 0; i < before_part_num; ++i) { string tmp_string = tf_config_json["cluster"]["ps"][i].asString(); - if (ps_addrs_vec.find(tmp_string) == ps_addrs_vec.end()) { - Json::Value ps_addr; - tf_config_json["cluster"]["ps"].removeIndex(i, &ps_addr); - job->mutable_tasks()->erase(i); + if (ps_addrs_vec.find(tmp_string) != ps_addrs_vec.end()) { + arr_value.append(tf_config_json["cluster"]["ps"][i]); + tasks[idx++] = tmp_string; } } + tf_config_json["cluster"]["ps"].swap(arr_value); + job->mutable_tasks()->swap(tasks); } } } diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_service.cc b/tensorflow/contrib/elastic_grpc_server/elastic_service.cc index 61aa6e662ec..59f7fa473bd 100644 --- a/tensorflow/contrib/elastic_grpc_server/elastic_service.cc +++ b/tensorflow/contrib/elastic_grpc_server/elastic_service.cc @@ -24,7 +24,7 @@ limitations under the License. 
#include #include "grpcpp/server_builder.h" -using namespace des; +using namespace deeprec; using grpc::Server; using grpc::ServerAsyncResponseWriter; diff --git a/tensorflow/core/protobuf/elastic_training.proto b/tensorflow/core/protobuf/elastic_training.proto index ee0d0bd10e0..b6af4b139cf 100644 --- a/tensorflow/core/protobuf/elastic_training.proto +++ b/tensorflow/core/protobuf/elastic_training.proto @@ -1,6 +1,6 @@ syntax = "proto3"; -package des; +package deeprec; enum Code { OK = 0; diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index a740e0916d9..f9cc74743be 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -4747,6 +4747,7 @@ py_library( ":platform", ":protos_all_py", ":session_run_hook", + "//tensorflow/core:elastic_service_pb_py", ":training_util", ":util", ], From 29d9b464b55b571484ceae11947a6dfa25caba19 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Wed, 8 Nov 2023 19:17:25 -0800 Subject: [PATCH 61/91] [Op] Canonicalize SaveV2 Op device spec in distributed training. (#925) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- tensorflow/python/training/saver.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py index 981d01dd7be..acc9723c183 100644 --- a/tensorflow/python/training/saver.py +++ b/tensorflow/python/training/saver.py @@ -550,8 +550,12 @@ def _GroupByDevices(self, saveables): """ per_device = collections.defaultdict(lambda: []) for saveable in saveables: - canonical_device = set( - pydev.canonical_name(spec.tensor.device) for spec in saveable.specs) + canonical_device = set() + for spec in saveable.specs: + device_name = pydev.canonical_name(spec.tensor.device) + device_spec = pydev.DeviceSpec.from_string(device_name) + device_spec.device_type = "CPU" + canonical_device.add(device_spec.to_string()) if len(canonical_device) != 1: raise ValueError("All tensors of a saveable object must be " "on the same device: %s" % saveable.name) From feab52dd225b9838d41790f25abb0f2f0607b199 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 15 Nov 2023 10:24:34 +0800 Subject: [PATCH 62/91] [Embedding] Fix SharedEmbeddingColumn with PartitionedEmbedingVariable shape validation error. 
(#948) Signed-off-by: candy.dc --- .../python/feature_column/feature_column.py | 3 ++ .../feature_column/feature_column_v2_test.py | 35 +++++++++++++++++++ tensorflow/python/ops/variables.py | 3 ++ 3 files changed, 41 insertions(+) diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 3d5e7a71330..86a190cf86b 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -2675,6 +2675,9 @@ def create_embedding(self, embedding_weights = shared_embedding_collection[0] if isinstance(embedding_weights, kv_variable_ops.EmbeddingVariable): embedding_shape = (self.dimension) + elif isinstance(embedding_weights, variables.PartitionedVariable): + if isinstance(embedding_weights._get_variable_list()[0], kv_variable_ops.EmbeddingVariable): + embedding_shape = (self.dimension) if embedding_weights.get_shape() != embedding_shape: raise ValueError( 'Shared embedding collection {} contains variable {} of ' diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index ff5935b708f..7946aee1e1a 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -7705,6 +7705,41 @@ def testEmbeddingVariableForSharedEmbeddingColumnsMultiCol(self): for j in range(3): self.assertAlmostEqual(emb_r[i][j], emb_right[i][j]) + def testEmbeddingVariableForSharedPartitionedEmbeddingColumnsMultiCol(self): + columns_list=[] + columns_list.append(fc.categorical_column_with_embedding("col_emb", dtype=dtypes.string)) + columns_list.append(fc.categorical_column_with_embedding("col_emb2", dtype=dtypes.string)) + W = fc.shared_embedding_columns(columns_list, + dimension=3, + initializer=init_ops.ones_initializer(dtypes.float32), + shared_embedding_collection_name="xxxxx_shared") + + ids={} + ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0]], values=["aaaa","bbbbb","ccc","4nn","5b"], dense_shape=[5, 5]) + ids["col_emb2"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0]], values=["aaaa","bbbbb","ccc","4nn","5b"], dense_shape=[5, 5]) + with variable_scope.variable_scope("scope",partitioner=partitioned_variables.fixed_size_partitioner(4)): + emb = fc_old.input_layer(ids, W) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables_lib.global_variables_initializer() + + with self.test_session() as sess: + sess.run(init) + sess.run([emb, train_op,loss]) + sess.run([emb, train_op,loss]) + emb_r, _, _ = sess.run([emb, train_op,loss]) + emb_right = [[0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214]] + for i in range(5): + for j in range(3): + self.assertAlmostEqual(emb_r[i][j], emb_right[i][j]) + @test_util.run_deprecated_v1 def testEmbeddingVariableForSharedEmbeddingColumnsWithPartitionNum(self): columns_list=[] diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py index 6a3a1e0702b..8f92d091e68 100644 --- a/tensorflow/python/ops/variables.py +++ b/tensorflow/python/ops/variables.py @@ -3100,6 +3100,9 @@ def 
__init__(self, name, shape, dtype, variable_list, partitions): self._name = name self._shape = shape + from tensorflow.python.ops import kv_variable_ops + if isinstance(self._variable_list[0], kv_variable_ops.EmbeddingVariable): + self._shape = shape[1:] self._dtype = dtype self._partitions = partitions self._as_tensor = None From 37221b53ca3a90ea1a3f85cc787463fc3c9884fe Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 15 Nov 2023 11:42:07 +0800 Subject: [PATCH 63/91] [Release] Update DeepRec release version to 1.15.5+deeprec2310. (#949) Signed-off-by: candy.dc --- tensorflow/tools/pip_package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index d5fa79bf2b1..e8635e1a298 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -47,7 +47,7 @@ # result for pip. # Also update tensorflow/tensorflow.bzl and # tensorflow/core/public/version.h -_VERSION = '1.15.5+deeprec2306' +_VERSION = '1.15.5+deeprec2310' REQUIRED_PACKAGES = [ 'absl-py >= 0.9.0', From 3bc98886262c496ffcacac54f02391c9818e75ae Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Thu, 16 Nov 2023 16:53:48 +0800 Subject: [PATCH 64/91] [Docs] Update deeprec2310 release images and notes in README.md & RELEASE.md. (#950) Signed-off-by: candy.dc --- README.md | 4 +- RELEASE.md | 41 +++++++++++++++++++ docs/docs_en/DeepRec-Compile-And-Install.md | 4 +- docs/docs_en/Estimator-Compile-And-Install.md | 2 +- docs/docs_en/TFServing-Compile-And-Install.md | 2 +- docs/docs_zh/DeepRec-Compile-And-Install.md | 4 +- docs/docs_zh/Estimator-Compile-And-Install.md | 2 +- docs/docs_zh/TFServing-Compile-And-Install.md | 2 +- 8 files changed, 51 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 53cca5c5c83..8f491e14665 100644 --- a/README.md +++ b/README.md @@ -95,13 +95,13 @@ $ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux #### Image for CPU ``` -alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 ``` #### Image for GPU CUDA11.6 ``` -alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 ``` *** diff --git a/RELEASE.md b/RELEASE.md index 43e03bc2b49..6b7e4a7fd79 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,44 @@ +# Release r1.15.5-deeprec2310 +## **Major Features and Improvements** + +### **Embedding** + +- Refactor the data structure of EmbeddingVariable. +- Add interface of EmbeddingVar for Elastic Training. +- Add GetSnapshot and Create API for EmbeddingVariable. +- Remove the dependency on private header file in EmbeddingVariable. + +### **Runtime Optimization** + +- Canonicalize SaveV2 Op device spec in distributed training. +- Update log level in direct_session. + +### **Distributed** + +- Add elastic-grpc server. + +### **BugFix** + +- Fix missing return value of RestoreSSD of DramSSDHashStorage. +- Fix incorrect frequency in shared-embedding. +- Fix set initialized flag too early in restore subgraph. +- Fix wgrad bug in Sparse Operation Kit. +- Fix hang bug for async embedding lookup. +- Fix ps address list sort by index. +- Fix SharedEmbeddingColumn with PartitionedEmbedingVariable shape validation error. 
+ +More details of features: [https://deeprec.readthedocs.io/zh/latest/](url) + +## **Release Images** + +### **CPU Image** + +`alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04` + +### **GPU Image** + +`alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04` + # Release r1.15.5-deeprec2306 ## **Major Features and Improvements** diff --git a/docs/docs_en/DeepRec-Compile-And-Install.md b/docs/docs_en/DeepRec-Compile-And-Install.md index 83ba4854b9f..fdf3e295fdd 100644 --- a/docs/docs_en/DeepRec-Compile-And-Install.md +++ b/docs/docs_en/DeepRec-Compile-And-Install.md @@ -111,7 +111,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 ``` arm64: @@ -122,5 +122,5 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU Image with CUDA 11.6** ``` -alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 ``` diff --git a/docs/docs_en/Estimator-Compile-And-Install.md b/docs/docs_en/Estimator-Compile-And-Install.md index 73b6a36f318..55f759a3c2a 100644 --- a/docs/docs_en/Estimator-Compile-And-Install.md +++ b/docs/docs_en/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ DeepRec provide new distributed protocols such as grpc++ and star_server, which Source Code: [https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -Develop Branch:master, Latest Release Branch: deeprec2306 +Develop Branch:master, Latest Release Branch: deeprec2310 ## Estimator Build diff --git a/docs/docs_en/TFServing-Compile-And-Install.md b/docs/docs_en/TFServing-Compile-And-Install.md index 346a848ca74..79a0944aa3e 100644 --- a/docs/docs_en/TFServing-Compile-And-Install.md +++ b/docs/docs_en/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ We provide optimized TFServing which could highly improve performance in inferen Source Code: [https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -Develop Branch: master, Latest Release Branch: deeprec2306 +Develop Branch: master, Latest Release Branch: deeprec2310 ## TFServing Build diff --git a/docs/docs_zh/DeepRec-Compile-And-Install.md b/docs/docs_zh/DeepRec-Compile-And-Install.md index 08d249f8eeb..ad8fd36dbf7 100644 --- a/docs/docs_zh/DeepRec-Compile-And-Install.md +++ b/docs/docs_zh/DeepRec-Compile-And-Install.md @@ -108,7 +108,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 ``` arm64: @@ -119,7 +119,7 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU CUDA11.6镜像** ``` -alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 ``` ## DeepRec Processor编译打包 diff --git a/docs/docs_zh/Estimator-Compile-And-Install.md b/docs/docs_zh/Estimator-Compile-And-Install.md index e5455aae91a..e54c8ddbd2f 100644 --- a/docs/docs_zh/Estimator-Compile-And-Install.md +++ b/docs/docs_zh/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ 代码库:[https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -开发分支:master,最新Release分支:deeprec2306 +开发分支:master,最新Release分支:deeprec2310 ## Estimator编译 diff --git a/docs/docs_zh/TFServing-Compile-And-Install.md 
b/docs/docs_zh/TFServing-Compile-And-Install.md index 0c76400e6c6..a43d2d517a6 100644 --- a/docs/docs_zh/TFServing-Compile-And-Install.md +++ b/docs/docs_zh/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ 代码库:[https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -开发分支:master,最新Release分支:deeprec2306 +开发分支:master,最新Release分支:deeprec2310 ## TFServing编译&打包 From d8149699bd8366ef7bb32ea049c4202b0c8d0c68 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:41:02 -0800 Subject: [PATCH 65/91] [ModelZoo] Set Saver's parameter sharded=True in distributed training. (#954) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- modelzoo/bst/train.py | 3 ++- modelzoo/dbmtl/train.py | 3 ++- modelzoo/dcn/train.py | 3 ++- modelzoo/dcnv2/train.py | 3 ++- modelzoo/deepfm/train.py | 3 ++- modelzoo/dien/train.py | 6 +++--- modelzoo/din/train.py | 6 +++--- modelzoo/dlrm/train.py | 3 ++- modelzoo/dssm/train.py | 3 ++- modelzoo/esmm/train.py | 5 +++-- modelzoo/masknet/train.py | 3 ++- modelzoo/mlperf/train.py | 3 ++- modelzoo/mmoe/train.py | 3 ++- modelzoo/ple/train.py | 3 ++- modelzoo/simple_multitask/train.py | 5 +++-- modelzoo/wide_and_deep/train.py | 3 ++- 16 files changed, 36 insertions(+), 22 deletions(-) diff --git a/modelzoo/bst/train.py b/modelzoo/bst/train.py index 2fb5e4e90f5..eeeb136678b 100644 --- a/modelzoo/bst/train.py +++ b/modelzoo/bst/train.py @@ -612,9 +612,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dbmtl/train.py b/modelzoo/dbmtl/train.py index 24595073b95..c848cbc76b2 100644 --- a/modelzoo/dbmtl/train.py +++ b/modelzoo/dbmtl/train.py @@ -527,9 +527,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dcn/train.py b/modelzoo/dcn/train.py index b8e1dba5d63..44701e22d9f 100644 --- a/modelzoo/dcn/train.py +++ b/modelzoo/dcn/train.py @@ -594,9 +594,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dcnv2/train.py b/modelzoo/dcnv2/train.py index 7ac4c1a0358..5b572af0425 100644 --- a/modelzoo/dcnv2/train.py +++ b/modelzoo/dcnv2/train.py @@ -610,9 +610,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( 
local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/deepfm/train.py b/modelzoo/deepfm/train.py index 896295b0ae6..166bedec0d0 100644 --- a/modelzoo/deepfm/train.py +++ b/modelzoo/deepfm/train.py @@ -472,9 +472,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dien/train.py b/modelzoo/dien/train.py index 6c583c3ac19..190695f6ce0 100644 --- a/modelzoo/dien/train.py +++ b/modelzoo/dien/train.py @@ -776,10 +776,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.tables_initializer(), - tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/din/train.py b/modelzoo/din/train.py index 6273e0d15a4..058583ce6fd 100644 --- a/modelzoo/din/train.py +++ b/modelzoo/din/train.py @@ -594,10 +594,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.tables_initializer(), - tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dlrm/train.py b/modelzoo/dlrm/train.py index 0789e9418b8..cc4c045c349 100644 --- a/modelzoo/dlrm/train.py +++ b/modelzoo/dlrm/train.py @@ -507,9 +507,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dssm/train.py b/modelzoo/dssm/train.py index a757851711c..db949aac5e8 100644 --- a/modelzoo/dssm/train.py +++ b/modelzoo/dssm/train.py @@ -478,9 +478,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = 
tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/esmm/train.py b/modelzoo/esmm/train.py index 58219e19e3e..073b08814d4 100755 --- a/modelzoo/esmm/train.py +++ b/modelzoo/esmm/train.py @@ -534,9 +534,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.local_variables_initializer(), train_init_op), - saver=tf.train.Saver(max_to_keep=keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=train_steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/masknet/train.py b/modelzoo/masknet/train.py index 0790f200b21..bb96a467701 100644 --- a/modelzoo/masknet/train.py +++ b/modelzoo/masknet/train.py @@ -529,9 +529,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/mlperf/train.py b/modelzoo/mlperf/train.py index db7e077250b..ce34fe5e55c 100644 --- a/modelzoo/mlperf/train.py +++ b/modelzoo/mlperf/train.py @@ -522,9 +522,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/mmoe/train.py b/modelzoo/mmoe/train.py index 251e02c7a72..694eb45da80 100644 --- a/modelzoo/mmoe/train.py +++ b/modelzoo/mmoe/train.py @@ -523,9 +523,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/ple/train.py b/modelzoo/ple/train.py index 2ba98363bbf..b2d2f2057ec 100644 --- a/modelzoo/ple/train.py +++ b/modelzoo/ple/train.py @@ -592,9 +592,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/simple_multitask/train.py b/modelzoo/simple_multitask/train.py index ff90946c96d..4ef1874a521 100644 --- a/modelzoo/simple_multitask/train.py +++ b/modelzoo/simple_multitask/train.py @@ -427,9 +427,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = 
tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.local_variables_initializer(), train_init_op), - saver=tf.train.Saver(max_to_keep=keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=train_steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/wide_and_deep/train.py b/modelzoo/wide_and_deep/train.py index b4f4dbc7a65..3024f58024e 100644 --- a/modelzoo/wide_and_deep/train.py +++ b/modelzoo/wide_and_deep/train.py @@ -543,9 +543,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( From 7ce84779b69d746111db5934bc90b94fc3ada6fa Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 5 Dec 2023 00:51:05 -0800 Subject: [PATCH 66/91] [Embedding] Refine KVInterface::GetShardedSnapshot API. (#953) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- .../core/framework/embedding/cpu_hash_map_kv.h | 14 ++++++++------ .../core/framework/embedding/dense_hash_map_kv.h | 10 ++++++---- .../core/framework/embedding/embedding_var.h | 9 +++++---- .../core/framework/embedding/gpu_hash_map_kv.h | 3 ++- tensorflow/core/framework/embedding/kv_interface.h | 3 ++- tensorflow/core/framework/embedding/leveldb_kv.h | 10 ++++++---- .../core/framework/embedding/multi_tier_storage.h | 3 ++- .../core/framework/embedding/single_tier_storage.h | 3 ++- tensorflow/core/framework/embedding/ssd_hash_kv.h | 3 ++- tensorflow/core/framework/embedding/storage.h | 3 ++- 10 files changed, 37 insertions(+), 24 deletions(-) diff --git a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h index 750ba282285..f9a6e1fff25 100644 --- a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h @@ -138,7 +138,8 @@ class LocklessHashMap : public KVInterface { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { std::pair *hash_map_dump; int64 bucket_count; @@ -147,11 +148,12 @@ class LocklessHashMap : public KVInterface { bucket_count = it.second; for (int64 j = 0; j < bucket_count; j++) { if (hash_map_dump[j].first != LocklessHashMap::EMPTY_KEY_ - && hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_ - && hash_map_dump[j].first % kSavedPartitionNum - % partition_nums != partition_id) { - key_list->emplace_back(hash_map_dump[j].first); - value_ptr_list->emplace_back(hash_map_dump[j].second); + && hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_) { + int part_id = hash_map_dump[j].first % kSavedPartitionNum % partition_nums; + if (part_id != partition_id) { + key_list[part_id].emplace_back(hash_map_dump[j].first); + value_ptr_list[part_id].emplace_back(hash_map_dump[j].second); + } } } diff --git a/tensorflow/core/framework/embedding/dense_hash_map_kv.h 
b/tensorflow/core/framework/embedding/dense_hash_map_kv.h
index 8a27404b66f..12749a92e6e 100644
--- a/tensorflow/core/framework/embedding/dense_hash_map_kv.h
+++ b/tensorflow/core/framework/embedding/dense_hash_map_kv.h
@@ -122,7 +122,8 @@ class DenseHashMap : public KVInterface<K, V> {
   }
 
   Status GetShardedSnapshot(
-      std::vector<K>* key_list, std::vector<void*>* value_ptr_list,
+      std::vector<std::vector<K>>& key_list,
+      std::vector<std::vector<void*>>& value_ptr_list,
       int partition_id, int partition_nums) override {
     dense_hash_map hash_map_dump[partition_num_];
     for (int i = 0; i< partition_num_; i++) {
@@ -131,9 +132,10 @@ class DenseHashMap : public KVInterface<K, V> {
     }
     for (int i = 0; i< partition_num_; i++) {
       for (const auto it : hash_map_dump[i].hash_map) {
-        if (it.first % kSavedPartitionNum % partition_nums != partition_id) {
-          key_list->push_back(it.first);
-          value_ptr_list->push_back(it.second);
+        int part_id = it.first % kSavedPartitionNum % partition_nums;
+        if (part_id != partition_id) {
+          key_list[part_id].emplace_back(it.first);
+          value_ptr_list[part_id].emplace_back(it.second);
         }
       }
     }
diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h
index a66ec19fb97..df6ae6f1277 100644
--- a/tensorflow/core/framework/embedding/embedding_var.h
+++ b/tensorflow/core/framework/embedding/embedding_var.h
@@ -520,8 +520,8 @@ class EmbeddingVar : public ResourceBase {
     }
   }
 
-  Status GetShardedSnapshot(std::vector<K>* key_list,
-                            std::vector<void*>* value_ptr_list,
+  Status GetShardedSnapshot(std::vector<std::vector<K>>& key_list,
+                            std::vector<std::vector<void*>>& value_ptr_list,
                             int partition_id, int partition_num) {
     return storage_->GetShardedSnapshot(key_list, value_ptr_list,
                                         partition_id, partition_num);
@@ -546,7 +546,7 @@ class EmbeddingVar : public ResourceBase {
       bool is_admit = feat_desc_->IsAdmit(value_ptr);
       bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0);
-      if (!is_admit) {
+      if (is_admit) {
         key_list[i] = tot_keys_list[i];
 
         if (!is_in_dram) {
@@ -571,7 +571,7 @@ class EmbeddingVar : public ResourceBase {
         }
       } else {
         if (!save_unfiltered_features)
-          return;
+          continue;
         //TODO(JUNQI) : currently not export filtered keys
       }
@@ -584,6 +584,7 @@ class EmbeddingVar : public ResourceBase {
         feat_desc_->Deallocate(value_ptr);
       }
     }
+    return;
   }
 
   Status RestoreFromKeysAndValues(int64 key_num, int partition_id,
diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h
index e73839e3f76..68fecf690ba 100644
--- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h
+++ b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h
@@ -253,7 +253,8 @@ class GPUHashMapKV : public KVInterface<K, V> {
   }
 
   Status GetShardedSnapshot(
-      std::vector<K>* key_list, std::vector<void*>* value_ptr_list,
+      std::vector<std::vector<K>>& key_list,
+      std::vector<std::vector<void*>>& value_ptr_list,
       int partition_id, int partition_nums) override {
     LOG(INFO) << "GPUHashMapKV do not support GetShardedSnapshot";
     return Status::OK();
diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h
index dc603680138..8480132a7d9 100644
--- a/tensorflow/core/framework/embedding/kv_interface.h
+++ b/tensorflow/core/framework/embedding/kv_interface.h
@@ -91,7 +91,8 @@ class KVInterface {
       std::vector<void*>* value_ptr_list) = 0;
 
   virtual Status GetShardedSnapshot(
-      std::vector<K>* key_list, std::vector<void*>* value_ptr_list,
+      std::vector<std::vector<K>>& key_list,
+      std::vector<std::vector<void*>>& value_ptr_list,
       int partition_id, int partition_nums) = 0;
 
   virtual std::string DebugString() const = 0;
diff --git a/tensorflow/core/framework/embedding/leveldb_kv.h b/tensorflow/core/framework/embedding/leveldb_kv.h
index 47c8a39dfbd..030a0969e5d 100644
--- a/tensorflow/core/framework/embedding/leveldb_kv.h
+++ b/tensorflow/core/framework/embedding/leveldb_kv.h
@@ -194,7 +194,8 @@ class LevelDBKV : public KVInterface<K, V> {
   }
 
   Status GetShardedSnapshot(
-      std::vector<K>* key_list, std::vector<void*>* value_ptr_list,
+      std::vector<std::vector<K>>& key_list,
+      std::vector<std::vector<void*>>& value_ptr_list,
       int partition_id, int partition_nums) override {
     ReadOptions options;
     options.snapshot = db_->GetSnapshot();
@@ -203,8 +204,9 @@ class LevelDBKV : public KVInterface<K, V> {
     for (it->SeekToFirst(); it->Valid(); it->Next()) {
       K key;
       memcpy((char*)&key, it->key().ToString().data(), sizeof(K));
-      if (key % kSavedPartitionNum % partition_nums == partition_id) continue;
-      key_list->emplace_back(key);
+      int part_id = key % kSavedPartitionNum % partition_nums;
+      if (part_id == partition_id) continue;
+      key_list[part_id].emplace_back(key);
       FeatureDescriptor<V> hbm_feat_desc(
           1, 1, ev_allocator()/*useless*/,
           StorageType::HBM_DRAM, true, true,
@@ -218,7 +220,7 @@ class LevelDBKV : public KVInterface<K, V> {
           value_ptr, feat_desc_->GetFreq(dram_value_ptr));
       hbm_feat_desc.UpdateVersion(
           value_ptr, feat_desc_->GetVersion(dram_value_ptr));
-      value_ptr_list->emplace_back(value_ptr);
+      value_ptr_list[part_id].emplace_back(value_ptr);
     }
     delete it;
     feat_desc_->Deallocate(dram_value_ptr);
diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h
index f77fec8c85a..e27521f1a65 100644
--- a/tensorflow/core/framework/embedding/multi_tier_storage.h
+++ b/tensorflow/core/framework/embedding/multi_tier_storage.h
@@ -91,7 +91,8 @@ class MultiTierStorage : public Storage<K, V> {
   }
 
   Status GetShardedSnapshot(
-      std::vector<K>* key_list, std::vector<void*>* value_ptr_list,
+      std::vector<std::vector<K>>& key_list,
+      std::vector<std::vector<void*>>& value_ptr_list,
       int partition_id, int partition_nums) override {
     LOG(FATAL)<<"Can't get sharded snapshot of MultiTierStorage.";
     return Status::OK();
diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h
index db96c807c5e..1c6bdd90790 100644
--- a/tensorflow/core/framework/embedding/single_tier_storage.h
+++ b/tensorflow/core/framework/embedding/single_tier_storage.h
@@ -224,7 +224,8 @@ class SingleTierStorage : public Storage<K, V> {
   }
 
   Status GetShardedSnapshot(
-      std::vector<K>* key_list, std::vector<void*>* value_ptr_list,
+      std::vector<std::vector<K>>& key_list,
+      std::vector<std::vector<void*>>& value_ptr_list,
       int partition_id, int partition_nums) override {
     mutex_lock l(Storage<K, V>::mu_);
     return kv_->GetShardedSnapshot(key_list, value_ptr_list,
diff --git a/tensorflow/core/framework/embedding/ssd_hash_kv.h b/tensorflow/core/framework/embedding/ssd_hash_kv.h
index a56c9f73385..bdc38cc5d5e 100644
--- a/tensorflow/core/framework/embedding/ssd_hash_kv.h
+++ b/tensorflow/core/framework/embedding/ssd_hash_kv.h
@@ -350,7 +350,8 @@ class SSDHashKV : public KVInterface<K, V> {
   }
 
   Status GetShardedSnapshot(
-      std::vector<K>* key_list, std::vector<void*>* value_ptr_list,
+      std::vector<std::vector<K>>& key_list,
+      std::vector<std::vector<void*>>& value_ptr_list,
       int partition_id, int partition_nums) override {
     return Status::OK();
   }
diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h
index a652de5fa5f..559588af7e1 100644
--- a/tensorflow/core/framework/embedding/storage.h
+++ b/tensorflow/core/framework/embedding/storage.h
@@ -96,7 +96,8 @@ class Storage {
   virtual Status GetSnapshot(std::vector<K>* key_list,
std::vector* value_ptr_list) = 0; virtual Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) = 0; virtual Status Save( const string& tensor_name, From a5c014f144f00b5d5606ffa1e47bda0c8e0a2478 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Sun, 10 Dec 2023 22:07:29 +0800 Subject: [PATCH 67/91] [IO] Fix tensor shape meta-data bug for DataFrame Value. (#958) * Revert "[IO] Add tensor shape meta-data support for ParquetDataset. (#849)" * [IO] Fix tensor shape meta-data bug for DataFrame Value. Signed-off-by: chenbangduo.cbd --- .../python/data/experimental/ops/dataframe.py | 26 ++++++++--------- .../experimental/ops/parquet_dataset_ops.py | 28 +++++++++++-------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/dataframe.py b/tensorflow/python/data/experimental/ops/dataframe.py index f3dc249653a..003f75259f1 100644 --- a/tensorflow/python/data/experimental/ops/dataframe.py +++ b/tensorflow/python/data/experimental/ops/dataframe.py @@ -59,17 +59,14 @@ def __init__(self, name, dtype=None, ragged_rank=None, shape=None): self._ragged_rank = ragged_rank if shape: shape = tensor_shape.TensorShape(shape) - shape_rank = 0 - for _ in shape: - shape_rank += 1 - if ragged_rank is not None and ragged_rank != shape_rank: + for d in shape: + if d.value is None: + raise ValueError( + f'Field {name} has incomplete shape: {shape}') + if ragged_rank is not None and ragged_rank > 1: raise ValueError( f'Field {name} is a nested list ({ragged_rank}) ' f'with shape {shape}') - self._ragged_rank = shape_rank - elif ragged_rank is not None: - shape = tensor_shape.TensorShape([None for _ in xrange(ragged_rank)]) - self._shape = shape @property @@ -134,16 +131,17 @@ def output_classes(self): def output_types(self): return self.map(lambda i: self._dtype if i == 0 else dtypes.int32) - def output_shapes(self, batch_size=None): + @property + def output_shapes(self): if self._shape is None: - return self.map(lambda i: tensor_shape.vector(batch_size) if i == 0 - else tensor_shape.vector(None)) + return self.map(lambda _: tensor_shape.vector(None)) return self.map( - lambda i: tensor_shape.vector(batch_size).concatenate(self._shape) if i == 0 + lambda i: tensor_shape.vector(None).concatenate(self._shape) if i == 0 else tensor_shape.vector(None)) - def output_specs(self, batch_size=None): - shape = tensor_shape.vector(batch_size) + @property + def output_specs(self): + shape = tensor_shape.vector(None) if self._shape is not None: shape = shape.concatenate(self._shape) specs = [tensor_spec.TensorSpec(shape, dtype=self._dtype)] diff --git a/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py b/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py index 719940d1beb..5bb790c331d 100644 --- a/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py +++ b/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py @@ -22,6 +22,7 @@ from tensorflow.python.data.ops import readers from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import type_spec from tensorflow.python.util import nest @@ -38,25 +39,23 @@ class DataFrameValueSpec(type_spec.BatchableTypeSpec): def value_type(self): return DataFrame.Value if self._ragged_rank > 0 else ops.Tensor - def 
__init__(self, field, batch_size=None): + def __init__(self, field): """Constructs a type specification for a `tf.RaggedTensor`. Args: field: The field definition. - batch_size: The batch_size of DataFrame. """ if field.incomplete: raise ValueError( f'Field {field} is incomplete, please specify dtype and ragged_rank') self._field = field - self._batch_size = batch_size def _serialize(self): return (self._field.dtype, self._field.ragged_rank) @property def _component_specs(self): - return self._field.output_specs(self._batch_size) + return self._field.output_specs def _to_components(self, value): if isinstance(value, DataFrame.Value): @@ -80,7 +79,7 @@ def _to_legacy_output_types(self): return self._field.output_types def _to_legacy_output_shapes(self): - return self._field.output_shapes(self._batch_size) + return self._field.output_shapes def _to_legacy_output_classes(self): return self._field.output_classes @@ -110,13 +109,18 @@ def __init__( self._batch_size = ops.convert_to_tensor( batch_size, dtype=dtypes.int64, name='batch_size') self._fields = fields - self._output_specs = { - f.name: ( - DataFrameValueSpec(f, batch_size if drop_remainder else None) - if f.ragged_rank > 0 - else tensor_spec.TensorSpec( - shape=[batch_size if drop_remainder else None], dtype=f.dtype)) - for f in self._fields} + self._output_specs = {} + for f in self._fields: + item = None + if f.ragged_rank > 0: + item = DataFrameValueSpec(f) + else: + shape = tensor_shape.vector(batch_size if drop_remainder else None) + if f.shape: + shape = shape.concatenate(f.shape) + item = tensor_spec.TensorSpec(shape=shape, dtype=f.dtype) + self._output_specs[f.name] = item + self._field_names = nest.flatten({f.name: f.name for f in self._fields}) self._field_dtypes = nest.flatten({f.name: f.dtype for f in self._fields}) self._field_ragged_ranks = nest.flatten( From 717f7c5e0840566c39739c321de024a88ddcc84f Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Wed, 13 Dec 2023 16:16:52 +0800 Subject: [PATCH 68/91] [Op] Implement of SliceSend/SliceRecv Op. 
(#947) Signed-off-by: chenbangduo.cbd --- tensorflow/core/BUILD | 2 + tensorflow/core/framework/rendezvous.h | 2 + tensorflow/core/graph/graph.cc | 2 + tensorflow/core/graph/graph.h | 12 +- tensorflow/core/grappler/op_types.cc | 8 +- tensorflow/core/grappler/op_types.h | 2 + tensorflow/core/kernels/BUILD | 27 +- tensorflow/core/kernels/slice_sendrecv_ops.cc | 562 ++++++++++++++++++ tensorflow/core/kernels/slice_sendrecv_ops.h | 89 +++ .../core/kernels/slice_sendrecv_ops_test.cc | 339 +++++++++++ tensorflow/core/ops/slice_sendrecv_ops.cc | 78 +++ 11 files changed, 1118 insertions(+), 5 deletions(-) create mode 100644 tensorflow/core/kernels/slice_sendrecv_ops.cc create mode 100644 tensorflow/core/kernels/slice_sendrecv_ops.h create mode 100644 tensorflow/core/kernels/slice_sendrecv_ops_test.cc create mode 100644 tensorflow/core/ops/slice_sendrecv_ops.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index ef1ebcb6dcf..ce6850eb9da 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1237,6 +1237,7 @@ tf_gen_op_libs( "set_ops", "script_ops", "sendrecv_ops", + "slice_sendrecv_ops", "sparse_ops", "spectral_ops", "state_ops", @@ -1497,6 +1498,7 @@ cc_library( ":sdca_ops_op_lib", ":sendrecv_ops_op_lib", ":set_ops_op_lib", + ":slice_sendrecv_ops_op_lib", ":sparse_ops_op_lib", ":star_run_graph_op_op_lib", ":summary_ops_op_lib", diff --git a/tensorflow/core/framework/rendezvous.h b/tensorflow/core/framework/rendezvous.h index 255c0326e02..3c2b20379c8 100644 --- a/tensorflow/core/framework/rendezvous.h +++ b/tensorflow/core/framework/rendezvous.h @@ -80,6 +80,8 @@ class Rendezvous : public core::RefCounted { friend class SendOp; friend class RecvOp; friend class FuseRecvOp; + friend class SliceSendOp; + friend class SliceRecvOp; friend class RefSendOp; friend class RefRecvOp; string buf_; diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index 8ba5d345837..d9709d39f3f 100644 --- a/tensorflow/core/graph/graph.cc +++ b/tensorflow/core/graph/graph.cc @@ -69,11 +69,13 @@ const std::unordered_map& Node::kNodeClassTable = {"_Send", NC_SEND}, {"_HostSend", NC_HOST_SEND}, {"_RefSend", NC_REF_SEND}, + {"_SliceSend", NC_SLICE_SEND}, {"_Recv", NC_RECV}, {"_HostRecv", NC_HOST_RECV}, {"_RefRecv", NC_REF_RECV}, {"_FuseRecv", NC_FUSE_RECV}, {"_HostFuseRecv", NC_HOST_FUSE_RECV}, + {"_SliceRecv", NC_SLICE_RECV}, {"Const", NC_CONSTANT}, {"HostConst", NC_CONSTANT}, {"Variable", NC_VARIABLE}, diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 0e7e032c9a5..0baf8f257a9 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -219,12 +219,16 @@ class Node { bool IsControlTrigger() const { return class_ == NC_CONTROL_TRIGGER; } bool IsSend() const { return class_ == NC_SEND || class_ == NC_HOST_SEND || - class_ == NC_REF_SEND; } + class_ == NC_REF_SEND || + class_ == NC_SLICE_SEND; } + bool IsSliceSend() const { return class_ == NC_SLICE_SEND; } bool IsRecv() const { return class_ == NC_RECV || class_ == NC_HOST_RECV || - class_ == NC_REF_RECV; } + class_ == NC_REF_RECV || + class_ == NC_SLICE_RECV; } bool IsFuseRecv() const { return class_ == NC_FUSE_RECV || class_ == NC_HOST_FUSE_RECV; } + bool IsSliceRecv() const {return class_ == NC_SLICE_RECV; } bool IsConstant() const { return class_ == NC_CONSTANT; } bool IsStage() const { return class_ == NC_TENSOR_BUFFER_PUT; } bool IsUnstage() const { return class_ == NC_TENSOR_BUFFER_TAKE; } @@ -334,11 +338,13 @@ class Node { NC_SEND, NC_HOST_SEND, NC_REF_SEND, + 
NC_SLICE_SEND, NC_RECV, NC_HOST_RECV, NC_REF_RECV, NC_FUSE_RECV, NC_HOST_FUSE_RECV, + NC_SLICE_RECV, NC_CONSTANT, NC_VARIABLE, NC_KV_VAR_HANDLE, @@ -844,7 +850,9 @@ inline bool IsNextIteration(const Node* n) { return n->IsNextIteration(); } inline bool IsLoopCond(const Node* node) { return node->IsLoopCond(); } inline bool IsControlTrigger(const Node* n) { return n->IsControlTrigger(); } inline bool IsSend(const Node* node) { return node->IsSend(); } +inline bool IsSliceSend(const Node* node) { return node->IsSliceSend(); } inline bool IsRecv(const Node* node) { return node->IsRecv(); } +inline bool IsSliceRecv(const Node* node) { return node->IsSliceRecv(); } inline bool IsFuseRecv(const Node* node) { return node->IsFuseRecv(); } inline bool IsHostSend(const Node* node) { return node->IsHostSend(); } inline bool IsHostRecv(const Node* node) { return node->IsHostRecv(); } diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc index a3a521fa123..1201623ffcd 100644 --- a/tensorflow/core/grappler/op_types.cc +++ b/tensorflow/core/grappler/op_types.cc @@ -454,7 +454,7 @@ bool IsReciprocalGrad(const NodeDef& node) { } bool IsRecv(const NodeDef& node) { - return node.op() == "_Recv" || node.op() == "_HostRecv"; + return node.op() == "_Recv" || node.op() == "_HostRecv" || IsSliceRecv(node); } bool IsFuseRecv(const NodeDef& node) { @@ -502,7 +502,7 @@ bool IsSelect(const NodeDef& node) { return node.op() == "Select"; } bool IsSeluGrad(const NodeDef& node) { return node.op() == "SeluGrad"; } bool IsSend(const NodeDef& node) { - return node.op() == "_Send" || node.op() == "_HostSend"; + return node.op() == "_Send" || node.op() == "_HostSend" || IsSliceSend(node); } bool IsShape(const NodeDef& node) { return node.op() == "Shape"; } @@ -517,6 +517,10 @@ bool IsSize(const NodeDef& node) { return node.op() == "Size"; } bool IsSlice(const NodeDef& node) { return node.op() == "Slice"; } +bool IsSliceRecv(const NodeDef& node) { return node.op() == "_SliceRecv"; } + +bool IsSliceSend(const NodeDef& node) { return node.op() == "_SliceSend"; } + bool IsSnapshot(const NodeDef& node) { return node.op() == "Snapshot"; } bool IsSoftmax(const NodeDef& node) { return node.op() == "Softmax"; } diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h index 19699ccb933..737581fd412 100644 --- a/tensorflow/core/grappler/op_types.h +++ b/tensorflow/core/grappler/op_types.h @@ -167,6 +167,8 @@ bool IsShuffle(const NodeDef& node); bool IsSigmoidGrad(const NodeDef& node); bool IsSize(const NodeDef& node); bool IsSlice(const NodeDef& node); +bool IsSliceRecv(const NodeDef& node); +bool IsSliceSend(const NodeDef& node); bool IsSnapshot(const NodeDef& node); bool IsSoftmax(const NodeDef& node); bool IsSoftplusGrad(const NodeDef& node); diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 0c08c30c30a..36721527cc2 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -5423,8 +5423,9 @@ cc_library( name = "required", deps = [ ":no_op", - ":sendrecv_ops", ":fuserecv_ops", + ":sendrecv_ops", + ":slice_sendrecv_ops", ], ) @@ -5445,6 +5446,12 @@ tf_kernel_library( deps = REQUIRED_DEPS, ) +tf_kernel_library( + name = "slice_sendrecv_ops", + prefix = "slice_sendrecv_ops", + deps = REQUIRED_DEPS, +) + tf_kernel_library( name = "group_embedding_ops", hdrs = ["group_embedding/group_embedding_lookup_sparse_forward_base_ops.h"], @@ -5509,6 +5516,24 @@ tf_cc_test( ], ) +tf_cc_test( + name = "slice_sendrecv_ops_test", + 
srcs = ["slice_sendrecv_ops_test.cc"], + linkstatic = tf_kernel_tests_linkstatic(), # Required for benchmarking + deps = [ + ":control_flow_ops", + ":cwise_op", + ":logging_ops", + ":ops_testutil", + ":ops_util", + ":slice_sendrecv_ops", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_kernel_library( name = "fuserecv_ops", prefix = "fuserecv_ops", diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.cc b/tensorflow/core/kernels/slice_sendrecv_ops.cc new file mode 100644 index 00000000000..f09f314ae10 --- /dev/null +++ b/tensorflow/core/kernels/slice_sendrecv_ops.cc @@ -0,0 +1,562 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/kernels/slice_sendrecv_ops.h" + +namespace tensorflow { + +//------------------------------------------------------------------------------ +// Utils. +static string GetSliceRendezvousKeyPrefix(const string& send_device, + const string& recv_device, + const uint64 send_device_incarnation, + const string& tensor_name) { + return strings::StrCat(send_device, ";", + strings::FpToString(send_device_incarnation), ";", + recv_device, ";", tensor_name); +} + +static void GetSliceRendezvousKey(const string& key_prefix, + const string& tensor_name_suffix, + const FrameAndIter& frame_iter, string* key) { + key->clear(); + strings::StrAppend(key, key_prefix, tensor_name_suffix, ";", + frame_iter.frame_id, ":", frame_iter.iter_id); +} + +static FrameAndIter GetFrameAndIter(OpKernelContext* ctx, + bool hostmem_sendrecv) { + if (hostmem_sendrecv && ctx->call_frame() != nullptr) { + // Host memory send/recv pairs are added by + // common_runtime/memory_types.cc. When the pair of nodes are + // added inside a function, we need to use the function call frame + // to formulate the unique rendezvous key. + return FrameAndIter(reinterpret_cast(ctx->call_frame()), 0); + } else { + return ctx->frame_iter(); + } +} + +//------------------------------------------------------------------------------ +// Functions of SliceSendOp. 
+ +SliceSendOp::SliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + string send_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device)); + string recv_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device)); + uint64 send_device_incarnation; + OP_REQUIRES_OK( + ctx, ctx->GetAttr("send_device_incarnation", + reinterpret_cast(&send_device_incarnation))); + string tensor_name; + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + key_prefix_ = \ + GetSliceRendezvousKeyPrefix(send_device, recv_device, + send_device_incarnation, tensor_name); + if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { + hostmem_sendrecv_ = false; + } + OP_REQUIRES_OK(ctx, ctx->GetAttr("slice_size", &slice_size_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); +} + +void SliceSendOp::Compute(OpKernelContext* ctx) { + OP_REQUIRES( + ctx, ctx->rendezvous() != nullptr, + errors::Internal("Op kernel context needs to provide a rendezvous.")); + + const Tensor& input_t = ctx->input(0); + FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); + + // send total_bytes. + OP_REQUIRES_OK(ctx, SendTotalBytes(ctx, frame_iter, input_t)); + // if input is dead, only send total_bytes dead tensor. + if (ctx->is_input_dead()) { + return; + } + + // if total bytes is smaller than slice size, send directly. + if (input_t.TotalBytes() <= slice_size_) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->input_alloc_attr(0); + + Rendezvous::ParsedKey parsed_key; + GetSliceRendezvousKey(key_prefix_, "_transfer_data", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + OP_REQUIRES_OK(ctx, ctx->rendezvous()->Send(parsed_key, args, input_t, + ctx->is_input_dead())); + return; + } + + // send shape. + OP_REQUIRES_OK(ctx, SendShape(ctx, frame_iter, input_t)); + + // send data. 
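+  // Variable-length strings need per-element sizes before their payloads,
+  // so DT_STRING takes a dedicated path; every other dtype is treated as a
+  // flat byte buffer and cut into slices of at most slice_size_ bytes.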
+ if (dtype_ == DT_STRING) { + OP_REQUIRES_OK(ctx, SendString(ctx, frame_iter, input_t)); + } else { + OP_REQUIRES_OK(ctx, SendBasicType(ctx, frame_iter, input_t)); + } +} + +Status SliceSendOp::SendTotalBytes(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& input_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + + Rendezvous::ParsedKey parsed_key; + Tensor total_bytes_t; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, TensorShape({}), + &total_bytes_t)); + total_bytes_t.scalar()() = input_t.TotalBytes(); + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_totalbytes", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + return ctx->rendezvous()->Send(parsed_key, args, total_bytes_t, + ctx->is_input_dead()); +} + +Status SliceSendOp::SendShape(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& input_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + Rendezvous::ParsedKey parsed_key; + + Tensor shape_t; + TensorShape shape = input_t.shape(); + const int rank = shape.dims(); + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, TensorShape({rank}), + &shape_t)); + auto shape_vec = shape_t.vec(); + for (int i = 0; i < rank; i++) { + shape_vec(i) = shape.dim_size(i); + } + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + return ctx->rendezvous()->Send(parsed_key, args, shape_t, + ctx->is_input_dead()); +} + +Status SliceSendOp::SendString(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& input_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + Rendezvous::ParsedKey parsed_key; + + // send elements size. + Tensor elements_size_t; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, input_t.shape(), + &elements_size_t)); + int64 num_elements = input_t.NumElements(); + auto input_flat = input_t.flat(); + auto elements_size_flat = elements_size_t.flat(); + for (int64 i = 0; i < num_elements; i++) { + elements_size_flat(i) = input_flat(i).size(); + } + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_elements_size", + frame_iter, &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, elements_size_t, + ctx->is_input_dead())); + + // send data. 
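+  // Each element travels under its own key suffix "_slice_transfer_data_<i>".
+  // Elements larger than slice_size_ are split further by SendStringSlice,
+  // which appends the slice index to the suffix.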
+ args.alloc_attrs = ctx->input_alloc_attr(0); + Tensor data_t; + for (int64 i = 0; i < num_elements; i++) { + const std::string& elem = input_flat(i); + if (elem.size() <= slice_size_) { + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_STRING, TensorShape({}), + &data_t)); + data_t.scalar()() = elem; + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, + ctx->is_input_dead())); + } else { + TF_RETURN_IF_ERROR(SendStringSlice(ctx, frame_iter, elem, i)); + } + } + + return Status::OK(); +} + +Status SliceSendOp::SendStringSlice(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const std::string& elem, int64 index) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->input_alloc_attr(0); + Rendezvous::ParsedKey parsed_key; + + int64 slice_num = (elem.size() + slice_size_ - 1) / slice_size_; + Tensor data_t; + for (int64 i = 0; i < slice_num; i++) { + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_STRING, TensorShape({}), &data_t)); + size_t start = i * slice_size_; + size_t copy_size = slice_size_; + if (start > elem.size() - slice_size_) { + copy_size = elem.size() - start; + } + data_t.scalar()() = elem.substr(start, copy_size); + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(index), "_", + std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, + ctx->is_input_dead())); + } + + return Status::OK(); +} + +Status SliceSendOp::SendBasicType(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& input_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->input_alloc_attr(0); + Rendezvous::ParsedKey parsed_key; + + // send data. + Tensor data_t; + int64 bytes_num = input_t.TotalBytes(); + int64 slice_num = (bytes_num + slice_size_ - 1) / slice_size_; + unsigned char* input_base = reinterpret_cast(input_t.data()); + for (int64 i = 0; i < slice_num; i++) { + int64 start = i * slice_size_; + int64 copy_size = slice_size_; + if (start > bytes_num - slice_size_) { + copy_size = bytes_num - start; + } + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT8, TensorShape({copy_size}), + &data_t)); + auto data_base = data_t.data(); + std::memcpy(data_base, input_base+start, copy_size); + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, + ctx->is_input_dead())); + } + + return Status::OK(); +} + +REGISTER_KERNEL_BUILDER(Name("_SliceSend").Device(DEVICE_CPU), SliceSendOp); +REGISTER_KERNEL_BUILDER(Name("_SliceSend").Device(DEVICE_DEFAULT), SliceSendOp); + +//------------------------------------------------------------------------------ +// Functions of SliceRecvOp. 
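+//
+// SliceRecvOp mirrors the sender: it first receives the total byte count
+// (returning immediately for a dead tensor), takes the single-message fast
+// path when the payload fits in slice_size_, and otherwise receives the
+// shape, allocates the output, and reassembles the data from the same key
+// sequence that SliceSendOp produced.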
+
+SliceRecvOp::SliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  string send_device;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device));
+  string recv_device;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device));
+  uint64 send_device_incarnation;
+  OP_REQUIRES_OK(
+      ctx, ctx->GetAttr("send_device_incarnation",
+                        reinterpret_cast<int64*>(&send_device_incarnation)));
+  string tensor_name;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
+  key_prefix_ = \
+    GetSliceRendezvousKeyPrefix(send_device, recv_device,
+                                send_device_incarnation, tensor_name);
+  if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) {
+    hostmem_sendrecv_ = false;
+  }
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("slice_size", &slice_size_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_type", &dtype_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("timeout_ms", &timeout_ms_));
+}
+
+void SliceRecvOp::Compute(OpKernelContext* ctx) {
+  OP_REQUIRES(
+      ctx, ctx->rendezvous() != nullptr,
+      errors::Internal("Op kernel context needs to provide a rendezvous."));
+
+  FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_);
+  bool is_dead;
+
+  // recv total_bytes.
+  int64 total_bytes;
+  OP_REQUIRES_OK(ctx, RecvTotalBytes(ctx, frame_iter, is_dead, total_bytes));
+  if (is_dead) {
+    return;
+  }
+
+  // if total bytes is smaller than slice size, recv directly.
+  if (total_bytes <= slice_size_) {
+    Rendezvous::Args args;
+    args.device_context = ctx->op_device_context();
+    args.alloc_attrs = ctx->output_alloc_attr(0);
+    if (ctx->is_eager()) {
+      // NOTE(fishx): Only set cancellation_manager in eager mode. Because in
+      // Tensorflow 1.x, session (or graph_mgr) will abort the underlying
+      // rendezvous if it encounters any error.
+      args.cancellation_manager = ctx->cancellation_manager();
+    }
+
+    Rendezvous::ParsedKey parsed_key;
+    GetSliceRendezvousKey(key_prefix_, "_transfer_data", frame_iter,
+                          &parsed_key.buf_);
+    VLOG(2) << "SliceRecv " << parsed_key.buf_;
+    OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+    Tensor data_t;
+    OP_REQUIRES_OK(ctx, ctx->rendezvous()->Recv(parsed_key, args, &data_t,
+                                                &is_dead, timeout_ms_));
+
+    // This shouldn't be a dead tensor.
+    CHECK_EQ(is_dead, false);
+    ctx->set_output(0, data_t);
+    return;
+  }
+
+  // recv shape.
+  TensorShape shape;
+  OP_REQUIRES_OK(ctx, RecvShape(ctx, frame_iter, shape));
+
+  // recv data
+  Tensor* output_t = nullptr;
+  OP_REQUIRES_OK(ctx, ctx->allocate_output(0, shape, &output_t));
+  if (dtype_ == DT_STRING) {
+    OP_REQUIRES_OK(ctx, RecvString(ctx, frame_iter, shape, output_t));
+  } else {
+    OP_REQUIRES_OK(ctx, RecvBasicType(ctx, frame_iter, total_bytes, output_t));
+  }
+}
+
+Status SliceRecvOp::RecvTotalBytes(OpKernelContext* ctx,
+                                   const FrameAndIter& frame_iter,
+                                   bool& is_dead, int64& total_bytes) {
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = AllocatorAttributes();
+  if (ctx->is_eager()) {
+    // NOTE(fishx): Only set cancellation_manager in eager mode. Because in
+    // Tensorflow 1.x, session (or graph_mgr) will abort the underlying
+    // rendezvous if it encounters any error.
+    args.cancellation_manager = ctx->cancellation_manager();
+  }
+
+  Rendezvous::ParsedKey parsed_key;
+  Tensor total_bytes_t;
+  GetSliceRendezvousKey(key_prefix_, "_slice_transfer_totalbytes", frame_iter,
+                        &parsed_key.buf_);
+  VLOG(2) << "SliceRecv " << parsed_key.buf_;
+  TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+  TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &total_bytes_t,
+                                             &is_dead, timeout_ms_));
+  if (!is_dead) {
+    total_bytes = total_bytes_t.scalar<int64>()();
+  }
+
+  return Status::OK();
+}
+
+Status SliceRecvOp::RecvShape(OpKernelContext* ctx,
+                              const FrameAndIter& frame_iter,
+                              TensorShape& shape) {
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = AllocatorAttributes();
+  if (ctx->is_eager()) {
+    // NOTE(fishx): Only set cancellation_manager in eager mode. Because in
+    // Tensorflow 1.x, session (or graph_mgr) will abort the underlying
+    // rendezvous if it encounters any error.
+    args.cancellation_manager = ctx->cancellation_manager();
+  }
+
+  Rendezvous::ParsedKey parsed_key;
+  GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", frame_iter,
+                        &parsed_key.buf_);
+  VLOG(2) << "SliceRecv " << parsed_key.buf_;
+  TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+
+  Tensor shape_t;
+  bool is_dead;
+  TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &shape_t,
+                                             &is_dead, timeout_ms_));
+  // This shouldn't be a dead tensor.
+  CHECK_EQ(is_dead, false);
+  auto shape_vec = shape_t.vec<int64>();
+  const int64 num_elements = shape_t.NumElements();
+  for (int64 i = 0; i < num_elements; i++) {
+    shape.AddDim(shape_vec(i));
+  }
+
+  return Status::OK();
+}
+
+Status SliceRecvOp::RecvString(OpKernelContext* ctx,
+                               const FrameAndIter& frame_iter,
+                               const TensorShape& shape, Tensor*& output_t) {
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = AllocatorAttributes();
+  if (ctx->is_eager()) {
+    // NOTE(fishx): Only set cancellation_manager in eager mode. Because in
+    // Tensorflow 1.x, session (or graph_mgr) will abort the underlying
+    // rendezvous if it encounters any error.
+    args.cancellation_manager = ctx->cancellation_manager();
+  }
+  Rendezvous::ParsedKey parsed_key;
+  bool is_dead;
+
+  // recv elements size.
+  GetSliceRendezvousKey(key_prefix_, "_slice_transfer_elements_size",
+                        frame_iter, &parsed_key.buf_);
+  VLOG(2) << "SliceRecv " << parsed_key.buf_;
+  TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+  Tensor elements_size_t;
+  TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args,
+                                             &elements_size_t, &is_dead,
+                                             timeout_ms_));
+  // This shouldn't be a dead tensor.
+  CHECK_EQ(is_dead, false);
+  auto elements_size_flat = elements_size_t.flat<int64>();
+  int64 num_elements = shape.num_elements();
+  args.alloc_attrs = ctx->output_alloc_attr(0);
+  Tensor data_t;
+  auto output_flat = output_t->flat<tstring>();
+  for (int64 i = 0; i < num_elements; i++) {
+    if (elements_size_flat(i) <= slice_size_) {
+      std::string tensor_name_suffix = \
+        strings::StrCat("_slice_transfer_data_", std::to_string(i));
+      GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter,
+                            &parsed_key.buf_);
+      VLOG(2) << "SliceRecv " << parsed_key.buf_;
+      TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+      TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t,
+                                                 &is_dead, timeout_ms_));
+      // This shouldn't be a dead tensor.
+      CHECK_EQ(is_dead, false);
+      output_flat(i) = data_t.scalar<tstring>()();
+    } else {
+      TF_RETURN_IF_ERROR(RecvStringSlice(ctx, frame_iter, i,
+                                         elements_size_flat(i), output_flat));
+    }
+  }
+
+  return Status::OK();
+}
+
+Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx,
+                                    const FrameAndIter& frame_iter,
+                                    const int64 index, const int64 element_size,
+                                    TTypes<tstring>::Flat& output_flat) {
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = ctx->output_alloc_attr(0);
+  if (ctx->is_eager()) {
+    // NOTE(fishx): Only set cancellation_manager in eager mode. Because in
+    // Tensorflow 1.x, session (or graph_mgr) will abort the underlying
+    // rendezvous if it encounters any error.
+    args.cancellation_manager = ctx->cancellation_manager();
+  }
+  Rendezvous::ParsedKey parsed_key;
+
+  int64 slice_num = (element_size + slice_size_ - 1) / slice_size_;
+  Tensor data_t;
+  bool is_dead = false;
+  for (int64 i = 0; i < slice_num; i++) {
+    std::string tensor_name_suffix = \
+      strings::StrCat("_slice_transfer_data_", std::to_string(index), "_",
+                      std::to_string(i));
+    GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter,
+                          &parsed_key.buf_);
+    VLOG(2) << "SliceRecv " << parsed_key.buf_;
+    TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+    TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t,
+                                               &is_dead, timeout_ms_));
+    // This shouldn't be a dead tensor.
+    CHECK_EQ(is_dead, false);
+    output_flat(index) += data_t.scalar<tstring>()();
+  }
+
+  return Status::OK();
+}
+
+Status SliceRecvOp::RecvBasicType(OpKernelContext* ctx,
+                                  const FrameAndIter& frame_iter,
+                                  const int64 total_bytes,
+                                  Tensor*& output_t) {
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = ctx->output_alloc_attr(0);
+  if (ctx->is_eager()) {
+    // NOTE(fishx): Only set cancellation_manager in eager mode. Because in
+    // Tensorflow 1.x, session (or graph_mgr) will abort the underlying
+    // rendezvous if it encounters any error.
+    args.cancellation_manager = ctx->cancellation_manager();
+  }
+  Rendezvous::ParsedKey parsed_key;
+
+  Tensor data_t;
+  bool is_dead = false;
+  int64 slice_num = (total_bytes + slice_size_ - 1) / slice_size_;
+  unsigned char* output_base = \
+    reinterpret_cast<unsigned char*>(output_t->data());
+  for (int64 i = 0; i < slice_num; i++) {
+    int64 start = i * slice_size_;
+    int64 copy_size = slice_size_;
+    if (start > total_bytes - slice_size_) {
+      copy_size = total_bytes - start;
+    }
+    std::string tensor_name_suffix = \
+      strings::StrCat("_slice_transfer_data_", std::to_string(i));
+    GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter,
+                          &parsed_key.buf_);
+    VLOG(2) << "SliceRecv " << parsed_key.buf_;
+    TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+    TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t,
+                                               &is_dead, timeout_ms_));
+    // This shouldn't be a dead tensor.
+    CHECK_EQ(is_dead, false);
+    auto data_base = data_t.data();
+    std::memcpy(output_base + start, data_base, copy_size);
+  }
+
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("_SliceRecv").Device(DEVICE_CPU), SliceRecvOp);
+REGISTER_KERNEL_BUILDER(Name("_SliceRecv").Device(DEVICE_DEFAULT), SliceRecvOp);
+
+} // End of namespace tensorflow
diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.h b/tensorflow/core/kernels/slice_sendrecv_ops.h
new file mode 100644
index 00000000000..df55c080aa1
--- /dev/null
+++ b/tensorflow/core/kernels/slice_sendrecv_ops.h
@@ -0,0 +1,89 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_OPS_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+class SliceSendOp : public OpKernel {
+ public:
+  explicit SliceSendOp(OpKernelConstruction* ctx);
+  void Compute(OpKernelContext* ctx) override;
+
+ private:
+  // Variables.
+  string key_prefix_;
+  bool hostmem_sendrecv_;
+  int32 slice_size_;
+  DataType dtype_;
+
+  // Functions.
+  Status SendTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                        const Tensor& input_t);
+
+  Status SendShape(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                   const Tensor& input_t);
+
+  Status SendString(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                    const Tensor& input_t);
+
+  Status SendStringSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                         const std::string& elem, int64 index);
+
+  Status SendBasicType(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                       const Tensor& input_t);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(SliceSendOp);
+};
+
+class SliceRecvOp : public OpKernel {
+ public:
+  explicit SliceRecvOp(OpKernelConstruction* ctx);
+  void Compute(OpKernelContext* ctx) override;
+
+ private:
+  // Variables.
+  string key_prefix_;
+  bool hostmem_sendrecv_;
+  int32 slice_size_;
+  int64 timeout_ms_;
+  DataType dtype_;
+
+  // Functions.
+  Status RecvTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                        bool& is_dead, int64& total_bytes);
+
+  Status RecvShape(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                   TensorShape& shape);
+
+  Status RecvString(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                    const TensorShape& shape, Tensor*& output_t);
+
+  Status RecvStringSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                         const int64 index, const int64 element_size,
+                         TTypes<tstring>::Flat& output_flat);
+
+  Status RecvBasicType(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                       const int64 total_bytes, Tensor*& output_t);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(SliceRecvOp);
+};
+
+} // End of namespace tensorflow
+
+#endif // End of TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_OPS_H_
diff --git a/tensorflow/core/kernels/slice_sendrecv_ops_test.cc b/tensorflow/core/kernels/slice_sendrecv_ops_test.cc
new file mode 100644
index 00000000000..5693ed57918
--- /dev/null
+++ b/tensorflow/core/kernels/slice_sendrecv_ops_test.cc
@@ -0,0 +1,339 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+namespace {
+// Implement a trivial version of the Rendezvous interface, to avoid
+// clouding the benchmark results with the time spent in the various
+// implementations, and to avoid the duplicate-send or duplicate-recv
+// errors that would arise from running either benchmark in a loop.
+class DummyRendezvous : public Rendezvous {
+  // Functions.
+  Status Send(const ParsedKey& key, const Args& args, const Tensor& val,
+              const bool is_dead) override {
+    std::string key_str = { key.FullKey().data(), key.FullKey().size() };
+    mutex_lock l(mu_);
+    // consumer does not reach.
+    if (kv_.count(key_str) == 0) {
+      struct Var var;
+      var.type = send;
+      var.args = args;
+      var.data = val;
+      var.is_dead = is_dead;
+
+      kv_[key_str] = var;
+      return Status::OK();
+    }
+
+    auto var = kv_[key_str];
+    CHECK_EQ(var.type, recv);
+    var.done(Status::OK(), args, var.args, val, is_dead);
+    kv_.erase(key_str);
+    return Status::OK();
+  }
+  void RecvAsync(const ParsedKey& key, const Args& args,
+                 DoneCallback done) override {
+    std::string key_str = { key.FullKey().data(), key.FullKey().size() };
+
+    mutex_lock l(mu_);
+    // producer does not reach.
+    if (kv_.count(key_str) == 0) {
+      struct Var var;
+      var.type = recv;
+      var.args = args;
+      var.done = done;
+
+      kv_[key_str] = var;
+      return;
+    }
+
+    auto var = kv_[key_str];
+    CHECK_EQ(var.type, send);
+    done(Status::OK(), var.args, args, var.data, var.is_dead);
+    kv_.erase(key_str);
+  }
+  void StartAbort(const Status& status) override {}
+
+ private:
+  enum RendezvousType {
+    send,
+    recv
+  };
+  // Type define.
+  struct Var {
+    RendezvousType type;
+    Args args;
+    Tensor data;
+    bool is_dead;
+    DoneCallback done;
+  };
+
+  // Variables.
+  mutex mu_;
+  std::unordered_map<std::string, Var> kv_ GUARDED_BY(mu_);
+};
+
+Node* SliceSend(Graph* g, Node* input, const string& tensor,
+                const string& sender, const uint64 sender_incarnation,
+                const string& receiver, const int32 slice_size) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_SliceSend")
+                  .Input(input, 0)
+                  .Attr("tensor_name", tensor)
+                  .Attr("send_device", sender)
+                  .Attr("send_device_incarnation",
+                        static_cast<int64>(sender_incarnation))
+                  .Attr("recv_device", receiver)
+                  .Attr("slice_size", slice_size)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
+Node* SliceRecv(Graph* g, const string& tensor, const string& type,
+                const string& sender, const uint64 sender_incarnation,
+                const string& receiver, const int32 slice_size,
+                const int64 timeout_ms) {
+  Node* ret;
+  DataType dtype;
+  CHECK(DataTypeFromString(type, &dtype));
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_SliceRecv")
+                  .Attr("tensor_type", dtype)
+                  .Attr("tensor_name", tensor)
+                  .Attr("send_device", sender)
+                  .Attr("send_device_incarnation",
+                        static_cast<int64>(sender_incarnation))
+                  .Attr("recv_device", receiver)
+                  .Attr("slice_size", slice_size)
+                  .Attr("timeout_ms", timeout_ms)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
+Node* Equal(Graph* g, Node* x, Node* y) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Equal")
+                  .Input(x)
+                  .Input(y)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
+Node* ReduceAll(Graph* g, Node* input, Node* axes) {
+  return test::graph::Reduce(g, "All", input, axes);
+}
+
+Node* Assert(Graph* g, Node* condition,
+             std::vector<NodeBuilder::NodeOut>& data) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Assert")
+                  .Input(condition)
+                  .Input(data)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
+static Graph* TransferStringTensor() {
+  Graph* g = new Graph(OpRegistry::Global());
+  const int32 slice_size = 1024;
+  const int64 timeout_ms = 5000;
+  std::string str = "The quick brown fox jumps over the lazy dog."; // 44 chars.
+
+  Tensor input_t(DT_STRING, TensorShape({2, 4}));
+  input_t.flat<tstring>().setConstant(str); // total bytes: 44*8=352 bytes.
+  Node* input_n = test::graph::Constant(g, input_t);
+  SliceSend(g, input_n, "T", "/cpu:0", 1, "/cpu:0", slice_size);
+  Node* recv_n = \
+    SliceRecv(g, "T", "string", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms);
+
+  Node* equal_n = Equal(g, input_n, recv_n);
+
+  Tensor axes_t(DT_INT32, TensorShape({input_t.dims()}));
+  auto axes_flat = axes_t.flat<int32>();
+  for (int i = 0; i < input_t.dims(); i++) {
+    axes_flat(i) = i;
+  }
+  Node* reduce_all_n = ReduceAll(g, equal_n, test::graph::Constant(g, axes_t));
+
+  std::vector<NodeBuilder::NodeOut> data_out;
+  data_out.emplace_back(input_n, 0);
+  data_out.emplace_back(recv_n, 0);
+  Assert(g, reduce_all_n, data_out);
+
+  return g;
+}
+
+static Graph* TransferBasicTypeTensor() {
+  Graph* g = new Graph(OpRegistry::Global());
+  const int32 slice_size = 1024;
+  const int64 timeout_ms = 5000;
+
+  Tensor input_t(DT_FLOAT, TensorShape({2, 8}));
+  input_t.flat<float>().setConstant(2); // total bytes = 4*2*8=64 bytes.
+  Node* input_n = test::graph::Constant(g, input_t);
+  SliceSend(g, input_n, "T", "/cpu:0", 1, "/cpu:0", slice_size);
+  Node* recv_n = \
+    SliceRecv(g, "T", "float32", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms);
+
+  Node* equal_n = Equal(g, input_n, recv_n);
+
+  Tensor axes_t(DT_INT32, TensorShape({input_t.dims()}));
+  auto axes_flat = axes_t.flat<int32>();
+  for (int i = 0; i < input_t.dims(); i++) {
+    axes_flat(i) = i;
+  }
+  Node* reduce_all_n = ReduceAll(g, equal_n, test::graph::Constant(g, axes_t));
+
+  std::vector<NodeBuilder::NodeOut> data_out;
+  data_out.emplace_back(input_n, 0);
+  data_out.emplace_back(recv_n, 0);
+  Assert(g, reduce_all_n, data_out);
+
+  return g;
+}
+
+static Graph* TransferBigStringTensor() {
+  Graph* g = new Graph(OpRegistry::Global());
+  const int32 slice_size = 16;
+  const int64 timeout_ms = 5000;
+  std::string str = "The quick brown fox jumps over the lazy dog."; // 44 chars.
+
+  Tensor input_t(DT_STRING, TensorShape({2, 4}));
+  input_t.flat<tstring>().setConstant(str);
+  input_t.flat<tstring>()(0) = "short str";
+  Node* input_n = \
+    test::graph::Constant(g, input_t); // total bytes: 44*7+9=317 bytes.
+  SliceSend(g, input_n, "T", "/cpu:0", 1, "/cpu:0", slice_size);
+  Node* recv_n = \
+    SliceRecv(g, "T", "string", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms);
+
+  Node* equal_n = Equal(g, input_n, recv_n);
+
+  Tensor axes_t(DT_INT32, TensorShape({input_t.dims()}));
+  auto axes_flat = axes_t.flat<int32>();
+  for (int i = 0; i < input_t.dims(); i++) {
+    axes_flat(i) = i;
+  }
+  Node* reduce_all_n = ReduceAll(g, equal_n, test::graph::Constant(g, axes_t));
+
+  std::vector<NodeBuilder::NodeOut> data_out;
+  data_out.emplace_back(input_n, 0);
+  data_out.emplace_back(recv_n, 0);
+  Assert(g, reduce_all_n, data_out);
+
+  return g;
+}
+
+static Graph* TransferBigBasicTypeTensor() {
+  Graph* g = new Graph(OpRegistry::Global());
+  const int32 slice_size = 16;
+  const int64 timeout_ms = 5000;
+
+  Tensor input_t(DT_FLOAT, TensorShape({2, 8}));
+  input_t.flat<float>().setConstant(2); // total bytes: 4*2*8=64
+  Node* input_n = test::graph::Constant(g, input_t);
+  SliceSend(g, input_n, "T", "/cpu:0", 1, "/cpu:0", slice_size);
+  Node* recv_n = \
+    SliceRecv(g, "T", "float32", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms);
+
+  Node* equal_n = Equal(g, input_n, recv_n);
+
+  Tensor axes_t(DT_INT32, TensorShape({input_t.dims()}));
+  auto axes_flat = axes_t.flat<int32>();
+  for (int i = 0; i < input_t.dims(); i++) {
+    axes_flat(i) = i;
+  }
+  Node* reduce_all_n = ReduceAll(g, equal_n, test::graph::Constant(g, axes_t));
+
+  std::vector<NodeBuilder::NodeOut> data_out;
+  data_out.emplace_back(input_n, 0);
+  data_out.emplace_back(recv_n, 0);
+  Assert(g, reduce_all_n, data_out);
+
+  return g;
+}
+
+static Graph* TransferDeadTensor() {
+  Graph* g = new Graph(OpRegistry::Global());
+  const int32 slice_size = 1024;
+  const int64 timeout_ms = 5000;
+
+  // val
+  Tensor val_t(DT_FLOAT, TensorShape({}));
+  val_t.scalar<float>()() = 2;
+  Node* val_n = test::graph::Constant(g, val_t);
+
+  Tensor pred_t(DT_BOOL, TensorShape({}));
+  pred_t.scalar<bool>()() = true;
+  Node* pred_n = test::graph::Constant(g, pred_t);
+
+  Node* switch_n = test::graph::Switch(g, val_n, pred_n);
+  SliceSend(g, switch_n, "T", "/cpu:0", 1, "/cpu:0", slice_size);
+  SliceRecv(g, "T", "float32", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms);
+
+  return g;
+}
+
+static void BM_TransferStringTensor(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", TransferStringTensor(), nullptr, nullptr,
+                  new DummyRendezvous).Run(iters);
+}
+
+static void BM_TransferBasicTypeTensor(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", TransferBasicTypeTensor(), nullptr, nullptr,
+                  new DummyRendezvous).Run(iters);
+}
+
+static void BM_TransferBigStringTensor(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", TransferBigStringTensor(), nullptr, nullptr,
+                  new DummyRendezvous).Run(iters);
+}
+
+static void BM_TransferBigBasicTypeTensor(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", TransferBigBasicTypeTensor(), nullptr, nullptr,
+                  new DummyRendezvous).Run(iters);
+}
+
+static void BM_TransferDeadTensor(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", TransferDeadTensor(), nullptr, nullptr,
+                  new DummyRendezvous).Run(iters);
+}
+
+BENCHMARK(BM_TransferStringTensor);
+BENCHMARK(BM_TransferBasicTypeTensor);
+BENCHMARK(BM_TransferBigStringTensor);
+BENCHMARK(BM_TransferBigBasicTypeTensor);
+BENCHMARK(BM_TransferDeadTensor);
+
+} // End of anonymous namespace
+
+} // End of namespace tensorflow
diff --git a/tensorflow/core/ops/slice_sendrecv_ops.cc b/tensorflow/core/ops/slice_sendrecv_ops.cc
new file mode 100644
index 00000000000..11905712410
--- /dev/null
+++ b/tensorflow/core/ops/slice_sendrecv_ops.cc
@@ -0,0 +1,78 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+
+namespace tensorflow {
+
+REGISTER_OP("_SliceSend")
+    .Input("tensor: T")
+    .Attr("T: type")
+    .Attr("tensor_name: string")
+    .Attr("send_device: string")
+    .Attr("send_device_incarnation: int")
+    .Attr("recv_device: string")
+    .Attr("client_terminated: bool = false")
+    .Attr("slice_size: int >= 1")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Sends the named tensor from send_device to recv_device.
+Supports sending a tensor of any size.
+
+tensor: The tensor to send.
+tensor_name: The name of the tensor to send.
+send_device: The name of the device sending the tensor.
+send_device_incarnation: The current incarnation of send_device.
+recv_device: The name of the device receiving the tensor.
+client_terminated: If set to true, this indicates that the node was added
+  to the graph as a result of a client-side feed or fetch of Tensor data,
+  in which case the corresponding send or recv is expected to be managed
+  locally by the caller.
+slice_size: The maximum number of bytes transferred at one time.
+)doc");
+
+REGISTER_OP("_SliceRecv")
+    .Output("tensor: tensor_type")
+    .Attr("tensor_type: type")
+    .Attr("tensor_name: string")
+    .Attr("send_device: string")
+    .Attr("send_device_incarnation: int")
+    .Attr("recv_device: string")
+    .Attr("client_terminated: bool = false")
+    .Attr("slice_size: int >= 1")
+    .Attr("timeout_ms: int >= 0 = 300000")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Receives the named tensor from send_device on recv_device.
+Supports receiving a tensor of any size.
+
+tensor: The tensor to receive.
+tensor_name: The name of the tensor to receive.
+send_device: The name of the device sending the tensor.
+send_device_incarnation: The current incarnation of send_device.
+recv_device: The name of the device receiving the tensor.
+client_terminated: If set to true, this indicates that the node was added
+  to the graph as a result of a client-side feed or fetch of Tensor data,
+  in which case the corresponding send or recv is expected to be managed
+  locally by the caller.
+slice_size: The maximum number of bytes transferred at one time.
+timeout_ms: The maximum wait time for receiving a tensor.
+)doc");
+
+} // End of namespace tensorflow

From 6bf562197efaedccc8026d1d05ac23e27d3b2521 Mon Sep 17 00:00:00 2001
From: Chen Ding
Date: Wed, 20 Dec 2023 15:47:52 +0800
Subject: [PATCH 69/91] [Embedding] undefine EV GPU interface in CPU compile.
 (#956)

Signed-off-by: candy.dc
---
 .../core/framework/embedding/embedding_var.h | 91 +++++++++----------
 1 file changed, 45 insertions(+), 46 deletions(-)

diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h
index df6ae6f1277..c0d26a2f4d8 100644
--- a/tensorflow/core/framework/embedding/embedding_var.h
+++ b/tensorflow/core/framework/embedding/embedding_var.h
@@ -140,13 +140,6 @@ class EmbeddingVar : public ResourceBase {
     return storage_->Get(key, value_ptr);
   }
 
-  void BatchLookupKey(const EmbeddingVarContext<GPUDevice>& ctx,
-                      const K* keys,
-                      void** value_ptr_list,
-                      int64 num_of_keys) {
-    storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys);
-  }
-
   Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter,
                            bool indices_as_pointer, int64 count = 1) {
@@ -167,45 +160,6 @@ class EmbeddingVar : public ResourceBase {
     return Status::OK();
   }
 
-  Status LookupOrCreateKey(const EmbeddingVarContext<GPUDevice>& context,
-                           const K* keys,
-                           void** value_ptrs,
-                           int64 num_of_keys,
-                           int64* indices_counts,
-                           bool indices_as_pointer = false) {
-    if (indices_as_pointer) {
-      auto lookup_key_and_set_version_fn = [keys, value_ptrs]
-          (int64 start, int64 limit) {
-        for (int i = start; i < limit; i++) {
-          value_ptrs[i] = (void*)keys[i];
-        }
-      };
-      const int64 unit_cost = 1000; //very unreliable estimate for cost per step.
-      auto worker_threads = context.worker_threads;
-      Shard(worker_threads->num_threads,
-            worker_threads->workers, num_of_keys, unit_cost,
-            lookup_key_and_set_version_fn);
-    } else {
-      filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys);
-    }
-
-    if (indices_counts != nullptr) {
-      auto add_freq_fn = [this, value_ptrs, indices_counts]
-          (int64 start, int64 limit) {
-        for (int i = start; i < limit; i++) {
-          feat_desc_->AddFreq(value_ptrs[i], indices_counts[i]);
-        }
-      };
-      const int64 unit_cost = 1000; //very unreliable estimate for cost per step.
-      auto worker_threads = context.worker_threads;
-      Shard(worker_threads->num_threads,
-            worker_threads->workers, num_of_keys, unit_cost,
-            add_freq_fn);
-    }
-    return Status::OK();
-  }
-
-
   Status LookupOrCreateKey(K key, void** value_ptr) {
     Status s = storage_->GetOrCreate(key, value_ptr);
     TF_CHECK_OK(s);
@@ -402,6 +356,51 @@ class EmbeddingVar : public ResourceBase {
     storage_->AddToCache(keys_tensor);
   }
 
+
+  void BatchLookupKey(const EmbeddingVarContext<GPUDevice>& ctx,
+                      const K* keys,
+                      void** value_ptr_list,
+                      int64 num_of_keys) {
+    storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys);
+  }
+
+  Status LookupOrCreateKey(const EmbeddingVarContext<GPUDevice>& context,
+                           const K* keys,
+                           void** value_ptrs,
+                           int64 num_of_keys,
+                           int64* indices_counts,
+                           bool indices_as_pointer = false) {
+    if (indices_as_pointer) {
+      auto lookup_key_and_set_version_fn = [keys, value_ptrs]
+          (int64 start, int64 limit) {
+        for (int i = start; i < limit; i++) {
+          value_ptrs[i] = (void*)keys[i];
+        }
+      };
+      const int64 unit_cost = 1000; //very unreliable estimate for cost per step.
+      auto worker_threads = context.worker_threads;
+      Shard(worker_threads->num_threads,
+            worker_threads->workers, num_of_keys, unit_cost,
+            lookup_key_and_set_version_fn);
+    } else {
+      filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys);
+    }
+
+    if (indices_counts != nullptr) {
+      auto add_freq_fn = [this, value_ptrs, indices_counts]
+          (int64 start, int64 limit) {
+        for (int i = start; i < limit; i++) {
+          feat_desc_->AddFreq(value_ptrs[i], indices_counts[i]);
+        }
+      };
+      const int64 unit_cost = 1000; //very unreliable estimate for cost per step.
+      auto worker_threads = context.worker_threads;
+      Shard(worker_threads->num_threads,
+            worker_threads->workers, num_of_keys, unit_cost,
+            add_freq_fn);
+    }
+    return Status::OK();
+  }
 #endif
 
 #if GOOGLE_CUDA

From 0f536a2849528e2c25dd7f496a00d810acd5e72c Mon Sep 17 00:00:00 2001
From: Chen Bangduo
Date: Tue, 26 Dec 2023 16:14:06 +0800
Subject: [PATCH 70/91] [Op] Implement FileSliceSend/FileSliceRecvOp. (#960)

The FileSliceSend/FileSliceRecv ops transfer a scalar string tensor
to/from the SliceRecv/SliceSend ops.
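A minimal wiring sketch (the graph-construction style mirrors the
slice_sendrecv unit tests; the recv_dir value is a hypothetical path and the
node names are illustrative):

    // Sender ships the file named by a scalar string tensor in 16-byte slices.
    Node* send_n;
    TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_FileSliceSend")
                    .Input(file_path_n, 0)
                    .Attr("tensor_name", "T")
                    .Attr("send_device", "/cpu:0")
                    .Attr("send_device_incarnation", static_cast<int64>(1))
                    .Attr("recv_device", "/cpu:0")
                    .Attr("slice_size", 16)
                    .Finalize(g, &send_n));

    // Receiver reassembles the slices under recv_dir and outputs the new
    // file's path as a scalar string.
    Node* recv_n;
    TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_FileSliceRecv")
                    .Attr("tensor_name", "T")
                    .Attr("send_device", "/cpu:0")
                    .Attr("send_device_incarnation", static_cast<int64>(1))
                    .Attr("recv_device", "/cpu:0")
                    .Attr("recv_dir", "/tmp/file_slice_recv")  // hypothetical
                    .Attr("slice_size", 16)
                    .Attr("timeout_ms", static_cast<int64>(5000))
                    .Finalize(g, &recv_n));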
Signed-off-by: chenbangduo.cbd
---
 tensorflow/core/BUILD                          |   2 +
 tensorflow/core/framework/rendezvous.h         |   2 +
 tensorflow/core/graph/graph.cc                 |   2 +
 tensorflow/core/graph/graph.h                  |  12 +-
 tensorflow/core/grappler/op_types.cc           |  10 +-
 tensorflow/core/grappler/op_types.h            |   2 +
 tensorflow/core/kernels/BUILD                  |  46 +-
 .../core/kernels/file_slice_sendrecv_ops.cc    | 482 +++++++++++++++++
 .../core/kernels/file_slice_sendrecv_ops.h     |  98 ++++
 .../kernels/file_slice_sendrecv_ops_test.cc    | 483 ++++++++++++++++++
 tensorflow/core/kernels/slice_sendrecv_ops.cc  | 175 +++----
 tensorflow/core/kernels/slice_sendrecv_ops.h   |   6 +-
 .../core/kernels/slice_sendrecv_utils.cc       |  53 ++
 .../core/kernels/slice_sendrecv_utils.h        |  41 ++
 .../core/ops/file_slice_sendrecv_ops.cc        |  77 +++
 15 files changed, 1388 insertions(+), 103 deletions(-)
 create mode 100644 tensorflow/core/kernels/file_slice_sendrecv_ops.cc
 create mode 100644 tensorflow/core/kernels/file_slice_sendrecv_ops.h
 create mode 100644 tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc
 create mode 100644 tensorflow/core/kernels/slice_sendrecv_utils.cc
 create mode 100644 tensorflow/core/kernels/slice_sendrecv_utils.h
 create mode 100644 tensorflow/core/ops/file_slice_sendrecv_ops.cc

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index ce6850eb9da..07115cfea3c 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1203,6 +1203,7 @@ tf_gen_op_libs(
         "encode_proto_ops",
         "experimental_dataset_ops",
         "feature_column_ops",
+        "file_slice_sendrecv_ops",
         "function_ops",
         "functional_ops",
         "fused_embedding_ops",
@@ -1465,6 +1466,7 @@ cc_library(
         ":encode_proto_ops_op_lib",
         ":experimental_dataset_ops_op_lib",
         ":feature_column_ops_op_lib",
+        ":file_slice_sendrecv_ops_op_lib",
        ":function_ops_op_lib",
        ":functional_ops_op_lib",
        ":fused_embedding_ops_op_lib",
diff --git a/tensorflow/core/framework/rendezvous.h b/tensorflow/core/framework/rendezvous.h
index 3c2b20379c8..3aa65534272 100644
--- a/tensorflow/core/framework/rendezvous.h
+++ b/tensorflow/core/framework/rendezvous.h
@@ -82,6 +82,8 @@ class Rendezvous : public core::RefCounted {
     friend class FuseRecvOp;
     friend class SliceSendOp;
     friend class SliceRecvOp;
+    friend class FileSliceSendOp;
+    friend class FileSliceRecvOp;
     friend class RefSendOp;
     friend class RefRecvOp;
     string buf_;
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index d9709d39f3f..59b25ee7c36 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -70,12 +70,14 @@ const std::unordered_map<string, Node::NodeClass>& Node::kNodeClassTable =
     {"_HostSend", NC_HOST_SEND},
     {"_RefSend", NC_REF_SEND},
     {"_SliceSend", NC_SLICE_SEND},
+    {"_FileSliceSend", NC_FILE_SLICE_SEND},
     {"_Recv", NC_RECV},
     {"_HostRecv", NC_HOST_RECV},
    {"_RefRecv", NC_REF_RECV},
    {"_FuseRecv", NC_FUSE_RECV},
    {"_HostFuseRecv", NC_HOST_FUSE_RECV},
    {"_SliceRecv", NC_SLICE_RECV},
+    {"_FileSliceRecv", NC_FILE_SLICE_RECV},
    {"Const", NC_CONSTANT},
    {"HostConst", NC_CONSTANT},
    {"Variable", NC_VARIABLE},
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 0baf8f257a9..bd6d18cfc7c 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -220,15 +220,19 @@ class Node {
   bool IsSend() const { return class_ == NC_SEND ||
                                class_ == NC_HOST_SEND ||
                                class_ == NC_REF_SEND ||
-                               class_ == NC_SLICE_SEND; }
+                               class_ == NC_SLICE_SEND ||
+                               class_ == NC_FILE_SLICE_SEND; }
   bool IsSliceSend() const { return class_ == NC_SLICE_SEND; }
+  bool IsFileSliceSend() const { return class_ == NC_FILE_SLICE_SEND; }
   bool IsRecv() const {
     return class_ == NC_RECV ||
            class_ == NC_HOST_RECV ||
            class_ == NC_REF_RECV ||
-           class_ == NC_SLICE_RECV; }
+           class_ == NC_SLICE_RECV ||
+           class_ == NC_FILE_SLICE_RECV; }
   bool IsFuseRecv() const {
     return class_ == NC_FUSE_RECV || class_ == NC_HOST_FUSE_RECV; }
   bool IsSliceRecv() const {return class_ == NC_SLICE_RECV; }
+  bool IsFileSliceRecv() const { return class_ == NC_FILE_SLICE_RECV; }
   bool IsConstant() const { return class_ == NC_CONSTANT; }
   bool IsStage() const { return class_ == NC_TENSOR_BUFFER_PUT; }
   bool IsUnstage() const { return class_ == NC_TENSOR_BUFFER_TAKE; }
@@ -339,12 +343,14 @@ class Node {
     NC_HOST_SEND,
     NC_REF_SEND,
     NC_SLICE_SEND,
+    NC_FILE_SLICE_SEND,
     NC_RECV,
     NC_HOST_RECV,
     NC_REF_RECV,
     NC_FUSE_RECV,
     NC_HOST_FUSE_RECV,
     NC_SLICE_RECV,
+    NC_FILE_SLICE_RECV,
     NC_CONSTANT,
     NC_VARIABLE,
     NC_KV_VAR_HANDLE,
@@ -851,8 +857,10 @@ inline bool IsLoopCond(const Node* node) { return node->IsLoopCond(); }
 inline bool IsControlTrigger(const Node* n) { return n->IsControlTrigger(); }
 inline bool IsSend(const Node* node) { return node->IsSend(); }
 inline bool IsSliceSend(const Node* node) { return node->IsSliceSend(); }
+inline bool IsFileSliceSend(const Node* node) { return node->IsFileSliceSend(); }
 inline bool IsRecv(const Node* node) { return node->IsRecv(); }
 inline bool IsSliceRecv(const Node* node) { return node->IsSliceRecv(); }
+inline bool IsFileSliceRecv(const Node* node) { return node->IsFileSliceRecv(); }
 inline bool IsFuseRecv(const Node* node) { return node->IsFuseRecv(); }
 inline bool IsHostSend(const Node* node) { return node->IsHostSend(); }
 inline bool IsHostRecv(const Node* node) { return node->IsHostRecv(); }
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 1201623ffcd..fd72927bd79 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -265,6 +265,10 @@ bool IsExp(const NodeDef& node) { return node.op() == "Exp"; }
 
 bool IsFakeParam(const NodeDef& node) { return node.op() == "FakeParam"; }
 
+bool IsFileSliceRecv(const NodeDef& node) { return node.op() == "_FileSliceRecv"; }
+
+bool IsFileSliceSend(const NodeDef& node) { return node.op() == "_FileSliceSend"; }
+
 bool IsFill(const NodeDef& node) { return node.op() == "Fill"; }
 
 bool IsFloorDiv(const NodeDef& node) { return node.op() == "FloorDiv"; }
@@ -454,7 +458,8 @@ bool IsReciprocalGrad(const NodeDef& node) {
 }
 
 bool IsRecv(const NodeDef& node) {
-  return node.op() == "_Recv" || node.op() == "_HostRecv" || IsSliceRecv(node);
+  return node.op() == "_Recv" || node.op() == "_HostRecv" ||
+         IsSliceRecv(node) || IsFileSliceRecv(node);
 }
 
 bool IsFuseRecv(const NodeDef& node) {
@@ -502,7 +507,8 @@ bool IsSelect(const NodeDef& node) { return node.op() == "Select"; }
 bool IsSeluGrad(const NodeDef& node) { return node.op() == "SeluGrad"; }
 
 bool IsSend(const NodeDef& node) {
-  return node.op() == "_Send" || node.op() == "_HostSend" || IsSliceSend(node);
+  return node.op() == "_Send" || node.op() == "_HostSend" ||
+         IsSliceSend(node) || IsFileSliceSend(node);
 }
 
 bool IsShape(const NodeDef& node) { return node.op() == "Shape"; }
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 737581fd412..10968ad2547 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -80,6 +80,8 @@ bool IsExit(const NodeDef& node);
 bool IsExp(const NodeDef& node);
 bool IsFakeParam(const NodeDef& node);
 bool IsFill(const NodeDef& node);
+bool IsFileSliceRecv(const NodeDef& node);
+bool IsFileSliceSend(const NodeDef& node);
 bool IsFloorDiv(const NodeDef& node);
 bool IsFloorMod(const NodeDef& node);
 bool IsFusedBatchNorm(const NodeDef& node);
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 36721527cc2..4e6868a9897 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -5423,6 +5423,7 @@ cc_library(
     name = "required",
     deps = [
         ":no_op",
+        ":file_slice_sendrecv_ops",
         ":fuserecv_ops",
         ":sendrecv_ops",
         ":slice_sendrecv_ops",
@@ -5446,10 +5447,33 @@ tf_kernel_library(
     deps = REQUIRED_DEPS,
 )
 
+cc_library(
+    name = "slice_sendrecv_utils",
+    hdrs = [
+        "slice_sendrecv_utils.h",
+    ],
+    srcs = [
+        "slice_sendrecv_utils.cc",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+    ]
+)
+
 tf_kernel_library(
     name = "slice_sendrecv_ops",
     prefix = "slice_sendrecv_ops",
-    deps = REQUIRED_DEPS,
+    deps = REQUIRED_DEPS + [
+        ":slice_sendrecv_utils",
+    ],
+)
+
+tf_kernel_library(
+    name = "file_slice_sendrecv_ops",
+    prefix = "file_slice_sendrecv_ops",
+    deps = REQUIRED_DEPS + [
+        ":slice_sendrecv_utils",
+    ],
 )
 
 tf_kernel_library(
@@ -5534,6 +5558,26 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "file_slice_sendrecv_ops_test",
+    srcs = ["file_slice_sendrecv_ops_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),  # Required for benchmarking
+    deps = [
+        ":control_flow_ops",
+        ":cwise_op",
+        ":file_slice_sendrecv_ops",
+        ":logging_ops",
+        ":ops_testutil",
+        ":ops_util",
+        ":slice_sendrecv_ops",
+        ":whole_file_read_ops",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "fuserecv_ops",
     prefix = "fuserecv_ops",
diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops.cc b/tensorflow/core/kernels/file_slice_sendrecv_ops.cc
new file mode 100644
index 00000000000..6bfe54363f9
--- /dev/null
+++ b/tensorflow/core/kernels/file_slice_sendrecv_ops.cc
@@ -0,0 +1,482 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/file_slice_sendrecv_ops.h"
+#include "tensorflow/core/kernels/slice_sendrecv_utils.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+//------------------------------------------------------------------------------
+// Functions of FileSliceSendOp.
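+// FileSliceSend reads the file named by its scalar string input and streams
+// the contents through the rendezvous: total_bytes first, then (for payloads
+// larger than slice_size) shape, element_bytes and the data slices. The wire
+// format intentionally matches SliceSend, so the peer can be a plain
+// SliceRecv.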
+
+FileSliceSendOp::FileSliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  string send_device;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device));
+  string recv_device;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device));
+  uint64 send_device_incarnation;
+  OP_REQUIRES_OK(
+      ctx, ctx->GetAttr("send_device_incarnation",
+                        reinterpret_cast<int64*>(&send_device_incarnation)));
+  string tensor_name;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
+  key_prefix_ = \
+    slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device,
+      recv_device, send_device_incarnation, tensor_name);
+
+  if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) {
+    hostmem_sendrecv_ = false;
+  }
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("slice_size", &slice_size_));
+}
+
+void FileSliceSendOp::Compute(OpKernelContext* ctx) {
+  OP_REQUIRES(ctx, ctx->rendezvous() != nullptr,
+      errors::Internal("Op kernel context needs to provide a rendezvous."));
+
+  const Tensor& file_path_t = ctx->input(0);
+  if (!ctx->is_input_dead()) {
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(file_path_t.shape()),
+                errors::InvalidArgument("file_path is not a scalar: ",
+                                        file_path_t.shape().DebugString()));
+  }
+
+  FrameAndIter frame_iter = \
+    slice_sendrecv::GetFrameAndIter(ctx, hostmem_sendrecv_);
+
+  // get element_bytes.
+  uint64 element_bytes = 0;
+  OP_REQUIRES_OK(ctx, GetElementBytes(ctx, file_path_t, element_bytes));
+
+  // send total_bytes.
+  // total_bytes is the TotalBytes of the Tensor that contains the contents of
+  // the file. please refer Tensor::TotalBytes()
+  uint64 total_bytes = element_bytes + sizeof(tstring);
+  OP_REQUIRES_OK(ctx, SendTotalBytes(ctx, frame_iter, total_bytes));
+  // if input is dead, only send total_bytes dead tensor.
+  if (ctx->is_input_dead()) {
+    return;
+  }
+
+  // if total bytes is smaller than slice size, send directly.
+  if (total_bytes <= slice_size_) {
+    Rendezvous::Args args;
+    args.device_context = ctx->op_device_context();
+    args.alloc_attrs = ctx->input_alloc_attr(0);
+
+    Rendezvous::ParsedKey parsed_key;
+    slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_transfer_data",
+                                          frame_iter, &parsed_key.buf_);
+    VLOG(2) << "FileSliceSend " << parsed_key.buf_;
+    OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+    Tensor data_t;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_temp(DT_STRING, TensorShape({}), &data_t));
+    if (element_bytes > 0) {
+      OP_REQUIRES_OK(ctx, ReadFileToString(Env::Default(),
+          file_path_t.scalar<tstring>()(), data_t.scalar<tstring>().data()));
+    }
+    OP_REQUIRES_OK(ctx, ctx->rendezvous()->Send(parsed_key, args, data_t,
+                                                ctx->is_input_dead()));
+    return;
+  }
+
+  // send shape, in order to match the behavior of 'SliceSend'.
+  OP_REQUIRES_OK(ctx, SendScalarShape(ctx, frame_iter));
+
+  // send element bytes, in order to match the behavior of 'SliceSend'.
+  OP_REQUIRES_OK(ctx, SendElementBytes(ctx, frame_iter, element_bytes));
+
+  // send data.
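+  // The payload itself is streamed straight from the file in
+  // slice_size_-byte chunks; see SendFileSlice below.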
+  OP_REQUIRES_OK(ctx,
+                 SendFileSlice(ctx, frame_iter, file_path_t, element_bytes));
+}
+
+Status FileSliceSendOp::GetElementBytes(OpKernelContext* ctx,
+                                        const Tensor& file_path_t,
+                                        uint64& element_bytes) {
+  if (ctx->is_input_dead()) {
+    element_bytes = 0;
+    return Status::OK();
+  }
+
+  const string& file_path = file_path_t.scalar<tstring>()();
+  Env* env = Env::Default();
+
+  if (env->FileExists(file_path) != Status::OK()) {
+    element_bytes = 0;
+    return Status::OK();
+  }
+
+  return env->GetFileSize(file_path, &element_bytes);
+}
+
+Status FileSliceSendOp::SendUInt64MetaMsg(OpKernelContext* ctx,
+                                          const FrameAndIter& frame_iter,
+                                          const string& name,
+                                          const uint64 val) {
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = AllocatorAttributes();
+
+  Rendezvous::ParsedKey parsed_key;
+  Tensor val_t;
+  TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_UINT64, TensorShape({}), &val_t));
+  val_t.scalar<uint64>()() = val;
+  slice_sendrecv::GetSliceRendezvousKey(key_prefix_, name, frame_iter,
+                                        &parsed_key.buf_);
+  VLOG(2) << "FileSliceSend " << parsed_key.buf_;
+  TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+  return ctx->rendezvous()->Send(parsed_key, args, val_t,
+                                 ctx->is_input_dead());
+}
+
+Status FileSliceSendOp::SendTotalBytes(OpKernelContext* ctx,
+                                       const FrameAndIter& frame_iter,
+                                       const uint64 total_bytes) {
+  return SendUInt64MetaMsg(ctx, frame_iter, "_slice_transfer_totalbytes",
+                           total_bytes);
+}
+
+Status FileSliceSendOp::SendScalarShape(OpKernelContext* ctx,
+                                        const FrameAndIter& frame_iter) {
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = AllocatorAttributes();
+  Rendezvous::ParsedKey parsed_key;
+
+  Tensor shape_t;
+  TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, TensorShape({0}), &shape_t));
+  slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape",
+                                        frame_iter, &parsed_key.buf_);
+  VLOG(2) << "FileSliceSend " << parsed_key.buf_;
+  TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+
+  return ctx->rendezvous()->Send(parsed_key, args, shape_t,
+                                 ctx->is_input_dead());
+}
+
+Status FileSliceSendOp::SendElementBytes(OpKernelContext* ctx,
+                                         const FrameAndIter& frame_iter,
+                                         const uint64 element_bytes) {
+  return SendUInt64MetaMsg(ctx, frame_iter, "_slice_transfer_elements_bytes",
+                           element_bytes);
+}
+
+Status FileSliceSendOp::SendFileSlice(OpKernelContext* ctx,
+                                      const FrameAndIter& frame_iter,
+                                      const Tensor& file_path_t,
+                                      const uint64 element_bytes) {
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = AllocatorAttributes();
+  Rendezvous::ParsedKey parsed_key;
+
+  std::unique_ptr<RandomAccessFile> file;
+  Env* env = Env::Default();
+  const string& file_path = file_path_t.scalar<tstring>()();
+  TF_RETURN_IF_ERROR(env->NewRandomAccessFile(file_path, &file));
+
+  // Slice Send.
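+  // slice_num = ceil(element_bytes / slice_size_): every chunk except
+  // possibly the last is exactly slice_size_ bytes, and each chunk is read
+  // from disk on demand, so the whole file is never resident in memory.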
+  int64 slice_num = element_bytes / slice_size_;
+  if (element_bytes % slice_size_ != 0) {
+    slice_num += 1;
+  }
+  Tensor data_t;
+  for (int64 i = 0; i < slice_num; i++) {
+    TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_STRING, TensorShape({}), &data_t));
+    uint64 start = i * slice_size_;
+    uint64 copy_size = slice_size_;
+    if (start > element_bytes - slice_size_) {
+      copy_size = element_bytes - start;
+    }
+    TF_RETURN_IF_ERROR(ReadFileSlice(file, start, copy_size, data_t));
+    std::string tensor_name_suffix = \
+      strings::StrCat("_slice_transfer_data_", std::to_string(0), "_",
+                      std::to_string(i));
+    slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix,
+                                          frame_iter, &parsed_key.buf_);
+    VLOG(2) << "FileSliceSend " << parsed_key.buf_;
+    TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+    TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t,
+                                               ctx->is_input_dead()));
+  }
+
+  return Status::OK();
+}
+
+Status FileSliceSendOp::ReadFileSlice(
+    const std::unique_ptr<RandomAccessFile>& file,
+    const uint64 pos, const uint64 offset,
+    Tensor& data_t) {
+  string* data_s = data_t.scalar<tstring>().data();
+  gtl::STLStringResizeUninitialized(data_s, offset);
+  char* data_p = gtl::string_as_array(data_s);
+  StringPiece result;
+  TF_RETURN_IF_ERROR(file->Read(pos, offset, &result, data_p));
+  if (result.data() != data_p) {
+    memmove(data_p, result.data(), result.size());
+  }
+
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("_FileSliceSend").Device(DEVICE_CPU),
+                        FileSliceSendOp);
+REGISTER_KERNEL_BUILDER(Name("_FileSliceSend").Device(DEVICE_DEFAULT),
+                        FileSliceSendOp);
+
+//------------------------------------------------------------------------------
+// Functions of FileSliceRecvOp.
+
+FileSliceRecvOp::FileSliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  string send_device;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device));
+  string recv_device;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device));
+  uint64 send_device_incarnation;
+  OP_REQUIRES_OK(
+      ctx, ctx->GetAttr("send_device_incarnation",
+                        reinterpret_cast<int64*>(&send_device_incarnation)));
+  string tensor_name;
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
+  key_prefix_ = \
+    slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device,
+      recv_device, send_device_incarnation, tensor_name);
+  if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) {
+    hostmem_sendrecv_ = false;
+  }
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_dir", &recv_dir_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("slice_size", &slice_size_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("timeout_ms", &timeout_ms_));
+}
+
+void FileSliceRecvOp::Compute(OpKernelContext* ctx) {
+  OP_REQUIRES(ctx, ctx->rendezvous() != nullptr,
+      errors::Internal("Op kernel context needs to provide a rendezvous."));
+
+  FrameAndIter frame_iter = \
+    slice_sendrecv::GetFrameAndIter(ctx, hostmem_sendrecv_);
+
+  bool is_dead = false;
+  uint64 total_bytes = 0;
+  OP_REQUIRES_OK(ctx, RecvTotalBytes(ctx, frame_iter, is_dead, total_bytes));
+  if (is_dead) {
+    return;
+  }
+
+  // Create file path output.
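+  // Unlike SliceRecv, this op does not materialize the received bytes as its
+  // output; it writes them to a temporary file under recv_dir_ and returns
+  // that file's path as a scalar string tensor.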
+  Env* env = Env::Default();
+  if (!env->FileExists(recv_dir_).ok()) {
+    OP_REQUIRES_OK(ctx, env->RecursivelyCreateDir(recv_dir_));
+  }
+  const string& filename = GenerateRecvFileName(ctx->op_kernel().name());
+  const string& file_path = io::JoinPath(recv_dir_, "tempfilerecv-" + filename);
+  Tensor* file_path_t = nullptr;
+  OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &file_path_t));
+  file_path_t->scalar<tstring>()() = file_path;
+
+  // if total bytes is smaller than slice size, recv directly.
+  if (total_bytes <= slice_size_) {
+    OP_REQUIRES_OK(ctx, RecvFile(ctx, frame_iter, file_path));
+    return;
+  }
+
+  // recv shape, in order to match the behavior of 'SliceRecv'.
+  TensorShape shape;
+  OP_REQUIRES_OK(ctx, RecvShape(ctx, frame_iter, shape));
+  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(shape),
+      errors::InvalidArgument(
+          "FileSliceRecv only supports receiving a tensor with a scalar shape."));
+
+  // recv element_bytes, in order to match the behavior of 'SliceRecv'.
+  uint64 element_bytes = 0;
+  OP_REQUIRES_OK(ctx, RecvElementBytes(ctx, frame_iter, element_bytes));
+
+  // recv data.
+  OP_REQUIRES_OK(ctx, RecvFileSlice(ctx, frame_iter, element_bytes, file_path));
+}
+
+Status FileSliceRecvOp::RecvUInt64MetaMsg(OpKernelContext* ctx,
+                                          const FrameAndIter& frame_iter,
+                                          const string& name, bool& is_dead,
+                                          uint64& val) {
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = AllocatorAttributes();
+  if (ctx->is_eager()) {
+    // NOTE(fishx): Only set cancellation_manager in eager mode. Because in
+    // Tensorflow 1.x, session (or graph_mgr) will abort the underlying
+    // rendezvous if it encounters any error.
+    args.cancellation_manager = ctx->cancellation_manager();
+  }
+
+  Rendezvous::ParsedKey parsed_key;
+  Tensor val_t;
+  slice_sendrecv::GetSliceRendezvousKey(key_prefix_, name, frame_iter,
+                                        &parsed_key.buf_);
+  VLOG(2) << "FileSliceRecv " << parsed_key.buf_;
+  TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+  TF_RETURN_IF_ERROR(
+      ctx->rendezvous()->Recv(parsed_key, args, &val_t, &is_dead, timeout_ms_));
+  if (!is_dead) {
+    val = val_t.scalar<uint64>()();
+  }
+
+  return Status::OK();
+}
+
+Status FileSliceRecvOp::RecvTotalBytes(OpKernelContext* ctx,
+                                       const FrameAndIter& frame_iter,
+                                       bool& is_dead, uint64& total_bytes) {
+  return RecvUInt64MetaMsg(ctx, frame_iter, "_slice_transfer_totalbytes",
+                           is_dead, total_bytes);
+}
+
+string FileSliceRecvOp::GenerateRecvFileName(const string& op_name) {
+  const std::vector<string> file_name_vec = absl::StrSplit(op_name, "/");
+  return absl::StrJoin(file_name_vec, "_");
+}
+
+Status FileSliceRecvOp::RecvShape(OpKernelContext* ctx,
+                                  const FrameAndIter& frame_iter,
+                                  TensorShape& shape) {
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = AllocatorAttributes();
+  if (ctx->is_eager()) {
+    // NOTE(fishx): Only set cancellation_manager in eager mode. Because in
+    // Tensorflow 1.x, session (or graph_mgr) will abort the underlying
+    // rendezvous if it encounters any error.
+    args.cancellation_manager = ctx->cancellation_manager();
+  }
+
+  Rendezvous::ParsedKey parsed_key;
+  slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape",
+                                        frame_iter, &parsed_key.buf_);
+  VLOG(2) << "FileSliceRecv " << parsed_key.buf_;
+  TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+
+  Tensor shape_t;
+  bool is_dead;
+  TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &shape_t,
+                                             &is_dead, timeout_ms_));
+  // This shouldn't be a dead tensor.
+  CHECK_EQ(is_dead, false);
+  auto shape_vec = shape_t.vec<int64>();
+  const int64 num_elements = shape_t.NumElements();
+  for (int64 i = 0; i < num_elements; i++) {
+    shape.AddDim(shape_vec(i));
+  }
+
+  return Status::OK();
+}
+
+Status FileSliceRecvOp::RecvElementBytes(OpKernelContext* ctx,
+                                         const FrameAndIter& frame_iter,
+                                         uint64& element_bytes) {
+  bool is_dead = false;
+  Status s = \
+    RecvUInt64MetaMsg(ctx, frame_iter, "_slice_transfer_elements_bytes",
+                      is_dead, element_bytes);
+  CHECK_EQ(is_dead, false);
+
+  return s;
+}
+
+Status FileSliceRecvOp::RecvFile(OpKernelContext* ctx,
+                                 const FrameAndIter& frame_iter,
+                                 const string& file_path) {
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = ctx->output_alloc_attr(0);
+  if (ctx->is_eager()) {
+    // NOTE(fishx): Only set cancellation_manager in eager mode. Because in
+    // Tensorflow 1.x, session (or graph_mgr) will abort the underlying
+    // rendezvous if it encounters any error.
+    args.cancellation_manager = ctx->cancellation_manager();
+  }
+
+  Rendezvous::ParsedKey parsed_key;
+  slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_transfer_data",
+                                        frame_iter, &parsed_key.buf_);
+  VLOG(2) << "FileSliceRecv " << parsed_key.buf_;
+  TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+  Tensor data_t;
+  bool is_dead = false;
+  TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t,
+                                             &is_dead, timeout_ms_));
+
+  // This shouldn't be a dead tensor.
+  CHECK_EQ(is_dead, false);
+
+  // Write data_t to file.
+  Env* env = Env::Default();
+  return WriteStringToFile(env, file_path, data_t.scalar<tstring>()());
+}
+
+Status FileSliceRecvOp::RecvFileSlice(OpKernelContext* ctx,
+                                      const FrameAndIter& frame_iter,
+                                      const uint64 element_bytes,
+                                      const string& file_path) {
+  // create file
+  Env* env = Env::Default();
+  std::unique_ptr<WritableFile> file_ptr;
+  TF_RETURN_IF_ERROR(env->NewWritableFile(file_path, &file_ptr));
+
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = ctx->output_alloc_attr(0);
+  if (ctx->is_eager()) {
+    // NOTE(fishx): Only set cancellation_manager in eager mode. Because in
+    // Tensorflow 1.x, session (or graph_mgr) will abort the underlying
+    // rendezvous if it encounters any error.
+    args.cancellation_manager = ctx->cancellation_manager();
+  }
+  Rendezvous::ParsedKey parsed_key;
+
+  int64 slice_num = element_bytes / slice_size_;
+  if (element_bytes % slice_size_ != 0) {
+    slice_num += 1;
+  }
+  Tensor data_t;
+  bool is_dead = false;
+  for (int64 i = 0; i < slice_num; i++) {
+    std::string tensor_name_suffix = \
+      strings::StrCat("_slice_transfer_data_", std::to_string(0), "_",
+                      std::to_string(i));
+    slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix,
+                                          frame_iter, &parsed_key.buf_);
+    VLOG(2) << "FileSliceRecv " << parsed_key.buf_;
+    TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
+    TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t,
+                                               &is_dead, timeout_ms_));
+    // This shouldn't be a dead tensor.
+    CHECK_EQ(is_dead, false);
+    TF_RETURN_IF_ERROR(file_ptr->Append(data_t.scalar<tstring>()()));
+  }
+
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("_FileSliceRecv").Device(DEVICE_CPU),
+                        FileSliceRecvOp);
+REGISTER_KERNEL_BUILDER(Name("_FileSliceRecv").Device(DEVICE_DEFAULT),
+                        FileSliceRecvOp);
+
+} // End of namespace tensorflow
diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops.h b/tensorflow/core/kernels/file_slice_sendrecv_ops.h
new file mode 100644
index 00000000000..6701196d481
--- /dev/null
+++ b/tensorflow/core/kernels/file_slice_sendrecv_ops.h
@@ -0,0 +1,98 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_FILE_SLICE_SENDRECV_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_FILE_SLICE_SENDRECV_OPS_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+class FileSliceSendOp : public OpKernel {
+ public:
+  explicit FileSliceSendOp(OpKernelConstruction* ctx);
+  void Compute(OpKernelContext* ctx) override;
+
+ private:
+  // Variables.
+  string key_prefix_;
+  bool hostmem_sendrecv_;
+  int32 slice_size_;
+
+  // Functions.
+  Status GetElementBytes(OpKernelContext* ctx, const Tensor& file_path_t,
+                         uint64& element_bytes);
+
+  Status SendUInt64MetaMsg(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                           const string& name, const uint64 val);
+
+  Status SendTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                        const uint64 total_bytes);
+
+  Status SendScalarShape(OpKernelContext* ctx, const FrameAndIter& frame_iter);
+
+  Status SendElementBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                          const uint64 element_bytes);
+
+  Status SendFileSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                       const Tensor& file_path_t, const uint64 element_bytes);
+
+  Status ReadFileSlice(const std::unique_ptr<RandomAccessFile>& file,
+                       const uint64 pos, const uint64 offset, Tensor& data_t);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FileSliceSendOp);
+};
+
+class FileSliceRecvOp : public OpKernel {
+ public:
+  explicit FileSliceRecvOp(OpKernelConstruction* ctx);
+  void Compute(OpKernelContext* ctx) override;
+
+ private:
+  // Variables.
+  string key_prefix_;
+  bool hostmem_sendrecv_;
+  string recv_dir_;
+  int32 slice_size_;
+  int64 timeout_ms_;
+
+  // Functions.
+  Status RecvUInt64MetaMsg(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                           const string& name, bool& is_dead, uint64& val);
+
+  Status RecvTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                        bool& is_dead, uint64& total_bytes);
+
+  string GenerateRecvFileName(const string& op_name);
+
+  Status RecvFile(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                  const string& file_path);
+
+  Status RecvShape(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                   TensorShape& shape);
+
+  Status RecvElementBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                          uint64& element_bytes);
+
+  Status RecvFileSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter,
+                       const uint64 element_bytes, const string& file_path);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FileSliceRecvOp);
+};
+
+}; // End of namespace tensorflow
+
+#endif // End of macro TENSORFLOW_CORE_KERNELS_FILE_SLICE_SENDRECV_OPS_H_
diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc b/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc
new file mode 100644
index 00000000000..931cd152253
--- /dev/null
+++ b/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc
@@ -0,0 +1,483 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+namespace {
+// Implement a trivial version of the Rendezvous interface, to avoid
+// clouding the benchmark results with the time spent in the various
+// implementations, and to avoid the duplicate-send or duplicate-recv
+// errors that would arise from running either benchmark in a loop.
+class DummyRendezvous : public Rendezvous {
+  // Functions.
+  Status Send(const ParsedKey& key, const Args& args, const Tensor& val,
+              const bool is_dead) override {
+    std::string key_str = { key.FullKey().data(), key.FullKey().size() };
+    mutex_lock l(mu_);
+    // The consumer has not arrived yet.
+    if (kv_.count(key_str) == 0) {
+      struct Var var;
+      var.type = send;
+      var.args = args;
+      var.data = val;
+      var.is_dead = is_dead;
+
+      kv_[key_str] = var;
+      return Status::OK();
+    }
+
+    auto var = kv_[key_str];
+    CHECK_EQ(var.type, recv);
+    var.done(Status::OK(), args, var.args, val, is_dead);
+    kv_.erase(key_str);
+    return Status::OK();
+  }
+  void RecvAsync(const ParsedKey& key, const Args& args,
+                 DoneCallback done) override {
+    std::string key_str = { key.FullKey().data(), key.FullKey().size() };
+
+    mutex_lock l(mu_);
+    // The producer has not arrived yet.
+    if (kv_.count(key_str) == 0) {
+      struct Var var;
+      var.type = recv;
+      var.args = args;
+      var.done = done;
+
+      kv_[key_str] = var;
+      return;
+    }
+
+    auto var = kv_[key_str];
+    CHECK_EQ(var.type, send);
+    done(Status::OK(), var.args, args, var.data, var.is_dead);
+    kv_.erase(key_str);
+  }
+  void StartAbort(const Status& status) override {}
+
+ private:
+  enum RendezvousType {
+    send,
+    recv
+  };
+  // Type define.
+  struct Var {
+    RendezvousType type;
+    Args args;
+    Tensor data;
+    bool is_dead;
+    DoneCallback done;
+  };
+
+  // Variables.
+  mutex mu_;
+  std::unordered_map<std::string, Var> kv_ GUARDED_BY(mu_);
+};
+
+//------------------------------------------------------------------------------
+// Utils.
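+// The helpers below are thin NodeBuilder wrappers used to assemble the
+// benchmark graphs; each one sets only the attrs declared by the
+// corresponding op registration (see file_slice_sendrecv_ops.cc and
+// slice_sendrecv_ops.cc).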
+Node* FileSliceSend(Graph* g, Node* filename, const string& tensor,
+                    const string& sender, const uint64 sender_incarnation,
+                    const string& receiver, const int32 slice_size) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("FileSliceSend"), "_FileSliceSend")
+                  .Input(filename, 0)
+                  .Attr("tensor_name", tensor)
+                  .Attr("send_device", sender)
+                  .Attr("send_device_incarnation",
+                        static_cast<int64>(sender_incarnation))
+                  .Attr("recv_device", receiver)
+                  .Attr("slice_size", slice_size)
+                  .Finalize(g, &ret));
+
+  return ret;
+}
+
+Node* FileSliceRecv(Graph* g, const string& tensor, const string& sender,
+                    const uint64 sender_incarnation, const string& receiver,
+                    const string& recv_dir, const int32 slice_size,
+                    const int64 timeout_ms) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("FileSliceRecv"), "_FileSliceRecv")
+                  .Attr("tensor_name", tensor)
+                  .Attr("send_device", sender)
+                  .Attr("send_device_incarnation",
+                        static_cast<int64>(sender_incarnation))
+                  .Attr("recv_device", receiver)
+                  .Attr("recv_dir", recv_dir)
+                  .Attr("slice_size", slice_size)
+                  .Attr("timeout_ms", timeout_ms)
+                  .Finalize(g, &ret));
+
+  return ret;
+}
+
+Node* SliceSend(Graph* g, Node* input, const string& tensor,
+                const string& sender, const uint64 sender_incarnation,
+                const string& receiver, const int32 slice_size) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_SliceSend")
+                  .Input(input, 0)
+                  .Attr("tensor_name", tensor)
+                  .Attr("send_device", sender)
+                  .Attr("send_device_incarnation",
+                        static_cast<int64>(sender_incarnation))
+                  .Attr("recv_device", receiver)
+                  .Attr("slice_size", slice_size)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
+Node* SliceRecv(Graph* g, const string& tensor, const string& sender,
+                const uint64 sender_incarnation, const string& receiver,
+                const int32 slice_size, const int64 timeout_ms) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_SliceRecv")
+                  .Attr("tensor_type", DT_STRING)
+                  .Attr("tensor_name", tensor)
+                  .Attr("send_device", sender)
+                  .Attr("send_device_incarnation",
+                        static_cast<int64>(sender_incarnation))
+                  .Attr("recv_device", receiver)
+                  .Attr("slice_size", slice_size)
+                  .Attr("timeout_ms", timeout_ms)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
+Node* ReadFile(Graph* g, Node* filename) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("ReadFile"), "ReadFile")
+                  .Input(filename, 0)
+                  .Finalize(g, &ret));
+
+  return ret;
+}
+
+Node* WriteFile(Graph* g, Node* filename, Node* contents) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("WriteFile"), "WriteFile")
+                  .Input(filename, 0)
+                  .Input(contents, 0)
+                  .Finalize(g, &ret));
+
+  return ret;
+}
+
+Node* Equal(Graph* g, Node* x, Node* y) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("Equal"), "Equal")
+                  .Input(x)
+                  .Input(y)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
+Node* Assert(Graph* g, Node* condition,
+             std::vector<NodeBuilder::NodeOut>& data) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Assert")
+                  .Input(condition)
+                  .Input(data)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
+//------------------------------------------------------------------------------
+// Graph Constructor.
+
+static Graph* TransferFile(const std::string& test_type,
+                           const int32 slice_size) {
+  Graph* g = new Graph(OpRegistry::Global());
+  const int64 timeout_ms = 5000;
+  std::string recv_dir = "/tmp/FileSliceTransferTestRecv";
+  std::string filename = "/tmp/FileSliceTransferTestSend/send_" + test_type;
+  std::string contents = \
+    "The quick brown fox jumps over the lazy dog."; // 44 chars.
+
+  // send filename node.
+  Tensor filename_t(DT_STRING, TensorShape({}));
+  filename_t.scalar<tstring>().setConstant(filename);
+  Node* filename_n = test::graph::Constant(g, filename_t);
+
+  // contents node.
+  Tensor contents_t(DT_STRING, TensorShape({}));
+  contents_t.scalar<tstring>().setConstant(contents);
+  Node* contents_n = test::graph::Constant(g, contents_t);
+
+  Node* write_file_n = WriteFile(g, filename_n, contents_n);
+  Node* send_n = \
+    FileSliceSend(g, filename_n, test_type, "/cpu:0", 1, "/cpu:0", slice_size);
+  g->AddControlEdge(write_file_n, send_n);
+
+  Node* recv_n = FileSliceRecv(g, test_type, "/cpu:0", 1, "/cpu:0", recv_dir,
+                               slice_size, timeout_ms);
+  Node* read_file_n = ReadFile(g, recv_n);
+  Node* equal_n = Equal(g, contents_n, read_file_n);
+
+  std::vector<NodeBuilder::NodeOut> data_out;
+  data_out.emplace_back(contents_n, 0);
+  data_out.emplace_back(read_file_n, 0);
+  Assert(g, equal_n, data_out);
+
+  return g;
+}
+
+static Graph* FileSliceSendTransferFileToSliceRecv(const std::string& test_type,
+                                                   const int32 slice_size) {
+  Graph* g = new Graph(OpRegistry::Global());
+  const int64 timeout_ms = 5000;
+  std::string filename = "/tmp/FileSliceTransferTestSend/send_" + test_type;
+  std::string contents = \
+    "The quick brown fox jumps over the lazy dog."; // 44 chars.
+
+  // send filename node.
+  Tensor filename_t(DT_STRING, TensorShape({}));
+  filename_t.scalar<tstring>().setConstant(filename);
+  Node* filename_n = test::graph::Constant(g, filename_t);
+
+  // contents node.
+  Tensor contents_t(DT_STRING, TensorShape({}));
+  contents_t.scalar<tstring>().setConstant(contents);
+  Node* contents_n = test::graph::Constant(g, contents_t);
+
+  Node* write_file_n = WriteFile(g, filename_n, contents_n);
+  Node* send_n = \
+    FileSliceSend(g, filename_n, test_type, "/cpu:0", 1, "/cpu:0", slice_size);
+  g->AddControlEdge(write_file_n, send_n);
+
+  Node* recv_n = \
+    SliceRecv(g, test_type, "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms);
+  Node* equal_n = Equal(g, contents_n, recv_n);
+
+  std::vector<NodeBuilder::NodeOut> data_out;
+  data_out.emplace_back(contents_n, 0);
+  data_out.emplace_back(recv_n, 0);
+  Assert(g, equal_n, data_out);
+
+  return g;
+}
+
+static Graph* SliceSendTransferFileToFileSliceRecv(const std::string& test_type,
+                                                   const int32 slice_size) {
+  Graph* g = new Graph(OpRegistry::Global());
+  const int64 timeout_ms = 5000;
+  std::string recv_dir = "/tmp/FileSliceTransferTestRecv";
+  std::string contents = \
+    "The quick brown fox jumps over the lazy dog."; // 44 chars.
+
+  // contents node.
+  Tensor contents_t(DT_STRING, TensorShape({}));
+  contents_t.scalar<tstring>().setConstant(contents);
+  Node* contents_n = test::graph::Constant(g, contents_t);
+
+  Node* send_n = \
+    SliceSend(g, contents_n, test_type, "/cpu:0", 1, "/cpu:0", slice_size);
+
+  Node* recv_n = FileSliceRecv(g, test_type, "/cpu:0", 1, "/cpu:0", recv_dir,
+                               slice_size, timeout_ms);
+  Node* read_file_n = ReadFile(g, recv_n);
+  Node* equal_n = Equal(g, contents_n, read_file_n);
+
+  std::vector<NodeBuilder::NodeOut> data_out;
+  data_out.emplace_back(contents_n, 0);
+  data_out.emplace_back(read_file_n, 0);
+  Assert(g, equal_n, data_out);
+
+  return g;
+}
+
+static Graph* TransferDeadTensor() {
+  Graph* g = new Graph(OpRegistry::Global());
+  const int32 slice_size = 1024;
+  const int64 timeout_ms = 5000;
+  std::string recv_dir = "/tmp/FileSliceTransferTestRecv";
+  std::string filename = "/tmp/FileSliceTransferTestSend/send_dead_tensor";
+
+  // val
+  Tensor val_t(DT_STRING, TensorShape({}));
+  val_t.scalar<tstring>()() = filename;
+  Node* val_n = test::graph::Constant(g, val_t);
+
+  Tensor pred_t(DT_BOOL, TensorShape({}));
+  pred_t.scalar<bool>()() = true;
+  Node* pred_n = test::graph::Constant(g, pred_t);
+
+  Node* switch_n = test::graph::Switch(g, val_n, pred_n);
+  FileSliceSend(g, switch_n, "dead_tensor", "/cpu:0", 1, "/cpu:0", slice_size);
+  FileSliceRecv(g, "dead_tensor", "/cpu:0", 1, "/cpu:0", recv_dir, slice_size,
+                timeout_ms);
+
+  return g;
+}
+
+static Graph* FileSliceSendTransferDeadTensorToSliceRecv() {
+  Graph* g = new Graph(OpRegistry::Global());
+  const int32 slice_size = 1024;
+  const int64 timeout_ms = 5000;
+  std::string recv_dir = "/tmp/FileSliceTransferTestRecv";
+  std::string filename = "/tmp/FileSliceTransferTestSend/send_dead_tensor";
+
+  // val
+  Tensor val_t(DT_STRING, TensorShape({}));
+  val_t.scalar<tstring>()() = filename;
+  Node* val_n = test::graph::Constant(g, val_t);
+
+  Tensor pred_t(DT_BOOL, TensorShape({}));
+  pred_t.scalar<bool>()() = true;
+  Node* pred_n = test::graph::Constant(g, pred_t);
+
+  Node* switch_n = test::graph::Switch(g, val_n, pred_n);
+  FileSliceSend(g, switch_n, "dead_tensor", "/cpu:0", 1, "/cpu:0", slice_size);
+  SliceRecv(g, "dead_tensor", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms);
+
+  return g;
+}
+
+static Graph* SliceSendTransferDeadTensorToFileSliceRecv() {
+  Graph* g = new Graph(OpRegistry::Global());
+  const int32 slice_size = 1024;
+  const int64 timeout_ms = 5000;
+  std::string recv_dir = "/tmp/FileSliceTransferTestRecv";
+  std::string contents = \
+    "The quick brown fox jumps over the lazy dog."; // 44 chars.
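+
+  // Since the predicate below is constant true, output port 0 of the Switch
+  // (the false branch that feeds the send) is produced dead, so this graph
+  // exercises the dead-tensor transfer path end to end.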
+
+  // val
+  Tensor val_t(DT_STRING, TensorShape({}));
+  val_t.scalar<tstring>()() = contents;
+  Node* val_n = test::graph::Constant(g, val_t);
+
+  Tensor pred_t(DT_BOOL, TensorShape({}));
+  pred_t.scalar<bool>()() = true;
+  Node* pred_n = test::graph::Constant(g, pred_t);
+
+  Node* switch_n = test::graph::Switch(g, val_n, pred_n);
+  SliceSend(g, switch_n, "dead_tensor", "/cpu:0", 1, "/cpu:0", slice_size);
+  FileSliceRecv(g, "dead_tensor", "/cpu:0", 1, "/cpu:0", recv_dir, slice_size,
+                timeout_ms);
+
+  return g;
+}
+
+static Graph* TransferSmallFile() {
+  return TransferFile("small_file", 1024);
+}
+
+static Graph* TransferBigFile() {
+  return TransferFile("big_file", 16);
+}
+
+static Graph* FileSliceSendTransferSmallFileToSliceRecv() {
+  return FileSliceSendTransferFileToSliceRecv("small_file", 1024);
+}
+
+static Graph* FileSliceSendTransferBigFileToSliceRecv() {
+  return FileSliceSendTransferFileToSliceRecv("big_file", 16);
+}
+
+static Graph* SliceSendTransferSmallFileToFileSliceRecv() {
+  return SliceSendTransferFileToFileSliceRecv("small_file", 1024);
+}
+
+static Graph* SliceSendTransferBigFileToFileSliceRecv() {
+  return SliceSendTransferFileToFileSliceRecv("big_file", 16);
+}
+
+//------------------------------------------------------------------------------
+// Test Function.
+
+static void BM_TransferSmallFile(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", TransferSmallFile(), nullptr, nullptr,
+                  new DummyRendezvous).Run(iters);
+}
+
+static void BM_TransferBigFile(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", TransferBigFile(), nullptr, nullptr,
+                  new DummyRendezvous).Run(iters);
+}
+
+static void BM_FileSliceSendTransferSmallFileToSliceRecv(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", FileSliceSendTransferSmallFileToSliceRecv(), nullptr,
+                  nullptr, new DummyRendezvous).Run(iters);
+}
+
+static void BM_FileSliceSendTransferBigFileToSliceRecv(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", FileSliceSendTransferBigFileToSliceRecv(), nullptr,
+                  nullptr, new DummyRendezvous).Run(iters);
+}
+
+static void BM_SliceSendTransferSmallFileToFileSliceRecv(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", SliceSendTransferSmallFileToFileSliceRecv(), nullptr,
+                  nullptr, new DummyRendezvous).Run(iters);
+}
+
+static void BM_SliceSendTransferBigFileToFileSliceRecv(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", SliceSendTransferBigFileToFileSliceRecv(), nullptr,
+                  nullptr, new DummyRendezvous).Run(iters);
+}
+
+static void BM_TransferDeadTensor(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", TransferDeadTensor(), nullptr, nullptr,
+                  new DummyRendezvous).Run(iters);
+}
+
+static void BM_FileSliceSendTransferDeadTensorToSliceRecv(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", FileSliceSendTransferDeadTensorToSliceRecv(), nullptr,
+                  nullptr, new DummyRendezvous).Run(iters);
+}
+
+static void BM_SliceSendTransferDeadTensorToFileSliceRecv(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", SliceSendTransferDeadTensorToFileSliceRecv(), nullptr,
+                  nullptr,
+                  new DummyRendezvous).Run(iters);
+}
+
+BENCHMARK(BM_TransferSmallFile);
+BENCHMARK(BM_TransferBigFile);
+BENCHMARK(BM_FileSliceSendTransferSmallFileToSliceRecv);
+BENCHMARK(BM_FileSliceSendTransferBigFileToSliceRecv);
+BENCHMARK(BM_SliceSendTransferSmallFileToFileSliceRecv);
+BENCHMARK(BM_SliceSendTransferBigFileToFileSliceRecv);
+BENCHMARK(BM_TransferDeadTensor);
+BENCHMARK(BM_FileSliceSendTransferDeadTensorToSliceRecv);
+BENCHMARK(BM_SliceSendTransferDeadTensorToFileSliceRecv);
+
+} // End of anonymous namespace
+
+} // End of namespace tensorflow
diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.cc b/tensorflow/core/kernels/slice_sendrecv_ops.cc
index f09f314ae10..25f1a4e8738 100644
--- a/tensorflow/core/kernels/slice_sendrecv_ops.cc
+++ b/tensorflow/core/kernels/slice_sendrecv_ops.cc
@@ -14,41 +14,10 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/slice_sendrecv_ops.h"
+#include "tensorflow/core/kernels/slice_sendrecv_utils.h"
 
 namespace tensorflow {
 
-//------------------------------------------------------------------------------
-// Utils.
-static string GetSliceRendezvousKeyPrefix(const string& send_device,
-                                          const string& recv_device,
-                                          const uint64 send_device_incarnation,
-                                          const string& tensor_name) {
-  return strings::StrCat(send_device, ";",
-                         strings::FpToString(send_device_incarnation), ";",
-                         recv_device, ";", tensor_name);
-}
-
-static void GetSliceRendezvousKey(const string& key_prefix,
-                                  const string& tensor_name_suffix,
-                                  const FrameAndIter& frame_iter, string* key) {
-  key->clear();
-  strings::StrAppend(key, key_prefix, tensor_name_suffix, ";",
-                     frame_iter.frame_id, ":", frame_iter.iter_id);
-}
-
-static FrameAndIter GetFrameAndIter(OpKernelContext* ctx,
-                                    bool hostmem_sendrecv) {
-  if (hostmem_sendrecv && ctx->call_frame() != nullptr) {
-    // Host memory send/recv pairs are added by
-    // common_runtime/memory_types.cc. When the pair of nodes are
-    // added inside a function, we need to use the function call frame
-    // to formulate the unique rendezvous key.
-    return FrameAndIter(reinterpret_cast<uint64>(ctx->call_frame()), 0);
-  } else {
-    return ctx->frame_iter();
-  }
-}
-
 //------------------------------------------------------------------------------
 // Functions of SliceSendOp.
 
@@ -64,8 +33,9 @@ SliceSendOp::SliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
   string tensor_name;
   OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
   key_prefix_ = \
-    GetSliceRendezvousKeyPrefix(send_device, recv_device,
-                                send_device_incarnation, tensor_name);
+    slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device,
+      recv_device, send_device_incarnation, tensor_name);
+
   if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) {
     hostmem_sendrecv_ = false;
   }
@@ -79,7 +49,8 @@ void SliceSendOp::Compute(OpKernelContext* ctx) {
     errors::Internal("Op kernel context needs to provide a rendezvous."));
 
   const Tensor& input_t = ctx->input(0);
-  FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_);
+  FrameAndIter frame_iter = \
+    slice_sendrecv::GetFrameAndIter(ctx, hostmem_sendrecv_);
 
   // send total_bytes.
   OP_REQUIRES_OK(ctx, SendTotalBytes(ctx, frame_iter, input_t));
@@ -95,8 +66,8 @@ void SliceSendOp::Compute(OpKernelContext* ctx) {
   args.alloc_attrs = ctx->input_alloc_attr(0);
 
   Rendezvous::ParsedKey parsed_key;
-  GetSliceRendezvousKey(key_prefix_, "_transfer_data", frame_iter,
-                        &parsed_key.buf_);
+  slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_transfer_data",
+                                        frame_iter, &parsed_key.buf_);
   VLOG(2) << "SliceSend " << parsed_key.buf_;
   OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
   OP_REQUIRES_OK(ctx, ctx->rendezvous()->Send(parsed_key, args, input_t,
@@ -124,11 +95,11 @@ Status SliceSendOp::SendTotalBytes(OpKernelContext* ctx,
   Rendezvous::ParsedKey parsed_key;
 
   Tensor total_bytes_t;
-  TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, TensorShape({}),
+  TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_UINT64, TensorShape({}),
                                         &total_bytes_t));
-  total_bytes_t.scalar<int64>()() = input_t.TotalBytes();
-  GetSliceRendezvousKey(key_prefix_, "_slice_transfer_totalbytes", frame_iter,
-                        &parsed_key.buf_);
+  total_bytes_t.scalar<uint64>()() = input_t.TotalBytes();
+  slice_sendrecv::GetSliceRendezvousKey(key_prefix_,
+    "_slice_transfer_totalbytes", frame_iter, &parsed_key.buf_);
   VLOG(2) << "SliceSend " << parsed_key.buf_;
   TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
   return ctx->rendezvous()->Send(parsed_key, args, total_bytes_t,
@@ -152,8 +123,8 @@ Status SliceSendOp::SendShape(OpKernelContext* ctx,
   for (int i = 0; i < rank; i++) {
     shape_vec(i) = shape.dim_size(i);
   }
-  GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", frame_iter,
-                        &parsed_key.buf_);
+  slice_sendrecv::GetSliceRendezvousKey(key_prefix_,
+    "_slice_transfer_shape", frame_iter, &parsed_key.buf_);
   VLOG(2) << "SliceSend " << parsed_key.buf_;
   TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
   return ctx->rendezvous()->Send(parsed_key, args, shape_t,
@@ -168,21 +139,21 @@ Status SliceSendOp::SendString(OpKernelContext* ctx,
   args.alloc_attrs = AllocatorAttributes();
   Rendezvous::ParsedKey parsed_key;
 
-  // send elements size.
-  Tensor elements_size_t;
-  TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, input_t.shape(),
-                                        &elements_size_t));
+  // send elements bytes.
+  Tensor elements_bytes_t;
+  TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_UINT64, input_t.shape(),
+                                        &elements_bytes_t));
   int64 num_elements = input_t.NumElements();
   auto input_flat = input_t.flat<tstring>();
-  auto elements_size_flat = elements_size_t.flat<int64>();
+  auto elements_bytes_flat = elements_bytes_t.flat<uint64>();
   for (int64 i = 0; i < num_elements; i++) {
-    elements_size_flat(i) = input_flat(i).size();
+    elements_bytes_flat(i) = input_flat(i).size();
   }
-  GetSliceRendezvousKey(key_prefix_, "_slice_transfer_elements_size",
-                        frame_iter, &parsed_key.buf_);
+  slice_sendrecv::GetSliceRendezvousKey(key_prefix_,
+    "_slice_transfer_elements_bytes", frame_iter, &parsed_key.buf_);
   VLOG(2) << "SliceSend " << parsed_key.buf_;
   TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
-  TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, elements_size_t,
+  TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, elements_bytes_t,
                                              ctx->is_input_dead()));
 
   // send data.
@@ -196,8 +167,8 @@ Status SliceSendOp::SendString(OpKernelContext* ctx,
     data_t.scalar<tstring>()() = elem;
     std::string tensor_name_suffix = \
       strings::StrCat("_slice_transfer_data_", std::to_string(i));
-    GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter,
-                          &parsed_key.buf_);
+    slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix,
+                                          frame_iter, &parsed_key.buf_);
     VLOG(2) << "SliceSend " << parsed_key.buf_;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
     TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t,
@@ -218,7 +189,10 @@ Status SliceSendOp::SendStringSlice(OpKernelContext* ctx,
   args.alloc_attrs = ctx->input_alloc_attr(0);
   Rendezvous::ParsedKey parsed_key;
 
-  int64 slice_num = (elem.size() + slice_size_ - 1) / slice_size_;
+  int64 slice_num = elem.size() / slice_size_;
+  if (elem.size() % slice_size_ != 0) {
+    slice_num += 1;
+  }
   Tensor data_t;
   for (int64 i = 0; i < slice_num; i++) {
     TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_STRING, TensorShape({}), &data_t));
@@ -231,8 +205,8 @@ Status SliceSendOp::SendStringSlice(OpKernelContext* ctx,
     std::string tensor_name_suffix = \
       strings::StrCat("_slice_transfer_data_", std::to_string(index), "_",
                       std::to_string(i));
-    GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter,
-                          &parsed_key.buf_);
+    slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix,
+                                          frame_iter, &parsed_key.buf_);
     VLOG(2) << "SliceSend " << parsed_key.buf_;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
     TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t,
@@ -252,12 +226,15 @@ Status SliceSendOp::SendBasicType(OpKernelContext* ctx,
 
   // send data.
   Tensor data_t;
-  int64 bytes_num = input_t.TotalBytes();
-  int64 slice_num = (bytes_num + slice_size_ - 1) / slice_size_;
+  size_t bytes_num = input_t.TotalBytes();
+  int64 slice_num = bytes_num / slice_size_;
+  if (bytes_num % slice_size_ != 0) {
+    slice_num += 1;
+  }
   unsigned char* input_base = reinterpret_cast<unsigned char*>(input_t.data());
   for (int64 i = 0; i < slice_num; i++) {
-    int64 start = i * slice_size_;
-    int64 copy_size = slice_size_;
+    size_t start = i * slice_size_;
+    size_t copy_size = slice_size_;
     if (start > bytes_num - slice_size_) {
       copy_size = bytes_num - start;
     }
@@ -267,8 +244,8 @@ Status SliceSendOp::SendBasicType(OpKernelContext* ctx,
     std::memcpy(data_base, input_base+start, copy_size);
     std::string tensor_name_suffix = \
       strings::StrCat("_slice_transfer_data_", std::to_string(i));
-    GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter,
-                          &parsed_key.buf_);
+    slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix,
+                                          frame_iter, &parsed_key.buf_);
     VLOG(2) << "SliceSend " << parsed_key.buf_;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
     TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t,
@@ -296,8 +273,8 @@ SliceRecvOp::SliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
   string tensor_name;
   OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
   key_prefix_ = \
-    GetSliceRendezvousKeyPrefix(send_device, recv_device,
-                                send_device_incarnation, tensor_name);
+    slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device,
+      recv_device, send_device_incarnation, tensor_name);
   if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) {
     hostmem_sendrecv_ = false;
   }
@@ -311,11 +288,12 @@ void SliceRecvOp::Compute(OpKernelContext* ctx) {
     ctx, ctx->rendezvous() != nullptr,
     errors::Internal("Op kernel context needs to provide a rendezvous."));
 
-  FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_);
+  FrameAndIter frame_iter = \
+    slice_sendrecv::GetFrameAndIter(ctx, hostmem_sendrecv_);
   bool is_dead;
 
   // recv total_bytes.
-  int64 total_bytes;
+  uint64 total_bytes;
   OP_REQUIRES_OK(ctx, RecvTotalBytes(ctx, frame_iter, is_dead, total_bytes));
   if (is_dead) {
     return;
@@ -334,8 +312,8 @@ void SliceRecvOp::Compute(OpKernelContext* ctx) {
   }
 
   Rendezvous::ParsedKey parsed_key;
-  GetSliceRendezvousKey(key_prefix_, "_transfer_data", frame_iter,
-                        &parsed_key.buf_);
+  slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_transfer_data",
+                                        frame_iter, &parsed_key.buf_);
   VLOG(2) << "SliceRecv " << parsed_key.buf_;
   OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
   Tensor data_t;
@@ -364,7 +342,7 @@ void SliceRecvOp::Compute(OpKernelContext* ctx) {
 
 Status SliceRecvOp::RecvTotalBytes(OpKernelContext* ctx,
                                    const FrameAndIter& frame_iter,
-                                   bool& is_dead, int64& total_bytes) {
+                                   bool& is_dead, uint64& total_bytes) {
   Rendezvous::Args args;
   args.device_context = ctx->op_device_context();
   args.alloc_attrs = AllocatorAttributes();
@@ -377,14 +355,14 @@ Status SliceRecvOp::RecvTotalBytes(OpKernelContext* ctx,
   Rendezvous::ParsedKey parsed_key;
 
   Tensor total_bytes_t;
-  GetSliceRendezvousKey(key_prefix_, "_slice_transfer_totalbytes", frame_iter,
-                        &parsed_key.buf_);
+  slice_sendrecv::GetSliceRendezvousKey(key_prefix_,
+    "_slice_transfer_totalbytes", frame_iter, &parsed_key.buf_);
   VLOG(2) << "SliceRecv " << parsed_key.buf_;
   TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
   TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &total_bytes_t,
                                              &is_dead, timeout_ms_));
   if (!is_dead) {
-    total_bytes = total_bytes_t.scalar<int64>()();
+    total_bytes = total_bytes_t.scalar<uint64>()();
   }
 
   return Status::OK();
@@ -404,8 +382,8 @@ Status SliceRecvOp::RecvShape(OpKernelContext* ctx,
   }
 
   Rendezvous::ParsedKey parsed_key;
-  GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", frame_iter,
-                        &parsed_key.buf_);
+  slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape",
+                                        frame_iter, &parsed_key.buf_);
   VLOG(2) << "SliceRecv " << parsed_key.buf_;
   TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
 
@@ -439,27 +417,27 @@ Status SliceRecvOp::RecvString(OpKernelContext* ctx,
   Rendezvous::ParsedKey parsed_key;
   bool is_dead;
 
-  // recv elements size.
-  GetSliceRendezvousKey(key_prefix_, "_slice_transfer_elements_size",
-                        frame_iter, &parsed_key.buf_);
+  // recv elements bytes.
+  slice_sendrecv::GetSliceRendezvousKey(key_prefix_,
+    "_slice_transfer_elements_bytes", frame_iter, &parsed_key.buf_);
   VLOG(2) << "SliceRecv " << parsed_key.buf_;
   TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
-  Tensor elements_size_t;
-  TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &elements_size_t,
+  Tensor elements_bytes_t;
+  TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &elements_bytes_t,
                                              &is_dead, timeout_ms_));
 
   // This shouldn't be a dead tensor.
   CHECK_EQ(is_dead, false);
 
-  auto elements_size_flat = elements_size_t.flat<int64>();
+  auto elements_bytes_flat = elements_bytes_t.flat<uint64>();
   int64 num_elements = shape.num_elements();
   args.alloc_attrs = ctx->output_alloc_attr(0);
   Tensor data_t;
   auto output_flat = output_t->flat<tstring>();
   for (int64 i = 0; i < num_elements; i++) {
-    if (elements_size_flat(i) <= slice_size_) {
+    if (elements_bytes_flat(i) <= slice_size_) {
       std::string tensor_name_suffix = \
         strings::StrCat("_slice_transfer_data_", std::to_string(i));
-      GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter,
-                            &parsed_key.buf_);
+      slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix,
+                                            frame_iter, &parsed_key.buf_);
       VLOG(2) << "SliceRecv " << parsed_key.buf_;
       TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
       TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t,
@@ -469,7 +447,7 @@ Status SliceRecvOp::RecvString(OpKernelContext* ctx,
       output_flat(i) = data_t.scalar<tstring>()();
     } else {
       TF_RETURN_IF_ERROR(RecvStringSlice(ctx, frame_iter, i,
-                                         elements_size_flat(i), output_flat));
+                                         elements_bytes_flat(i), output_flat));
     }
   }
 
@@ -478,7 +456,8 @@ Status SliceRecvOp::RecvString(OpKernelContext* ctx,
 
 Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx,
                                     const FrameAndIter& frame_iter,
-                                    const int64 index, const int64 element_size,
+                                    const int64 index,
+                                    const uint64 element_bytes,
                                     TTypes<tstring>::Flat& output_flat) {
   Rendezvous::Args args;
   args.device_context = ctx->op_device_context();
@@ -491,15 +470,18 @@ Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx,
   }
 
   Rendezvous::ParsedKey parsed_key;
-  int64 slice_num = (element_size + slice_size_ - 1) / slice_size_;
+  int64 slice_num = element_bytes / slice_size_;
+  if (element_bytes % slice_size_ != 0) {
+    slice_num += 1;
+  }
   Tensor data_t;
   bool is_dead = false;
   for (int64 i = 0; i < slice_num; i++) {
     std::string tensor_name_suffix = \
       strings::StrCat("_slice_transfer_data_", std::to_string(index), "_",
                       std::to_string(i));
-    GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter,
-                          &parsed_key.buf_);
+    slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix,
+                                          frame_iter, &parsed_key.buf_);
     VLOG(2) << "SliceRecv " << parsed_key.buf_;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
     TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t,
@@ -514,7 +496,7 @@ Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx,
 
 Status SliceRecvOp::RecvBasicType(OpKernelContext* ctx,
                                   const FrameAndIter& frame_iter,
-                                  const int64 total_bytes,
+                                  const uint64 total_bytes,
                                   Tensor*& output_t) {
   Rendezvous::Args args;
   args.device_context = ctx->op_device_context();
@@ -529,19 +511,22 @@ Status SliceRecvOp::RecvBasicType(OpKernelContext* ctx,
 
   Tensor data_t;
   bool is_dead = false;
-  int64 slice_num = (total_bytes + slice_size_ - 1) / slice_size_;
+  int64 slice_num = total_bytes / slice_size_;
+  if (total_bytes % slice_size_ != 0) {
+    slice_num += 1;
+  }
   unsigned char* output_base = \
     reinterpret_cast<unsigned char*>(output_t->data());
   for (int64 i = 0; i < slice_num; i++) {
-    int64 start = i * slice_size_;
-    int64 copy_size = slice_size_;
+    uint64 start = i * slice_size_;
+    uint64 copy_size = slice_size_;
    if (start > total_bytes - slice_size_) {
       copy_size = total_bytes - start;
     }
     std::string tensor_name_suffix = \
       strings::StrCat("_slice_transfer_data_", std::to_string(i));
-    GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter,
-                          &parsed_key.buf_);
+    slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix,
+                                          frame_iter, &parsed_key.buf_);
     VLOG(2) << "SliceSend " << parsed_key.buf_;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
     TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t,
diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.h b/tensorflow/core/kernels/slice_sendrecv_ops.h
index df55c080aa1..43429bff32f 100644
--- a/tensorflow/core/kernels/slice_sendrecv_ops.h
+++ b/tensorflow/core/kernels/slice_sendrecv_ops.h
@@ -66,7 +66,7 @@ class SliceRecvOp : public OpKernel {
 
   // Functions.
   Status RecvTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter,
-                        bool& is_dead, int64& total_bytes);
+                        bool& is_dead, uint64& total_bytes);
 
   Status RecvShape(OpKernelContext* ctx, const FrameAndIter& frame_iter,
                    TensorShape& shape);
@@ -75,11 +75,11 @@ class SliceRecvOp : public OpKernel {
                     const TensorShape& shape, Tensor*& output_t);
 
   Status RecvStringSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter,
-                         const int64 index, const int64 element_size,
+                         const int64 index, const uint64 element_bytes,
                          TTypes<tstring>::Flat& output_flat);
 
   Status RecvBasicType(OpKernelContext* ctx, const FrameAndIter& frame_iter,
-                       const int64 total_bytes, Tensor*& output_t);
+                       const uint64 total_bytes, Tensor*& output_t);
 
   TF_DISALLOW_COPY_AND_ASSIGN(SliceRecvOp);
 };
diff --git a/tensorflow/core/kernels/slice_sendrecv_utils.cc b/tensorflow/core/kernels/slice_sendrecv_utils.cc
new file mode 100644
index 00000000000..56c2166c650
--- /dev/null
+++ b/tensorflow/core/kernels/slice_sendrecv_utils.cc
@@ -0,0 +1,53 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/slice_sendrecv_utils.h"
+
+namespace tensorflow {
+
+namespace slice_sendrecv {
+
+string GetSliceRendezvousKeyPrefix(const string& send_device,
+                                   const string& recv_device,
+                                   const uint64 send_device_incarnation,
+                                   const string& tensor_name) {
+  return strings::StrCat(send_device, ";",
+                         strings::FpToString(send_device_incarnation), ";",
+                         recv_device, ";", tensor_name);
+}
+
+void GetSliceRendezvousKey(const string& key_prefix,
+                           const string& tensor_name_suffix,
+                           const FrameAndIter& frame_iter, string* key) {
+  key->clear();
+  strings::StrAppend(key, key_prefix, tensor_name_suffix, ";",
+                     frame_iter.frame_id, ":", frame_iter.iter_id);
+}
+
+FrameAndIter GetFrameAndIter(OpKernelContext* ctx, bool hostmem_sendrecv) {
+  if (hostmem_sendrecv && ctx->call_frame() != nullptr) {
+    // Host memory send/recv pairs are added by
+    // common_runtime/memory_types.cc. When the pair of nodes are
+    // added inside a function, we need to use the function call frame
+    // to formulate the unique rendezvous key.
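+    // For illustration, a fully composed key looks roughly like
+    //   "/cpu:0;0000000000000001;/cpu:0;tensor_a_slice_transfer_shape;0:0",
+    // i.e. the GetSliceRendezvousKeyPrefix output plus the tensor-name suffix
+    // and the frame/iter ids appended by GetSliceRendezvousKey above.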
+    return FrameAndIter(reinterpret_cast<uint64>(ctx->call_frame()), 0);
+  } else {
+    return ctx->frame_iter();
+  }
+}
+
+}; // End of namespace slice_sendrecv
+
+}; // End of namespace tensorflow
diff --git a/tensorflow/core/kernels/slice_sendrecv_utils.h b/tensorflow/core/kernels/slice_sendrecv_utils.h
index 00000000000..3605eece2ca
new file mode 100644
--- /dev/null
+++ b/tensorflow/core/kernels/slice_sendrecv_utils.h
@@ -0,0 +1,41 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_UTILS_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+namespace slice_sendrecv {
+
+extern string GetSliceRendezvousKeyPrefix(const string& send_device,
+                                          const string& recv_device,
+                                          const uint64 send_device_incarnation,
+                                          const string& tensor_name);
+
+extern void GetSliceRendezvousKey(const string& key_prefix,
+                                  const string& tensor_name_suffix,
+                                  const FrameAndIter& frame_iter, string* key);
+
+extern FrameAndIter GetFrameAndIter(OpKernelContext* ctx,
+                                    bool hostmem_sendrecv);
+
+}; // End of namespace slice_sendrecv
+
+}; // End of namespace tensorflow
+
+#endif // End of macro TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_UTILS_H_
diff --git a/tensorflow/core/ops/file_slice_sendrecv_ops.cc b/tensorflow/core/ops/file_slice_sendrecv_ops.cc
new file mode 100644
index 00000000000..c7eb20d1358
--- /dev/null
+++ b/tensorflow/core/ops/file_slice_sendrecv_ops.cc
@@ -0,0 +1,77 @@
+/* Copyright 2023 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+
+namespace tensorflow {
+REGISTER_OP("_FileSliceSend")
+    .Input("file_path: string")
+    .Attr("tensor_name: string")
+    .Attr("send_device: string")
+    .Attr("send_device_incarnation: int")
+    .Attr("recv_device: string")
+    .Attr("client_terminated: bool = false")
+    .Attr("slice_size: int >= 1")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Sends the file from send_device to recv_device.
+Supports sending a file of any size.
+
+file_path: The path of the file to send.
+tensor_name: The name of the tensor to send.
+send_device: The name of the device sending the tensor.
+send_device_incarnation: The current incarnation of send_device.
+recv_device: The name of the device receiving the tensor.
+client_terminated: If set to true, this indicates that the node was added
+  to the graph as a result of a client-side feed or fetch of Tensor data,
+  in which case the corresponding send or recv is expected to be managed
+  locally by the caller.
+slice_size: The maximum number of bytes transferred at one time.
+)doc");
+
+REGISTER_OP("_FileSliceRecv")
+    .Output("file_path: string")
+    .Attr("tensor_name: string")
+    .Attr("send_device: string")
+    .Attr("send_device_incarnation: int")
+    .Attr("recv_device: string")
+    .Attr("client_terminated: bool = false")
+    .Attr("recv_dir: string")
+    .Attr("slice_size: int >= 1")
+    .Attr("timeout_ms: int >= 0 = 300000")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Receives the file from send_device on recv_device.
+Supports receiving a file of any size.
+
+file_path: The path of the received file.
+tensor_name: The name of the tensor to receive.
+send_device: The name of the device sending the tensor.
+send_device_incarnation: The current incarnation of send_device.
+recv_device: The name of the device receiving the tensor.
+client_terminated: If set to true, this indicates that the node was added
+  to the graph as a result of a client-side feed or fetch of Tensor data,
+  in which case the corresponding send or recv is expected to be managed
+  locally by the caller.
+recv_dir: The directory in which to store the received file.
+slice_size: The maximum number of bytes transferred at one time.
+timeout_ms: The maximum wait time for receiving a tensor.
+)doc");
+
+}; // End of namespace tensorflow

From 2f938dc2a18e57c9a302f5a8b988f6cd39f89e2f Mon Sep 17 00:00:00 2001
From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com>
Date: Tue, 9 Jan 2024 17:46:11 -0800
Subject: [PATCH 71/91] [TensorRT] Fix compiling issue when Graph contains
 EmbeddingVariable. (#964)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 泊霆
Co-authored-by: 泊霆
---
 tensorflow/python/compiler/tensorrt/trt_convert.py | 12 +++++-------
 tensorflow/python/framework/graph_util_impl.py     | 12 +++++-------
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py
index 2c8d603ba01..064e32c6984 100644
--- a/tensorflow/python/compiler/tensorrt/trt_convert.py
+++ b/tensorflow/python/compiler/tensorrt/trt_convert.py
@@ -539,13 +539,10 @@ def _gather_names(tensor_info):
       # EmbeddingVariable cannot be converted to a constant, so we always
       # need to load EV variables at runtime.
       if self._use_ev:
-        global_step_collection_ops = sess.graph.get_collection("global_step")
-        global_step_name = global_step_collection_ops[0].name.split(":")[0]
        output_node_names.add(filename_tensor_name)
         output_node_names.add(save_tensor_name)
         output_node_names.add(restore_op_name)
-        tf_logging.info("TensorRT - global_step_name: %s" % str(global_step_name))
         tf_logging.info("TensorRT - filename_tensor_name: %s" % str(filename_tensor_name))
         tf_logging.info("TensorRT - save_tensor_name: %s" % str(save_tensor_name))
         tf_logging.info("TensorRT - restore_op_name: %s" % str(restore_op_name))
@@ -559,18 +556,19 @@ def _gather_names(tensor_info):
 
       # Freeze the variables in the SavedModel graph and copy the frozen
      # graph over.
-      variable_names_blacklist = []
       if self._use_ev:
-        variable_names_blacklist.append(global_step_name)
+        global_step_collection_ops = sess.graph.get_collection("global_step")
+        if len(global_step_collection_ops) > 0:
+          sess.run([sess.graph.get_operation_by_name("global_step/Assign")])
 
       frozen_graph_def = graph_util.convert_variables_to_constants(
           sess, sess.graph.as_graph_def(add_shapes=True),
-          list(output_node_names), variable_names_blacklist=variable_names_blacklist)
+          list(output_node_names))
 
       if self._use_ev:
         # Keep KV Variable in saver_def; these kv-vars will be initialized at runtime.
         frozen_graph_def = graph_util.create_kv_variable_init_graph(
-            frozen_graph_def, global_step_name, restore_op_name)
+            frozen_graph_def, restore_op_name)
 
       self._grappler_meta_graph_def = meta_graph_pb2.MetaGraphDef()
       self._grappler_meta_graph_def.graph_def.CopyFrom(frozen_graph_def)
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index 76d69e886e7..c3fa37529c3 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -169,7 +169,7 @@ def _bfs_for_reachable_nodes(target_nodes, name_to_input_name):
   return nodes_to_keep
 
 @tf_export(v1=["graph_util.create_kv_variable_init_graph"])
-def create_kv_variable_init_graph(graph, global_step_name, restore_all_op_name):
+def create_kv_variable_init_graph(graph, restore_all_op_name):
 
   name_to_input_name, name_to_node, name_to_seq_num = \
       _extract_graph_summary(graph)
@@ -184,8 +184,10 @@ def create_kv_variable_init_graph(graph, restore_all_op_name):
                      " {} in current graph.".format(restore_all_op_name))
 
   for restore_shard_input_full_name in restore_all_op.input:
-    restore_shard_input_name = re.sub(r"^\^", "", restore_shard_input_full_name)
-    restore_shard_input_op = name_to_node[restore_shard_input_name]
+    restore_shard_input_no_op_name = re.sub(r"^\^", "", restore_shard_input_full_name)
+    restore_shard_input_no_op = name_to_node[restore_shard_input_no_op_name]
+    restore_shard_input_op_name = re.sub(r"^\^", "", restore_shard_input_no_op.input[0])
+    restore_shard_input_op = name_to_node[restore_shard_input_op_name]
     # go through all restore_shard ops
     new_node = node_def_pb2.NodeDef()
     new_node.CopyFrom(restore_shard_input_op)
@@ -198,10 +200,6 @@ def create_kv_variable_init_graph(graph, restore_all_op_name):
           n_node.op == "KvResourceImportV2" or \
           n_node.op == "KvResourceImport":
         new_node.input.append(n_full_name)
-      else:
-        # Keep global_step assign op in new save/restore_all
-        if n_node.input[0] == global_step_name:
-          new_node.input.append(n_full_name)
 
     graph.node.remove(restore_shard_input_op)
     graph.node.extend([new_node])

From 5eabe5fba8b08707020868c899b7cd63784a70f6 Mon Sep 17 00:00:00 2001
From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com>
Date: Fri, 12 Jan 2024 00:24:52 -0800
Subject: [PATCH 72/91] [Embedding] Make Embedding backward compatible with
 previous saved_model.
 (#963)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 泊霆
Co-authored-by: 泊霆
---
 tensorflow/python/ops/kv_variable_ops.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py
index 1ef9550ef6d..840aadf2541 100644
--- a/tensorflow/python/ops/kv_variable_ops.py
+++ b/tensorflow/python/ops/kv_variable_ops.py
@@ -530,11 +530,16 @@ def _init_from_proto(self, variable_def, import_scope=None):
         cache_op = op
     elif self._initializer_op.type == "InitializeKvVariableOp":
       init_op = self._initializer_op
-
-    self._init_op_for_restore = g.as_graph_element(
+    if variable_def.initialize_op_for_restore:
+      self._init_op_for_restore = g.as_graph_element(
         ops.prepend_name_scope(
            variable_def.initialize_op_for_restore,
             import_scope=import_scope))
+    else:  # Backward compatibility with 2306.
+      self._init_op_for_restore = g.as_graph_element(
+          ops.prepend_name_scope(
+              variable_def.initializer_name,
+              import_scope=import_scope))
     self._trainable = getattr(variable_def, "trainable", True)
     if variable_def.snapshot_name:
       self._cached_value = g.as_graph_element(

From d84837fc3c589ea32aad9a3e6b6a272cbd92a079 Mon Sep 17 00:00:00 2001
From: dashingwu
Date: Thu, 1 Feb 2024 12:22:01 +0800
Subject: [PATCH 73/91] [Runtime] fix a scheduling issue (#970)

The original code assumes the last 4 bits of the CPU cycle count are
uniformly distributed, but that is not true: at least on Intel IceLake
(Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz), the cycle count is always
an odd number. As a result, expensive ops are frequently scheduled onto a
single thread, which greatly increases the RT time (in a customer
scenario, from ~30ms to ~45ms).

Signed-off-by: Xiaoguang Wu
Co-authored-by: Xiaoguang Wu
---
 tensorflow/core/common_runtime/executor.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index fd38329a1fa..3df0d2a15be 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -730,15 +730,16 @@ Status ExecutorState::ProcessSync(
   } else if (kernel_stats_->HasExpensiveMarker(item)) {
     KernelTimer timer;
+    static uint64 update_counter = 0;
     device->Compute(op_kernel, &ctx);
-    // For expensive kernels, always update the cost estimate. For inexpensive
-    // kernels, update the cost estimate with ~1/16 probability. This assumes
-    // that the last 4 bits of the CPU cycle count is uniformly distributed.
+
     constexpr int kKernelExecutionTrackingInvocationSkipCount = 16;
     if (is_expensive ||
-        timer.start_cycles % kKernelExecutionTrackingInvocationSkipCount == 0) {
+        update_counter % kKernelExecutionTrackingInvocationSkipCount == 0) {
       kernel_stats_->UpdateCostEstimate(item, timer.ElapsedCycles());
     }
+
+    update_counter++;
   } else {
     device->Compute(op_kernel, &ctx);
   }

From 2b15e8a13a7d17736366bb9600267f94465b72e8 Mon Sep 17 00:00:00 2001
From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com>
Date: Sun, 4 Feb 2024 01:58:50 -0800
Subject: [PATCH 74/91] [Embedding] Fix shared embedding frequency counting
 problem.
(#962) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 Co-authored-by: 泊霆 --- .../api_def_UniqueWithExtraCounts.pbtxt | 4 + .../api_def_UniqueWithExtraCounts.pbtxt | 3 + .../api_def_UniqueWithExtraCounts.pbtxt | 4 + tensorflow/core/kernels/unique_ali_op.cc | 121 ++++++++++++----- tensorflow/core/kernels/unique_ali_op_util.h | 122 +++++++++++++++--- tensorflow/core/ops/array_ops.cc | 20 +++ .../framework/python_op_gen_internal.cc | 1 + .../python/kernel_tests/unique_op_test.py | 68 ++++++++++ tensorflow/python/ops/array_ops.py | 1 - .../python/ops/embedding_variable_ops_test.py | 69 ++++++++++ .../python/training/gradient_descent.py | 23 +++- tensorflow/python/training/optimizer.py | 22 ++-- 12 files changed, 386 insertions(+), 72 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_UniqueWithExtraCounts.pbtxt create mode 100644 tensorflow/core/api_def/java_api/api_def_UniqueWithExtraCounts.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_UniqueWithExtraCounts.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_UniqueWithExtraCounts.pbtxt b/tensorflow/core/api_def/base_api/api_def_UniqueWithExtraCounts.pbtxt new file mode 100644 index 00000000000..b8fabfe75a9 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_UniqueWithExtraCounts.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "UniqueWithExtraCounts" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/java_api/api_def_UniqueWithExtraCounts.pbtxt b/tensorflow/core/api_def/java_api/api_def_UniqueWithExtraCounts.pbtxt new file mode 100644 index 00000000000..117b73ef185 --- /dev/null +++ b/tensorflow/core/api_def/java_api/api_def_UniqueWithExtraCounts.pbtxt @@ -0,0 +1,3 @@ +op { + graph_op_name: "UniqueWithExtraCounts" +} diff --git a/tensorflow/core/api_def/python_api/api_def_UniqueWithExtraCounts.pbtxt b/tensorflow/core/api_def/python_api/api_def_UniqueWithExtraCounts.pbtxt new file mode 100644 index 00000000000..b8fabfe75a9 --- /dev/null +++ b/tensorflow/core/api_def/python_api/api_def_UniqueWithExtraCounts.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "UniqueWithExtraCounts" + visibility: HIDDEN +} diff --git a/tensorflow/core/kernels/unique_ali_op.cc b/tensorflow/core/kernels/unique_ali_op.cc index 28b5dad1990..efae935db12 100644 --- a/tensorflow/core/kernels/unique_ali_op.cc +++ b/tensorflow/core/kernels/unique_ali_op.cc @@ -25,8 +25,8 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/kernels/task_runner.h" #include "tensorflow/core/kernels/unique_ali_op_util.h" -#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/util/env_var.h" namespace tensorflow { @@ -41,40 +41,43 @@ const char* kStlHashMapString = "STL"; const char* kAbslHashMapString = "ABSL"; const char* kGoogleHashMapString = "GOOGLE"; const int64 kDefaultUniqueRatioHint = 4; -} +} // namespace template class UniqueAliOp : public OpKernel { public: explicit UniqueAliOp(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, ReadInt64FromEnvVar(kUniqueOpPartitionSizeEnv, - kPartitionSize, &partition_size_)); - OP_REQUIRES(context, partition_size_ > 0, - errors::InvalidArgument("Invaild PARTITION_SIZE=", - partition_size_)); + OP_REQUIRES_OK( + context, ReadInt64FromEnvVar(kUniqueOpPartitionSizeEnv, kPartitionSize, + &partition_size_)); + OP_REQUIRES( + context, partition_size_ > 0, + errors::InvalidArgument("Invaild PARTITION_SIZE=", partition_size_)); - OP_REQUIRES_OK(context, ReadBoolFromEnvVar(kUniqueOpSerialEnv, - false, &serial_)); + OP_REQUIRES_OK(context, + ReadBoolFromEnvVar(kUniqueOpSerialEnv, false, &serial_)); // NOTE(zycao>: Hash map insertion and lookup performance is dominating in // Unique Op. Based on benchmark results, 'google::dense_hash_map' will be // used as default for most key types except string. // - // By setting "DEEPREC_UNIQUE_OP_HASH_MAP" environment variable, a particular - // hash map could be seleteed to use. Possible choices are listed below: + // By setting "DEEPREC_UNIQUE_OP_HASH_MAP" environment variable, a + // particular hash map could be seleteed to use. Possible choices are listed + // below: // "MULTIMAP" for multimap parrallel process, // "STL" for std::unordred_map, // "ABSL" for absl::flat_hash_map, // "GOOGLE" for google::dense_hash_map. 
     std::string hash_map_str;
-    OP_REQUIRES_OK(context, ReadStringFromEnvVar(kUniqueOpHashMapEnv,
-                                                 kGoogleHashMapString,
-                                                 &hash_map_str));
+    OP_REQUIRES_OK(
+        context, ReadStringFromEnvVar(kUniqueOpHashMapEnv, kGoogleHashMapString,
+                                      &hash_map_str));
     std::transform(hash_map_str.begin(), hash_map_str.end(),
                    hash_map_str.begin(), ::toupper);
 
     OP_REQUIRES_OK(context, ReadInt64FromEnvVar(kUniqueOpUniqRatioHint,
-                            kDefaultUniqueRatioHint, &unique_ratio_hint_));
+                                                kDefaultUniqueRatioHint,
+                                                &unique_ratio_hint_));
     OP_REQUIRES(context, unique_ratio_hint_ > 0,
                 errors::InvalidArgument("Invaild ", kUniqueOpUniqRatioHint,
                                         "=", unique_ratio_hint_));
@@ -83,7 +86,8 @@
       map_flag_ = MULTIMAP;
       static char print_once = [] {
        LOG(INFO) << "MultiMapCompute preserved "
-                  "dense hash map key: " << kPreseverdEmptyKey;
+                     "dense hash map key: "
+                  << kPreseverdEmptyKey;
        return '\0';
       }();
     } else if (!hash_map_str.compare(kStlHashMapString)) {
@@ -95,7 +99,6 @@
     } else {
       map_flag_ = GOOGLE;
     }
-
   }
 
   void Compute(OpKernelContext* context) override {
@@ -110,16 +113,14 @@
     Tensor output;
     Tensor output_counter;
     if (context->num_inputs() == 1) {
-      UniqueWithoutAxis<T, TIndex>(context, input,
-                        &idx, &output, &output_counter, num_outputs(),
-                        partition_size_, serial_, unique_ratio_hint_,
-                        map_flag_);
+      UniqueWithoutAxis<T, TIndex>(
+          context, input, &idx, &output, &output_counter, num_outputs(),
+          partition_size_, serial_, unique_ratio_hint_, map_flag_);
     } else {
       const Tensor& axis_tensor = context->input(1);
-      UniqueWithAxis<T, TIndex>(context, input,
-                     axis_tensor, &idx, &output, &output_counter,
-                     num_outputs(), partition_size_, serial_,
-                     unique_ratio_hint_, map_flag_);
+      UniqueWithAxis<T, TIndex>(context, input, axis_tensor, &idx, &output,
+                                &output_counter, num_outputs(),
+                                partition_size_, serial_, unique_ratio_hint_,
+                                map_flag_);
     }
     context->set_output(0, output);
     context->set_output(1, idx);
     }
   }
 
+ protected:
   bool serial_ = false;
   int64 partition_size_ = 0;
   int64 unique_ratio_hint_;
   UniqueMaps map_flag_ = GOOGLE;  // "GOOGLE" dense hash map is default
 };
 
+template <typename T, typename TIndex>
+class UniqueWithCountAliOp : public UniqueAliOp<T, TIndex> {
+  using UniqueAliOp<T, TIndex>::serial_;
+  using UniqueAliOp<T, TIndex>::partition_size_;
+  using UniqueAliOp<T, TIndex>::unique_ratio_hint_;
+  using UniqueAliOp<T, TIndex>::map_flag_;
+  using OpKernel::num_outputs;
+
+ public:
+  explicit UniqueWithCountAliOp(OpKernelConstruction* context)
+      : UniqueAliOp<T, TIndex>(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("N", &num_sparse_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    Tensor idx;
+    Tensor output;
+    Tensor output_counter;
+    UniqueWithExtraCounts<T, TIndex>(
+        context, input, &idx, &output, &output_counter, num_outputs(),
+        partition_size_, serial_, unique_ratio_hint_, num_sparse_, map_flag_);
+    context->set_output(0, output);
+    context->set_output(1, idx);
+    context->set_output(2, output_counter);
+  }
+
+ private:
+  int num_sparse_;
+};
+
 #define REGISTER_UNIQUE(type)                                    \
   REGISTER_KERNEL_BUILDER(Name("Unique")                         \
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int32>("out_idx"), \
-                          UniqueAliOp<type, int32>);             \
+                          UniqueAliOp<type, int32>)              \
   REGISTER_KERNEL_BUILDER(Name("Unique")                         \
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int64>("out_idx"), \
-                          UniqueAliOp<type, int64>);             \
+                          UniqueAliOp<type, int64>)              \
   REGISTER_KERNEL_BUILDER(Name("UniqueV2")                       \
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int32>("out_idx"), \
-                          UniqueAliOp<type, int32>);             \
+                          UniqueAliOp<type, int32>)              \
REGISTER_KERNEL_BUILDER(Name("UniqueV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ @@ -164,7 +197,7 @@ class UniqueAliOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ @@ -174,7 +207,17 @@ class UniqueAliOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp) + UniqueAliOp) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithExtraCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithExtraCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); REGISTER_UNIQUE(string) #undef REGISTER_UNIQUE @@ -198,12 +241,22 @@ REGISTER_UNIQUE(string) .HostMemory("count") \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); + UniqueAliOp) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithExtraCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithExtraCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp); TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); REGISTER_UNIQUE(string) #undef REGISTER_UNIQUE -#endif //GOOGLE_CUDA - +#endif // GOOGLE_CUDA + #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("Unique") .Device(DEVICE_SYCL) diff --git a/tensorflow/core/kernels/unique_ali_op_util.h b/tensorflow/core/kernels/unique_ali_op_util.h index 6b59ba26e81..0a52d8864e9 100644 --- a/tensorflow/core/kernels/unique_ali_op_util.h +++ b/tensorflow/core/kernels/unique_ali_op_util.h @@ -191,7 +191,8 @@ void NewSizes(OpKernelContext* context, const Tensor& input, template void SerialComputeV1(OpKernelContext* context, const Tensor& input, - Tensor* idx, int64 axis, int64* uniq_size, Tensor* output) { + Tensor* idx, int64 axis, int64* uniq_size, int num_sparse, + google::dense_hash_map* counter_map, Tensor* output) { auto Tin = input.flat(); const int64 N = input.NumElements(); auto idx_vec = idx->template vec(); @@ -205,7 +206,23 @@ void SerialComputeV1(OpKernelContext* context, const Tensor& input, ++j; } } - + + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * N); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + auto idx_it = uniq.find(ids); + if (idx_it != uniq.end()) { + counter_map->emplace(idx_it->second, counter_vec(k)); + } + } + } + *uniq_size = static_cast(uniq.size()); TensorShape output_shape(input.shape()); output_shape.set_dim(axis, *uniq_size); @@ -223,7 +240,8 @@ void SerialComputeV1(OpKernelContext* context, const Tensor& input, template void ParallelComputeV1(OpKernelContext* context, const Tensor& input, - Tensor* idx, int64 axis, int64* uniq_size, Tensor* output) { + Tensor* idx, int64 axis, int64* 
uniq_size, int num_sparse, + google::dense_hash_map* counter_map, Tensor* output) { // Struct INode was used to store an inverse mapping for each node in the // hash map container. struct INode { @@ -415,6 +433,25 @@ void ParallelComputeV1(OpKernelContext* context, const Tensor& input, TaskRunner t3_runner(GlobalIndexTask, thread_pool, num_tasks_t1); t3_runner.Run(); + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * N); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + for (int j = 0; j < num_tasks_t1; ++j) { + const INode* inode = uniq_maps[j].GetINodeByKey(ids); + if (inode != nullptr) { + counter_map->emplace(inode->index_, counter_vec(k)); + continue; + } + } + } + } + // Parallel Step 4: Write output indicies Tensor. int32 max_tasks_t4 = (N + kPartitionSize - 1) / kPartitionSize; int32 num_tasks_t4 = std::max(std::min(max_threads, max_tasks_t4), 1); @@ -447,8 +484,8 @@ void ParallelComputeV1(OpKernelContext* context, const Tensor& input, template void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, int64 axis, int64* uniq_size_out, - int32 num_buckets, int64 unique_ratio_hint, - Tensor* output) { + int32 num_buckets, int64 unique_ratio_hint, int num_sparse, + google::dense_hash_map* counter_map, Tensor* output) { auto Tin = input.vec(); const int64 N = input.NumElements(); @@ -529,6 +566,24 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, } int64 uniq_size = global_offsets[num_buckets - 1] + uniq_maps[num_buckets - 1].size(); + + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * uniq_size); + + google::dense_hash_map extra_unique_id_map; + extra_unique_id_map.set_empty_key(std::numeric_limits::max()); + extra_unique_id_map.resize(2 * uniq_size); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + auto counts = counter_vec(k); + extra_unique_id_map.emplace(ids, counts); + } + } *uniq_size_out = uniq_size; AllocatorAttributes attr; @@ -539,7 +594,7 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, auto key_output_vec = output->template vec(); auto OutputTask = [&key_output_vec, &uniq_maps, &global_offsets, - &Tin, &idx_vec, &map_parter] + &Tin, &idx_vec, &map_parter, &counter_map, extra_unique_id_map] (int32 task_id, int32 num_tasks) { TIndex offset = global_offsets[task_id]; for (auto iter = uniq_maps[task_id].begin(); iter != uniq_maps[task_id].end(); ++iter) { @@ -553,7 +608,10 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, next_idx = idx_vec(cur_idx); idx_vec(cur_idx) = offset; } - + auto it = extra_unique_id_map.find(iter->first); + if (it != extra_unique_id_map.end()) { + counter_map->emplace(offset, it->second); + } ++offset; } }; @@ -618,8 +676,9 @@ void MultipleElements(OpKernelContext* context, const Tensor& input, } template -void CheckCountOutput(OpKernelContext* context, Tensor* output_counter, - Tensor* idx, int 
num_outputs, int64 uniq_size) { +void CheckCountOutput(OpKernelContext* context, Tensor* output, Tensor* output_counter, + Tensor* idx, int num_outputs, int64 uniq_size, + int num_sparse, google::dense_hash_map counter_map) { if (num_outputs > 2) { auto idx_vec = idx->template vec(); AllocatorAttributes attr; @@ -633,12 +692,19 @@ void CheckCountOutput(OpKernelContext* context, Tensor* output_counter, for (int64 i = 0; i < N; ++i) { count_output_vec(idx_vec(i))++; } + if (num_sparse > 0) { + for (auto& it: counter_map) { + count_output_vec(it.first) += (it.second - 1); + } + } } + } template void ComputeInternalWithHashMap(OpKernelContext* context, const Tensor& input, - Tensor* idx, int64 axis, int64* uniq_size, int64 N, bool serial, Tensor* output) { + Tensor* idx, int64 axis, int64* uniq_size, int64 N, int num_sparse, bool serial, + google::dense_hash_map* counter_map, Tensor* output) { OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()), errors::InvalidArgument("unique expects a 1D vector.")); // TODO(dga): Make unique polymorphic for returning int32 and int64 @@ -651,10 +717,10 @@ void ComputeInternalWithHashMap(OpKernelContext* context, const Tensor& input, if (N >= kPartitionLimit && !serial) { ParallelComputeV1 - (context, input, idx, axis, uniq_size, output); + (context, input, idx, axis, uniq_size, num_sparse, counter_map, output); } else { SerialComputeV1 - (context, input, idx, axis, uniq_size, output); + (context, input, idx, axis, uniq_size, num_sparse, counter_map, output); } } @@ -662,7 +728,7 @@ template void UniqueInternal(OpKernelContext* context, const Tensor& input, Tensor* idx, Tensor* output, Tensor* output_counter, int num_outputs, int64 partition_size, bool serial, int64 axis, int64 unique_ratio_hint, - std::vector& new_sizes, UniqueMaps map_flag) { + std::vector& new_sizes, UniqueMaps map_flag, int num_sparse = 0) { typedef google::dense_hash_map DefaultHashMap; AllocatorAttributes attr; @@ -672,6 +738,7 @@ void UniqueInternal(OpKernelContext* context, const Tensor& input, TensorShape({new_sizes[1]}), idx, attr)); int64 uniq_size_out; + google::dense_hash_map counter_map; if (new_sizes[0] == 1 && new_sizes[2] == 1) { // Specialized and faster implementation when unique is run over single @@ -687,33 +754,34 @@ void UniqueInternal(OpKernelContext* context, const Tensor& input, case MULTIMAP: if (num_buckets > 1 && !serial) { MultiMapCompute> - (context, input, idx, axis, &uniq_size_out, num_buckets, unique_ratio_hint, output); + (context, input, idx, axis, &uniq_size_out, num_buckets, unique_ratio_hint, num_sparse, &counter_map, output); } else { SerialComputeV1 - (context, input, idx, axis, &uniq_size_out, output); + (context, input, idx, axis, &uniq_size_out, num_sparse, &counter_map, output); } break; case STL: ComputeInternalWithHashMap> - (context, input, idx, axis, &uniq_size_out, N, serial, output); + (context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, &counter_map, output); break; case ABSL: ComputeInternalWithHashMap> - (context, input, idx, axis, &uniq_size_out, N, serial, output); + (context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, &counter_map, output); break; case GOOGLE: ComputeInternalWithHashMap - (context, input, idx, axis, &uniq_size_out, N, serial, output); + (context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, &counter_map, output); break; default: ComputeInternalWithHashMap - (context, input, idx, axis, &uniq_size_out, N, serial, output); + (context, input, idx, axis, &uniq_size_out, N, 
num_sparse, serial, &counter_map, output); } } else { MultipleElements(context, input, idx, output, &uniq_size_out, axis, new_sizes); } - CheckCountOutput(context, output_counter, idx, num_outputs, uniq_size_out); + CheckCountOutput(context, output, output_counter, idx, num_outputs, + uniq_size_out, num_sparse, counter_map); } template @@ -743,6 +811,20 @@ void UniqueWithAxis(OpKernelContext* context, const Tensor& input, axis, unique_ratio_hint, new_sizes, map_flag); } +template +void UniqueWithExtraCounts(OpKernelContext* context, const Tensor& input, + Tensor* idx, Tensor* output, Tensor* output_counter, int num_outputs, + int64 partition_size, bool serial, int64 unique_ratio_hint, + int num_sparse, UniqueMaps map_flag) { + int64 axis = 0; + std::vector new_sizes{1, input.NumElements(), 1}; + OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()), + errors::InvalidArgument("unique expects a 1D vector.")); + UniqueInternal(context, input, idx, output, + output_counter, num_outputs, partition_size, serial, + axis, unique_ratio_hint, new_sizes, map_flag, num_sparse); +} + } // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_UNIQUE_ALI_OP_UTIL_H_ diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index 27f6811fcff..306026977ef 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -1741,6 +1741,26 @@ REGISTER_OP("UniqueWithCountsV2") return Status::OK(); }); +// --------------------------------------------------- + +REGISTER_OP("UniqueWithExtraCounts") + .Input("x: T") + .Input("extra_indices: N * T") + .Input("extra_counts: N * out_idx") + .Output("y: T") + .Output("idx: out_idx") + .Output("count: out_idx") + .Attr("T: type") + .Attr("N: int >= 0") + .Attr("out_idx: {int32, int64} = DT_INT32") + .SetShapeFn([](InferenceContext* c) { + auto uniq = c->Vector(InferenceContext::kUnknownDim); + c->set_output(0, uniq); + c->set_output(1, c->input(0)); + c->set_output(2, uniq); + return Status::OK(); + }); + namespace { Status ShapeShapeFn(InferenceContext* c) { diff --git a/tensorflow/python/framework/python_op_gen_internal.cc b/tensorflow/python/framework/python_op_gen_internal.cc index 42ae4eacc77..d0370a09106 100644 --- a/tensorflow/python/framework/python_op_gen_internal.cc +++ b/tensorflow/python/framework/python_op_gen_internal.cc @@ -105,6 +105,7 @@ bool IsOpWithUnderscorePrefix(const string& s) { // TODO(annarev): reduce usage of '*' imports and remove these from the // list. 
"fused_batch_norm", "histogram_fixed_width", "stack", + "unique_with_extra_counts", "batch_norm_with_global_normalization", "clip_by_value"}); return kUnderscoreOps->count(s) > 0; } diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py index 9ec0ff74e3e..08ebcf0e8dd 100644 --- a/tensorflow/python/kernel_tests/unique_op_test.py +++ b/tensorflow/python/kernel_tests/unique_op_test.py @@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import constant_op from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.platform import test @@ -278,6 +279,73 @@ def testUniqueWithCountsAbslMap(self): def testUniqueWithCountsDenseHashMap(self): self.RunUniqueWithCountsWithDifferentMaps('GOOGLE') +class UniqueWithExtraCountsTest(test.TestCase): + + def testInt32(self): + x = np.random.randint(2, high=1000, size=700000) + extra_x = x[:5].tolist() + extra_x_tensor = [constant_op.constant(extra_x, dtypes.int64)] + extra_count = [500 for _ in range(5)] + extra_count_tensor = [constant_op.constant(extra_count, dtypes.int32)] + with self.cached_session() as sess: + y, idx, count = gen_array_ops._unique_with_extra_counts(x, extra_x_tensor, extra_count_tensor) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + if value in extra_x: + self.assertEqual(count, np.sum(x == value) + 499) + else: + self.assertEqual(count, np.sum(x == value)) + + def testInt32OutIdxInt64(self): + x = np.random.randint(2, high=1000, size=700000) + extra_x = x[:5].tolist() + extra_x_tensor = [constant_op.constant(extra_x, dtypes.int64)] + extra_count = [500 for _ in range(5)] + extra_count_tensor = [constant_op.constant(extra_count, dtypes.int64)] + with self.cached_session() as sess: + y, idx, count = gen_array_ops._unique_with_extra_counts(x, extra_x_tensor, extra_count_tensor) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + if value in extra_x: + self.assertEqual(count, np.sum(x == value) + 499) + else: + self.assertEqual(count, np.sum(x == value)) + + def RunUniqueWithCountsWithDifferentMaps(self, map_type): + recover_env = False + if 'DEEPREC_UNIQUE_OP_HASH_MAP' in os.environ: + recover_env = True + old_env = os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = map_type + self.testInt32() + self.testInt32OutIdxInt64() + + del os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + if recover_env: + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = old_env + + def testUniqueWithCountsMultiMap(self): + self.RunUniqueWithCountsWithDifferentMaps('MULTIMAP') + + def testUniqueWithCountsStlMap(self): + self.RunUniqueWithCountsWithDifferentMaps('STL') + + def testUniqueWithCountsAbslMap(self): + self.RunUniqueWithCountsWithDifferentMaps('ABSL') + + def testUniqueWithCountsDenseHashMap(self): + self.RunUniqueWithCountsWithDifferentMaps('GOOGLE') if __name__ == '__main__': test.main() diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index adadf3cc427..960dae9ac8c 100644 --- 
a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1627,7 +1627,6 @@ def unique_with_counts(x, out_idx=dtypes.int32, name=None): unique_with_counts.__doc__ = gen_array_ops.unique_with_counts.__doc__ - @tf_export("split") def split(value, num_or_size_splits, axis=0, num=None, name="split"): """Splits a tensor into sub tensors. diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index 81b315e2e43..dbf254d5f14 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -19,6 +19,7 @@ from tensorflow.core.framework import attr_value_pb2 from tensorflow.python.framework import ops from tensorflow.python.framework import test_util +from tensorflow.python.framework import constant_op from tensorflow.python.ops import string_ops from tensorflow.python.ops.check_ops import assert_equal from tensorflow.python.platform import googletest @@ -2871,6 +2872,39 @@ def testCountsTensor(self): value = checkpoint_utils.load_variable(ckpt_path, name) self.assertAllEqual(value, [3, 3, 1, 3, 2]) + def testCountsWithSparseAndDenseTensor(self): + os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + sp1 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + ids = constant_op.constant([3,3,3,4,4,1], dtype=dtypes.int64) + emb1 = embedding_ops.embedding_lookup_sparse(var, sp1, None) + emb2 = embedding_ops.embedding_lookup(var, ids) + emb = emb1 + emb2 + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + for name, shape in checkpoint_utils.list_variables(ckpt_path): + if name == "var_1-freqs": + value = checkpoint_utils.load_variable(ckpt_path, name) + self.assertAllEqual(value, [3, 3, 1, 3, 2]) + def testCountsTensorWithGradientDescent(self): os.environ["TF_RECORD_FREQ"] = "1" checkpoint_directory = self.get_temp_dir() @@ -2908,6 +2942,41 @@ def testCountsTensorWithGradientDescent(self): self.assertAllEqual(value, [3, 3, 1, 3, 2]) del os.environ["TF_RECORD_FREQ"] + + def testCountsDenseAndSparseTensorWithGradientDescent(self): + os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + sp1 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + ids = constant_op.constant([3,3,3,4,4,1], dtype=dtypes.int64) + emb1 = embedding_ops.embedding_lookup_sparse(var, sp1, None) + emb2 = embedding_ops.embedding_lookup(var, ids) + emb = emb1 + emb2 + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss 
= math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = gradient_descent.GradientDescentOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + for name, shape in checkpoint_utils.list_variables(ckpt_path): + if name == "var_1-freqs": + value = checkpoint_utils.load_variable(ckpt_path, name) + self.assertAllEqual(value, [3, 3, 1, 3, 2]) + + del os.environ["TF_RECORD_FREQ"] if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py index 799e3c5f5bd..bd16892c1c8 100644 --- a/tensorflow/python/training/gradient_descent.py +++ b/tensorflow/python/training/gradient_descent.py @@ -19,9 +19,12 @@ from __future__ import print_function from tensorflow.python.framework import ops +from tensorflow.python.framework import dtypes from tensorflow.python.ops import gen_hash_training_ops from tensorflow.python.ops import kv_variable_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops @@ -72,22 +75,28 @@ def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): if isinstance(handle, kv_variable_ops.EmbeddingVariable): global_step = training_util.get_or_create_global_step() if handle.need_counts() and len(handle._counts_tensor.keys()) != 0: + extra_counts, extra_indices = [], [] if indices.op.type == "ConcatV2": - total_counts = [] for tensor in indices.op.inputs: if tensor.op.type == "Reshape": indices_tensor = tensor.op.inputs[0] - total_counts.append(handle._counts_tensor[indices_tensor]) - from tensorflow.python.ops import array_ops - counts_tensor = array_ops.concat(total_counts, 0) + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) elif indices.op.type == "Reshape": indices_tensor = indices.op.inputs[0] - counts_tensor = handle._counts_tensor[indices_tensor] + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) + unique_indices, new_index_positions, indices_counts = \ + gen_array_ops._unique_with_extra_counts(indices, extra_indices, extra_counts) + summed_grads = math_ops.unsorted_segment_sum( + grad, new_index_positions, array_ops.shape(unique_indices)[0]) return training_ops.kv_resource_sparse_apply_gradient_descent_with_counts( handle.handle, math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype), - grad, indices, global_step, - counts_tensor, use_locking=self._use_locking) + summed_grads, unique_indices, global_step, + indices_counts, use_locking=self._use_locking) else: return training_ops.kv_resource_sparse_apply_gradient_descent( handle.handle, math_ops.cast(self._learning_rate_tensor, diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 7523604ccf9..95383a9d962 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -34,6 +34,7 @@ from tensorflow.python.framework import ops from 
tensorflow.python.framework import smart_cond from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradients from tensorflow.python.ops import gen_io_ops @@ -93,16 +94,14 @@ def _deduplicate_indexed_slices_with_counts(values, indices): array_ops.shape(unique_indices)[0]) return (summed_values, unique_indices, indices_counts) -def _deduplicate_indexed_slices_with_counts_reduction(values, indices, counts): +def _deduplicate_indexed_slices_with_counts_reduction(values, indices, extra_counts, extra_indices): """Sums `values` associated with any non-unique `indices` and return counts of each count in `values`.""" - unique_indices, new_index_positions = array_ops.unique(indices) + unique_indices, new_index_positions, summed_counts = \ + gen_array_ops._unique_with_extra_counts(indices, extra_indices, extra_counts) summed_values = math_ops.unsorted_segment_sum( values, new_index_positions, array_ops.shape(unique_indices)[0]) - summed_counts = math_ops.unsorted_segment_sum( - counts, new_index_positions, - array_ops.shape(unique_indices)[0]) return (summed_values, unique_indices, summed_counts) def _var_key(var): @@ -1105,19 +1104,22 @@ def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): _deduplicate_indexed_slices_with_counts( values=grad, indices=indices) else: + extra_counts, extra_indices = [], [] if indices.op.type == "ConcatV2": - total_counts = [] for tensor in indices.op.inputs: if tensor.op.type == "Reshape": indices_tensor = tensor.op.inputs[0] - total_counts.append(handle._counts_tensor[indices_tensor]) - counts_tensor = array_ops.concat(total_counts, 0) + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) elif indices.op.type == "Reshape": indices_tensor = indices.op.inputs[0] - counts_tensor = handle._counts_tensor[indices_tensor] + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) summed_grad, unique_indices, indices_counts = \ _deduplicate_indexed_slices_with_counts_reduction( - grad, indices, counts_tensor) + grad, indices, extra_counts, extra_indices) return self._resource_apply_sparse( summed_grad, handle, unique_indices, indices_counts) else: From 70b32df83f0e7928d8894773fe2d5cf247ccf3d4 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Tue, 20 Feb 2024 19:20:27 +0800 Subject: [PATCH 75/91] [BUILD] Add build SDK package. (#972) Signed-off-by: candy.dc --- tensorflow/tools/sdk_package/BUILD | 43 ++++++ tensorflow/tools/sdk_package/README.md | 41 ++++++ .../tools/sdk_package/build_sdk_package.sh | 136 ++++++++++++++++++ 3 files changed, 220 insertions(+) create mode 100644 tensorflow/tools/sdk_package/BUILD create mode 100644 tensorflow/tools/sdk_package/README.md create mode 100755 tensorflow/tools/sdk_package/build_sdk_package.sh diff --git a/tensorflow/tools/sdk_package/BUILD b/tensorflow/tools/sdk_package/BUILD new file mode 100644 index 00000000000..b3dca82b9e3 --- /dev/null +++ b/tensorflow/tools/sdk_package/BUILD @@ -0,0 +1,43 @@ +# Description: +# TensorFlow is a computational framework, primarily for use in machine +# learning applications. +# +# Public targets: +# +# ":sdk_package" - Package the tensorflow dynamic library and necessry +# headers for developing. The script should be executed manually +# after 'bazel build'. 
+
+package(default_visibility = ["//visibility:public"])
+
+load("//tensorflow:tensorflow.bzl", "transitive_hdrs", "tf_binary_additional_srcs")
+load("//tensorflow/core/platform:default/build_config_root.bzl",
+     "tf_additional_plugin_deps")
+
+transitive_hdrs(
+    name = "sdk_headers",
+    deps = [
+        # Need to check definition of //tensorflow:libtensorflow_cc.so
+        # for updates.
+        "//tensorflow/c:c_api",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/cc:scope",
+        "//tensorflow/cc/saved_model:loader",
+        "//tensorflow/cc/saved_model:signature_constants",
+        "//tensorflow/cc/saved_model:tag_constants",
+        "//tensorflow/contrib/session_bundle:bundle_shim",
+    ] + tf_additional_plugin_deps(),
+    tags = ["manual"],
+)
+
+sh_binary(
+    name = "build_sdk_package",
+    srcs = ["build_sdk_package.sh"],
+    data = [
+        ":sdk_headers",
+        "@com_google_protobuf//:protoc",
+        "//tensorflow:libtensorflow_cc.so",
+    ] + tf_binary_additional_srcs(),
+    tags = ["manual"],
+)
diff --git a/tensorflow/tools/sdk_package/README.md b/tensorflow/tools/sdk_package/README.md
new file mode 100644
index 00000000000..8dbac7bed92
--- /dev/null
+++ b/tensorflow/tools/sdk_package/README.md
@@ -0,0 +1,41 @@
+Bazel rules and bash scripts to package the DeepRec C/C++ APIs and
+runtime library into a '<DEST_DIR>/tensorflow_sdk.tar.gz' archive.
+
+## SDK Build
+
+First, edit and run the configuration script **'./configure'** under the
+DeepRec root directory (referred to below as '<DEEPREC_ROOT>').
+
+Then run the following command under '<DEEPREC_ROOT>' to build
+the DeepRec SDK package:
+
+```sh
+./build sdk
+```
+_This command will put the SDK package named 'tensorflow\_sdk.tar.gz' into
+the directory below:_
+> <DEEPREC_ROOT>/built/sdk/[gpu|cpu]
+
+## SDK Usage
+
+To use the DeepRec runtime SDK from C++ code written against the original
+APIs defined in TensorFlow, first decompress the SDK package into a work
+directory (referred to below as '<WORK_ROOT>'):
+
+```sh
+tar -xzvf tensorflow_sdk.tar.gz -C <WORK_ROOT>
+```
+
+A directory named 'sdk' will then be placed into '<WORK_ROOT>'; it contains
+the necessary header files in the 'include' sub-directory, keeping the
+original TensorFlow hierarchy, and the 'libtensorflow_cc.so' dynamic
+runtime library in the 'lib' sub-directory.
+
+When building a project that uses the original TensorFlow C++ APIs together
+with the DeepRec SDK, append **'-I<WORK_ROOT>/sdk/include'** to the compile
+flags and **'-L<WORK_ROOT>/sdk/lib -ltensorflow_cc'** to the link flags.
+
+Finally, to run a binary built with the DeepRec SDK, do not forget to
+append '<WORK_ROOT>/sdk/lib' to the **'LD_LIBRARY_PATH'** environment
+variable.
diff --git a/tensorflow/tools/sdk_package/build_sdk_package.sh b/tensorflow/tools/sdk_package/build_sdk_package.sh
new file mode 100755
index 00000000000..89b7d8e9195
--- /dev/null
+++ b/tensorflow/tools/sdk_package/build_sdk_package.sh
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+# Copyright 2024 The DeepRec Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# This script is used for packaging TensorFlow SDK files into a tarball.
+# The processing flow takes 'tensorflow/tools/pip_package/build_pip_package.sh'
+# as a reference.
+
+set -e
+
+PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
+function is_windows() {
+  # On Windows, the shell script actually runs in msys.
+  if [[ "${PLATFORM}" =~ msys_nt* ]]; then
+    true
+  else
+    false
+  fi
+}
+
+function main() {
+  if [ $# -lt 1 ] ; then
+    echo "No destination dir provided"
+    exit 1
+  fi
+
+  DEST=$1
+  TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX)
+  mkdir -p "${TMPDIR}/sdk/bin"
+  mkdir -p "${TMPDIR}/sdk/include"
+  mkdir -p "${TMPDIR}/sdk/lib"
+
+  echo $(date) : "=== Using tmpdir: ${TMPDIR}"
+
+  if [ ! -d bazel-bin/tensorflow ]; then
+    echo "Could not find bazel-bin. Did you run from the root of the build tree?"
+    exit 1
+  fi
+
+  if is_windows; then
+    echo "Windows version TensorFlow SDK not supported..."
+  elif [ ! -d bazel-bin/tensorflow/tools/sdk_package/build_sdk_package.runfiles/org_tensorflow ]; then
+    # Really old (0.2.1-) runfiles, without workspace name.
+    echo "TensorFlow SDK does not support such old versions..."
+  else
+    RUNFILES=bazel-bin/tensorflow/tools/sdk_package/build_sdk_package.runfiles/org_tensorflow
+    if [ -d ${RUNFILES}/external ]; then
+      # Old-style runfiles structure (--legacy_external_runfiles).
+      cp -RL ${RUNFILES}/tensorflow "${TMPDIR}/sdk/include"
+      # Check LLVM headers for XLA support.
+      if [ -d ${RUNFILES}/external/llvm_archive ]; then
+        # Old-style runfiles structure (--legacy_external_runfiles).
+        mkdir -p ${TMPDIR}/sdk/include/external/llvm/include
+        cp -RL ${RUNFILES}/external/llvm_archive/include/llvm \
+          "${TMPDIR}/sdk/include/external/llvm/include"
+        pushd ${TMPDIR}/sdk/include
+        ln -s external/llvm/include/llvm llvm
+        popd
+      fi
+      # Copy MKL libs over so they can be loaded at runtime.
+      so_lib_dir=$(ls $RUNFILES | grep solib) || true
+      if [ -n "${so_lib_dir}" ]; then
+        mkl_so_dir=$(ls ${RUNFILES}/${so_lib_dir} | grep mkl) || true
+        if [ -n "${mkl_so_dir}" ]; then
+          cp -L ${RUNFILES}/${so_lib_dir}/${mkl_so_dir}/*.so "${TMPDIR}/sdk/lib"
+        fi
+      fi
+    else
+      # New-style runfiles structure (--nolegacy_external_runfiles).
+      cp -RL ${RUNFILES}/tensorflow "${TMPDIR}/sdk/include"
+      # Check LLVM headers for XLA support.
+      if [ -d bazel-bin/tensorflow/tools/sdk_package/build_sdk_package.runfiles/llvm_archive ]; then
+        cp -RL \
+          bazel-bin/tensorflow/tools/sdk_package/build_sdk_package.runfiles/llvm_archive/include/llvm \
+          "${TMPDIR}/sdk/include"
+      fi
+      # Copy MKL libs over so they can be loaded at runtime.
+      so_lib_dir=$(ls $RUNFILES | grep solib) || true
+      if [ -n "${so_lib_dir}" ]; then
+        mkl_so_dir=$(ls ${RUNFILES}/${so_lib_dir} | grep mkl) || true
+        if [ -n "${mkl_so_dir}" ]; then
+          cp -L ${RUNFILES}/${so_lib_dir}/${mkl_so_dir}/*.so "${TMPDIR}/sdk/lib"
+        fi
+      fi
+    fi
+  fi
+
+  # Move and strip the dynamic library files for packaging.
+  # By default the .so files are not writable by the owner,
+  # so run 'chmod +w' before the strip command.
+  chmod +w ${TMPDIR}/sdk/include/tensorflow/libtensorflow_cc.so
+  chmod +w ${TMPDIR}/sdk/include/tensorflow/libtensorflow_framework.so.1
+  strip ${TMPDIR}/sdk/include/tensorflow/libtensorflow_cc.so
+  strip ${TMPDIR}/sdk/include/tensorflow/libtensorflow_framework.so.1
+  mv ${TMPDIR}/sdk/include/tensorflow/libtensorflow_*.so* ${TMPDIR}/sdk/lib
+
+  # Third-party packages don't ship with header files.
Copy the headers + # over so user defined ops can be compiled. + mkdir -p ${TMPDIR}/sdk/include/google + mkdir -p ${TMPDIR}/sdk/include/third_party + pushd ${RUNFILES%org_tensorflow}/com_google_protobuf/src/google + for header in $(find protobuf -name \*.h); do + mkdir -p "${TMPDIR}/sdk/include/google/$(dirname ${header})" + cp -L "$header" "${TMPDIR}/sdk/include/google/$(dirname ${header})/" + done + popd + cp -RL $RUNFILES/third_party/eigen3 ${TMPDIR}/sdk/include/third_party + cp -RL ${RUNFILES%org_tensorflow}/eigen_archive/* ${TMPDIR}/sdk/include/ + cp -RL ${RUNFILES%org_tensorflow}/nsync/public/* ${TMPDIR}/sdk/include + cp -L ${RUNFILES%org_tensorflow}/com_google_protobuf/protoc ${TMPDIR}/sdk/bin + + # package all files into the target file. + pushd ${TMPDIR} + rm -f MANIFEST + echo $(date) : "=== Building sdk package" + tar czvf tensorflow_sdk.tar.gz sdk/ 1> /dev/null + popd + mkdir -p ${DEST} + mv ${TMPDIR}/tensorflow_sdk.tar.gz ${DEST} + rm -rf ${TMPDIR} + echo $(date) : "=== Output sdk package file is: ${DEST}/tensorflow_sdk.tar.gz" +} + +main "$@" From eb5f30db53ee41179a61a83c6ec9b54111c0257a Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Thu, 22 Feb 2024 15:08:16 +0800 Subject: [PATCH 76/91] [Embedding] Log error when EV has been initialized in EV Import OP. (#971) Signed-off-by: chenbangduo.cbd --- tensorflow/core/kernels/kv_variable_restore_ops.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index 3b10c2521b9..2eccf485ef8 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -373,6 +373,12 @@ class KvResourceImportV3Op: public AsyncOpKernel { core::ScopedUnref unref_me(ev); + // EV should not be initialized at this time. + if (ev->IsInitialized()) { + LOG(ERROR) << "Import parameter for EV (" << name_string + << ") failed, this EV has already been initialized."; + } + auto do_compute = [this, context, file_name_string, ev, name_string, done] () { BundleReader reader(Env::Default(), file_name_string); From 9a54aae7d5062330f4055c73401183b57650c7d2 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 28 Feb 2024 10:54:34 +0800 Subject: [PATCH 77/91] [Release] Update DeepRec release version to 1.15.5+deeprec2402. (#974) Signed-off-by: candy.dc --- tensorflow/tools/pip_package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index e8635e1a298..10132cab678 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -47,7 +47,7 @@ # result for pip. # Also update tensorflow/tensorflow.bzl and # tensorflow/core/public/version.h -_VERSION = '1.15.5+deeprec2310' +_VERSION = '1.15.5+deeprec2402' REQUIRED_PACKAGES = [ 'absl-py >= 0.9.0', From 8d4024406210dbcb0a99cc036606efcfa3671c3a Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 28 Feb 2024 16:57:59 +0800 Subject: [PATCH 78/91] [Docs] Update deeprec2402 release images and notes in README.md & RELEASE.md. 
(#975)

Signed-off-by: candy.dc
---
 README.md                                     |  4 +-
 RELEASE.md                                    | 44 +++++++++++++++++++
 docs/docs_en/DeepRec-Compile-And-Install.md   |  4 +-
 docs/docs_en/Estimator-Compile-And-Install.md |  2 +-
 docs/docs_en/TFServing-Compile-And-Install.md |  2 +-
 docs/docs_zh/DeepRec-Compile-And-Install.md   |  4 +-
 docs/docs_zh/Estimator-Compile-And-Install.md |  2 +-
 docs/docs_zh/TFServing-Compile-And-Install.md |  2 +-
 8 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 8f491e14665..b7d7b578c24 100644
--- a/README.md
+++ b/README.md
@@ -95,13 +95,13 @@ $ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux

 #### Image for CPU

 ```
-alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04
+alideeprec/deeprec-release:deeprec2402-cpu-py38-ubuntu20.04
 ```

 #### Image for GPU CUDA11.6

 ```
-alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04
+alideeprec/deeprec-release:deeprec2402-gpu-py38-cu116-ubuntu20.04
 ```

 ***

diff --git a/RELEASE.md b/RELEASE.md
index 6b7e4a7fd79..b095351d2a0 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,4 +1,48 @@
+# Release r1.15.5-deeprec2402
+
+## **Major Features and Improvements**
+
+### **Embedding**
+
+- Refine KVInterface::GetShardedSnapshot API.
+- Undefine EV GPU interface in CPU compile.
+- Make Embedding backward compatible with previous saved_model.
+- Log error when EV has been initialized in EV Import OP.
+
+### **Op Implementation**
+
+- Implement SliceSend/SliceRecv Ops.
+- Implement FileSliceSend/FileSliceRecv Ops.
+
+### **SDK**
+
+- Add SDK package build.
+
+### **BugFix**
+
+- Fix shared embedding frequency counting problem.
+- Fix compiling issue for graphs that contain EmbeddingVariable.
+- Fix a scheduling issue.
+- Fix tensor shape meta-data bug for DataFrame Value.
+
+### **ModelZoo**
+
+- Set Saver parameter sharded=True in distributed training (see the sketch below).
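For the ModelZoo item above, a minimal sketch of what setting sharded=True looks like in a training script; this is illustrative only (tf.train.Saver is the standard TF 1.15 API, not the ModelZoo code itself):

```python
import tensorflow as tf

v = tf.Variable(tf.zeros([10]), name='v')

# sharded=True lets each device that owns variables (typically each
# parameter server) write its own checkpoint shard, instead of funnelling
# every variable through a single worker at save time.
saver = tf.train.Saver(sharded=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.save(sess, '/tmp/model.ckpt', global_step=0)
```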
+ +More details of features: [https://deeprec.readthedocs.io/zh/latest/](url) + +## **Release Images** + +### **CPU Image** + +`alideeprec/deeprec-release:deeprec2402-cpu-py38-ubuntu20.04` + +### **GPU Image** + +`alideeprec/deeprec-release:deeprec2402-gpu-py38-cu116-ubuntu20.04` + # Release r1.15.5-deeprec2310 + ## **Major Features and Improvements** ### **Embedding** diff --git a/docs/docs_en/DeepRec-Compile-And-Install.md b/docs/docs_en/DeepRec-Compile-And-Install.md index fdf3e295fdd..379526e5b24 100644 --- a/docs/docs_en/DeepRec-Compile-And-Install.md +++ b/docs/docs_en/DeepRec-Compile-And-Install.md @@ -111,7 +111,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-cpu-py38-ubuntu20.04 ``` arm64: @@ -122,5 +122,5 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU Image with CUDA 11.6** ``` -alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-gpu-py38-cu116-ubuntu20.04 ``` diff --git a/docs/docs_en/Estimator-Compile-And-Install.md b/docs/docs_en/Estimator-Compile-And-Install.md index 55f759a3c2a..6305d739571 100644 --- a/docs/docs_en/Estimator-Compile-And-Install.md +++ b/docs/docs_en/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ DeepRec provide new distributed protocols such as grpc++ and star_server, which Source Code: [https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -Develop Branch:master, Latest Release Branch: deeprec2310 +Develop Branch:master, Latest Release Branch: deeprec2402 ## Estimator Build diff --git a/docs/docs_en/TFServing-Compile-And-Install.md b/docs/docs_en/TFServing-Compile-And-Install.md index 79a0944aa3e..ea70f397c98 100644 --- a/docs/docs_en/TFServing-Compile-And-Install.md +++ b/docs/docs_en/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ We provide optimized TFServing which could highly improve performance in inferen Source Code: [https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -Develop Branch: master, Latest Release Branch: deeprec2310 +Develop Branch: master, Latest Release Branch: deeprec2402 ## TFServing Build diff --git a/docs/docs_zh/DeepRec-Compile-And-Install.md b/docs/docs_zh/DeepRec-Compile-And-Install.md index ad8fd36dbf7..0c11dca394f 100644 --- a/docs/docs_zh/DeepRec-Compile-And-Install.md +++ b/docs/docs_zh/DeepRec-Compile-And-Install.md @@ -108,7 +108,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-cpu-py38-ubuntu20.04 ``` arm64: @@ -119,7 +119,7 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU CUDA11.6镜像** ``` -alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-gpu-py38-cu116-ubuntu20.04 ``` ## DeepRec Processor编译打包 diff --git a/docs/docs_zh/Estimator-Compile-And-Install.md b/docs/docs_zh/Estimator-Compile-And-Install.md index e54c8ddbd2f..eeb4f66dc99 100644 --- a/docs/docs_zh/Estimator-Compile-And-Install.md +++ b/docs/docs_zh/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ 代码库:[https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -开发分支:master,最新Release分支:deeprec2310 +开发分支:master,最新Release分支:deeprec2402 ## Estimator编译 diff --git a/docs/docs_zh/TFServing-Compile-And-Install.md 
b/docs/docs_zh/TFServing-Compile-And-Install.md
index a43d2d517a6..b0460934165 100644
--- a/docs/docs_zh/TFServing-Compile-And-Install.md
+++ b/docs/docs_zh/TFServing-Compile-And-Install.md
@@ -39,7 +39,7 @@
 代码库：[https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving)

-开发分支：master，最新Release分支：deeprec2310
+开发分支：master，最新Release分支：deeprec2402

 ## TFServing编译&打包

From 8b58f9b93e144fa2d6517d5d370dc0df4fd3644b Mon Sep 17 00:00:00 2001
From: Chen Ding
Date: Wed, 28 Feb 2024 17:18:29 +0800
Subject: [PATCH 79/91] [Dockerfile] Add DeepRec release image dockerfile.
 (#976)

Signed-off-by: candy.dc
---
 cibuild/dockerfiles/Dockerfile.release | 32 ++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 cibuild/dockerfiles/Dockerfile.release

diff --git a/cibuild/dockerfiles/Dockerfile.release b/cibuild/dockerfiles/Dockerfile.release
new file mode 100644
index 00000000000..77b013f840d
--- /dev/null
+++ b/cibuild/dockerfiles/Dockerfile.release
@@ -0,0 +1,32 @@
+# build DeepRec & estimator wheels
+FROM alideeprec/deeprec-base:deeprec-base-cpu-py38-ubuntu20.04 AS deeprec_build
+
+ARG TF_COMMIT=deeprec2402
+
+RUN mkdir -p /src
+RUN wget -nv -O /src/install_bazel.sh \
+    http://pythonrun.oss-cn-zhangjiakou.aliyuncs.com/bazel-0.26.1-installer-linux-x86_64.sh && \
+    bash /src/install_bazel.sh
+
+RUN git clone https://github.com/DeepRec-AI/DeepRec.git /src/DeepRec && \
+    cd /src/DeepRec && \
+    git checkout ${TF_COMMIT}
+RUN cd /src/DeepRec && \
+    yes "" | bash ./configure || true && \
+    bazel build -c opt --config=opt //tensorflow/tools/pip_package:build_pip_package && \
+    bazel-bin/tensorflow/tools/pip_package/build_pip_package /src/
+
+RUN pip install /src/tensorflow-1.15.5+${TF_COMMIT}-cp38-cp38-linux_x86_64.whl
+
+RUN git clone https://github.com/DeepRec-AI/estimator.git /src/estimator && \
+    cd /src/estimator && \
+    git checkout ${TF_COMMIT}
+RUN cd /src/estimator && \
+    bazel build //tensorflow_estimator/tools/pip_package:build_pip_package && \
+    bazel-bin/tensorflow_estimator/tools/pip_package/build_pip_package /src/
+
+# build DeepRec release image
+FROM alideeprec/deeprec-base:deeprec-base-cpu-py38-ubuntu20.04
+COPY --from=deeprec_build /src/*.whl /
+RUN pip install /tensorflow-1.15.5+${TF_COMMIT}-cp38-cp38-linux_x86_64.whl /tensorflow_estimator-1.15.2+${TF_COMMIT}-py2.py3-none-any.whl
+RUN rm -f /tensorflow-1.15.5+${TF_COMMIT}-cp38-cp38-linux_x86_64.whl /tensorflow_estimator-1.15.2+${TF_COMMIT}-py2.py3-none-any.whl

From 186afd0479bb43c629cafa808be70b7f5ac33d83 Mon Sep 17 00:00:00 2001
From: Chen Ding
Date: Thu, 29 Feb 2024 10:10:38 +0800
Subject: [PATCH 80/91] [Serving] Fix syntax error in generate timeline tool.
 (#977)

Signed-off-by: candy.dc
---
 serving/tools/timeline/gen_timeline.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/serving/tools/timeline/gen_timeline.py b/serving/tools/timeline/gen_timeline.py
index f055e473fa0..d56c1b39897 100644
--- a/serving/tools/timeline/gen_timeline.py
+++ b/serving/tools/timeline/gen_timeline.py
@@ -1,6 +1,6 @@
 import sys
-import config_pb2
-import timeline
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import timeline

 def gen_timeline(src_name, dest_name):
     run_metadata = config_pb2.RunMetadata()

From 6dae552cb40e954cce59e125977f141c6a926ada Mon Sep 17 00:00:00 2001
From: Chen Bangduo
Date: Thu, 7 Mar 2024 14:35:36 +0800
Subject: [PATCH 81/91] [Embedding] Refine header file of embedding variable.
(#978) Signed-off-by: chenbangduo.cbd --- tensorflow/core/framework/embedding/embedding_var.h | 1 - tensorflow/core/kernels/kv_variable_ops.cc | 1 + tensorflow/core/kernels/kv_variable_restore_ops.cc | 1 + tensorflow/core/kernels/training_ali_ops.cc | 8 ++++---- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index c0d26a2f4d8..81941bc9ff9 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -34,7 +34,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/gpu_hash_map_kv.h" #include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/storage.h" -#include "tensorflow/core/framework/embedding/storage_factory.h" #include "tensorflow/core/framework/typed_allocator.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/kv_variable_ops.cc b/tensorflow/core/kernels/kv_variable_ops.cc index 5cd0ef140bd..b7567ffe924 100644 --- a/tensorflow/core/kernels/kv_variable_ops.cc +++ b/tensorflow/core/kernels/kv_variable_ops.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/config.pb.h" #include "tensorflow/core/framework/embedding/embedding_var.h" +#include "tensorflow/core/framework/embedding/storage_factory.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/resource_mgr.h" diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index 2eccf485ef8..e16db9b4cd6 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/config.pb.h" #include "tensorflow/core/framework/embedding/embedding_var.h" +#include "tensorflow/core/framework/embedding/storage_factory.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/resource_mgr.h" diff --git a/tensorflow/core/kernels/training_ali_ops.cc b/tensorflow/core/kernels/training_ali_ops.cc index 546b30e29dd..fc21ab610cf 100644 --- a/tensorflow/core/kernels/training_ali_ops.cc +++ b/tensorflow/core/kernels/training_ali_ops.cc @@ -236,7 +236,7 @@ class KvSparseApplyAdagradGPUOp : public OpKernel { T** dev_a = dev_v + task_size; CHECK(dev_a); CHECK(dev_v); - DeviceMemoryBase dev_v_ptr(dev_v, sizeof(T*) * task_size * 2); + se::DeviceMemoryBase dev_v_ptr(dev_v, sizeof(T*) * task_size * 2); stream->ThenMemcpy(&dev_v_ptr, v, sizeof(T*) * task_size * 2); int block_size = 128; @@ -1606,7 +1606,7 @@ class KvSparseApplyAdamGPUOp : public OpKernel { CHECK(dev_m_ptr); CHECK(dev_v_ptr); - DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 3); int block_size = 128; @@ -2579,7 +2579,7 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { CHECK(dev_m_ptr); CHECK(dev_v_ptr); - DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 3); int block_size = 128; @@ -3236,7 +3236,7 @@ class KvSparseApplyAdamWGPUOp : public OpKernel { CHECK(dev_m_ptr); CHECK(dev_v_ptr); - DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 3); int block_size = 128; From cf16856d01551c9d1cb005722d7f62a448df7095 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Tue, 26 Mar 2024 17:15:18 +0800 Subject: [PATCH 82/91] [Incremental Checkpoint] Fix import incremental embedding variable. 
(#983) Signed-off-by: chenbangduo.cbd --- .../embedding/embedding_var_restore.cc | 50 +++++++++-------- tensorflow/python/training/incr_ckpt_test.py | 54 +++++++++++++++++++ 2 files changed, 82 insertions(+), 22 deletions(-) diff --git a/tensorflow/core/framework/embedding/embedding_var_restore.cc b/tensorflow/core/framework/embedding/embedding_var_restore.cc index 11c13008995..6ff07bf7e43 100644 --- a/tensorflow/core/framework/embedding/embedding_var_restore.cc +++ b/tensorflow/core/framework/embedding/embedding_var_restore.cc @@ -102,45 +102,48 @@ void CheckpointLoader::RestoreInternal( Tensor part_filter_offset_tensor; if (!restore_args_.m_is_oldform) { /****** InitPartOffsetTensor ******/ - TensorShape part_offset_shape, part_filter_offset_shape; - DataType part_offset_type, part_filter_offset_type; + TensorShape part_offset_shape; + DataType part_offset_type; string offset_tensor_name; if (!restore_args_.m_is_incr) { offset_tensor_name = name_string + kPartOffsetTensorSuffsix; } else { offset_tensor_name = name_string + kIncrPartOffsetTensorSuffsix; } - - string offset_filter_tensor_name = - name_string + kPartFilterOffsetTensorSuffsix; + Status s = reader_->LookupDtypeAndShape( offset_tensor_name, &part_offset_type, &part_offset_shape); if (!s.ok()) { LOG(ERROR) << "EV restoring fail:" << s.error_message(); } - s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, - &part_filter_offset_type, - &part_filter_offset_shape); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail: " << s.error_message(); - } part_offset_tensor = Tensor(cpu_allocator(), part_offset_type, part_offset_shape); - part_filter_offset_tensor = Tensor( - cpu_allocator(), part_filter_offset_type, part_filter_offset_shape); s = reader_->Lookup(offset_tensor_name, &part_offset_tensor); if (!s.ok()) { LOG(ERROR) << "EV restoring fail:" << s.error_message(); } - s = reader_->Lookup(offset_filter_tensor_name, - &part_filter_offset_tensor); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail: " << s.error_message(); + if (restore_args_.m_has_filter) { + TensorShape part_filter_offset_shape; + DataType part_filter_offset_type; + string offset_filter_tensor_name = + name_string + kPartFilterOffsetTensorSuffsix; + s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, + &part_filter_offset_type, + &part_filter_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } + part_filter_offset_tensor = \ + Tensor(cpu_allocator(), part_filter_offset_type, + part_filter_offset_shape); + s = reader_->Lookup(offset_filter_tensor_name, + &part_filter_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } } } - auto part_offset_flat = part_offset_tensor.flat(); - auto part_filter_offset_flat = part_filter_offset_tensor.flat(); if (restore_args_.m_is_oldform) { VLOG(1) << "old form, EV name:" << name_string @@ -164,6 +167,7 @@ void CheckpointLoader::RestoreInternal( VLOG(1) << "new form checkpoint... 
:" << name_string << " , partition_id:" << restore_args_.m_partition_id << " , partition_num:" << restore_args_.m_partition_num; + auto part_offset_flat = part_offset_tensor.flat(); for (size_t i = 0; i < restore_args_.m_loaded_parts.size(); i++) { int subpart_id = restore_args_.m_loaded_parts[i]; size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; @@ -183,6 +187,7 @@ void CheckpointLoader::RestoreInternal( new_dim, emb_config, device); if (restore_args_.m_has_filter) { + auto part_filter_offset_flat = part_filter_offset_tensor.flat(); Status s = EVRestoreFilteredFeatures( subpart_id, new_dim, restore_buff, part_filter_offset_flat, emb_config, device); @@ -444,7 +449,7 @@ Status CheckpointLoader::EVInitTensorNameAndShape( } st = reader_->LookupHeader(restore_args_.m_tensor_version + "_filtered", sizeof(K) * version_filter_shape.dim_size(0)); - if (!st.ok()) { + if (!st.ok() && st.code() != error::NOT_FOUND) { return st; } st = reader_->LookupTensorShape(restore_args_.m_tensor_freq + "_filtered", @@ -463,7 +468,8 @@ Status CheckpointLoader::EVInitTensorNameAndShape( return st; } } - return st; + + return Status::OK(); } #define REGISTER_KERNELS(ktype, vtype) \ template Status CheckpointLoader::EVInitTensorNameAndShape(\ @@ -644,4 +650,4 @@ TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) #undef REGISTER_KERNELS_ALL_INDEX #undef REGISTER_KERNELS -}// namespace tensorflow \ No newline at end of file +}// namespace tensorflow diff --git a/tensorflow/python/training/incr_ckpt_test.py b/tensorflow/python/training/incr_ckpt_test.py index b4f7ded3cea..55cf748a9d6 100644 --- a/tensorflow/python/training/incr_ckpt_test.py +++ b/tensorflow/python/training/incr_ckpt_test.py @@ -451,5 +451,59 @@ def testIncrementalSaverForResourceVariable(self): saver.build() incr_saver = incr_saver_module._get_incremental_saver(True, saver) + def testIncrementalSaverSaveAndRestore(self): + tmp_path = self.get_temp_dir() + full_ckpt_dir = os.path.join(tmp_path, "model.ckpt") + incr_ckpt_dir = os.path.join(tmp_path, "incr.ckpt") + full_ckpt_path = None + incr_ckpt_path = None + + # construct graph + emb_var = variable_scope.get_embedding_variable("emb", embedding_dim=3, + initializer = init_ops.ones_initializer(dtypes.float32)) + emb = embedding_ops.embedding_lookup(emb_var, + math_ops.cast([0, 1, 2, 3, 4], dtypes.int64)) + loss = math_ops.reduce_sum(emb, name = 'reduce_sum') + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + saver = saver_module.Saver(sharded=True, incremental_save_restore=True) + incr_saver = \ + incr_saver_module.IncrementalSaver(sharded=True, + saver_def=saver.saver_def, defer_build=True) + incr_saver.build(saver._builder.filename_tensor) + + # generate full ckpt and incr ckpt. + full_ckpt_value=None + incr_ckpt_value=None + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + sess.run([train_op]) + full_ckpt_path = saver.save(sess, full_ckpt_dir, global_step = 10) + full_ckpt_value = sess.run([emb]) + print("full_ckpt: {}".format(full_ckpt_value)) + sess.run([train_op]) + incr_ckpt_path = \ + incr_saver.incremental_save(sess, incr_ckpt_dir, global_step=20) + incr_ckpt_value = sess.run([emb]) + print("incr_ckpt: {}".format(incr_ckpt_value)) + + # check the value after restoring parameter. 
+ with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + saver.restore(sess, full_ckpt_path) + restore_full_ckpt_value = sess.run([emb]) + print("restore_full_ckpt: {}".format(restore_full_ckpt_value)) + incr_saver.incremental_restore(sess, full_ckpt_path, incr_ckpt_path) + restore_incr_ckpt_value = sess.run([emb]) + print("restore_incr_ckpt: {}".format(restore_incr_ckpt_value)) + self.assertAllClose(full_ckpt_value, restore_full_ckpt_value) + self.assertAllClose(incr_ckpt_value, restore_incr_ckpt_value) + if __name__ == "__main__": googletest.main() From d5f7f6ad77a59b70679835009dbe31add175dba3 Mon Sep 17 00:00:00 2001 From: "Secret.Sun" Date: Wed, 10 Apr 2024 14:41:50 +0800 Subject: [PATCH 83/91] [Runtime] Remove read limit of ReadBinaryProto. (#981) Signed-off-by: Secret.Sun --- tensorflow/core/platform/env.cc | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc index ac91b79a07f..b835677627a 100644 --- a/tensorflow/core/platform/env.cc +++ b/tensorflow/core/platform/env.cc @@ -508,14 +508,7 @@ Status ReadBinaryProto(Env* env, const string& fname, TF_RETURN_IF_ERROR(env->NewRandomAccessFile(fname, &file)); std::unique_ptr stream(new FileStream(file.get())); - // TODO(jiayq): the following coded stream is for debugging purposes to allow - // one to parse arbitrarily large messages for MessageLite. One most likely - // doesn't want to put protobufs larger than 64MB on Android, so we should - // eventually remove this and quit loud when a large protobuf is passed in. ::tensorflow::protobuf::io::CodedInputStream coded_stream(stream.get()); - // Total bytes hard limit / warning limit are set to 1GB and 512MB - // respectively. - coded_stream.SetTotalBytesLimit(1024LL << 20, 512LL << 20); if (!proto->ParseFromCodedStream(&coded_stream) || !coded_stream.ConsumedEntireMessage()) { From a4489e31a4b9bc8371198537a0a15af6011ef8ae Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Fri, 12 Apr 2024 14:22:32 +0800 Subject: [PATCH 84/91] [EVAllocator] Fix the bug in configuring ARENA_ARRAY_SIZE. 
(#986) Signed-off-by: chenbangduo.cbd --- tensorflow/core/framework/ev_allocator.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/framework/ev_allocator.h b/tensorflow/core/framework/ev_allocator.h index d3251b14782..5082ee04b72 100644 --- a/tensorflow/core/framework/ev_allocator.h +++ b/tensorflow/core/framework/ev_allocator.h @@ -546,15 +546,15 @@ class EVAllocatorImpl { page_map_ = new PageMap(); page_map_->Init(); - int64 arena_array_size = ARENA_ARRAY_SIZE; + arena_array_size_ = ARENA_ARRAY_SIZE; Status s = ReadInt64FromEnvVar("ARENA_ARRAY_SIZE", - ARENA_ARRAY_SIZE, &arena_array_size); + ARENA_ARRAY_SIZE, &arena_array_size_); if (!s.ok()) { LOG(ERROR) << "Read ARENA_ARRAY_SIZE env error: " << s.error_message(); } - LOG(INFO) << "EVAllocator set arena array size: " << arena_array_size; + LOG(INFO) << "EVAllocator set arena array size: " << arena_array_size_; - arenas_ = new std::vector>(arena_array_size, page_map_); + arenas_ = new std::vector>(arena_array_size_, page_map_); arena_cur_index = 0; } @@ -602,7 +602,7 @@ class EVAllocatorImpl { { mutex_lock l(mu_arena_index_); ret = &((*arenas_)[arena_cur_index]); - arena_cur_index = (arena_cur_index + 1) % ARENA_ARRAY_SIZE; + arena_cur_index = (arena_cur_index + 1) % arena_array_size_; } return ret; @@ -619,6 +619,7 @@ class EVAllocatorImpl { PageMap* page_map_ = nullptr; std::vector> *arenas_ = nullptr; int arena_cur_index GUARDED_BY(mu_arena_index_); + int64 arena_array_size_; }; template From 04413cf0ee6ca57f35446095c4e27bc1cfdf2b0d Mon Sep 17 00:00:00 2001 From: Chaofeng Guo Date: Thu, 18 Apr 2024 19:56:17 +0800 Subject: [PATCH 85/91] [Embedding] Fix the issue of default_value type mismatch in the EV Gather op. (#989) Signed-off-by: Lyaction --- tensorflow/python/ops/kv_variable_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 840aadf2541..55e01537c0d 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -858,10 +858,10 @@ def sparse_read(self, indices, name=None, ev_init_value=None, counts=None): if self._trainable: tape.variable_accessed(self) if ev_init_value is not None: - default_value = ev_init_value + default_value = math_ops.cast(ev_init_value, self.dtype) is_use_default_value_tensor = True else: - default_value = ops.convert_to_tensor(1.0) + default_value = ops.convert_to_tensor(1.0, dtype=self.dtype) is_use_default_value_tensor = False if counts != None: value = gen_kv_variable_ops.kv_resource_gather_v1(self._handle, From fc08e1b605490e818cdf80bc2389b68028c19049 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Fri, 26 Apr 2024 11:33:59 +0800 Subject: [PATCH 86/91] [Hook] Add 'before_create_session' interface to SessionRunHook. (#991) Signed-off-by: chenbangduo.cbd --- tensorflow/python/training/monitored_session.py | 3 +++ tensorflow/python/training/session_run_hook.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py index 6eb204785dd..9492028a200 100644 --- a/tensorflow/python/training/monitored_session.py +++ b/tensorflow/python/training/monitored_session.py @@ -957,6 +957,8 @@ def __init__(self, session_creator, hooks, stop_grace_period_secs): def create_session(self): """Creates a coordinated session.""" # Keep the tf_sess for unit testing. 
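+    # Notify hooks before the underlying session is (re)created. Unlike
+    # after_create_session, this also fires when a wrapped session is
+    # recovered, so hooks must not add ops to the graph here.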
+ for hook in self._hooks: + hook.before_create_session() self.tf_sess = self._session_creator.create_session() # We don't want coordinator to suppress any exception. self.coord = coordinator.Coordinator(clean_stop_exception_types=[]) @@ -1027,6 +1029,7 @@ class MonitoredSession(_MonitoredSession): in given order: * calls `hook.begin()` for each given hook + * calls `hook.before_create_session()` * finalizes the graph via `scaffold.finalize()` * create session * initializes the model via initialization ops provided by `Scaffold` diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py index e598bc2d98c..9d05d04c139 100644 --- a/tensorflow/python/training/session_run_hook.py +++ b/tensorflow/python/training/session_run_hook.py @@ -109,6 +109,20 @@ def begin(self): """ pass + def before_create_session(self): + """Called before new TensorFlow session is created. + + This has two essential differences with the situation in which `begin` is + called: + + * Do not modify the graph in this method, ops should not be added to graph. + The modification of the graph should take place within the begin + interface. + * This method will also be called prior to the recovery of a wrapped + session, not just at the beginning of the overall session. + """ + pass + def after_create_session(self, session, coord): # pylint: disable=unused-argument """Called when new TensorFlow session is created. From e10d4411dfb93ca47f6e1908ac878d1417c7db58 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Mon, 29 Apr 2024 17:18:35 +0800 Subject: [PATCH 87/91] [Docs] Fix readthedoc build fail. (#993) - Add configure file: docs/docs_zh/.readthedocs.yaml docs/docs_en/.readthedocs.yaml Signed-off-by: Chen Ding --- docs/docs_en/.readthedocs.yaml | 35 ++++++++++++++++++++++++++++++++++ docs/docs_zh/.readthedocs.yaml | 35 ++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 docs/docs_en/.readthedocs.yaml create mode 100644 docs/docs_zh/.readthedocs.yaml diff --git a/docs/docs_en/.readthedocs.yaml b/docs/docs_en/.readthedocs.yaml new file mode 100644 index 00000000000..c69bbd13812 --- /dev/null +++ b/docs/docs_en/.readthedocs.yaml @@ -0,0 +1,35 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/docs_en/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/docs_en/requirements.txt diff --git a/docs/docs_zh/.readthedocs.yaml b/docs/docs_zh/.readthedocs.yaml new file mode 100644 index 00000000000..859db8adfa5 --- /dev/null +++ b/docs/docs_zh/.readthedocs.yaml @@ -0,0 +1,35 @@ +# Read the Docs configuration file for Sphinx projects +# See 
https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/docs_zh/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/docs_zh/requirements.txt From b2aed9686182124fca72f8093e74136cc13dcd39 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Tue, 14 May 2024 10:43:13 +0800 Subject: [PATCH 88/91] [Embedding] Change the log level for EV restore. (#995) Signed-off-by: chenbangduo.cbd --- tensorflow/core/kernels/kv_variable_restore_ops.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index e16db9b4cd6..0a0165595f0 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -376,8 +376,8 @@ class KvResourceImportV3Op: public AsyncOpKernel { // EV should not be initialized at this time. if (ev->IsInitialized()) { - LOG(ERROR) << "Import parameter for EV (" << name_string - << ") failed, this EV has already been initialized."; + LOG(WARNING) << "EV (" << name_string + << ") has already been initialized."; } auto do_compute = [this, context, file_name_string, ev, From 93c69ad9576d6ee0f7b9479bef9b091451e5b91a Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Tue, 21 May 2024 19:26:07 +0800 Subject: [PATCH 89/91] [Rendezvous] RemoteRendezvous supports FlowControl. 
(#994) Signed-off-by: chenbangduo.cbd --- .../base_rendezvous_mgr.cc | 213 ++++++++++++++- .../distributed_runtime/base_rendezvous_mgr.h | 45 ++++ .../rendezvous_mgr_interface.h | 11 +- .../rpc/grpc_remote_worker.cc | 10 + .../rpc/grpc_worker_interface.h | 6 + .../rpc/grpc_worker_service.cc | 162 ++++++++++++ .../rpc/grpc_worker_service.h | 4 + .../rpc/grpc_worker_service_impl.cc | 2 + .../rpc/grpc_worker_service_impl.h | 1 + .../rpc/rpc_rendezvous_mgr.cc | 245 ++++++++++++++++++ .../rpc/rpc_rendezvous_mgr_test.cc | 26 ++ tensorflow/core/framework/rendezvous.cc | 41 +++ tensorflow/core/framework/rendezvous.h | 26 ++ .../core/kernels/file_slice_sendrecv_ops.cc | 20 +- .../core/kernels/file_slice_sendrecv_ops.h | 2 + .../kernels/file_slice_sendrecv_ops_test.cc | 13 + tensorflow/core/kernels/slice_sendrecv_ops.cc | 40 +-- tensorflow/core/kernels/slice_sendrecv_ops.h | 2 + .../core/kernels/slice_sendrecv_ops_test.cc | 13 + tensorflow/core/protobuf/worker.proto | 46 ++++ tensorflow/core/protobuf/worker_service.proto | 5 + 21 files changed, 903 insertions(+), 30 deletions(-) diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc index 17935eb8982..ead121b30c8 100644 --- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc @@ -34,11 +34,13 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/env_var.h" namespace tensorflow { namespace { uint64 kGlobalStepId = 0x100000000000000uLL; + int64 kFlowControlMaxSize = 16; } // namespace anonymous static void StartAbortRendevous(Rendezvous* rendez, const Status& s) { @@ -127,6 +129,23 @@ void BaseRendezvousMgr::FuseRecvLocalAsync( rendez->FuseRecvLocalAsync(parsed_keys, std::move(done_cb)); } +void BaseRendezvousMgr::FlowControlRecvLocalAsync(int64 step_id, + const StringPiece& tag, const Rendezvous::ParsedKey& parsed, + Rendezvous::DoneCallback done) { + auto rendez = FindOrCreate(step_id); + using namespace std::placeholders; + Rendezvous::DoneCallback done_cb = std::bind( + [rendez](Rendezvous::DoneCallback done, + // Begin unbound arguments. 
+ const Status& s, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& v, bool dead) { + rendez->Unref(); + done(s, send_args, recv_args, v, dead); + }, + std::move(done), _1, _2, _3, _4, _5); + rendez->FlowControlRecvLocalAsync(tag, parsed, std::move(done_cb)); +} + void BaseRendezvousMgr::Cleanup(int64 step_id) { Rendezvous* rendez = nullptr; { @@ -174,7 +193,17 @@ BaseRemoteRendezvous::BaseRemoteRendezvous(const WorkerEnv* env, int64 step_id) : env_(env), step_id_(step_id), local_(NewLocalRendezvous()), - session_(nullptr) {} + session_(nullptr), + flow_control_num_(0) { + Status s = ReadInt64FromEnvVar("REMOTE_RENDEZVOUS_FLOW_CONTROL_MAX_SIZE", + kFlowControlMaxSize, &flow_control_max_size_); + if (!s.ok()) { + LOG(ERROR) << "Read REMOTE_RENDEZVOUS_FLOW_CONTROL_MAX_SIZE env error: " + << s.error_message(); + } + VLOG(2) << "BaseRemoteRendezvous set flow control max size: " + << flow_control_max_size_; +} BaseRemoteRendezvous::~BaseRemoteRendezvous() { CHECK(active_.empty()); @@ -221,6 +250,16 @@ Status BaseRemoteRendezvous::Initialize(WorkerSession* session) { std::move(fuse_call.done)); } + std::vector deferred_flow_control_calls; + { + mutex_lock l(mu_); + std::swap(deferred_flow_control_calls, deferred_flow_control_calls_); + } + for (auto& fc_call : deferred_flow_control_calls) { + FlowControlRecvLocalAsyncInternal(fc_call.tag, fc_call.parsed, + std::move(fc_call.done)); + } + return Status::OK(); } @@ -271,6 +310,43 @@ Status BaseRemoteRendezvous::Send(const ParsedKey& parsed, return local_->Send(parsed, args, val, mu, is_dead); } +Status BaseRemoteRendezvous::FlowControlSend(const StringPiece& tag, + const ParsedKey& parsed, + const Args& args, + const Tensor& val, + const bool is_dead, + const int64 timeout_millis) { + VLOG(1) << "BaseRemoteRendezvous FlowControlSend " << this << " " + << parsed.FullKey(); + const std::string tag_string(tag.data(), tag.size()); + { + mutex_lock l(mu_); + while(status_.ok() && flow_control_num_ >= flow_control_max_size_) { + if (flow_control_cv_.wait_for( + l, std::chrono::milliseconds(timeout_millis)) == \ + std::cv_status::timeout) { + return errors::DeadlineExceeded("FlowControlSend has timed out."); + } + } + + if (!status_.ok()) return status_; + DCHECK(is_initialized_locked()); + if (!IsLocalDevice(session_->worker_name, parsed.src_device)) { + return errors::InvalidArgument( + "Invalid rendezvous key (src): ", parsed.FullKey(), " @ ", + session_->worker_name); + } + + flow_control_num_++; + if (flow_control_counters_.count(tag_string) == 0) { + flow_control_counters_[tag_string] = 0; + } + flow_control_counters_[tag_string]++; + } + // Buffers "val" and "device_context" in local_. 
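+  // Both the rendezvous-wide total and the per-tag counter were bumped
+  // under mu_ above; the matching flow-control recv path decrements them
+  // and notifies flow_control_cv_, which is what unblocks a sender waiting
+  // in the loop at the top of this method.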
+ return local_->Send(parsed, args, val, is_dead); +} + Status BaseRemoteRendezvous::ValidateDevices(const ParsedKey& parsed, bool is_src) { // Cache session pointer to avoid repeatedly taking & releasing the lock @@ -413,6 +489,63 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed, } } +void BaseRemoteRendezvous::FlowControlRecvAsync(const StringPiece& tag, + const ParsedKey& parsed, + const Args& recv_args, + DoneCallback done) { + VLOG(1) << "RemoteRendezvous FlowControlRecvAsync " << this + << " " << tag << " " << parsed.FullKey(); + + Status s = ValidateDevices(parsed, false /*!is_src*/); + if (s.ok() && !is_initialized()) { + s.Update(errors::Internal( + "FlowControlRecvAsync called when uninitialized (key:", + parsed.FullKey(), ").")); + } + if (!s.ok()) { + done(s, Args(), recv_args, Tensor(), false); + return; + } + + // Are src and dst in the same worker? + if (IsSameWorker(parsed.src, parsed.dst)) { + // Recv the tensor from local_. + local_->RecvAsync( + parsed, recv_args, + [this, tag, parsed, done]( + const Status& status, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& in, bool is_dead) { + VLOG(2) << "RemoteRendezvous Finished Recv " << this << " " + << parsed.FullKey(); + Tensor* out = new Tensor; + StatusCallback final_callback = [done, send_args, recv_args, out, + is_dead](const Status& s) { + done(s, send_args, recv_args, *out, is_dead); + delete out; + }; + + if (status.ok()) { + SameWorkerRecvDone(parsed, send_args, recv_args, in, out, + std::move(final_callback)); + const std::string tag_string(tag.data(), tag.size()); + { + mutex_lock l(mu_); + flow_control_num_--; + DCHECK(flow_control_counters_.count(tag_string) != 0); + flow_control_counters_[tag_string]--; + } + flow_control_cv_.notify_one(); + } else { + final_callback(status); + } + }); + return; + } else { + FlowControlRecvFromRemoteAsync(tag, parsed, recv_args, std::move(done)); + } + +} + void BaseRemoteRendezvous::RecvLocalAsync(const ParsedKey& parsed, DoneCallback done) { { @@ -600,6 +733,58 @@ void BaseRemoteRendezvous::FuseRecvLocalAsyncInternal( } } +void BaseRemoteRendezvous::FlowControlRecvLocalAsync(const StringPiece& tag, + const ParsedKey& parsed, + DoneCallback done) { + { + mutex_lock l(mu_); + if (!is_initialized_locked()) { + // FlowControlRecvLocalAsync can be called (due to an incoming RecvTensor + // RPC from a remote worker) before the RunStep (or PartialRunStep) RPC + // from the master arrives. RecvLocalAsync thus buffers the arguments + // until after the RemoteRendezvous is Initialize()'d, when it completes + // the rendezvous logic. At some point after Initialize() is called, a + // Tensor is produced locally that will then be sent in response to the + // incoming RPC. + DeferredFlowControlCall call(tag, parsed, std::move(done)); + deferred_flow_control_calls_.push_back(call); + return; + } + } + FlowControlRecvLocalAsyncInternal(tag, parsed, std::move(done)); +} + +void BaseRemoteRendezvous::FlowControlRecvLocalAsyncInternal( + const StringPiece& tag, const ParsedKey& parsed, DoneCallback done) { + Status s = ValidateDevices(parsed, true /* is_src */); + if (!s.ok()) { + done(s, Args(), Args(), Tensor(), false); + return; + } + + using namespace std::placeholders; + Rendezvous::DoneCallback done_cb = std::bind( + [this, tag](Rendezvous::DoneCallback done, + // Begin unbound arguments. 
+ const Status& s, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& v, bool dead) { + done(s, send_args, recv_args, v, dead); + if (s.ok()) { + const std::string tag_string(tag.data(), tag.size()); + { + mutex_lock l(mu_); + flow_control_num_--; + DCHECK(flow_control_counters_.count(tag_string) != 0); + flow_control_counters_[tag_string]--; + } + flow_control_cv_.notify_one(); + } + }, + std::move(done), _1, _2, _3, _4, _5); + + local_->RecvAsync(parsed, Args(), std::move(done_cb)); +} + void BaseRemoteRendezvous::FuseRecvFromRemoteAsync( const std::vector& parsed_keys, const Rendezvous::Args& args, @@ -607,6 +792,12 @@ void BaseRemoteRendezvous::FuseRecvFromRemoteAsync( CHECK(false) << "FuseRecvFromRemoteAsync Unimplemented"; } +void BaseRemoteRendezvous::FlowControlRecvFromRemoteAsync( + const StringPiece& tag, const Rendezvous::ParsedKey& parsed, + const Rendezvous::Args& args, DoneCallback done) { + CHECK(false) << "FlowControlRecvFromRemoteAsync Unimplemented."; +} + void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed, const Rendezvous::Args& recv_args, RefDoneCallback done) { @@ -636,6 +827,19 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed, } } +int64 BaseRemoteRendezvous::GetAllFlowControlItemNum() { + mutex_lock l(mu_); + return flow_control_num_; +} + +int64 BaseRemoteRendezvous::GetFlowControlItemNum(StringPiece tag) { + const std::string tag_string(tag.data(), tag.size()); + mutex_lock l(mu_); + if (flow_control_counters_.count(tag_string) == 0) + return 0; + return flow_control_counters_[tag_string]; +} + void BaseRemoteRendezvous::StartAbort(const Status& s) { CHECK(!s.ok()); // Use a "derived" status as the status for the rendezvous. Derived @@ -656,7 +860,10 @@ void BaseRemoteRendezvous::StartAbort(const Status& s) { } active_.clear(); } + flow_control_num_ = 0; + flow_control_counters_.clear(); } + flow_control_cv_.notify_all(); } void BaseRemoteRendezvous::RegisterCall(BaseRecvTensorCall* call, @@ -707,4 +914,8 @@ BaseRemoteRendezvous::DeferredFuseCall::DeferredFuseCall( const std::vector& parsed_keys, FuseDoneCallback done) : parsed_keys(parsed_keys), done(std::move(done)) {} +BaseRemoteRendezvous::DeferredFlowControlCall::DeferredFlowControlCall( + const StringPiece& tag, const ParsedKey& parsed, DoneCallback done) + : tag(tag), parsed(parsed), done(std::move(done)) {} + } // end namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h index b65e59436c0..fc72d9bedfc 100644 --- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h +++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_BASE_RENDEZVOUS_MGR_H_ #include +#include #include #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h" @@ -86,6 +87,10 @@ class BaseRendezvousMgr : public RendezvousMgrInterface { const std::vector& parsed_keys, Rendezvous::FuseDoneCallback done) override; + void FlowControlRecvLocalAsync(int64 step_id, const StringPiece& tag, + const Rendezvous::ParsedKey& parsed, + Rendezvous::DoneCallback done) override; + // Removes rendezvous for "step_id". 
// // TODO(zhifengc): Have a background thread in worker that @@ -140,6 +145,11 @@ class BaseRemoteRendezvous : public RemoteRendezvous { Status Send(const ParsedKey& key, const Rendezvous::Args& args, Tensor* val, mutex* mu, const bool is_dead) override; + Status FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead, + const int64 timeout_millis) override; + // This method is called only by the RecvOp. It tests to see // whether the value will be produced by a local or remote device // and handles accordingly. In the local case it forwards to @@ -147,6 +157,10 @@ class BaseRemoteRendezvous : public RemoteRendezvous { void RecvAsync(const ParsedKey& key, const Rendezvous::Args& args, DoneCallback done) override; + void FlowControlRecvAsync(const StringPiece& tag, + const ParsedKey& parsed_key, + const Args& args, DoneCallback done) override; + void StartAbort(const Status& status) override; // This method is called only by the local Worker, forwarded through @@ -171,10 +185,18 @@ class BaseRemoteRendezvous : public RemoteRendezvous { void FuseRecvLocalSync(const std::vector& parsed_keys, FuseDoneCallback done); + void FlowControlRecvLocalAsync(const StringPiece& tag, + const ParsedKey& parsed, DoneCallback done); + // For ref send/recv void RecvAsync(const ParsedKey& key, const Rendezvous::Args& args, RefDoneCallback done) override; + // Obtain statistical information + int64 GetAllFlowControlItemNum() override; + + int64 GetFlowControlItemNum(StringPiece tag) override; + protected: virtual void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& args, @@ -185,6 +207,10 @@ class BaseRemoteRendezvous : public RemoteRendezvous { const Rendezvous::Args& args, FuseDoneCallback done); + virtual void FlowControlRecvFromRemoteAsync(const StringPiece& tag, + const Rendezvous::ParsedKey& parsed, + const Rendezvous::Args& args, DoneCallback done); + // Returns true if "src" and "dst" are located in the same worker, // and hence may use a local rendezvous. virtual bool IsSameWorker(DeviceNameUtils::ParsedName src, @@ -210,6 +236,12 @@ class BaseRemoteRendezvous : public RemoteRendezvous { mutable mutex mu_; + // For Flow Control. + int64 flow_control_max_size_; + int64 flow_control_num_ GUARDED_BY(mu_); + std::unordered_map flow_control_counters_ GUARDED_BY(mu_); + tensorflow::condition_variable flow_control_cv_; + // Status given by StartAbort() if any. Status status_ GUARDED_BY(mu_); WorkerSession* session_ GUARDED_BY(mu_); // Not owned. @@ -233,6 +265,16 @@ class BaseRemoteRendezvous : public RemoteRendezvous { }; std::vector deferred_fuse_calls_ GUARDED_BY(mu_); + struct DeferredFlowControlCall { + const StringPiece tag; + const ParsedKey parsed; + DoneCallback done; + + DeferredFlowControlCall(const StringPiece& tag, const ParsedKey& parsed, + DoneCallback done); + }; + std::vector deferred_flow_control_calls_ GUARDED_BY(mu_); + typedef std::function InactiveCallback; // Active outstanding RecvTensor calls. 
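[Editor's note] The flow-control members declared above implement a small bounded-buffer guard: senders block on flow_control_cv_ while flow_control_num_ has reached flow_control_max_size_, and every completed flow-control recv decrements the counters and signals. A minimal standalone sketch of the same pattern follows; FlowGate, Acquire and Release are illustrative names, not part of this patch.

#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <mutex>

class FlowGate {
 public:
  explicit FlowGate(int64_t max_in_flight) : max_(max_in_flight) {}

  // Send path: wait for a free slot or time out, mirroring the
  // DeadlineExceeded branch of FlowControlSend.
  bool Acquire(std::chrono::milliseconds timeout) {
    std::unique_lock<std::mutex> l(mu_);
    if (!cv_.wait_for(l, timeout, [this] { return in_flight_ < max_; })) {
      return false;  // caller surfaces a deadline error
    }
    ++in_flight_;
    return true;
  }

  // Recv-completion path: free a slot and wake one blocked sender, like the
  // decrement performed by FlowControlRecvAsync and
  // FlowControlRecvLocalAsyncInternal.
  void Release() {
    {
      std::lock_guard<std::mutex> l(mu_);
      --in_flight_;
    }
    cv_.notify_one();
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  const int64_t max_;
  int64_t in_flight_ = 0;
};

The real implementation additionally keeps flow_control_counters_, a per-tag map that only feeds the GetFlowControlItemNum statistics.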
@@ -262,6 +304,9 @@ class BaseRemoteRendezvous : public RemoteRendezvous { void FuseRecvLocalAsyncInternal(const std::vector& parsed_keys, FuseDoneCallback done); + void FlowControlRecvLocalAsyncInternal(const StringPiece& tag, + const ParsedKey& parsed, + DoneCallback done); TF_DISALLOW_COPY_AND_ASSIGN(BaseRemoteRendezvous); }; diff --git a/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h b/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h index caf4af97ac2..abc971c4552 100644 --- a/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h +++ b/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h @@ -40,6 +40,11 @@ class RemoteRendezvous : public Rendezvous { public: // Fully construct the RemoteRendezvous. virtual Status Initialize(WorkerSession* session) = 0; + + // Obtain statistical information + virtual int64 GetAllFlowControlItemNum() = 0; + + virtual int64 GetFlowControlItemNum(StringPiece tag) = 0; }; // RendezvousMgr keeps track of a set of local rendezvous instances. @@ -87,7 +92,11 @@ class RendezvousMgrInterface { virtual void FuseRecvLocalAsync( int64 step_id, const std::vector& parsed_keys, - Rendezvous::FuseDoneCallback done) = 0; + Rendezvous::FuseDoneCallback done) = 0; + + virtual void FlowControlRecvLocalAsync(int64 step_id, const StringPiece& tag, + const Rendezvous::ParsedKey& parsed, + Rendezvous::DoneCallback done) = 0; // Removes rendezvous for "step_id". // diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc index ba95e80b496..c3fb6a8ee6c 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc @@ -63,6 +63,7 @@ class GrpcRemoteWorker : cleanupall_(Method(GrpcWorkerMethod::kCleanupAll)), recvtensor_(Method(GrpcWorkerMethod::kRecvTensor)), fuserecvtensor_(Method(GrpcWorkerMethod::kFuseRecvTensor)), + flowcontrolrecvtensor_(Method(GrpcWorkerMethod::kFlowControlRecvTensor)), recvbuf_(Method(GrpcWorkerMethod::kRecvBuf)), logging_(Method(GrpcWorkerMethod::kLogging)), tracing_(Method(GrpcWorkerMethod::kTracing)), @@ -210,6 +211,14 @@ class GrpcRemoteWorker : IssueRequest(request, response, fuserecvtensor_, done, call_opts); } + void FlowControlRecvTensorAsync(CallOptions* call_opts, + const FlowControlRecvTensorRequest* request, + TensorResponse* response, + StatusCallback done) { + VLOG(1) << "FlowControlRecvTensorAsync req: " << request->DebugString(); + IssueRequest(request, response, flowcontrolrecvtensor_, done, call_opts); + } + void RecvTensorAsync(CallOptions* call_opts, const RecvTensorRequest* request, TensorResponse* response, StatusCallback done) override { VLOG(1) << "RecvTensorAsync req: " << request->DebugString(); @@ -341,6 +350,7 @@ class GrpcRemoteWorker : const ::grpc::string cleanupall_; const ::grpc::string recvtensor_; const ::grpc::string fuserecvtensor_; + const ::grpc::string flowcontrolrecvtensor_; const ::grpc::string recvbuf_; const ::grpc::string logging_; const ::grpc::string tracing_; diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_interface.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_interface.h index 20f1d2b5a62..2c885fec75d 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_interface.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_interface.h @@ -6,6 +6,8 @@ namespace tensorflow { class CallOptions; class FuseTensorResponse; class FuseRecvTensorRequest; +class 
FlowControlRecvTensorRequest; +class TensorResponse; class GrpcWorkerInterface { public: @@ -13,6 +15,10 @@ class GrpcWorkerInterface { const FuseRecvTensorRequest* request, FuseTensorResponse* response, StatusCallback done) = 0; + + virtual void FlowControlRecvTensorAsync(CallOptions* call_opts, + const FlowControlRecvTensorRequest* request, + TensorResponse* response, StatusCallback done) = 0; }; } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc index ef4fbeab438..3bdacc29a12 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc @@ -170,6 +170,15 @@ class GrpcWorkerServiceThread { EnqueueFuseRecvTensorRequestRaw(); } + // Support FlowControlRecv + for (int i = 0; + i < gtl::FindWithDefault( + queue_depth_, static_cast(GrpcWorkerMethod::kFlowControlRecvTensor), + 1000); + ++i) { + EnqueueFlowControlRecvTensorRequestRaw(); + } + void* tag; bool ok; @@ -312,6 +321,24 @@ class GrpcWorkerServiceThread { EnqueueFuseRecvTensorRequestRaw(); } + void FlowControlRecvTensorHandlerRaw( + WorkerCall* call) { + Schedule([this, call]() { + CallOptions* call_opts = new CallOptions; + call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); }); + + worker_->GrpcFlowControlRecvTensorAsync(call_opts, &call->request, + &call->response, + [call, call_opts + ](const Status& s) { + call->ClearCancelCallback(); + delete call_opts; + call->SendResponse(ToGrpcStatus(s)); + }); + }); + EnqueueFlowControlRecvTensorRequestRaw(); + } + void RecvBufHandler(WorkerCall* call) { Schedule([this, call]() { CallOptions* call_opts = new CallOptions; @@ -394,6 +421,19 @@ class GrpcWorkerServiceThread { } } + void EnqueueFlowControlRecvTensorRequestRaw() { + mutex_lock l(shutdown_mu_); + if (!is_shutdown_) { + Call:: + EnqueueRequestForMethod( + worker_service_, cq_.get(), + static_cast(GrpcWorkerMethod::kFlowControlRecvTensor), + &GrpcWorkerServiceThread::FlowControlRecvTensorHandlerRaw, + true /* supports cancel*/); + } + } + GrpcWorker* const worker_ = nullptr; // Not owned. std::unique_ptr<::grpc::ServerCompletionQueue> cq_; std::unique_ptr thread_; @@ -746,6 +786,128 @@ void GrpcWorker::GrpcFuseRecvTensorAsync(CallOptions* opts, }); } +// GrpcFlowControlRecvTensorAsync: unlike the other Worker methods, which use +// protocol buffers for a response object, to avoid extra protocol buffer +// serialization overhead we generate our response directly into a +// ::grpc::ByteBuffer object +void GrpcWorker::GrpcFlowControlRecvTensorAsync(CallOptions* opts, + const FlowControlRecvTensorRequest* request, + ::grpc::ByteBuffer* response, StatusCallback done) { + VLOG(1) << "GrpcFlowControlRecvTensorAsync req: " << request->DebugString(); + const int64 request_id = request->request_id(); + const int64 step_id = request->step_id(); + + bool cache_enabled = (response_cache_ != nullptr && request_id != 0); + + auto do_response = [response, done, cache_enabled](const Tensor& tensor, + bool is_dead, + const Status& status) { + if (status.ok()) { + grpc::EncodeTensorToByteBuffer(is_dead, tensor, cache_enabled, response); + } + done(status); + }; + + // If response cache is enabled and the response cache already contains the + // request, we delegate this retry request to the response cache. Otherwise, + // we add the request to the response cache and start the computation to + // retrieve the requested data. 
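+  // Net effect: at most one rendezvous lookup runs per request_id; retried
+  // RPCs are parked in the cache and are all answered when rendezvous_done
+  // fires below.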
+ if (cache_enabled && + response_cache_->QueueRequest(request_id, step_id, do_response)) { + return; + } + + auto rendezvous_done = [this, request_id, do_response, cache_enabled]( + const Tensor& tensor, bool is_dead, + const Status& status) { + if (cache_enabled) { + // Data is ready. Process all pending requests in the response cache. + response_cache_->OnRequestFinished(request_id, tensor, is_dead, status); + } else { + do_response(tensor, is_dead, status); + } + }; + + auto fail = [&rendezvous_done](const Status& status) { + rendezvous_done(Tensor(), false, status); + }; + + Status s = recent_request_ids_.TrackUnique( + request_id, "RecvTensor (GrpcWorker)", *request); + if (!s.ok()) { + fail(s); + return; + } + + const string& key = request->rendezvous_key(); + TRACEPRINTF("RecvTensor: %lld %s", step_id, key.c_str()); + Rendezvous::ParsedKey parsed; + s = Rendezvous::ParseKey(key, &parsed); + Device* src_dev = nullptr; + if (s.ok()) { + s = PrepareRecvTensor(parsed, &src_dev); + } + if (!s.ok()) { + fail(s); + return; + } + + // Request the tensor associated with the rendezvous key. + // Note that we log the cancellation here but do not abort the current step. + // gRPC can generate cancellations in response to transient network failures, + // and aborting the step eliminates the opportunity for client side retries. + // Repeated client failures will eventually cause the step to be aborted by + // the client. + opts->SetCancelCallback( + [step_id]() { LOG(WARNING) << "RecvTensor cancelled for " << step_id; }); + StringPiece tag = request->tag(); + env_->rendezvous_mgr->FlowControlRecvLocalAsync( + step_id, tag, parsed, + [opts, rendezvous_done, src_dev, request]( + const Status& status, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& val, + const bool is_dead) { + opts->ClearCancelCallback(); + if (status.ok()) { + // DMA can only be used for Tensors that do not fall into + // the following three odd edge cases: 1) a zero-size + // buffer, 2) a dead tensor which has an uninit value, and + // 3) the tensor has the on_host allocation attribute, + // i.e. it's in CPU RAM *independent of its assigned + // device type*. + const bool on_host = send_args.alloc_attrs.on_host(); + { + // Non-DMA cases. + if (src_dev->tensorflow_gpu_device_info() && (!on_host)) { + DeviceContext* send_dev_context = send_args.device_context; + AllocatorAttributes alloc_attrs; + alloc_attrs.set_gpu_compatible(true); + alloc_attrs.set_on_host(true); + Allocator* alloc = src_dev->GetAllocator(alloc_attrs); + Tensor* copy = new Tensor(alloc, val.dtype(), val.shape()); + CHECK(send_dev_context) + << "send dev name: " << src_dev->name() + << " gpu_info: " << src_dev->tensorflow_gpu_device_info(); + // "val" is on an accelerator device. Uses the device_context to + // fill the copy on host. + StatusCallback copy_ready = [rendezvous_done, copy, + is_dead](const Status& s) { + // The value is now ready to be returned on the wire. 
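+                // rendezvous_done serializes the host-side copy into the
+                // gRPC response (and, when the cache is enabled, replays it
+                // to any parked duplicates), so the temporary tensor can be
+                // deleted immediately afterwards.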
+ rendezvous_done(*copy, is_dead, s); + delete copy; + }; + + CopyDeviceToHost(&val, alloc, alloc, request->rendezvous_key(), + src_dev, copy, send_dev_context, copy_ready); + return; + } + } + } + + rendezvous_done(val, is_dead, status); + }); +} + namespace { // If RecvBufRespExtra.tensor_content is a single large string, then gRPC // can stall on the recv side when the string buffer needs to be enlarged, diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h index 69759c420cc..48941d438c9 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h @@ -51,6 +51,10 @@ class GrpcWorker : public Worker { ::grpc::ByteBuffer* response, StatusCallback done); + virtual void GrpcFlowControlRecvTensorAsync(CallOptions* opts, + const FlowControlRecvTensorRequest* request, + ::grpc::ByteBuffer* response, StatusCallback done); + void LoggingAsync(const LoggingRequest* request, LoggingResponse* response, StatusCallback done) override; diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc index 515d6e90beb..2095540e36a 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc @@ -48,6 +48,8 @@ const char* GrpcWorkerMethodName(GrpcWorkerMethod id) { return "/tensorflow.WorkerService/RecvTensor"; case GrpcWorkerMethod::kFuseRecvTensor: return "/tensorflow.WorkerService/FuseRecvTensor"; + case GrpcWorkerMethod::kFlowControlRecvTensor: + return "/tensorflow.WorkerService/FlowControlRecvTensor"; case GrpcWorkerMethod::kRecvBuf: return "/tensorflow.WorkerService/RecvBuf"; case GrpcWorkerMethod::kLogging: diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h index ff8e1c07cb4..ad77ee0fd80 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h @@ -80,6 +80,7 @@ enum class GrpcWorkerMethod { kCleanupAll, kRecvTensor, kFuseRecvTensor, + kFlowControlRecvTensor, kRecvBuf, kLogging, kTracing, diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc index 69f1481f59e..267bf09e66f 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc @@ -53,6 +53,10 @@ class RpcRemoteRendezvous : public BaseRemoteRendezvous { const Rendezvous::Args& args, FuseDoneCallback done) override; + void FlowControlRecvFromRemoteAsync(const StringPiece& tag, + const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& recv_args, + DoneCallback done) override; + private: ~RpcRemoteRendezvous() override {} @@ -529,6 +533,247 @@ void RpcRemoteRendezvous::FuseRecvFromRemoteAsync( }); } + + +class FlowControlRpcRecvTensorCall : public BaseRecvTensorCall { + public: + FlowControlRpcRecvTensorCall() + : wi_(nullptr), dst_device_(nullptr) {} + + void Init(WorkerInterface* wi, int64 step_id, const StringPiece& tag, + const StringPiece& key, AllocatorAttributes alloc_attrs, + Device* dst_device, const Rendezvous::Args& recv_args, + Rendezvous::DoneCallback done) { + wi_ = wi; + grpc_wi_ = dynamic_cast(wi_); + alloc_attrs_ = alloc_attrs; + 
dst_device_ = dst_device; + recv_args_ = recv_args; + done_ = std::move(done); + req_.set_step_id(step_id); + req_.set_tag(tag.data(), tag.size()); + req_.set_request_id(GetUniqueRequestId()); + req_.set_rendezvous_key(key.data(), key.size()); + } + + void Reset() { + // The FlowControlRpcRemoteRendezvous using this object is responsible for + // calling ReleaseWorker() before Reset(). + DCHECK_EQ(static_cast(nullptr), wi_) + << "Leaking WorkerInterface in RpcRecvTensorCall::Reset()."; + + alloc_attrs_ = AllocatorAttributes(); + dst_device_ = nullptr; + // We don't clear opts_ and assume that Init will set up the state for + // opts_ appropriately. + req_.Clear(); + resp_.Clear(); + { + mutex_lock l(mu_); + status_ = Status::OK(); + } + done_ = nullptr; + } + + ~FlowControlRpcRecvTensorCall() override { + // Since only the FlowControlRpcRecvTensorFreeList will delete an + // FlowControlRpcRecvTensorCall, and it always sets this->wi_ to null when + // a call object is released to it, we can assert that this->wi_ is + // always null at the point of deletion. + CHECK_EQ(static_cast(nullptr), wi_) + << "Leaking WorkerInterface in FlowControlRpcRecvTensorCall destructor."; + } + + void Start(std::function recv_done) override { + StartRTCall(std::move(recv_done)); + } + + void StartAbort(const Status& s) override { + { + mutex_lock l(mu_); + status_.Update(s); + } + opts_.StartCancel(); + } + + Status status() const override { + mutex_lock l(mu_); + return status_; + } + + void ReleaseWorker(WorkerCacheInterface* worker_cache) { + DCHECK_NE(static_cast(nullptr), wi_) + << "FlowControlRpcRecvTensorCall::ReleaseWorker() called twice."; + worker_cache->ReleaseWorker(src_worker_, wi_); + wi_ = nullptr; + grpc_wi_ = nullptr; + } + + const Tensor& tensor() const { return resp_.tensor(); } + + bool is_dead() const { return resp_.metadata().is_dead(); } + + Device* dst_device() const { return dst_device_; } + const Rendezvous::Args recv_args() const { return recv_args_; } + const Rendezvous::DoneCallback& done() const { return done_; } + + private: + friend class RpcRemoteRendezvous; + + // Start the main RecvTensor call, checking for an async abort. + void StartRTCall(std::function recv_done) { + resp_.InitAlloc(dst_device_, alloc_attrs_); + using namespace std::placeholders; + StatusCallback cb = std::bind( + [this](std::function recv_done, + // Begin unbound arguments. + const Status& s) { + if (!s.ok()) { + mutex_lock l(mu_); + status_.Update(s); + } + recv_done(); + }, + std::move(recv_done), _1); + grpc_wi_->FlowControlRecvTensorAsync(&opts_, &req_, &resp_, std::move(cb)); + } + + string src_worker_; + string src_rel_device_; + WorkerInterface* wi_; // Not owned. 
+ GrpcWorkerInterface* grpc_wi_; + AllocatorAttributes alloc_attrs_; + Device* dst_device_; + CallOptions opts_; + FlowControlRecvTensorRequest req_; + TensorResponse resp_; + Rendezvous::Args recv_args_; + Rendezvous::DoneCallback done_; + + mutable mutex mu_; + Status status_ GUARDED_BY(mu_); + + TF_DISALLOW_COPY_AND_ASSIGN(FlowControlRpcRecvTensorCall); +}; + +class FlowControlRpcRecvTensorFreeList { + public: + FlowControlRpcRecvTensorFreeList() {} + ~FlowControlRpcRecvTensorFreeList() { + for (size_t i = 0; i < objects_.size(); i++) { + delete objects_[i]; + } + } + + FlowControlRpcRecvTensorCall* New() { + { + mutex_lock l(mu_); + if (!objects_.empty()) { + FlowControlRpcRecvTensorCall* result = objects_.back(); + objects_.pop_back(); + return result; + } + } + return new FlowControlRpcRecvTensorCall; + } + + void Release(FlowControlRpcRecvTensorCall* obj) { + obj->Reset(); + { + mutex_lock l(mu_); + if (objects_.size() < kMaxObjects) { + objects_.push_back(obj); + return; + } + } + delete obj; + } + + private: + static const int kMaxObjects = 1000; + + mutex mu_; + std::vector objects_ GUARDED_BY(mu_); +}; + +static FlowControlRpcRecvTensorFreeList* get_flow_control_call_freelist() { + static FlowControlRpcRecvTensorFreeList* call_freelist = \ + new FlowControlRpcRecvTensorFreeList(); + return call_freelist; +} + +void RpcRemoteRendezvous::FlowControlRecvFromRemoteAsync( + const StringPiece& tag, const Rendezvous::ParsedKey& parsed, + const Rendezvous::Args& recv_args, DoneCallback done) { + CHECK(is_initialized()); + Status s; + + // Prepare a FlowControlRecvTensor call that can handle being aborted. + FlowControlRpcRecvTensorCall* call = get_flow_control_call_freelist()->New(); + + // key.src_device identifies a remote device. + if (!DeviceNameUtils::SplitDeviceName(parsed.src_device, &call->src_worker_, + &call->src_rel_device_)) { + s = errors::Internal(parsed.src_device, + " is invalid remote source device."); + } + + WorkerSession* sess = session(); + WorkerInterface* rwi = + sess->worker_cache->GetOrCreateWorker(call->src_worker_); + if (s.ok() && rwi == nullptr) { + s = errors::Internal("No worker known as ", call->src_worker_); + } + + Device* dst_device; + if (s.ok()) { + s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device); + } + if (!s.ok()) { + if (rwi != nullptr) { + sess->worker_cache->ReleaseWorker(call->src_worker_, rwi); + } + get_flow_control_call_freelist()->Release(call); + done(s, Args(), recv_args, Tensor{}, false); + return; + } + + call->Init(rwi, step_id_, tag, parsed.FullKey(), recv_args.alloc_attrs, + dst_device, recv_args, std::move(done)); + + // Record "call" in active_ so that it can be aborted cleanly. + RegisterCall(call, recv_args); + + // RendezvousMgr already aborted, shouldn't send RPC call any more + if (!call->status().ok()) { + // NOTE: `*sess` can potentially be deleted before we return from + // `call->done()(...)`, so we must release the worker before calling the + // callback. + call->ReleaseWorker(sess->worker_cache.get()); + call->done()(call->status(), Args(), Args(), Tensor(), false); + get_flow_control_call_freelist()->Release(call); + return; + } + + // Start "call". + Ref(); + call->Start([this, call]() { + // Removes "call" from active_. Prevent StartAbort(). + DeregisterCall(call); + // If StartAbort was called prior to DeregisterCall, then the + // current status should be bad. 
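+    // Either way the done callback below still runs, so a recv whose step
+    // was aborted fails promptly instead of hanging in the rendezvous.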
+ Status s = call->status(); + // NOTE: `*session()` can potentially be deleted before we return from + // `call->done()(...)`, so we must release the worker before calling the + // callback. + call->ReleaseWorker(session()->worker_cache.get()); + call->done()(s, Args(), call->recv_args(), call->tensor(), call->is_dead()); + get_flow_control_call_freelist()->Release(call); + Unref(); + }); + +} + } // namespace RpcRendezvousMgr::RpcRendezvousMgr(const WorkerEnv* env) diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc index 5021853ce23..75f41ab3057 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc @@ -211,6 +211,32 @@ TEST_F(RpcRendezvousMgrTest, CleanupAll) { } } +TEST_F(RpcRendezvousMgrTest, FlowControlSend) { + setenv("REMOTE_RENDEZVOUS_FLOW_CONTROL_MAX_SIZE", "2", 1); + const int64 step_id = 123; + const Rendezvous::ParsedKey key = MakeKey(Rendezvous::CreateKey( + "/job:mnist/replica:1/task:2/cpu:0", 7890, + "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0))); + { + RemoteRendezvous* rendez = rmgr_.Find(step_id); + TF_ASSERT_OK(rendez->Initialize(&worker_session_)); + core::ScopedUnref unref(rendez); + Rendezvous::Args args; + TF_ASSERT_OK( + rendez->FlowControlSend("TEST", key, args, V("peach_0"), false)); + TF_ASSERT_OK( + rendez->FlowControlSend("TEST", key, args, V("peach_1"), false)); + + EXPECT_NE( + rendez->FlowControlSend("TEST", key, args, V("peach_2"), false, 100), + Status::OK()); + EXPECT_EQ(rendez->GetAllFlowControlItemNum(), 2); + EXPECT_EQ(rendez->GetFlowControlItemNum("TEST"), 2); + } + + unsetenv("REMOTE_RENDEZVOUS_FLOW_CONTROL_MAX_SIZE"); +} + class DummyDeviceContext : public DeviceContext { public: explicit DummyDeviceContext(int stream_id) : stream_id_(stream_id) {} diff --git a/tensorflow/core/framework/rendezvous.cc b/tensorflow/core/framework/rendezvous.cc index e4db066a562..4d1adf1a070 100644 --- a/tensorflow/core/framework/rendezvous.cc +++ b/tensorflow/core/framework/rendezvous.cc @@ -146,6 +146,47 @@ Status Rendezvous::Recv(const ParsedKey& key, const Args& args, Tensor* val, return Recv(key, args, val, is_dead, no_timeout); } +Status Rendezvous::FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead) { + int64 no_timeout = 300000; + return FlowControlSend(tag, key, args, val, is_dead, no_timeout); +} + +Status Rendezvous::FlowControlRecv(const StringPiece& tag, const ParsedKey& key, + const Args& args, Tensor* val, bool* is_dead, + int64 timeout_ms) { + Status ret; + Notification n; + FlowControlRecvAsync(tag, key, args, [&ret, &n, val, is_dead]( + const Status& s, const Args& send_args, + const Args& recv_args, const Tensor& v, + const bool dead) { + ret = s; + *val = v; + *is_dead = dead; + n.Notify(); + }); + if (timeout_ms > 0) { + int64 timeout_us = timeout_ms * 1000; + bool notified = WaitForNotificationWithTimeout(&n, timeout_us); + if (!notified) { + return Status(error::DEADLINE_EXCEEDED, + "Timed out waiting for notification"); + } + } else { + n.WaitForNotification(); + } + return ret; +} + +Status Rendezvous::FlowControlRecv(const StringPiece& tag, const ParsedKey& key, + const Args& args, Tensor* val, + bool* is_dead) { + const int64 no_timeout = 0; + return FlowControlRecv(tag, key, args, val, is_dead, no_timeout); +} + class LocalRendezvousImpl : public Rendezvous { 
public: explicit LocalRendezvousImpl() {} diff --git a/tensorflow/core/framework/rendezvous.h b/tensorflow/core/framework/rendezvous.h index 3aa65534272..106c0f26b32 100644 --- a/tensorflow/core/framework/rendezvous.h +++ b/tensorflow/core/framework/rendezvous.h @@ -108,6 +108,17 @@ class Rendezvous : public core::RefCounted { virtual Status Send(const ParsedKey& key, const Args& args, Tensor* ref_val, mutex* ref_mu, const bool is_dead) { return Status::OK(); } + virtual Status FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead, + const int64 timeout_millis) { + return errors::Unimplemented("[Rendezvous] unimplement FlowControlSend."); + } + + virtual Status FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead); + // Callback provided by a tensor consumer waiting on the rendezvous. // It will be invoked when the tensor is available, or when a non-OK // status arises in the production of that tensor. It also gets @@ -139,12 +150,27 @@ class Rendezvous : public core::RefCounted { virtual void FuseRecvAsync(const std::vector& parsed_keys, const Args& args, FuseDoneCallback done) {} + // Local rendezvous does not need this. + virtual void FlowControlRecvAsync(const StringPiece& tag, + const ParsedKey& parsed_key, const Args& args, + DoneCallback done) { + CHECK(false) << "[Rendezvous] unimplement FlowControlRecvAsync."; + } + // Synchronous wrapper for RecvAsync. Status Recv(const ParsedKey& key, const Args& args, Tensor* val, bool* is_dead, int64 timeout_ms); Status Recv(const ParsedKey& key, const Args& args, Tensor* val, bool* is_dead); + // Synchronous wrapper for FlowControlRecvAsync. + Status FlowControlRecv(const StringPiece& tag, const ParsedKey& key, + const Args& args, Tensor* val, bool* is_dead, + int64 timeout_ms); + + Status FlowControlRecv(const StringPiece& tag, const ParsedKey& key, + const Args& args, Tensor* val, bool* is_dead); + // Aborts all pending and future Send/Recv with the given "status". // // StartAbort() does not wait for ongoing calls to finish. 
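[Editor's note] Taken together, the declarations above give kernels a synchronous pair to work with: the FlowControlSend overload without an explicit timeout falls back to the 300000 ms default set in rendezvous.cc, and FlowControlRecv blocks the caller until the tensor arrives or the deadline passes. A rough usage sketch, assuming an initialized rendezvous rendez, a parsed key parsed_key, args and a tensor val as in the unit tests:

// Hedged sketch; rendez, parsed_key, args and val are assumed to exist.
Tensor out;
bool is_dead = false;
// Producer: blocks once the rendezvous-wide in-flight count reaches
// REMOTE_RENDEZVOUS_FLOW_CONTROL_MAX_SIZE (default 16); uses the default
// send timeout.
TF_RETURN_IF_ERROR(
    rendez->FlowControlSend("TEST", parsed_key, args, val, /*is_dead=*/false));
// Consumer: waits up to 100 ms, then fails with DEADLINE_EXCEEDED.
TF_RETURN_IF_ERROR(rendez->FlowControlRecv("TEST", parsed_key, args, &out,
                                           &is_dead, /*timeout_ms=*/100));

This mirrors the FlowControlSend unit test added in rpc_rendezvous_mgr_test.cc, where the third send over a size-2 window is expected to time out.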
diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops.cc b/tensorflow/core/kernels/file_slice_sendrecv_ops.cc
index 6bfe54363f9..a919238a5ee 100644
--- a/tensorflow/core/kernels/file_slice_sendrecv_ops.cc
+++ b/tensorflow/core/kernels/file_slice_sendrecv_ops.cc
@@ -33,11 +33,10 @@ FileSliceSendOp::FileSliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
   OP_REQUIRES_OK(
       ctx, ctx->GetAttr("send_device_incarnation",
                         reinterpret_cast<int64*>(&send_device_incarnation)));
-  string tensor_name;
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_));
   key_prefix_ = \
     slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device,
-      recv_device, send_device_incarnation, tensor_name);
+      recv_device, send_device_incarnation, tensor_name_);
 
   if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) {
     hostmem_sendrecv_ = false;
@@ -212,8 +211,9 @@ Status FileSliceSendOp::SendFileSlice(OpKernelContext* ctx,
                                        frame_iter, &parsed_key.buf_);
     VLOG(2) << "FileSliceSend " << parsed_key.buf_;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
-    TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t,
-                                               ctx->is_input_dead()));
+    TF_RETURN_IF_ERROR(
+      ctx->rendezvous()->FlowControlSend(tensor_name_, parsed_key, args, data_t,
+                                         ctx->is_input_dead()));
   }
 
@@ -253,11 +253,10 @@ FileSliceRecvOp::FileSliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
   OP_REQUIRES_OK(
       ctx, ctx->GetAttr("send_device_incarnation",
                         reinterpret_cast<int64*>(&send_device_incarnation)));
-  string tensor_name;
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_));
   key_prefix_ = \
     slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device,
-      recv_device, send_device_incarnation, tensor_name);
+      recv_device, send_device_incarnation, tensor_name_);
   if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) {
     hostmem_sendrecv_ = false;
   }
@@ -464,8 +463,9 @@ Status FileSliceRecvOp::RecvFileSlice(OpKernelContext* ctx,
                                        frame_iter, &parsed_key.buf_);
     VLOG(2) << "FileSliceRecv " << parsed_key.buf_;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
-    TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t,
-                                               &is_dead, timeout_ms_));
+    TF_RETURN_IF_ERROR(
+      ctx->rendezvous()->FlowControlRecv(tensor_name_, parsed_key, args,
+                                         &data_t, &is_dead, timeout_ms_));
     // This shouldn't be a dead tensor.
     CHECK_EQ(is_dead, false);
     file_ptr->Append(data_t.scalar<string>()());
diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops.h b/tensorflow/core/kernels/file_slice_sendrecv_ops.h
index 6701196d481..df7e6c646f8 100644
--- a/tensorflow/core/kernels/file_slice_sendrecv_ops.h
+++ b/tensorflow/core/kernels/file_slice_sendrecv_ops.h
@@ -28,6 +28,7 @@ class FileSliceSendOp : public OpKernel {
 
  private:
   // Variables.
+  string tensor_name_;
   string key_prefix_;
   bool hostmem_sendrecv_;
   int32 slice_size_;
@@ -63,6 +64,7 @@ class FileSliceRecvOp: public OpKernel {
 
  private:
   // Variables.
+  string tensor_name_;
   string key_prefix_;
   bool hostmem_sendrecv_;
   string recv_dir_;
diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc b/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc
index 931cd152253..62f5596bb62 100644
--- a/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc
+++ b/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc
@@ -50,6 +50,13 @@ class DummyRendezvous : public Rendezvous {
     kv_.erase(key_str);
     return Status::OK();
   }
+
+  Status FlowControlSend(const StringPiece& tag, const ParsedKey& key,
+                         const Args& args, const Tensor& val,
+                         const bool is_dead) {
+    return Send(key, args, val, is_dead);
+  }
+
   void RecvAsync(const ParsedKey& key, const Args& args,
                  DoneCallback done) override {
     std::string key_str = { key.FullKey().data(), key.FullKey().size() };
@@ -72,6 +79,12 @@ class DummyRendezvous : public Rendezvous {
       done(Status::OK(), var.args, args, var.data, var.is_dead);
       kv_.erase(key_str);
     }
+
+  void FlowControlRecvAsync(const StringPiece& tag, const ParsedKey& parsed_key,
+                            const Args& args, DoneCallback done) {
+    RecvAsync(parsed_key, args, done);
+  }
+
   void StartAbort(const Status& status) override {}
 
  private:
diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.cc b/tensorflow/core/kernels/slice_sendrecv_ops.cc
index 25f1a4e8738..ee0e5426cbc 100644
--- a/tensorflow/core/kernels/slice_sendrecv_ops.cc
+++ b/tensorflow/core/kernels/slice_sendrecv_ops.cc
@@ -30,11 +30,10 @@ SliceSendOp::SliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
   OP_REQUIRES_OK(
       ctx, ctx->GetAttr("send_device_incarnation",
                         reinterpret_cast<int64*>(&send_device_incarnation)));
-  string tensor_name;
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_));
   key_prefix_ = \
     slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device,
-      recv_device, send_device_incarnation, tensor_name);
+      recv_device, send_device_incarnation, tensor_name_);
 
   if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) {
     hostmem_sendrecv_ = false;
@@ -171,8 +170,9 @@ Status SliceSendOp::SendString(OpKernelContext* ctx,
                                        frame_iter, &parsed_key.buf_);
       VLOG(2) << "SliceSend " << parsed_key.buf_;
       TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
-      TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t,
-                                                 ctx->is_input_dead()));
+      TF_RETURN_IF_ERROR(
+        ctx->rendezvous()->FlowControlSend(tensor_name_, parsed_key, args,
+                                           data_t, ctx->is_input_dead()));
     } else {
       TF_RETURN_IF_ERROR(SendStringSlice(ctx, frame_iter, elem, i));
     }
@@ -209,8 +209,9 @@ Status SliceSendOp::SendStringSlice(OpKernelContext* ctx,
                                        frame_iter, &parsed_key.buf_);
     VLOG(2) << "SliceSend " << parsed_key.buf_;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
-    TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t,
-                                               ctx->is_input_dead()));
+    TF_RETURN_IF_ERROR(
+      ctx->rendezvous()->FlowControlSend(tensor_name_, parsed_key, args, data_t,
+                                         ctx->is_input_dead()));
   }
 
   return Status::OK();
@@ -248,8 +249,9 @@ Status SliceSendOp::SendBasicType(OpKernelContext* ctx,
                                        frame_iter, &parsed_key.buf_);
     VLOG(2) << "SliceSend " << parsed_key.buf_;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
-    TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t,
-                                               ctx->is_input_dead()));
+    TF_RETURN_IF_ERROR(
+      ctx->rendezvous()->FlowControlSend(tensor_name_, parsed_key, args, data_t,
+                                         ctx->is_input_dead()));
   }
 
   return Status::OK();
@@ -270,11 +272,10 @@ SliceRecvOp::SliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
   OP_REQUIRES_OK(
       ctx, ctx->GetAttr("send_device_incarnation",
                         reinterpret_cast<int64*>(&send_device_incarnation)));
-  string tensor_name;
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_));
   key_prefix_ = \
     slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device,
-      recv_device, send_device_incarnation, tensor_name);
+      recv_device, send_device_incarnation, tensor_name_);
   if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) {
     hostmem_sendrecv_ = false;
   }
@@ -440,8 +441,9 @@ Status SliceRecvOp::RecvString(OpKernelContext* ctx,
                                        frame_iter, &parsed_key.buf_);
     VLOG(2) << "SliceRecv " << parsed_key.buf_;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
-    TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t,
-                                               &is_dead, timeout_ms_));
+    TF_RETURN_IF_ERROR(
+      ctx->rendezvous()->FlowControlRecv(tensor_name_, parsed_key, args,
+                                         &data_t, &is_dead, timeout_ms_));
     // This shouldn't be a dead tensor.
     CHECK_EQ(is_dead, false);
     output_flat(i) = data_t.scalar<string>()();
@@ -484,8 +486,9 @@ Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx,
                                        frame_iter, &parsed_key.buf_);
     VLOG(2) << "SliceRecv " << parsed_key.buf_;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
-    TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t,
-                                               &is_dead, timeout_ms_));
+    TF_RETURN_IF_ERROR(
+      ctx->rendezvous()->FlowControlRecv(tensor_name_, parsed_key, args,
+                                         &data_t, &is_dead, timeout_ms_));
     // This shouldn't be a dead tensor.
     CHECK_EQ(is_dead, false);
     output_flat(index) += data_t.scalar<string>()();
@@ -529,8 +532,9 @@ Status SliceRecvOp::RecvBasicType(OpKernelContext* ctx,
                                        frame_iter, &parsed_key.buf_);
     VLOG(2) << "SliceSend " << parsed_key.buf_;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key));
-    TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t,
-                                               &is_dead, timeout_ms_));
+    TF_RETURN_IF_ERROR(
+      ctx->rendezvous()->FlowControlRecv(tensor_name_, parsed_key, args,
+                                         &data_t, &is_dead, timeout_ms_));
     // This shouldn't be a dead tensor.
     CHECK_EQ(is_dead, false);
     auto data_base = data_t.data();
diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.h b/tensorflow/core/kernels/slice_sendrecv_ops.h
index 43429bff32f..12e583e5551 100644
--- a/tensorflow/core/kernels/slice_sendrecv_ops.h
+++ b/tensorflow/core/kernels/slice_sendrecv_ops.h
@@ -28,6 +28,7 @@ class SliceSendOp : public OpKernel {
 
  private:
   // Variables.
+  string tensor_name_;
   string key_prefix_;
   bool hostmem_sendrecv_;
   int32 slice_size_;
@@ -58,6 +59,7 @@ class SliceRecvOp : public OpKernel {
 
  private:
   // Variable.
+  string tensor_name_;
   string key_prefix_;
   bool hostmem_sendrecv_;
   int32 slice_size_;
diff --git a/tensorflow/core/kernels/slice_sendrecv_ops_test.cc b/tensorflow/core/kernels/slice_sendrecv_ops_test.cc
index 5693ed57918..0eeb6d98c36 100644
--- a/tensorflow/core/kernels/slice_sendrecv_ops_test.cc
+++ b/tensorflow/core/kernels/slice_sendrecv_ops_test.cc
@@ -50,6 +50,13 @@ class DummyRendezvous : public Rendezvous {
     kv_.erase(key_str);
     return Status::OK();
   }
+
+  Status FlowControlSend(const StringPiece& tag, const ParsedKey& key,
+                         const Args& args, const Tensor& val,
+                         const bool is_dead) {
+    return Send(key, args, val, is_dead);
+  }
+
   void RecvAsync(const ParsedKey& key, const Args& args,
                  DoneCallback done) override {
     std::string key_str = { key.FullKey().data(), key.FullKey().size() };
@@ -72,6 +79,12 @@ class DummyRendezvous : public Rendezvous {
       done(Status::OK(), var.args, args, var.data, var.is_dead);
       kv_.erase(key_str);
     }
+
+  void FlowControlRecvAsync(const StringPiece& tag, const ParsedKey& parsed_key,
+                            const Args& args, DoneCallback done) {
+    RecvAsync(parsed_key, args, done);
+  }
+
   void StartAbort(const Status& status) override {}
 
  private:
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index 65ec7ffe4bc..fa18fec180c 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -441,6 +441,52 @@ message MarkRecvFinishedRequest {
 
 message MarkRecvFinishedResponse {}
 
+////////////////////////////////////////////////////////////////////////////////
+//
+// FlowControlRecvTensor method request messages
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message FlowControlRecvTensorRequest {
+  // The step in which the tensor will be produced.
+  //
+  // REQUIRED: This must eventually correspond to the `step_id` passed
+  // into a RunGraph call on the same WorkerService.
+  int64 step_id = 1;
+
+  string tag = 2;
+
+  // A key identifying the channel to receive tensors from. A RecvTensor
+  // request retrieves one tensor from the channel, but multiple tensors can
+  // be sent and received over the same channel with multiple RecvTensor
+  // requests. See rendezvous.h for details.
+  string rendezvous_key = 3;
+
+  // If true, use an out-of-band DMA mechanism to transfer the
+  // received tensor.
+  bool dma_ok = 4;
+
+  // Optional information on client-side device locality.
+  DeviceLocality client_locality = 5;
+
+  // Optional information on server-side device locality.
+  DeviceLocality server_locality = 6;
+
+  // Optional information needed by the RPC subsystem.
+  google.protobuf.Any transport_options = 7;
+
+  // Unique identifier for this request. Every RecvTensorRequest must have a
+  // unique request_id, and retried RecvTensorRequests must have the same
+  // request_id. If request_id is zero, retry detection and response cache
+  // are disabled.
+  //
+  // Retried RecvTensorRequests are problematic because a RecvTensor with no
+  // corresponding sender will wait forever, and the tensor may have been
+  // delivered to a previous retry. Workers use request_ids to reject retried
+  // RecvTensor requests instead of waiting forever.
+  int64 request_id = 8;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 //
 // Logging method request/response messages
diff --git a/tensorflow/core/protobuf/worker_service.proto b/tensorflow/core/protobuf/worker_service.proto
index 07a64c55ad8..8591f2fe6ab 100644
--- a/tensorflow/core/protobuf/worker_service.proto
+++ b/tensorflow/core/protobuf/worker_service.proto
@@ -72,6 +72,11 @@ service WorkerService {
     // FuseRecvTensor Method
   }
 
+  // See worker.proto for details.
+  rpc FlowControlRecvTensor(FlowControlRecvTensorRequest) returns (RecvTensorResponse) {
+    // FlowControlRecvTensor Method
+  }
+
   // See worker.proto for details.
   rpc Logging(LoggingRequest) returns (LoggingResponse);
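The patch above threads each op's `tensor_name` through the rendezvous as a flow-control tag: the kernels call `FlowControlSend`/`FlowControlRecv` instead of plain `Send`/`Recv`, and the new `tag` field in `FlowControlRecvTensorRequest` carries the same identifier across workers, so the receiving side can bound how many slices of one logical tensor are in flight per channel. As a rough mental model only — the real mechanism is the C++ `Rendezvous` and the RPC above; the class below, its capacity policy, and all names are assumptions for illustration, not DeepRec API:

```
# Hypothetical sketch of per-tag flow control over a rendezvous.
# A bounded channel per tag means senders block once `capacity`
# slices are outstanding, which is the flow control.
import queue
import threading

class FlowControlRendezvousSketch:
    def __init__(self, capacity=16):
        self._capacity = capacity
        self._channels = {}
        self._mu = threading.Lock()

    def _channel(self, tag):
        # One bounded queue per tag (the op's tensor_name).
        with self._mu:
            return self._channels.setdefault(tag, queue.Queue(self._capacity))

    def flow_control_send(self, tag, key, tensor):
        # Blocks when the channel is full, throttling the producer.
        self._channel(tag).put((key, tensor))

    def flow_control_recv(self, tag, timeout_ms=None):
        # Raises queue.Empty on timeout, mirroring the kernels' timeout_ms_.
        timeout = timeout_ms / 1000.0 if timeout_ms else None
        return self._channel(tag).get(timeout=timeout)
```

The `DummyRendezvous` test doubles above simply forward the flow-control entry points to plain `Send`/`RecvAsync`, which is enough to exercise the kernels without any throttling.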
From 9e30ab604aa316359f249bc061b5fe87a5773604 Mon Sep 17 00:00:00 2001
From: Chen Bangduo
Date: Thu, 23 May 2024 12:00:02 +0800
Subject: [PATCH 90/91] [Embedding] Check the sharded property of tf.train.Saver. (#996)

Signed-off-by: chenbangduo.cbd
---
 modelzoo/bst/train.py | 3 +-
 modelzoo/dbmtl/train.py | 3 +-
 modelzoo/dcn/train.py | 3 +-
 modelzoo/dcnv2/train.py | 3 +-
 modelzoo/deepfm/train.py | 3 +-
 modelzoo/dien/train.py | 3 +-
 modelzoo/din/train.py | 3 +-
 modelzoo/dlrm/train.py | 3 +-
 modelzoo/dssm/train.py | 3 +-
 modelzoo/esmm/train.py | 3 +-
 modelzoo/masknet/train.py | 3 +-
 modelzoo/mlperf/train.py | 3 +-
 modelzoo/mmoe/train.py | 3 +-
 modelzoo/ple/train.py | 3 +-
 modelzoo/simple_multitask/train.py | 3 +-
 modelzoo/wide_and_deep/train.py | 3 +-
 .../feature_column/feature_column_v2_test.py | 6 +-
 .../ops/embedding_variable_ops_gpu_test.py | 7 +-
 .../python/ops/embedding_variable_ops_test.py | 64 ++++++++++---------
 tensorflow/python/training/incr_ckpt_test.py | 5 +-
 tensorflow/python/training/saver.py | 11 ++++
 tensorflow/python/training/saver_test.py | 6 ++
 22 files changed, 76 insertions(+), 71 deletions(-)

diff --git a/modelzoo/bst/train.py b/modelzoo/bst/train.py
index eeeb136678b..536ddbc6905 100644
--- a/modelzoo/bst/train.py
+++ b/modelzoo/bst/train.py
@@ -612,10 +612,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/dbmtl/train.py b/modelzoo/dbmtl/train.py
index c848cbc76b2..36f2685a175 100644
--- a/modelzoo/dbmtl/train.py
+++ b/modelzoo/dbmtl/train.py
@@ -527,10 +527,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/dcn/train.py b/modelzoo/dcn/train.py
index 44701e22d9f..5094a18bd85 100644
--- a/modelzoo/dcn/train.py
+++ b/modelzoo/dcn/train.py
@@ -594,10 +594,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/dcnv2/train.py b/modelzoo/dcnv2/train.py
index 5b572af0425..c1346ad6d7d 100644
--- a/modelzoo/dcnv2/train.py
+++ b/modelzoo/dcnv2/train.py
@@ -610,10 +610,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/deepfm/train.py b/modelzoo/deepfm/train.py
index 166bedec0d0..89b2b823a46 100644
--- a/modelzoo/deepfm/train.py
+++ b/modelzoo/deepfm/train.py
@@ -472,10 +472,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/dien/train.py b/modelzoo/dien/train.py
index 190695f6ce0..f43fd2f1e73 100644
--- a/modelzoo/dien/train.py
+++ b/modelzoo/dien/train.py
@@ -776,10 +776,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/din/train.py b/modelzoo/din/train.py
index 058583ce6fd..34621dee45e 100644
--- a/modelzoo/din/train.py
+++ b/modelzoo/din/train.py
@@ -594,10 +594,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/dlrm/train.py b/modelzoo/dlrm/train.py
index cc4c045c349..9dff32aca52 100644
--- a/modelzoo/dlrm/train.py
+++ b/modelzoo/dlrm/train.py
@@ -507,10 +507,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/dssm/train.py b/modelzoo/dssm/train.py
index db949aac5e8..9d2264d9ce9 100644
--- a/modelzoo/dssm/train.py
+++ b/modelzoo/dssm/train.py
@@ -478,10 +478,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/esmm/train.py b/modelzoo/esmm/train.py
index 073b08814d4..1916ed76c27 100755
--- a/modelzoo/esmm/train.py
+++ b/modelzoo/esmm/train.py
@@ -534,10 +534,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=train_steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/masknet/train.py b/modelzoo/masknet/train.py
index bb96a467701..bb9eee0ec3f 100644
--- a/modelzoo/masknet/train.py
+++ b/modelzoo/masknet/train.py
@@ -529,10 +529,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/mlperf/train.py b/modelzoo/mlperf/train.py
index ce34fe5e55c..559e4fb6efc 100644
--- a/modelzoo/mlperf/train.py
+++ b/modelzoo/mlperf/train.py
@@ -522,10 +522,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/mmoe/train.py b/modelzoo/mmoe/train.py
index 694eb45da80..a3a6c9146d8 100644
--- a/modelzoo/mmoe/train.py
+++ b/modelzoo/mmoe/train.py
@@ -523,10 +523,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/ple/train.py b/modelzoo/ple/train.py
index b2d2f2057ec..33aa9a15e8e 100644
--- a/modelzoo/ple/train.py
+++ b/modelzoo/ple/train.py
@@ -592,10 +592,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/simple_multitask/train.py b/modelzoo/simple_multitask/train.py
index 4ef1874a521..6eb51f7d4e9 100644
--- a/modelzoo/simple_multitask/train.py
+++ b/modelzoo/simple_multitask/train.py
@@ -427,10 +427,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=train_steps)
     log_hook = tf.train.LoggingTensorHook(
diff --git a/modelzoo/wide_and_deep/train.py b/modelzoo/wide_and_deep/train.py
index 3024f58024e..2d1c964e593 100644
--- a/modelzoo/wide_and_deep/train.py
+++ b/modelzoo/wide_and_deep/train.py
@@ -543,10 +543,9 @@ def train(sess_config,
     hooks = []
     hooks.extend(input_hooks)
 
-    sharded_saver = tf_config != None
     scaffold = tf.train.Scaffold(
         local_init_op=tf.group(tf.local_variables_initializer(), data_init_op),
-        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver))
+        saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True))
 
     stop_hook = tf.train.StopAtStepHook(last_step=steps)
     log_hook = tf.train.LoggingTensorHook(
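Across all sixteen models the change is mechanical: drop the `TF_CONFIG`-based `sharded_saver` guess and always build a sharded `Saver` for the `Scaffold`, since the embedding variables these models use require it. A condensed view of the resulting pattern (argument names follow the scripts above; the model graph itself is omitted and assumed already built):

```
# Minimal sketch of the Scaffold/Saver pattern the modelzoo scripts
# converge on after this patch. `data_init_op` and `keep_checkpoint_max`
# are the names used in the training scripts above.
import tensorflow as tf

def make_scaffold(data_init_op, keep_checkpoint_max=5):
    # sharded=True is now unconditional: required whenever the graph
    # contains an EmbeddingVariable, harmless otherwise.
    saver = tf.train.Saver(max_to_keep=keep_checkpoint_max, sharded=True)
    return tf.train.Scaffold(
        local_init_op=tf.group(tf.local_variables_initializer(),
                               data_init_op),
        saver=saver)
```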
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index 7946aee1e1a..24f8a36daa4 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -7527,7 +7527,7 @@ def testEmbeddingVariableForL2FeatureEviction(self):
       opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables_lib.global_variables_initializer()
       with self.test_session() as sess:
         sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS))
@@ -7758,7 +7758,7 @@ def testEmbeddingVariableForSharedEmbeddingColumnsWithPartitionNum(self):
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
       init = variables_lib.global_variables_initializer()
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
 
   @test_util.run_deprecated_v1
   def testEmbeddingVariableForInt32ID(self):
@@ -7783,7 +7783,7 @@ def testEmbeddingVariableForInt32ID(self):
       opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables_lib.global_variables_initializer()
       with self.test_session() as sess:
         sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS))
diff --git a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py
index d47d94d0d99..3c69153ab1b 100644
--- a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py
+++ b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py
@@ -63,7 +63,8 @@ def testEmbeddingVariableForInitFromProto(self):
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
       graph = ops.get_default_graph()
-      meta_graph_def = saver_module.export_meta_graph()
+      saver = saver_module.Saver(sharded=True)
+      meta_graph_def = saver_module.export_meta_graph(saver_def=saver.as_saver_def())
       ops.reset_default_graph()
       with self.test_session() as sess:
         res = saver_module.import_meta_graph(meta_graph_def)
@@ -748,7 +749,7 @@ def testSaveV3(self):
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, global_step=gs)
       init = variables.global_variables_initializer()
-      saver = saver = saver_module.Saver()
+      saver = saver = saver_module.Saver(sharded=True)
       checkpoint_directory = self.get_temp_dir()
       model_path = os.path.join(checkpoint_directory, "model.ckpt")
       with self.test_session() as sess:
@@ -816,7 +817,7 @@ def testEmbeddingVariableSaveAndRestoreOptimzierStatesForMultiTierWithHbm(self):
       opt = adagrad.AdagradOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
      train_op = opt.apply_gradients(g_v, gs)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       graph = ops.get_default_graph()
       with self.test_session(graph = graph) as sess:
         saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345"))
diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py
index dbf254d5f14..1119fd1c194 100644
--- a/tensorflow/python/ops/embedding_variable_ops_test.py
+++ b/tensorflow/python/ops/embedding_variable_ops_test.py
@@ -162,7 +162,7 @@ def _RecordFreqTestTemplate(self, optimizer):
       opt = self._CreateOptimizer(optimizer)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, gs)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
 
       model_path = os.path.join(checkpoint_directory, "model1.ckpt")
@@ -194,7 +194,7 @@ def _RecordVersionTemplate(self, optimizer):
      opt = self._CreateOptimizer(optimizer)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, gs)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
 
       model_path = os.path.join(checkpoint_directory, "model1.ckpt")
@@ -232,7 +232,7 @@ def testSaveVersionWithGlobalStepEviction(self):
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, global_step=gs)
       init = variables.global_variables_initializer()
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       model_path = os.path.join(checkpoint_directory, "model.ckpt")
       with self.test_session() as sess:
         sess.run([init])
@@ -269,7 +269,7 @@ def testFeatureColumnRecordFreqWithPartition(self):
       opt = adagrad.AdagradOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
 
       model_path = os.path.join(checkpoint_directory, "model1.ckpt")
@@ -313,7 +313,7 @@ def testFeatureColumnRecordFreqSGDWithPartition(self):
       opt = gradient_descent.GradientDescentOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
 
       model_path = os.path.join(checkpoint_directory, "model1.ckpt")
@@ -387,7 +387,8 @@ def testDynamicEmbeddingVariableForInitFromProto(self):
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
       graph = ops.get_default_graph()
-      meta_graph_def = saver_module.export_meta_graph()
+      saver = saver_module.Saver(sharded=True)
+      meta_graph_def = saver_module.export_meta_graph(saver_def=saver.as_saver_def())
       ops.reset_default_graph()
       with self.test_session() as sess:
         res = saver_module.import_meta_graph(meta_graph_def)
@@ -406,7 +407,8 @@ def testEmbeddingVariableForInitFromProto(self):
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
       graph = ops.get_default_graph()
-      meta_graph_def = saver_module.export_meta_graph()
+      saver = saver_module.Saver(sharded=True)
+      meta_graph_def = saver_module.export_meta_graph(saver_def=saver.as_saver_def())
       ops.reset_default_graph()
       with self.test_session() as sess:
         res = saver_module.import_meta_graph(meta_graph_def)
@@ -450,7 +452,7 @@ def testEmbeddingVariableForLookupInt32(self):
       opt = adam.AdamOptimizer(0.01)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       with self.test_session() as sess:
         sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS))
@@ -643,7 +645,7 @@ def testEmbeddingVariableForL2FeatureEvictionFromContribFeatureColumn(self):
       opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       with self.test_session() as sess:
         sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS))
@@ -682,7 +684,7 @@ def testEmbeddingVariableForGlobalStepEviction(self):
       opt = adagrad.AdagradOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, global_step=gs)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       with self.test_session() as sess:
         sess.run([init])
@@ -720,7 +722,7 @@ def testEmbeddingVariableForL2FeatureEviction(self):
       opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       with self.test_session() as sess:
         sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS))
@@ -1534,7 +1536,7 @@ def testEmbeddingVariableForSaveFreq(self):
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
       init = variables.global_variables_initializer()
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       model_path = os.path.join(checkpoint_directory, "model.ckpt")
       with self.test_session() as sess:
         sess.run([init])
@@ -1567,7 +1569,7 @@ def testEmbeddingVariableForL2FeatureEvictionDRAM(self):
       opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       with self.test_session() as sess:
         sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS))
@@ -1724,7 +1726,7 @@ def runTestAdagrad(self, var, g):
       opt = adagrad.AdagradOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, global_step=gs)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
 
       model_path = os.path.join(checkpoint_directory, "model1.ckpt")
@@ -1778,7 +1780,7 @@ def runTestAdagrad(self, var, g):
       opt = adagrad.AdagradOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, global_step=gs)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
 
       model_path = os.path.join(checkpoint_directory, "model1.ckpt")
@@ -1849,7 +1851,7 @@ def runTestAdagrad(self, var, g):
       opt = adagrad.AdagradOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, global_step=gs)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
 
       model_path = os.path.join(checkpoint_directory, "model1.ckpt")
@@ -1923,7 +1925,7 @@ def testEmbeddingVariableForRecordFreq(self):
       opt = adagrad.AdagradOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, gs)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
 
       model_path = os.path.join(checkpoint_directory, "model1.ckpt")
@@ -1963,7 +1965,7 @@ def testEmbeddingVariableForRecordFreqWithCounterFilter(self):
       opt = adagrad.AdagradOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, gs)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
 
       model_path = os.path.join(checkpoint_directory, "model1.ckpt")
@@ -2278,7 +2280,7 @@ def testEmbeddingVariableForContirbFeatureColumnWithPartitionNum(self):
       opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
 
   def testSaveV3(self):
     print("testSaveV3")
@@ -2295,7 +2297,7 @@ def testSaveV3(self):
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, global_step=gs)
       init = variables.global_variables_initializer()
-      saver = saver = saver_module.Saver()
+      saver = saver = saver_module.Saver(sharded=True)
       checkpoint_directory = self.get_temp_dir()
       model_path = os.path.join(checkpoint_directory, "model.ckpt")
       with self.test_session() as sess:
@@ -2326,7 +2328,7 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self):
       opt = adagrad.AdagradOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, gs)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
 
       model_path = os.path.join(checkpoint_directory, "model1.ckpt")
@@ -2359,7 +2361,7 @@ def testEmbeddingVariableForSaveUnfilterFeature(self):
       opt = adagrad.AdagradOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, gs)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
 
       model_path = os.path.join(checkpoint_directory, "model1.ckpt")
@@ -2390,7 +2392,7 @@ def testEmbeddingVariableForMultiTierInference(self):
       opt = adagrad.AdagradOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v, gs)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       with self.test_session() as sess:
         sess.run([init])
@@ -2412,7 +2414,7 @@ def testEmbeddingVariableForMultiTierInference(self):
       emb = embedding_ops.embedding_lookup(emb_var, ids)
       tires = kv_variable_ops.lookup_tier(emb_var, math_ops.cast([1,2,3,4], dtypes.int64))
 
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       graph = ops.get_default_graph()
       with self.test_session(graph = graph) as sess:
         saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt"))
@@ -2784,7 +2786,7 @@ def testSetInitializedWithoutRestore(self):
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
       init = variables.global_variables_initializer()
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       with self.test_session() as sess:
         result = sess.run(var._is_initialized_op)
         self.assertEqual(False, result)
@@ -2806,7 +2808,7 @@ def testSetInitializedWithRestore(self):
       opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       with self.test_session(graph=g) as sess:
         sess.run([init])
@@ -2823,7 +2825,7 @@ def testSetInitializedWithRestore(self):
       opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       with self.test_session(graph=g) as sess:
         result = sess.run(var._is_initialized_op)
@@ -2860,7 +2862,7 @@ def testCountsTensor(self):
       opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       with self.test_session(graph=g) as sess:
         sess.run([init])
@@ -2893,7 +2895,7 @@ def testCountsWithSparseAndDenseTensor(self):
       opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       with self.test_session(graph=g) as sess:
         sess.run([init])
@@ -2929,7 +2931,7 @@ def testCountsTensorWithGradientDescent(self):
       opt = gradient_descent.GradientDescentOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       with self.test_session(graph=g) as sess:
         sess.run([init])
@@ -2964,7 +2966,7 @@ def testCountsDenseAndSparseTensorWithGradientDescent(self):
       opt = gradient_descent.GradientDescentOptimizer(0.1)
       g_v = opt.compute_gradients(loss)
       train_op = opt.apply_gradients(g_v)
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       with self.test_session(graph=g) as sess:
         sess.run([init])
diff --git a/tensorflow/python/training/incr_ckpt_test.py b/tensorflow/python/training/incr_ckpt_test.py
index 55cf748a9d6..849c73a44dc 100644
--- a/tensorflow/python/training/incr_ckpt_test.py
+++ b/tensorflow/python/training/incr_ckpt_test.py
@@ -75,7 +75,7 @@ def testSparseEvIncrSaveRestore(self):
       emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64))
       with ops.device("/device:CPU:0"):
         apply_incr = gen_io_ops.record_sparse_indices(math_ops.cast([0,1,2,5,6,7], dtypes.int64), "var_ev1")
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       ev_var_name = "var_ev1"
       incr_save_op = gen_io_ops.incr_save(incr_ckpt_path, [ev_var_name], [], [True],[var.handle])
@@ -178,7 +178,7 @@ def testMixIncrSaveRestore(self):
        activate_op = gen_io_ops.activate_sparse_recorder(["var_ev1","var_norm1"])
 
-      saver = saver_module.Saver()
+      saver = saver_module.Saver(sharded=True)
       init = variables.global_variables_initializer()
       incr_save_op = gen_io_ops.incr_save(incr_ckpt_path,
           ["var_norm1", "var_ev1"], [], [True, True], [var_norm, var_ev.handle])
@@ -445,6 +445,7 @@ def testIncrementalSaverForResourceVariable(self):
       variable_scope.get_variable('var', shape=[100], use_resource=False)
       variable_scope.get_embedding_variable('ev', embedding_dim=100)
       saver = saver_module.Saver(
+          sharded=True,
           save_relative_paths=True,
           incremental_save_restore=True,
       )
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index acc9723c183..e70226f2968 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -1071,10 +1071,14 @@ def _build(self, checkpoint_path, build_save, build_restore):
       # pylint: disable=protected-access
       self._var_list = variables._all_saveable_objects()
     from tensorflow.python.ops import hash_table
+    from tensorflow.python.ops import kv_variable_ops
     if isinstance(self._var_list, dict):
+      ev = {}
       ht = {}
       lst = {}
       for name, x in self._var_list.items():
+        if isinstance(x, kv_variable_ops.EmbeddingVariable):
+          ev[name] = x
         if isinstance(x, hash_table.HashTable):
           if x.hash_table not in ht:
             ht[x.hash_table] = [x]
@@ -1084,15 +1088,20 @@ def _build(self, checkpoint_path, build_save, build_restore):
           lst[name] = BloomFilterSaveable(x)
         else:
           lst[name] = x
+      if len(ev) != 0 and not self._sharded:
+        raise ValueError("EmbeddingVariable can only use sharded saver")
       if len(ht) != 0 and not self._sharded:
         raise ValueError("HashTable can only use sharded saver")
       for x, y in ht.items():
         lst[x.name] = HashTableSaveable(y)
       self._var_list = lst
     else:
+      ev = []
       ht = {}
       lst = []
       for x in self._var_list:
+        if isinstance(x, kv_variable_ops.EmbeddingVariable):
+          ev.append(x)
         if isinstance(x, hash_table.HashTable):
           if x.hash_table not in ht:
             ht[x.hash_table] = [x]
@@ -1102,6 +1111,8 @@ def _build(self, checkpoint_path, build_save, build_restore):
           lst.append(BloomFilterSaveable(x))
         else:
           lst.append(x)
+      if len(ev) != 0 and not self._sharded:
+        raise ValueError("EmbeddingVariable can only use sharded saver")
       if len(ht) != 0 and not self._sharded:
         raise ValueError("HashTable can only use sharded saver")
       for x, y in ht.items():
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index b48f00d0c14..365ef85af1d 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -852,6 +852,12 @@ def _model():
         for orig, restored in zip(orig_vals, restored_vals):
           self.assertAllEqual(orig, restored)
 
+  def testEnableSaverShardedWhenUseEmbeddingVariable(self):
+    with ops_lib.Graph().as_default():
+      emb_var = \
+        variable_scope.get_embedding_variable(name="emb_var", embedding_dim=64)
+      with self.assertRaisesRegexp(ValueError, "EmbeddingVariable"):
+        saver_module.Saver([emb_var], sharded=False)
 
 class SaveRestoreShardedTest(test.TestCase):
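With the guard in `saver.py`, constructing an unsharded `Saver` over a variable list that contains an `EmbeddingVariable` now fails fast instead of silently producing a checkpoint that cannot be restored, mirroring the existing `HashTable` check; `saver_test.py` above pins this down. A sketch of the user-visible behavior (using DeepRec's `get_embedding_variable` API; the error text is taken from the diff above):

```
# Sketch of the new failure mode, assuming a DeepRec build where
# tf.get_embedding_variable is available.
import tensorflow as tf

emb_var = tf.get_embedding_variable("emb_var", embedding_dim=64)

try:
    # Raises: ValueError("EmbeddingVariable can only use sharded saver")
    tf.train.Saver([emb_var], sharded=False)
except ValueError as e:
    print(e)

saver = tf.train.Saver([emb_var], sharded=True)  # OK
```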
From d1c5a6e9aa2ec62da93f6719c6755293cf6406a5 Mon Sep 17 00:00:00 2001
From: LightWang4 <303176469@qq.com>
Date: Tue, 21 Jan 2025 17:54:28 +0800
Subject: [PATCH 91/91] [Embedding] Fix op dependency in init_from_checkpoint API. (#1012)

Signed-off-by: lightwang
---
 tensorflow/python/training/checkpoint_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index db887fa12f1..d87a9f1b39b 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -443,7 +443,8 @@ def _set_checkpoint_initializer(variable,
     is_partitioned_ev = variable._save_slice_info is not None
     partition_id = variable._save_slice_info.var_offset[0] if is_partitioned_ev else 0
     partition_num = variable._save_slice_info.full_shape[0] if is_partitioned_ev else 1
-    with ops.control_dependencies([variable._initializer_op]):
+    restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY)[0]
+    with ops.control_dependencies(restore_dependency[variable._primary_handle]):
       rank = variable.initial_value.get_shape().rank - 1
       restore_op = gen_kv_variable_ops.kv_resource_import_v3(
           ckpt_file,
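The fix re-anchors the restore op built by `init_from_checkpoint`: instead of depending on the embedding variable's own initializer, `kv_resource_import_v3` now runs after the ops recorded for the variable's primary handle in the `EMBEDDING_VARIABLE_RESTORE_DEPENDENCY` collection, so restores of a primary EV and its optimizer-slot EVs are ordered correctly. A hedged sketch of the lookup only — the collection layout (a single dict keyed by primary handle) is inferred from the diff and is internal, not a public API:

```
# Sketch, assuming a DeepRec graph where the collection has been
# populated; uses private attributes, for illustration only.
from tensorflow.python.framework import ops

def restore_deps_for(variable):
    # One dict is stored in the collection, mapping each primary EV
    # handle to the ops its restore must wait on.
    dep_map = ops.get_collection(
        ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY)[0]
    return dep_map[variable._primary_handle]

# with ops.control_dependencies(restore_deps_for(var)):
#     restore_op = ...  # kv_resource_import_v3, as in the diff above
```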