From 419162720dcfc543a84873b24772a262bc1de6b3 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Tue, 8 Aug 2023 20:15:06 +0800 Subject: [PATCH 01/45] [Docs] Update deeprec2306 release images and notes in README.md & RELEASE.md. (#922) Signed-off-by: candy.dc --- README.md | 4 +- RELEASE.md | 84 +++++++++++++++++++ docs/docs_en/DeepRec-Compile-And-Install.md | 4 +- docs/docs_en/Estimator-Compile-And-Install.md | 2 +- docs/docs_en/TFServing-Compile-And-Install.md | 2 +- docs/docs_zh/DeepRec-Compile-And-Install.md | 4 +- docs/docs_zh/Estimator-Compile-And-Install.md | 2 +- docs/docs_zh/TFServing-Compile-And-Install.md | 2 +- 8 files changed, 94 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 927afe31480..53cca5c5c83 100644 --- a/README.md +++ b/README.md @@ -95,13 +95,13 @@ $ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux #### Image for CPU ``` -alideeprec/deeprec-release:deeprec2304-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 ``` #### Image for GPU CUDA11.6 ``` -alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 ``` *** diff --git a/RELEASE.md b/RELEASE.md index d41d9e569ad..43e03bc2b49 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,87 @@ +# Release r1.15.5-deeprec2306 + +## **Major Features and Improvements** + +### **Embedding** + +- Support StaticGPUHashMap to optimize EmbeddingVariable in inference. +- Update logic of GroupEmbedding in feature_column API. +- Refine APIs for foward-backward optimization. +- Move insertions of new features into the backward process when lti-tier storage. +- Move insertion of new features into the backward ops. +- Modify calculation logic of embedding lookup sparse combiner. +- Add memory and performance tests of EmbeddingVariable. + +### **Graph & Grappler Optimization** + +- Support IteratorGetNext for SmartStage as a starting node for searching. 
+- Reimplement PrefetchRunner in C++. + +### **Runtime Optimization** + +- Dispatch expensive ops via multiple threads in theadpool. +- Enable multi-stream in session_group by default. +- Support for loading saved_model with device information when use p and multi_stream. +- Make ARENA_ARRAY_SIZE to be configurable. +- Optimize EV allocator performance. +- Integrate HybridBackend in collective training mode. + +### **Ops & Hardware Acceleration** + +- Disable MatMul fused with LeakyRule when MKL is disabled. + +### **Serving** + +- Clear virtual_device configurations before load new checkpoint. + +### **Environment & Build** + +- Update docker images in user documents. +- Update DEFAULT_CUDA_VERSION and DEFAULT_CUDNN_VERSION in configure.py. +- Move thirdparties from WORKSPACE to workspace.bzl. +- Update urls corresponding to colm, ragel, aliyun-oss-sdk and uuid. +- Update default TF_CUDA_COMPUTE_CAPABILITIES to 7.0,7.5,8.0,8.6. +- Update SparseOperationKit to v23.5.01 and docker file. + +### **BugFix** + +- Fix issue of missing params while constructing the ngScope. +- Fix memory leak to avoid OOM. +- Fix shape validation in API shared_embedding_columns. +- Fix the device placement bug of stage_subgraph_on_cpu in distributed. +- Fix hung issue when using both SOK and SmartStaged simultaneously. +- Fix bug: init global_step before saving variables +- Fix bug: reserve input nodes, clear saver devices on demand. +- Fix memory leak when a graph node is invalid. + +### **ModelZoo** + +- Add examples and docs to demonstrate Collective Training. +- Update documents and config files for modelzoo benchmark. +- Update modelzoo README. + +### **Tool & Documents** + +- Update cases of configure TF_CUDA_COMPUTE_CAPABILITIES for H100. +- Update COMMITTERS.md. +- Update device placement documents. +- Update document for SmartStage. +- Update session_group documents. +- Update the download link of the library that Processor depends on. +- Update sok to 1.20. 
+ +More details of features: [https://deeprec.readthedocs.io/zh/latest/](url) + +## **Release Images** + +### **CPU Image** + +`alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04` + +### **GPU Image** + +`alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04` + # Release r1.15.5-deeprec2304 ## **Major Features and Improvements** diff --git a/docs/docs_en/DeepRec-Compile-And-Install.md b/docs/docs_en/DeepRec-Compile-And-Install.md index 0a170177353..83ba4854b9f 100644 --- a/docs/docs_en/DeepRec-Compile-And-Install.md +++ b/docs/docs_en/DeepRec-Compile-And-Install.md @@ -111,7 +111,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2304-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 ``` arm64: @@ -122,5 +122,5 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU Image with CUDA 11.6** ``` -alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 ``` diff --git a/docs/docs_en/Estimator-Compile-And-Install.md b/docs/docs_en/Estimator-Compile-And-Install.md index cdc04044875..73b6a36f318 100644 --- a/docs/docs_en/Estimator-Compile-And-Install.md +++ b/docs/docs_en/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ DeepRec provide new distributed protocols such as grpc++ and star_server, which Source Code: [https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -Develop Branch:master, Latest Release Branch: deeprec2304 +Develop Branch:master, Latest Release Branch: deeprec2306 ## Estimator Build diff --git a/docs/docs_en/TFServing-Compile-And-Install.md b/docs/docs_en/TFServing-Compile-And-Install.md index 8ced3628673..346a848ca74 100644 --- a/docs/docs_en/TFServing-Compile-And-Install.md +++ b/docs/docs_en/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ We provide optimized TFServing which could highly 
improve performance in inferen Source Code: [https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -Develop Branch: master, Latest Release Branch: deeprec2304 +Develop Branch: master, Latest Release Branch: deeprec2306 ## TFServing Build diff --git a/docs/docs_zh/DeepRec-Compile-And-Install.md b/docs/docs_zh/DeepRec-Compile-And-Install.md index 20df07aa252..08d249f8eeb 100644 --- a/docs/docs_zh/DeepRec-Compile-And-Install.md +++ b/docs/docs_zh/DeepRec-Compile-And-Install.md @@ -108,7 +108,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2304-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 ``` arm64: @@ -119,7 +119,7 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU CUDA11.6镜像** ``` -alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 ``` ## DeepRec Processor编译打包 diff --git a/docs/docs_zh/Estimator-Compile-And-Install.md b/docs/docs_zh/Estimator-Compile-And-Install.md index 332b96e6086..e5455aae91a 100644 --- a/docs/docs_zh/Estimator-Compile-And-Install.md +++ b/docs/docs_zh/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ 代码库:[https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -开发分支:master,最新Release分支:deeprec2304 +开发分支:master,最新Release分支:deeprec2306 ## Estimator编译 diff --git a/docs/docs_zh/TFServing-Compile-And-Install.md b/docs/docs_zh/TFServing-Compile-And-Install.md index 27bfc864e4e..0c76400e6c6 100644 --- a/docs/docs_zh/TFServing-Compile-And-Install.md +++ b/docs/docs_zh/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ 代码库:[https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -开发分支:master,最新Release分支:deeprec2304 +开发分支:master,最新Release分支:deeprec2306 ## TFServing编译&打包 From 4983e027e2eae258a82b34ee19b8ae2cb59e6c56 Mon Sep 17 00:00:00 2001 From: shijieliu Date: 
Wed, 9 Aug 2023 11:26:59 +0800 Subject: [PATCH 02/45] [Distributed] Fix wgrad bug in Sparse Operation Kit. (#918) Use new_git_repository to manage sok dependency,update sok for fixing localized mode wgrad bug. Signed-off-by: aleliu --- tensorflow/tools/pip_package/build_sok.sh | 3 +-- tensorflow/workspace.bzl | 11 +++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/tensorflow/tools/pip_package/build_sok.sh b/tensorflow/tools/pip_package/build_sok.sh index 2c99ceb5ac1..3860f5fdcff 100755 --- a/tensorflow/tools/pip_package/build_sok.sh +++ b/tensorflow/tools/pip_package/build_sok.sh @@ -16,5 +16,4 @@ export MAKEFLAGS=-j$(nproc) export SOK_COMPILE_GPU_SM="70;75;80" cd ./bazel-DeepRec/external/hugectr/sparse_operation_kit -"${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel -pip install ./dist/merlin_sok-1.2.0-cp38-cp38-linux_x86_64.whl +"${PYTHON_BIN_PATH:-python}" setup.py install diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 3495efd182d..540f733b2ea 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -1369,13 +1369,12 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ], ) - http_archive( + new_git_repository( name = "hugectr", # Apache License 2.0 - build_file = "//third_party:hugectr.BUILD", - strip_prefix = "HugeCTR-23.06.00", - urls = [ - "https://github.com/NVIDIA-Merlin/HugeCTR/archive/refs/tags/v23.06.00.tar.gz", - ], + build_file = "//third_party:hugectr.BUILD", + commit = "869028c1c32bdcda2f18efc88d54f0527ed28d6d", + init_submodules = True, + remote = "https://github.com/NVIDIA-Merlin/HugeCTR.git", ) def tf_bind(): From f09e5ec0c1a2424727f8ffc5eaf98b771c4b374e Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Fri, 11 Aug 2023 14:02:44 +0800 Subject: [PATCH 03/45] [Embedding] Add GetSnapshot and Create API for EmbeddingVariable. 
(#923) Signed-off-by: lixy9474 --- .../core/framework/embedding/embedding_var.h | 35 ++++++++++++ .../framework/embedding/eviction_manager.h | 5 +- .../kernels/embedding_variable_ops_test.cc | 54 +++++++++++++++++-- 3 files changed, 88 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 9a5b5cf9a19..b29493f2169 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -186,6 +186,13 @@ class EmbeddingVar : public ResourceBase { } } + Status Insert(K key, V* value) { + ValuePtr* value_ptr = nullptr; + CreateKey(key, &value_ptr, true); + LookupOrCreateEmb(value_ptr, value); + return Status::OK(); + } + Status LookupOrCreateKey(K key, ValuePtr** value_ptr) { Status s = storage_->GetOrCreate(key, value_ptr, emb_config_.total_num(storage_->GetAllocLen())); @@ -592,6 +599,34 @@ class EmbeddingVar : public ResourceBase { default_value_); } + void GetSnapshot(std::vector* key_list, + std::vector* value_list, + std::vector* version_list, + std::vector* freq_list) { + std::vector*> value_ptr_list; + storage_->GetSnapshot(key_list, &value_ptr_list); + bool is_save_freq = emb_config_.is_save_freq(); + bool is_save_version = emb_config_.is_save_version(); + for (int64 i = 0; i < key_list->size(); i++) { + V* val = value_ptr_list[i]->GetValue(emb_config_.emb_index, 0); + if (val != nullptr) { + value_list->emplace_back(val); + } else { + value_list->emplace_back(default_value_); + } + + if(is_save_version) { + int64 dump_version = value_ptr_list[i]->GetStep(); + version_list->emplace_back(dump_version); + } + + if(is_save_freq) { + int64 dump_freq = value_ptr_list[i]->GetFreq(); + freq_list->emplace_back(dump_freq); + } + } + } + mutex* mu() { return &mu_; } diff --git a/tensorflow/core/framework/embedding/eviction_manager.h b/tensorflow/core/framework/embedding/eviction_manager.h index b5a78765170..ca646c9b420 
100644 --- a/tensorflow/core/framework/embedding/eviction_manager.h +++ b/tensorflow/core/framework/embedding/eviction_manager.h @@ -47,8 +47,7 @@ class EvictionManager { "EVICTION_MANAGER", 3, /*low_latency_hint=*/false)); } - ~EvictionManager() { - } + ~EvictionManager() {} TF_DISALLOW_COPY_AND_ASSIGN(EvictionManager); @@ -124,8 +123,8 @@ class EvictionManager { int64 num_of_threads_; int64 num_of_active_threads_; std::atomic_flag flag_ = ATOMIC_FLAG_INIT; - std::unique_ptr thread_pool_; std::map*, StorageItem*> storage_table_; + std::unique_ptr thread_pool_; mutex mu_; }; diff --git a/tensorflow/core/kernels/embedding_variable_ops_test.cc b/tensorflow/core/kernels/embedding_variable_ops_test.cc index eff4b77c2dc..4839c171708 100644 --- a/tensorflow/core/kernels/embedding_variable_ops_test.cc +++ b/tensorflow/core/kernels/embedding_variable_ops_test.cc @@ -1191,6 +1191,7 @@ TEST(EmbeddingVariableTest, TestLFUCache) { } TEST(EmbeddingVariableTest, TestCacheRestore) { + setenv("TF_SSDHASH_ASYNC_COMPACTION", "false", 1); int64 value_size = 4; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); @@ -1237,8 +1238,11 @@ TEST(EmbeddingVariableTest, TestCacheRestore) { LOG(INFO) << "size:" << variable->Size(); BundleWriter writer(Env::Default(), Prefix("foo")); - DumpEmbeddingValues(variable, "var/part_0", &writer, &part_offset_tensor); - TF_ASSERT_OK(writer.Finish()); + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = 1; + variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args); + TF_ASSERT_OK(writer.Finish()); + variable->Unref(); auto imported_storage= embedding::StorageFactory::Create( embedding::StorageConfig(embedding::DRAM_SSDHASH, @@ -1258,6 +1262,7 @@ TEST(EmbeddingVariableTest, TestCacheRestore) { ASSERT_EQ(imported_storage->Size(0), ev_size - cache_size); ASSERT_EQ(imported_storage->Size(1), 2); + delete imported_storage; } void t1_gpu(KVInterface* hashmap) { @@ -1703,7 +1708,50 @@ 
TEST(EmbeddingVariableTest, TestLookupRemoveConcurrency) { for (auto &t : insert_threads) { t.join(); } - } +} + +TEST(EmbeddingVariableTest, TestInsertAndGetSnapshot) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + auto emb_config = EmbeddingConfig( + /*emb_index = */0, /*primary_emb_index = */0, + /*block_num = */1, /*slot_num = */0, + /*name = */"", /*steps_to_live = */0, + /*filter_freq = */0, /*max_freq = */999999, + /*l2_weight_threshold = */-1.0, /*layout = */"normal", + /*max_element_size = */0, /*false_positive_probability = */-1.0, + /*counter_type = */DT_UINT64); + auto storage = embedding::StorageFactory::Create( + embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); + auto var = new EmbeddingVar("EmbeddingVar", + storage, + emb_config, + cpu_allocator()); + var->Init(value, 1); + float* set_value = (float*)malloc(value_size * sizeof(float)); + //Insertion + for (int i = 0; i < 100; i++) { + for (int j = 0; j < value_size; j++) { + set_value[j] = i + j; + } + var->Insert(i, set_value); + } + free(set_value); + //GetSnapshot + std::vector key_list; + std::vector value_ptr_list; + std::vector version_list; + std::vector freq_list; + var->GetSnapshot(&key_list, &value_ptr_list, + &version_list, &freq_list); + for (int i = 0; i < key_list.size(); i++) { + ASSERT_EQ(key_list[i], i); + for (int j = 0; j < value_size; j++) { + ASSERT_EQ(value_ptr_list[i][j], i + j); + } + } +} } // namespace } // namespace embedding From 8d8e16aae66add22cf8a4812d549c83f3569ef13 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Fri, 11 Aug 2023 18:00:40 +0800 Subject: [PATCH 04/45] [Embedding] Fix set initialized flag too early in restore subgraph. 
(#920) Signed-off-by: lixy9474 --- .../core/framework/embedding/config.proto | 4 ++ .../framework/embedding/multi_tier_storage.h | 10 +-- tensorflow/core/framework/variable.proto | 2 + tensorflow/core/kernels/kv_variable_ops.cc | 28 ++++---- .../python/ops/embedding_variable_ops_test.py | 65 +++++++++++++++++++ tensorflow/python/ops/kv_variable_ops.py | 52 +++++++++++++++ tensorflow/python/training/optimizer.py | 3 +- .../training/saving/saveable_object_util.py | 2 +- tensorflow/python/training/slot_creator.py | 18 +++-- 9 files changed, 158 insertions(+), 26 deletions(-) diff --git a/tensorflow/core/framework/embedding/config.proto b/tensorflow/core/framework/embedding/config.proto index 3d5fae9f6ad..a8535347020 100644 --- a/tensorflow/core/framework/embedding/config.proto +++ b/tensorflow/core/framework/embedding/config.proto @@ -56,3 +56,7 @@ enum ValuePosition { IN_DRAM = 0; NOT_IN_DRAM = 1; } + +enum IsSetInitialized { + NOT_SET_INITAILIZED = 0; +} diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index ff18425ad9a..8239d109e64 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -81,10 +81,12 @@ class MultiTierStorage : public Storage { } void InitCache(embedding::CacheStrategy cache_strategy) override { - cache_ = CacheFactory::Create(cache_strategy, name_); - eviction_manager_ = EvictionManagerCreator::Create(); - eviction_manager_->AddStorage(this); - cache_thread_pool_ = CacheThreadPoolCreator::Create(); + if (cache_ == nullptr) { + cache_ = CacheFactory::Create(cache_strategy, name_); + eviction_manager_ = EvictionManagerCreator::Create(); + eviction_manager_->AddStorage(this); + cache_thread_pool_ = CacheThreadPoolCreator::Create(); + } } Status BatchCommit(const std::vector& keys, diff --git a/tensorflow/core/framework/variable.proto b/tensorflow/core/framework/variable.proto index 
79ccd107628..5f9e0f16b5d 100644 --- a/tensorflow/core/framework/variable.proto +++ b/tensorflow/core/framework/variable.proto @@ -74,6 +74,8 @@ message VariableDef { // EmebddingVariable bool is_embedding_var = 91; + + string initialize_op_for_restore = 92; } message SaveSliceInfoDef { diff --git a/tensorflow/core/kernels/kv_variable_ops.cc b/tensorflow/core/kernels/kv_variable_ops.cc index 20ea6d3cb61..8a01a7bf2cd 100644 --- a/tensorflow/core/kernels/kv_variable_ops.cc +++ b/tensorflow/core/kernels/kv_variable_ops.cc @@ -43,11 +43,6 @@ limitations under the License. namespace tensorflow { -namespace { -const int64 kEmbeddingVarUseDB = -214; -const int64 kInitializableEmbeddingVarUseDB = -215; -} - Status MoveMatchingFiles( Env* env, const tstring& pattern, @@ -207,6 +202,15 @@ class InitializeKvVariableOp : public OpKernel { (embedding_var_type == embedding::EmbeddingVariableType::IMMUTABLE); + //initial_num_buckets is useless, so is used to set is_set_initialized_. + int64 initial_num_buckets = 0; + OP_REQUIRES_OK(c, c->GetAttr("initial_num_buckets", &initial_num_buckets)); + is_set_initialized_ = true; + if (initial_num_buckets == + embedding::IsSetInitialized::NOT_SET_INITAILIZED) { + is_set_initialized_ = false; + } + int64 storage_type = 0; OP_REQUIRES_OK(c, c->GetAttr("storage_type", &storage_type)); storage_type_ = static_cast(storage_type); @@ -263,15 +267,10 @@ class InitializeKvVariableOp : public OpKernel { " should be DRAM when layout is 'compact'.")); } - if (steps_to_live_ == kEmbeddingVarUseDB || - steps_to_live_ == kInitializableEmbeddingVarUseDB) { - LOG(INFO) << "hashmap use db"; - //use_db_ = true; - } else { - OP_REQUIRES(c, steps_to_live_ >= 0, - errors::InvalidArgument( + OP_REQUIRES(c, steps_to_live_ >= 0, + errors::InvalidArgument( "steps_to_live must >= 0, ", std::to_string(steps_to_live_))); - } + OP_REQUIRES_OK(c, c->GetAttr("ht_type", &ht_type_)); if (embedding::StorageType::LEVELDB == storage_type_) { ht_type_ = "leveldb_kv"; @@ -406,7 
+405,7 @@ class InitializeKvVariableOp : public OpKernel { core::ScopedUnref unref_me(primary_variable); } core::ScopedUnref unref_me(ev); - if (steps_to_live_ != kEmbeddingVarUseDB) { + if (is_set_initialized_) { ev->SetInitialized(); } } @@ -436,6 +435,7 @@ class InitializeKvVariableOp : public OpKernel { bool record_freq_; bool record_version_; bool is_inference_; + bool is_set_initialized_; }; #define REGISTER_KERNELS(ktype, vtype) \ diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index d3e453df9d1..25a0cb6ff11 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -2751,5 +2751,70 @@ def testCPUFbjOptWithBloomFilter(self): self.assertNotEqual(val, 1.0) del os.environ["TF_EMBEDDING_FBJ_OPT"] + def testSetInitializedWithoutRestore(self): + print("testSetInitializedWithoutRestore") + with ops.device("/cpu:0"): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + saver = saver_module.Saver() + with self.test_session() as sess: + result = sess.run(var._is_initialized_op) + self.assertEqual(False, result) + sess.run([init]) + result = sess.run(var._is_initialized_op) + self.assertEqual(True, result) + + def testSetInitializedWithRestore(self): + print("testSetInitializedWitRestore") + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = 
variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,2 ,3], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1, 2, 3], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + result = sess.run(var._is_initialized_op) + self.assertEqual(False, result) + sess.run([var._initializer_for_restore]) + result = sess.run(var._is_initialized_op) + self.assertEqual(False, result) + + saver.restore(sess, ckpt_path) + result = sess.run(var._is_initialized_op) + self.assertEqual(True, result) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index e6140c9c149..701c03f6975 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -434,6 +434,8 @@ def is_multi_tier(storage_type): with ops.control_dependencies(set_attr_ops + [self._init_op]): self._initializer_op = 
control_flow_ops.no_op() + self.create_init_op_for_restore(name, initial_value, invalid_key, rank) + self._graph_element = self._handle self._cached_value = None if not context.executing_eagerly(): @@ -444,6 +446,49 @@ def is_multi_tier(storage_type): def export(self): return gen_kv_variable_ops.kv_resource_export(self._handle, Tkeys=self._invalid_key_type) + + def create_init_op_for_restore(self, name, initial_value, invalid_key, rank): + with ops.control_dependencies(None if self._is_primary else [self._primary._init_op_for_restore]): + self._initializer_for_restore = gen_kv_variable_ops.initialize_kv_variable_v2_op( + self._handle, + self._primary._handle, + variables._try_guard_against_uninitialized_dependencies(name, initial_value), + ops.convert_to_tensor(invalid_key), + initial_num_buckets=config_pb2.IsSetInitialized.NOT_SET_INITAILIZED, + slot_num = self._slot_num, + shape=initial_value.get_shape()[rank:], + steps_to_live=self._steps_to_live, + emb_index=self._emb_index, block_num=self.block_num, + slot_index=self._slot_index, + ht_type=self._ht_type, + ht_partition_num=self._ht_partition_num, + filter_freq = self._filter_freq, + l2_weight_threshold = self._l2_weight_threshold, + max_element_size = self._max_element_size, + false_positive_probability = self._false_positive_probability, + counter_type = self._counter_type, + max_freq = 99999, + layout = self._layout, + storage_type = self._storage_type, + storage_path = self._storage_path, + storage_size = self._storage_size, + default_value_dim = self._default_value_dim, + default_value_no_permission = self._default_value_no_permission, + record_freq = self._record_freq, + record_version = self._record_version, + embedding_variable_type=config_pb2.EmbeddingVariableType.IMMUTABLE) + set_attr_ops = [] + if self._is_primary and self._is_multi_tier: + with ops.control_dependencies([self._initializer_for_restore]): + set_cache_op = gen_kv_variable_ops.kv_resource_init_cache_strategy_op( + self._handle, + 
cache_strategy=self._storage_cache_strategy, + Tkeys=self._invalid_key_type, + dtype=self._dtype) + set_attr_ops.append(set_cache_op) + with ops.control_dependencies(set_attr_ops + [self._initializer_for_restore]): + self._init_op_for_restore = control_flow_ops.no_op() + def need_counts(self): return (self._record_freq or (self._filter_freq > 0) or self._is_multi_tier) @property @@ -482,6 +527,11 @@ def _init_from_proto(self, variable_def, import_scope=None): cache_op = op elif self._initializer_op.type == "InitializeKvVariableOp": init_op = self._initializer_op + + self._init_op_for_restore = g.as_graph_element( + ops.prepend_name_scope( + variable_def.initialize_op_for_restore, + import_scope=import_scope)) self._trainable = getattr(variable_def, "trainable", True) if variable_def.snapshot_name: self._cached_value = g.as_graph_element( @@ -842,6 +892,8 @@ def to_proto(self, export_scope=None): if self._save_slice_info: var_def.save_slice_info_def.MergeFrom( self._save_slice_info.to_proto(export_scope=export_scope)) + var_def.initialize_op_for_restore = ops.strip_name_scope( + self._init_op_for_restore.name, export_scope) return var_def else: return None diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 2b765814c0d..578d682cc11 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -243,8 +243,7 @@ def _get_processor(v): if v.op.type == "KvVarHandleOp": from tensorflow.core.framework import attr_value_pb2 from tensorflow.core.framework.embedding import config_pb2 - v._init_op._set_attr("embedding_variable_type", - attr_value_pb2.AttrValue(i=config_pb2.EmbeddingVariableType.MUTABLE)) + slot_creator._set_init_op_embedding_type_attr(v, config_pb2.EmbeddingVariableType.MUTABLE) return _DenseResourceVariableProcessor(v) if isinstance(v, variables.Variable): return _RefVariableProcessor(v) diff --git a/tensorflow/python/training/saving/saveable_object_util.py 
b/tensorflow/python/training/saving/saveable_object_util.py index cd3cba52676..0d8bfe87022 100644 --- a/tensorflow/python/training/saving/saveable_object_util.py +++ b/tensorflow/python/training/saving/saveable_object_util.py @@ -195,7 +195,7 @@ def restore(self, restored_tensors, unused_restored_shapes): if self.var._init_data_source is not None: return self.var.recover_from_init_data_source(self.var._init_data_source, self.partition_id, self.partition_num) else: - with ops.control_dependencies([self.var._initializer_op]): + with ops.control_dependencies([self.var._init_op_for_restore]): rank = self.op.initial_value.get_shape().rank - 1 restore_op = gen_kv_variable_ops.kv_resource_import_v3( restored_tensors[0], diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py index 90a820d82f6..6a359321c20 100644 --- a/tensorflow/python/training/slot_creator.py +++ b/tensorflow/python/training/slot_creator.py @@ -94,8 +94,7 @@ def _create_slot_var(primary, val, scope, validate_shape, shape, dtype, slot_con validate_shape=validate_shape, steps_to_live=primary._steps_to_live, ht_partition_num=primary._ht_partition_num) - slot._init_op._set_attr("embedding_variable_type", - attr_value_pb2.AttrValue(i=config_pb2.EmbeddingVariableType.MUTABLE)) + _set_init_op_embedding_type_attr(slot, config_pb2.EmbeddingVariableType.MUTABLE) else: filter_strategy = None if primary._filter_freq != 0: @@ -107,7 +106,7 @@ def _create_slot_var(primary, val, scope, validate_shape, shape, dtype, slot_con else: filter_strategy = variables.CounterFilter(filter_freq=primary._filter_freq) if slot_config.slot_type is config_pb2.SlotType.EMBEDDING_VARIABLE: - primary._init_op._set_attr("slot_num", attr_value_pb2.AttrValue(i=slot_config.slot_num)) + _set_init_op_slot_num_attr(primary, slot_config.slot_num) primary._slot_num = slot_config.slot_num emb_index = primary._emb_index if primary.block_num > 1: @@ -132,8 +131,7 @@ def _create_slot_var(primary, val, 
scope, validate_shape, shape, dtype, slot_con l2_weight_threshold=primary._l2_weight_threshold, filter_strategy=filter_strategy) ) - slot._init_op._set_attr("embedding_variable_type", - attr_value_pb2.AttrValue(i=config_pb2.EmbeddingVariableType.MUTABLE)) + _set_init_op_embedding_type_attr(slot, config_pb2.EmbeddingVariableType.MUTABLE) else: slot = variable_scope.get_variable( scope, @@ -300,3 +298,13 @@ def create_zeros_slot(primary, name, dtype=None, colocate_with_primary=True, slo return create_slot(primary, val, name, colocate_with_primary=colocate_with_primary, slot_config=slot_config) + +def _set_init_op_embedding_type_attr(var, embedding_type): + var._init_op._set_attr("embedding_variable_type", + attr_value_pb2.AttrValue(i=embedding_type)) + var._initializer_for_restore._set_attr("embedding_variable_type", + attr_value_pb2.AttrValue(i=embedding_type)) + +def _set_init_op_slot_num_attr(var, slot_num): + var._init_op._set_attr("slot_num", attr_value_pb2.AttrValue(i=slot_num)) + var._initializer_for_restore._set_attr("slot_num", attr_value_pb2.AttrValue(i=slot_num)) From 821d5e8d39156d477bacd2ede9f68f76ede0f77d Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Tue, 19 Sep 2023 09:56:20 +0800 Subject: [PATCH 05/45] [Embedding] Remove the dependency on private header file in EmbeddingVariable. 
(#927) Signed-off-by: lixy9474 --- tensorflow/core/BUILD | 5 +- .../framework/embedding/embedding_config.h | 3 + .../core/framework/embedding/embedding_var.h | 1 - .../embedding/embedding_var_ckpt_data.cc | 262 +++++++ .../embedding/embedding_var_ckpt_data.h | 190 +---- .../embedding/embedding_var_dump_iterator.h | 7 +- .../embedding/embedding_var_restore.cc | 647 ++++++++++++++++++ .../embedding/embedding_var_restore.h | 534 +-------------- .../core/framework/embedding/kv_interface.h | 8 +- .../embedding/ssd_record_descriptor.cc | 88 +++ .../embedding/ssd_record_descriptor.h | 49 +- tensorflow/core/framework/embedding/storage.h | 4 +- tensorflow/core/kernels/BUILD | 5 +- 13 files changed, 1041 insertions(+), 762 deletions(-) create mode 100644 tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc create mode 100644 tensorflow/core/framework/embedding/embedding_var_restore.cc create mode 100644 tensorflow/core/framework/embedding/ssd_record_descriptor.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 8ae5b4f156c..95bbbab5624 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -3026,7 +3026,10 @@ tf_cuda_library( "framework/embedding/gpu_hash_table.cu.cc", "framework/embedding/gpu_hash_table.h", "framework/embedding/embedding_var.cu.cc", - "framework/embedding/multi_tier_storage.cu.cc" + "framework/embedding/multi_tier_storage.cu.cc", + "framework/embedding/embedding_var_ckpt_data.cc", + "framework/embedding/embedding_var_restore.cc", + "framework/embedding/ssd_record_descriptor.cc" ], ) + select({ "//tensorflow:windows": [], diff --git a/tensorflow/core/framework/embedding/embedding_config.h b/tensorflow/core/framework/embedding/embedding_config.h index 0a50b492159..d47d07d4205 100644 --- a/tensorflow/core/framework/embedding/embedding_config.h +++ b/tensorflow/core/framework/embedding/embedding_config.h @@ -3,6 +3,9 @@ #include #include "tensorflow/core/framework/embedding/config.pb.h" +#include 
"tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/default/logging.h" namespace tensorflow { struct EmbeddingConfig { diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index b29493f2169..28ce5094d87 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -37,7 +37,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/storage.h" #include "tensorflow/core/framework/embedding/storage_factory.h" #include "tensorflow/core/framework/typed_allocator.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" namespace tensorflow { using CPUDevice = Eigen::ThreadPoolDevice; diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc new file mode 100644 index 00000000000..c1b43a608b5 --- /dev/null +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc @@ -0,0 +1,262 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#include "tensorflow/core/framework/embedding/embedding_var_ckpt_data.h" +#include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { +namespace embedding { +template +void EmbeddingVarCkptData::Emplace( + K key, ValuePtr* value_ptr, + const EmbeddingConfig& emb_config, + V* default_value, int64 value_offset, + bool is_save_freq, + bool is_save_version, + bool save_unfiltered_features) { + if((int64)value_ptr == ValuePtrStatus::IS_DELETED) + return; + + V* primary_val = value_ptr->GetValue(0, 0); + bool is_not_admit = + primary_val == nullptr + && emb_config.filter_freq != 0; + + if (!is_not_admit) { + key_vec_.emplace_back(key); + + if (primary_val == nullptr) { + value_ptr_vec_.emplace_back(default_value); + } else if ( + (int64)primary_val == ValuePosition::NOT_IN_DRAM) { + value_ptr_vec_.emplace_back((V*)ValuePosition::NOT_IN_DRAM); + } else { + V* val = value_ptr->GetValue(emb_config.emb_index, + value_offset); + value_ptr_vec_.emplace_back(val); + } + + + if(is_save_version) { + int64 dump_version = value_ptr->GetStep(); + version_vec_.emplace_back(dump_version); + } + + if(is_save_freq) { + int64 dump_freq = value_ptr->GetFreq(); + freq_vec_.emplace_back(dump_freq); + } + } else { + if (!save_unfiltered_features) + return; + + key_filter_vec_.emplace_back(key); + + if(is_save_version) { + int64 dump_version = value_ptr->GetStep(); + version_filter_vec_.emplace_back(dump_version); + } + + int64 dump_freq = value_ptr->GetFreq(); + freq_filter_vec_.emplace_back(dump_freq); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::Emplace( \ + ktype, ValuePtr*, const EmbeddingConfig&, \ + vtype*, int64, bool, bool, bool); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + 
REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + + +template +void EmbeddingVarCkptData::Emplace(K key, V* value_ptr) { + key_vec_.emplace_back(key); + value_ptr_vec_.emplace_back(value_ptr); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::Emplace( \ + ktype, vtype*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void EmbeddingVarCkptData::SetWithPartition( + std::vector>& ev_ckpt_data_parts) { + part_offset_.resize(kSavedPartitionNum + 1); + part_filter_offset_.resize(kSavedPartitionNum + 1); + part_offset_[0] = 0; + part_filter_offset_[0] = 0; + for (int i = 0; i < kSavedPartitionNum; i++) { + part_offset_[i + 1] = + part_offset_[i] + ev_ckpt_data_parts[i].key_vec_.size(); + + part_filter_offset_[i + 1] = + part_filter_offset_[i] + + ev_ckpt_data_parts[i].key_filter_vec_.size(); + + for (int64 j = 0; j < ev_ckpt_data_parts[i].key_vec_.size(); j++) { + key_vec_.emplace_back(ev_ckpt_data_parts[i].key_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].value_ptr_vec_.size(); j++) { + value_ptr_vec_.emplace_back(ev_ckpt_data_parts[i].value_ptr_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].version_vec_.size(); j++) { + version_vec_.emplace_back(ev_ckpt_data_parts[i].version_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_vec_.size(); j++) { + freq_vec_.emplace_back(ev_ckpt_data_parts[i].freq_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].key_filter_vec_.size(); j++) { + key_filter_vec_.emplace_back(ev_ckpt_data_parts[i].key_filter_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].version_filter_vec_.size(); j++) { + 
version_filter_vec_.emplace_back(ev_ckpt_data_parts[i].version_filter_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_filter_vec_.size(); j++) { + freq_filter_vec_.emplace_back(ev_ckpt_data_parts[i].freq_filter_vec_[j]); + } + } +} + +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::SetWithPartition( \ + std::vector>&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status EmbeddingVarCkptData::ExportToCkpt( + const string& tensor_name, + BundleWriter* writer, + int64 value_len, + ValueIterator* value_iter) { + size_t bytes_limit = 8 << 20; + std::unique_ptr dump_buffer(new char[bytes_limit]); + + EVVectorDataDumpIterator key_dump_iter(key_vec_); + Status s = SaveTensorWithFixedBuffer( + tensor_name + "-keys", writer, dump_buffer.get(), + bytes_limit, &key_dump_iter, + TensorShape({key_vec_.size()})); + if (!s.ok()) + return s; + + EV2dVectorDataDumpIterator value_dump_iter( + value_ptr_vec_, value_len, value_iter); + s = SaveTensorWithFixedBuffer( + tensor_name + "-values", writer, dump_buffer.get(), + bytes_limit, &value_dump_iter, + TensorShape({value_ptr_vec_.size(), value_len})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator version_dump_iter(version_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-versions", writer, dump_buffer.get(), + bytes_limit, &version_dump_iter, + TensorShape({version_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator freq_dump_iter(freq_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-freqs", writer, dump_buffer.get(), + bytes_limit, &freq_dump_iter, + TensorShape({freq_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator filtered_key_dump_iter(key_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-keys_filtered", writer, 
dump_buffer.get(), + bytes_limit, &filtered_key_dump_iter, + TensorShape({key_filter_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + filtered_version_dump_iter(version_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-versions_filtered", + writer, dump_buffer.get(), + bytes_limit, &filtered_version_dump_iter, + TensorShape({version_filter_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + filtered_freq_dump_iter(freq_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-freqs_filtered", + writer, dump_buffer.get(), + bytes_limit, &filtered_freq_dump_iter, + TensorShape({freq_filter_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + part_offset_dump_iter(part_offset_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-partition_offset", + writer, dump_buffer.get(), + bytes_limit, &part_offset_dump_iter, + TensorShape({part_offset_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + part_filter_offset_dump_iter(part_filter_offset_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-partition_filter_offset", + writer, dump_buffer.get(), + bytes_limit, &part_filter_offset_dump_iter, + TensorShape({part_filter_offset_.size()})); + if (!s.ok()) + return s; + + return Status::OK(); +} + +#define REGISTER_KERNELS(ktype, vtype) \ + template Status EmbeddingVarCkptData::ExportToCkpt( \ + const string&, BundleWriter*, int64, ValueIterator*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS +}// namespace embedding +}// namespace tensorflow \ No newline at end of file diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h index aa1a08cbcfd..6d7b09e70b0 100644 --- 
a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h @@ -15,11 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" -#include "tensorflow/core/kernels/save_restore_tensor.h" #include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" namespace tensorflow { +class BundleWriter; + namespace embedding { template @@ -30,195 +30,17 @@ class EmbeddingVarCkptData { V* default_value, int64 value_offset, bool is_save_freq, bool is_save_version, - bool save_unfiltered_features) { - if((int64)value_ptr == ValuePtrStatus::IS_DELETED) - return; - - V* primary_val = value_ptr->GetValue(0, 0); - bool is_not_admit = - primary_val == nullptr - && emb_config.filter_freq != 0; - - if (!is_not_admit) { - key_vec_.emplace_back(key); - - if (primary_val == nullptr) { - value_ptr_vec_.emplace_back(default_value); - } else if ( - (int64)primary_val == ValuePosition::NOT_IN_DRAM) { - value_ptr_vec_.emplace_back((V*)ValuePosition::NOT_IN_DRAM); - } else { - V* val = value_ptr->GetValue(emb_config.emb_index, - value_offset); - value_ptr_vec_.emplace_back(val); - } - - - if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); - version_vec_.emplace_back(dump_version); - } - - if(is_save_freq) { - int64 dump_freq = value_ptr->GetFreq(); - freq_vec_.emplace_back(dump_freq); - } - } else { - if (!save_unfiltered_features) - return; - - key_filter_vec_.emplace_back(key); + bool save_unfiltered_features); - if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); - version_filter_vec_.emplace_back(dump_version); - } - - int64 dump_freq = value_ptr->GetFreq(); - 
freq_filter_vec_.emplace_back(dump_freq); - } - } - - void Emplace(K key, V* value_ptr) { - key_vec_.emplace_back(key); - value_ptr_vec_.emplace_back(value_ptr); - } + void Emplace(K key, V* value_ptr); void SetWithPartition( - std::vector>& ev_ckpt_data_parts) { - part_offset_.resize(kSavedPartitionNum + 1); - part_filter_offset_.resize(kSavedPartitionNum + 1); - part_offset_[0] = 0; - part_filter_offset_[0] = 0; - for (int i = 0; i < kSavedPartitionNum; i++) { - part_offset_[i + 1] = - part_offset_[i] + ev_ckpt_data_parts[i].key_vec_.size(); - - part_filter_offset_[i + 1] = - part_filter_offset_[i] + - ev_ckpt_data_parts[i].key_filter_vec_.size(); - - for (int64 j = 0; j < ev_ckpt_data_parts[i].key_vec_.size(); j++) { - key_vec_.emplace_back(ev_ckpt_data_parts[i].key_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].value_ptr_vec_.size(); j++) { - value_ptr_vec_.emplace_back(ev_ckpt_data_parts[i].value_ptr_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].version_vec_.size(); j++) { - version_vec_.emplace_back(ev_ckpt_data_parts[i].version_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_vec_.size(); j++) { - freq_vec_.emplace_back(ev_ckpt_data_parts[i].freq_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].key_filter_vec_.size(); j++) { - key_filter_vec_.emplace_back(ev_ckpt_data_parts[i].key_filter_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].version_filter_vec_.size(); j++) { - version_filter_vec_.emplace_back(ev_ckpt_data_parts[i].version_filter_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_filter_vec_.size(); j++) { - freq_filter_vec_.emplace_back(ev_ckpt_data_parts[i].freq_filter_vec_[j]); - } - } - } + std::vector>& ev_ckpt_data_parts); Status ExportToCkpt(const string& tensor_name, BundleWriter* writer, int64 value_len, - ValueIterator* value_iter = nullptr) { - size_t bytes_limit = 8 << 20; - std::unique_ptr dump_buffer(new char[bytes_limit]); - - 
EVVectorDataDumpIterator key_dump_iter(key_vec_); - Status s = SaveTensorWithFixedBuffer( - tensor_name + "-keys", writer, dump_buffer.get(), - bytes_limit, &key_dump_iter, - TensorShape({key_vec_.size()})); - if (!s.ok()) - return s; - - EV2dVectorDataDumpIterator value_dump_iter( - value_ptr_vec_, value_len, value_iter); - s = SaveTensorWithFixedBuffer( - tensor_name + "-values", writer, dump_buffer.get(), - bytes_limit, &value_dump_iter, - TensorShape({value_ptr_vec_.size(), value_len})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator version_dump_iter(version_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-versions", writer, dump_buffer.get(), - bytes_limit, &version_dump_iter, - TensorShape({version_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator freq_dump_iter(freq_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-freqs", writer, dump_buffer.get(), - bytes_limit, &freq_dump_iter, - TensorShape({freq_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator filtered_key_dump_iter(key_filter_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-keys_filtered", writer, dump_buffer.get(), - bytes_limit, &filtered_key_dump_iter, - TensorShape({key_filter_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator - filtered_version_dump_iter(version_filter_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-versions_filtered", - writer, dump_buffer.get(), - bytes_limit, &filtered_version_dump_iter, - TensorShape({version_filter_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator - filtered_freq_dump_iter(freq_filter_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-freqs_filtered", - writer, dump_buffer.get(), - bytes_limit, &filtered_freq_dump_iter, - TensorShape({freq_filter_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator - part_offset_dump_iter(part_offset_); - s = SaveTensorWithFixedBuffer( - tensor_name + 
"-partition_offset", - writer, dump_buffer.get(), - bytes_limit, &part_offset_dump_iter, - TensorShape({part_offset_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator - part_filter_offset_dump_iter(part_filter_offset_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-partition_filter_offset", - writer, dump_buffer.get(), - bytes_limit, &part_filter_offset_dump_iter, - TensorShape({part_filter_offset_.size()})); - if (!s.ok()) - return s; - - return Status::OK(); - } - + ValueIterator* value_iter = nullptr); private: std::vector key_vec_; std::vector value_ptr_vec_; diff --git a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h index 71ba054b873..84c823a90dc 100644 --- a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h +++ b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h @@ -15,9 +15,12 @@ limitations under the License. #ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" -#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/framework/embedding/embedding_config.h" +#include "tensorflow/core/framework/embedding/kv_interface.h" namespace tensorflow { +template +class DumpIterator; + namespace embedding { template class EVVectorDataDumpIterator: public DumpIterator { diff --git a/tensorflow/core/framework/embedding/embedding_var_restore.cc b/tensorflow/core/framework/embedding/embedding_var_restore.cc new file mode 100644 index 00000000000..11c13008995 --- /dev/null +++ b/tensorflow/core/framework/embedding/embedding_var_restore.cc @@ -0,0 +1,647 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/embedding/embedding_var_restore.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" + +namespace tensorflow { +template +int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer) { + TensorShape shape; + Status st; + st = reader->LookupTensorShape(record_key, &shape); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + st = reader->LookupHeader(record_key, sizeof(K) * shape.dim_size(0)); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + size_t bytes_read = 0; + *buffer = new K[shape.dim_size(0)]; + st = reader->LookupSegment(record_key, sizeof(K) * shape.dim_size(0), + (char*)*buffer, bytes_read); + if (!st.ok()) { + LOG(FATAL) << 
"Restore record " << record_key << " failed"; + } + return shape.dim_size(0); +} +#define REGISTER_KERNELS(ktype) \ + template int64 ReadRecord(BundleReader*, const string&, ktype**); +REGISTER_KERNELS(int32); +REGISTER_KERNELS(int64); +#undef REGISTER_KERNELS + +template +void CheckpointLoader::RestoreSSD() { + std::string name_string_temp(restore_args_.m_name_string); + std::string new_str = "_"; + int64 pos = name_string_temp.find("/"); + while (pos != std::string::npos) { + name_string_temp.replace(pos, 1, new_str.data(), 1); + pos = name_string_temp.find("/"); + } + std::string ssd_record_file_name = restore_args_.m_file_name_string + "-" + + name_string_temp + "-ssd_record"; + if (Env::Default()->FileExists(ssd_record_file_name + ".index").ok()) { + std::string ssd_emb_file_name = restore_args_.m_file_name_string + "-" + + name_string_temp + "-emb_files"; + BundleReader ssd_record_reader(Env::Default(), ssd_record_file_name); + RestoreSSDBuffer ssd_buffer(&ssd_record_reader); + VLOG(1) << "Loading SSD record... 
" << ssd_record_file_name; + storage_->RestoreSSD(ev_->GetEmbeddingIndex(), + ev_->GetEmbeddingSlotNum(), ev_->ValueLen(), + ssd_emb_file_name, ev_, ssd_buffer); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::RestoreSSD(); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void CheckpointLoader::RestoreInternal( + const std::string& name_string, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + RestoreBuffer& restore_buff) { + Status s = EVInitTensorNameAndShape(name_string); + if (!s.ok()) { + LOG(ERROR) << "EVInitTensorNameAndShape fail:" << s.ToString(); + return; + } + + Tensor part_offset_tensor; + Tensor part_filter_offset_tensor; + if (!restore_args_.m_is_oldform) { + /****** InitPartOffsetTensor ******/ + TensorShape part_offset_shape, part_filter_offset_shape; + DataType part_offset_type, part_filter_offset_type; + string offset_tensor_name; + if (!restore_args_.m_is_incr) { + offset_tensor_name = name_string + kPartOffsetTensorSuffsix; + } else { + offset_tensor_name = name_string + kIncrPartOffsetTensorSuffsix; + } + + string offset_filter_tensor_name = + name_string + kPartFilterOffsetTensorSuffsix; + Status s = reader_->LookupDtypeAndShape( + offset_tensor_name, &part_offset_type, &part_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail:" << s.error_message(); + } + s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, + &part_filter_offset_type, + &part_filter_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } + part_offset_tensor = + Tensor(cpu_allocator(), part_offset_type, part_offset_shape); + part_filter_offset_tensor = Tensor( + cpu_allocator(), part_filter_offset_type, part_filter_offset_shape); + s = reader_->Lookup(offset_tensor_name, 
&part_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail:" << s.error_message(); + } + + s = reader_->Lookup(offset_filter_tensor_name, + &part_filter_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } + } + auto part_offset_flat = part_offset_tensor.flat(); + auto part_filter_offset_flat = part_filter_offset_tensor.flat(); + + if (restore_args_.m_is_oldform) { + VLOG(1) << "old form, EV name:" << name_string + << ", partition_id:" << restore_args_.m_partition_id + << ", new partition num:" << restore_args_.m_partition_num; + int64 new_dim = ev_->ValueLen(); + TensorShape key_shape; + Status st = + reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); + if (!st.ok()) { + LOG(ERROR) << "EVRestoreFeaturesOld fail: " << st.error_message(); + } + int tot_key_num = key_shape.dim_size(0); + Status s = EVRestoreFeatures(tot_key_num, 0, 0, 0, 0, restore_buff, + new_dim, emb_config, device); + if (!s.ok()) { + LOG(ERROR) << "EVRestoreFeaturesOld fail: " << s.error_message(); + } + } else { + int64 new_dim = ev_->ValueLen(); + VLOG(1) << "new form checkpoint... 
:" << name_string + << " , partition_id:" << restore_args_.m_partition_id + << " , partition_num:" << restore_args_.m_partition_num; + for (size_t i = 0; i < restore_args_.m_loaded_parts.size(); i++) { + int subpart_id = restore_args_.m_loaded_parts[i]; + size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; + size_t value_unit_bytes_new = sizeof(V) * new_dim; + int subpart_offset = part_offset_flat(subpart_id); + int tot_key_num = part_offset_flat(subpart_id + 1) - subpart_offset; + int64 key_part_offset = subpart_offset * sizeof(K); + int64 value_part_offset = + subpart_offset * sizeof(V) * restore_args_.m_old_dim; + int64 version_part_offset = subpart_offset * sizeof(int64); + int64 freq_part_offset = subpart_offset * sizeof(int64); + VLOG(1) << "dynamically load ev : " << name_string + << ", subpartid:" << subpart_id; + + EVRestoreFeatures(tot_key_num, key_part_offset, value_part_offset, + version_part_offset, freq_part_offset, restore_buff, + new_dim, emb_config, device); + + if (restore_args_.m_has_filter) { + Status s = EVRestoreFilteredFeatures( + subpart_id, new_dim, restore_buff, part_filter_offset_flat, + emb_config, device); + if (!s.ok()) { + LOG(ERROR) << "EVRestoreFilteredFeatures fail: " << s.error_message(); + } + } + } + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::RestoreInternal( \ + const std::string&, const EmbeddingConfig&, \ + const Eigen::GpuDevice*, RestoreBuffer&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +bool CheckpointLoader::IsOldCheckpoint( + const std::string& curr_partid_str, + const std::string& kPartOffsetTensorSuffsix) { + if (restore_args_.m_name_string.find(kPartStr) == std::string::npos) { + string tensor_name = restore_args_.m_name_string; + TensorShape part_offset_shape; + DataType 
part_offset_type; + Status st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (st.ok()) return false; + + string part_id = std::to_string(0); + tensor_name = restore_args_.m_name_string + "/" + kPartStr + part_id; + + Status form_st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (form_st.ok()) return false; + } else { + string part_id = std::to_string(0); + size_t part_pos = restore_args_.m_name_string.find(kPartStr); + size_t part_size = strlen(kPartStr); + size_t cur_part_size = curr_partid_str.size(); + + string pre_subname = restore_args_.m_name_string.substr(0, part_pos); + string post_subname = restore_args_.m_name_string.substr( + part_pos + part_size + cur_part_size); + string tensor_name = pre_subname + kPartStr + part_id + post_subname; + + TensorShape part_offset_shape; + DataType part_offset_type; + Status form_st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (form_st.ok()) return false; + pre_subname = + restore_args_.m_name_string.substr(0, part_pos - 1); /* var1*/ + post_subname = restore_args_.m_name_string.substr(part_pos + part_size + + cur_part_size); + tensor_name = pre_subname + post_subname; + + Status st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (st.ok()) return false; + } + + return true; +} +#define REGISTER_KERNELS(ktype, vtype) \ + template bool CheckpointLoader::IsOldCheckpoint( \ + const std::string&, const std::string&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + + +template +void CheckpointLoader::InitPartNumAndLoadedParts( + std::vector& tensor_name_vec) { + 
std::string tmp_key_suffix; + std::string tmp_kPartOffsetTensorSuffsix; + if (!restore_args_.m_is_incr) { + tmp_key_suffix = kKeySuffix; + tmp_kPartOffsetTensorSuffsix = kPartOffsetTensorSuffsix; + } else { + tmp_key_suffix = kIncrKeySuffix; + tmp_kPartOffsetTensorSuffsix = kIncrPartOffsetTensorSuffsix; + } + + restore_args_.m_loaded_parts.reserve(kSavedPartitionNum); + int orig_partnum = 0; + const string& curr_partid_str = std::to_string(restore_args_.m_partition_id); + size_t part_pos = restore_args_.m_name_string.find(kPartStr); + + if (IsOldCheckpoint(curr_partid_str, tmp_kPartOffsetTensorSuffsix)) { + restore_args_.m_is_oldform = true; + } + + if (part_pos == std::string::npos) { + for (;; orig_partnum++) { + string part_id = std::to_string(orig_partnum); + string tensor_name = + restore_args_.m_name_string + "/" + kPartStr + part_id; + string tensor_key = tensor_name + tmp_key_suffix; + TensorShape key_shape; + Status st = reader_->LookupTensorShape(tensor_key, &key_shape); + if (!st.ok()) { + break; + } + tensor_name_vec.emplace_back(tensor_name); + } + if (orig_partnum == 0) { + tensor_name_vec.emplace_back(restore_args_.m_name_string); + } + for (int i = 0; i < kSavedPartitionNum; ++i) { + restore_args_.m_loaded_parts.push_back(i); + } + } else { + for (;; orig_partnum++) { + string part_id = std::to_string(orig_partnum); + string pre_subname = restore_args_.m_name_string.substr(0, part_pos); + string post_subname = restore_args_.m_name_string.substr( + part_pos + strlen(kPartStr) + curr_partid_str.size()); + string tensor_name = pre_subname + kPartStr + part_id + post_subname; + string tensor_key = tensor_name + tmp_key_suffix; + TensorShape key_shape; + Status st = reader_->LookupTensorShape(tensor_key, &key_shape); + if (!st.ok()) { + break; + } + tensor_name_vec.emplace_back(tensor_name); + } + if (orig_partnum == 0) { + string pre_subname = restore_args_.m_name_string.substr(0, part_pos - 1); + string post_subname = 
restore_args_.m_name_string.substr( + part_pos + strlen(kPartStr) + curr_partid_str.size()); + string tmp_name = pre_subname + post_subname; + tensor_name_vec.emplace_back(tmp_name); + } + for (int i = 0; i < kSavedPartitionNum; i++) { + if (i % restore_args_.m_partition_num == restore_args_.m_partition_id) { + restore_args_.m_loaded_parts.push_back(i); + } + } + } + for (auto& tensor_name : tensor_name_vec) { + VLOG(1) << "**** " << restore_args_.m_name_string << " " << tensor_name + << " ****"; + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::InitPartNumAndLoadedParts(\ + std::vector&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status CheckpointLoader::EVInitTensorNameAndShape( + const std::string& tensor_name) { + if (!restore_args_.m_is_incr) { + restore_args_.m_tensor_key = tensor_name + kKeySuffix; + restore_args_.m_tensor_value = tensor_name + kValueSuffix; + restore_args_.m_tensor_version = tensor_name + kVersionSuffix; + restore_args_.m_tensor_freq = tensor_name + kFreqSuffix; + } else { + restore_args_.m_tensor_key = tensor_name + kIncrKeySuffix; + restore_args_.m_tensor_value = tensor_name + kIncrValueSuffix; + restore_args_.m_tensor_version = tensor_name + kIncrVersionSuffix; + restore_args_.m_tensor_freq = tensor_name + kIncrFreqSuffix; + } + + TensorShape key_shape, value_shape, version_shape, freq_shape; + + Status st = + reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_value, &value_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_version, + &version_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_key, + sizeof(K) * 
key_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_value, + sizeof(V) * value_shape.dim_size(0) * + value_shape.dim_size(1)); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_version, + sizeof(int64) * version_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_freq, &freq_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + freq_shape = version_shape; + } else { + return st; + } + } + st = reader_->LookupHeader(restore_args_.m_tensor_freq, + sizeof(int64) * freq_shape.dim_size(0)); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + restore_args_.m_has_freq = false; + } else { + return st; + } + } + restore_args_.m_old_dim = value_shape.dim_size(1); + + if (!restore_args_.m_is_oldform) { + TensorShape key_filter_shape, version_filter_shape, freq_filter_shape; + st = reader_->LookupTensorShape(restore_args_.m_tensor_key + "_filtered", + &key_filter_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + key_filter_shape = key_shape; + restore_args_.m_has_filter = false; + } else { + return st; + } + } + st = reader_->LookupTensorShape( + restore_args_.m_tensor_version + "_filtered", &version_filter_shape); + if ((!st.ok()) && (st.code() != error::NOT_FOUND)) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_key + "_filtered", + sizeof(K) * key_filter_shape.dim_size(0)); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + restore_args_.m_has_filter = false; + } else { + return st; + } + } + st = reader_->LookupHeader(restore_args_.m_tensor_version + "_filtered", + sizeof(K) * version_filter_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_freq + "_filtered", + &freq_filter_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + freq_filter_shape = freq_shape; + } else { + return 
st; + } + } + + st = reader_->LookupHeader(restore_args_.m_tensor_freq + "_filtered", + sizeof(K) * freq_filter_shape.dim_size(0)); + if (!st.ok() && st.code() != error::NOT_FOUND) { + return st; + } + } + return st; +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVInitTensorNameAndShape(\ + const std::string&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status CheckpointLoader::EVRestoreFeatures( + int tot_key_num, int64 key_part_offset, + int64 value_part_offset, int64 version_part_offset, + int64 freq_part_offset, RestoreBuffer& restore_buff, + int64 new_dim, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device) { + size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; + size_t value_unit_bytes_new = sizeof(V) * new_dim; + int64 tot_key_bytes_read(0); + int64 tot_value_bytes_read(0); + int64 tot_version_bytes_read(0); + int64 tot_freq_bytes_read(0); + size_t key_bytes_read = 0; + size_t value_bytes_read = 0; + size_t version_bytes_read = 0; + size_t freq_bytes_read = 0; + + while (tot_key_num > 0) { + size_t read_key_num = std::min( + std::min(kBufferSize / sizeof(K), kBufferSize / value_unit_bytes), + kBufferSize / sizeof(int64)); + read_key_num = std::min(read_key_num, kBufferSize / value_unit_bytes_new); + read_key_num = std::min((int)read_key_num, tot_key_num); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_key, key_part_offset + tot_key_bytes_read, + read_key_num * sizeof(K), restore_buff.key_buffer, key_bytes_read); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_value, value_part_offset + tot_value_bytes_read, + read_key_num * value_unit_bytes, restore_buff.value_buffer, + value_bytes_read); + if (!restore_args_.m_reset_version) { + reader_->LookupSegmentOffset( + 
restore_args_.m_tensor_version, + version_part_offset + tot_version_bytes_read, + read_key_num * sizeof(int64), restore_buff.version_buffer, + version_bytes_read); + if (version_bytes_read == 0) { + memset(restore_buff.version_buffer, -1, sizeof(int64) * read_key_num); + } + } else { + int64* version_tmp = (int64*)restore_buff.version_buffer; + memset(version_tmp, 0, read_key_num * sizeof(int64)); + } + + if (restore_args_.m_has_freq) { + reader_->LookupSegmentOffset( + restore_args_.m_tensor_freq, freq_part_offset + tot_freq_bytes_read, + read_key_num * sizeof(int64), restore_buff.freq_buffer, + freq_bytes_read); + if (freq_bytes_read == 0) { + int64* freq_tmp = (int64*)restore_buff.freq_buffer; + for (int64 i = 0; i < read_key_num; i++) { + freq_tmp[i] = (ev_->MinFreq() == 0) ? 1 : ev_->MinFreq(); + } + } + } else { + int64* freq_tmp = (int64*)restore_buff.freq_buffer; + for (int64 i = 0; i < read_key_num; i++) { + freq_tmp[i] = (ev_->MinFreq() == 0) ? 1 : ev_->MinFreq(); + } + } + if (key_bytes_read > 0) { + read_key_num = key_bytes_read / sizeof(K); + Status st = RestoreCustomDim(new_dim, read_key_num, value_unit_bytes, + value_bytes_read, value_unit_bytes_new, + restore_buff); + if (!st.ok()) { + LOG(FATAL) << "EV Restore fail:" << st.ToString(); + } + + st = storage_->RestoreFeatures( + read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, + restore_args_.m_partition_num, new_dim, false, restore_args_.m_is_incr, + emb_config, device, + filter_, restore_buff); + if (!st.ok()) { + LOG(FATAL) << "EV Restore fail:" << st.ToString(); + } + } + + tot_key_num -= read_key_num; + tot_key_bytes_read += key_bytes_read; + tot_value_bytes_read += value_bytes_read; + tot_version_bytes_read += version_bytes_read; + tot_freq_bytes_read += freq_bytes_read; + } + + return Status::OK(); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVRestoreFeatures( \ + int, int64, int64, int64, int64, RestoreBuffer&, \ + int64, const 
EmbeddingConfig&, const Eigen::GpuDevice*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status CheckpointLoader::EVRestoreFilteredFeatures( + int64 subpart_id, int64 value_len, RestoreBuffer& restore_buff, + typename TTypes::Flat part_filter_offset_flat, + const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device) { + int subpart_filter_offset = part_filter_offset_flat(subpart_id); + int tot_key_filter_num = + part_filter_offset_flat(subpart_id + 1) - subpart_filter_offset; + int64 key_filter_part_offset = subpart_filter_offset * sizeof(K); + int64 version_filter_part_offset = subpart_filter_offset * sizeof(int64); + int64 freq_filter_part_offset = subpart_filter_offset * sizeof(int64); + + VLOG(1) << "key_filter_num: " << tot_key_filter_num + << ", subpart_filter_offset: " << subpart_filter_offset; + + size_t key_filter_bytes_read = 0; + size_t version_filter_bytes_read = 0; + size_t freq_filter_bytes_read = 0; + + while (tot_key_filter_num > 0) { + size_t read_key_num = + std::min(kBufferSize / sizeof(K), kBufferSize / sizeof(int64)); + read_key_num = std::min((int)read_key_num, tot_key_filter_num); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_key + "_filtered", + key_filter_part_offset + key_filter_bytes_read, + read_key_num * sizeof(K), restore_buff.key_buffer, + key_filter_bytes_read); + if (!restore_args_.m_reset_version) { + reader_->LookupSegmentOffset( + restore_args_.m_tensor_version + "_filtered", + version_filter_part_offset + version_filter_bytes_read, + read_key_num * sizeof(int64), restore_buff.version_buffer, + version_filter_bytes_read); + } else { + int64* version_tmp = (int64*)restore_buff.version_buffer; + memset(version_tmp, 0, read_key_num * sizeof(int64)); + } + reader_->LookupSegmentOffset( + restore_args_.m_tensor_freq + "_filtered", + 
freq_filter_part_offset + freq_filter_bytes_read, + read_key_num * sizeof(int64), restore_buff.freq_buffer, + freq_filter_bytes_read); + if (key_filter_bytes_read > 0) { + read_key_num = key_filter_bytes_read / sizeof(K); + VLOG(2) << "restore, read_key_num:" << read_key_num; + Status st = storage_->RestoreFeatures( + read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, + restore_args_.m_partition_num, value_len, true, restore_args_.m_is_incr, + emb_config, device, + filter_, restore_buff); + if (!st.ok()) return st; + tot_key_filter_num -= read_key_num; + } + } + return Status::OK(); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVRestoreFilteredFeatures( \ + int64, int64, RestoreBuffer&, typename TTypes::Flat, \ + const EmbeddingConfig&, const Eigen::GpuDevice*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +}// namespace tensorflow \ No newline at end of file diff --git a/tensorflow/core/framework/embedding/embedding_var_restore.h b/tensorflow/core/framework/embedding/embedding_var_restore.h index ec97566fbec..3016ba9eeb8 100644 --- a/tensorflow/core/framework/embedding/embedding_var_restore.h +++ b/tensorflow/core/framework/embedding/embedding_var_restore.h @@ -16,23 +16,11 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ -#include "tensorflow/core/framework/allocator.h" -#include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/embedding/embedding_var.h" #include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/filter_policy.h" #include "tensorflow/core/framework/embedding/storage.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/kernels/save_restore_tensor.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/random/philox_random.h" -#include "tensorflow/core/lib/random/random.h" -#include "tensorflow/core/lib/random/random_distributions.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" +#include "tensorflow/core/util/env_var.h" namespace tensorflow { using GPUDevice = Eigen::GpuDevice; @@ -60,26 +48,7 @@ namespace { } // namespace template -int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer) { - TensorShape shape; - Status st; - st = reader->LookupTensorShape(record_key, &shape); - if (!st.ok()) { - LOG(FATAL) << "Restore record " << record_key << " failed"; - } - st = reader->LookupHeader(record_key, sizeof(K) * shape.dim_size(0)); - if (!st.ok()) { - LOG(FATAL) << "Restore record " << record_key << " failed"; - } - size_t bytes_read = 0; - *buffer = new K[shape.dim_size(0)]; - st = reader->LookupSegment(record_key, sizeof(K) * shape.dim_size(0), - (char*)*buffer, bytes_read); - if (!st.ok()) { - LOG(FATAL) << "Restore record " << record_key << " failed"; - } - return shape.dim_size(0); -} +int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer); template 
struct RestoreSSDBuffer { @@ -178,513 +147,28 @@ class CheckpointLoader { void RestoreInternal(const std::string& name_string, const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device, - RestoreBuffer& restore_buff) { - Status s = EVInitTensorNameAndShape(name_string); - if (!s.ok()) { - LOG(ERROR) << "EVInitTensorNameAndShape fail:" << s.ToString(); - return; - } - - Tensor part_offset_tensor; - Tensor part_filter_offset_tensor; - if (!restore_args_.m_is_oldform) { - /****** InitPartOffsetTensor ******/ - TensorShape part_offset_shape, part_filter_offset_shape; - DataType part_offset_type, part_filter_offset_type; - string offset_tensor_name; - if (!restore_args_.m_is_incr) { - offset_tensor_name = name_string + kPartOffsetTensorSuffsix; - } else { - offset_tensor_name = name_string + kIncrPartOffsetTensorSuffsix; - } - - string offset_filter_tensor_name = - name_string + kPartFilterOffsetTensorSuffsix; - Status s = reader_->LookupDtypeAndShape( - offset_tensor_name, &part_offset_type, &part_offset_shape); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail:" << s.error_message(); - } - s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, - &part_filter_offset_type, - &part_filter_offset_shape); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail: " << s.error_message(); - } - part_offset_tensor = - Tensor(cpu_allocator(), part_offset_type, part_offset_shape); - part_filter_offset_tensor = Tensor( - cpu_allocator(), part_filter_offset_type, part_filter_offset_shape); - s = reader_->Lookup(offset_tensor_name, &part_offset_tensor); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail:" << s.error_message(); - } - - s = reader_->Lookup(offset_filter_tensor_name, - &part_filter_offset_tensor); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail: " << s.error_message(); - } - } - auto part_offset_flat = part_offset_tensor.flat(); - auto part_filter_offset_flat = part_filter_offset_tensor.flat(); - - if (restore_args_.m_is_oldform) { - VLOG(1) << "old 
form, EV name:" << name_string - << ", partition_id:" << restore_args_.m_partition_id - << ", new partition num:" << restore_args_.m_partition_num; - int64 new_dim = ev_->ValueLen(); - TensorShape key_shape; - Status st = - reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); - if (!st.ok()) { - } - int tot_key_num = key_shape.dim_size(0); - Status s = EVRestoreFeatures(tot_key_num, 0, 0, 0, 0, restore_buff, - new_dim, emb_config, device); - if (!s.ok()) { - LOG(ERROR) << "EVRestoreFeaturesOld fail: " << s.error_message(); - } - } else { - int64 new_dim = ev_->ValueLen(); - VLOG(1) << "new form checkpoint... :" << name_string - << " , partition_id:" << restore_args_.m_partition_id - << " , partition_num:" << restore_args_.m_partition_num; - for (size_t i = 0; i < restore_args_.m_loaded_parts.size(); i++) { - int subpart_id = restore_args_.m_loaded_parts[i]; - size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; - size_t value_unit_bytes_new = sizeof(V) * new_dim; - int subpart_offset = part_offset_flat(subpart_id); - int tot_key_num = part_offset_flat(subpart_id + 1) - subpart_offset; - int64 key_part_offset = subpart_offset * sizeof(K); - int64 value_part_offset = - subpart_offset * sizeof(V) * restore_args_.m_old_dim; - int64 version_part_offset = subpart_offset * sizeof(int64); - int64 freq_part_offset = subpart_offset * sizeof(int64); - VLOG(1) << "dynamically load ev : " << name_string - << ", subpartid:" << subpart_id; - - EVRestoreFeatures(tot_key_num, key_part_offset, value_part_offset, - version_part_offset, freq_part_offset, restore_buff, - new_dim, emb_config, device); - - if (restore_args_.m_has_filter) { - Status s = EVRestoreFilteredFeatures( - subpart_id, new_dim, restore_buff, part_filter_offset_flat, - emb_config, device); - if (!s.ok()) { - LOG(ERROR) << "EVRestoreFilteredFeatures fail: " << s.error_message(); - } - } - } - } - } + RestoreBuffer& restore_buff); private: - void RestoreSSD() { - std::string 
name_string_temp(restore_args_.m_name_string); - std::string new_str = "_"; - int64 pos = name_string_temp.find("/"); - while (pos != std::string::npos) { - name_string_temp.replace(pos, 1, new_str.data(), 1); - pos = name_string_temp.find("/"); - } - std::string ssd_record_file_name = restore_args_.m_file_name_string + "-" + - name_string_temp + "-ssd_record"; - if (Env::Default()->FileExists(ssd_record_file_name + ".index").ok()) { - std::string ssd_emb_file_name = restore_args_.m_file_name_string + "-" + - name_string_temp + "-emb_files"; - BundleReader ssd_record_reader(Env::Default(), ssd_record_file_name); - RestoreSSDBuffer ssd_buffer(&ssd_record_reader); - VLOG(1) << "Loading SSD record... " << ssd_record_file_name; - storage_->RestoreSSD(ev_->GetEmbeddingIndex(), - ev_->GetEmbeddingSlotNum(), ev_->ValueLen(), - ssd_emb_file_name, ev_, ssd_buffer); - } - } + void RestoreSSD(); bool IsOldCheckpoint(const std::string& curr_partid_str, - const std::string& kPartOffsetTensorSuffsix) { - if (restore_args_.m_name_string.find(kPartStr) == std::string::npos) { - string tensor_name = restore_args_.m_name_string; - TensorShape part_offset_shape; - DataType part_offset_type; - Status st = - reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, - &part_offset_type, &part_offset_shape); - if (st.ok()) return false; - - string part_id = std::to_string(0); - tensor_name = restore_args_.m_name_string + "/" + kPartStr + part_id; - - Status form_st = - reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, - &part_offset_type, &part_offset_shape); - if (form_st.ok()) return false; - } else { - string part_id = std::to_string(0); - size_t part_pos = restore_args_.m_name_string.find(kPartStr); - size_t part_size = strlen(kPartStr); - size_t cur_part_size = curr_partid_str.size(); - - string pre_subname = restore_args_.m_name_string.substr(0, part_pos); - string post_subname = restore_args_.m_name_string.substr( - part_pos + part_size + 
cur_part_size); - string tensor_name = pre_subname + kPartStr + part_id + post_subname; - - TensorShape part_offset_shape; - DataType part_offset_type; - Status form_st = - reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, - &part_offset_type, &part_offset_shape); - if (form_st.ok()) return false; - pre_subname = - restore_args_.m_name_string.substr(0, part_pos - 1); /* var1*/ - post_subname = restore_args_.m_name_string.substr(part_pos + part_size + - cur_part_size); - tensor_name = pre_subname + post_subname; - - Status st = - reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, - &part_offset_type, &part_offset_shape); - if (st.ok()) return false; - } - - return true; - } - - void InitPartNumAndLoadedParts(std::vector& tensor_name_vec) { - std::string tmp_key_suffix; - std::string tmp_kPartOffsetTensorSuffsix; - if (!restore_args_.m_is_incr) { - tmp_key_suffix = kKeySuffix; - tmp_kPartOffsetTensorSuffsix = kPartOffsetTensorSuffsix; - } else { - tmp_key_suffix = kIncrKeySuffix; - tmp_kPartOffsetTensorSuffsix = kIncrPartOffsetTensorSuffsix; - } - - restore_args_.m_loaded_parts.reserve(kSavedPartitionNum); - int orig_partnum = 0; - const string& curr_partid_str = std::to_string(restore_args_.m_partition_id); - size_t part_pos = restore_args_.m_name_string.find(kPartStr); - - if (IsOldCheckpoint(curr_partid_str, tmp_kPartOffsetTensorSuffsix)) { - restore_args_.m_is_oldform = true; - } - - if (part_pos == std::string::npos) { - for (;; orig_partnum++) { - string part_id = std::to_string(orig_partnum); - string tensor_name = - restore_args_.m_name_string + "/" + kPartStr + part_id; - string tensor_key = tensor_name + tmp_key_suffix; - TensorShape key_shape; - Status st = reader_->LookupTensorShape(tensor_key, &key_shape); - if (!st.ok()) { - break; - } - tensor_name_vec.emplace_back(tensor_name); - } - if (orig_partnum == 0) { - tensor_name_vec.emplace_back(restore_args_.m_name_string); - } - for (int i = 0; i < kSavedPartitionNum; 
++i) { - restore_args_.m_loaded_parts.push_back(i); - } - } else { - for (;; orig_partnum++) { - string part_id = std::to_string(orig_partnum); - string pre_subname = restore_args_.m_name_string.substr(0, part_pos); - string post_subname = restore_args_.m_name_string.substr( - part_pos + strlen(kPartStr) + curr_partid_str.size()); - string tensor_name = pre_subname + kPartStr + part_id + post_subname; - string tensor_key = tensor_name + tmp_key_suffix; - TensorShape key_shape; - Status st = reader_->LookupTensorShape(tensor_key, &key_shape); - if (!st.ok()) { - break; - } - tensor_name_vec.emplace_back(tensor_name); - } - if (orig_partnum == 0) { - string pre_subname = restore_args_.m_name_string.substr(0, part_pos - 1); - string post_subname = restore_args_.m_name_string.substr( - part_pos + strlen(kPartStr) + curr_partid_str.size()); - string tmp_name = pre_subname + post_subname; - tensor_name_vec.emplace_back(tmp_name); - } - for (int i = 0; i < kSavedPartitionNum; i++) { - if (i % restore_args_.m_partition_num == restore_args_.m_partition_id) { - restore_args_.m_loaded_parts.push_back(i); - } - } - } - for (auto& tensor_name : tensor_name_vec) { - VLOG(1) << "**** " << restore_args_.m_name_string << " " << tensor_name - << " ****"; - } - } + const std::string& kPartOffsetTensorSuffsix); - Status EVInitTensorNameAndShape(const std::string& tensor_name) { - if (!restore_args_.m_is_incr) { - restore_args_.m_tensor_key = tensor_name + kKeySuffix; - restore_args_.m_tensor_value = tensor_name + kValueSuffix; - restore_args_.m_tensor_version = tensor_name + kVersionSuffix; - restore_args_.m_tensor_freq = tensor_name + kFreqSuffix; - } else { - restore_args_.m_tensor_key = tensor_name + kIncrKeySuffix; - restore_args_.m_tensor_value = tensor_name + kIncrValueSuffix; - restore_args_.m_tensor_version = tensor_name + kIncrVersionSuffix; - restore_args_.m_tensor_freq = tensor_name + kIncrFreqSuffix; - } + void InitPartNumAndLoadedParts(std::vector& tensor_name_vec); - 
TensorShape key_shape, value_shape, version_shape, freq_shape; - - Status st = - reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); - if (!st.ok()) { - return st; - } - st = reader_->LookupTensorShape(restore_args_.m_tensor_value, &value_shape); - if (!st.ok()) { - return st; - } - st = reader_->LookupTensorShape(restore_args_.m_tensor_version, - &version_shape); - if (!st.ok()) { - return st; - } - st = reader_->LookupHeader(restore_args_.m_tensor_key, - sizeof(K) * key_shape.dim_size(0)); - if (!st.ok()) { - return st; - } - st = reader_->LookupHeader(restore_args_.m_tensor_value, - sizeof(V) * value_shape.dim_size(0) * - value_shape.dim_size(1)); - if (!st.ok()) { - return st; - } - st = reader_->LookupHeader(restore_args_.m_tensor_version, - sizeof(int64) * version_shape.dim_size(0)); - if (!st.ok()) { - return st; - } - st = reader_->LookupTensorShape(restore_args_.m_tensor_freq, &freq_shape); - if (!st.ok()) { - if (st.code() == error::NOT_FOUND) { - freq_shape = version_shape; - } else { - return st; - } - } - st = reader_->LookupHeader(restore_args_.m_tensor_freq, - sizeof(int64) * freq_shape.dim_size(0)); - if (!st.ok()) { - if (st.code() == error::NOT_FOUND) { - restore_args_.m_has_freq = false; - } else { - return st; - } - } - restore_args_.m_old_dim = value_shape.dim_size(1); - - if (!restore_args_.m_is_oldform) { - TensorShape key_filter_shape, version_filter_shape, freq_filter_shape; - st = reader_->LookupTensorShape(restore_args_.m_tensor_key + "_filtered", - &key_filter_shape); - if (!st.ok()) { - if (st.code() == error::NOT_FOUND) { - key_filter_shape = key_shape; - restore_args_.m_has_filter = false; - } else { - return st; - } - } - st = reader_->LookupTensorShape( - restore_args_.m_tensor_version + "_filtered", &version_filter_shape); - if ((!st.ok()) && (st.code() != error::NOT_FOUND)) { - return st; - } - st = reader_->LookupHeader(restore_args_.m_tensor_key + "_filtered", - sizeof(K) * key_filter_shape.dim_size(0)); - if 
(!st.ok()) { - if (st.code() == error::NOT_FOUND) { - restore_args_.m_has_filter = false; - } else { - return st; - } - } - st = reader_->LookupHeader(restore_args_.m_tensor_version + "_filtered", - sizeof(K) * version_filter_shape.dim_size(0)); - if (!st.ok()) { - return st; - } - st = reader_->LookupTensorShape(restore_args_.m_tensor_freq + "_filtered", - &freq_filter_shape); - if (!st.ok()) { - if (st.code() == error::NOT_FOUND) { - freq_filter_shape = freq_shape; - } else { - return st; - } - } - - st = reader_->LookupHeader(restore_args_.m_tensor_freq + "_filtered", - sizeof(K) * freq_filter_shape.dim_size(0)); - if (!st.ok() && st.code() != error::NOT_FOUND) { - return st; - } - } - return st; - } + Status EVInitTensorNameAndShape(const std::string& tensor_name); Status EVRestoreFeatures(int tot_key_num, int64 key_part_offset, int64 value_part_offset, int64 version_part_offset, int64 freq_part_offset, RestoreBuffer& restore_buff, int64 new_dim, const EmbeddingConfig& emb_config, - const Eigen::GpuDevice* device) { - size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; - size_t value_unit_bytes_new = sizeof(V) * new_dim; - int64 tot_key_bytes_read(0); - int64 tot_value_bytes_read(0); - int64 tot_version_bytes_read(0); - int64 tot_freq_bytes_read(0); - size_t key_bytes_read = 0; - size_t value_bytes_read = 0; - size_t version_bytes_read = 0; - size_t freq_bytes_read = 0; - - while (tot_key_num > 0) { - size_t read_key_num = std::min( - std::min(kBufferSize / sizeof(K), kBufferSize / value_unit_bytes), - kBufferSize / sizeof(int64)); - read_key_num = std::min(read_key_num, kBufferSize / value_unit_bytes_new); - read_key_num = std::min((int)read_key_num, tot_key_num); - reader_->LookupSegmentOffset( - restore_args_.m_tensor_key, key_part_offset + tot_key_bytes_read, - read_key_num * sizeof(K), restore_buff.key_buffer, key_bytes_read); - reader_->LookupSegmentOffset( - restore_args_.m_tensor_value, value_part_offset + tot_value_bytes_read, - read_key_num 
* value_unit_bytes, restore_buff.value_buffer, - value_bytes_read); - if (!restore_args_.m_reset_version) { - reader_->LookupSegmentOffset( - restore_args_.m_tensor_version, - version_part_offset + tot_version_bytes_read, - read_key_num * sizeof(int64), restore_buff.version_buffer, - version_bytes_read); - if (version_bytes_read == 0) { - memset(restore_buff.version_buffer, -1, sizeof(int64) * read_key_num); - } - } else { - int64* version_tmp = (int64*)restore_buff.version_buffer; - memset(version_tmp, 0, read_key_num * sizeof(int64)); - } - - if (restore_args_.m_has_freq) { - reader_->LookupSegmentOffset( - restore_args_.m_tensor_freq, freq_part_offset + tot_freq_bytes_read, - read_key_num * sizeof(int64), restore_buff.freq_buffer, - freq_bytes_read); - if (freq_bytes_read == 0) { - int64* freq_tmp = (int64*)restore_buff.freq_buffer; - for (int64 i = 0; i < read_key_num; i++) { - freq_tmp[i] = (ev_->MinFreq() == 0) ? 1 : ev_->MinFreq(); - } - } - } else { - int64* freq_tmp = (int64*)restore_buff.freq_buffer; - for (int64 i = 0; i < read_key_num; i++) { - freq_tmp[i] = (ev_->MinFreq() == 0) ? 
1 : ev_->MinFreq(); - } - } - if (key_bytes_read > 0) { - read_key_num = key_bytes_read / sizeof(K); - Status st = RestoreCustomDim(new_dim, read_key_num, value_unit_bytes, - value_bytes_read, value_unit_bytes_new, - restore_buff); - if (!st.ok()) { - LOG(FATAL) << "EV Restore fail:" << st.ToString(); - } - - st = storage_->RestoreFeatures( - read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, - restore_args_.m_partition_num, new_dim, false, restore_args_.m_is_incr, - emb_config, device, - filter_, restore_buff); - if (!st.ok()) { - LOG(FATAL) << "EV Restore fail:" << st.ToString(); - } - } - - tot_key_num -= read_key_num; - tot_key_bytes_read += key_bytes_read; - tot_value_bytes_read += value_bytes_read; - tot_version_bytes_read += version_bytes_read; - tot_freq_bytes_read += freq_bytes_read; - } - - return Status::OK(); - } + const Eigen::GpuDevice* device); Status EVRestoreFilteredFeatures( int64 subpart_id, int64 value_len, RestoreBuffer& restore_buff, typename TTypes::Flat part_filter_offset_flat, - const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device) { - int subpart_filter_offset = part_filter_offset_flat(subpart_id); - int tot_key_filter_num = - part_filter_offset_flat(subpart_id + 1) - subpart_filter_offset; - int64 key_filter_part_offset = subpart_filter_offset * sizeof(K); - int64 version_filter_part_offset = subpart_filter_offset * sizeof(int64); - int64 freq_filter_part_offset = subpart_filter_offset * sizeof(int64); - - VLOG(1) << "key_filter_num: " << tot_key_filter_num - << ", subpart_filter_offset: " << subpart_filter_offset; - - size_t key_filter_bytes_read = 0; - size_t version_filter_bytes_read = 0; - size_t freq_filter_bytes_read = 0; - - while (tot_key_filter_num > 0) { - size_t read_key_num = - std::min(kBufferSize / sizeof(K), kBufferSize / sizeof(int64)); - read_key_num = std::min((int)read_key_num, tot_key_filter_num); - reader_->LookupSegmentOffset( - restore_args_.m_tensor_key + "_filtered", - 
key_filter_part_offset + key_filter_bytes_read, - read_key_num * sizeof(K), restore_buff.key_buffer, - key_filter_bytes_read); - if (!restore_args_.m_reset_version) { - reader_->LookupSegmentOffset( - restore_args_.m_tensor_version + "_filtered", - version_filter_part_offset + version_filter_bytes_read, - read_key_num * sizeof(int64), restore_buff.version_buffer, - version_filter_bytes_read); - } else { - int64* version_tmp = (int64*)restore_buff.version_buffer; - memset(version_tmp, 0, read_key_num * sizeof(int64)); - } - reader_->LookupSegmentOffset( - restore_args_.m_tensor_freq + "_filtered", - freq_filter_part_offset + freq_filter_bytes_read, - read_key_num * sizeof(int64), restore_buff.freq_buffer, - freq_filter_bytes_read); - if (key_filter_bytes_read > 0) { - read_key_num = key_filter_bytes_read / sizeof(K); - VLOG(2) << "restore, read_key_num:" << read_key_num; - Status st = storage_->RestoreFeatures( - read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, - restore_args_.m_partition_num, value_len, true, restore_args_.m_is_incr, - emb_config, device, - filter_, restore_buff); - if (!st.ok()) return st; - tot_key_filter_num -= read_key_num; - } - } - return Status::OK(); - } + const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device); Status RestoreCustomDim(int new_dim, int read_key_num, size_t value_unit_bytes, size_t value_bytes_read, diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index 71667cf0917..5d1f20b581a 100644 --- a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -16,6 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ +#include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { @@ -29,6 +30,7 @@ class ValuePtr; template class GPUHashTable; +using GPUDevice = Eigen::GpuDevice; namespace embedding { template @@ -90,15 +92,15 @@ class KVInterface { virtual Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, int32 default_v_num, - size_t n, const Eigen::GpuDevice& device) { + size_t n, const GPUDevice& device) { return Status::OK(); } virtual Status BatchLookupOrCreateKeys(const K* keys, size_t n, - int32* item_idxs, const Eigen::GpuDevice& device) { + int32* item_idxs, const GPUDevice& device) { return Status::OK(); } - virtual Status BatchLookup(const Eigen::GpuDevice& device, + virtual Status BatchLookup(const GPUDevice& device, const K* keys, V* val, size_t n, const V* default_v) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookup in KVInterface."); diff --git a/tensorflow/core/framework/embedding/ssd_record_descriptor.cc b/tensorflow/core/framework/embedding/ssd_record_descriptor.cc new file mode 100644 index 00000000000..b224b24e856 --- /dev/null +++ b/tensorflow/core/framework/embedding/ssd_record_descriptor.cc @@ -0,0 +1,88 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ +#include "tensorflow/core/framework/embedding/ssd_record_descriptor.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/path.h" + +namespace tensorflow { +namespace embedding { +template +template +void SsdRecordDescriptor::DumpSection( + const std::vector& data_vec, + const std::string& section_str, + BundleWriter* writer, + std::vector& dump_buffer) { + EVVectorDataDumpIterator iter(data_vec); + SaveTensorWithFixedBuffer( + section_str, + writer, dump_buffer.data(), + dump_buffer.size(), &iter, + TensorShape({data_vec.size()})); +} +#define REGISTER_KERNELS(ktype, ttype) \ + template void SsdRecordDescriptor::DumpSection( \ + const std::vector&, const std::string&, \ + BundleWriter*, std::vector&); +REGISTER_KERNELS(int32, int32); +REGISTER_KERNELS(int32, int64); +REGISTER_KERNELS(int64, int32); +REGISTER_KERNELS(int64, int64); +#undef REGISTER_KERNELS + +template +void SsdRecordDescriptor::DumpSsdMeta( + const std::string& prefix, + const std::string& var_name) { + std::fstream fs; + std::string var_name_temp(var_name); + std::string new_str = "_"; + int64 pos = var_name_temp.find("/"); + while (pos != std::string::npos) { + var_name_temp.replace(pos, 1, new_str.data(), 1); + pos = var_name_temp.find("/"); + } + + std::string ssd_record_path = + prefix + "-" + var_name_temp + "-ssd_record"; + BundleWriter ssd_record_writer(Env::Default(), + ssd_record_path); + size_t bytes_limit = 8 << 20; + std::vector dump_buffer(bytes_limit); + + DumpSection(key_list, "keys", + &ssd_record_writer, dump_buffer); + DumpSection(key_file_id_list, "keys_file_id", + &ssd_record_writer, dump_buffer); + DumpSection(key_offset_list, "keys_offset", + &ssd_record_writer, dump_buffer); + DumpSection(file_list, "files", + &ssd_record_writer, dump_buffer); + DumpSection(invalid_record_count_list, "invalid_record_count", + 
&ssd_record_writer, dump_buffer); + DumpSection(record_count_list, "record_count", + &ssd_record_writer, dump_buffer); + + ssd_record_writer.Finish(); +} +#define REGISTER_KERNELS(ktype) \ + template void SsdRecordDescriptor::DumpSsdMeta( \ + const std::string&, const std::string&); +REGISTER_KERNELS(int32); +REGISTER_KERNELS(int64); +#undef REGISTER_KERNELS +}//namespace embedding +}//namespace tensorflow diff --git a/tensorflow/core/framework/embedding/ssd_record_descriptor.h b/tensorflow/core/framework/embedding/ssd_record_descriptor.h index 9d015236934..aeb8d324759 100644 --- a/tensorflow/core/framework/embedding/ssd_record_descriptor.h +++ b/tensorflow/core/framework/embedding/ssd_record_descriptor.h @@ -20,14 +20,13 @@ limitations under the License. #include #include #include - +#include #include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/platform/env.h" namespace tensorflow { +class BundleWriter; namespace embedding { template @@ -59,48 +58,10 @@ class SsdRecordDescriptor { void DumpSection(const std::vector& data_vec, const std::string& section_str, BundleWriter* writer, - std::vector& dump_buffer) { - EVVectorDataDumpIterator iter(data_vec); - SaveTensorWithFixedBuffer( - section_str, - writer, dump_buffer.data(), - dump_buffer.size(), &iter, - TensorShape({data_vec.size()})); - } + std::vector& dump_buffer); void DumpSsdMeta(const std::string& prefix, - const std::string& var_name) { - std::fstream fs; - std::string var_name_temp(var_name); - std::string new_str = "_"; - int64 pos = var_name_temp.find("/"); - while (pos != std::string::npos) { - var_name_temp.replace(pos, 1, new_str.data(), 1); - pos = var_name_temp.find("/"); - } - - std::string ssd_record_path = - prefix + "-" + var_name_temp + "-ssd_record"; - 
BundleWriter ssd_record_writer(Env::Default(), - ssd_record_path); - size_t bytes_limit = 8 << 20; - std::vector dump_buffer(bytes_limit); - - DumpSection(key_list, "keys", - &ssd_record_writer, dump_buffer); - DumpSection(key_file_id_list, "keys_file_id", - &ssd_record_writer, dump_buffer); - DumpSection(key_offset_list, "keys_offset", - &ssd_record_writer, dump_buffer); - DumpSection(file_list, "files", - &ssd_record_writer, dump_buffer); - DumpSection(invalid_record_count_list, "invalid_record_count", - &ssd_record_writer, dump_buffer); - DumpSection(record_count_list, "record_count", - &ssd_record_writer, dump_buffer); - - ssd_record_writer.Finish(); - } + const std::string& var_name); void CopyEmbeddingFilesToCkptDir( const std::string& prefix, diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index d212e5b9c77..bb949183492 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -26,7 +26,6 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/storage_config.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" #include "tensorflow/core/util/work_sharder.h" #include "tensorflow/core/framework/device_base.h" #if GOOGLE_CUDA @@ -53,6 +52,9 @@ struct SsdRecordDescriptor; template class GPUHashTable; +class BundleWriter; +class BundleReader; + template struct EmbeddingVarContext; namespace { diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index fc1b2cd9c67..115e3c4bae6 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -2907,7 +2907,10 @@ tf_kernel_library( hdrs = ["kv_variable_ops.h"], srcs = ["kv_variable_ops.cc", "kv_variable_lookup_ops.cc", - "kv_variable_restore_ops.cc"], + "kv_variable_restore_ops.cc", + "//tensorflow/core:framework/embedding/embedding_var_ckpt_data.cc", + "//tensorflow/core:framework/embedding/embedding_var_restore.cc", + "//tensorflow/core:framework/embedding/ssd_record_descriptor.cc"], copts = tf_copts() + ["-g"], deps = [ ":bounds_check", From fe194b0718f9cc4f30a31e721780da2a956b6df8 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Tue, 19 Sep 2023 09:57:00 +0800 Subject: [PATCH 06/45] [Embedding] Fix incorrect frequency in shared-embedding. 
(#931) Signed-off-by: lixy9474 --- .../python/ops/embedding_variable_ops_test.py | 74 +++++++++++++++++++ tensorflow/python/ops/kv_variable_ops.py | 4 +- .../python/training/gradient_descent.py | 15 +++- tensorflow/python/training/optimizer.py | 30 +++++++- 4 files changed, 115 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index 25a0cb6ff11..c6cdf951a1e 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -2816,5 +2816,79 @@ def testSetInitializedWithRestore(self): result = sess.run(var._is_initialized_op) self.assertEqual(True, result) + def testCountsTensor(self): + os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + sp1 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + sp2 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([3,3,3,4,4,1], dtypes.int64), + dense_shape=[6, 1]) + emb1 = embedding_ops.embedding_lookup_sparse(var, sp1, None) + emb2 = embedding_ops.embedding_lookup_sparse(var, sp2, None) + emb = emb1 + emb2 + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + for name, shape in 
checkpoint_utils.list_variables(ckpt_path): + if name == "var_1-freqs": + value = checkpoint_utils.load_variable(ckpt_path, name) + self.assertAllEqual(value, [3, 3, 1, 3, 2]) + + def testCountsTensorWithGradientDescent(self): + os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + sp1 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + sp2 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([3,3,3,4,4,1], dtypes.int64), + dense_shape=[6, 1]) + emb1 = embedding_ops.embedding_lookup_sparse(var, sp1, None) + emb2 = embedding_ops.embedding_lookup_sparse(var, sp2, None) + emb = emb1 + emb2 + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = gradient_descent.GradientDescentOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + for name, shape in checkpoint_utils.list_variables(ckpt_path): + if name == "var_1-freqs": + value = checkpoint_utils.load_variable(ckpt_path, name) + self.assertAllEqual(value, [3, 3, 1, 3, 2]) + + del os.environ["TF_RECORD_FREQ"] + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 701c03f6975..96329ca345b 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -368,7 +368,7 @@ def 
_init_from_args(self, self._dtype = initial_value.dtype.base_dtype self._constraint = constraint self._gather_op = None - self._counts_tensor = None + self._counts_tensor = {} if self._is_primary: self._slot_num = 0 else: @@ -850,7 +850,7 @@ def sparse_read(self, indices, name=None, ev_init_value=None, counts=None): default_value, counts, is_inference=True, name=name) - self._counts_tensor = counts + self._counts_tensor[indices] = counts else: value = gen_kv_variable_ops.kv_resource_gather(self._handle, indices, diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py index 32a12a0554f..799e3c5f5bd 100644 --- a/tensorflow/python/training/gradient_descent.py +++ b/tensorflow/python/training/gradient_descent.py @@ -71,12 +71,23 @@ def _resource_apply_dense(self, grad, handle): def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): if isinstance(handle, kv_variable_ops.EmbeddingVariable): global_step = training_util.get_or_create_global_step() - if handle.need_counts() and handle._counts_tensor is not None: + if handle.need_counts() and len(handle._counts_tensor.keys()) != 0: + if indices.op.type == "ConcatV2": + total_counts = [] + for tensor in indices.op.inputs: + if tensor.op.type == "Reshape": + indices_tensor = tensor.op.inputs[0] + total_counts.append(handle._counts_tensor[indices_tensor]) + from tensorflow.python.ops import array_ops + counts_tensor = array_ops.concat(total_counts, 0) + elif indices.op.type == "Reshape": + indices_tensor = indices.op.inputs[0] + counts_tensor = handle._counts_tensor[indices_tensor] return training_ops.kv_resource_sparse_apply_gradient_descent_with_counts( handle.handle, math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype), grad, indices, global_step, - handle._counts_tensor, use_locking=self._use_locking) + counts_tensor, use_locking=self._use_locking) else: return training_ops.kv_resource_sparse_apply_gradient_descent( handle.handle, 
math_ops.cast(self._learning_rate_tensor, diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 578d682cc11..7523604ccf9 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -93,6 +93,18 @@ def _deduplicate_indexed_slices_with_counts(values, indices): array_ops.shape(unique_indices)[0]) return (summed_values, unique_indices, indices_counts) +def _deduplicate_indexed_slices_with_counts_reduction(values, indices, counts): + """Sums `values` associated with any non-unique `indices` + and return counts of each count in `values`.""" + unique_indices, new_index_positions = array_ops.unique(indices) + summed_values = math_ops.unsorted_segment_sum( + values, new_index_positions, + array_ops.shape(unique_indices)[0]) + summed_counts = math_ops.unsorted_segment_sum( + counts, new_index_positions, + array_ops.shape(unique_indices)[0]) + return (summed_values, unique_indices, summed_counts) + def _var_key(var): # TODO(ashankar): Consolidate handling for eager and graph if hasattr(var, "op"): @@ -1088,14 +1100,24 @@ def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): """ from tensorflow.python.ops import kv_variable_ops if isinstance(handle, kv_variable_ops.EmbeddingVariable) and handle.need_counts(): - if handle._counts_tensor is None: + if len(handle._counts_tensor.keys()) == 0: summed_grad, unique_indices, indices_counts = \ _deduplicate_indexed_slices_with_counts( values=grad, indices=indices) else: - summed_grad, unique_indices = _deduplicate_indexed_slices( - values=grad, indices=indices) - indices_counts = handle._counts_tensor + if indices.op.type == "ConcatV2": + total_counts = [] + for tensor in indices.op.inputs: + if tensor.op.type == "Reshape": + indices_tensor = tensor.op.inputs[0] + total_counts.append(handle._counts_tensor[indices_tensor]) + counts_tensor = array_ops.concat(total_counts, 0) + elif indices.op.type == "Reshape": + indices_tensor = 
indices.op.inputs[0] + counts_tensor = handle._counts_tensor[indices_tensor] + summed_grad, unique_indices, indices_counts = \ + _deduplicate_indexed_slices_with_counts_reduction( + grad, indices, counts_tensor) return self._resource_apply_sparse( summed_grad, handle, unique_indices, indices_counts) else: From 29ecde4f6418cd3beca400a31e87e1e53d9567dc Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Wed, 20 Sep 2023 10:45:48 +0800 Subject: [PATCH 07/45] [Embedding] Fix missing return value of RestoreSSD of DramSSDHashStorage. (#926) Signed-off-by: lixy9474 --- tensorflow/core/framework/embedding/dram_ssd_storage.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/framework/embedding/dram_ssd_storage.h b/tensorflow/core/framework/embedding/dram_ssd_storage.h index 4243cc14eb3..356a61d865f 100644 --- a/tensorflow/core/framework/embedding/dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/dram_ssd_storage.h @@ -181,7 +181,9 @@ class DramSsdHashStorage : public MultiTierStorage { restore_buff.key_offset_list_buf, restore_buff.num_of_keys, file_id_map); + return Status::OK(); } + Status Eviction(K* evict_ids, int64 evict_size) override { ValuePtr* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { From 06f81cc7c26972d8d0851a652dc212976f54f592 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Tue, 17 Oct 2023 15:49:38 +0800 Subject: [PATCH 08/45] [Embedding] Refactor the data structure of EmbeddingVariable. 
(#924) Signed-off-by: lixy9474 --- .../framework/embedding/bloom_filter_policy.h | 77 ++- .../core/framework/embedding/config.proto | 6 +- .../counter_filter_descriptor_impl.h | 272 ++++++++ .../embedding/counter_filter_policy.h | 104 ++- .../framework/embedding/cpu_hash_map_kv.h | 91 ++- .../framework/embedding/dense_hash_map_kv.h | 15 +- .../embedding/dram_leveldb_storage.h | 75 +- .../framework/embedding/dram_pmem_storage.h | 88 +-- .../framework/embedding/dram_ssd_storage.h | 62 +- .../dynamic_dim_feature_descriptor_impl.h | 214 ++++++ .../framework/embedding/embedding_config.h | 17 +- .../embedding/embedding_memory_pool.h | 12 +- .../framework/embedding/embedding_var.cu.cc | 144 ---- .../core/framework/embedding/embedding_var.h | 345 +++------- .../embedding/embedding_var_ckpt_data.cc | 38 +- .../embedding/embedding_var_ckpt_data.h | 10 +- .../embedding/embedding_var_dump_iterator.h | 4 +- .../framework/embedding/feature_descriptor.h | 200 ++++++ .../embedding/feature_descriptor_impl.h | 317 +++++++++ .../core/framework/embedding/filter_factory.h | 12 +- .../core/framework/embedding/filter_policy.h | 48 +- .../embedding/globalstep_shrink_policy.h | 18 +- .../framework/embedding/gpu_hash_map_kv.h | 20 +- .../embedding/hbm_dram_ssd_storage.h | 458 ++++--------- .../framework/embedding/hbm_dram_storage.h | 411 ++++------- .../hbm_multi_tier_feature_descriptor.h | 122 ++++ .../embedding/hbm_storage_iterator.h | 7 +- .../core/framework/embedding/kv_interface.h | 29 +- .../embedding/l2weight_shrink_policy.h | 19 +- .../core/framework/embedding/layout_creator.h | 104 --- .../core/framework/embedding/leveldb_kv.h | 79 ++- .../embedding/lockless_hash_map_cpu.h | 243 ------- .../embedding/multi_tier_storage.cu.cc | 77 ++- .../framework/embedding/multi_tier_storage.h | 136 ++-- .../embedding/normal_feature_descriptor.h | 134 ++++ .../embedding/nullable_filter_policy.h | 99 ++- .../core/framework/embedding/shrink_policy.h | 21 +- 
.../framework/embedding/single_tier_storage.h | 237 +++---- .../core/framework/embedding/ssd_hash_kv.h | 112 +-- tensorflow/core/framework/embedding/storage.h | 170 +++-- .../core/framework/embedding/storage_config.h | 30 +- .../framework/embedding/storage_factory.h | 42 +- .../core/framework/embedding/value_ptr.h | 647 ------------------ tensorflow/core/kernels/BUILD | 5 +- .../kernels/embedding_variable_memory_test.cc | 20 +- .../kernels/embedding_variable_ops_test.cc | 632 ++++------------- .../embedding_variable_performance_test.cc | 25 +- .../core/kernels/embedding_variable_test.h | 43 +- .../group_embedding_lookup_ops_test.cc | 4 +- .../core/kernels/incr_save_restore_ops.h | 4 +- .../core/kernels/kv_variable_lookup_ops.cc | 4 +- tensorflow/core/kernels/kv_variable_ops.cc | 129 ++-- tensorflow/core/kernels/kv_variable_ops.h | 1 + .../core/kernels/kv_variable_restore_ops.cc | 72 +- tensorflow/core/kernels/save_restore_tensor.h | 1 - .../core/kernels/training_ali_op_helpers.h | 53 +- tensorflow/core/kernels/training_ali_ops.cc | 59 +- tensorflow/python/framework/ops.py | 2 + .../ops/embedding_variable_ops_gpu_test.py | 164 ++--- .../python/ops/embedding_variable_ops_test.py | 197 +++--- tensorflow/python/ops/kv_variable_ops.py | 14 + .../training/saving/saveable_object_util.py | 3 +- 62 files changed, 3060 insertions(+), 3738 deletions(-) create mode 100644 tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h create mode 100644 tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h create mode 100644 tensorflow/core/framework/embedding/feature_descriptor.h create mode 100644 tensorflow/core/framework/embedding/feature_descriptor_impl.h create mode 100644 tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h delete mode 100644 tensorflow/core/framework/embedding/layout_creator.h delete mode 100644 tensorflow/core/framework/embedding/lockless_hash_map_cpu.h create mode 100644 
tensorflow/core/framework/embedding/normal_feature_descriptor.h delete mode 100644 tensorflow/core/framework/embedding/value_ptr.h diff --git a/tensorflow/core/framework/embedding/bloom_filter_policy.h b/tensorflow/core/framework/embedding/bloom_filter_policy.h index 29b85e5bb4e..781511578af 100644 --- a/tensorflow/core/framework/embedding/bloom_filter_policy.h +++ b/tensorflow/core/framework/embedding/bloom_filter_policy.h @@ -35,9 +35,10 @@ class BloomFilterPolicy : public FilterPolicy { using FilterPolicy::config_; public: - BloomFilterPolicy(const EmbeddingConfig& config, EV* ev) : - FilterPolicy(config, ev) { - + BloomFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), + FilterPolicy(config, ev) { switch (config_.counter_type){ case DT_UINT64: VLOG(2) << "The type of bloom counter is uint64"; @@ -64,10 +65,10 @@ class BloomFilterPolicy : public FilterPolicy { Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); if (s.ok()) { - V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); @@ -81,17 +82,17 @@ class BloomFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* 
value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; if (value_ptr != nullptr) { embedding_ptr[i] = - ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission; } @@ -109,13 +110,13 @@ class BloomFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> lookup_or_create_ids(num_worker_threads); std::vector> lookup_or_create_cursor(num_worker_threads); - std::vector*>> + std::vector> lookup_or_create_ptrs(num_worker_threads); IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); std::vector> @@ -147,7 +148,7 @@ class BloomFilterPolicy : public FilterPolicy { 1000, do_work); std::vector total_ids(num_of_keys); - std::vector*> total_ptrs(num_of_keys); + std::vector total_ptrs(num_of_keys); std::vector total_cursors(num_of_keys); int num_of_admit_id = 0; for (int i = 0; i < num_worker_threads; i++) { @@ -157,7 +158,7 @@ class BloomFilterPolicy : public FilterPolicy { sizeof(K) * lookup_or_create_ids[i].size()); memcpy(total_ptrs.data() + num_of_admit_id, lookup_or_create_ptrs[i].data(), - sizeof(ValuePtr*) * lookup_or_create_ptrs[i].size()); + sizeof(void*) * lookup_or_create_ptrs[i].size()); memcpy(total_cursors.data() + num_of_admit_id, lookup_or_create_cursor[i].data(), sizeof(int) * lookup_or_create_cursor[i].size()); @@ -174,11 +175,12 @@ class BloomFilterPolicy : public FilterPolicy { #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { if (GetBloomFreq(key) >= config_.filter_freq) { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - V* mem_val = 
ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { AddFreq(key, count); @@ -186,19 +188,27 @@ class BloomFilterPolicy : public FilterPolicy { } } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { - *val = nullptr; - if ((GetFreq(key, *val) + count) >= config_.filter_freq) { + *value_ptr = nullptr; + if ((GetFreq(key, *value_ptr) + count) >= config_.filter_freq) { + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + ev_->storage()->Insert(key, value_ptr); + s = Status::OK(); + } *is_filter = true; - return ev_->LookupOrCreateKey(key, val); + feat_desc_->AddFreq(*value_ptr, count); + } else { + *is_filter = false; + AddFreq(key, count); } - *is_filter = false; - AddFreq(key, count); return Status::OK(); } - int64 GetFreq(K key, ValuePtr*) override { + int64 GetFreq(K key, void* val) override { return GetBloomFreq(key); } @@ -210,7 +220,7 @@ class BloomFilterPolicy : public FilterPolicy { return bloom_counter_; } - bool is_admit(K key, ValuePtr* value_ptr) override { + bool is_admit(K key, void* value_ptr) override { if (value_ptr == nullptr) { return false; } else { @@ -326,8 +336,12 @@ class BloomFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; int64 new_freq = freq_buff[i]; + int64 import_version = -1; + if (config_.steps_to_live != 0 || config_.record_version) { + import_version = version_buff[i]; + } if (!is_filter) { if (freq_buff[i] >= config_.filter_freq) { SetBloomFreq(key_buff[i], freq_buff[i]); @@ -339,17 +353,9 @@ class 
BloomFilterPolicy : public FilterPolicy { SetBloomFreq(key_buff[i], freq_buff[i]); } if (new_freq >= config_.filter_freq){ - ev_->CreateKey(key_buff[i], &value_ptr, to_dram); - if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); - } - if (!is_filter){ - ev_->LookupOrCreateEmb(value_ptr, - value_buff + i * ev_->ValueLen()); - } else { - ev_->LookupOrCreateEmb(value_ptr, - ev_->GetDefaultValue(key_buff[i])); - } + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + new_freq, import_version, config_.emb_index); } } return Status::OK(); @@ -449,6 +455,7 @@ class BloomFilterPolicy : public FilterPolicy { } private: void* bloom_counter_; + embedding::FeatureDescriptor* feat_desc_; std::vector seeds_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/config.proto b/tensorflow/core/framework/embedding/config.proto index a8535347020..424fc5e1a38 100644 --- a/tensorflow/core/framework/embedding/config.proto +++ b/tensorflow/core/framework/embedding/config.proto @@ -50,11 +50,7 @@ enum EmbeddingVariableType { enum ValuePtrStatus { OK = 0; IS_DELETED = 1; -} - -enum ValuePosition { - IN_DRAM = 0; - NOT_IN_DRAM = 1; + NOT_IN_DRAM = 2; } enum IsSetInitialized { diff --git a/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h b/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h new file mode 100644 index 00000000000..e51166a2895 --- /dev/null +++ b/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h @@ -0,0 +1,272 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl: public FeatureDescriptorImpl { + public: + CounterFilterDescriptorImpl( + Allocator* alloc, + int64 slot_num, + bool need_record_freq, + bool need_record_version, + int64 filter_freq, + StorageType storage_type) + : filter_freq_(filter_freq), + is_record_freq_(need_record_freq), + FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version) { + if (filter_freq >= (1L << version_offset_bits_)) { + LOG(FATAL)<<"Filter freqeuncy threshold shouldn't bigger than 2^12."; + } + + if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { +#if GOOGLE_CUDA + feat_desc_impl_.reset( + new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); +#endif //GOOGLE_CUDA + } else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } + } + + CounterFilterDescriptorImpl(CounterFilterDescriptorImpl* feat_desc_impl) + : filter_freq_(feat_desc_impl->filter_freq_), + FeatureDescriptorImpl(feat_desc_impl) { +#if GOOGLE_CUDA + if 
(typeid(*(feat_desc_impl->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl*)){ + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); + } else { +#endif //GOOGLE_CUDA + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); +#if GOOGLE_CUDA + } +#endif //GOOGLE_CUDA + } + + ~CounterFilterDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return feat_desc_impl_->InitSlotInfo( + emb_index, embedding_dim, default_value); + } + + bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) override { + return feat_desc_impl_->InitSlotInfo(feat_desc_impl); + } + + V* GetEmbedding(void* val, int emb_index) override { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + bool IsAdmit(void* val) override { + return (GetFlag(val) == 0); + } + + void* Admit(void* val) override { + if (!IsAdmit(val)) { + return feat_desc_impl_->Allocate(); + } else { + LOG(FATAL)<<"Only unadmited feature could be admited."; + return nullptr; + } + } + + void* Allocate() override { + uint64* val = (uint64*)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + uint64 flag = 1L << flag_offset_bits_; + uint64 version = (0xffffffffffffffff << version_offset_bits_); + uint64 freq = 0; + *val = version + freq; + val = (uint64*)((uint64)val | flag); + return (void*)val; + } + + void* Allocate(int64 freq) override { + if (freq < filter_freq_) { + return Allocate(); + } else { + return feat_desc_impl_->Allocate(); + } + } + + void Deallocate(void* val) override { + if (IsAdmit(val)) { + feat_desc_impl_->Deallocate(val); + } else { + void* tmp = GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + + void Deallocate(const std::vector& vals) override { + for (auto val: vals) { + if (IsAdmit(val)) { + feat_desc_impl_->Deallocate(val); + } else { + void* tmp = 
GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + } + + void AddFreq(void* val, int64 count) override { + uint64* tmp = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + __sync_fetch_and_add(tmp, count); + } else { + feat_desc_impl_->AddFreq(val, count); + } + } + + void SetAllocator(Allocator* alloc) override { + feat_desc_impl_->SetAllocator(alloc); + } + + void SetValue(void* val, int64 emb_index, V* value) { + if (IsAdmit(val)) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + } + + void SetDefaultValue(void* val, int64 key) override { + feat_desc_impl_->SetDefaultValue(val, key); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + feat_desc_impl_->SetDefaultValues( + keys, init_cursor, + value_ptrs, compute_stream, + event_mgr, gpu_device); + } +#endif + + int64 GetFreq(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + return *((uint64*)tmp) & + ((1L << version_offset_bits_) - 1); + } else { + if (is_record_freq_) { + return feat_desc_impl_->GetFreq(val); + } else { + return filter_freq_; + } + } + } + + int64 GetVersion(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + int64 version = *(uint64*)tmp >> version_offset_bits_; + if (version == 0xffffffffffff) { + version = -1; + } + return version; + } else { + return feat_desc_impl_->GetVersion(val); + } + } + + void UpdateVersion(void* val, int64 version) override { + if (!IsAdmit(val)) { + void* tmp_ptr = GetPtr(val); + uint64 tmp_val = 0; + uint64 result = 0; + do { + tmp_val = *(uint64*)tmp_ptr; + version = version << version_offset_bits_; + uint64 freq = tmp_val & ((1L << version_offset_bits_) - 1); + result = version + freq; + } while(!__sync_bool_compare_and_swap((uint64*)tmp_ptr, tmp_val, result)); + } else { + feat_desc_impl_->UpdateVersion(val, version); + } + } + + void 
SetFreq(void* val, int64 freq) override { + uint64* tmp_ptr = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + uint64 tmp = *tmp_ptr; + tmp = ~((1L << version_offset_bits_) - 1) & tmp; + tmp += freq; + __sync_bool_compare_and_swap(tmp_ptr, *tmp_ptr, tmp); + } else { + feat_desc_impl_->SetFreq(val, freq); + } + } + + int data_bytes() override { + return alloc_bytes_; + } + private: + uint64 GetFlag(void* val) { + return (uint64)val >> flag_offset_bits_; + } + + void* GetPtr(void* val) { + return (void*)((uint64)val & ((1L << flag_offset_bits_) - 1)); + } + + int64 filter_freq_; + int alloc_bytes_ = 8; + Allocator* alloc_ = ev_allocator(); + const int freq_offset_bits_ = 0; + const int version_offset_bits_ = 16; + const int flag_offset_bits_ = 48; + std::unique_ptr> feat_desc_impl_; + bool is_record_freq_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/counter_filter_policy.h b/tensorflow/core/framework/embedding/counter_filter_policy.h index c9f19f34cd2..19cd90ad01c 100644 --- a/tensorflow/core/framework/embedding/counter_filter_policy.h +++ b/tensorflow/core/framework/embedding/counter_filter_policy.h @@ -25,18 +25,19 @@ template class CounterFilterPolicy : public FilterPolicy { using FilterPolicy::ev_; using FilterPolicy::config_; - using FilterPolicy::LookupOrCreateEmbInternal; public: - CounterFilterPolicy(const EmbeddingConfig& config, EV* ev) : - FilterPolicy(config, ev) {} + CounterFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), + FilterPolicy(config, ev) {} Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); - if (s.ok() && GetFreq(key, value_ptr) >= config_.filter_freq) { - V* mem_val = 
ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + if (s.ok() && feat_desc_->IsAdmit(value_ptr)) { + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); @@ -50,18 +51,18 @@ class CounterFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, keys, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; int64 freq = GetFreq(keys[i], value_ptr); - if (value_ptr != nullptr && freq >= config_.filter_freq) { + if (value_ptr != nullptr && feat_desc_->IsAdmit(value_ptr)) { embedding_ptr[i] = - ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission; } @@ -79,7 +80,7 @@ class CounterFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> @@ -90,36 +91,61 @@ class CounterFilterPolicy : public FilterPolicy { #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - if (GetFreq(key, *value_ptr) >= config_.filter_freq) { - 
V* mem_val = ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + if (is_filter) { + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); } } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { - Status s = ev_->LookupOrCreateKey(key, val); - *is_filter = (GetFreq(key, *val) + count) >= config_.filter_freq; + *is_filter = false; + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + if (count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + feat_desc_->Deallocate(*value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + ev_->storage()->Insert(key, value_ptr); + s = Status::OK(); + } else if (!feat_desc_->IsAdmit(*value_ptr)) { + int64 freq = feat_desc_->GetFreq(*value_ptr); + if (freq + count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetFreq(admit_value_ptr, freq); + feat_desc_->UpdateVersion( + admit_value_ptr, feat_desc_->GetVersion(*value_ptr)); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + ev_->storage()->UpdateValuePtr(key, admit_value_ptr, *value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + } else { + *is_filter = true; + } + feat_desc_->AddFreq(*value_ptr, count); return s; } - int64 GetFreq(K key, ValuePtr* value_ptr) override { - return value_ptr->GetFreq(); + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); } int64 GetFreq(K key) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; TF_CHECK_OK(ev_->LookupOrCreateKey(key, 
&value_ptr)); - return value_ptr->GetFreq(); - } - - bool is_admit(K key, ValuePtr* value_ptr) override { - return (GetFreq(key, value_ptr) >= config_.filter_freq); + return feat_desc_->GetFreq(value_ptr); } Status Restore(int64 key_num, int bucket_num, int64 partition_id, @@ -136,27 +162,33 @@ class CounterFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; - ev_->CreateKey(key_buff[i], &value_ptr, to_dram); + int64 import_freq = 0; + int64 import_version = -1; if (!is_filter) { if (freq_buff[i] >= config_.filter_freq) { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } else { - value_ptr->SetFreq(config_.filter_freq); + import_freq = config_.filter_freq; } } else { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); - } - if (value_ptr->GetFreq() >= config_.filter_freq) { - LookupOrCreateEmbInternal(is_filter, to_dram, i, value_len, - value_ptr, value_buff, key_buff); + import_version = version_buff[i]; } + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); } return Status::OK(); } + + bool is_admit(K key, void* value_ptr) override { + return feat_desc_->IsAdmit(value_ptr); + } + + private: + embedding::FeatureDescriptor* feat_desc_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h index 600f6c20e44..8476c399c40 100644 --- a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h @@ -21,25 +21,25 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class LocklessHashMap : public KVInterface { public: - LocklessHashMap() { + LocklessHashMap(FeatureDescriptor* feat_desc): feat_desc_(feat_desc) { hash_map_.max_load_factor(0.8); hash_map_.set_empty_key_and_value( LocklessHashMap::EMPTY_KEY_, nullptr); hash_map_.set_counternum(16); hash_map_.set_deleted_key(LocklessHashMap::DELETED_KEY_); + pthread_key_create(&key_, NULL); } - ~LocklessHashMap() override {} + ~LocklessHashMap() override { + pthread_key_delete(key_); + } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { auto iter = hash_map_.find_wait_free(key); if (iter.first == LocklessHashMap::EMPTY_KEY_) { return errors::NotFound( @@ -60,10 +60,10 @@ class LocklessHashMap : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { auto iter = hash_map_.insert_lockless( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); + std::move(std::pair(key, + const_cast(value_ptr)))); // insert fail, exist key if ((*(iter.first)).second != value_ptr){ return errors::AlreadyExists( @@ -88,14 +88,40 @@ class LocklessHashMap : public KVInterface { } } + Status Commit(K key, const void* value_ptr) override { + auto iter = hash_map_.insert_lockless(std::move( + std::pair(key, + const_cast(value_ptr)))); + if ((*(iter.first)).second != value_ptr) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap( + &((*(iter.first)).second), + (*(iter.first)).second, + value_ptr); + } + return Status::OK(); + } + Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { + for(int i = 0; i < keys.size(); ++i) { + auto iter = hash_map_.insert_lockless(std::move( + std::pair(keys[i], + const_cast(value_ptrs[i])))); + 
if ((*(iter.first)).second != value_ptrs[i]) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap( + &((*(iter.first)).second), + (*(iter.first)).second, + value_ptrs[i]); + } + } return Status::OK(); } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { - std::pair*> *hash_map_dump; + std::vector* value_ptr_list) override { + std::pair *hash_map_dump; int64 bucket_count; auto it = hash_map_.GetSnapshot(); hash_map_dump = it.first; @@ -120,11 +146,50 @@ class LocklessHashMap : public KVInterface { return ""; } + void UpdateValuePtr( + K key, void* new_value_ptr, + void* old_value_ptr) override { + auto iter = hash_map_.insert_lockless( + std::move(std::pair(key, old_value_ptr))); + bool flag = __sync_bool_compare_and_swap( + &((*(iter.first)).second), old_value_ptr, new_value_ptr); + if (flag) { + AppendToValuePtrQueue(old_value_ptr); + } else { + feat_desc_->Deallocate(new_value_ptr); + } + } + + private: + void AppendToValuePtrQueue(void* old_value_ptr) { + //A parameter that can be adjusted in the future + std::deque* value_ptr_queue = GetOutOfDateValuePtrQueue(); + if (value_ptr_queue->size() > CAP_INVALID_VALUEPTR) { + void* value_ptr = value_ptr_queue->front(); + feat_desc_->Deallocate(value_ptr); + value_ptr_queue->pop_front(); + } + value_ptr_queue->emplace_back(old_value_ptr); + } + + std::deque* GetOutOfDateValuePtrQueue() { + std::deque* value_ptr_queue = + static_cast*>(pthread_getspecific(key_)); + if (value_ptr_queue == nullptr) { + value_ptr_queue = new std::deque(); + pthread_setspecific(key_, value_ptr_queue); + } + return value_ptr_queue; + } + private: - typedef google::dense_hash_map_lockless*> LockLessHashMap; + typedef google::dense_hash_map_lockless LockLessHashMap; static const int EMPTY_KEY_; static const int DELETED_KEY_; LockLessHashMap hash_map_; + const int CAP_INVALID_VALUEPTR = 20000; + FeatureDescriptor* feat_desc_; + pthread_key_t key_; }; template const int 
LocklessHashMap::EMPTY_KEY_ = -1; diff --git a/tensorflow/core/framework/embedding/dense_hash_map_kv.h b/tensorflow/core/framework/embedding/dense_hash_map_kv.h index 92baf037721..ffaf2e335dc 100644 --- a/tensorflow/core/framework/embedding/dense_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/dense_hash_map_kv.h @@ -23,9 +23,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/kv_interface.h" namespace tensorflow { -template -class ValuePtr; - namespace embedding { template @@ -45,7 +42,7 @@ class DenseHashMap : public KVInterface { delete []hash_map_; } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { int64 l_id = std::abs(key)%partition_num_; spin_rd_lock l(hash_map_[l_id].mu); auto iter = hash_map_[l_id].hash_map.find(key); @@ -70,7 +67,7 @@ class DenseHashMap : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { int64 l_id = std::abs(key)%partition_num_; spin_wr_lock l(hash_map_[l_id].mu); auto iter = hash_map_[l_id].hash_map.find(key); @@ -80,8 +77,8 @@ class DenseHashMap : public KVInterface { "already exists Key: ", key, " in DenseHashMap."); } else { auto iter = hash_map_[l_id].hash_map.insert( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); + std::move(std::pair(key, + const_cast(value_ptr)))); return Status::OK(); } } @@ -109,7 +106,7 @@ class DenseHashMap : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector* >* value_ptr_list) override { + std::vector* value_ptr_list) override { dense_hash_map hash_map_dump[partition_num_]; for (int i = 0; i< partition_num_; i++) { spin_rd_lock l(hash_map_[i].mu); @@ -132,7 +129,7 @@ class DenseHashMap : public KVInterface { const int partition_num_ = 1000; struct dense_hash_map { mutable easy_spinrwlock_t mu = EASY_SPINRWLOCK_INITIALIZER; - google::dense_hash_map* > hash_map; + 
google::dense_hash_map hash_map; }; dense_hash_map* hash_map_; }; diff --git a/tensorflow/core/framework/embedding/dram_leveldb_storage.h b/tensorflow/core/framework/embedding/dram_leveldb_storage.h index fdb6697d541..2f9fbade6c5 100644 --- a/tensorflow/core/framework/embedding/dram_leveldb_storage.h +++ b/tensorflow/core/framework/embedding/dram_leveldb_storage.h @@ -21,9 +21,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/single_tier_storage.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,11 +28,12 @@ namespace embedding { template class DramLevelDBStore : public MultiTierStorage { public: - DramLevelDBStore(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_ = new DramStorage(sc, alloc, lc, new LocklessHashMap()); - leveldb_ = new LevelDBStore(sc, alloc, lc); + DramLevelDBStore(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + leveldb_ = new LevelDBStore(sc, feat_desc); } ~DramLevelDBStore() override { @@ -46,7 +44,7 @@ class DramLevelDBStore : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(DramLevelDBStore); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -63,23 +61,22 @@ class DramLevelDBStore : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramLevelDBStore."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) 
override { + dram_->CreateAndInsert(key, value_ptr); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramLevelDBStore can not be called."; + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -93,7 +90,7 @@ class DramLevelDBStore : public MultiTierStorage { leveldb_->DestroyValuePtr(*value_ptr); return dram_->Get(key, value_ptr); } - dram_->Insert(key, value_ptr, size); + dram_->CreateAndInsert(key, value_ptr); return Status::OK(); } @@ -146,15 +143,15 @@ class DramLevelDBStore : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_leveldb_key_list; - std::vector*> value_ptr_list, tmp_leveldb_value_list; + std::vector value_ptr_list, tmp_leveldb_value_list; TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); TF_CHECK_OK(leveldb_->GetSnapshot( &tmp_leveldb_key_list, &tmp_leveldb_value_list)); for (int64 i = 0; i < tmp_leveldb_value_list.size(); i++) { - tmp_leveldb_value_list[i]->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - tmp_leveldb_value_list[i]->SetInitialized(emb_config.primary_emb_index); + tmp_leveldb_value_list[i] = + (void*)((int64)tmp_leveldb_value_list[i] | (1L << kDramFlagOffset)); } std::vector leveldb_key_list; @@ -173,26 +170,34 @@ class DramLevelDBStore : public MultiTierStorage { { mutex_lock l(*(leveldb_->get_mutex())); + std::vector*> feat_desc_list(2); + FeatureDescriptor hbm_feat_desc( + 1, 1, ev_allocator()/*useless*/, + StorageType::HBM_DRAM, + true, true, + {false, 0}); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = 
&hbm_feat_desc; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, value_iter))); } for (auto it: tmp_leveldb_value_list) { - delete it; + cpu_allocator()->DeallocateRaw((void*)((int64)it & 0xffffffffffff)); } - delete value_iter; return Status::OK(); } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); @@ -206,8 +211,8 @@ class DramLevelDBStore : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(leveldb_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); @@ -218,14 +223,20 @@ class DramLevelDBStore : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + protected: - void SetTotalDims(int64 total_dims) override { - leveldb_->SetTotalDims(total_dims); + int total_dim() override { + return dram_feat_desc_->total_dim(); } private: DramStorage* dram_; LevelDBStore* leveldb_; + FeatureDescriptor* dram_feat_desc_ = nullptr; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/dram_pmem_storage.h b/tensorflow/core/framework/embedding/dram_pmem_storage.h index fd19f75ab4c..e58d9450d96 100644 --- a/tensorflow/core/framework/embedding/dram_pmem_storage.h +++ b/tensorflow/core/framework/embedding/dram_pmem_storage.h @@ 
-15,14 +15,12 @@ limitations under the License. #ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ +#include "tensorflow/core/framework/embedding/cpu_hash_map_kv.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" -#include "tensorflow/core/framework/embedding/cpu_hash_map_kv.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,36 +29,36 @@ namespace embedding { template class DramPmemStorage : public MultiTierStorage { public: - DramPmemStorage(const StorageConfig& sc, Allocator* dram_alloc, - Allocator* pmem_alloc, LayoutCreator* lc, + DramPmemStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_ = new DramStorage(sc, dram_alloc, lc, new LocklessHashMap()); - pmem_ = new PmemLibpmemStorage(sc, pmem_alloc, lc); - value_ptr_size_ = - const_cast(sc.embedding_config).total_num( - Storage::GetAllocLen()); + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + pmem_feat_desc_ = new FeatureDescriptor(feat_desc); + pmem_feat_desc_->SetAllocator(experimental_pmem_allocator(sc.path, sc.size[0])); + + pmem_ = new PmemLibpmemStorage(sc, pmem_feat_desc_); } ~DramPmemStorage() override { MultiTierStorage::DeleteFromEvictionManager(); delete dram_; delete pmem_; + delete pmem_feat_desc_; } TF_DISALLOW_COPY_AND_ASSIGN(DramPmemStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; } s = pmem_->Get(key, value_ptr); + void* new_value_ptr = dram_->CreateValuePtr(); if (s.ok()) { - ValuePtr* new_value_ptr = dram_->CreateValuePtr(value_ptr_size_); - 
memcpy(new_value_ptr->GetPtr(), (*value_ptr)->GetPtr(), - sizeof(FixedLengthHeader) + sizeof(V) * value_ptr_size_); - *value_ptr = new_value_ptr; + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); s = dram_->TryInsert(key, *value_ptr); if (s.ok()) { return s; @@ -71,19 +69,19 @@ class DramPmemStorage : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramPmemStorage."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramPmemStorage can not be called."; + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } bool IsUseHbm() override { @@ -94,18 +92,16 @@ class DramPmemStorage : public MultiTierStorage { return false; } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; } s = pmem_->Get(key, value_ptr); - ValuePtr* new_value_ptr = dram_->CreateValuePtr(size); + void* new_value_ptr = dram_->CreateValuePtr(); if (s.ok()) { - memcpy(new_value_ptr->GetPtr(), (*value_ptr)->GetPtr(), - sizeof(FixedLengthHeader) + sizeof(V) * size); + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); } *value_ptr = new_value_ptr; @@ -159,7 +155,7 @@ class DramPmemStorage : public MultiTierStorage { int64 value_len, V* 
default_value) override { std::vector key_list, tmp_pmem_key_list; - std::vector*> value_ptr_list, tmp_pmem_value_list; + std::vector value_ptr_list, tmp_pmem_value_list; TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); dram_->Shrink(key_list, value_ptr_list, shrink_args, value_len); @@ -182,13 +178,14 @@ class DramPmemStorage : public MultiTierStorage { emb_config, value_len, default_value, key_list, - value_ptr_list))); + value_ptr_list, + pmem_feat_desc_))); return Status::OK(); } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); @@ -202,8 +199,8 @@ class DramPmemStorage : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(pmem_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); @@ -214,13 +211,26 @@ class DramPmemStorage : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + pmem_feat_desc_->InitSlotInfo(dram_feat_desc_); + MultiTierStorage::Init(); + } + protected: - void SetTotalDims(int64 total_dims) override {} + int total_dim() override { + return pmem_feat_desc_->total_dim(); + } private: DramStorage* dram_; PmemLibpmemStorage* pmem_; - int64 value_ptr_size_; + FeatureDescriptor* dram_feat_desc_ = nullptr; + FeatureDescriptor* pmem_feat_desc_ = nullptr; }; } // embedding } // 
tensorflow diff --git a/tensorflow/core/framework/embedding/dram_ssd_storage.h b/tensorflow/core/framework/embedding/dram_ssd_storage.h index 356a61d865f..ddd2d782e03 100644 --- a/tensorflow/core/framework/embedding/dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/dram_ssd_storage.h @@ -21,9 +21,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/single_tier_storage.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,11 +28,12 @@ namespace embedding { template class DramSsdHashStorage : public MultiTierStorage { public: - DramSsdHashStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_= new DramStorage(sc, alloc, lc, new LocklessHashMap()); - ssd_hash_ = new SsdHashStorage(sc, alloc, lc); + DramSsdHashStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_= new DramStorage(sc, feat_desc); + ssd_hash_ = new SsdHashStorage(sc, feat_desc); } ~DramSsdHashStorage() override { @@ -46,7 +44,7 @@ class DramSsdHashStorage : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(DramSsdHashStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -64,24 +62,22 @@ class DramSsdHashStorage : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramSsdHashStorage."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); } 
- Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramSsdStorage can not be called."; + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -96,7 +92,7 @@ class DramSsdHashStorage : public MultiTierStorage { ssd_hash_->DestroyValuePtr(*value_ptr); return dram_->Get(key, value_ptr); } - dram_->Insert(key, value_ptr, size); + dram_->CreateAndInsert(key, value_ptr); return Status::OK(); } @@ -164,7 +160,6 @@ class DramSsdHashStorage : public MultiTierStorage { Status RestoreSSD(int64 emb_index, int64 emb_slot_num, int64 value_len, const std::string& ssd_emb_file_name, EmbeddingVar* ev, RestoreSSDBuffer& restore_buff) override { - int64 alloc_len = Storage::ComputeAllocLen(value_len); std::map file_id_map; for (int64 i = 0; i < restore_buff.num_of_files; i++) { file_id_map[restore_buff.file_list_buf[i]] = i; @@ -185,7 +180,7 @@ class DramSsdHashStorage : public MultiTierStorage { } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); @@ -199,8 +194,8 @@ class DramSsdHashStorage : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(ssd_hash_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + 
MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); @@ -211,14 +206,25 @@ class DramSsdHashStorage : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + ssd_hash_->Init(); + MultiTierStorage::Init(); + } + protected: - void SetTotalDims(int64 total_dims) override { - ssd_hash_->SetTotalDims(total_dims); + int total_dim() override { + return dram_feat_desc_->total_dim(); } private: DramStorage* dram_ = nullptr; SsdHashStorage* ssd_hash_ = nullptr; + FeatureDescriptor* dram_feat_desc_; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h b/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h new file mode 100644 index 00000000000..c1fa878788b --- /dev/null +++ b/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h @@ -0,0 +1,214 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#include +#include +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +constexpr int COLUMN_BITSET_BYTES = 5; +constexpr int COLUMN_BITSET_SIZE = COLUMN_BITSET_BYTES * 8; + +struct MetaHeader { + volatile unsigned char embed_num; + unsigned char value_type; + unsigned char header_size; + unsigned char column_bitset[COLUMN_BITSET_BYTES]; + + static const int kEmbeddingNumStartIndex = 0; + static const int kValueTypeStartIndex = + kEmbeddingNumStartIndex + sizeof(char); + static const int kHeaderSizeStartIndex = + kValueTypeStartIndex + sizeof(char); + static const int kColumnBitsetIndex = + kHeaderSizeStartIndex + sizeof(char); + + inline unsigned int GetEmbeddingNum() { + return (unsigned int) embed_num; + } + + inline void SetEmbeddingNum(size_t s) { + embed_num = (unsigned char)s; + } + + inline std::bitset GetColumnBitset() { + unsigned long meta = ((unsigned long*)this)[0]; + std::bitset bs(meta >> (8 * kColumnBitsetIndex)); + return bs; + } + + inline void SetColumnBitset(const std::bitset& bs, + unsigned int embnum) { + ((unsigned long*)(this))[0] = + (bs.to_ulong() << (8 * kColumnBitsetIndex)) | + (header_size << (8 * kHeaderSizeStartIndex)) | + (value_type << (8 * kValueTypeStartIndex)) | + (embnum << (8 * kEmbeddingNumStartIndex)); + } + + inline unsigned int GetHeaderSize() { + return (unsigned int) header_size; + } + + inline void SetHeaderSize(size_t size) { + header_size = (unsigned char)size; + } +}; + +template +class DynmaicDimDescriptorImpl: public FeatureDescriptorImpl { +using FeatureDescriptorImpl::slot_infos_; + public: + DynmaicDimDescriptorImpl( + Allocator* alloc, + int64 slot_num) + : alloc_bytes_(sizeof(std::atomic_flag) + + sizeof(MetaHeader) + 
+ sizeof(V*) * slot_num), + header_offset_bytes_(sizeof(V*) * slot_num), + flag_offset_bytes_(sizeof(MetaHeader) + + sizeof(V*) * slot_num), + FeatureDescriptorImpl(slot_num, + false, + false) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&alloc_bytes_); + } + ~DynmaicDimDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + } + + V* GetEmbedding(void* val, int emb_index) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->embed_num; + auto metadata = meta->GetColumnBitset(); + + if (!metadata.test(emb_index)) { + std::atomic_flag* flag= (std::atomic_flag*)(val + flag_offset_bytes_); + while(flag->test_and_set(std::memory_order_acquire)); + metadata = meta->GetColumnBitset(); + if (metadata.test(emb_index)) { + flag->clear(std::memory_order_release); + return ((V**)val)[emb_index]; + } + embnum++ ; + int64 alloc_value_len = slot_infos_[emb_index].embedding_dim; + V* tensor_val = (V*)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * alloc_value_len); + V* default_v = (V*)slot_infos_[emb_index].default_value; + memcpy(tensor_val, default_v, + sizeof(V) * slot_infos_[emb_index].default_value_len); + ((V**)val)[emb_index] = tensor_val; + + metadata.set(emb_index); + // NOTE:if we use ((unsigned long*)((char*)ptr_ + 1))[0] = metadata.to_ulong(); + // the ptr_ will be occaionally modified from 0x7f18700912a0 to 0x700912a0 + // must use ((V**)ptr_ + 1 + 1)[emb_index] = tensor_val; to avoid + //LOG(INFO)<<"emb_num: "<SetColumnBitset(metadata, embnum); + flag->clear(std::memory_order_release); + return tensor_val; + } else { + return ((V**)val)[emb_index]; + } + } + + bool IsAdmit(void* val) override { + return true; + } + + void* Admit(void* val) override {} + + void* Allocate() override { + void* val = 
alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + memset(val, 0, alloc_bytes_); + new ((char*)val + header_offset_bytes_) MetaHeader(); + return val; + } + + void Deallocate(void* val) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->GetEmbeddingNum(); + //LOG(INFO)<<"emb_num in deallocate: "<GetColumnBitset(); + for (int i = 0; i< embnum; i++) { + if (metadata.test(i)) { + V* val_ptr = ((V**)((int64*)val + meta->GetHeaderSize()))[i]; + if (val_ptr != nullptr) { + alloc_->DeallocateRaw(val_ptr); + } + } + } + } + + void Deallocate(const std::vector& vals) override { + for (auto val: vals) { + Deallocate(val); + } + } + + void AddFreq(void* val, int64 count) override {} + + void SetAllocator(Allocator* alloc) override { + alloc_ = alloc; + } + + void SetDefaultValue(void* val, int64 key) override {} + + void SetValue(void* val, int64 emb_index, V* value) override { + V* val_ptr = GetEmbedding(val, emb_index); + memcpy(val_ptr, value, + sizeof(V) * FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + + int64 GetFreq(void* val) override {} + + int64 GetVersion(void* val) override {} + + void UpdateVersion(void* val, int64 version) override {} + + void SetFreq(void* val, int64 freq) override {} + + int data_bytes() override { + return alloc_bytes_; + } + private: + int alloc_bytes_ = 0; + int header_offset_bytes_ = 0; + int flag_offset_bytes_ = 0; + Allocator* alloc_ = ev_allocator(); +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/embedding_config.h b/tensorflow/core/framework/embedding/embedding_config.h 
index d47d07d4205..a39d2dca303 100644 --- a/tensorflow/core/framework/embedding/embedding_config.h +++ b/tensorflow/core/framework/embedding/embedding_config.h @@ -23,7 +23,6 @@ struct EmbeddingConfig { DataType counter_type; int64 default_value_dim; float default_value_no_permission; - int normal_fix_flag; bool record_freq; bool record_version; bool is_inference; @@ -37,7 +36,6 @@ struct EmbeddingConfig { int64 filter_freq = 0, int64 max_freq = 999999, float l2_weight_threshold = -1.0, - const std::string& layout = "normal", int64 max_element_size = 0, float false_positive_probability = -1.0, DataType counter_type = DT_UINT64, @@ -58,7 +56,6 @@ struct EmbeddingConfig { counter_type(counter_type), default_value_dim(default_value_dim), default_value_no_permission(default_value_no_permission), - normal_fix_flag(0), record_freq(record_freq), record_version(record_version), is_inference(is_inference) { @@ -70,10 +67,6 @@ struct EmbeddingConfig { kHashFunc = 0; num_counter = 0; } - if (layout == "normal_contiguous" || - layout == "normal_contiguous_gpu") { - normal_fix_flag = 1; - } } int64 calc_num_counter(int64 max_element_size, @@ -105,21 +98,13 @@ struct EmbeddingConfig { } bool is_save_freq() const { - return filter_freq != 0 || - record_freq || - normal_fix_flag == 1; + return filter_freq != 0 || record_freq; } bool is_save_version() const { return steps_to_live != 0 || record_version; } - int64 total_num(int alloc_len) { - return block_num * - (1 + (1 - normal_fix_flag) * slot_num) * - (1 + normal_fix_flag * (alloc_len * (slot_num + 1) - 1)); - } - int64 get_filter_freq() { return filter_freq; } diff --git a/tensorflow/core/framework/embedding/embedding_memory_pool.h b/tensorflow/core/framework/embedding/embedding_memory_pool.h index 27b31ce1ed7..ef175151b00 100644 --- a/tensorflow/core/framework/embedding/embedding_memory_pool.h +++ b/tensorflow/core/framework/embedding/embedding_memory_pool.h @@ -18,9 +18,6 @@ limitations under the License. 
#include namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class EmbeddingMemoryPool { @@ -50,7 +47,7 @@ class EmbeddingMemoryPool { return ptr; } - void Deallocate(std::vector*> value_ptrs) { + void Deallocate(std::vector value_ptrs) { int64 prev_size = value_ptrs_queue_.size(); for (auto it : value_ptrs) { value_ptrs_queue_.emplace_back(it); @@ -59,9 +56,8 @@ class EmbeddingMemoryPool { int64 n = value_ptrs_queue_.size() - embs_per_block_; n = std::min(prev_size, n); for (int64 i = 0; i < n; i++) { - ValuePtr* val = value_ptrs_queue_.front(); - free_ptr_queue_.emplace_back(val->GetValue(0, 0)); - delete val; + void* val = value_ptrs_queue_.front(); + free_ptr_queue_.emplace_back((V*)val); value_ptrs_queue_.pop_front(); } } @@ -88,7 +84,7 @@ class EmbeddingMemoryPool { int64 embs_per_block_; Allocator* alloc_; std::deque free_ptr_queue_; - std::deque*> value_ptrs_queue_; + std::deque value_ptrs_queue_; std::vector block_list_; }; } //embedding diff --git a/tensorflow/core/framework/embedding/embedding_var.cu.cc b/tensorflow/core/framework/embedding/embedding_var.cu.cc index 0c0be83ec1d..f7162fd2c22 100644 --- a/tensorflow/core/framework/embedding/embedding_var.cu.cc +++ b/tensorflow/core/framework/embedding/embedding_var.cu.cc @@ -42,71 +42,6 @@ void SyncWithEventMgr(se::Stream* stream, while(!is_kernel_finish) {} } -template -void EmbeddingVar::SetDefaultValueOfNewFeatures( - const K* keys, int64 size, const std::list& init_cursor, - V** memcpy_address, se::Stream* compute_stream, EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device) { - if (init_cursor.size() > 0) { - int64 total = init_cursor.size(); - V** value_address = nullptr; - value_address = TypedAllocator::Allocate(cpu_allocator(), total * 2, - AllocationAttributes()); - V** default_value_address = value_address + total; - V** dev_value_address = nullptr; - dev_value_address = - TypedAllocator::Allocate(alloc_, total * 2, AllocationAttributes()); - V** 
dev_default_value_address = dev_value_address + total; - int64 i = 0; - auto it = init_cursor.cbegin(); - for (; it != init_cursor.cend(); ++it, ++i) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_address[i] = - *((V**)((char*)(value_ptr->GetPtr()) + sizeof(FixedLengthHeader))) + - storage_->GetOffset(emb_config_.emb_index); - default_value_address[i] = - default_value_ + - (keys[i] % emb_config_.default_value_dim) % value_len_; - } - DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * 2 * sizeof(V*)); - compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, - total * 2 * sizeof(V*)); - int block_dim = 128; - TF_CHECK_OK(GpuLaunchKernel( - embedding::CopyEmbedding, - (total * value_len_ + block_dim - 1) / block_dim, - block_dim, 0, gpu_device.stream(), dev_default_value_address, - dev_value_address, value_len_, total)); - SyncWithEventMgr(compute_stream, event_mgr); - // Set init meta of ValuePtrs - for (auto it = init_cursor.cbegin(); it != init_cursor.cend(); ++it) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_ptr->SetInitialized(emb_config_.emb_index); - memcpy_address[*it] = value_ptr->GetValue( - emb_config_.emb_index, - storage_->GetOffset(emb_config_.emb_index)); - } - TypedAllocator::Deallocate(alloc_, dev_value_address, total * 2); - TypedAllocator::Deallocate(cpu_allocator(), value_address, total * 2); - } -} - -#define REGISTER_KERNELS(ktype, vtype) \ - template void EmbeddingVar::SetDefaultValueOfNewFeatures( \ - const ktype*, int64, const std::list&, vtype**, \ - se::Stream*, EventMgr*, const Eigen::GpuDevice& gpu_device); -#define REGISTER_KERNELS_ALL(type) \ - REGISTER_KERNELS(int32, type); \ - REGISTER_KERNELS(int64, type) -#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) -TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) -#undef REGISTER_KERNELS_CPU - -#undef REGISTER_KERNELS_ALL -#undef REGISTER_KERNELS - template void EmbeddingVar::CopyEmbeddingsToBuffer( V* val_base, int64 
size, V** memcpy_address, @@ -136,85 +71,6 @@ void EmbeddingVar::CopyEmbeddingsToBuffer( TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) #undef REGISTER_KERNELS_CPU -#undef REGISTER_KERNELS_ALL -#undef REGISTER_KERNELS - -template -void EmbeddingVar::CopyEmbeddingsFromCPUToGPU( - const K* keys, const std::list& copyback_cursor, V** memcpy_address, - se::Stream* compute_stream, EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device, - const DeviceBase::CpuWorkerThreads* worker_threads, - int64* output_value_ptrs) { - if (copyback_cursor.size() > 0) { - int64 total = copyback_cursor.size(); - size_t value_len = emb_config_.total_num(storage_->GetAllocLen()); - V* memcpy_buffer_gpu = nullptr; - ValuePtr** gpu_value_ptrs = new ValuePtr*[total]; - memcpy_buffer_gpu = (V*)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, - total * value_len * sizeof(V)); - storage_->CopyEmbeddingsFromCPUToGPU( - total, keys, copyback_cursor, memcpy_address, value_len, gpu_value_ptrs, - memcpy_buffer_gpu, compute_stream, event_mgr, worker_threads); - - V** value_address = (V**)cpu_allocator()->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V*) * total); - V** dev_value_address = (V**)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, - sizeof(V*) * total); - std::vector copyback_keys(total); - int64 i = 0; - auto it = copyback_cursor.cbegin(); - for (; it != copyback_cursor.cend(); ++it, ++i) { - bool init; - // Get the curosr - int64 cursor = *it & 0x0fffffffffffffff; - gpu_value_ptrs[i]->SetInitialized(emb_config_.emb_index); - memcpy_address[cursor] = LookupOrCreateEmb(gpu_value_ptrs[i], init); - value_address[i] = memcpy_address[cursor]; - copyback_keys[i] = keys[cursor]; - } - DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * sizeof(V*)); - compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, total * sizeof(V*)); - - int block_dim = 128; - TF_CHECK_OK(GpuLaunchKernel( - embedding::BatchUnpack, (total + block_dim - 1) / block_dim * value_len, - block_dim, 0, 
gpu_device.stream(), dev_value_address, memcpy_buffer_gpu, - value_len, total)); - - auto do_insert = [this, copyback_keys, gpu_value_ptrs, value_len]( - int64 start, int64 limit) { - for (int64 i = start; i < limit; i++) - storage_->Insert(copyback_keys[i], gpu_value_ptrs[i]); - }; - Shard(worker_threads->num_threads, worker_threads->workers, - copyback_keys.size(), 100000, do_insert); - if (output_value_ptrs != nullptr) { - auto it = copyback_cursor.cbegin(); - for (int64 i = 0; it != copyback_cursor.cend(); ++it, ++i) { - int64 cursor = *it & 0x0fffffffffffffff; - output_value_ptrs[cursor] = (int64)gpu_value_ptrs[i]; - } - } - SyncWithEventMgr(compute_stream, event_mgr); - - alloc_->DeallocateRaw(dev_value_address); - alloc_->DeallocateRaw(memcpy_buffer_gpu); - cpu_allocator()->DeallocateRaw(value_address); - delete[] gpu_value_ptrs; - } -} -#define REGISTER_KERNELS(ktype, vtype) \ - template void EmbeddingVar::CopyEmbeddingsFromCPUToGPU( \ - const ktype*, const std::list&, vtype**, se::Stream*, EventMgr*, \ - const Eigen::GpuDevice&, const DeviceBase::CpuWorkerThreads*, int64*); -#define REGISTER_KERNELS_ALL(type) \ - REGISTER_KERNELS(int32, type); \ - REGISTER_KERNELS(int64, type) -#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) -TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) -#undef REGISTER_KERNELS_CPU - #undef REGISTER_KERNELS_ALL #undef REGISTER_KERNELS } // namespace tensorflow diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 28ce5094d87..487f595bf31 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -30,7 +30,6 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/embedding_var_context.h" #include "tensorflow/core/framework/embedding/embedding_var_restore.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/framework/embedding/filter_factory.h" #include "tensorflow/core/framework/embedding/gpu_hash_map_kv.h" #include "tensorflow/core/framework/embedding/embedding_config.h" @@ -57,7 +56,8 @@ class EmbeddingVar : public ResourceBase { EmbeddingVar(const string& name, embedding::Storage* storage, EmbeddingConfig emb_cfg, - Allocator* alloc): + Allocator* alloc, + embedding::FeatureDescriptor* feat_desc): name_(name), storage_(storage), default_value_(nullptr), @@ -65,27 +65,8 @@ class EmbeddingVar : public ResourceBase { value_len_(0), alloc_(alloc), default_value_alloc_(alloc), - emb_config_(emb_cfg) { - if (IsMultiLevel() || emb_config_.record_freq) { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) { - value_ptr->AddFreq(freq); - }; - } else if (emb_config_.is_counter_filter()) { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) { - if (value_ptr->GetFreq() < filter_freq) - value_ptr->AddFreq(freq); - }; - } else { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) {}; - } - if (emb_config_.steps_to_live != 0 || emb_config_.record_version) { - update_version_fn_ = [](ValuePtr* value_ptr, int64 gs) { - value_ptr->SetStep(gs); - }; - } else { - update_version_fn_ = [](ValuePtr* value_ptr, int64 gs) {}; - } - } + emb_config_(emb_cfg), + feat_desc_(feat_desc) {} Status Init(const Tensor& default_tensor, int64 default_value_dim) { if (storage_ == nullptr) { @@ -95,17 +76,11 @@ class EmbeddingVar : public ResourceBase { storage_type_ = storage_->GetStorageType(); filter_ = FilterFactory::CreateFilter>( - emb_config_, this, storage_); + emb_config_, this, storage_, feat_desc_); emb_config_.default_value_dim = default_value_dim; 
value_len_ = default_tensor.NumElements() / emb_config_.default_value_dim; - if (LayoutType::NORMAL_CONTIGUOUS == storage_->GetLayoutType() || - LayoutType::NORMAL_CONTIGUOUS_GPU == storage_->GetLayoutType() || - LayoutType::COMPACT == storage_->GetLayoutType()) { - storage_->SetAllocLen(value_len_, emb_config_.slot_num + 1); - } - if (storage_->IsUseHbm()) { #if GOOGLE_CUDA default_value_ = TypedAllocator::Allocate(alloc_, @@ -115,12 +90,6 @@ class EmbeddingVar : public ResourceBase { dev_addr_buffer_size_ = 0; cudaMemcpy(default_value_, &default_tensor_flat(0), default_tensor.TotalBytes(), cudaMemcpyDeviceToDevice); - storage_-> - CreateEmbeddingMemoryPool( - alloc_, - emb_config_.total_num( - storage_->GetAllocLen()), - 1024 * 1024 * 64); #endif // GOOGLE_CUDA } else if (storage_->IsSingleHbm()) { #if GOOGLE_CUDA @@ -147,6 +116,14 @@ class EmbeddingVar : public ResourceBase { emb_config_.default_value_no_permission); } } + bool is_all_slots_initialized = + feat_desc_->InitSlotInfo( + emb_config_.emb_index, value_len_, + std::pair( + default_value_, emb_config_.default_value_dim)); + if (is_all_slots_initialized) { + storage_->Init(); + } return Status::OK(); } @@ -159,57 +136,92 @@ class EmbeddingVar : public ResourceBase { return is_initialized_; } - Status LookupKey(K key, ValuePtr** value_ptr) { + Status LookupKey(K key, void** value_ptr) { return storage_->Get(key, value_ptr); } void BatchLookupKey(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys) { - storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys, - emb_config_.total_num(storage_->GetAllocLen())); + storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys); } - Status LookupOrCreateKey(K key, ValuePtr** value_ptr, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, bool indices_as_pointer, int64 count = 1) { if (indices_as_pointer) { - *value_ptr = (ValuePtr*)key; - *is_filter = (*value_ptr != nullptr); + 
*value_ptr = (void*)key; + *is_filter = filter_->is_admit(key, *value_ptr); return Status::OK(); } else { Status s = filter_->LookupOrCreateKey(key, value_ptr, is_filter, count); - add_freq_fn_(*value_ptr, count, emb_config_.filter_freq); return s; } } Status Insert(K key, V* value) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; CreateKey(key, &value_ptr, true); - LookupOrCreateEmb(value_ptr, value); + feat_desc_->SetValue(value_ptr, emb_config_.emb_index, value); return Status::OK(); } - Status LookupOrCreateKey(K key, ValuePtr** value_ptr) { - Status s = storage_->GetOrCreate(key, value_ptr, - emb_config_.total_num(storage_->GetAllocLen())); + Status LookupOrCreateKey(const EmbeddingVarContext& context, + const K* keys, + void** value_ptrs, + int64 num_of_keys, + int64* indices_counts, + bool indices_as_pointer = false) { + if (indices_as_pointer) { + auto lookup_key_and_set_version_fn = [keys, value_ptrs] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + value_ptrs[i] = (void*)keys[i]; + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + lookup_key_and_set_version_fn); + } else { + filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); + } + + if (indices_counts != nullptr) { + auto add_freq_fn = [this, value_ptrs, indices_counts] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + feat_desc_->AddFreq(value_ptrs[i], indices_counts[i]); + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. 
+ auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + add_freq_fn); + } + return Status::OK(); + } + + + Status LookupOrCreateKey(K key, void** value_ptr) { + Status s = storage_->GetOrCreate(key, value_ptr); TF_CHECK_OK(s); return s; } - void CreateKey(K key, ValuePtr** value_ptr, bool to_dram) { - storage_->Insert(key, value_ptr, - emb_config_.total_num(storage_->GetAllocLen()), to_dram); + void CreateKey(K key, void** value_ptr, bool to_dram) { + storage_->CreateAndInsert(key, value_ptr, to_dram); } - void UpdateVersion(ValuePtr* value_ptr, int64 gs) { - update_version_fn_(value_ptr, gs); + void UpdateVersion(void* value_ptr, int64 gs) { + feat_desc_->UpdateVersion(value_ptr, gs); } void BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { TF_CHECK_OK(storage_->BatchCommit(keys, value_ptrs)); } @@ -218,9 +230,9 @@ class EmbeddingVar : public ResourceBase { } int64 GetVersion(K key) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; TF_CHECK_OK(LookupOrCreateKey(key, &value_ptr)); - return value_ptr->GetStep(); + return feat_desc_->GetVersion(value_ptr); } int64 GetFreq(K key) { @@ -261,11 +273,11 @@ class EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { V* default_v = default_value + i * value_len_; - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; filter_->LookupOrCreate( keys[i], output + i * value_len_, default_v, &value_ptr, 1, default_value_no_permission_); - add_freq_fn_(value_ptr, 1, emb_config_.filter_freq); + feat_desc_->AddFreq(value_ptr, 1); } }; auto worker_threads = context.worker_threads; @@ -276,7 +288,7 @@ class EmbeddingVar : public ResourceBase { void GetOrCreateKey(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, int64 num_of_keys) { const K* keys = 
(K*)keys_tensor.data(); auto do_work = [this, keys, value_ptrs] (int64 start, int64 limit) { @@ -295,7 +307,7 @@ class EmbeddingVar : public ResourceBase { void GatherEmbeddings(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, V* output, int64 num_of_keys) { const K* keys = (K*)keys_tensor.data(); @@ -303,13 +315,10 @@ class EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); - add_freq_fn_(value_ptrs[i], 1, emb_config_.filter_freq); V* value = nullptr; if (is_admit) { - V* default_v = - default_value_ + - (keys[i] % emb_config_.default_value_dim) * value_len_; - value = LookupOrCreateEmb(value_ptrs[i], default_v); + value = feat_desc_->GetEmbedding( + value_ptrs[i], emb_config_.emb_index); } else { value = default_value_no_permission_; } @@ -341,8 +350,9 @@ class EmbeddingVar : public ResourceBase { void GetOrCreateKey(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, - int64 num_of_keys) { + void** value_ptrs, + int64 num_of_keys, + bool indices_as_pointer = false) { const K* keys = (K*)keys_tensor.data(); filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); storage_->AddToCachePrefetchList(keys_tensor); @@ -351,17 +361,17 @@ class EmbeddingVar : public ResourceBase { void BatchLookupOrCreateKey( const EmbeddingVarContext& context, const K* keys, - ValuePtr** value_ptrs, + void** value_ptrs, int64 num_of_keys, std::vector>& not_found_cursor_list) { storage_->BatchGetOrCreate(context, keys, value_ptrs, num_of_keys, - emb_config_.total_num(storage_->GetAllocLen()), + value_len_, not_found_cursor_list); } void GatherEmbeddings(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, V* output, int64 num_of_keys) { std::vector embedding_ptr(num_of_keys); @@ -370,12 +380,10 @@ class 
EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); - add_freq_fn_(value_ptrs[i], 1, emb_config_.filter_freq); + feat_desc_->AddFreq(value_ptrs[i], 1); if (is_admit) { - V* default_v = - default_value_ + - (keys[i] % emb_config_.default_value_dim) * value_len_; - embedding_ptr[i] = LookupOrCreateEmb(value_ptrs[i], default_v); + embedding_ptr[i] = feat_desc_->GetEmbedding( + value_ptrs[i], emb_config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission_; } @@ -394,72 +402,8 @@ class EmbeddingVar : public ResourceBase { storage_->AddToCache(keys_tensor); } - - void BatchLookupOrCreateEmb( - const EmbeddingVarContext& ctx, - V** var_ptr, - ValuePtr** value_ptrs, - const K* indices, - int64 num_of_keys, - IntraThreadCopyIdAllocator* thread_copy_id_alloc) { - int num_worker_threads = ctx.worker_threads->num_threads; - std::vector> init_cursor_list( - num_worker_threads + 1); - uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); - - auto do_work_get_ptrs = [this, value_ptrs, &init_cursor_list, - &thread_copy_id_alloc, main_thread_id, var_ptr] (int64 start, int64 limit) { - int copy_id = - thread_copy_id_alloc->GetCopyIdOfThread(main_thread_id); - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - var_ptr[i] = LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - if (is_need_set_default_value) { - init_cursor_list[copy_id].emplace_back(i); - } - } - }; - const int64 unit_cost = 1000; - auto worker_threads = ctx.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, - num_of_keys, unit_cost, do_work_get_ptrs); - - // Merge copies of init_cursor_list - for (int i = 1; i < (worker_threads->num_threads + 1); i++) { - if (init_cursor_list[i].size() > 0) { - init_cursor_list[0].splice(init_cursor_list[0].end(), - init_cursor_list[i]); - } - } - - auto stream = 
ctx.compute_stream; - auto event_mgr = ctx.event_mgr; - - SetDefaultValueOfNewFeatures( - indices, num_of_keys, - init_cursor_list[0], - var_ptr, stream, event_mgr, - ctx.gpu_device); - } #endif - void LookupOrCreate(K key, V* val, V* default_v, int count = 1) { - const V* default_value_ptr = - (default_v == nullptr) ? default_value_ : default_v; - ValuePtr* value_ptr = nullptr; - filter_->LookupOrCreate(key, val, default_value_ptr, &value_ptr, count, - default_value_no_permission_); - add_freq_fn_(value_ptr, count, emb_config_.filter_freq); - } - - void BatchInitEmb(int64 size, V** memcpy_address, V* default_value, - bool* init_flags, int64 value_len) { - filter_->BatchInitEmb(size, memcpy_address, default_value, - init_flags, value_len); - } - #if GOOGLE_CUDA void CopyEmbeddingsToBuffer( V* val_base, int64 size, @@ -467,73 +411,18 @@ class EmbeddingVar : public ResourceBase { se::Stream* compute_stream, EventMgr* event_mgr, const Eigen::GpuDevice& gpu_device); - - void SetDefaultValueOfNewFeatures( - const K* keys, int64 size, - const std::list& init_cursor, - V** memcpy_address, - se::Stream* compute_stream, - EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device); - - void CopyEmbeddingsFromCPUToGPU( - const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, - se::Stream* compute_stream, - EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device, - const DeviceBase::CpuWorkerThreads* worker_threads, - int64* output_value_ptrs = nullptr); - - void AllocateMemoryForNewFeatures( - V** memcpy_address, - const std::list& init_cursor) { - std::vector*> value_ptr_list; - for (auto it = init_cursor.cbegin(); - it != init_cursor.cend(); ++it) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_ptr_list.emplace_back(value_ptr); - } - storage_->AllocateMemoryForNewFeatures(value_ptr_list); - } #endif // GOOGLE_CUDA - V* LookupOrCreateEmb(ValuePtr* value_ptr, const V* default_v) { - return value_ptr->GetOrAllocate(alloc_, 
value_len_, default_v, - emb_config_.emb_index, storage_->GetOffset( - emb_config_.emb_index)); - } - - V* LookupOrCreateEmb(ValuePtr* value_ptr, const V* default_v, - Allocator* alloc) { - return value_ptr->GetOrAllocate(alloc, value_len_, default_v, - emb_config_.emb_index, storage_->GetOffset( - emb_config_.emb_index)); - } - - V* LookupOrCreateEmb(ValuePtr* value_ptr, bool &need_initialize) { - return value_ptr->GetOrAllocate(alloc_, value_len_, nullptr, - emb_config_.emb_index, - storage_->GetOffset(emb_config_.emb_index), - need_initialize); - } - - V* LookupPrimaryEmb(ValuePtr* value_ptr) { - V* primary_val = value_ptr->GetValue(emb_config_.primary_emb_index, - storage_->GetOffset(emb_config_.primary_emb_index)); - return primary_val; - } - - typename TTypes::Flat flat(ValuePtr* value_ptr, int64 index) { - V* default_v = - default_value_ + (index % emb_config_.default_value_dim) * value_len_; - V* val = LookupOrCreateEmb(value_ptr, default_v); + typename TTypes::Flat flat(void* value_ptr) { + V* val = feat_desc_->GetEmbedding(value_ptr, emb_config_.emb_index); Eigen::array dims({value_len_}); return typename TTypes::Flat(val, dims); } + V* GetValuePtr(void* ptr) { + return feat_desc_->GetEmbedding(ptr, emb_config_.emb_index); + } + int64 ValueLen() const { return value_len_; } @@ -602,25 +491,26 @@ class EmbeddingVar : public ResourceBase { std::vector* value_list, std::vector* version_list, std::vector* freq_list) { - std::vector*> value_ptr_list; + std::vector value_ptr_list; storage_->GetSnapshot(key_list, &value_ptr_list); bool is_save_freq = emb_config_.is_save_freq(); bool is_save_version = emb_config_.is_save_version(); for (int64 i = 0; i < key_list->size(); i++) { - V* val = value_ptr_list[i]->GetValue(emb_config_.emb_index, 0); - if (val != nullptr) { + if (feat_desc_->IsAdmit(value_ptr_list[i])) { + V* val = feat_desc_->GetEmbedding( + value_ptr_list[i], emb_config_.emb_index); value_list->emplace_back(val); } else { 
value_list->emplace_back(default_value_); } if(is_save_version) { - int64 dump_version = value_ptr_list[i]->GetStep(); + int64 dump_version = feat_desc_->GetVersion(value_ptr_list[i]); version_list->emplace_back(dump_version); } if(is_save_freq) { - int64 dump_freq = value_ptr_list[i]->GetFreq(); + int64 dump_freq = feat_desc_->GetFreq(value_ptr_list[i]); freq_list->emplace_back(dump_freq); } } @@ -634,6 +524,10 @@ class EmbeddingVar : public ResourceBase { return storage_; } + embedding::FeatureDescriptor* feature_descriptor() { + return feat_desc_; + } + Status Shrink(embedding::ShrinkArgs& shrink_args) { if (emb_config_.is_primary()) { shrink_args.value_len = value_len_; @@ -671,10 +565,6 @@ class EmbeddingVar : public ResourceBase { return alloc_; } - int64 GetAllocLen() { - return emb_config_.total_num(storage_->GetAllocLen()); - } - V** GetBuffer(int64 size) { if (dev_addr_buffer_size_ >= size) { return dev_addr_buffer_; @@ -756,16 +646,17 @@ class EmbeddingVar : public ResourceBase { return storage_->HashTable(); } - protected: FilterPolicy>* GetFilter() const { return filter_; } + protected: ~EmbeddingVar() override { // When dynamic dimension embedding is used, // there will be more than one primary slot if (emb_config_.is_primary() && emb_config_.primary_emb_index == 0) { delete storage_; + delete feat_desc_; } if (embedding::StorageType::HBM_DRAM == storage_type_) { alloc_->DeallocateRaw(dev_addr_buffer_); @@ -804,35 +695,6 @@ class EmbeddingVar : public ResourceBase { value_len_ * sizeof(V), do_work); } - V* GetAddressOfGpuValuePtr(ValuePtr* value_ptr, - int64 index, - bool copyback_flag, - std::list& init_cursor, - std::list& copyback_cursor) { - V* mem_addr = nullptr; - bool init_flag = false; - if (!copyback_flag) { - mem_addr = LookupOrCreateEmb(value_ptr, init_flag); - } else { - mem_addr = value_ptr->GetValue(0,0); - if (copyback_flag == - embedding::CopyBackFlag::COPYBACK_AND_DESTROY) { - delete value_ptr; - // If the 64th bit of cursor is set to 
1, - // the corresponding valueptr need to be deleted later. - int64 tmp = 1; - tmp = tmp << 63; - copyback_cursor.emplace_back(index | tmp); - } else { - copyback_cursor.emplace_back(index); - } - } - if (init_flag) { - init_cursor.emplace_back(index); - } - return mem_addr; - } - std::string name_; bool is_initialized_ = false; @@ -849,8 +711,7 @@ class EmbeddingVar : public ResourceBase { embedding::StorageType storage_type_; EmbeddingConfig emb_config_; FilterPolicy>* filter_; - std::function*, int64, int64)> add_freq_fn_; - std::function*, int64)> update_version_fn_; + embedding::FeatureDescriptor* feat_desc_; TF_DISALLOW_COPY_AND_ASSIGN(EmbeddingVar); }; diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc index c1b43a608b5..7dddf714b6b 100644 --- a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc @@ -21,42 +21,38 @@ namespace tensorflow { namespace embedding { template void EmbeddingVarCkptData::Emplace( - K key, ValuePtr* value_ptr, + K key, void* value_ptr, const EmbeddingConfig& emb_config, - V* default_value, int64 value_offset, + V* default_value, + FeatureDescriptor* feat_desc, bool is_save_freq, bool is_save_version, bool save_unfiltered_features) { if((int64)value_ptr == ValuePtrStatus::IS_DELETED) return; - V* primary_val = value_ptr->GetValue(0, 0); - bool is_not_admit = - primary_val == nullptr - && emb_config.filter_freq != 0; + bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); + bool is_admit = feat_desc->IsAdmit(value_ptr); - if (!is_not_admit) { + if (is_admit) { key_vec_.emplace_back(key); - if (primary_val == nullptr) { + if (!is_in_dram) { + value_ptr_vec_.emplace_back((V*)ValuePtrStatus::NOT_IN_DRAM); + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + } else if (feat_desc->GetEmbedding(value_ptr, 0) == nullptr) { 
value_ptr_vec_.emplace_back(default_value); - } else if ( - (int64)primary_val == ValuePosition::NOT_IN_DRAM) { - value_ptr_vec_.emplace_back((V*)ValuePosition::NOT_IN_DRAM); } else { - V* val = value_ptr->GetValue(emb_config.emb_index, - value_offset); + V* val = feat_desc->GetEmbedding(value_ptr, emb_config.emb_index); value_ptr_vec_.emplace_back(val); } - - if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); + int64 dump_version = feat_desc->GetVersion(value_ptr); version_vec_.emplace_back(dump_version); } if(is_save_freq) { - int64 dump_freq = value_ptr->GetFreq(); + int64 dump_freq = feat_desc->GetFreq(value_ptr); freq_vec_.emplace_back(dump_freq); } } else { @@ -66,18 +62,18 @@ void EmbeddingVarCkptData::Emplace( key_filter_vec_.emplace_back(key); if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); + int64 dump_version = feat_desc->GetVersion(value_ptr); version_filter_vec_.emplace_back(dump_version); } - int64 dump_freq = value_ptr->GetFreq(); + int64 dump_freq = feat_desc->GetFreq(value_ptr); freq_filter_vec_.emplace_back(dump_freq); } } #define REGISTER_KERNELS(ktype, vtype) \ template void EmbeddingVarCkptData::Emplace( \ - ktype, ValuePtr*, const EmbeddingConfig&, \ - vtype*, int64, bool, bool, bool); + ktype, void*, const EmbeddingConfig&, \ + vtype*, FeatureDescriptor*, bool, bool, bool); #define REGISTER_KERNELS_ALL_INDEX(type) \ REGISTER_KERNELS(int32, type) \ REGISTER_KERNELS(int64, type) diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h index 6d7b09e70b0..10bf0d0e43b 100644 --- a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h @@ -19,15 +19,19 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" namespace tensorflow { class BundleWriter; +namespace { + const int kSavedPartitionNum = 1000; + const int kDramFlagOffset = 49; +} namespace embedding { - template class EmbeddingVarCkptData { public: - void Emplace(K key, ValuePtr* value_ptr, + void Emplace(K key, void* value_ptr, const EmbeddingConfig& emb_config, - V* default_value, int64 value_offset, + V* default_value, + FeatureDescriptor* feat_desc, bool is_save_freq, bool is_save_version, bool save_unfiltered_features); diff --git a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h index 84c823a90dc..4c052b43c7e 100644 --- a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h +++ b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h @@ -57,7 +57,7 @@ class EV2dVectorDataDumpIterator: public DumpIterator { value_len_(value_len), col_idx_(0) { if (!valueptr_list.empty()) { - if ((int64)*curr_iter_ == ValuePosition::NOT_IN_DRAM) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { curr_ptr_ = val_iter_->Next(); } else { curr_ptr_ = *curr_iter_; @@ -75,7 +75,7 @@ class EV2dVectorDataDumpIterator: public DumpIterator { curr_iter_++; col_idx_ = 0; if (curr_iter_ != end_iter_) { - if ((int64)*curr_iter_ == ValuePosition::NOT_IN_DRAM) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { curr_ptr_ = val_iter_->Next(); } else { curr_ptr_ = *curr_iter_; diff --git a/tensorflow/core/framework/embedding/feature_descriptor.h b/tensorflow/core/framework/embedding/feature_descriptor.h new file mode 100644 index 00000000000..8808da353f4 --- /dev/null +++ b/tensorflow/core/framework/embedding/feature_descriptor.h @@ -0,0 +1,200 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/framework/embedding/config.pb.h" +#include "tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h" +#include "tensorflow/core/framework/embedding/normal_feature_descriptor.h" +#include + +namespace tensorflow { +namespace embedding { + +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl; + +template +class FeatureDescriptor { + public: + FeatureDescriptor( + int64 block_num, + int64 slot_num, + Allocator* alloc, + StorageType storage_type, + bool need_record_freq, + bool need_record_version, + const std::pair& filter_info) { + if (block_num > 1) { + feat_desc_impl_.reset( + new DynmaicDimDescriptorImpl( + alloc, block_num * slot_num)); + } else if (filter_info.first) { + feat_desc_impl_.reset( + new CounterFilterDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version, + filter_info.second, + storage_type)); + } else if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { + 
feat_desc_impl_.reset( + new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } + } + + FeatureDescriptor(FeatureDescriptor* feat_desc) { + if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(CounterFilterDescriptorImpl*)) { + feat_desc_impl_.reset( + new CounterFilterDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + else if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl)) { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + } + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + return feat_desc_impl_->InitSlotInfo( + emb_index, embedding_dim, default_value); + } + + bool InitSlotInfo(FeatureDescriptor* feat_desc) { + return feat_desc_impl_->InitSlotInfo(feat_desc->feat_desc_impl_.get()); + } + + V* GetEmbedding(void *val, int emb_index) { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + void* Allocate() { + return feat_desc_impl_->Allocate(); + } + + void* Allocate(int64 freq) { + return feat_desc_impl_->Allocate(freq); + } + + void Deallocate(void* val) { + feat_desc_impl_->Deallocate(val); + } + + void Deallocate(const std::vector& value_ptrs) { + feat_desc_impl_->Deallocate(value_ptrs); + } + + void SetDefaultValue(void* val, int64 index) { + feat_desc_impl_->SetDefaultValue(val, index); + } + + void SetValue(void* val, int64 emb_index, V* value) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* 
compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + reinterpret_cast*>(feat_desc_impl_.get())->SetDefaultValues( + keys, init_cursor, value_ptrs, + compute_stream, event_mgr, gpu_device); + } +#endif + + void SetAllocator(Allocator* alloc) { + feat_desc_impl_->SetAllocator(alloc); + } + + int data_bytes() { + return feat_desc_impl_->data_bytes(); + } + + int64 GetFreq(void* val) { + return feat_desc_impl_->GetFreq(val); + } + + int64 GetVersion(void* val) { + return feat_desc_impl_->GetVersion(val); + } + + void SetFreq(void* val, int64 freq) { + feat_desc_impl_->SetFreq(val, freq); + } + + void UpdateVersion(void* val, int64 version) { + feat_desc_impl_->UpdateVersion(val, version); + } + + void AddFreq(void* val, int64 freq) { + feat_desc_impl_->AddFreq(val, freq); + } + + int total_dim() { + return feat_desc_impl_->total_dim(); + } + + bool IsAdmit(void* val) { + return feat_desc_impl_->IsAdmit(val); + } + + void* Admit(void* val) { + return feat_desc_impl_->Admit(val); + } + + + protected: + std::unique_ptr> feat_desc_impl_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/feature_descriptor_impl.h b/tensorflow/core/framework/embedding/feature_descriptor_impl.h new file mode 100644 index 00000000000..6996d22f447 --- /dev/null +++ b/tensorflow/core/framework/embedding/feature_descriptor_impl.h @@ -0,0 +1,317 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#include "tensorflow/core/util/env_var.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { +namespace embedding { +struct SlotInfo { + int embedding_dim; + int embedding_offset; + void* default_value; + int64 default_value_dim; + int default_value_len; +}; + +class BaseFreqDescriptor { + public: + virtual int64 GetFreq(void* value_ptr) = 0; + virtual void AddFreq(void* value_ptr, int64 freq) {} + virtual void SetFreq(void* value_ptr, int64 freq) {} + virtual BaseFreqDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class FreqDescriptor: public BaseFreqDescriptor { + public: + explicit FreqDescriptor(int offset_byte) + : offset_byte_(offset_byte) {} + + int64 GetFreq(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void AddFreq(void* value_ptr, int64 freq) override { + __sync_fetch_and_add((int64*)(value_ptr + offset_byte_), freq); + } + + void SetFreq(void* value_ptr, int64 freq) override { + *(int64*)(value_ptr + offset_byte_) = freq; + } + + BaseFreqDescriptor* Clone() override { + return new FreqDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonFreqDescriptor: public BaseFreqDescriptor { + public: + int64 GetFreq(void* value_ptr) override { + LOG(FATAL)<<"Can not get freq from NonFreqCounter."; + } + + BaseFreqDescriptor* Clone() override { + return new NonFreqDescriptor(); + } +}; + +class BaseVersionDescriptor { + public: + virtual int64 
GetVersion(void* value_ptr) = 0; + virtual void UpdateVersion(void* value_ptr, int64 version) {} + virtual BaseVersionDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class VersionDescriptor: public BaseVersionDescriptor { + public: + explicit VersionDescriptor(int offset_byte) + : offset_byte_(offset_byte) {} + + int64 GetVersion(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void UpdateVersion(void* value_ptr, int64 version) override { + *(int64*)(value_ptr + offset_byte_) = version; + } + + BaseVersionDescriptor* Clone() override { + return new VersionDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonVersionDescriptor: public BaseVersionDescriptor { + public: + int64 GetVersion(void* value_ptr) override { + LOG(FATAL)<<"Can not get version from NonFreqCounter."; + } + + BaseVersionDescriptor* Clone() override { + return new NonVersionDescriptor(); + } +}; + +template +class FeatureDescriptorImpl { + public: + FeatureDescriptorImpl(int64 slot_num, + bool need_record_freq, + bool need_record_version) { + slot_infos_.resize(slot_num); + for (int i = 0; i < slot_infos_.size(); i++) { + slot_infos_[i].embedding_offset = EMPTY_OFFSET_VALUE; + } + + if (!need_record_freq) { + freq_desc_.reset(new NonFreqDescriptor()); + } + if (!need_record_version) { + version_desc_.reset(new NonVersionDescriptor()); + } + } + + FeatureDescriptorImpl(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + freq_desc_.reset( + feat_desc_impl->freq_desc_->Clone()); + version_desc_.reset( + feat_desc_impl->version_desc_->Clone()); + } + + virtual ~FeatureDescriptorImpl() {} + + virtual bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) = 0; + virtual bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + 
LOG(FATAL)<<"InitSlotInfo(feat_desc_impl) is not implemented."; + } + virtual V* GetEmbedding(void* val, int emb_index) = 0; + virtual void* Allocate() = 0; + virtual void* Allocate(int64 freq) {return Allocate();} + virtual void Deallocate(void* val) = 0; + virtual void Deallocate(const std::vector& val) = 0; + virtual void SetAllocator(Allocator* alloc) = 0; + virtual void SetDefaultValue(void* val, int64 key) = 0; + virtual void SetValue(void* val, int64 emb_index, V* value) {} + virtual bool IsAdmit(void* val) {return true;} + virtual void* Admit(void* val) {} +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + virtual int data_bytes() = 0; + + virtual int64 GetFreq(void* val) { + return freq_desc_->GetFreq(val); + } + + virtual int64 GetVersion(void* val) { + return version_desc_->GetVersion(val); + } + + virtual void SetFreq(void* val, int64 freq) { + freq_desc_->SetFreq(val, freq); + } + + virtual void UpdateVersion(void* val, int64 version) { + version_desc_->UpdateVersion(val, version); + } + + virtual void AddFreq(void* val, int64 freq) { + freq_desc_->AddFreq(val, freq); + } + + inline int total_dim() { + int64 slot_num = slot_infos_.size(); + return slot_infos_[slot_num - 1].embedding_offset + + slot_infos_[slot_num - 1].embedding_dim; + } + + protected: + bool SetEmbeddingInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + slot_infos_[emb_index].default_value = default_value.first; + slot_infos_[emb_index].default_value_dim = default_value.second; + slot_infos_[emb_index].default_value_len = embedding_dim; + + bool is_aligned = true; + TF_CHECK_OK(ReadBoolFromEnvVar("EV_DATA_ALIGNED", true, + &is_aligned)); + if (is_aligned) { + embedding_dim = ComputeAlignedDim(embedding_dim); + } + + //Avoid parallel consitency issue + __sync_bool_compare_and_swap( + 
&slot_infos_[emb_index].embedding_offset, + EMPTY_OFFSET_VALUE, embedding_dim); + slot_infos_[emb_index].embedding_dim = embedding_dim; + //Check whether all offsets are set + for (int i = 0; i < slot_infos_.size(); i++) { + if (slot_infos_[i].embedding_offset == EMPTY_OFFSET_VALUE) { + return false; + } + } + + ComputeEmbeddingOffsets(); + return true; + } + + void SetSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + } + + void ComputeAllocBytes(int* alloc_bytes) { + for(auto slot_info: slot_infos_) { + *alloc_bytes += slot_info.embedding_dim * sizeof(V); + } + } + + void CreateFreqAndVersionDescriptor(int* alloc_bytes) { + if (!freq_desc_) { + freq_desc_.reset(new FreqDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + if (!version_desc_) { + version_desc_.reset(new VersionDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + } + + void InitFreqAndVersion(void* val) { + freq_desc_->SetFreq(val, 0); + version_desc_->UpdateVersion(val, -1); + } + + void SetFreqAndVersionOffset(int* alloc_bytes) { + freq_desc_->SetOffset(alloc_bytes); + version_desc_->SetOffset(alloc_bytes); + } + + V* GetDefaultValuePtr(int64 emb_index, int64 key) { + V* default_value_base = (V*)slot_infos_[emb_index].default_value; + int64 default_value_offset = + (key % slot_infos_[emb_index].default_value_dim) * + slot_infos_[emb_index].default_value_len; + return default_value_base + default_value_offset; + } + + void SetDefaultValue(void* val, int64 emb_index, int64 key) { + memcpy(val, + GetDefaultValuePtr(emb_index, key), + slot_infos_[emb_index].default_value_len * sizeof(V)); + } + + private: + int64 ComputeAlignedDim(int64 embedding_dim) { + int padding_bytes = + ALIGN_BYTES - embedding_dim * sizeof(V) % ALIGN_BYTES; + if (padding_bytes == ALIGN_BYTES) { + return embedding_dim; + } else { + return embedding_dim + padding_bytes / sizeof(V); + } + } + + void ComputeEmbeddingOffsets() { + for (int i = slot_infos_.size() 
- 1 ; i >= 0; i--) { + slot_infos_[i].embedding_offset = 0; + for (int j = 0; j < i; j++) { + slot_infos_[i].embedding_offset += slot_infos_[j].embedding_offset; + } + } + } + + protected: + const int EMPTY_OFFSET_VALUE= -1; + const int ALIGN_BYTES = 16; + std::vector slot_infos_; + std::unique_ptr freq_desc_; + std::unique_ptr version_desc_; +}; + +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ diff --git a/tensorflow/core/framework/embedding/filter_factory.h b/tensorflow/core/framework/embedding/filter_factory.h index 5bb92467a51..0127e2c882a 100644 --- a/tensorflow/core/framework/embedding/filter_factory.h +++ b/tensorflow/core/framework/embedding/filter_factory.h @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/filter_policy.h" #include "tensorflow/core/framework/embedding/nullable_filter_policy.h" - namespace tensorflow { namespace embedding{ template @@ -34,22 +33,23 @@ class FilterFactory { template static FilterPolicy* CreateFilter( const EmbeddingConfig& config, EV* ev, - embedding::Storage* storage) { + embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) { if (config.filter_freq > 0) { if (config.kHashFunc != 0) { return new BloomFilterPolicy( - config, ev); + config, ev, feat_desc); } else { return new CounterFilterPolicy( - config, ev); + config, ev, feat_desc); } } else { return new NullableFilterPolicy( - config, ev, storage); + config, ev, storage, feat_desc); } } }; -} // tensorflow +} //namespace tensorflow #endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_FACTORY_H_ diff --git a/tensorflow/core/framework/embedding/filter_policy.h b/tensorflow/core/framework/embedding/filter_policy.h index 559a6796246..256d3b044d4 100644 --- a/tensorflow/core/framework/embedding/filter_policy.h +++ b/tensorflow/core/framework/embedding/filter_policy.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/emb_file.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" namespace tensorflow { @@ -45,9 +46,6 @@ struct RestoreBuffer { template class RestoreSSDBuffer; -template -class ValuePtr; - template class FilterPolicy { public: @@ -55,7 +53,7 @@ class FilterPolicy { config_(config), ev_(ev) {} virtual void LookupOrCreate(K key, V* val, - const V* default_value_ptr, ValuePtr** value_ptr, + const V* default_value_ptr, void** value_ptr, int count, const V* default_value_no_permission) = 0; virtual Status Lookup(K key, V* val, const V* default_value_ptr, @@ -70,53 +68,25 @@ class FilterPolicy { virtual void BatchLookupOrCreateKey( const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) = 0; #endif //GOOGLE_CUDA - virtual Status LookupOrCreateKey(K key, ValuePtr** val, + virtual Status LookupOrCreateKey(K key, void** val, bool* is_filter, int64 count) = 0; + + virtual Status LookupKey(K key, void** val, + bool* is_filter, int64 count) {} - virtual int64 GetFreq(K key, ValuePtr* value_ptr) = 0; - + virtual int64 GetFreq(K key, void* value_ptr) = 0; virtual int64 GetFreq(K key) = 0; - virtual bool is_admit(K key, ValuePtr* value_ptr) = 0; + virtual bool is_admit(K key, void* value_ptr) = 0; virtual Status Restore(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool to_dram, bool is_incr, RestoreBuffer& restore_buff) = 0; - protected: - void LookupOrCreateEmbInternal(bool is_filter, bool to_dram, - int i, int value_len, - ValuePtr* value_ptr, - V* value_src, K* key_src) { - - if (!is_filter) { - ev_->LookupOrCreateEmb(value_ptr, value_src + i * ev_->ValueLen()); - return; - } else { - if (to_dram) { -#if GOOGLE_CUDA - std::vector default_value_host; - default_value_host.resize(config_.default_value_dim * 
value_len); - cudaMemcpy(default_value_host.data(), ev_->GetDefaultValuePtr(), - sizeof(V) * config_.default_value_dim * value_len, - cudaMemcpyDeviceToHost); - ev_->LookupOrCreateEmb(value_ptr, - default_value_host.data() + - (key_src[i] % config_.default_value_dim) - * ev_->ValueLen()); -#endif - return; - } else { - ev_->LookupOrCreateEmb(value_ptr, ev_->GetDefaultValue(key_src[i])); - return; - } - } - } - protected: EmbeddingConfig config_; EV* ev_; diff --git a/tensorflow/core/framework/embedding/globalstep_shrink_policy.h b/tensorflow/core/framework/embedding/globalstep_shrink_policy.h index a2af6a2430a..b0950eff22d 100644 --- a/tensorflow/core/framework/embedding/globalstep_shrink_policy.h +++ b/tensorflow/core/framework/embedding/globalstep_shrink_policy.h @@ -18,25 +18,21 @@ limitations under the License. #include "tensorflow/core/framework/embedding/shrink_policy.h" namespace tensorflow { - -template -class ValuePtr; - namespace embedding { template class GlobalStepShrinkPolicy : public ShrinkPolicy { public: GlobalStepShrinkPolicy(int64 steps_to_live, - Allocator* alloc, + FeatureDescriptor* feat_desc, KVInterface* kv) : steps_to_live_(steps_to_live), kv_(kv), - ShrinkPolicy(alloc) {} + ShrinkPolicy(feat_desc) {} TF_DISALLOW_COPY_AND_ASSIGN(GlobalStepShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override { ShrinkPolicy::ReleaseValuePtrs(); FilterToDelete(shrink_args.global_step, @@ -46,16 +42,16 @@ class GlobalStepShrinkPolicy : public ShrinkPolicy { private: void FilterToDelete(int64 global_step, std::vector& key_list, - std::vector*>& value_list) { + std::vector& value_list) { for (int64 i = 0; i < key_list.size(); ++i) { - int64 version = value_list[i]->GetStep(); + int64 version = ShrinkPolicy::feat_desc_->GetVersion(value_list[i]); if (version == -1) { - value_list[i]->SetStep(global_step); + ShrinkPolicy::feat_desc_->UpdateVersion(value_list[i], 
global_step); } else { if (global_step - version > steps_to_live_) { kv_->Remove(key_list[i]); ShrinkPolicy::EmplacePointer(value_list[i]); - value_list[i] = (ValuePtr*)ValuePtrStatus::IS_DELETED; + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; } } } diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h index 1dd90d63a6e..fc4a2506313 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h @@ -204,29 +204,29 @@ class GPUHashMapKV : public KVInterface { } Status BatchLookupOrCreate(const K* keys, size_t n, - ValuePtr** value_ptrs) override { + void** value_ptrs) override { return Status::OK(); } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { return Status::OK(); } Status Contains(K key) override { return Status::OK(); } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { return Status::OK(); } Status Remove(K key) override { return Status::OK(); } Status BatchLookup(const K* keys, size_t size, - ValuePtr** value_ptrs) override { + void** value_ptrs) override { return Status::OK(); } Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return Status::OK(); } @@ -235,22 +235,20 @@ class GPUHashMapKV : public KVInterface { } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return Status::OK(); } int64 Size() const override { return 0; } - void SetTotalDims(int total_dims) override {} + void FreeValuePtr(void* value_ptr) override {} - void FreeValuePtr(ValuePtr* value_ptr) override {} - - Status Commit(K key, const ValuePtr* value_ptr) override { + Status Commit(K key, const void* value_ptr) override { return Status::OK(); } Status 
GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { return Status::OK(); } diff --git a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h index 581f1f1cfaf..1056f4bbd78 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h @@ -3,7 +3,6 @@ #if GOOGLE_CUDA #define EIGEN_USE_GPU -#include "tensorflow/core/framework/embedding/lockless_hash_map_cpu.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" @@ -14,9 +13,6 @@ namespace tensorflow { using se::DeviceMemoryBase; using se::Stream; -template -class ValuePtr; - template class CheckpointLoader; @@ -26,15 +22,17 @@ namespace embedding { template class HbmDramSsdStorage : public MultiTierStorage { public: - HbmDramSsdStorage(const StorageConfig& sc, Allocator* gpu_alloc, - Allocator* cpu_alloc, LayoutCreator* lc, const std::string& name) - : cpu_alloc_(cpu_alloc), gpu_alloc_(gpu_alloc), + HbmDramSsdStorage(const StorageConfig& sc, + Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name), dram_capacity_(-1) { - hbm_ = new HbmStorageWithCpuKv(sc, gpu_alloc_, lc); - dram_ = new DramStorage(sc, cpu_alloc_, lc, - new LocklessHashMapCPU(gpu_alloc_)); - ssd_ = new SsdHashStorage(sc, cpu_alloc_, lc); + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); + ssd_ = new SsdHashStorage(sc, dram_feat_desc_); } ~HbmDramSsdStorage() override { @@ -46,29 +44,20 @@ class HbmDramSsdStorage : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(HbmDramSsdStorage); - void 
SetAllocLen(int64 value_len, int slot_num) override { - while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. - Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + ssd_->Init(); - MultiTierStorage::cache_capacity_ = - Storage::storage_config_.size[0] - / (Storage::total_dims_ * sizeof(V)); + MultiTierStorage::cache_capacity_ = + Storage::storage_config_.size[0] + / (total_dim() * sizeof(V)); - dram_capacity_ = Storage::storage_config_.size[1] - / (Storage::total_dims_ * sizeof(V)); - MultiTierStorage::ready_eviction_ = true; - } - Storage::flag_.clear(std::memory_order_release); + dram_capacity_ = Storage::storage_config_.size[1] + / (total_dim() * sizeof(V)); + MultiTierStorage::ready_eviction_ = true; } - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = hbm_->Get(key, value_ptr); if (s.ok()) { return s; @@ -88,13 +77,12 @@ class HbmDramSsdStorage : public MultiTierStorage { void BatchGet(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) override { + void** value_ptr_list, + int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); - std::vector*>> + std::vector> ssd_value_ptr_list(num_worker_threads + 1); BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, @@ -102,20 +90,20 @@ class HbmDramSsdStorage : public MultiTierStorage { CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursor_list[0], - ssd_value_ptr_list[0], value_len); + ssd_value_ptr_list[0]); } void 
BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_fountd_cursor_list) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); - std::vector*>> + std::vector> ssd_value_ptr_list(num_worker_threads + 1); BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, @@ -124,70 +112,27 @@ class HbmDramSsdStorage : public MultiTierStorage { CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursor_list[0], - ssd_value_ptr_list[0], value_len); + ssd_value_ptr_list[0]); CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], value_len); } - void Insert(K key, ValuePtr* value_ptr) override { + void Insert(K key, void** value_ptr) override { hbm_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { if (to_dram) { - dram_->Insert(key, value_ptr, alloc_len); + dram_->Insert(key, value_ptr); } else { - hbm_->Insert(key, value_ptr, alloc_len); + hbm_->Insert(key, value_ptr); } } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(size); - { - mutex_lock l(memory_pool_mu_); - gpu_value_ptr->SetPtr(embedding_mem_pool_->Allocate()); - *value_ptr = gpu_value_ptr; - } - s = hbm_->TryInsert(key, *value_ptr); - // Insert Failed - if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate((*value_ptr)->GetValue(0, 0)); - } - delete *value_ptr; - return hbm_->Get(key, value_ptr); - } else { - return s; - } - } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - 
Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - s = dram_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK; - return s; - } - s = ssd_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK_AND_DESTROY; - return s; - } - hbm_->Insert(key, value_ptr, size); - return Status::OK(); + Status GetOrCreate(K key, void** value_ptr) override { + LOG(FATAL)<<"Stroage with HBM only suppotrs batch APIs."; } void InitCache(embedding::CacheStrategy cache_strategy) override { @@ -195,66 +140,6 @@ class HbmDramSsdStorage : public MultiTierStorage { dram_cache_ = new LRUCache(); } - void CopyEmbeddingsFromCPUToGPU( - int total, const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, V* memcpy_buffer_gpu, - se::Stream* compute_stream, - EventMgr* event_mgr, - const DeviceBase::CpuWorkerThreads* worker_threads) override { - auto memcpy_buffer_cpu = TypedAllocator::Allocate(cpu_allocator(), - total * value_len, AllocationAttributes()); - int64* memory_index = new int64[total]; - int64 i = 0; - auto it = copyback_cursor.cbegin(); - { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursor.cend(); ++it, ++i) { - int64 j = *it & 0x0fffffffffffffff; - memory_index[i] = *it; - ValuePtr* gpu_value_ptr = - hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)memcpy_address[j] - sizeof(FixedLengthHeader), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - } - } - - auto do_work = [memory_index, memcpy_address, - memcpy_buffer_cpu, gpu_value_ptrs, - value_len, this] (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - int64 j = memory_index[i] & 0x0fffffffffffffff; - bool destroy_flag = (memory_index[i] >> 63) & 0x1; - 
memcpy(memcpy_buffer_cpu + i * value_len, - memcpy_address[j], value_len * sizeof(V)); - if (destroy_flag) { - ssd_->DestroyValuePtr(reinterpret_cast*>( - (char *)memcpy_address[j] - sizeof(FixedLengthHeader))); - } - } - }; - Shard(worker_threads->num_threads, worker_threads->workers, total, - 1000, do_work); - - DeviceMemoryBase gpu_dst_ptr( - memcpy_buffer_gpu, total * value_len * sizeof(V)); - compute_stream->ThenMemcpy( - &gpu_dst_ptr, memcpy_buffer_cpu, total * value_len * sizeof(V)); - SyncWithEventMgr(compute_stream, event_mgr); - TypedAllocator::Deallocate( - cpu_allocator(), memcpy_buffer_cpu, total * value_len); - delete[] memory_index; - } - Status Remove(K key) override { hbm_->Remove(key); dram_->Remove(key); @@ -311,25 +196,23 @@ class HbmDramSsdStorage : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_dram_key_list; - std::vector*> value_ptr_list, tmp_dram_value_list; + std::vector value_ptr_list, tmp_dram_value_list; TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); HbmValueIterator hbm_value_iter( key_list, value_ptr_list, - emb_config.emb_index, Storage::alloc_len_, - gpu_alloc_); + emb_config.emb_index, value_len, + gpu_alloc_, hbm_feat_desc_); - std::vector*> tmp_hbm_value_ptrs(value_ptr_list.size()); for (int64 i = 0; i < value_ptr_list.size(); i++) { - ValuePtr* value_ptr = hbm_->CreateValuePtr(value_len); - memcpy((char *)value_ptr->GetPtr(), - (char *)value_ptr_list[i]->GetPtr(), - sizeof(FixedLengthHeader)); - value_ptr->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - value_ptr->SetInitialized(emb_config.primary_emb_index); - tmp_hbm_value_ptrs[i] = value_ptr; - value_ptr_list[i] = value_ptr; + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq( + value_ptr, hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + 
value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); } TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, @@ -347,17 +230,24 @@ class HbmDramSsdStorage : public MultiTierStorage { { mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, &hbm_value_iter))); } - for (auto it: tmp_hbm_value_ptrs) { - delete it; + for (auto value_ptr: value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); + } } ssd_->Save(tensor_name, prefix, writer, emb_config, @@ -368,7 +258,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status DramToSsdBatchCommit(std::shared_ptr> keys) { MultiTierStorage::ReleaseValuePtrs(dram_value_ptr_out_of_date_, - dram_->alloc_); + dram_feat_desc_); mutex_lock l(*(ssd_->get_mutex())); mutex_lock l1(*(dram_->get_mutex())); @@ -380,7 +270,7 @@ class HbmDramSsdStorage : public MultiTierStorage { k_size = std::min(k_size, DramEvictionSize); K dram_evic_ids[DramEvictionSize]; size_t true_size = dram_cache_->get_evic_ids(dram_evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < true_size; ++i) { if (dram_->Get(dram_evic_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_->Commit(dram_evic_ids[i], value_ptr)); @@ -408,22 +298,31 @@ class HbmDramSsdStorage : public MultiTierStorage { k_size = std::min(k_size, EvictionSize); size_t true_size = MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; std::shared_ptr> keys(new std::vector()); - std::vector*> value_ptrs; + std::vector hbm_value_ptrs; + std::vector dram_value_ptrs; for (int64 i = 0; i < 
true_size; ++i) { if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { keys->emplace_back(evic_ids[i]); - value_ptrs.emplace_back(value_ptr); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); } } - dram_->BatchCommit(*keys, value_ptrs); - { - //Mutex with main thread - mutex_lock l_mem(memory_pool_mu_); - embedding_mem_pool_->Deallocate(value_ptrs); - } + + CopyEmbeddingFromHbmToDram( + hbm_value_ptrs, + dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + dram_->BatchCommit(*keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); for (auto it : *keys) { TF_CHECK_OK(hbm_->Remove(it)); } @@ -435,58 +334,14 @@ class HbmDramSsdStorage : public MultiTierStorage { } } - void CreateEmbeddingMemoryPool( - Allocator* alloc, - int64 value_len, - int64 block_size) override { - embedding_mem_pool_ = - new EmbeddingMemoryPool(alloc, value_len, block_size); - } - - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for (auto it : value_ptr_list) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = it->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - //Mutex with other ImportOps - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < num_of_value_ptrs; i++) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = value_ptr_list[i]->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, 
new_value_ptr, old_value_ptr); } protected: - void SetTotalDims(int64 total_dims) override { - dram_->SetTotalDims(total_dims); - ssd_->SetTotalDims(total_dims); - } - - void CopyToGpuValuePtr( - ValuePtr* gpu_ptr, - ValuePtr* cpu_ptr, - int64 size) { - V* cpu_data_address = cpu_ptr->GetValue(0, 0); - V* gpu_data_address = gpu_ptr->GetValue(0, 0); - cudaMemcpy(gpu_data_address, cpu_data_address, - size * sizeof(V), cudaMemcpyHostToDevice); - memcpy(gpu_ptr->GetPtr(), - cpu_ptr->GetPtr(), - sizeof(FixedLengthHeader)); + int total_dim() override { + return hbm_feat_desc_->total_dim(); } void Restore(const std::string& name_string, @@ -539,6 +394,10 @@ class HbmDramSsdStorage : public MultiTierStorage { (int64*)restore_buff.freq_buffer); return s; } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override {} private: void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { V* memcpy_buffer_cpu = new V[size * value_len]; @@ -551,46 +410,30 @@ class HbmDramSsdStorage : public MultiTierStorage { (V*)gpu_alloc_->AllocateRaw( Allocator::kAllocatorAlignment, size * sizeof(V*)); - ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; - ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; - { - //Mutex with other Import Ops - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < size; i++) { - dram_->Get(ids[i], &cpu_value_ptrs[i]); - gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - gpu_value_ptrs[i]->SetPtr(val_ptr); - memcpy((char *)gpu_value_ptrs[i]->GetPtr(), - (char *)cpu_value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], 
&gpu_value_ptrs[i]); } } //Split from above for loop for minize the cost of mutex lock //TODO: Speed up with intra parallelism - std::vector*> invalid_value_ptrs; + for (int64 i = 0; i < size; i++) { memcpy(memcpy_buffer_cpu + i * value_len, - cpu_value_ptrs[i]->GetValue(emb_index, - Storage::GetOffset(emb_index)), - value_len * sizeof(V)); - Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); - if (!s.ok()) { - invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); - hbm_->Get(ids[i], &gpu_value_ptrs[i]); - } - gpu_value_ptrs[i]->SetInitialized(emb_index); - value_address[i] = gpu_value_ptrs[i]->GetValue( - emb_index, Storage::GetOffset(emb_index)); + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); } cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, size * value_len * sizeof(V), cudaMemcpyHostToDevice); cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), cudaMemcpyHostToDevice); - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate(invalid_value_ptrs); - } int block_dim = 128; void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, (void*)&value_len, (void*)&size}; @@ -611,10 +454,10 @@ class HbmDramSsdStorage : public MultiTierStorage { void BatchGetValuePtrs( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, std::vector>& copyback_cursor_list, - std::vector*>>& ssd_value_ptr_list, + std::vector>& ssd_value_ptr_list, std::vector>* not_found_cursor_list = nullptr) { int num_worker_threads = ctx.worker_threads->num_threads; IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); @@ -688,39 +531,32 @@ class HbmDramSsdStorage : public MultiTierStorage { void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursors, - 
std::list*>& ssd_value_ptrs, - int64 value_len) { + std::list& ssd_value_ptrs) { int64 total = copyback_cursors.size(); - std::vector*> gpu_value_ptrs(total); + std::vector gpu_value_ptrs(total); std::vector copyback_keys(total); std::vector memory_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = copyback_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursors.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)value_ptr_list[j]->GetPtr(), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - copyback_keys[i] = keys[*it]; - } + int64 i = 0; + auto it = copyback_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion(gpu_value_ptr, + dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; } MultiTierStorage::CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursors, - memory_index, gpu_value_ptrs, value_len); + memory_index, gpu_value_ptrs, hbm_feat_desc_->total_dim(), + hbm_feat_desc_, dram_feat_desc_); //Insert copyback ids to hbm hash table. 
auto do_insert = [this, copyback_keys, gpu_value_ptrs, @@ -730,12 +566,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status s = hbm_->TryInsert( copyback_keys[i], gpu_value_ptrs[i]); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - gpu_value_ptrs[i]->GetValue(0, 0)); - } - delete gpu_value_ptrs[i]; + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); } } @@ -752,34 +583,31 @@ class HbmDramSsdStorage : public MultiTierStorage { void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& not_found_cursors, int64 value_len) { int64 total = not_found_cursors.size(); if (total > 0) { - std::vector*>> insert_pairs(total); + std::vector> insert_pairs(total); std::vector cursor_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = not_found_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != not_found_cursors.cend(); ++it, ++i) { - int64 j = *it; - cursor_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - value_ptr_list[j] = gpu_value_ptr; - insert_pairs[i].first = keys[j]; - insert_pairs[i].second = value_ptr_list[j]; - } + + int64 i = 0; + auto it = not_found_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; } + hbm_feat_desc_->SetDefaultValues( + keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, ctx.gpu_device); + //Insert copyback ids to hbm hash table. 
auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index] (int64 start, int64 limit) { @@ -787,12 +615,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status s = hbm_->TryInsert( insert_pairs[i].first, insert_pairs[i].second); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - insert_pairs[i].second->GetValue(0, 0)); - } - delete insert_pairs[i].second; + hbm_->DestroyValuePtr(insert_pairs[i].second); hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); } } @@ -804,29 +627,28 @@ class HbmDramSsdStorage : public MultiTierStorage { } void AddCopyBackFlagToValuePtr( - ValuePtr** value_ptr, CopyBackFlag copyback_flag) { + void** value_ptr, CopyBackFlag copyback_flag) { int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; tmp = ((int64)*value_ptr) | tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } - void RemoveCopyBackFlagInValuePtr(ValuePtr** value_ptr) { + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; tmp = ((int64)*value_ptr) & tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } private: HbmStorageWithCpuKv* hbm_ = nullptr; DramStorage* dram_ = nullptr; SsdHashStorage* ssd_ = nullptr; - EmbeddingMemoryPool* embedding_mem_pool_; Allocator* gpu_alloc_; - Allocator* cpu_alloc_; BatchCache* dram_cache_; int64 dram_capacity_; - std::deque*> dram_value_ptr_out_of_date_; - mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ + std::deque dram_value_ptr_out_of_date_; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; const int copyback_flag_offset_bits_ = 60; }; } // embedding diff --git a/tensorflow/core/framework/embedding/hbm_dram_storage.h b/tensorflow/core/framework/embedding/hbm_dram_storage.h index 518c39287e0..d058d95f05b 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_storage.h +++ 
b/tensorflow/core/framework/embedding/hbm_dram_storage.h @@ -17,7 +17,6 @@ limitations under the License. #if GOOGLE_CUDA #define EIGEN_USE_GPU -#include "tensorflow/core/framework/embedding/lockless_hash_map_cpu.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/framework/embedding/hbm_storage_iterator.h" @@ -29,9 +28,6 @@ namespace tensorflow { using se::DeviceMemoryBase; using se::Stream; -template -class ValuePtr; - template class CheckpointLoader; @@ -41,27 +37,27 @@ namespace embedding { template class HbmDramStorage : public MultiTierStorage { public: - HbmDramStorage(const StorageConfig& sc, Allocator* gpu_alloc, - Allocator* cpu_alloc, LayoutCreator* lc, - const std::string& name) - : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name) { - hbm_ = new HbmStorageWithCpuKv(sc, gpu_alloc, lc); - StorageConfig storage_config = StorageConfig(); - storage_config.layout_type = LayoutType::NORMAL_CONTIGUOUS; - dram_ = new DramStorage(sc, cpu_alloc, - LayoutCreatorFactory::Create(storage_config), - new LocklessHashMapCPU(gpu_alloc)); + HbmDramStorage(const StorageConfig& sc, + Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), + MultiTierStorage(sc, name) { + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); } ~HbmDramStorage() override { MultiTierStorage::DeleteFromEvictionManager(); delete hbm_; delete dram_; + delete dram_feat_desc_; } TF_DISALLOW_COPY_AND_ASSIGN(HbmDramStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = hbm_->Get(key, value_ptr); if (s.ok()) { return s; @@ -76,9 +72,8 @@ class HbmDramStorage : public MultiTierStorage { void BatchGet(const EmbeddingVarContext& ctx, const K* keys, - 
ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) override { + void** value_ptr_list, + int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); @@ -87,18 +82,17 @@ class HbmDramStorage : public MultiTierStorage { copyback_cursor_list); CopyEmbeddingsFromDramToHbm( - ctx, keys, value_ptr_list, copyback_cursor_list[0], - value_len); + ctx, keys, value_ptr_list, copyback_cursor_list[0]); } - void Insert(K key, ValuePtr* value_ptr) override { + void Insert(K key, void** value_ptr) override { hbm_->Insert(key, value_ptr); } void BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_fountd_cursor_list) override { @@ -110,115 +104,22 @@ class HbmDramStorage : public MultiTierStorage { copyback_cursor_list, ¬_fountd_cursor_list); CopyEmbeddingsFromDramToHbm( - ctx, keys, value_ptr_list, copyback_cursor_list[0], - value_len); - + ctx, keys, value_ptr_list, copyback_cursor_list[0]); CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], value_len); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) override { if (to_dram) { - dram_->Insert(key, value_ptr, alloc_len); + dram_->CreateAndInsert(key, value_ptr); } else { - hbm_->Insert(key, value_ptr, alloc_len); + hbm_->CreateAndInsert(key, value_ptr); } } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(size); - { - mutex_lock l(memory_pool_mu_); - gpu_value_ptr->SetPtr(embedding_mem_pool_->Allocate()); - *value_ptr = gpu_value_ptr; - } - s = hbm_->TryInsert(key, *value_ptr); - if (s.ok()) { - return s; - } - // Insert Failed, 
key already exist - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate((*value_ptr)->GetValue(0, 0)); - } - delete *value_ptr; - return hbm_->Get(key, value_ptr); - } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - s = dram_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK; - return s; - } - - hbm_->Insert(key, value_ptr, size); - return Status::OK(); - } - - void CopyEmbeddingsFromCPUToGPU( - int total, const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, V* memcpy_buffer_gpu, - se::Stream* compute_stream, - EventMgr* event_mgr, - const DeviceBase::CpuWorkerThreads* worker_threads) override { - auto memcpy_buffer_cpu = TypedAllocator::Allocate(cpu_allocator(), - total * value_len, AllocationAttributes()); - int64* memory_index = new int64[total]; - int64 i = 0; - auto it = copyback_cursor.cbegin(); - { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursor.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)memcpy_address[j] - sizeof(FixedLengthHeader), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - } - } - //Split from above for loop for minize the cost of mutex lock - auto do_work = [memory_index, memcpy_address, - memcpy_buffer_cpu, gpu_value_ptrs, - value_len, this] (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - int j = memory_index[i]; - memcpy(memcpy_buffer_cpu + i * value_len, - memcpy_address[j], value_len * sizeof(V)); - } - }; - 
Shard(worker_threads->num_threads, worker_threads->workers, total, - 1000, do_work); - DeviceMemoryBase gpu_dst_ptr( - memcpy_buffer_gpu, total * value_len * sizeof(V)); - compute_stream->ThenMemcpy( - &gpu_dst_ptr, memcpy_buffer_cpu, total * value_len * sizeof(V)); - SyncWithEventMgr(compute_stream, event_mgr); - TypedAllocator::Deallocate( - cpu_allocator(), memcpy_buffer_cpu, total * value_len); - delete[] memory_index; + Status GetOrCreate(K key, void** value_ptr) override { + LOG(FATAL)<<"Stroage with HBM only suppotrs batch APIs."; } Status Remove(K key) override { @@ -270,25 +171,23 @@ class HbmDramStorage : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_dram_key_list; - std::vector*> value_ptr_list, tmp_dram_value_list; + std::vector value_ptr_list, tmp_dram_value_list; TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); HbmValueIterator hbm_value_iter( key_list, value_ptr_list, - emb_config.emb_index, Storage::alloc_len_, - gpu_alloc_); - - std::vector*> tmp_hbm_value_ptrs(value_ptr_list.size()); + emb_config.emb_index, value_len, + gpu_alloc_, hbm_feat_desc_); + for (int64 i = 0; i < value_ptr_list.size(); i++) { - ValuePtr* value_ptr = hbm_->CreateValuePtr(value_len); - memcpy((char *)value_ptr->GetPtr(), - (char *)value_ptr_list[i]->GetPtr(), - sizeof(FixedLengthHeader)); - value_ptr->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - value_ptr->SetInitialized(emb_config.primary_emb_index); - tmp_hbm_value_ptrs[i] = value_ptr; - value_ptr_list[i] = value_ptr; + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq( + value_ptr, hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); } 
TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, @@ -306,54 +205,26 @@ class HbmDramStorage : public MultiTierStorage { { mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, &hbm_value_iter))); } - for (auto it: tmp_hbm_value_ptrs) { - delete it; - } - return Status::OK(); - } - - void CreateEmbeddingMemoryPool( - Allocator* alloc, - int64 value_len, - int64 block_size) override { - embedding_mem_pool_ = - new EmbeddingMemoryPool(alloc, value_len, block_size); - } - - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for (auto it : value_ptr_list) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = it->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - //Mutex with other ImportOps - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < num_of_value_ptrs; i++) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = value_ptr_list[i]->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); + for (auto value_ptr: value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); } } + return Status::OK(); } void BatchEviction() override { @@ -372,22 +243,31 @@ class HbmDramStorage : public MultiTierStorage { k_size = std::min(k_size, EvictionSize); size_t true_size = MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; std::vector keys; - std::vector*> value_ptrs; + std::vector 
hbm_value_ptrs; + std::vector dram_value_ptrs; for (int64 i = 0; i < true_size; ++i) { if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { keys.emplace_back(evic_ids[i]); - value_ptrs.emplace_back(value_ptr); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); } } - dram_->BatchCommit(keys, value_ptrs); - { - //Mutex with main thread - mutex_lock l_mem(memory_pool_mu_); - embedding_mem_pool_->Deallocate(value_ptrs); - } + + CopyEmbeddingFromHbmToDram( + hbm_value_ptrs, + dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + dram_->BatchCommit(keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); for (auto it : keys) { TF_CHECK_OK(hbm_->Remove(it)); } @@ -430,6 +310,16 @@ class HbmDramStorage : public MultiTierStorage { } } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + MultiTierStorage::Init(); + } + protected: Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, @@ -447,14 +337,14 @@ class HbmDramStorage : public MultiTierStorage { return s; } - void SetTotalDims(int64 total_dims) override { - dram_->SetTotalDims(total_dims); + int total_dim() override { + return hbm_feat_desc_->total_dim(); } private: void BatchGetValuePtrs( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, std::vector>& copyback_cursor_list, std::vector>* not_found_cursor_list = nullptr) { @@ -522,38 +412,31 @@ class HbmDramStorage : public MultiTierStorage { void CopyEmbeddingsFromDramToHbm(const 
EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, - std::list& copyback_cursors, - int64 value_len) { + void** value_ptr_list, + std::list& copyback_cursors) { int64 total = copyback_cursors.size(); - std::vector*> gpu_value_ptrs(total); + std::vector gpu_value_ptrs(total); std::vector copyback_keys(total); std::vector memory_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = copyback_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursors.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)value_ptr_list[j]->GetPtr(), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - copyback_keys[i] = keys[*it]; - } + int64 i = 0; + auto it = copyback_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion(gpu_value_ptr, + dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; } MultiTierStorage::CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursors, - memory_index, gpu_value_ptrs, value_len); + memory_index, gpu_value_ptrs, hbm_feat_desc_->total_dim(), + hbm_feat_desc_, dram_feat_desc_); //Insert copyback ids to hbm hash table. 
auto do_insert = [this, copyback_keys, gpu_value_ptrs, @@ -563,12 +446,7 @@ class HbmDramStorage : public MultiTierStorage { Status s = hbm_->TryInsert( copyback_keys[i], gpu_value_ptrs[i]); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - gpu_value_ptrs[i]->GetValue(0, 0)); - } - delete gpu_value_ptrs[i]; + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); } } @@ -580,34 +458,29 @@ class HbmDramStorage : public MultiTierStorage { void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& not_found_cursors, int64 value_len) { int64 total = not_found_cursors.size(); if (total > 0) { - std::vector*>> insert_pairs(total); + std::vector> insert_pairs(total); std::vector cursor_index(total); - //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = not_found_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != not_found_cursors.cend(); ++it, ++i) { - int64 j = *it; - cursor_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - value_ptr_list[j] = gpu_value_ptr; - insert_pairs[i].first = keys[j]; - insert_pairs[i].second = value_ptr_list[j]; - } + //Create Hbm ValuePtrs. + int64 i = 0; + auto it = not_found_cursors.cbegin(); + for ( ; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; } + hbm_feat_desc_->SetDefaultValues( + keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, ctx.gpu_device); + //Insert copyback ids to hbm hash table. 
auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index] (int64 start, int64 limit) { @@ -615,12 +488,7 @@ class HbmDramStorage : public MultiTierStorage { Status s = hbm_->TryInsert( insert_pairs[i].first, insert_pairs[i].second); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - insert_pairs[i].second->GetValue(0, 0)); - } - delete insert_pairs[i].second; + hbm_->DestroyValuePtr(insert_pairs[i].second); hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); } } @@ -632,16 +500,22 @@ class HbmDramStorage : public MultiTierStorage { } void AddCopyBackFlagToValuePtr( - ValuePtr** value_ptr, CopyBackFlag copyback_flag) { + void** value_ptr, CopyBackFlag copyback_flag) { int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; tmp = ((int64)*value_ptr) | tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } - void RemoveCopyBackFlagInValuePtr(ValuePtr** value_ptr) { + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; tmp = ((int64)*value_ptr) & tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); + } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { @@ -655,45 +529,30 @@ class HbmDramStorage : public MultiTierStorage { (V*)gpu_alloc_->AllocateRaw( Allocator::kAllocatorAlignment, size * sizeof(V*)); - ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; - ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; - { - //Mutex with other Import Ops - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < size; i++) { - dram_->Get(ids[i], &cpu_value_ptrs[i]); - gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - gpu_value_ptrs[i]->SetPtr(val_ptr); - memcpy((char 
*)gpu_value_ptrs[i]->GetPtr(), - (char *)cpu_value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], &gpu_value_ptrs[i]); } } //Split from above for loop for minize the cost of mutex lock //TODO: Speed up with intra parallelism - std::vector*> invalid_value_ptrs; + for (int64 i = 0; i < size; i++) { memcpy(memcpy_buffer_cpu + i * value_len, - cpu_value_ptrs[i]->GetValue(emb_index, - Storage::GetOffset(emb_index)), value_len * sizeof(V)); - Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); - if (!s.ok()) { - invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); - hbm_->Get(ids[i], &gpu_value_ptrs[i]); - } - gpu_value_ptrs[i]->SetInitialized(emb_index); - value_address[i] = gpu_value_ptrs[i]->GetValue( - emb_index, Storage::GetOffset(emb_index)); + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); } cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, size * value_len * sizeof(V), cudaMemcpyHostToDevice); cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), cudaMemcpyHostToDevice); - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate(invalid_value_ptrs); - } int block_dim = 128; void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, (void*)&value_len, (void*)&size}; @@ -714,9 +573,9 @@ class HbmDramStorage : public MultiTierStorage { private: HbmStorageWithCpuKv* hbm_ = nullptr; DramStorage* dram_ = nullptr; - EmbeddingMemoryPool* embedding_mem_pool_ = nullptr; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; 
Allocator* gpu_alloc_; - mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ const int copyback_flag_offset_bits_ = 60; }; } // embedding diff --git a/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h b/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h new file mode 100644 index 00000000000..a3603a61550 --- /dev/null +++ b/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h @@ -0,0 +1,122 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/embedding_memory_pool.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +namespace embedding { +template +class NormalFeatureDescriptorImpl; + +template +class HbmMultiTierFeatureDescriptorImpl + : public FeatureDescriptorImpl { + public: + HbmMultiTierFeatureDescriptorImpl( + Allocator* alloc, int64 slot_num, + bool need_record_freq, + bool need_record_version) + : dram_alloc_bytes_(sizeof(V*)), + hbm_alloc_(alloc), + dram_alloc_(ev_allocator()), + FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&dram_alloc_bytes_); + } + + ~HbmMultiTierFeatureDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + bool is_compute_alloc_bytes = + FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + if (is_compute_alloc_bytes) { + FeatureDescriptorImpl::ComputeAllocBytes(&hbm_alloc_bytes_); + embedding_mem_pool_.reset( + new EmbeddingMemoryPool(hbm_alloc_, + hbm_alloc_bytes_ / sizeof(V), + 1024 * 1024 * 64)); + } + return is_compute_alloc_bytes; + } + + V* GetEmbedding(void *val, int emb_index) override { + return *((V**)val) + + FeatureDescriptorImpl::slot_infos_[emb_index].embedding_offset; + } + + void* Allocate() override { + void* val = dram_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, dram_alloc_bytes_); + mutex_lock l(memory_pool_mu_); + *((V**)val) = embedding_mem_pool_->Allocate(); + FeatureDescriptorImpl::InitFreqAndVersion(val); + return val; + } + + void 
Deallocate(void* val) override { + mutex_lock l(memory_pool_mu_); + embedding_mem_pool_->Deallocate(*((V**)val)); + dram_alloc_->DeallocateRaw(val); + } + + void Deallocate(const std::vector& value_ptrs) override { + mutex_lock l(memory_pool_mu_); + for (auto ptr: value_ptrs) { + embedding_mem_pool_->Deallocate(*((V**)ptr)); + dram_alloc_->DeallocateRaw(ptr); + } + } + void SetDefaultValue(void* val, int64 key) override { + LOG(FATAL)<<"Can't call SetDefaultValue(void* val, int64 key," + <<"int default_value_len) in HbmMultiTierFeatureDescriptor."; + } + + void SetAllocator(Allocator* alloc) override { + hbm_alloc_ = alloc; + } + + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device); + + int data_bytes() override { + return dram_alloc_bytes_; + } + public: + friend class NormalFeatureDescriptorImpl; + protected: + int dram_alloc_bytes_; + int hbm_alloc_bytes_ = 0; + mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ + Allocator* hbm_alloc_; + Allocator* dram_alloc_; + std::unique_ptr> embedding_mem_pool_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/hbm_storage_iterator.h b/tensorflow/core/framework/embedding/hbm_storage_iterator.h index 36d331e74aa..31dc4459a13 100644 --- a/tensorflow/core/framework/embedding/hbm_storage_iterator.h +++ b/tensorflow/core/framework/embedding/hbm_storage_iterator.h @@ -28,10 +28,11 @@ class HbmValueIterator: public ValueIterator { public: HbmValueIterator( const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, int64 emb_index, int64 value_len, - Allocator* alloc) + Allocator* alloc, + FeatureDescriptor* feat_desc) : value_len_(value_len), alloc_(alloc) { int64 emb_offset = value_len_ * 
emb_index; @@ -40,7 +41,7 @@ class HbmValueIterator: public ValueIterator { for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { if (key_list[i] % kSavedPartitionNum == part_id) { value_parts_vec[part_id].emplace_back( - value_ptr_list[i]->GetValue(emb_index, emb_offset)); + feat_desc->GetEmbedding(value_ptr_list[i], emb_index)); break; } } diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index 5d1f20b581a..3659187c825 100644 --- a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ #include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { @@ -24,9 +25,6 @@ namespace { const char* kInferenceMode = "INFERENCE_MODE"; } -template -class ValuePtr; - template class GPUHashTable; @@ -43,19 +41,19 @@ template class KVInterface { public: virtual ~KVInterface() {} - virtual Status Lookup(K key, ValuePtr** value_ptr) = 0; + virtual Status Lookup(K key, void** value_ptr) = 0; virtual Status Contains(K key) = 0; - virtual Status Insert(K key, const ValuePtr* value_ptr) = 0; + virtual Status Insert(K key, const void* value_ptr) = 0; virtual Status Remove(K key) = 0; virtual Status BatchLookup(const K* keys, size_t size, - ValuePtr** value_ptrs) { + void** value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookup in KVInterface."); } // KV Batch Insert virtual Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchInsert in KVInterface."); } @@ -66,27 +64,30 @@ class KVInterface { } virtual Status BatchLookupOrCreate(const K* keys, size_t size, - ValuePtr** 
value_ptrs) { + void** value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookupOrInsert in KVInterface."); } + virtual void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) { + LOG(FATAL)<<"Unimplemented for UpdateValuePtr in KVInterface."; + } + virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) = 0; + const std::vector& value_ptrs) = 0; // KV Size virtual int64 Size() const = 0; - virtual void SetTotalDims(int total_dims) {} - - virtual void FreeValuePtr(ValuePtr* value_ptr) {} + virtual void FreeValuePtr(void* value_ptr) {} - virtual Status Commit(K key, const ValuePtr* value_ptr) { + virtual Status Commit(K key, const void* value_ptr) { return Status::OK(); } virtual Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) = 0; + std::vector* value_ptr_list) = 0; virtual std::string DebugString() const = 0; diff --git a/tensorflow/core/framework/embedding/l2weight_shrink_policy.h b/tensorflow/core/framework/embedding/l2weight_shrink_policy.h index 2af6b58f94b..9b0ea8aba3f 100644 --- a/tensorflow/core/framework/embedding/l2weight_shrink_policy.h +++ b/tensorflow/core/framework/embedding/l2weight_shrink_policy.h @@ -19,28 +19,23 @@ limitations under the License. 
namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class L2WeightShrinkPolicy : public ShrinkPolicy { public: L2WeightShrinkPolicy(float l2_weight_threshold, int64 index, - int64 offset, - Allocator* alloc, + FeatureDescriptor* feat_desc, KVInterface* kv) : index_(index), - offset_(offset), kv_(kv), l2_weight_threshold_(l2_weight_threshold), - ShrinkPolicy(alloc) {} + ShrinkPolicy(feat_desc) {} TF_DISALLOW_COPY_AND_ASSIGN(L2WeightShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override { ShrinkPolicy::ReleaseValuePtrs(); FilterToDelete(shrink_args.value_len, @@ -50,9 +45,9 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { private: void FilterToDelete(int64 value_len, std::vector& key_list, - std::vector*>& value_list) { + std::vector& value_list) { for (int64 i = 0; i < key_list.size(); ++i) { - V* val = value_list[i]->GetValue(index_, offset_); + V* val = ShrinkPolicy::feat_desc_->GetEmbedding(value_list[i], index_); if (val != nullptr) { V l2_weight = (V)0.0; for (int64 j = 0; j < value_len; j++) { @@ -61,7 +56,7 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { l2_weight *= (V)0.5; if (l2_weight < (V)l2_weight_threshold_) { kv_->Remove(key_list[i]); - value_list[i] = (ValuePtr*)ValuePtrStatus::IS_DELETED; + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; ShrinkPolicy::EmplacePointer(value_list[i]); } } @@ -70,7 +65,7 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { private: int64 index_; - int64 offset_; + //int64 offset_; KVInterface* kv_; float l2_weight_threshold_; }; diff --git a/tensorflow/core/framework/embedding/layout_creator.h b/tensorflow/core/framework/embedding/layout_creator.h deleted file mode 100644 index 07d50451bf0..00000000000 --- a/tensorflow/core/framework/embedding/layout_creator.h +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright 2022 The DeepRec Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -======================================================================*/ -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ - -#include "tensorflow/core/framework/embedding/cache.h" -#include "tensorflow/core/framework/embedding/config.pb.h" -#include "tensorflow/core/framework/embedding/storage_config.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { -template -class ValuePtr; - -namespace embedding { -template -class LayoutCreator { - public: - virtual ValuePtr* Create(Allocator* alloc, size_t size) = 0; -}; - -template -class NormalLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalValuePtr(alloc, size); - } -}; - -template -class LightLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new LightValuePtr(alloc, size); - } -}; - -template -class NormalContiguousLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalContiguousValuePtr(alloc, size); - } -}; - -template -class NormalContiguousGPULayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalGPUValuePtr(alloc, size); - } -}; - -template -class CompactLayoutCreator : public LayoutCreator { - public: 
- ValuePtr* Create(Allocator* alloc, size_t size) override { - return new CompactValuePtr(alloc, size); - } -}; - -class LayoutCreatorFactory { - public: - template - static LayoutCreator* Create(const StorageConfig& sc) { - switch (sc.layout_type) { - case LayoutType::NORMAL: - static NormalLayoutCreator normal_creator; - return &normal_creator; - case LayoutType::LIGHT: - static LightLayoutCreator light_creator; - return &light_creator; - case LayoutType::NORMAL_CONTIGUOUS: - static NormalContiguousLayoutCreator normal_contiguous_creator; - return &normal_contiguous_creator; - case LayoutType::NORMAL_CONTIGUOUS_GPU: - static NormalContiguousGPULayoutCreator - normal_contiguous_gpu_creator; - return &normal_contiguous_gpu_creator; - case LayoutType::COMPACT: - static CompactLayoutCreator compact_creator; - return &compact_creator; - default: - static NormalLayoutCreator default_creator; - return &default_creator; - } - } -}; -} // embedding -} // tensorflow - -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ diff --git a/tensorflow/core/framework/embedding/leveldb_kv.h b/tensorflow/core/framework/embedding/leveldb_kv.h index 8ea1fa63fc2..e488ab3776d 100644 --- a/tensorflow/core/framework/embedding/leveldb_kv.h +++ b/tensorflow/core/framework/embedding/leveldb_kv.h @@ -17,9 +17,7 @@ limitations under the License. 
#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LEVELDB_KV_H_ #include "tensorflow/core/lib/io/path.h" - #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/lib/core/status.h" #include "leveldb/db.h" @@ -35,9 +33,6 @@ using leveldb::WriteBatch; using leveldb::WriteOptions; namespace tensorflow { -template -class ValuePtr; - namespace embedding { template @@ -76,28 +71,21 @@ class SizeCounter { template class LevelDBKV : public KVInterface { public: - LevelDBKV(std::string path) { + LevelDBKV(std::string path, FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { path_ = io::JoinPath(path, "level_db_" + std::to_string(Env::Default()->NowMicros()));; options_.create_if_missing = true; leveldb::Status s = leveldb::DB::Open(options_, path_, &db_); CHECK(s.ok()); counter_ = new SizeCounter(8); - new_value_ptr_fn_ = [] (size_t size) { - return new NormalContiguousValuePtr(ev_allocator(), size); - }; - total_dims_ = 0; - } - - void SetTotalDims(int total_dims) { - total_dims_ = total_dims; } ~LevelDBKV() override { delete db_; } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { std::string val_str; leveldb::Slice db_key((char*)(&key), sizeof(void*)); leveldb::ReadOptions options; @@ -106,8 +94,8 @@ class LevelDBKV : public KVInterface { return errors::NotFound( "Unable to find Key: ", key, " in LevelDB."); } else { - ValuePtr* val = new_value_ptr_fn_(total_dims_); - memcpy((int64 *)(val->GetPtr()), &val_str[0], val_str.length()); + void* val = feat_desc_->Allocate(); + memcpy((int64 *)val, &val_str[0], val_str.length()); *value_ptr = val; return Status::OK(); } @@ -126,22 +114,22 @@ class LevelDBKV : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { counter_->add(key, 1); return Status::OK(); } Status BatchInsert(const 
std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return BatchCommit(keys, value_ptrs); } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { WriteBatch batch; for (int i = 0; i < keys.size(); i++) { - std::string value_res((char*)value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader) + total_dims_ * sizeof(V)); + std::string value_res((char*)value_ptrs[i], + feat_desc_->data_bytes()); leveldb::Slice db_key((char*)(&keys[i]), sizeof(void*)); batch.Put(db_key, value_res); delete value_ptrs[i]; @@ -150,9 +138,9 @@ class LevelDBKV : public KVInterface { return Status::OK(); } - Status Commit(K key, const ValuePtr* value_ptr) override { - std::string value_res((char*)value_ptr->GetPtr(), - sizeof(FixedLengthHeader) + total_dims_ * sizeof(V)); + Status Commit(K key, const void* value_ptr) override { + std::string value_res((char*)value_ptr, + feat_desc_->data_bytes()); leveldb::Slice db_key((char*)(&key), sizeof(void*)); leveldb::Status s = db_->Put(WriteOptions(), db_key, value_res); if (!s.ok()){ @@ -176,22 +164,32 @@ class LevelDBKV : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { ReadOptions options; options.snapshot = db_->GetSnapshot(); leveldb::Iterator* it = db_->NewIterator(options); + void* dram_value_ptr = feat_desc_->Allocate(); for (it->SeekToFirst(); it->Valid(); it->Next()) { K key; memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); key_list->emplace_back(key); - ValuePtr* value_ptr = - new NormalGPUValuePtr(ev_allocator(), 1); - memcpy((char *)value_ptr->GetPtr(), + FeatureDescriptor hbm_feat_desc( + 1, 1, ev_allocator()/*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc.data_bytes()); + 
memcpy(dram_value_ptr, it->value().ToString().data(), - sizeof(FixedLengthHeader)); + feat_desc_->data_bytes()); + hbm_feat_desc.SetFreq( + value_ptr, feat_desc_->GetFreq(dram_value_ptr)); + hbm_feat_desc.UpdateVersion( + value_ptr, feat_desc_->GetVersion(dram_value_ptr)); value_ptr_list->emplace_back(value_ptr); } delete it; + feat_desc_->Deallocate(dram_value_ptr); return Status::OK(); } @@ -199,8 +197,8 @@ class LevelDBKV : public KVInterface { return counter_->size(); } - void FreeValuePtr(ValuePtr* value_ptr) override { - delete value_ptr; + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); } std::string DebugString() const override{ @@ -212,8 +210,7 @@ class LevelDBKV : public KVInterface { SizeCounter* counter_; Options options_; std::string path_; - std::function*(size_t)> new_value_ptr_fn_; - int total_dims_; + FeatureDescriptor* feat_desc_; }; template @@ -223,10 +220,12 @@ class DBValueIterator: public ValueIterator { const std::vector& key_list, int64 emb_index, int64 value_len, - LevelDBKV* leveldb_kv) + LevelDBKV* leveldb_kv, + FeatureDescriptor* feat_desc) : value_len_(value_len), emb_index_(emb_index), - leveldb_kv_(leveldb_kv) { + leveldb_kv_(leveldb_kv), + feat_desc_(feat_desc) { int64 emb_offset = value_len_ * emb_index; std::vector> keys_parts_vec(kSavedPartitionNum); for (int64 i = 0; i < key_list.size(); i++) { @@ -251,8 +250,7 @@ class DBValueIterator: public ValueIterator { V* Next() { if (value_ptr_ != nullptr) { - value_ptr_->Destroy(ev_allocator()); - delete value_ptr_; + feat_desc_->Deallocate(value_ptr_); } K key = *(keys_iter_++); @@ -260,16 +258,17 @@ class DBValueIterator: public ValueIterator { if (!s.ok()) { LOG(FATAL)<<"Not found value in LevelDB when Save."; } - return value_ptr_->GetValue(emb_index_, value_len_ * emb_index_); + return feat_desc_->GetEmbedding(value_ptr_, emb_index_); } private: int64 value_len_; int64 emb_index_; LevelDBKV* leveldb_kv_; + FeatureDescriptor* feat_desc_; std::list 
keys_; typename std::list::const_iterator keys_iter_; - ValuePtr* value_ptr_ = nullptr; + void* value_ptr_ = nullptr; int64 key_cursor_ = 0; }; diff --git a/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h b/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h deleted file mode 100644 index 8dcea81d4a1..00000000000 --- a/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright 2022 The DeepRec Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-=======================================================================*/ - -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ -#if GOOGLE_CUDA -#define EIGEN_USE_GPU - -#include "sparsehash/dense_hash_map_lockless" -#include "tensorflow/core/framework/embedding/batch.h" -#include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/platform/stream_executor.h" - -namespace tensorflow { -using se::DeviceMemoryBase; -using se::Stream; - -namespace embedding { - -template -class LocklessHashMapCPU : public KVInterface { - public: - LocklessHashMapCPU(Allocator* gpu_alloc): gpu_alloc_(gpu_alloc) { - hash_map_.max_load_factor(0.8); - hash_map_.set_empty_key_and_value(EMPTY_KEY_, nullptr); - hash_map_.set_counternum(16); - hash_map_.set_deleted_key(DELETED_KEY_); - cudaEventCreate(&is_finish_); - } - - ~LocklessHashMapCPU() override { - cudaEventDestroy(is_finish_); - } - - Status Lookup(K key, ValuePtr** value_ptr) override { - auto iter = hash_map_.find_wait_free(key); - if (iter.first == EMPTY_KEY_) { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } else { - *value_ptr = iter.second; - return Status::OK(); - } - } - - Status Contains(K key) override { - auto iter = hash_map_.find_wait_free(key); - if (iter.first == EMPTY_KEY_) { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } else { - return Status::OK(); - } - } - - Status Insert(K key, const ValuePtr* value_ptr) override { - auto iter = hash_map_.insert_lockless( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); - // insert fail, exist key - if ((*(iter.first)).second != value_ptr){ - return errors::AlreadyExists( - "already exists Key: ", key, " in LocklessHashMap."); - } else { - return Status::OK(); - } - } - 
- // Other Method - int64 Size() const override { - return hash_map_.size_lockless(); - } - - // Remove KV - Status Remove(K key) override { - if (hash_map_.erase_lockless(key)) { - return Status::OK(); - } else { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } - } - - void SetTotalDims(int total_dims) override { - total_dims_ = total_dims; - } - - void AppendToValuePtrQueue(ValuePtr* old_value_ptr) { - //A parameter that can be adjusted in the future - if (value_ptr_out_of_date_.size() > CAP_INVALID_VALUEPTR) { - ValuePtr* value_ptr = value_ptr_out_of_date_.front(); - delete value_ptr; - value_ptr_out_of_date_.pop_front(); - } - value_ptr_out_of_date_.emplace_back(old_value_ptr); - } - - Status Commit(K key, const ValuePtr* value_ptr) override { - ValuePtr* cpu_value_ptr = - new NormalContiguousValuePtr(ev_allocator(), total_dims_); - cudaMemcpy((char *)cpu_value_ptr->GetPtr() + sizeof(FixedLengthHeader), - *(char **)((char*)value_ptr->GetPtr() + sizeof(FixedLengthHeader)), - total_dims_ * sizeof(V), - cudaMemcpyDeviceToHost); - memcpy((char *)cpu_value_ptr->GetPtr(), - (char*)value_ptr->GetPtr(), sizeof(FixedLengthHeader)); - auto iter = hash_map_.insert_lockless(std::move( - std::pair*>(key, - const_cast*>(cpu_value_ptr)))); - if ((*(iter.first)).second != cpu_value_ptr) { - AppendToValuePtrQueue((*(iter.first)).second); - (*(iter.first)).second = cpu_value_ptr; - } - return Status::OK(); - } - - Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { - int batch_size = keys.size(); - Allocator* cpu_alloc = cpu_allocator(); - V** value_address = (V **)cpu_alloc->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V*) * batch_size); - V** dev_value_address; - V* batch_data_place; - V* dev_batch_data_place; - dev_value_address = (V**)gpu_alloc_->AllocateRaw( - Allocator::kAllocatorAlignment, batch_size * sizeof(V*)); - dev_batch_data_place = (V*)gpu_alloc_->AllocateRaw( - 
Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dims_); - batch_data_place = (V *)cpu_alloc->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dims_); - - // Copy GPU addresses V* - for(int i = 0;i < batch_size;++i) { - value_address[i] = - *(V **)((char*)value_ptrs[i]->GetPtr() + sizeof(FixedLengthHeader)); - } - - cudaMemcpyAsync(dev_value_address, value_address, - sizeof(V*) * batch_size, - cudaMemcpyHostToDevice); - - // Launch Kernel,Copy data to continuous place - int block_dim = 128; - void* args[] = { (void*)&dev_value_address, - (void*)&dev_batch_data_place, (void*)&total_dims_, - (void*)&batch_size}; - - cudaLaunchKernel((void *)BatchCopy, - (batch_size * total_dims_ + block_dim - 1) / block_dim, - block_dim, args, 0, NULL); - - cudaMemcpyAsync(batch_data_place, dev_batch_data_place, - sizeof(V) * batch_size * total_dims_, - cudaMemcpyDeviceToHost); - - cudaEventRecord(is_finish_); - cudaEventSynchronize(is_finish_); - - // Copy data to ValuePtrs in memory;Insert it into hashmap - for(int i = 0; i < batch_size; ++i) { - ValuePtr* cpu_value_ptr = - new NormalContiguousValuePtr(ev_allocator(), total_dims_); - memcpy((char *)cpu_value_ptr->GetPtr() + sizeof(FixedLengthHeader), - &batch_data_place[i * total_dims_], total_dims_ * sizeof(V)); - memcpy((char *)cpu_value_ptr->GetPtr(), - (char *)value_ptrs[i]->GetPtr(), sizeof(FixedLengthHeader)); - auto iter = hash_map_.insert_lockless(std::move( - std::pair*>(keys[i], - const_cast*>(cpu_value_ptr)))); - if ((*(iter.first)).second != cpu_value_ptr) { - AppendToValuePtrQueue((*(iter.first)).second); - (*(iter.first)).second = cpu_value_ptr; - } - } - - gpu_alloc_->DeallocateRaw(dev_value_address); - gpu_alloc_->DeallocateRaw(dev_batch_data_place); - - cpu_alloc->DeallocateRaw(batch_data_place); - cpu_alloc->DeallocateRaw(value_address); - - return Status::OK(); - } - - Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { - std::pair*> 
*hash_map_dump; - int64 bucket_count; - auto it = hash_map_.GetSnapshot(); - hash_map_dump = it.first; - bucket_count = it.second; - for (int64 j = 0; j < bucket_count; j++) { - if (hash_map_dump[j].first != EMPTY_KEY_ && - hash_map_dump[j].first != DELETED_KEY_) { - key_list->emplace_back(hash_map_dump[j].first); - value_ptr_list->emplace_back(hash_map_dump[j].second); - } - } - free(hash_map_dump); - return Status::OK(); - } - - std::string DebugString() const override { - LOG(INFO) << "map info size:" << Size() - << "map info bucket_count:" << hash_map_.bucket_count() - << "map info load_factor:" << hash_map_.load_factor() - << "map info max_load_factor:" << hash_map_.max_load_factor() - << "map info min_load_factor:" << hash_map_.min_load_factor(); - return ""; - } - - private: - typedef google::dense_hash_map_lockless* > - LockLessHashMap; - static const int EMPTY_KEY_ = -1; - static const int DELETED_KEY_ = -2; - static constexpr int CAP_INVALID_VALUEPTR = 200000; - LockLessHashMap hash_map_; - std::deque*> value_ptr_out_of_date_; - int total_dims_; - Allocator* gpu_alloc_; - cudaEvent_t is_finish_; -}; -} // namespace embedding -} // namespace tensorflow - -#endif //GOOGLE_CUDA -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc b/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc index de275183d22..9745ab5fcc3 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc +++ b/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc @@ -15,8 +15,7 @@ limitations under the License. 
#if GOOGLE_CUDA #define EIGEN_USE_GPU #include "tensorflow/core/framework/embedding/multi_tier_storage.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" -#include "tensorflow/core/framework/embedding/batch.h" +#include "tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/op_kernel.h" @@ -44,11 +43,13 @@ template void MultiTierStorage::CopyEmbeddingsFromDramToHbm( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursor, const std::vector& memory_index, - const std::vector*>& gpu_value_ptrs, - int value_len) { + const std::vector& gpu_value_ptrs, + int value_len, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc) { if (copyback_cursor.size() > 0) { int total = copyback_cursor.size(); //Alocate memcpy buffer on CPU and GPU. @@ -64,11 +65,13 @@ void MultiTierStorage::CopyEmbeddingsFromDramToHbm( auto do_work = [memory_index, memcpy_buffer_cpu, value_ptr_list, gpu_value_ptrs, + dram_feat_desc, value_len, this] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { int j = memory_index[i]; memcpy(memcpy_buffer_cpu + i * value_len, - value_ptr_list[j]->GetValue(0, 0), value_len * sizeof(V)); + dram_feat_desc->GetEmbedding(value_ptr_list[j], 0), + value_len * sizeof(V)); value_ptr_list[j] = gpu_value_ptrs[i]; } }; @@ -96,8 +99,7 @@ void MultiTierStorage::CopyEmbeddingsFromDramToHbm( for (; it != copyback_cursor.cend(); ++it, ++i) { // Get the cursor int64 cursor = *it; - gpu_value_ptrs[i]->SetInitialized(0); - value_address[i] = gpu_value_ptrs[i]->GetValue(0, 0); + value_address[i] = hbm_feat_desc->GetEmbedding(gpu_value_ptrs[i], 0); } DeviceMemoryBase gpu_addr_dst_ptr(dev_value_address, total * sizeof(V*)); compute_stream->ThenMemcpy(&gpu_addr_dst_ptr, value_address, total * sizeof(V*)); @@ -119,16 +121,71 @@ void 
MultiTierStorage::CopyEmbeddingsFromDramToHbm( } #define REGISTER_KERNELS(ktype, vtype) \ template void MultiTierStorage::CopyEmbeddingsFromDramToHbm( \ - const EmbeddingVarContext&, const ktype*, ValuePtr**,\ + const EmbeddingVarContext&, const ktype*, void**,\ std::list&, const std::vector&,\ - const std::vector*>&, int); + const std::vector&, int, FeatureDescriptor*,\ + FeatureDescriptor*); #define REGISTER_KERNELS_ALL(type) \ REGISTER_KERNELS(int32, type); \ REGISTER_KERNELS(int64, type) #define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) #undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +template +void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( + const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + if (init_cursor.size() > 0) { + int64 total = init_cursor.size(); + TValue** value_address = nullptr; + value_address = TypedAllocator::Allocate(cpu_allocator(), total * 2, + AllocationAttributes()); + TValue** default_value_address = value_address + total; + TValue** dev_value_address = nullptr; + dev_value_address = + TypedAllocator::Allocate(hbm_alloc_, total * 2, AllocationAttributes()); + TValue** dev_default_value_address = dev_value_address + total; + for (int emb_index = 0; emb_index < FeatureDescriptorImpl::slot_infos_.size(); emb_index++) { + int64 i = 0; + auto it = init_cursor.cbegin(); + for (; it != init_cursor.cend(); ++it, ++i) { + value_address[i] = GetEmbedding(value_ptrs[*it], emb_index); + default_value_address[i] = + FeatureDescriptorImpl::GetDefaultValuePtr(emb_index, keys[i]); + } + DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * 2 * sizeof(TValue*)); + compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, + total * 2 * sizeof(TValue*)); + int block_dim = 128; + int value_len = 
FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len; + TF_CHECK_OK(GpuLaunchKernel( + embedding::CopyEmbedding, + (total * value_len + block_dim - 1) / block_dim, + block_dim, 0, gpu_device.stream(), dev_default_value_address, + dev_value_address, value_len, total)); + SyncWithEventMgr(compute_stream, event_mgr); + } + + TypedAllocator::Deallocate(hbm_alloc_, dev_value_address, total * 2); + TypedAllocator::Deallocate(cpu_allocator(), value_address, total * 2); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( \ + const ktype*, const std::list&, void**,\ + se::Stream*, EventMgr*, const Eigen::GpuDevice& gpu_device); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU #undef REGISTER_KERNELS_ALL #undef REGISTER_KERNELS } // namespace embedding diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index 8239d109e64..7955322aca6 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -31,10 +31,11 @@ limitations under the License. 
#include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/core/status.h" -namespace tensorflow { -template -class ValuePtr; +#if GOOGLE_CUDA +#include "tensorflow/core/framework/embedding/batch.h" +#endif +namespace tensorflow { template class EmbeddingVar; @@ -54,22 +55,10 @@ class MultiTierStorage : public Storage { TF_DISALLOW_COPY_AND_ASSIGN(MultiTierStorage); - void SetAllocLen(int64 value_len, int slot_num) override { - while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. - Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); - - cache_capacity_ = Storage::storage_config_.size[0] - / (Storage::total_dims_ * sizeof(V)); - ready_eviction_ = true; - } - Storage::flag_.clear(std::memory_order_release); + virtual void Init() override { + cache_capacity_ = Storage::storage_config_.size[0] + / (total_dim() * sizeof(V)); + ready_eviction_ = true; } int64 CacheSize() const override { @@ -90,13 +79,13 @@ class MultiTierStorage : public Storage { } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { LOG(FATAL)<<"BatchCommit isn't supported by MultiTierStorage."; return Status::OK(); } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { LOG(FATAL)<<"Can't get snapshot of MultiTierStorage."; } @@ -104,7 +93,7 @@ class MultiTierStorage : public Storage { int total, const K* keys, const std::list& copyback_cursor, V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -128,17 +117,6 @@ class 
MultiTierStorage : public Storage { return; } - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - return; - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - return; - } - void Schedule(std::function fn) override { cache_thread_pool_->Schedule(std::move(fn)); } @@ -223,50 +201,50 @@ class MultiTierStorage : public Storage { } return s; } - - virtual void SetTotalDims(int64 total_dims) = 0; + virtual int total_dim() = 0; void DeleteFromEvictionManager() { eviction_manager_->DeleteStorage(this); } - void ReleaseValuePtrs(std::deque*>& value_ptrs, - Allocator* allocator) { + void ReleaseValuePtrs(std::deque& value_ptrs, + FeatureDescriptor* feat_desc) { constexpr int CAP_INVALID_VALUEPTR = 64 * 1024; if (value_ptrs.size() > CAP_INVALID_VALUEPTR) { int64 num_of_deleted_value_ptrs = value_ptrs.size() - CAP_INVALID_VALUEPTR; for (int i = 0; i < num_of_deleted_value_ptrs; i++) { - ValuePtr* value_ptr = value_ptrs.front(); - value_ptr->Destroy(allocator); - delete value_ptr; + void* value_ptr = value_ptrs.front(); + feat_desc->Deallocate(value_ptr); value_ptrs.pop_front(); } } } - void ReleaseInvalidValuePtr(Allocator* allocator) { - ReleaseValuePtrs(value_ptr_out_of_date_, allocator); + void ReleaseInvalidValuePtr(FeatureDescriptor* feat_desc) { + ReleaseValuePtrs(value_ptr_out_of_date_, feat_desc); } - void KeepInvalidValuePtr(ValuePtr* value_ptr) { + void KeepInvalidValuePtr(void* value_ptr) { value_ptr_out_of_date_.emplace_back(value_ptr); } #if GOOGLE_CUDA void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& context, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursors, const std::vector& memory_index, - const std::vector*>& gpu_value_ptrs, - int value_len); + const std::vector& gpu_value_ptrs, + int value_len, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc); #endif //GOOGL_CUDA private: virtual 
Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) {} protected: - std::deque*> value_ptr_out_of_date_; + std::deque value_ptr_out_of_date_; BatchCache* cache_ = nullptr; EvictionManager* eviction_manager_; @@ -281,6 +259,70 @@ class MultiTierStorage : public Storage { std::string name_; std::vector mu_list_; }; + +#if GOOGLE_CUDA +template +void CopyEmbeddingFromHbmToDram( + const std::vector& hbm_value_ptrs, + const std::vector& dram_value_ptrs, + Allocator* gpu_alloc, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc) { + int batch_size = hbm_value_ptrs.size(); + V** dev_value_address; + + dev_value_address = (V**)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, batch_size * sizeof(V*)); + Allocator* cpu_alloc = ev_allocator(); + V** value_address = (V**)cpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V*) * batch_size); + + V* batch_data_place; + V* dev_batch_data_place; + int total_dim = dram_feat_desc->total_dim(); + dev_batch_data_place = (V*)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim); + batch_data_place = (V *)cpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim); + // Copy GPU addresses V* + for(int i = 0; i < batch_size; ++i) { + value_address[i] = hbm_feat_desc->GetEmbedding(hbm_value_ptrs[i], 0); + } + cudaMemcpyAsync(dev_value_address, value_address, + sizeof(V*) * batch_size, + cudaMemcpyHostToDevice); + + // Launch Kernel,Copy data to continuous place + int block_dim = 128; + void* args[] = { (void*)&dev_value_address, + (void*)&dev_batch_data_place, (void*)&total_dim, + (void*)&batch_size}; + + cudaLaunchKernel((void *)BatchCopy, + (batch_size * total_dim + block_dim - 1) / block_dim, + block_dim, args, 0, NULL); + + cudaMemcpyAsync(batch_data_place, dev_batch_data_place, + sizeof(V) * batch_size * total_dim, + cudaMemcpyDeviceToHost); + + cudaEvent_t is_finish_; + cudaEventCreate(&is_finish_); + 
cudaEventRecord(is_finish_); + cudaEventSynchronize(is_finish_); + cudaEventDestroy(is_finish_); + + for(int i = 0; i < batch_size; ++i) { + memcpy(dram_feat_desc->GetEmbedding(dram_value_ptrs[i], 0), + &batch_data_place[i * total_dim], total_dim * sizeof(V)); + } + + cpu_alloc->DeallocateRaw(value_address); + cpu_alloc->DeallocateRaw(batch_data_place); + gpu_alloc->DeallocateRaw(dev_value_address); + gpu_alloc->DeallocateRaw(dev_batch_data_place); +} +#endif //GOOGL_CUDA } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/normal_feature_descriptor.h b/tensorflow/core/framework/embedding/normal_feature_descriptor.h new file mode 100644 index 00000000000..817b33d058b --- /dev/null +++ b/tensorflow/core/framework/embedding/normal_feature_descriptor.h @@ -0,0 +1,134 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +#if GOOGLE_CUDA +template +class HbmMultiTierFeatureDescriptorImpl; +#endif + +template +class NormalFeatureDescriptorImpl: public FeatureDescriptorImpl { + public: + NormalFeatureDescriptorImpl(Allocator* alloc, int64 slot_num, + bool need_record_freq, + bool need_record_version) + : alloc_bytes_(0), + alloc_(alloc), + FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version) {} + + NormalFeatureDescriptorImpl(NormalFeatureDescriptorImpl* feat_desc_impl) + : alloc_(feat_desc_impl->alloc_), + FeatureDescriptorImpl(feat_desc_impl) {} + + NormalFeatureDescriptorImpl( + HbmMultiTierFeatureDescriptorImpl* feat_desc_impl) + : alloc_bytes_(0), + alloc_(feat_desc_impl->dram_alloc_), + FeatureDescriptorImpl(feat_desc_impl) {} + + ~NormalFeatureDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + bool is_compute_alloc_bytes = FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + if (is_compute_alloc_bytes) { + FeatureDescriptorImpl::ComputeAllocBytes(&alloc_bytes_); + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&alloc_bytes_); + } + return is_compute_alloc_bytes; + } + + bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) override { + FeatureDescriptorImpl::SetSlotInfo(feat_desc_impl); + FeatureDescriptorImpl::ComputeAllocBytes(&alloc_bytes_); + FeatureDescriptorImpl::SetFreqAndVersionOffset(&alloc_bytes_); + return true; + } + + V* GetEmbedding(void *val, int emb_index) override { + return reinterpret_cast(val) + + FeatureDescriptorImpl::slot_infos_[emb_index].embedding_offset; + } + + 
void* Allocate() override { + void* val = alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + FeatureDescriptorImpl::InitFreqAndVersion(val); + return val; + } + + void Deallocate(void* val) override { + alloc_->DeallocateRaw(val); + } + + void Deallocate(const std::vector& value_ptrs) override { + for (auto val: value_ptrs) { + Deallocate(val); + } + } + + void SetValue(void* val, int64 emb_index, V* value) override { + V* val_ptr = GetEmbedding(val, emb_index); + memcpy(val_ptr, value, + sizeof(V) * FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len); + } + + void SetDefaultValue(void* val, int64 index) override { + for (int i = 0; i < FeatureDescriptorImpl::slot_infos_.size(); i++) { + V* val_ptr = GetEmbedding(val, i); + FeatureDescriptorImpl::SetDefaultValue((void*)val_ptr, i, index); + } + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + LOG(FATAL)<<"Can't call SetDefaultValue(const K*, const std::list&," + <<"void**, se::Stream*, EventMgr*, const Eigen::GpuDevice&)" + <<" in HbmMultiTierFeatureDescriptor."; + } +#endif + + void SetAllocator(Allocator* alloc) override { + alloc_ = alloc; + } + + int data_bytes() override { + return alloc_bytes_; + } + + private: + int alloc_bytes_; + Allocator* alloc_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/nullable_filter_policy.h b/tensorflow/core/framework/embedding/nullable_filter_policy.h index 0c5ce80886a..7e3ace0063d 100644 --- a/tensorflow/core/framework/embedding/nullable_filter_policy.h +++ b/tensorflow/core/framework/embedding/nullable_filter_policy.h @@ -30,19 +30,21 @@ template class NullableFilterPolicy : public FilterPolicy { using FilterPolicy::ev_; using 
FilterPolicy::config_; - using FilterPolicy::LookupOrCreateEmbInternal; public: NullableFilterPolicy(const EmbeddingConfig& config, - EV* ev, embedding::Storage* storage) : - FilterPolicy(config, ev), storage_(storage) {} + EV* ev, embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) + : storage_(storage), feat_desc_(feat_desc), + FilterPolicy(config, ev) {} Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); if (s.ok()) { - V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + V* mem_val = feat_desc_->GetEmbedding( + value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_ptr, @@ -57,17 +59,17 @@ class NullableFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, keys, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; if (value_ptr != nullptr) { embedding_ptr[i] = - ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_ptr; } @@ -85,65 +87,55 @@ class NullableFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs, + const K* keys, void** value_ptrs, int64 num_of_keys) { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> 
not_found_cursor_list(num_worker_threads + 1); ev_->BatchLookupOrCreateKey(ctx, keys, value_ptrs, num_of_keys, not_found_cursor_list); - std::vector var_ptrs(num_of_keys); - auto do_work = [this, value_ptrs, &var_ptrs] - (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - var_ptrs[i] = ev_->LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - } - }; - auto worker_threads = ctx.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, num_of_keys, - 1000, do_work); - - ev_->SetDefaultValueOfNewFeatures( - keys, num_of_keys, - not_found_cursor_list[0], - var_ptrs.data(), ctx.compute_stream, - ctx.event_mgr, ctx.gpu_device); } #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - V* mem_val = ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { *is_filter = true; - return ev_->LookupOrCreateKey(key, val); + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + storage_->Insert(key, value_ptr); + s = Status::OK(); + } + feat_desc_->AddFreq(*value_ptr, count); + return s; } - int64 GetFreq(K key, ValuePtr* value_ptr) override { - if (storage_->GetLayoutType() != LayoutType::LIGHT) { - return value_ptr->GetFreq(); - }else { - return 0; - } + Status LookupKey(K key, void** val, + bool* is_filter, int64 count) override { + 
*is_filter = true; + return ev_->LookupKey(key, val); + } + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); } int64 GetFreq(K key) override { - if (storage_->GetLayoutType() != LayoutType::LIGHT) { - ValuePtr* value_ptr = nullptr; - TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); - return value_ptr->GetFreq(); - }else { + if (!config_.is_save_freq()) return 0; - } + void* value_ptr = nullptr; + TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); + return feat_desc_->GetFreq(value_ptr); } Status Restore(int64 key_num, int bucket_num, int64 partition_id, @@ -161,27 +153,30 @@ class NullableFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; - ev_->CreateKey(key_buff[i], &value_ptr, to_dram); + int64 import_freq = 0; + int64 import_version = -1; + if (config_.filter_freq !=0 || ev_->IsMultiLevel() || config_.record_freq) { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); + import_version = version_buff[i]; } - LookupOrCreateEmbInternal(is_filter, to_dram, i, value_len, - value_ptr, value_buff, key_buff); + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); } return Status::OK(); } - bool is_admit(K key, ValuePtr* value_ptr) override { + bool is_admit(K key, void* value_ptr) override { return true; } private: embedding::Storage* storage_; + embedding::FeatureDescriptor* feat_desc_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/shrink_policy.h b/tensorflow/core/framework/embedding/shrink_policy.h index ea063a113a3..a8d0d9ada75 100644 --- a/tensorflow/core/framework/embedding/shrink_policy.h +++ b/tensorflow/core/framework/embedding/shrink_policy.h @@ -15,14 +15,11 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { - -template -class ValuePtr; - class Allocator; namespace embedding { @@ -40,31 +37,29 @@ struct ShrinkArgs { template class ShrinkPolicy { public: - ShrinkPolicy(Allocator* alloc): alloc_(alloc) {} + ShrinkPolicy(FeatureDescriptor* feat_desc): feat_desc_(feat_desc) {} virtual ~ShrinkPolicy() {} TF_DISALLOW_COPY_AND_ASSIGN(ShrinkPolicy); virtual void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) = 0; protected: - void EmplacePointer(ValuePtr* value_ptr) { + void EmplacePointer(void* value_ptr) { to_delete_.emplace_back(value_ptr); } void ReleaseValuePtrs() { for (auto it : to_delete_) { - it->Destroy(alloc_); - delete it; + feat_desc_->Deallocate(it); } to_delete_.clear(); } protected: - std::vector*> to_delete_; - private: - Allocator* alloc_; + std::vector to_delete_; + FeatureDescriptor* feat_desc_; }; template @@ -74,7 +69,7 @@ class NonShrinkPolicy: public ShrinkPolicy { TF_DISALLOW_COPY_AND_ASSIGN(NonShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override {} }; } // embedding diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h index f9de65df588..be08afd7f50 100644 --- a/tensorflow/core/framework/embedding/single_tier_storage.h +++ b/tensorflow/core/framework/embedding/single_tier_storage.h @@ -24,7 +24,6 @@ limitations under the License. 
#endif // GOOGLE_CUDA #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/framework/embedding/l2weight_shrink_policy.h" -#include "tensorflow/core/framework/embedding/layout_creator.h" #include "tensorflow/core/framework/embedding/leveldb_kv.h" #include "tensorflow/core/framework/embedding/ssd_hash_kv.h" #include "tensorflow/core/framework/embedding/storage_config.h" @@ -32,9 +31,6 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -62,24 +58,22 @@ class HbmDramSsdStorage; template class SingleTierStorage : public Storage { public: - SingleTierStorage(const StorageConfig& sc, Allocator* alloc, - KVInterface* kv, LayoutCreator* lc) - : kv_(kv), alloc_(alloc), layout_creator_(lc), + SingleTierStorage(const StorageConfig& sc, + KVInterface* kv, FeatureDescriptor* feat_desc) + : kv_(kv), feat_desc_(feat_desc), Storage(sc) { if (sc.embedding_config.steps_to_live != 0) { shrink_policy_ = new GlobalStepShrinkPolicy( sc.embedding_config.steps_to_live, - alloc_, + feat_desc_, kv_); } else if (sc.embedding_config.l2_weight_threshold != -1.0) { shrink_policy_ = new L2WeightShrinkPolicy( sc.embedding_config.l2_weight_threshold, sc.embedding_config.primary_emb_index, - Storage::GetOffset( - sc.embedding_config.primary_emb_index), - alloc_, + feat_desc_, kv_); } else { shrink_policy_ = new NonShrinkPolicy(); @@ -89,11 +83,10 @@ class SingleTierStorage : public Storage { ~SingleTierStorage() override { mutex_lock l(Storage::mu_); std::vector key_list; - std::vector*> value_ptr_list; + std::vector value_ptr_list; kv_->GetSnapshot(&key_list, &value_ptr_list); for (auto value_ptr : value_ptr_list) { - value_ptr->Destroy(alloc_); - delete value_ptr; + feat_desc_->Deallocate(value_ptr); } delete kv_; delete shrink_policy_; @@ -101,7 +94,7 @@ class SingleTierStorage : public Storage { TF_DISALLOW_COPY_AND_ASSIGN(SingleTierStorage); - Status 
Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { return kv_->Lookup(key, value_ptr); } @@ -109,47 +102,45 @@ class SingleTierStorage : public Storage { return kv_->Contains(key); } - virtual void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) override { do { - *value_ptr = layout_creator_->Create(alloc_, alloc_len); + *value_ptr = feat_desc_->Allocate(); Status s = kv_->Insert(key, *value_ptr); if (s.ok()) { break; } else { - (*value_ptr)->Destroy(alloc_); - delete *value_ptr; + feat_desc_->Deallocate(*value_ptr); } } while (!(kv_->Lookup(key, value_ptr)).ok()); } - virtual void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in SingleTireStorage."; + virtual void Insert(K key, void** value_ptr) override { + do { + Status s = kv_->Insert(key, *value_ptr); + if (s.ok()) { + break; + } else { + feat_desc_->Deallocate(*value_ptr); + } + } while (!(kv_->Lookup(key, value_ptr)).ok()); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = kv_->Lookup(key, value_ptr); if (s.ok()) { return s; } - *value_ptr = layout_creator_->Create(alloc_, size); + *value_ptr = feat_desc_->Allocate(); s = kv_->Insert(key, *value_ptr); if (s.ok()) { return s; } // Insert Failed, key already exist - (*value_ptr)->Destroy(alloc_); - delete *value_ptr; + feat_desc_->Deallocate(*value_ptr); return kv_->Lookup(key, value_ptr); } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - return GetOrCreate(key, value_ptr, size); - } Status Remove(K key) override { return kv_->Remove(key); @@ -180,7 +171,7 @@ class SingleTierStorage : public Storage { int total, const K* keys, const std::list& copyback_cursor, V** 
memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -198,13 +189,13 @@ class SingleTierStorage : public Storage { } virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { LOG(FATAL) << "Unsupport BatchCommit in Storage: " << typeid(this).name(); return Status::OK(); } - virtual Status Commit(K keys, const ValuePtr* value_ptr) { + virtual Status Commit(K keys, const void* value_ptr) { LOG(FATAL) << "Unsupport Commit in Storage: " << typeid(this).name(); return Status::OK(); @@ -222,19 +213,12 @@ class SingleTierStorage : public Storage { return; } - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - return; - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - return; - } + virtual void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override {} Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { mutex_lock l(Storage::mu_); return kv_->GetSnapshot(key_list, value_ptr_list); } @@ -247,7 +231,7 @@ class SingleTierStorage : public Storage { ShrinkArgs& shrink_args, int64 value_len, V* default_value) override { - std::vector*> value_ptr_list; + std::vector value_ptr_list; std::vector key_list_tmp; TF_CHECK_OK(kv_->GetSnapshot( &key_list_tmp, &value_ptr_list)); @@ -255,30 +239,16 @@ class SingleTierStorage : public Storage { if (emb_config.is_primary()) { Shrink(key_list_tmp, value_ptr_list, shrink_args, value_len); } - TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list_tmp, - value_ptr_list))); + value_ptr_list, + SingleTierStorage::feat_desc_))); return Status::OK(); } - void SetAllocLen(int64 value_len, int slot_num) override { - 
while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. - Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); - } - Storage::flag_.clear(std::memory_order_release); - } - bool IsMultiLevel() override { return false; } @@ -299,16 +269,22 @@ class SingleTierStorage : public Storage { LOG(FATAL) << "Unsupport Schedule in SingleTierStorage."; } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + kv_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + protected: - virtual void SetTotalDims(int64 total_dims) = 0; + virtual void* CreateValuePtr() { + return feat_desc_->Allocate(); + } - virtual ValuePtr* CreateValuePtr(int64 size) { - return layout_creator_->Create(alloc_, size); + virtual void DestroyValuePtr(void* value_ptr) { + feat_desc_->Deallocate(value_ptr); } - virtual void DestroyValuePtr(ValuePtr* value_ptr) { - value_ptr->Destroy(alloc_); - delete value_ptr; + FeatureDescriptor* feature_descriptor() { + return feat_desc_; } protected: virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, @@ -324,7 +300,7 @@ class SingleTierStorage : public Storage { } virtual void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) { mutex_lock l(Storage::mu_); @@ -339,31 +315,40 @@ class SingleTierStorage : public Storage { KVInterface* kv_; ShrinkPolicy* shrink_policy_; Allocator* alloc_; - LayoutCreator* layout_creator_; + FeatureDescriptor* feat_desc_; }; template class DramStorage : public SingleTierStorage { public: - DramStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, - KVInterface* kv) - : SingleTierStorage(sc, 
alloc, kv, lc) {} + DramStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LocklessHashMap(feat_desc), feat_desc) {} ~DramStorage() override {} Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { return SingleTierStorage::kv_->BatchCommit(keys, value_ptrs); } - Status TryInsert(K key, ValuePtr* value_ptr) { + Status TryInsert(K key, void* value_ptr) { return SingleTierStorage::kv_->Insert(key, value_ptr); } - Status Commit(K keys, const ValuePtr* value_ptr) override{ + Status Commit(K keys, const void* value_ptr) override{ return SingleTierStorage::kv_->Commit(keys, value_ptr); } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + void* value_ptr = SingleTierStorage::feat_desc_->Allocate(freq); + SingleTierStorage::Insert(key, &value_ptr); + SingleTierStorage::feat_desc_->SetValue(value_ptr, emb_index, value); + SingleTierStorage::feat_desc_->SetFreq(value_ptr, freq); + SingleTierStorage::feat_desc_->UpdateVersion(value_ptr, version); + } TF_DISALLOW_COPY_AND_ASSIGN(DramStorage); public: @@ -375,12 +360,8 @@ class DramStorage : public SingleTierStorage { friend class HbmDramSsdStorage; #endif protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); - } - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -395,9 +376,10 @@ class DramStorage : public SingleTierStorage { template class HbmStorage : public SingleTierStorage { public: - HbmStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new GPUHashMapKV(sc.embedding_config, alloc), lc) { + HbmStorage(const StorageConfig& sc, Allocator* gpu_allocator, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new GPUHashMapKV( + 
sc.embedding_config, gpu_allocator), feat_desc) { } ~HbmStorage() override {} @@ -488,48 +470,27 @@ class HbmStorage : public SingleTierStorage { gpu_kv->Import(key_import, value_import, device, emb_config); return Status::OK(); } - - void SetTotalDims(int64 total_dims) override {} }; template class HbmStorageWithCpuKv: public SingleTierStorage { public: - HbmStorageWithCpuKv(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + HbmStorageWithCpuKv(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~HbmStorageWithCpuKv() override {} - void Insert(K key, ValuePtr* value_ptr) override { - do { - Status s = SingleTierStorage::kv_->Insert(key, value_ptr); - if (s.ok()) { - break; - } else { - value_ptr->Destroy(SingleTierStorage::alloc_); - delete value_ptr; - } - } while (!(SingleTierStorage::kv_->Lookup(key, &value_ptr)).ok()); - } - - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - SingleTierStorage::Insert(key, value_ptr, alloc_len, to_dram); - } - - Status TryInsert(K key, ValuePtr* value_ptr) { + Status TryInsert(K key, void* value_ptr) { return SingleTierStorage::kv_->Insert(key, value_ptr); } public: friend class HbmDramStorage; friend class HbmDramSsdStorage; protected: - void SetTotalDims(int64 total_dims) override {} - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -544,28 +505,25 @@ class HbmStorageWithCpuKv: public SingleTierStorage { template class PmemMemkindStorage : public SingleTierStorage { public: - PmemMemkindStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + PmemMemkindStorage(const StorageConfig& sc, + FeatureDescriptor* 
feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~PmemMemkindStorage() override {} TF_DISALLOW_COPY_AND_ASSIGN(PmemMemkindStorage); - - protected: - void SetTotalDims(int64 total_dims) override {} }; template class PmemLibpmemStorage : public SingleTierStorage { public: - PmemLibpmemStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + PmemLibpmemStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~PmemLibpmemStorage() override {} - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -573,10 +531,8 @@ class PmemLibpmemStorage : public SingleTierStorage { protected: friend class DramPmemStorage; - void SetTotalDims(int64 total_dims) override {} - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -590,15 +546,15 @@ class PmemLibpmemStorage : public SingleTierStorage { template class LevelDBStore : public SingleTierStorage { public: - LevelDBStore(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LevelDBKV(sc.path), lc) { + LevelDBStore(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LevelDBKV(sc.path, feat_desc), feat_desc) { } ~LevelDBStore() override {} TF_DISALLOW_COPY_AND_ASSIGN(LevelDBStore); - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -608,29 +564,25 @@ class LevelDBStore : public SingleTierStorage { LevelDBKV* leveldb_kv = reinterpret_cast*>(SingleTierStorage::kv_); return new DBValueIterator( - key_list, 
emb_index, value_len, leveldb_kv); + key_list, emb_index, value_len, + leveldb_kv, SingleTierStorage::feat_desc_); } public: friend class DramLevelDBStore; - - protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); - } }; template class SsdHashStorage : public SingleTierStorage { public: - SsdHashStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new SSDHashKV(sc.path, alloc), lc) { + SsdHashStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new SSDHashKV(sc.path, feat_desc), feat_desc) { } ~SsdHashStorage() override {} TF_DISALLOW_COPY_AND_ASSIGN(SsdHashStorage); - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -691,8 +643,9 @@ class SsdHashStorage : public SingleTierStorage { #endif protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); + void Init() override { + dynamic_cast*>( + SingleTierStorage::kv_)->Init(); } }; } // embedding diff --git a/tensorflow/core/framework/embedding/ssd_hash_kv.h b/tensorflow/core/framework/embedding/ssd_hash_kv.h index 8040421233e..f51c6904a50 100644 --- a/tensorflow/core/framework/embedding/ssd_hash_kv.h +++ b/tensorflow/core/framework/embedding/ssd_hash_kv.h @@ -25,17 +25,12 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/ssd_record_descriptor.h" #include "tensorflow/core/framework/embedding/emb_file_creator.h" #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/util/env_var.h" namespace tensorflow { - -template -class ValuePtr; - namespace embedding { class EmbPosition { public: @@ -115,55 +110,6 @@ class SSDIterator { } } - virtual void Key(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - memcpy((char*)val, &((file_map_[f_id])[curr_vec_].first), dim); - } - - virtual void Value(char* val, int64 dim, int64 value_offset) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, dim, - posi->offset_ + value_offset + sizeof(FixedLengthHeader)); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_ + - value_offset + sizeof(FixedLengthHeader), dim); - } - } - - virtual void Freq(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, sizeof(FixedLengthHeader), - posi->offset_); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_, - sizeof(FixedLengthHeader)); - } - *((int64*)val) = - reinterpret_cast(val)->GetFreqCounter(); - } - - virtual void Version(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, sizeof(FixedLengthHeader), - posi->offset_); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_, - sizeof(FixedLengthHeader)); - } - *((int64*)val) = - 
reinterpret_cast(val)->GetGlobalStep(); - } - virtual K Key() { int64 f_id = file_id_vec_[curr_file_]; return (file_map_[f_id])[curr_vec_].first; @@ -192,8 +138,9 @@ class SSDIterator { template class SSDHashKV : public KVInterface { public: - explicit SSDHashKV(const std::string& path, Allocator* alloc) - : alloc_(alloc) { + explicit SSDHashKV(const std::string& path, + FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { path_ = io::JoinPath( path, "ssd_kv_" + std::to_string(Env::Default()->NowMicros()) + "_"); hash_map_.max_load_factor(0.8); @@ -205,9 +152,6 @@ class SSDHashKV : public KVInterface { evict_file_set_.set_counternum(16); evict_file_set_.set_deleted_key(DELETED_KEY); - new_value_ptr_fn_ = [this](size_t size) { - return new NormalContiguousValuePtr(alloc_, size); - }; is_async_compaction_ = true; TF_CHECK_OK(ReadBoolFromEnvVar("TF_SSDHASH_ASYNC_COMPACTION", true, &is_async_compaction_)); @@ -224,7 +168,7 @@ class SSDHashKV : public KVInterface { "Use Sync Compactor in SSDHashKV of Multi-tier Embedding Storage!"; compaction_fn_ = [this](){Compaction();}; check_buffer_fn_ = [this](){CheckBuffer();}; - save_kv_fn_ = [this](K key, const ValuePtr* value_ptr, + save_kv_fn_ = [this](K key, const void* value_ptr, bool is_compaction=false) { SaveKV(key, value_ptr, is_compaction); }; @@ -233,7 +177,7 @@ class SSDHashKV : public KVInterface { "Use Async Compactor in SSDHashKV of Multi-tier Embedding Storage!"; compaction_fn_ = [](){}; check_buffer_fn_ = [this](){CheckBufferAsync();}; - save_kv_fn_ = [this](K key, const ValuePtr* value_ptr, + save_kv_fn_ = [this](K key, const void* value_ptr, bool is_compaction=false) { SaveKVAsync(key, value_ptr, is_compaction); }; @@ -244,9 +188,8 @@ class SSDHashKV : public KVInterface { } } - void SetTotalDims(int total_dims) override { - total_dims_ = total_dims; - val_len_ = sizeof(FixedLengthHeader) + total_dims_ * sizeof(V); + void Init() { + val_len_ = feat_desc_->data_bytes(); max_app_count_ = BUFFER_SIZE / 
val_len_; write_buffer_ = new char[BUFFER_SIZE]; unsigned int max_key_count = 1 + int(BUFFER_SIZE / val_len_); @@ -334,18 +277,18 @@ class SSDHashKV : public KVInterface { return Status::OK(); } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { auto iter = hash_map_.find_wait_free(key); if (iter.first == EMPTY_KEY) { return errors::NotFound("Unable to find Key: ", key, " in SSDHashKV."); } else { - ValuePtr* val = new_value_ptr_fn_(total_dims_); + void* val = feat_desc_->Allocate(); EmbPosition* posi = iter.second; if (posi->flushed_) { - emb_files_[posi->version_]->Read((char*)(val->GetPtr()), + emb_files_[posi->version_]->Read((char*)val, val_len_, posi->offset_); } else { - memcpy((char*)val->GetPtr(), + memcpy((char*)val, write_buffer_ + posi->buffer_offset_, val_len_); } *value_ptr = val; @@ -363,17 +306,17 @@ class SSDHashKV : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { return Status::OK(); } Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return BatchCommit(keys, value_ptrs); } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { compaction_fn_(); __sync_fetch_and_add(&total_app_count_, keys.size()); for (int i = 0; i < keys.size(); i++) { @@ -384,7 +327,7 @@ class SSDHashKV : public KVInterface { return Status::OK(); } - Status Commit(K key, const ValuePtr* value_ptr) override { + Status Commit(K key, const void* value_ptr) override { compaction_fn_(); __sync_fetch_and_add(&total_app_count_, 1); check_buffer_fn_(); @@ -402,7 +345,7 @@ class SSDHashKV : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { return Status::OK(); } @@ -467,8 +410,8 
@@ class SSDHashKV : public KVInterface { int64 Size() const override { return hash_map_.size_lockless(); } - void FreeValuePtr(ValuePtr* value_ptr) override { - delete value_ptr; + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); } private: @@ -555,10 +498,10 @@ class SSDHashKV : public KVInterface { } void AppendToWriteBuffer(size_t curr_buffer_offset, K key, - const ValuePtr* value_ptr) { + const void* value_ptr) { current_offset_ += val_len_; memcpy(write_buffer_ + curr_buffer_offset, - (char*)value_ptr->GetPtr(), val_len_); + (char*)value_ptr, val_len_); key_buffer_[buffer_cur_] = key; ++buffer_cur_; } @@ -582,7 +525,7 @@ class SSDHashKV : public KVInterface { return flag; } - void SaveKV(K key, const ValuePtr* value_ptr, + void SaveKV(K key, const void* value_ptr, bool is_compaction = false) { size_t curr_buffer_offset = buffer_cur_ * val_len_; EmbPosition* ep = new EmbPosition(current_offset_, current_version_, @@ -608,7 +551,7 @@ class SSDHashKV : public KVInterface { } } - void SaveKVAsync(K key, const ValuePtr* value_ptr, + void SaveKVAsync(K key, const void* value_ptr, bool is_compaction = false) { size_t curr_buffer_offset = buffer_cur_ * val_len_; EmbPosition* ep = new EmbPosition(current_offset_, evict_version_, @@ -681,21 +624,21 @@ class SSDHashKV : public KVInterface { } void MoveToNewFile() { - ValuePtr* val = new_value_ptr_fn_(total_dims_); + void* val = feat_desc_->Allocate(); for (auto it : evict_file_map_) { EmbFile* file = emb_files_[it.first]; total_app_count_ -= file->InvalidCount(); file->MapForRead(); for (auto it_vec : it.second) { EmbPosition* posi = it_vec.second; - file->ReadWithMemcpy((char*)(val->GetPtr()), val_len_, + file->ReadWithMemcpy((char*)val, val_len_, posi->offset_); CheckBuffer(); SaveKV(it_vec.first, val, true); } file->UnmapForRead(); } - delete val; + feat_desc_->Deallocate(val); } void MoveToNewFileAsync() { @@ -825,11 +768,10 @@ class SSDHashKV : public KVInterface { char* 
write_buffer_ = nullptr; K* key_buffer_ = nullptr; bool is_async_compaction_; - Allocator* alloc_ = nullptr; + FeatureDescriptor* feat_desc_; int total_dims_; std::string path_; - std::function*(size_t)> new_value_ptr_fn_; typedef google::dense_hash_map_lockless LockLessHashMap; LockLessHashMap hash_map_; @@ -857,7 +799,7 @@ class SSDHashKV : public KVInterface { std::function compaction_fn_; std::function check_buffer_fn_; - std::function*, bool)> save_kv_fn_; + std::function save_kv_fn_; EmbFileCreator* emb_file_creator_ = nullptr; }; template diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index bb949183492..1ffb435054b 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -40,9 +40,6 @@ using GPUDevice = Eigen::GpuDevice; template class CheckpointLoader; -template -class ValuePtr; - template class EmbeddingVar; @@ -57,9 +54,6 @@ class BundleReader; template struct EmbeddingVarContext; -namespace { - const int kSavedPartitionNum = 1000; -} namespace embedding { template @@ -67,42 +61,40 @@ class Storage { friend class CheckpointLoader; public: explicit Storage(const StorageConfig& storage_config) - : storage_config_(storage_config) {} + : storage_config_(storage_config) { + initialize_value_.resize(storage_config.embedding_config.slot_num + 1); + } virtual ~Storage() {} TF_DISALLOW_COPY_AND_ASSIGN(Storage); - virtual Status Get(K key, ValuePtr** value_ptr) = 0; + virtual Status Get(K key, void** value_ptr) = 0; #if GOOGLE_CUDA virtual void BatchGet(const EmbeddingVarContext& ctx, const K* key, - ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) {} + void** value_ptr_list, + int64 num_of_keys) {} virtual void BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* key, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_found_cursor_list) {} #endif //GOOGLE_CUDA virtual 
Status Contains(K key) = 0; - virtual void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) = 0; - virtual void Insert(K key, ValuePtr* value_ptr) = 0; - virtual void SetAllocLen(int64 value_len, int slot_num) = 0; + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) = 0; + virtual void Insert(K key, void** value_ptr) = 0; + virtual void Init() {} virtual void SetValueLen(int64 value_len) {} - virtual Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) = 0; - virtual Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) = 0; + virtual Status GetOrCreate(K key, void** value_ptr) = 0; virtual int LookupTier(K key) const = 0; virtual Status Remove(K key) = 0; virtual int64 Size() const = 0; virtual int64 Size(int level) const = 0; virtual Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) = 0; + std::vector* value_ptr_list) = 0; virtual Status Save( const string& tensor_name, const string& prefix, @@ -113,7 +105,7 @@ class Storage { V* default_value) = 0; virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) = 0; + const std::vector& value_ptrs) = 0; virtual Status Eviction(K* evict_ids, int64 evict_size) = 0; @@ -121,7 +113,7 @@ class Storage { int total, const K* keys, const std::list& copyback_cursor, V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -149,25 +141,11 @@ class Storage { Allocator* alloc, int64 value_len, int64 block_size) = 0; - virtual void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) = 0; - virtual void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, int64 num_of_value_ptrs) = 0; inline mutex* get_mutex() { return &mu_; } inline int64 GetAllocLen() { return alloc_len_; } inline int64 GetOffset(int64 index) { return alloc_len_ * index; } inline 
int64 GetTotalDims() { return total_dims_; } - inline int64 ComputeAllocLen(int64 value_len) { - if (LayoutType::COMPACT == storage_config_.layout_type) { - return value_len; - } else { - return (value_len * sizeof(V) % 16 == 0) - ? value_len - : value_len + (16 - (sizeof(V) * value_len) % 16) / sizeof(V); - } - } - inline LayoutType GetLayoutType() { return storage_config_.layout_type; } inline embedding::StorageType GetStorageType() { return storage_config_.type; } inline std::string GetStoragePath() { return storage_config_.path; } inline embedding::CacheStrategy @@ -183,7 +161,7 @@ class Storage { } inline void Insert(const std::vector& keys, - ValuePtr** value_ptrs) { + void** value_ptrs) { for (size_t i = 0; i < keys.size(); i++) { Insert(keys[i], value_ptrs[i]); } @@ -211,6 +189,13 @@ class Storage { reset_version, reader); restorer.RestoreCkpt(emb_config, device); }; + + virtual void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) = 0; + + virtual void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) = 0; protected: virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, @@ -227,12 +212,7 @@ class Storage { const std::string& ssd_emb_file_name, EmbeddingVar* ev, RestoreSSDBuffer& restore_buff) { - int64 alloc_len = Storage::ComputeAllocLen(value_len); - auto* alloc = ev->GetAllocator(); for (int64 i = 0; i < restore_buff.num_of_keys; i++) { - ValuePtr* value_ptr = nullptr; - ev->LookupOrCreateKey(restore_buff.key_list_buf[i], &value_ptr); - value_ptr->SetInitialized(emb_index); int64 file_id = restore_buff.key_file_id_list_buf[i]; int64 key_offset = restore_buff.key_offset_list_buf[i]; // Read data from embedding files on SSD. 
Data are stored in @@ -240,32 +220,29 @@ class Storage { std::stringstream ss; ss << ssd_emb_file_name << "/" << file_id << ".emb"; int fd = open(ss.str().data(), O_RDONLY); + EmbeddingConfig& emb_config = storage_config_.embedding_config; + FeatureDescriptor normal_feat_desc( + emb_config.block_num, emb_config.slot_num + 1, + ev_allocator(), StorageType::DRAM, true, + true, {false, 0}); + void* value_ptr = normal_feat_desc.Allocate(); char* file_addr = (char*)mmap(nullptr, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1) + + normal_feat_desc.data_bytes() + key_offset, PROT_READ, MAP_PRIVATE, fd, 0); - - NormalContiguousValuePtr tmp_value_ptr(alloc, - alloc_len * (emb_slot_num + 1)); - void* ptr = tmp_value_ptr.GetPtr(); - memcpy(ptr, file_addr + key_offset, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1)); + memcpy(value_ptr, file_addr + key_offset, + normal_feat_desc.data_bytes()); munmap(file_addr, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1) + + normal_feat_desc.data_bytes() + key_offset); close(fd); // Copy Data to ValuePtr, data of slots are set by primary here. 
- for (int j = 0; j < emb_slot_num + 1; j++) { - V* value = tmp_value_ptr.GetValue(j, alloc_len * j); - if (value != nullptr) { - value_ptr->GetOrAllocate(alloc, value_len, value, j, alloc_len * j); - } - } - value_ptr->SetFreq(tmp_value_ptr.GetFreq()); - value_ptr->SetStep(tmp_value_ptr.GetStep()); + int64 import_freq = normal_feat_desc.GetFreq(value_ptr); + int64 import_version = normal_feat_desc.GetVersion(value_ptr); + V* value = normal_feat_desc.GetEmbedding(value_ptr, emb_index); + Import(restore_buff.key_list_buf[i], value, + import_freq, import_version, emb_index); + normal_feat_desc.Deallocate(value_ptr); } return Status::OK(); } @@ -273,10 +250,11 @@ class Storage { private: void GeneratePartitionedCkptData( const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, EmbeddingVarCkptData* partitioned_ckpt_data, const EmbeddingConfig& emb_config, - V* default_value) { + V* default_value, + FeatureDescriptor* feat_desc) { std::vector> ev_ckpt_data_parts(kSavedPartitionNum); @@ -293,7 +271,43 @@ class Storage { ev_ckpt_data_parts[part_id].Emplace( key_list[i], value_ptr_list[i], emb_config, default_value, - GetOffset(emb_config.emb_index), + feat_desc, + is_save_freq, + is_save_version, + save_unfiltered_features); + break; + } + } + } + + partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts); + } + + void GeneratePartitionedCkptData( + const std::vector& key_list, + const std::vector& value_ptr_list, + EmbeddingVarCkptData* partitioned_ckpt_data, + const EmbeddingConfig& emb_config, + V* default_value, + const std::vector*>& feat_desc) { + std::vector> + ev_ckpt_data_parts(kSavedPartitionNum); + + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar( + "TF_EV_SAVE_FILTERED_FEATURES", true, &save_unfiltered_features)); + + bool is_save_freq = emb_config.is_save_freq(); + bool is_save_version = emb_config.is_save_version(); + + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id 
= 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + int feat_desc_type = (int64)value_ptr_list[i] >> kDramFlagOffset; + ev_ckpt_data_parts[part_id].Emplace( + key_list[i], value_ptr_list[i], + emb_config, default_value, + feat_desc[feat_desc_type], is_save_freq, is_save_version, save_unfiltered_features); @@ -333,12 +347,33 @@ class Storage { int64 value_len, V* default_value, const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, + FeatureDescriptor* feat_desc, + ValueIterator* value_iter = nullptr) { + EmbeddingVarCkptData partitioned_ckpt_data; + GeneratePartitionedCkptData(key_list, value_ptr_list, + &partitioned_ckpt_data, emb_config, + default_value, feat_desc); + Status s = + partitioned_ckpt_data.ExportToCkpt( + tensor_name, writer, value_len, value_iter); + return Status::OK(); + } + + Status SaveToCheckpoint( + const string& tensor_name, + BundleWriter* writer, + const EmbeddingConfig& emb_config, + int64 value_len, + V* default_value, + const std::vector& key_list, + const std::vector& value_ptr_list, + const std::vector*>& feat_desc, ValueIterator* value_iter = nullptr) { EmbeddingVarCkptData partitioned_ckpt_data; GeneratePartitionedCkptData(key_list, value_ptr_list, &partitioned_ckpt_data, emb_config, - default_value); + default_value, feat_desc); Status s = partitioned_ckpt_data.ExportToCkpt( tensor_name, writer, value_len, value_iter); @@ -366,6 +401,7 @@ class Storage { mutex mu_; std::atomic_flag flag_ = ATOMIC_FLAG_INIT; + std::vector initialize_value_; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/storage_config.h b/tensorflow/core/framework/embedding/storage_config.h index 85e44879dcb..23babc9ef08 100644 --- a/tensorflow/core/framework/embedding/storage_config.h +++ b/tensorflow/core/framework/embedding/storage_config.h @@ -17,13 +17,11 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/embedding_config.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" namespace tensorflow { namespace embedding { struct StorageConfig { StorageConfig() : type(StorageType::DEFAULT), path(""), - layout_type(LayoutType::NORMAL), cache_strategy(CacheStrategy::LFU) { size = {1<<30,1<<30,1<<30,1<<30}; } @@ -31,32 +29,14 @@ struct StorageConfig { StorageConfig(StorageType t, const std::string& p, const std::vector& s, - const std::string& layout, const EmbeddingConfig& ec, const CacheStrategy cache_strategy_ = CacheStrategy::LFU) - : type(t), - path(p), - embedding_config(ec), - cache_strategy(cache_strategy_) { - if ("normal" == layout) { - layout_type = LayoutType::NORMAL; - } else if ("light" == layout) { - layout_type = LayoutType::LIGHT; - } else if ("normal_contiguous" == layout){ - layout_type = LayoutType::NORMAL_CONTIGUOUS; - } else if ("normal_contiguous_gpu" == layout){ - layout_type = LayoutType::NORMAL_CONTIGUOUS_GPU; - } else if ("compact" == layout){ - layout_type = LayoutType::COMPACT; - } else { - LOG(WARNING) << "Unknown layout: " - << layout << ", use LayoutType::NORMAL by default."; - layout_type = LayoutType::NORMAL; - } - size = s; - } + : type(t), + path(p), + size(s), + embedding_config(ec), + cache_strategy(cache_strategy_) {} StorageType type; - LayoutType layout_type; std::string path; std::vector size; CacheStrategy cache_strategy; diff --git a/tensorflow/core/framework/embedding/storage_factory.h b/tensorflow/core/framework/embedding/storage_factory.h index 10d2d52b83f..c585b058470 100644 --- a/tensorflow/core/framework/embedding/storage_factory.h +++ b/tensorflow/core/framework/embedding/storage_factory.h @@ -16,7 +16,6 @@ limitations under the License. 
#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ #include "tensorflow/core/framework/embedding/config.pb.h" -#include "tensorflow/core/framework/embedding/layout_creator.h" #include "tensorflow/core/framework/embedding/dram_leveldb_storage.h" #include "tensorflow/core/framework/embedding/dram_pmem_storage.h" #include "tensorflow/core/framework/embedding/dram_ssd_storage.h" @@ -34,50 +33,41 @@ class StorageFactory { public: template static Storage* Create(const StorageConfig& sc, - Allocator* gpu_allocator, const string& name) { - auto layout_creator = LayoutCreatorFactory::Create(sc); - + Allocator* gpu_allocator, FeatureDescriptor* feat_desc, + const string& name) { switch (sc.type) { case StorageType::DRAM: - return new DramStorage(sc, ev_allocator(), - layout_creator, new LocklessHashMap()); + return new DramStorage(sc, feat_desc); case StorageType::PMEM_MEMKIND: - return new PmemMemkindStorage(sc, pmem_allocator(), - layout_creator); + feat_desc->SetAllocator(pmem_allocator()); + return new PmemMemkindStorage(sc, feat_desc); case StorageType::PMEM_LIBPMEM: - return new PmemLibpmemStorage(sc, - experimental_pmem_allocator(sc.path, sc.size[0]), - layout_creator); + feat_desc->SetAllocator( + experimental_pmem_allocator(sc.path, sc.size[0])); + return new PmemLibpmemStorage(sc, feat_desc); case StorageType::DRAM_PMEM: - return new DramPmemStorage(sc, ev_allocator(), - experimental_pmem_allocator(sc.path, sc.size[0]), - layout_creator, name); + return new DramPmemStorage(sc, + feat_desc, name); case StorageType::LEVELDB: case StorageType::DRAM_LEVELDB: - return new DramLevelDBStore(sc, ev_allocator(), - layout_creator, name); + return new DramLevelDBStore(sc, feat_desc, name); case StorageType::SSDHASH: case StorageType::DRAM_SSDHASH: - return new DramSsdHashStorage(sc, ev_allocator(), - layout_creator, name); + return new DramSsdHashStorage(sc, feat_desc, name); case StorageType::HBM: #if GOOGLE_CUDA - return new HbmStorage(sc, gpu_allocator, - 
layout_creator); + return new HbmStorage(sc, gpu_allocator, feat_desc); #endif // GOOGLE_CUDA case StorageType::HBM_DRAM: #if GOOGLE_CUDA - return new HbmDramStorage(sc, gpu_allocator, - ev_allocator(), layout_creator, name); + return new HbmDramStorage(sc, gpu_allocator, feat_desc, name); #endif // GOOGLE_CUDA case StorageType::HBM_DRAM_SSDHASH: #if GOOGLE_CUDA - return new HbmDramSsdStorage(sc, gpu_allocator, - ev_allocator(), layout_creator, name); + return new HbmDramSsdStorage(sc, gpu_allocator, feat_desc, name); #endif // GOOGLE_CUDA default: - return new DramStorage(sc, ev_allocator(), - layout_creator, new LocklessHashMap()); + return new DramStorage(sc, feat_desc); } } }; diff --git a/tensorflow/core/framework/embedding/value_ptr.h b/tensorflow/core/framework/embedding/value_ptr.h deleted file mode 100644 index ca7d234ed61..00000000000 --- a/tensorflow/core/framework/embedding/value_ptr.h +++ /dev/null @@ -1,647 +0,0 @@ -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ - -#include -#include -#include -#include - -#include "tensorflow/core/framework/typed_allocator.h" -#if GOOGLE_CUDA -#include -#endif // GOOGLE_CUDA - -namespace tensorflow { - -enum class LayoutType { - LIGHT, - NORMAL, - LEVELDB, - NORMAL_CONTIGUOUS, - NORMAL_CONTIGUOUS_GPU, - COMPACT, -}; - -namespace { -constexpr int COLUMN_BITSET_BYTES = 5; -constexpr int COLUMN_BITSET_SIZE = COLUMN_BITSET_BYTES * 8; - -struct MetaHeader { - unsigned char embed_num; - unsigned char value_type; - unsigned char header_size; - unsigned char column_bitset[COLUMN_BITSET_BYTES]; - - static const int kEmbeddingNumStartIndex = 0; - static const int kValueTypeStartIndex = - kEmbeddingNumStartIndex + sizeof(char); - static const int kHeaderSizeStartIndex = - kValueTypeStartIndex + sizeof(char); - static const int kColumnBitsetIndex = - kHeaderSizeStartIndex + sizeof(char); - - inline unsigned int GetEmbeddingNum() { - return (unsigned int) 
embed_num; - } - - inline void SetEmbeddingNum(size_t s) { - embed_num = (unsigned char)s; - } - - inline std::bitset GetColumnBitset() { - unsigned long meta = ((unsigned long*)this)[0]; - std::bitset bs(meta >> (8 * kColumnBitsetIndex)); - return bs; - } - - inline void SetColumnBitset(const std::bitset& bs, - unsigned int embnum) { - ((unsigned long*)(this))[0] = - (bs.to_ulong() << (8 * kColumnBitsetIndex)) | - (header_size << (8 * kHeaderSizeStartIndex)) | - (value_type << (8 * kValueTypeStartIndex)) | - (embnum << (8 * kEmbeddingNumStartIndex)); - } - - inline unsigned int GetHeaderSize() { - return (unsigned int) header_size; - } - - inline void SetHeaderSize(size_t size) { - header_size = (unsigned char)size; - } - - inline void SetLayoutType(LayoutType vt) { - value_type = (unsigned char)vt; - } - - inline LayoutType GetLayoutType() { - return (LayoutType)value_type; - } -}; - -struct LightHeader { -/*__________________________________________________________________________________________ - | | | | | embedding | slot | - | number of | valueptr | header | each bit a V* | V* | V* | - | embedding | type | size | 1 valid | actually pointer | actually pointer |... - | columns | | | 0 no-valid | by alloctor | by alloctor | - | (8 bits) | (8 bits) | (8 bits) | (40 bits) | (8 bytes) | (8 bytes) | - -------------------------------------------------------------------------------------------- -*/ - MetaHeader meta; - LightHeader() { - memset(this, 0, sizeof(LightHeader)); - meta.SetLayoutType(LayoutType::LIGHT); - meta.SetHeaderSize(sizeof(LightHeader) / sizeof(int64)); - } -}; - -struct NormalHeader { -/*_________________________________________________________________________________________________________________________ - | | | | | | | embedding | slot | - | number of | valueptr | header | each bit a V* | global step | freq counter | V* | V* | - | embedding | type | size | 1 valid | | | actually pointer | actually pointer |... 
- | columns | | | 0 no-valid | int64 | int64 | by alloctor | by alloctor | - | (8 bits) | (8 bits) | (8 bits) | (40 bits) | (8 bytes) | (8 bytes) | (8 bytes) | (8 bytes) | - -------------------------------------------------------------------------------------------------------------------------- - */ - MetaHeader meta; - int64 global_step; - int64 freq_counter; - - NormalHeader() { - memset(this, 0, sizeof(NormalHeader)); - meta.SetLayoutType(LayoutType::NORMAL); - meta.SetHeaderSize(sizeof(NormalHeader) / sizeof(int64)); - SetGlobalStep(-1); - } - - inline int64 GetGlobalStep() { - return global_step; - } - - inline void SetGlobalStep(int64 gs) { - global_step = gs; - } - - inline int64 GetFreqCounter() { - return freq_counter; - } - - inline void SetFreqCounter(int64 fc) { - freq_counter = fc; - } - - inline void AddFreq() { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + 1); - } - - inline void AddFreq(int64 count) { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + count); - } -}; - -struct FixedLengthHeader { -/*_________________________________________________________________________________ - | | | embeddings | - | slotflag + global step | freq counter | V | - | | | actually value | - | int64 | int64 | by alloctor | - | (8 bytes) | (8 bytes) | (4 * slot_num * emb_dim bytes) | - --------------------------------------------------------------------------------- -*/ - int64 global_step; - int64 freq_counter; - - FixedLengthHeader() { - memset(this, 0, sizeof(FixedLengthHeader)); - SetGlobalStep(-1); - } - - inline int64 GetGlobalStep() { - return global_step & 0x0000ffffffffffff; - } - - inline void SetGlobalStep(int64 gs) { - int64 temp = global_step; - temp &= 0xffff000000000000; - gs &= 0x0000ffffffffffff; - temp |= gs; - global_step = temp; - } - - inline void SetInitialized(int64 emb_index) { - int64 temp = 1; - temp = temp << (48 + emb_index); - global_step |= temp; - } - - inline int64 
GetFreqCounter() { - return freq_counter; - } - - inline void SetFreqCounter(int64 fc) { - freq_counter = fc; - } - - inline void AddFreq() { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + 1); - } - - inline void AddFreq(int64 count) { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + count); - } -}; -} // namespace - -template -class ValuePtr { - public: - virtual ~ValuePtr() {} - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) = 0; - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) = 0; - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) = 0; - - virtual void Destroy(Allocator* allocator) = 0; - - virtual void* GetPtr() const = 0; - - // Global Step - virtual int64 GetStep() { - LOG(FATAL) << "Unsupport GlobalStep in subclass of ValuePtrBase"; - return 0; - } - - virtual void SetStep(int64 gs) {} - - // Frequency Counter - virtual int64 GetFreq() { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - return 0; - } - - virtual void SetFreq(int64 freq) {} - - virtual void AddFreq() { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - } - - virtual void AddFreq(int64 count) { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - } - - virtual void SetValue(V val, size_t size) { - LOG(FATAL) << "Unsupport SetValue in subclass of ValuePtrBase"; - } - - virtual void SetInitialized(int64 emb_index) { - LOG(FATAL) << "Unsupport SetInitialized in subclass of ValuePtrBase"; - } - - virtual bool SetPtr(V* ptr) { - LOG(FATAL) << "Unsupport SetInitialized in subclass of ValuePtrBase"; - return false; - } - -}; - -template -class LooseValuePtr : public ValuePtr { - public: - virtual ~LooseValuePtr() {} - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const 
V* default_v, int emb_index, int offset) { - MetaHeader* meta = (MetaHeader*)ptr_; - unsigned int embnum = (unsigned int)meta->embed_num; - auto metadata = meta->GetColumnBitset(); - - if (!metadata.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - metadata = meta->GetColumnBitset(); - if (metadata.test(emb_index)) { - this->flag_.clear(std::memory_order_release); - return ((V**)((int64*)ptr_ + - (unsigned int)meta->header_size))[emb_index]; - } - embnum++ ; - int64 alloc_value_len = value_len; - V* tensor_val = (V*)allocator->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V) * alloc_value_len); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index] = tensor_val; - - metadata.set(emb_index); - // NOTE:if we use ((unsigned long*)((char*)ptr_ + 1))[0] = metadata.to_ulong(); - // the ptr_ will be occaionally modified from 0x7f18700912a0 to 0x700912a0 - // must use ((V**)ptr_ + 1 + 1)[emb_index] = tensor_val; to avoid - meta->SetColumnBitset(metadata, embnum); - this->flag_.clear(std::memory_order_release); - return tensor_val; - } else { - return ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index]; - } - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) { - return nullptr; - } - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) { - MetaHeader* meta = (MetaHeader*)ptr_; - auto metadata = meta->GetColumnBitset(); - if (metadata.test(emb_index)) { - return ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index]; - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - MetaHeader* meta = (MetaHeader*)ptr_; - unsigned int embnum = (unsigned int)meta->embed_num; - auto metadata = meta->GetColumnBitset(); - for (int i = 0; i< embnum; i++) { - if (metadata.test(i)) { - V* val = ((V**)((int64*)ptr_ + 
meta->GetHeaderSize()))[i]; - if (val != nullptr) { - allocator->DeallocateRaw(val); - } - } - } - } - - virtual void* GetPtr() const { - return ptr_; - } - - protected: - void* ptr_; - std::atomic_flag flag_ = ATOMIC_FLAG_INIT; -}; - -template -class LightValuePtr : public LooseValuePtr { - public: - LightValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*)malloc( - sizeof(LightHeader) + sizeof(int64) * size); - memset(static_cast(this->ptr_) + sizeof(LightHeader), 0, sizeof(int64) * size); - new ((char*)this->ptr_) LightHeader(); - } - - ~LightValuePtr() { - free(this->ptr_); - } -}; - -template -class NormalValuePtr : public LooseValuePtr { - public: - NormalValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*) malloc(sizeof(NormalHeader) + sizeof(int64) * size); - memset(static_cast(this->ptr_) + sizeof(NormalHeader), 0, sizeof(int64) * size); - new ((char*)this->ptr_) NormalHeader(); - } - - ~NormalValuePtr() { - free(this->ptr_); - } - - int64 GetStep() { - return ((NormalHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((NormalHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((NormalHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((NormalHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - return ((NormalHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - return ((NormalHeader*)this->ptr_)->AddFreq(count); - } -}; - -template -class NormalContiguousValuePtr : public LooseValuePtr { - public: - NormalContiguousValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = allocator->AllocateRaw(Allocator::kAllocatorAlignment, - sizeof(FixedLengthHeader) + sizeof(V) * size); - memset(static_cast(this->ptr_) + sizeof(FixedLengthHeader), 0, sizeof(V) * size); - new ((char*)this->ptr_) FixedLengthHeader(); - } - - ~NormalContiguousValuePtr() { - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, 
- const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset); - } - V* tensor_val = - ((V*)this->ptr_ + sizeof(FixedLengthHeader) / sizeof(V) + offset); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - return tensor_val; - } else { - return (V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset; - } - } - - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset); - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - allocator->DeallocateRaw(this->ptr_); - } - - int64 GetStep() { - return ((FixedLengthHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((FixedLengthHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((FixedLengthHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((FixedLengthHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - ((FixedLengthHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - ((FixedLengthHeader*)this->ptr_)->AddFreq(count); - } - - void SetValue(V val, size_t size) { - for (int i = 0; i < size; ++i) { - *((V*)this->ptr_ + sizeof(FixedLengthHeader) / sizeof(V) + i) = val; - } - } -}; - -template -class NormalGPUValuePtr : public LooseValuePtr { - public: - NormalGPUValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*) malloc(sizeof(FixedLengthHeader) + sizeof(V *)); - *(V**)((char *)this->ptr_ + 
sizeof(FixedLengthHeader)) = nullptr; - new ((char*)this->ptr_) FixedLengthHeader(); - } - - ~NormalGPUValuePtr() { - free(this->ptr_); - } - -#if GOOGLE_CUDA - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - V* tensor_val = - *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - cudaMemcpy(tensor_val, default_v, value_len * sizeof(V), - cudaMemcpyDeviceToDevice); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - } - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } -#endif // GOOGLE_CUDA - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, - bool &need_initialize) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - need_initialize = 1; - this->flag_.clear(std::memory_order_release); - return reinterpret_cast(this); - } - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - return; - } - - int64 GetStep() { - return 
((FixedLengthHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((FixedLengthHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((FixedLengthHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((FixedLengthHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - ((FixedLengthHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - ((FixedLengthHeader*)this->ptr_)->AddFreq(count); - } - - bool SetPtr(V* ptr) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - V* value_ptr = *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)); - if (value_ptr == nullptr) { - *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) = ptr; - this->flag_.clear(std::memory_order_release); - return true; - } else { - this->flag_.clear(std::memory_order_release); - return false; - } - } - - void SetInitialized(int64 emb_index) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - ((FixedLengthHeader*)this->ptr_)->SetInitialized(emb_index); - this->flag_.clear(std::memory_order_release); - } - -}; - -template -class CompactValuePtr : public ValuePtr { - public: - CompactValuePtr(Allocator* allocator, size_t size) { - memset(static_cast(this->ptr_), 0, sizeof(V) * size + sizeof(int64)); - } - - ~CompactValuePtr() { - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset); - } - V* tensor_val = - ((V*)this->ptr_ + sizeof(int64) / sizeof(V) + offset); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - return 
tensor_val; - } else { - return (V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset; - } - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) { - return nullptr; - } - - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset); - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - allocator->DeallocateRaw(this->ptr_); - } - - virtual void* GetPtr() const { - return (void*)ptr_; - } - - private: - char ptr_[23]; - std::atomic_flag flag_ = ATOMIC_FLAG_INIT; -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 115e3c4bae6..0c08c30c30a 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -439,7 +439,8 @@ tf_cc_test( tf_cuda_cc_test( name = "embedding_variable_ops_test", - srcs = ["embedding_variable_ops_test.cc"], + srcs = ["embedding_variable_ops_test.cc", + "embedding_variable_test.h"], extra_copts = ["-fexceptions", "-g"], deps = [ ":io", @@ -6497,7 +6498,7 @@ tf_kernel_library( "training_ali_ops_gpu.h", "training_ali_ops.h" ], - copts = tf_copts(), + copts = tf_copts() + ["-g"], deps = [ ":bounds_check", ":training_op_helpers", diff --git a/tensorflow/core/kernels/embedding_variable_memory_test.cc b/tensorflow/core/kernels/embedding_variable_memory_test.cc index 7ec6b1cf109..393e9a9754b 100644 --- a/tensorflow/core/kernels/embedding_variable_memory_test.cc +++ b/tensorflow/core/kernels/embedding_variable_memory_test.cc @@ -19,17 +19,22 @@ namespace embedding { float PerfMemory(Tensor& default_value, const std::vector& id_list, int value_size, int64 default_value_dim, - int64 filter_freq = 0) { + int64 filter_freq = 0, int64 
steps_to_live = 0, + int64 record_freq = false) { auto ev = CreateEmbeddingVar(value_size, default_value, - default_value_dim, filter_freq); - ValuePtr* value_ptr = nullptr; + default_value_dim, filter_freq, + steps_to_live, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + record_freq); + void* value_ptr = nullptr; bool is_filter = false; double start_mem, end_mem; start_mem = getResident() * getpagesize(); for (int i = 0; i < id_list.size(); i++) { ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); if (is_filter) - ev->flat(value_ptr, id_list[i]); + ev->flat(value_ptr); } end_mem = getResident() * getpagesize(); double used_mb = (end_mem - start_mem)/1000000; @@ -58,7 +63,7 @@ TEST(EmbeddingVariabelMemoryTest, TestMemory) { float used_mb = PerfMemory(default_value, id_list, value_size, default_value_dim); float theoritical_mb = - 50 + num_of_ids * (32 + 32 + value_size * sizeof(float))/ 1000000; + 50 + num_of_ids * (value_size * sizeof(float)) / 1000000; EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && (used_mb < theoritical_mb * 1.01)); @@ -68,9 +73,10 @@ TEST(EmbeddingVariabelMemoryTest, TestMemory) { used_mb = PerfMemory(default_value, id_list, value_size, default_value_dim, filter_freq); theoritical_mb = - 50 + num_of_ids * (32 + 32 + 16 + value_size * sizeof(float)/2)/ 1000000; + 50 + num_of_ids * (8 + value_size * sizeof(float) / 2 + + 4/*memory for ids_list*/) / 1000000; EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && - (used_mb < theoritical_mb * 1.01)); + (used_mb < theoritical_mb * 1.02)); } } //namespace embedding } //namespace tensorflow diff --git a/tensorflow/core/kernels/embedding_variable_ops_test.cc b/tensorflow/core/kernels/embedding_variable_ops_test.cc index 4839c171708..e30381fef07 100644 --- a/tensorflow/core/kernels/embedding_variable_ops_test.cc +++ b/tensorflow/core/kernels/embedding_variable_ops_test.cc @@ -21,6 +21,7 @@ #include "tensorflow/core/framework/tensor.h" #include 
"tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/embedding_variable_test.h" #include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/io/path.h" @@ -48,18 +49,6 @@ namespace { const int THREADNUM = 16; const int64 max = 2147483647; -template -class TestableEmbeddingVar : public EmbeddingVar { - public: - TestableEmbeddingVar(const string& name, - embedding::Storage* storage, - EmbeddingConfig emb_cfg = EmbeddingConfig(), - Allocator* alloc = nullptr) : EmbeddingVar( - name, storage, emb_cfg, alloc) {} - - using EmbeddingVar::GetFilter; -}; - struct ProcMemory { long size; // total program size long resident; // resident set size @@ -123,11 +112,7 @@ TEST(EmbeddingVariableTest, TestEmptyEV) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); { - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); LOG(INFO) << "size:" << variable->Size(); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); @@ -191,19 +176,14 @@ TEST(EmbeddingVariableTest, TestEVExportSmallLockless) { int64 value_size = 8; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddigVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(0, 0, 1, 1, "", 5), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); for (int64 i = 0; i 
< 5; i++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); vflat(i) = 5.0; } @@ -269,20 +249,15 @@ TEST(EmbeddingVariableTest, TestEVExportLargeLockless) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(0, 0, 1, 1, "", 5), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); int64 ev_size = 10048576; for (int64 i = 0; i < ev_size; i++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); } LOG(INFO) << "size:" << variable->Size(); @@ -344,9 +319,9 @@ TEST(EmbeddingVariableTest, TestEVExportLargeLockless) { void multi_insertion(EmbeddingVar* variable, int64 value_size){ for (long j = 0; j < 5; j++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(j, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, j); + typename TTypes::Flat vflat = variable->flat(value_ptr); } } @@ -355,12 +330,7 @@ TEST(EmbeddingVariableTest, TestMultiInsertion) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); 
- auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); std::vector insert_threads(THREADNUM); for (size_t i = 0 ; i < THREADNUM; i++) { @@ -375,54 +345,45 @@ TEST(EmbeddingVariableTest, TestMultiInsertion) { void InsertAndLookup(EmbeddingVar* variable, int64 *keys, long ReadLoops, int value_size){ - float *default_value_fake = (float *)malloc((value_size)*sizeof(float)); - for (int j = 0; j < value_size; j++) { - default_value_fake[j] = -1.0; - } for (long j = 0; j < ReadLoops; j++) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - float *default_value = (float *)malloc((value_size)*sizeof(float)); - for (int k = 0; k < value_size; k++) { - default_value[k] = (float)keys[j]; - } - variable->LookupOrCreate(keys[j], val, default_value); - variable->LookupOrCreate(keys[j], val, default_value_fake); - ASSERT_EQ(default_value[0] , val[0]); - free(val); - free(default_value); + void* val = nullptr; + void* val_1 = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(keys[j], &val, &is_filter, false); + variable->LookupOrCreateKey(keys[j], &val_1, &is_filter, false); + ASSERT_EQ(val, val_1); } - free(default_value_fake); } void MultiBloomFilter(EmbeddingVar* var, int value_size, int64 i) { for (long j = 0; j < 1; j++) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - var->LookupOrCreate(i+1, val, nullptr); + void* val = nullptr; + bool is_filter = true; + var->LookupOrCreateKey(i+1, &val, &is_filter, false); } } TEST(EmbeddingVariableTest, TestBloomFilter) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); - test::FillValues(&value, std::vector(value_size, 10.0)); - float* fill_v = (float*)malloc(value_size * sizeof(float)); - - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new 
EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, "normal", 10, 0.01), - cpu_allocator()); - - var->Init(value, 1); - - float *val = (float *)malloc((value_size+1)*sizeof(float)); - float *default_value = (float *)malloc((value_size+1)*sizeof(float)); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(2, val, default_value); + std::vector default_value = + {0.0 ,1.0 ,2.0 ,3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; + test::FillValues(&value, default_value); + + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01); + + //float *val = (float *)malloc((value_size+1)*sizeof(float)); + void* val = nullptr; + bool is_filter = true; + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(2, &val, &is_filter, false); std::vector keylist; std::vector valuelist; @@ -437,14 +398,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt64) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal", 10, 0.01, DT_UINT64), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT64); float *val = (float 
*)malloc((value_size+1)*sizeof(float)); @@ -509,14 +467,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt32) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal", 10, 0.01, DT_UINT32), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT32); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -581,14 +536,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt16) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal_contiguous", 10, 0.01, DT_UINT16), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT16); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -654,14 +606,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt8) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal_contiguous", 10, 0.01, DT_UINT8), cpu_allocator()); - - 
var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT8); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -725,12 +674,7 @@ TEST(EmbeddingVariableTest, TestInsertAndLookup) { int64 value_size = 128; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); int64 InsertLoops = 1000; bool* flag = (bool *)malloc(sizeof(bool)*max); @@ -765,8 +709,9 @@ TEST(EmbeddingVariableTest, TestInsertAndLookup) { } void MultiFilter(EmbeddingVar* variable, int value_size) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - variable->LookupOrCreate(20, val, nullptr); + bool is_filter = true; + void* val; + variable->LookupOrCreateKey(20, &val, &is_filter, false); } TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { @@ -774,14 +719,8 @@ TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 7), - cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1, 7, 5); + float *val = (float *)malloc((value_size+1)*sizeof(float)); int thread_num = 5; std::vector insert_threads(thread_num); @@ -792,20 +731,16 @@ TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { t.join(); } - 
ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; var->LookupOrCreateKey(20, &value_ptr); - ASSERT_EQ(value_ptr->GetFreq(), thread_num); + ASSERT_EQ(var->GetFreq(20), thread_num); } EmbeddingVar* InitEV_Lockless(int64 value_size) { Tensor value(DT_INT64, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); + auto variable = CreateEmbeddingVar(value_size, value, 1); - variable->Init(value, 1); return variable; } @@ -813,7 +748,7 @@ void MultiLookup(EmbeddingVar* variable, int64 InsertLoop, int thread_num, int i) { for (int64 j = i * InsertLoop/thread_num; j < (i+1)*InsertLoop/thread_num; j++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(j, &value_ptr); } } @@ -829,9 +764,9 @@ void BM_MULTIREAD_LOCKLESS(int iters, int thread_num) { float* fill_v = (float*)malloc(value_size * sizeof(float)); for (int64 i = 0; i < InsertLoop; i++){ - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); } testing::StartTiming(); @@ -848,58 +783,6 @@ void BM_MULTIREAD_LOCKLESS(int iters, int thread_num) { } -void hybrid_process(EmbeddingVar* variable, - int64* keys, int64 InsertLoop, int thread_num, - int64 i, int64 value_size) { - float *val = (float *)malloc(sizeof(float)*(value_size + 1)); - for (int64 j = i * InsertLoop/thread_num; - j < (i+1) * InsertLoop/thread_num; j++) { - variable->LookupOrCreate(keys[j], val, nullptr); - } -} - -void BM_HYBRID_LOCKLESS(int iters, int thread_num) { - testing::StopTiming(); - testing::UseRealTime(); - - int64 value_size = 128; - auto variable = InitEV_Lockless(value_size); - int64 
InsertLoop = 1000000; - - srand((unsigned)time(NULL)); - int64 *keys = (int64 *)malloc(sizeof(int64)*InsertLoop); - - for (int64 i = 0; i < InsertLoop; i++) { - keys[i] = rand() % 1000; - } - - testing::StartTiming(); - while (iters--) { - std::vector insert_threads(thread_num); - for (size_t i = 0 ; i < thread_num; i++) { - insert_threads[i] = std::thread(hybrid_process, - variable, keys, InsertLoop, thread_num, i, value_size); - } - for (auto &t : insert_threads) { - t.join(); - } - } -} - -BENCHMARK(BM_MULTIREAD_LOCKLESS) - ->Arg(1) - ->Arg(2) - ->Arg(4) - ->Arg(8) - ->Arg(16); - -BENCHMARK(BM_HYBRID_LOCKLESS) - ->Arg(1) - ->Arg(2) - ->Arg(4) - ->Arg(8) - ->Arg(16); - TEST(EmbeddingVariableTest, TestAllocate) { int value_len = 8; @@ -923,23 +806,13 @@ TEST(EmbeddingVariableTest, TestEVStorageType_DRAM) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(/*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */1, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); int64 ev_size = 100; for (int64 i = 0; i < ev_size; i++) { - variable->LookupOrCreate(i, fill_v, nullptr); + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(i, &val, &is_filter, false); } LOG(INFO) << "size:" << variable->Size(); @@ -947,59 +820,20 @@ TEST(EmbeddingVariableTest, TestEVStorageType_DRAM) { void t1(KVInterface* hashmap) { for (int i = 0; 
i< 100; ++i) { - hashmap->Insert(i, new NormalValuePtr(ev_allocator(), 100)); + hashmap->Insert(i, nullptr); } } TEST(EmbeddingVariableTest, TestRemoveLockless) { - KVInterface* hashmap = new LocklessHashMap(); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(t1, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} - -TEST(EmbeddingVariableTest, TestBatchCommitofDBKV) { - int64 value_size = 4; + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM, + false, false, {false, 0}); KVInterface* hashmap = - new LevelDBKV(testing::TmpDir()); - hashmap->SetTotalDims(value_size); - - for (int64 i = 0; i < 6; ++i) { - const ValuePtr* tmp = - new NormalContiguousValuePtr(ev_allocator(), value_size); - hashmap->Commit(i, tmp); - } - - for(int64 i = 0; i < 6; i++) { - ValuePtr* tmp = nullptr; - Status s = hashmap->Lookup(i, &tmp); - ASSERT_EQ(s.ok(), true); - } -} - -void InsertAndCommit(KVInterface* hashmap) { - for (int64 i = 0; i< 100; ++i) { - const ValuePtr* tmp = - new NormalContiguousValuePtr(ev_allocator(), 100); - hashmap->Insert(i, tmp); - hashmap->Commit(i, tmp); - } -} - -TEST(EmbeddingVariableTest, TestSizeDBKV) { - KVInterface* hashmap = - new LevelDBKV(testing::TmpDir()); - hashmap->SetTotalDims(100); + new LocklessHashMap(feat_desc); + feat_desc->InitSlotInfo(0, 100, {nullptr, 1}); ASSERT_EQ(hashmap->Size(), 0); LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(InsertAndCommit, hashmap); + auto t = std::thread(t1, hashmap); t.join(); LOG(INFO) << "hashmap size: " << hashmap->Size(); ASSERT_EQ(hashmap->Size(), 100); @@ -1190,213 +1024,6 @@ TEST(EmbeddingVariableTest, TestLFUCache) { } } -TEST(EmbeddingVariableTest, 
TestCacheRestore) { - setenv("TF_SSDHASH_ASYNC_COMPACTION", "false", 1); - int64 value_size = 4; - Tensor value(DT_FLOAT, TensorShape({value_size})); - test::FillValues(&value, std::vector(value_size, 9.0)); - float* fill_v = (float*)malloc(value_size * sizeof(float)); - std::vector size; - size.emplace_back(64); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal_contiguous", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage= embedding::StorageFactory::Create( - embedding::StorageConfig(embedding::DRAM_SSDHASH, - testing::TmpDir(), - size, "normal_contiguous", - emb_config), - cpu_allocator(), - "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, emb_config, cpu_allocator()); - variable->Init(value, 1); - variable->InitCache(CacheStrategy::LFU); - - Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); - - int64 ev_size = 7; - int64 cache_size = 3; - for (int64 i = 1; i < cache_size; i++) { - ValuePtr* value_ptr = nullptr; - variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); - value_ptr->AddFreq(2); - } - for (int64 i = cache_size; i < ev_size; i++) { - ValuePtr* value_ptr = nullptr; - variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); - value_ptr->AddFreq(1); - } - - LOG(INFO) << "size:" << variable->Size(); - - BundleWriter writer(Env::Default(), Prefix("foo")); - embedding::ShrinkArgs shrink_args; - shrink_args.global_step = 1; - variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args); - TF_ASSERT_OK(writer.Finish()); - variable->Unref(); - - auto imported_storage= embedding::StorageFactory::Create( - 
embedding::StorageConfig(embedding::DRAM_SSDHASH, - testing::TmpDir(), - size, "normal_contiguous", - emb_config), - cpu_allocator(), - "EmbeddingVar1"); - auto imported_variable = new EmbeddingVar("EmbeddingVar1", - imported_storage, emb_config, cpu_allocator()); - imported_variable->Init(value, 1); - imported_variable->InitCache(CacheStrategy::LFU); - - BundleReader reader(Env::Default(), Prefix("foo")); - std::string name_string("var"); - imported_variable->Restore(name_string, Prefix("foo"), 0, 1, false, &reader, false); - - ASSERT_EQ(imported_storage->Size(0), ev_size - cache_size); - ASSERT_EQ(imported_storage->Size(1), 2); - delete imported_storage; -} - -void t1_gpu(KVInterface* hashmap) { - for (int i = 0; i< 100; ++i) { - hashmap->Insert(i, new NormalGPUValuePtr(ev_allocator(), 100)); - } -} - -#if GOOGLE_CUDA -TEST(EmbeddingVariableTest,TestRemoveLocklessCPU) { - SessionOptions sops; - std::unique_ptr device = - DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0"); - Allocator* gpu_allocator = GPUProcessState::singleton()->GetGPUAllocator( - GPUOptions(), TfGpuId(0), 1 << 26); - KVInterface* hashmap = - new LocklessHashMapCPU(gpu_allocator); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(t1, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} -#endif // GOOGLE_CUDA - -/*void CommitGPU(KVInterface* hashmap) { - for (int64 i = 0; i< 100; ++i) { - ValuePtr* tmp= new NormalGPUValuePtr(ev_allocator(), 100); - hashmap->Commit(i, tmp); - } -} - -TEST(EmbeddingVariableTest, TestCommitHashMapCPU) { - KVInterface* hashmap = new LocklessHashMapCPU(); - hashmap->SetTotalDims(100); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = 
std::thread(CommitGPU, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} - -TEST(EmbeddingVariableTest, TestGPUValuePtr) { - int ev_list_size = 32; - ValuePtr* ptr_ = new NormalGPUValuePtr(ev_allocator(), ev_list_size); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float host_data[ev_list_size]; - float initial_data[ev_list_size]; - for(int i = 0;i < ev_list_size;++i){ - initial_data[i] = 10; - } - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << initial_data[i]; - } - cudaMemcpy(address, initial_data, ev_list_size * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(host_data, address, ev_list_size * sizeof(float), cudaMemcpyDeviceToHost); - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << host_data[i]; - } -}//Forbidden, due to no gpu allocator at that time - -TEST(EmbeddingVariableTest, TestCommitValue) { - int ev_list_size = 32; - ValuePtr* ptr_ = new NormalGPUValuePtr(ev_allocator(),ev_list_size); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float initial_data[ev_list_size]; - for(int i = 0;i < ev_list_size;++i){ - initial_data[i] = 10; - } - cudaMemcpy(address, initial_data, ev_list_size * sizeof(float), cudaMemcpyHostToDevice); - KVInterface* hashmap = new LocklessHashMapCPU(); - hashmap->SetTotalDims(ev_list_size); - hashmap->Commit(1, ptr_); - ValuePtr* check; - hashmap->Lookup(1,&check); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - float* tmp = (float *)((char *)check->GetPtr() + sizeof(FixedLengthHeader)); - - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << tmp[i]; - //ASSERT_EQ(tmp[i], 10); - }// -} - -TEST(EmbeddingVariableTest, TestBatchCommitofLocklessHashMapCPU) { - KVInterface* hashmap = new 
LocklessHashMapCPU(); - const int EmbeddingSize = 16; - const int BatchSize = 16; - - hashmap->SetTotalDims(EmbeddingSize); - std::vector*> value_ptr_list; - std::vector key_list; - - for(int64 i = 0; i < BatchSize; i++) { - key_list.emplace_back(i); - ValuePtr* ptr_ = new NormalGPUValuePtr(EmbeddingSize); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float initial_data[EmbeddingSize]; - for(int j = 0;j < EmbeddingSize;++j){ - initial_data[j] = i; - //LOG(INFO) << "initial[" << i << "][" << j << "]=" << initial_data[j]; - } - cudaMemcpy(address, initial_data, EmbeddingSize * sizeof(float), cudaMemcpyHostToDevice); - value_ptr_list.emplace_back(ptr_); - }//initialize V on GPU - - timespec start,end; - clock_gettime(CLOCK_MONOTONIC, &start); - hashmap->BatchCommit(key_list, value_ptr_list); - clock_gettime(CLOCK_MONOTONIC, &end); - std::cout << "time: " << ((double)(end.tv_sec - start.tv_sec)*1000000000 + end.tv_nsec - start.tv_nsec)/1000000 << "ms" << std::endl; - - for(int64 i = 0; i < BatchSize; i++) { - ValuePtr* check; - hashmap->Lookup(i,&check); - float* tmp = (float *)((char *)check->GetPtr() + sizeof(FixedLengthHeader)); - for(int j = 0;j < EmbeddingSize;++j){ - LOG(INFO) << "batch[" << i << "][" << j << "]=" << tmp[j]; - //ASSERT_EQ(tmp[j], i); - } - }//compare value after BatchCommit -} -*/ - const int total_size = 1024 * 8; const int th_num = 1; const int malloc_size = total_size / th_num; @@ -1466,17 +1093,11 @@ TEST(EmbeddingVariableTest, TestCPUGPUMalloc) { auto mem_pool = new EmbeddingMemoryPool(gpu_allocator, 256, 1024); float* ptr_1 = mem_pool->Allocate(); float* ptr_2 = mem_pool->Allocate(); - ValuePtr* value_ptr1 = new NormalGPUValuePtr(gpu_allocator, 256); - ValuePtr* value_ptr2 = new NormalGPUValuePtr(gpu_allocator, 256); - value_ptr1->SetPtr(ptr_1); - value_ptr2->SetPtr(ptr_2); - value_ptr1->SetInitialized(0); - value_ptr2->SetInitialized(0); - std::vector*> value_ptrs; - 
value_ptrs.emplace_back(value_ptr1); + std::vector value_ptrs; + value_ptrs.emplace_back(ptr_1); mem_pool->Deallocate(value_ptrs); value_ptrs.clear(); - value_ptrs.emplace_back(value_ptr2); + value_ptrs.emplace_back(ptr_2); mem_pool->Deallocate(value_ptrs); float* ptr_3 = mem_pool->Allocate(); ASSERT_EQ(ptr_1, ptr_3); @@ -1539,16 +1160,16 @@ TEST(EmbeddingVariableTest, TestEVMallocFree) { void SingleCommit(KVInterface* hashmap, std::vector keys, int bias) { - std::vector*> value_ptrs; + std::vector value_ptrs; for (int64 i = 0; i < keys.size(); ++i) { - ValuePtr* tmp = - new NormalContiguousValuePtr(cpu_allocator(), 124); - tmp->SetValue(float(keys[i] + bias), 124); + void* tmp = cpu_allocator()->AllocateRaw(0, 124 * sizeof(float) + 16); + for (int j = 0; j < 124; j++) { + ((float*)tmp)[j] = keys[i] + bias; + } value_ptrs.push_back(tmp); } ASSERT_EQ(keys.size(), value_ptrs.size()); uint64 start = Env::Default()->NowNanos(); - for (int64 i = 0; i < keys.size(); i++) { hashmap->Commit(keys[i], value_ptrs[i]); } @@ -1558,9 +1179,13 @@ void SingleCommit(KVInterface* hashmap, void TestCompaction() { std::string temp_dir = testing::TmpDir(); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, + true, true, {false, 0}); auto hashmap = new SSDHashKV( - temp_dir, cpu_allocator()); - hashmap->SetTotalDims(124); + temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); ASSERT_EQ(hashmap->Size(), 0); std::vector ids; for (int i = 0; i < 262144; i++) { @@ -1576,12 +1201,12 @@ void TestCompaction() { t1.join(); ids.clear(); sleep(1); - ValuePtr* val = nullptr; + void* val = nullptr; for (int i = 131073; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i+3); + ASSERT_EQ(v[j], i+3); } } for (int i = 131073; i < 262144; i++) { @@ -1596,16 +1221,16 @@ void TestCompaction() { 
sleep(1); for (int i = 0; i < 131073; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i + 1); + ASSERT_EQ(v[j], i + 1); } } for (int i = 131073; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i + 2); + ASSERT_EQ(v[j], i + 2); } } delete hashmap; @@ -1622,10 +1247,14 @@ TEST(KVInterfaceTest, TestSSDKVSyncCompaction) { } void TestReadEmbFile() { + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, + true, true, {false, 0}); std::string temp_dir = testing::TmpDir(); auto hashmap = new SSDHashKV( - temp_dir, cpu_allocator()); - hashmap->SetTotalDims(124); + temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); ASSERT_EQ(hashmap->Size(), 0); std::vector ids; for (int i = 0; i < 262145; i++) { @@ -1634,12 +1263,12 @@ void TestReadEmbFile() { SingleCommit(hashmap, ids, 3); sleep(1); ids.clear(); - ValuePtr* val = nullptr; + void* val = nullptr; for (int i = 0; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i+3); + ASSERT_EQ(v[j], i+3); } } delete hashmap; @@ -1666,9 +1295,10 @@ TEST(KVInterfaceTest, TestDirectIoFile) { void InsertKey(EmbeddingVar* variable, int value_size) { float *val = (float *)malloc((value_size+1)*sizeof(float)); for (int64 i = 0; i < 100000000; i++) { - variable->LookupOrCreate(20, val, nullptr); + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(20, &val, &is_filter, false); } - LOG(INFO)<<"Finish Insert"; } void RemoveKey(EmbeddingVar* variable) { @@ -1676,29 +1306,13 @@ void RemoveKey(EmbeddingVar* variable) { sleep(1); variable->storage()->Remove(20); } - LOG(INFO)<<"Remove thread finish"; } 
TEST(EmbeddingVariableTest, TestLookupRemoveConcurrency) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */2, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - emb_config, - cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1); int thread_num = 5; std::vector insert_threads(thread_num); for (size_t i = 0 ; i < thread_num - 1; i++) { @@ -1714,21 +1328,7 @@ TEST(EmbeddingVariableTest, TestInsertAndGetSnapshot) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - emb_config, - cpu_allocator()); - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1); float* set_value = (float*)malloc(value_size * sizeof(float)); //Insertion for (int i = 0; i < 100; i++) { diff --git a/tensorflow/core/kernels/embedding_variable_performance_test.cc 
b/tensorflow/core/kernels/embedding_variable_performance_test.cc index 9b01e35840b..16f4a894858 100644 --- a/tensorflow/core/kernels/embedding_variable_performance_test.cc +++ b/tensorflow/core/kernels/embedding_variable_performance_test.cc @@ -90,14 +90,21 @@ void GenerateSkewInput(int num_of_ids, float skew_factor, void thread_lookup_or_create( EmbeddingVar* ev, const int64* input_batch, + float* default_value, + int default_value_dim, float** outputs, int value_size, int start, int end) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = start; i < end; i++) { ev->LookupOrCreateKey(input_batch[i], &value_ptr, &is_filter, false); - auto val = ev->flat(value_ptr, input_batch[i]); - memcpy(outputs[i], &val(0), sizeof(float) * value_size); + if (is_filter) { + auto val = ev->flat(value_ptr); + memcpy(outputs[i], &val(0), sizeof(float) * value_size); + } else { + int default_value_index = input_batch[i] % default_value_dim; + memcpy(outputs[i], default_value + default_value_index * value_size, sizeof(float) * value_size); + } } } @@ -138,6 +145,8 @@ double PerfLookupOrCreate( for (int i = 0; i < num_thread; i++) { worker_threads[i] = std::thread(thread_lookup_or_create, ev, input_batches[k].data(), + default_value_matrix.data(), + default_value_dim, outputs.data(), value_size, thread_task_range[i].first, thread_task_range[i].second); @@ -201,11 +210,11 @@ void thread_lookup( const int64* input_batch, float** outputs, int value_size, int start, int end) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = start; i < end; i++) { ev->LookupKey(input_batch[i], &value_ptr); - auto val = ev->flat(value_ptr, input_batch[i]); + auto val = ev->flat(value_ptr); memcpy(outputs[i], &val(0), sizeof(float) * value_size); } } @@ -293,7 +302,7 @@ TEST(EmbeddingVariablePerformanceTest, TestLookup) { } } auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim); - 
ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = 0; i < hot_ids_list.size(); i++) { ev->LookupOrCreateKey(hot_ids_list[i], &value_ptr, &is_filter, false); @@ -339,13 +348,13 @@ void PerfSave(Tensor& default_value, value_size, default_value, default_value_dim, 0, steps_to_live, l2_weight_threshold); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; srand((unsigned)time(NULL)); for (int i = 0; i < id_list.size(); i++) { ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); - ev->flat(value_ptr, id_list[i]); + ev->flat(value_ptr); int64 global_step = rand() % 100; ev->UpdateVersion(value_ptr, global_step); } diff --git a/tensorflow/core/kernels/embedding_variable_test.h b/tensorflow/core/kernels/embedding_variable_test.h index d06304fb78a..07c34764fb0 100644 --- a/tensorflow/core/kernels/embedding_variable_test.h +++ b/tensorflow/core/kernels/embedding_variable_test.h @@ -107,35 +107,42 @@ EmbeddingVar* CreateEmbeddingVar( int value_size, Tensor& default_value, int64 default_value_dim, int64 filter_freq = 0, int64 steps_to_live = 0, - float l2_weight_threshold=-1.0) { - std::string layout_type = "light"; - if (filter_freq != 0) { - layout_type = "normal"; - } - - if (steps_to_live != 0) { - if (layout_type == "light") { - layout_type = "normal_contiguous"; - } - } + float l2_weight_threshold=-1.0, + embedding::StorageType storage_type = embedding::StorageType::DRAM, + std::vector storage_size = {1024*1024*1024, + 1024*1024*1024, + 1024*1024*1024, + 1024*1024*1024}, + bool record_freq = false, + int64 max_element_size = 0, + float false_positive_probability = -1.0, + DataType counter_type = DT_UINT64) { auto embedding_config = EmbeddingConfig( - 0, 0, 1, 0, "emb_var", steps_to_live, - filter_freq, 999999, l2_weight_threshold, layout_type, - 0, -1.0, DT_UINT64, default_value_dim, - 0.0, false, false, false); + 0, 0, 1, 0, "emb_var", steps_to_live, + filter_freq, 999999, 
l2_weight_threshold, + max_element_size, false_positive_probability, + counter_type, default_value_dim, + 0.0, record_freq, false, false); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), storage_type, + record_freq, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( - embedding::StorageType::DRAM, "", - {1024, 1024, 1024, 1024}, layout_type, + storage_type, "", + storage_size, embedding_config), cpu_allocator(), + feat_desc, "emb_var"); auto ev = new EmbeddingVar( "emb_var", storage, embedding_config, - cpu_allocator()); + cpu_allocator(), + feat_desc); ev->Init(default_value, default_value_dim); return ev; } diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc index 55dd40176a8..2f07e2ef537 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc @@ -774,7 +774,7 @@ class GroupEmbeddingVariableForWardOpTest : public OpsTestBase { embedding_var->Init(value, 1); for (int64 j = 0; j < nnz; ++j) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); typename TTypes::Flat vflat = embedding_var->flat(value_ptr); @@ -958,7 +958,7 @@ class GroupEmbeddingVariableBackWardOpTest : public OpsTestBase { embedding_var->Init(value, 1); for (int64 j = 0; j < nnz; ++j) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); typename TTypes::Flat vflat = embedding_var->flat(value_ptr); diff --git a/tensorflow/core/kernels/incr_save_restore_ops.h b/tensorflow/core/kernels/incr_save_restore_ops.h index 0582697ad16..d84838ae413 100644 --- 
a/tensorflow/core/kernels/incr_save_restore_ops.h +++ b/tensorflow/core/kernels/incr_save_restore_ops.h @@ -225,9 +225,9 @@ class IncrEVValueDumpIterator : public DumpIterator { keys_idx_++; col_idx_ = 0; } - ValuePtr* value_ptr = NULL; + void* value_ptr = NULL; TF_CHECK_OK(emb_var_->LookupOrCreateKey(*keys_iter_, &value_ptr)); - return emb_var_->flat(value_ptr, *keys_iter_)(col_idx_++); + return emb_var_->flat(value_ptr)(col_idx_++); } private: diff --git a/tensorflow/core/kernels/kv_variable_lookup_ops.cc b/tensorflow/core/kernels/kv_variable_lookup_ops.cc index c69aec8ebb9..7e40dfff7ac 100644 --- a/tensorflow/core/kernels/kv_variable_lookup_ops.cc +++ b/tensorflow/core/kernels/kv_variable_lookup_ops.cc @@ -121,7 +121,7 @@ class KvResourceLookupIDOp : public OpKernel { const int64 indices_size = static_cast(indices_flat.dimension(0)); EmbeddingVarContext ev_ctx(c); ev->GetOrCreateKey(ev_ctx, indices, - reinterpret_cast**>(out_base), + reinterpret_cast(out_base), indices_size); } } @@ -203,7 +203,7 @@ class KvResourceCollectEmbeddingOp : public OpKernel { const size_t slice_bytes = slice_elems * sizeof(TValue); EmbeddingVarContext ev_ctx(c); ev->GatherEmbeddings(ev_ctx, indices, - (ValuePtr**)pointer.data(), + (void**)pointer.data(), out_base, N); } } diff --git a/tensorflow/core/kernels/kv_variable_ops.cc b/tensorflow/core/kernels/kv_variable_ops.cc index 8a01a7bf2cd..5cd0ef140bd 100644 --- a/tensorflow/core/kernels/kv_variable_ops.cc +++ b/tensorflow/core/kernels/kv_variable_ops.cc @@ -214,16 +214,16 @@ class InitializeKvVariableOp : public OpKernel { int64 storage_type = 0; OP_REQUIRES_OK(c, c->GetAttr("storage_type", &storage_type)); storage_type_ = static_cast(storage_type); - auto device_type_str = c->device_type().type_string(); + device_type_str_ = c->device_type().type_string(); if (storage_type_ == embedding::DEFAULT) { - if (device_type_str == "CPU") { + if (device_type_str_ == "CPU") { storage_type_ = embedding::DRAM; } else { storage_type_ = 
embedding::HBM; } } - bool if_op_on_gpu = (device_type_str == "GPU"); + bool if_op_on_gpu = (device_type_str_ == "GPU"); bool if_embedding_on_hbm = (storage_type_ == embedding::HBM || storage_type_ == embedding::HBM_DRAM || storage_type_ == embedding::HBM_DRAM_SSDHASH); @@ -238,57 +238,14 @@ class InitializeKvVariableOp : public OpKernel { filter_freq_ = 0; } - OP_REQUIRES_OK(c, c->GetAttr("layout", &layout_)); - if (!layout_.empty()) { - // use layout by user configuration - } else if ((filter_freq_ != 0 && max_element_size_ == 0) - || steps_to_live_ != 0 || record_freq_ - || record_version_ || storage_type > 5) { - if (block_num_ > 1 || (filter_freq_ != 0 && storage_type <= 5)) { - layout_ = "normal"; - } else { - if (storage_type == embedding::HBM_DRAM || - storage_type == embedding::HBM_DRAM_SSDHASH) { - layout_ = "normal_contiguous_gpu"; - } else { - layout_ = "normal_contiguous"; - } - } - } else { - layout_ = "light"; - } - - CHECK(block_num_ == 1 || layout_ != "normal_contiguous"); - - if ("compact" == layout_) { - OP_REQUIRES(c, shape_.dim_size(0) == 1 && - storage_type_ == embedding::StorageType::DRAM, - errors::InvalidArgument("embedding_dim must be 1 and storage type" - " should be DRAM when layout is 'compact'.")); - } + record_freq_ |= (storage_type > 5); + record_version_ |= (storage_type > 5); OP_REQUIRES(c, steps_to_live_ >= 0, errors::InvalidArgument( "steps_to_live must >= 0, ", std::to_string(steps_to_live_))); OP_REQUIRES_OK(c, c->GetAttr("ht_type", &ht_type_)); - if (embedding::StorageType::LEVELDB == storage_type_) { - ht_type_ = "leveldb_kv"; - if (layout_ != "normal_contiguous") - LOG(WARNING) - << "layout must be NORAML_CONTIGUOUS when storage type is LEVELDB"; - layout_ = "normal_contiguous"; - } - - if (embedding::StorageType::PMEM_LIBPMEM == storage_type_ || - embedding::StorageType::PMEM_MEMKIND == storage_type_){ - if (layout_ != "normal_contiguous"){ - LOG(WARNING) - << "layout must be NORAML_CONTIGUOUS" - << " when storage type is 
PMEM_LIBPMEM or PMEM_MEMKIND"; - } - layout_ = "normal_contiguous"; - } OP_REQUIRES_OK(c, c->GetAttr("ht_partition_num", &ht_partition_num_)); } @@ -314,35 +271,43 @@ class InitializeKvVariableOp : public OpKernel { context, handle_self, &ev, [this, default_values, opname, context, handle_self](EmbeddingVar** ptr) { - Allocator* gpu_allocator = + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); auto embedding_config = EmbeddingConfig( emb_index_ + block_num_ * slot_index_, emb_index_, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, + l2_weight_threshold_, max_element_size_, false_positive_probability_, counter_type_, default_value_dim_, default_value_no_permission_, record_freq_, record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - gpu_allocator, + alloc_for_ev, + feat_desc, handle_self.name()); *ptr = new EmbeddingVar( handle_self.name(), storage, embedding_config, - gpu_allocator); - return Status::OK(); - })); - ev->Init(default_values, default_value_dim_); + alloc_for_ev, + feat_desc); + return (*ptr)->Init(default_values, default_value_dim_); + })); } else { EmbeddingVar* primary_variable = nullptr; OP_REQUIRES_OK( @@ -352,30 +317,38 @@ class InitializeKvVariableOp : public OpKernel { [this, default_values, opname, handle_primary, context](EmbeddingVar** ptr) { int64 primary_slot_index(0), primary_emb_index(0); - Allocator* gpu_allocator = context->device()->GetAllocator(AllocatorAttributes()); - //Allocator* 
gpu_allocator = context->get_allocator(AllocatorAttributes()); + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); auto embedding_config = EmbeddingConfig( primary_emb_index + block_num_ * primary_slot_index, primary_emb_index, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, + l2_weight_threshold_, max_element_size_, false_positive_probability_, counter_type_, 0, record_freq_, record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - gpu_allocator, + alloc_for_ev, + feat_desc, handle_primary.name()); *ptr = new EmbeddingVar( handle_primary.name(), storage, embedding_config, - gpu_allocator); + alloc_for_ev, + feat_desc); // default_values is slot value, should not to initialize primary value return Status::OK(); })); @@ -386,20 +359,26 @@ class InitializeKvVariableOp : public OpKernel { context, handle_self, &ev, [this, default_values, opname, primary_variable, handle_self, context](EmbeddingVar** ptr) { + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, + emb_index_, + block_num_, slot_num_, opname, + steps_to_live_, filter_freq_, + max_freq_, l2_weight_threshold_, + max_element_size_, + false_positive_probability_, + counter_type_, default_value_dim_, + default_value_no_permission_, + record_freq_, record_version_, + is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; *ptr = new EmbeddingVar(handle_self.name(), primary_variable->storage(), - EmbeddingConfig(emb_index_ + block_num_ * slot_index_, - emb_index_, - block_num_, slot_num_, opname, - steps_to_live_, filter_freq_, - max_freq_, l2_weight_threshold_, - layout_, max_element_size_, - false_positive_probability_, - counter_type_, default_value_dim_, - default_value_no_permission_, - record_freq_, record_version_, - is_inference_), - primary_variable->GetAllocator()); + embedding_config, + alloc_for_ev, + primary_variable->feature_descriptor()); return (*ptr)->Init(default_values, default_value_dim_); })); core::ScopedUnref unref_me(primary_variable); @@ -424,7 +403,6 @@ class InitializeKvVariableOp : public OpKernel { int64 filter_freq_; int64 max_freq_; float l2_weight_threshold_; - std::string layout_; int64 max_element_size_; float false_positive_probability_; embedding::StorageType storage_type_; @@ -436,6 +414,7 @@ class InitializeKvVariableOp : public OpKernel { bool record_version_; bool is_inference_; bool is_set_initialized_; + std::string device_type_str_; }; #define REGISTER_KERNELS(ktype, vtype) \ diff --git a/tensorflow/core/kernels/kv_variable_ops.h b/tensorflow/core/kernels/kv_variable_ops.h index 8e3572443ba..3202e6d12bf 100644 --- a/tensorflow/core/kernels/kv_variable_ops.h +++ b/tensorflow/core/kernels/kv_variable_ops.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/embedding/cache_factory.h" #include "tensorflow/core/framework/embedding/embedding_var.h" #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index 23a504eea5d..3b10c2521b9 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -120,20 +120,6 @@ class KvResourceImportV2Op: public AsyncOpKernel { OP_REQUIRES_OK(c, c->GetAttr("record_version", &record_version_)); OP_REQUIRES_OK(c, c->GetAttr("reset_version", &reset_version_)); - if ((filter_freq_ != 0 && max_element_size_ == 0) - || steps_to_live_ != -1 || record_freq_ - || record_version_ || storage_type > 5) { - if (block_num_ > 1 || (filter_freq_ != 0 && storage_type <= 5)) { - layout_ = "normal"; - } else { - layout_ = "normal_contiguous"; - } - } else { - layout_ = "light"; - } - - CHECK(block_num_ == 1 || layout_ != "normal_contiguous"); - TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_EV_ASYNC_RESTORE", true, &ev_async_restore_)); } @@ -170,24 +156,33 @@ class KvResourceImportV2Op: public AsyncOpKernel { block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, l2_weight_threshold_, - layout_, max_element_size_, + max_element_size_, false_positive_probability_, counter_type_, default_value_dim_, default_value_no_permission_, record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - allocator, + alloc_for_ev, + feat_desc, handle_self.name()); *ptr = new EmbeddingVar( handle_self.name(), storage, embedding_config, - allocator); + alloc_for_ev, + feat_desc); return Status::OK(); })); ev->Init(default_values, default_value_dim_); @@ -207,19 +202,27 @@ class KvResourceImportV2Op: public AsyncOpKernel { primary_emb_index, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, l2_weight_threshold_, - layout_, max_element_size_, + max_element_size_, false_positive_probability_, counter_type_, 0, record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - allocator, + alloc_for_ev, + feat_desc, handle_primary.name()); *ptr = new EmbeddingVar(handle_primary.name(), - storage, embedding_config, allocator); + storage, embedding_config, alloc_for_ev, feat_desc); // default_values is slot value, should not to initialize primary value return Status::OK(); })); @@ -232,17 +235,22 @@ class KvResourceImportV2Op: public AsyncOpKernel { handle_self, context](EmbeddingVar** ptr) { Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, + emb_index_, block_num_, slot_num_, opname, + steps_to_live_, filter_freq_, max_freq_, + l2_weight_threshold_, max_element_size_, + false_positive_probability_, + counter_type_, default_value_dim_, + default_value_no_permission_, + record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; *ptr = new EmbeddingVar(handle_self.name(), primary_variable->storage(), - EmbeddingConfig(emb_index_ + block_num_ * slot_index_, - emb_index_, block_num_, slot_num_, opname, - steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, max_element_size_, - false_positive_probability_, - counter_type_, default_value_dim_, - default_value_no_permission_, - record_freq_, record_version_), - allocator); + embedding_config, + alloc_for_ev, + primary_variable->feature_descriptor()); return (*ptr)->Init(default_values, default_value_dim_); })); core::ScopedUnref unref_me(primary_variable); @@ -290,7 +298,6 @@ class KvResourceImportV2Op: public AsyncOpKernel { int64 slot_num_; int64 filter_freq_; float l2_weight_threshold_; - std::string layout_; int64 max_freq_; embedding::StorageType storage_type_; std::string storage_path_; @@ -301,6 +308,7 @@ class KvResourceImportV2Op: public AsyncOpKernel { bool record_version_; bool reset_version_; bool ev_async_restore_; + std::string device_type_str_; }; #define REGISTER_KERNELS(dev, ktype, vtype) \ diff --git a/tensorflow/core/kernels/save_restore_tensor.h b/tensorflow/core/kernels/save_restore_tensor.h index 4f69ebe3fb5..da58e17e1bb 100644 --- a/tensorflow/core/kernels/save_restore_tensor.h +++ b/tensorflow/core/kernels/save_restore_tensor.h @@ -23,7 +23,6 @@ limitations under the License. 
#include "tensorflow/core/framework/hash_table/hash_table.h" #include "tensorflow/core/framework/hash_table/bloom_filter_strategy.h" #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/training_ali_op_helpers.h b/tensorflow/core/kernels/training_ali_op_helpers.h index e013a6a2bae..12948de24a4 100644 --- a/tensorflow/core/kernels/training_ali_op_helpers.h +++ b/tensorflow/core/kernels/training_ali_op_helpers.h @@ -121,55 +121,54 @@ EmbeddingVariableInputLockHolder MaybeLockEmbeddingVariableInputMutexesInO template void LookupKeyAndSetVersion( OpKernelContext* ctx, EmbeddingVar* var, - ValuePtr** value_ptrs, Tstep gs, const K* indices, + void** value_ptrs, Tstep gs, const K* indices, int64 task_size, bool indices_as_pointer, int counts_index) { + EmbeddingVarContext ev_ctx(ctx); int64* indices_counts = nullptr; std::function get_count_fn = 0; if (counts_index != -1) { const Tensor& counts_tensor = ctx->input(counts_index); indices_counts = (int64*)counts_tensor.data(); - get_count_fn = [](int64* counts, int64 index) { - return counts[index];}; - } else { - get_count_fn = [](int64* counts, int64 index) {return 1;}; } + var->LookupOrCreateKey(ev_ctx, indices, value_ptrs, + task_size, indices_counts, + indices_as_pointer); - auto lookup_key_and_set_version_fn = [var, value_ptrs, gs, - indices, indices_as_pointer, - indices_counts, get_count_fn] (int64 start, int64 limit) { - ValuePtr* value_ptr = nullptr; + auto update_version_fn = [var, value_ptrs, gs] + (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - bool is_filter = false; - int64 count = get_count_fn(indices_counts, i); - var->LookupOrCreateKey(indices[i], &value_ptr, - &is_filter, indices_as_pointer, count); - value_ptrs[i] = value_ptr; - var->UpdateVersion(value_ptr, gs); + var->UpdateVersion(value_ptrs[i], gs); } }; const int64 unit_cost = 1000; //very unreliable 
estimate for cost per step. auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); Shard(worker_threads->num_threads, worker_threads->workers, task_size, unit_cost, - lookup_key_and_set_version_fn); + update_version_fn); } template -void LookupOrCreateEmbedding( +void LookupEmbedding( OpKernelContext* ctx, std::vector*, V**>>& vars, - ValuePtr** value_ptrs, + void** value_ptrs, const K* indices, - int64 num_of_keys, - IntraThreadCopyIdAllocator* thread_copy_id_alloc) { + int64 num_of_keys) { for (auto it: vars) { EmbeddingVar* var = it.first; V** var_ptr = it.second; - EmbeddingVarContext ev_ctx(ctx); - var->BatchLookupOrCreateEmb( - ev_ctx, var_ptr, value_ptrs, - indices, num_of_keys, thread_copy_id_alloc); + auto lookup_emb_fn = [var, var_ptr, value_ptrs] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + var_ptr[i] = var->GetValuePtr(value_ptrs[i]); + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + lookup_emb_fn); } } @@ -180,12 +179,12 @@ void GetEmbeddingPointers( const K* indices, Tstep gs, bool indices_as_pointer, int counts_index, int64 num_of_keys, IntraThreadCopyIdAllocator* thread_copy_id_alloc) { - std::vector*> value_ptrs(num_of_keys); + std::vector value_ptrs(num_of_keys); LookupKeyAndSetVersion(ctx, vars[0].first, value_ptrs.data(), gs, indices, num_of_keys, indices_as_pointer, counts_index); - LookupOrCreateEmbedding(ctx, vars, value_ptrs.data(), - indices, num_of_keys, thread_copy_id_alloc); + LookupEmbedding(ctx, vars, value_ptrs.data(), + indices, num_of_keys); } } // end namespace tensorflow diff --git a/tensorflow/core/kernels/training_ali_ops.cc b/tensorflow/core/kernels/training_ali_ops.cc index 839ce82feef..546b30e29dd 100644 --- a/tensorflow/core/kernels/training_ali_ops.cc +++ 
b/tensorflow/core/kernels/training_ali_ops.cc @@ -141,16 +141,16 @@ class KvSparseApplyAdagradOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const TKey index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto a = accum->flat(value_ptr, index); + auto a = accum->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); a += g.square(); v -= g.constant(lr_scalar) * g * a.rsqrt(); } @@ -542,15 +542,15 @@ class KvSparseApplyFtrlOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const TKey index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var_->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); if (is_filter) { - auto var = var_->flat(value_ptr, index); - auto accum = accum_->flat(value_ptr, index); - auto linear = linear_->flat(value_ptr, index); + auto var = var_->flat(value_ptr); + auto accum = accum_->flat(value_ptr); + auto linear = linear_->flat(value_ptr); auto grad = grad_flat.template chip<0>(i); // Use a macro to implement the computation here due to the templating of the @@ -1301,19 +1301,19 @@ class KvSparseApplyAdagradDecayOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, 
indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto a = accum->flat(value_ptr, index); + auto a = accum->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); - auto accum_decay_power = accum_decay_power_var->flat(value_ptr, index); + auto v = var->flat(value_ptr); + auto accum_decay_power = accum_decay_power_var->flat(value_ptr); if (gs / decay_step_scalar > accum_decay_power(0)) { a *= a.constant(decay_rate_scalar); @@ -1505,19 +1505,18 @@ class KvSparseApplyAdamOp : public OpKernel { auto indices_vec = indices.vec(); int64 gs = global_step.scalar()(); - for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter =false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto var_i = var->flat(value_ptr, index); - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); m_a += (g - m_a) * (static_cast(1) - beta1_scalar); @@ -2412,15 +2411,15 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { Tstep gs = global_step.scalar()(); for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto v_ = v->flat(value_ptr, index); - auto m_ = m->flat(value_ptr, index); + auto v_ = v->flat(value_ptr); + auto m_ = m->flat(value_ptr); auto grad_ = grad_flat.template chip<0>(i); v_ = 
v_ * v_.constant(beta2_scalar) + @@ -2429,7 +2428,7 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { (v_ + v_.constant(epsilon_scalar)).rsqrt() * v_.constant(lr_scalar) * grad_; - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); v -= m_; } } @@ -2461,17 +2460,17 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto var_i = var->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); m_a = m_a * beta1_scalar + g * (static_cast(1) - beta1_scalar); v_a = v_a * beta2_scalar + g.square() * (static_cast(1) - beta2_scalar); @@ -2939,7 +2938,7 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, @@ -2947,7 +2946,7 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { var->UpdateVersion(value_ptr, gs); if (is_filter) { auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); v -= g.constant(lr_scalar) * g; } } @@ -3136,16 +3135,16 @@ class KvSparseApplyAdamWOp : public OpKernel { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* 
value_ptr = nullptr; bool is_filter =false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto var_i = var->flat(value_ptr, index); - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); // m_a = beta1 * m + (1 - beta1) * g m_a += (g - m_a) * (static_cast(1) - beta1_scalar); diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 2a56634206c..e89b095aff1 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -6132,6 +6132,8 @@ class GraphKeys(object): TRAINABLE_VARIABLES = "trainable_variables" # Indicate EmbeddingVariable in CollectionDef EMBEDDING_VARIABLES = "embedding_variables" + # Collection for dependencies of EmbeddingVariable's restore op + EMBEDDING_VARIABLE_RESTORE_DEPENDENCY = "embedding_variable_restore_dependency" # Key to collect summaries. SUMMARIES = "summaries" # Key to collect QueueRunners. 
diff --git a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py index 240938e8675..d47d94d0d99 100644 --- a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py @@ -47,69 +47,6 @@ class EmbeddingVariableGpuTest(test_util.TensorFlowTestCase): - def testDynamicDimensionEmbeddingVariable(self): - print("testDynamicDimensionEmbeddingVariable") - with ops.device('/gpu:0'): - def runTestAdagrad(self, var, g): - if isinstance(var, kv_variable_ops.EmbeddingVariable): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - else: - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64), blocknums=[2,2,2,2,2,2]) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad.AdagradOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session(graph=g) as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.device('/gpu:0'), ops.Graph().as_default() as g: - emb_var = variable_scope.get_embedding_variable("var_1", - initializer=init_ops.ones_initializer(dtypes.float32), - embedding_dim = 8, - ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.HBM)), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - emb1 = runTestAdagrad(self, emb_var, g) - with 
ops.device('/gpu:0'), ops.Graph().as_default() as g: - var = variable_scope.get_dynamic_dimension_embedding_variable("var_dist", - embedding_block_dimension=4, - embedding_block_num=2, - storage_type=config_pb2.StorageType.HBM, - initializer=init_ops.ones_initializer(dtypes.float32)) - emb2 = runTestAdagrad(self, var, g) - for i in range(0, 6): - for j in range(0, 8): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - def testDynamicEmbeddingVariableForInitFromProto(self): - print("testDynamicEmbeddingVariableForInitFromProto") - with ops.device('/gpu:0'): - embedding = variable_scope.get_dynamic_dimension_embedding_variable("var_dist", - embedding_block_dimension=4, - embedding_block_num=2, - storage_type=config_pb2.StorageType.HBM, - initializer=init_ops.ones_initializer(dtypes.float32)) - emb = embedding_ops.embedding_lookup(embedding, math_ops.cast([0,1,2,5,6,7], dtypes.int64), blocknums=[2,2,2,2,2,2]) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - graph = ops.get_default_graph() - meta_graph_def = saver_module.export_meta_graph() - ops.reset_default_graph() - with self.test_session() as sess: - res = saver_module.import_meta_graph(meta_graph_def) - def testEmbeddingVariableForInitFromProto(self): print("testEmbeddingVariableForInitFromProto") with ops.device('/gpu:0'): @@ -235,43 +172,6 @@ def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): print(sess.run([emb, train_op,loss])) print(sess.run([emb, train_op,loss])) - def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): - print("testEmbeddingVariableForFeatureFilterFromContribFeatureColumn") - columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, - ev_option = 
variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) - with ops.device("/gpu:0"): - W = feature_column.embedding_column(sparse_id_column=columns, - dimension=3, - initializer=init_ops.ones_initializer(dtypes.float32)) - ids={} - ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) - emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) - - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - - opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - emb1, top, l = sess.run([emb, train_op, loss]) - for val1 in emb1.tolist(): - for val in val1: - self.assertEqual(val, .0) - emb1, top, l = sess.run([emb, train_op, loss]) - for index, val1 in enumerate(emb1.tolist()): - if index < 7: - for val in val1: - self.assertNotEqual(val, 1.0) - else: - for val in val1: - self.assertEqual(val, .0) - def testEmbeddingVariableForSparseColumnEmbeddingCol(self): columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.HBM))) @@ -870,6 +770,66 @@ def testSaveV3(self): result = sess.run([emb1]) print(result) + def testEmbeddingVariableSaveAndRestoreOptimzierStatesForMultiTierWithHbm(self): + print("testEmbeddingVariableSaveAndRestoreOptimzierStatesForMultiTierWithHbm") + checkpoint_directory = self.get_temp_dir() + 
with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + fun = math_ops.multiply(emb, 1.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver(sharded=True) + init = variables.global_variables_initializer() + graph = ops.get_default_graph() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + sess.run(train_op) + emb_ori = sess.run(emb) + save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) + + with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + fun = math_ops.multiply(emb, 1.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver() + graph = ops.get_default_graph() + with self.test_session(graph = graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) + emb_val = sess.run(emb) + self.assertAllEqual(emb_ori, emb_val) + 
save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) + for name, shape in checkpoint_utils.list_variables(checkpoint_directory): + if "Adagrad-values" in name: + value = checkpoint_utils.load_variable(checkpoint_directory, name) + for i in range(0, shape[0]): + for j in range(0, shape[1]): + self.assertAlmostEqual(1.1, value[i][j]) + def testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm(self): print("testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm") checkpoint_directory = self.get_temp_dir() @@ -894,8 +854,8 @@ def testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm(self): emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 0.0, name='multiply') - fun1 = math_ops.multiply(emb2, 0.0, name='multiply_1') + fun = math_ops.multiply(emb, 1.0, name='multiply') + fun1 = math_ops.multiply(emb2, 1.0, name='multiply_1') loss = math_ops.reduce_sum(fun + fun1, name='reduce_sum') gs = training_util.get_or_create_global_step() opt = adagrad.AdagradOptimizer(0.1) diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index c6cdf951a1e..81b315e2e43 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -120,7 +120,7 @@ def _CounterFilterTestTemplate(self, optimizer): initializer=init_ops.ones_initializer(dtypes.float32), ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) - emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1], dtypes.int64)) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64)) fun = math_ops.multiply(emb, 2.0, name='multiply') loss = math_ops.reduce_sum(fun, name='reduce_sum') gs = training_util.get_or_create_global_step() @@ 
-133,11 +133,18 @@ def _CounterFilterTestTemplate(self, optimizer): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertEqual(val, .0) + + for val1 in emb1.tolist(): + for val in val1: + self.assertEqual(val, .0) emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertNotEqual(val, 1.0) + for index, val1 in enumerate(emb1.tolist()): + if index < 7: + for val in val1: + self.assertNotEqual(val, 1.0) + else: + for val in val1: + self.assertEqual(val, .0) def _RecordFreqTestTemplate(self, optimizer): checkpoint_directory = self.get_temp_dir() @@ -720,20 +727,11 @@ def testEmbeddingVariableForL2FeatureEviction(self): sess.run([init]) emb_ori = sess.run([emb, train_op]) save_path = saver.save(sess, os.path.join(checkpoint_directory, "model1.ckpt"), global_step=12345) - #for name, shape in checkpoint_utils.list_variables(checkpoint_directory): - # print('loading... 
', name, shape) - with self.test_session() as sess: - saver.restore(sess, os.path.join(checkpoint_directory, "model1.ckpt-12345")) - emb_right = [[0.8282884, 0.8282884, 0.8282884], - [0.8282884, 0.8282884, 0.8282884], - [0.8282884, 0.8282884, 0.8282884], - [0.7927219, 0.7927219, 0.7927219], - [0.7927219, 0.7927219, 0.7927219], - [1.0, 1.0, 1.0]] - emb_ori = sess.run(emb) - for i in range(6): - for j in range(3): - self.assertAlmostEqual(emb_ori[i][j], emb_right[i][j]) + for name, shape in checkpoint_utils.list_variables(checkpoint_directory): + if name == "var_1-keys": + self.assertEqual(shape[0], 2) + keys = checkpoint_utils.load_variable(checkpoint_directory, name) + self.assertAllEqual(keys, [0, 1]) def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): columns_list=[] @@ -764,14 +762,15 @@ def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): print("testEmbeddingVariableForFeatureFilterFromContribFeatureColumn") - columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) - W = feature_column.embedding_column(sparse_id_column=columns, - dimension=3, - initializer=init_ops.ones_initializer(dtypes.float32)) - ids={} - ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) - emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) + with ops.device("/cpu:0"): + columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) + W = feature_column.embedding_column(sparse_id_column=columns, + dimension=3, + 
initializer=init_ops.ones_initializer(dtypes.float32)) + ids={} + ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) + emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) fun = math_ops.multiply(emb, 2.0, name='multiply') loss = math_ops.reduce_sum(fun, name='reduce_sum') @@ -786,6 +785,7 @@ def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) emb1, top, l = sess.run([emb, train_op, loss]) + for val1 in emb1.tolist(): for val in val1: self.assertEqual(val, .0) @@ -1328,66 +1328,6 @@ def testEmbeddingVariableForHTPartitionNum(self): print(sess.run([emb, train_op,loss])) print(sess.run([emb, train_op,loss])) - def testEmbeddingVariableForLayout(self): - print("testEmbeddingVariableForLayout") - def runTestAdagrad(self, var, g): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad.AdagradOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session(graph=g) as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - 
initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdagrad(self, emb_var, g) - emb2 = runTestAdagrad(self, var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1), - steps_to_live=5) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdagrad(self, emb_var, g) - emb2 = runTestAdagrad(self, var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1), - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=5))) - emb1 = runTestAdagrad(self, emb_var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], .0) - def testEVInitializerWithKeyFetch(self): print("testEVInitializerWithKeyFetch") with ops.Graph().as_default() as g, ops.device('/cpu:0'): @@ -2391,7 +2331,7 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self): "model1.ckpt") with self.test_session() as sess: sess.run([init]) - sess.run([emb, train_op]) + sess.run([train_op]) save_path = saver.save(sess, model_path) for name, shape in checkpoint_utils.list_variables(model_path): if name == 
"var_1-keys": @@ -2403,6 +2343,37 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self): name == "var_1-freqs_filtered": self.assertEqual(0, shape[0]) del os.environ["TF_EV_SAVE_FILTERED_FEATURES"] + + def testEmbeddingVariableForSaveUnfilterFeature(self): + checkpoint_directory = self.get_temp_dir() + with ops.device("/cpu:0"): + emb_var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) + emb = embedding_ops.embedding_lookup(emb_var, math_ops.cast([1, 1, 1, 2, 2, 3], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + model_path = os.path.join(checkpoint_directory, + "model1.ckpt") + with self.test_session() as sess: + sess.run([init]) + sess.run([train_op]) + save_path = saver.save(sess, model_path) + for name, shape in checkpoint_utils.list_variables(model_path): + if name == "var_1-keys": + keys = checkpoint_utils.load_variable(model_path, name) + self.assertEqual(1, len(keys)) + self.assertEqual(1, keys[0]) + if name == "var_1-keys_filtered" or \ + name == "var_1-freqs_filtered": + self.assertEqual(2, shape[0]) def testEmbeddingVariableForMultiTierInference(self): print("testEmbeddingVariableForMultiTierInference") @@ -2716,7 +2687,55 @@ def testCPUFbjOpt(self): def testCPUFbjOptWithCounterFilter(self): print("testCPUFbjOpt") os.environ["TF_EMBEDDING_FBJ_OPT"] = "True" - self._CounterFilterTestTemplate("Adagrad") + with ops.device("/cpu:0"): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + 
initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = self._CreateOptimizer("Adagrad") + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + emb1, top, l = sess.run([emb, train_op, loss]) + emb_list = emb1.tolist() + emb_right = [[.0, .0, .0], + [.0, .0, .0], + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + [.0, .0, .0], + [.0, .0, .0], + [1.0, 1.0, 1.0], + [.0, .0, .0], + [.0, .0, .0], + [.0, .0, .0]] + + for i in range(6): + for j in range(3): + self.assertAlmostEqual(emb_list[i][j], emb_right[i][j]) + + emb1= sess.run(emb) + emb_right = [[0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90122706, 0.90122706, 0.90122706], + [0.90122706, 0.90122706, 0.90122706], + [0.90122706, 0.90122706, 0.90122706], + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + [.0, .0, .0]] + for i in range(6): + for j in range(3): + self.assertAlmostEqual(emb1[i][j], emb_right[i][j]) del os.environ["TF_EMBEDDING_FBJ_OPT"] def testCPUFbjOptWithBloomFilter(self): diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 96329ca345b..1ef9550ef6d 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -373,6 +373,8 @@ def 
_init_from_args(self, self._slot_num = 0 else: self._slot_num = evconfig.slot_num + if self._is_primary: + self._import_dependency_ops = [] with ops.name_scope("IsInitialized"): self._is_initialized_op = ( gen_kv_variable_ops.kv_var_is_initialized_op(self._handle, @@ -488,6 +490,7 @@ def create_init_op_for_restore(self, name, initial_value, invalid_key, rank): set_attr_ops.append(set_cache_op) with ops.control_dependencies(set_attr_ops + [self._initializer_for_restore]): self._init_op_for_restore = control_flow_ops.no_op() + self.collect_restore_denpendencies() def need_counts(self): return (self._record_freq or (self._filter_freq > 0) or self._is_multi_tier) @@ -612,8 +615,19 @@ def _init_from_proto(self, variable_def, import_scope=None): else: self._is_primary = False + self.collect_restore_denpendencies() # LINT.ThenChange(//tensorflow/python/eager/graph_callable.py) + def collect_restore_denpendencies(self): + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY) + if len(restore_dependency) == 0: + ops.add_to_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY, {}) + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY) + dependency_dict = restore_dependency[0] + if not dependency_dict.__contains__(self._primary_handle): + dependency_dict[self._primary_handle] = [] + dependency_dict[self._primary_handle].append(self._init_op_for_restore) + def set_init_data_source_initializer(self, init_data_source): import pkgutil try: diff --git a/tensorflow/python/training/saving/saveable_object_util.py b/tensorflow/python/training/saving/saveable_object_util.py index 0d8bfe87022..650b1a5e272 100644 --- a/tensorflow/python/training/saving/saveable_object_util.py +++ b/tensorflow/python/training/saving/saveable_object_util.py @@ -195,7 +195,8 @@ def restore(self, restored_tensors, unused_restored_shapes): if self.var._init_data_source is not None: return 
self.var.recover_from_init_data_source(self.var._init_data_source, self.partition_id, self.partition_num) else: - with ops.control_dependencies([self.var._init_op_for_restore]): + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY)[0] + with ops.control_dependencies(restore_dependency[self.var._primary_handle]): rank = self.op.initial_value.get_shape().rank - 1 restore_op = gen_kv_variable_ops.kv_resource_import_v3( restored_tensors[0], From be62ec312595b51b74260f96a6c0872ce5f1540c Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Wed, 18 Oct 2023 10:11:16 +0800 Subject: [PATCH 09/45] [Graph] Fix hang bug for async embedding lookup. (#934) Skip edges to 'SaveV3' Op. Signed-off-by: chenbangduo.cbd --- tensorflow/python/training/async_embedding_stage.py | 7 ++++++- tensorflow/python/training/monitored_session.py | 10 ++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/training/async_embedding_stage.py b/tensorflow/python/training/async_embedding_stage.py index 32433387c1c..858025bdab7 100644 --- a/tensorflow/python/training/async_embedding_stage.py +++ b/tensorflow/python/training/async_embedding_stage.py @@ -49,13 +49,14 @@ def __init__(self, options, checkpoint_dir = None): self._checkpoint_dir = checkpoint_dir if checkpoint_dir else "" self._use_stage_subgraph_thread_pool = options.use_stage_subgraph_thread_pool self._stage_subgraph_thread_pool_id = options.stage_subgraph_thread_pool_id + self._is_staged = False self._control_flow_ops = ['Switch', '_SwitchN', 'Merge', '_XlaMerge', 'Enter', 'Exit'] self._variable_ops = ['Variable', 'VariableV2', 'VarHandleOp', 'KvVarHandleOp', 'HashTableV2'] self._variable_is_init_ops = ['IsVariableInitialized', 'VarIsInitializedOp', 'KvVarIsInitializedOp'] - self._saver_ops = ['SaveV2'] + self._saver_ops = ['SaveV2', 'SaveV3'] self._no_data_input_ops = self._variable_ops + ['Placeholder', 'PlaceholderV2', 'Const'] self._boundary_ops = set() for tensor 
in ops.get_collection(ops.GraphKeys.ASYNC_EMBEDDING_OUTPUT_TENSORS): @@ -74,6 +75,10 @@ def __init__(self, options, checkpoint_dir = None): def stage(self, graph): """ add async embedding stage node to graph """ + if self._is_staged: + return + self._is_staged = True + logging.info('async embedding stage begin') logging.info('async embedding thread num: ' + str(self._threads_num)) logging.info('async embedding capacity: ' + str(self._capacity)) diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py index 09c05a02627..6eb204785dd 100644 --- a/tensorflow/python/training/monitored_session.py +++ b/tensorflow/python/training/monitored_session.py @@ -185,6 +185,7 @@ def __init__(self, self._saver = saver self._incremental_save_restore = incremental_save_restore self._incr_saver = None + self._async_embedding_stage = None self._enable_async_embedding = False self._async_embedding_checkpoint_dir = None self._async_embedding_options = None @@ -247,10 +248,11 @@ def default_ready_for_local_init_op(): self._incr_saver = incr_saver._get_incremental_saver(self._incremental_save_restore, self._saver) if self._enable_async_embedding: - async_embedding_stage = async_embedding.AsyncEmbeddingStage( - self._async_embedding_options, - self._async_embedding_checkpoint_dir) - async_embedding_stage.stage(ops.get_default_graph()) + if self._async_embedding_stage is None: + self._async_embedding_stage = async_embedding.AsyncEmbeddingStage( + self._async_embedding_options, + self._async_embedding_checkpoint_dir) + self._async_embedding_stage.stage(ops.get_default_graph()) ops.get_default_graph().finalize() logging.info('Graph was finalized.') From 0e8127a2cc9b2529ec2ab2f6f361d6c536280d60 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Wed, 25 Oct 2023 05:10:05 -0700 Subject: [PATCH 10/45] [Distribute] Add elastic-grpc server. 
(#936) Signed-off-by: JunqiHu --- configure.py | 3 + tensorflow/BUILD | 6 + tensorflow/contrib/elastic_grpc_server/BUILD | 70 ++++ .../elastic_grpc_server_lib.cc | 317 ++++++++++++++++++ .../elastic_grpc_server_lib.h | 66 ++++ .../elastic_grpc_server_lib_test.cc | 77 +++++ .../elastic_grpc_server/elastic_service.cc | 157 +++++++++ .../elastic_grpc_server/elastic_service.h | 31 ++ tensorflow/core/BUILD | 23 ++ .../distributed_runtime/rpc/grpc_server_lib.h | 14 +- .../core/platform/default/build_config.bzl | 6 + .../platform/default/build_config_root.bzl | 8 + .../core/protobuf/elastic_training.proto | 76 +++++ tensorflow/python/BUILD | 3 +- 14 files changed, 849 insertions(+), 8 deletions(-) create mode 100644 tensorflow/contrib/elastic_grpc_server/BUILD create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib_test.cc create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_service.cc create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_service.h create mode 100644 tensorflow/core/protobuf/elastic_training.proto diff --git a/configure.py b/configure.py index 362479981b2..6aeaf7d12af 100644 --- a/configure.py +++ b/configure.py @@ -1433,6 +1433,9 @@ def main(): set_build_var(environ_cp, 'TF_NEED_STAR', 'STAR', 'with_star_support', True, 'star') + set_build_var(environ_cp, 'TF_NEED_ELASTIC', 'ELASTIC TRAINING', 'with_elastic_support', + True, 'elastic') + set_build_var(environ_cp, 'TF_ENABLE_PMEM', 'PMEM', 'with_pmem_support', False, 'pmem') diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 493247a2162..8b4190ea680 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -434,6 +434,12 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "with_elastic_support", + values = {"define": "with_elastic_support=true"}, + 
visibility = ["//visibility:public"], +) + config_setting( name = "with_pmem_support", values = {"define": "with_pmem_support=true"}, diff --git a/tensorflow/contrib/elastic_grpc_server/BUILD b/tensorflow/contrib/elastic_grpc_server/BUILD new file mode 100644 index 00000000000..ea4b87e3b58 --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/BUILD @@ -0,0 +1,70 @@ +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +package(default_visibility = [ + "//tensorflow:internal", +]) + +load( + "//tensorflow:tensorflow.bzl", "tf_cc_test", +) + +cc_library( + name = "elastic_grpc_server_lib", + srcs = select({"//tensorflow:with_elastic_support": ["elastic_service.cc", + "elastic_grpc_server_lib.cc"], + "//conditions:default": []}), + hdrs = ["elastic_service.h", + "elastic_grpc_server_lib.h"], + linkstatic = 1, # Seems to be needed since alwayslink is broken in bazel + deps = [ + "//tensorflow/core:elastic_service_proto_cc", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + "//tensorflow/core/distributed_runtime/rpc:async_service_interface", + "//tensorflow/core/distributed_runtime/rpc:grpc_channel", + "//tensorflow/core/distributed_runtime/rpc:grpc_master_service", + "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache", + "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service", + "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr", + "//tensorflow:grpc", + "//tensorflow:grpc++", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core/common_runtime/eager:context", + "//tensorflow/core/distributed_runtime:collective_param_resolver_distributed", + "//tensorflow/core/distributed_runtime:device_resolver_distributed", + "//tensorflow/core/distributed_runtime:graph_mgr", + "//tensorflow/core/distributed_runtime:local_master", + "//tensorflow/core/distributed_runtime:master", + 
"//tensorflow/core/distributed_runtime:master_env", + "//tensorflow/core/distributed_runtime:master_session", + "//tensorflow/core/distributed_runtime:rpc_collective_executor_mgr", + "//tensorflow/core/distributed_runtime:server_lib", + "//tensorflow/core/distributed_runtime:session_mgr", + "//tensorflow/core/distributed_runtime:worker_cache_wrapper", + "//tensorflow/core/distributed_runtime:worker_env", + "//tensorflow/core/distributed_runtime:worker_resource", + "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_service_impl", + ], + alwayslink = 1, +) + +tf_cc_test( + name = "elastic_grpc_test", + size = "small", + srcs = ["elastic_grpc_server_lib_test.cc"], + deps = [ + ":elastic_grpc_server_lib", + "//tensorflow/core/distributed_runtime/rpc:grpc_util", + "//tensorflow:grpc", + "//tensorflow:grpc++", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:worker_proto_cc", + ], + linkstatic = 1, +) diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc new file mode 100644 index 00000000000..d45d70d6c8c --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc @@ -0,0 +1,317 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#include "tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h" + +#include +#include +#include +#include + +#include "include/json/json.h" +#include "grpc/support/alloc.h" +#include "grpcpp/grpcpp.h" +#include "grpcpp/security/credentials.h" +#include "grpcpp/server_builder.h" +#include "tensorflow/core/util/env_var.h" + +#include "tensorflow/contrib/elastic_grpc_server/elastic_service.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/process_util.h" +#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h" +#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h" +#include "tensorflow/core/distributed_runtime/graph_mgr.h" +#include "tensorflow/core/distributed_runtime/local_master.h" +#include "tensorflow/core/distributed_runtime/master.h" +#include "tensorflow/core/distributed_runtime/master_env.h" +#include "tensorflow/core/distributed_runtime/master_session.h" +#include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h" +#include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h" +#include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h" +#include "tensorflow/core/distributed_runtime/server_lib.h" +#include "tensorflow/core/distributed_runtime/worker_cache_wrapper.h" +#include "tensorflow/core/distributed_runtime/worker_env.h" +#include "tensorflow/core/distributed_runtime/worker_resource.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/public/session_options.h" +#include 
"tensorflow/core/protobuf/cluster.pb.h" + +namespace tensorflow { + +namespace { + +// static utility function +RendezvousMgrInterface* NewRpcRendezvousMgr(const WorkerEnv* env) { + return new RpcRendezvousMgr(env); +} + +} // namespace + +ElasticGrpcServer::ElasticGrpcServer(const ServerDef& server_def, Env* env) + : GrpcServer(server_def, env) {} + +ElasticGrpcServer::~ElasticGrpcServer() { + delete elastic_service_; +} + +Status ElasticGrpcServer::UpdateServerDef(const string& cluster_def_str, int& before_part_num, int& after_part_num) { + std::string tf_config; + ReadStringFromEnvVar("TF_CONFIG", "", &tf_config); + if (!tf_config.empty()) { + Json::Reader reader; + Json::Value tf_config_json; + if(!reader.parse(tf_config, tf_config_json)) { + return errors::Internal("PARSE TF_CONFIG ERROR"); + } + if ((tf_config_json["cluster"].isNull()) || + (tf_config_json["cluster"]["ps"].isNull())) { + return errors::Internal("PARSE PS FROM TF_CONFIG ERROR"); + } + + Json::Value cluster_json; + if (!reader.parse(cluster_def_str, cluster_json)) { + LOG(ERROR) << "cluster_def is not correct with " << cluster_def_str; + return errors::Internal("PARSE TF_CONFIG/cluster ERROR"); + } + + std::unordered_set ps_addrs_vec; + after_part_num = cluster_json["cluster"]["ps"].size(); + for (auto& value: cluster_json["cluster"]["ps"]) { + ps_addrs_vec.emplace(value.asString()); + } + + int job_size = server_def_.cluster().job_size(); + for (int j = 0; j < job_size; ++j) { + auto* job = server_def_.mutable_cluster()->mutable_job(j); + if (job->name() == "ps") { + before_part_num = job->tasks_size(); + if (before_part_num == after_part_num) { + return Status::OK(); + } else if (after_part_num > before_part_num) { + int idx = before_part_num; + LOG(INFO) << "SCALING UP, partition_num is: " << after_part_num; + std::unordered_set target_string_set; + for (auto& value: tf_config_json["cluster"]["ps"]) { + target_string_set.emplace(value.asString()); + } + for (auto ps_addr: ps_addrs_vec) { + 
if (target_string_set.find(ps_addr) == target_string_set.end()) { + job->mutable_tasks()->insert({idx, ps_addr}); + tf_config_json["cluster"]["ps"].append(ps_addr); + } + } + break; + } else { + LOG(INFO) << "SCALING DOWN, partition_num is: " << after_part_num; + for (int i = 0; i < before_part_num; ++i) { + string tmp_string = tf_config_json["cluster"]["ps"][i].asString(); + if (ps_addrs_vec.find(tmp_string) == ps_addrs_vec.end()) { + Json::Value ps_addr; + tf_config_json["cluster"]["ps"].removeIndex(i, &ps_addr); + job->mutable_tasks()->erase(i); + } + } + } + } + } + Json::FastWriter writer; + std::string new_tf_config = writer.write(tf_config_json); + LOG(INFO) << "new TF_CONFIG " << new_tf_config; + setenv("TF_CONFIG", new_tf_config.c_str(), 1); + } + return Status::OK(); +} + +Status ElasticGrpcServer::Update(const string& cluster_def_str) { + int before_part_num, after_part_num; + Status s = UpdateServerDef(cluster_def_str, before_part_num, after_part_num); + if (!s.ok()) { + LOG(ERROR) << s.error_message(); + return Status::OK(); + } + + if (after_part_num == before_part_num) { + return Status::OK(); + } + + WorkerCacheInterface* worker_cache; + WorkerCacheFactoryOptions worker_cache_factory_options(server_def_); + TF_RETURN_IF_ERROR( + WorkerCacheFactory(worker_cache_factory_options, &worker_cache)); + CHECK_NE(nullptr, worker_cache); + ConfigProto config = server_def_.default_session_config(); + string unused; + string default_worker_name; + if (!DeviceNameUtils::SplitDeviceName(master_env()->local_devices[0]->name(), + &default_worker_name, &unused)) { + return errors::Internal("Could not parse worker name."); + } + std::unique_ptr dev_resolver( + new DeviceResolverDistributed(worker_env()->device_mgr, worker_cache, + default_worker_name)); + std::unique_ptr param_resolver( + new CollectiveParamResolverDistributed(config, worker_env()->device_mgr, + dev_resolver.get(), worker_cache, + default_worker_name)); + worker_env()->collective_executor_mgr = new 
RpcCollectiveExecutorMgr( + config, worker_env()->device_mgr, std::move(dev_resolver), + std::move(param_resolver), worker_cache, default_worker_name); + + if (worker_env()->session_mgr != nullptr) { + delete worker_env()->session_mgr; // Deletes graph_mgr's. + } + + // Set up worker environment. + worker_env()->session_mgr = new SessionMgr( + worker_env(), SessionMgr::WorkerNameFromServerDef(server_def_), + std::unique_ptr(worker_cache), + [this](const ServerDef& server_def, WorkerCacheInterface** worker_cache) { + WorkerCacheFactoryOptions options(server_def); + return WorkerCacheFactory(options, worker_cache); + }); + master_env()->worker_cache = worker_cache; + // Finish setting up master environment. + + StatsPublisherFactory stats_factory = opts_.stats_factory; + master_env()->master_session_factory = + [config, stats_factory]( + SessionOptions options, const MasterEnv* env, + std::unique_ptr>> remote_devs, + std::unique_ptr worker_cache, + std::unique_ptr device_set, + std::vector filtered_worker_list) { + options.config.MergeFrom(config); + return new MasterSession(options, env, std::move(remote_devs), + std::move(worker_cache), std::move(device_set), + std::move(filtered_worker_list), + stats_factory); + }; + master_env()->worker_cache_factory = + [this](const WorkerCacheFactoryOptions& options, + WorkerCacheInterface** worker_cache) { + return WorkerCacheFactory(options, worker_cache); + }; + return Status::OK(); +} + +void ElasticGrpcServer::MaybeMutateBuilder(::grpc::ServerBuilder* builder) { + elastic_service_ = NewElasticGrpcService(this, builder); +} + +Status ElasticGrpcServer::Start() { + { + mutex_lock l(mu_); + switch (state_) { + case NEW: { + update_server_thread_.reset( + env_->StartThread(ThreadOptions(), "TF_elastic_service", + [this] { elastic_service_->HandleRPCsLoop(); })); + LOG(INFO) << "Started server with target: " << target(); + break; + } + case STARTED: + LOG(INFO) << "Server already started (target: " << target() << ")"; + return 
Status::OK(); + case STOPPED: + return errors::FailedPrecondition("Server has stopped."); + default: + LOG(FATAL); + } + } + return GrpcServer::Start(); +} + +Status ElasticGrpcServer::Join() { + GrpcServer::Join(); + mutex_lock l(mu_); + switch (state_) { + case NEW: + LOG(FATAL) << "Server shoud already closed"; + case STARTED: + case STOPPED: + update_server_thread_.reset(); + return Status::OK(); + default: + LOG(FATAL); + } +} + +/* static */ +Status ElasticGrpcServer::Create(const ServerDef& server_def, Env* env, + std::unique_ptr* out_server) { + std::unique_ptr ret( + new ElasticGrpcServer(server_def, env == nullptr ? Env::Default() : env)); + ServiceInitFunction service_func = nullptr; + GrpcServerOptions options; + options.rendezvous_mgr_func = NewRpcRendezvousMgr; + Status s = ret->Init(options); + if (!s.ok()) { + LOG(ERROR) << s; + return s; + } + *out_server = std::move(ret); + return Status::OK(); +} + +/* static */ +Status ElasticGrpcServer::Create(const ServerDef& server_def, Env* env, + std::unique_ptr* out_server) { + std::unique_ptr ret( + new ElasticGrpcServer(server_def, env == nullptr ? Env::Default() : env)); + GrpcServerOptions options; + options.rendezvous_mgr_func = NewRpcRendezvousMgr; + Status s = ret->Init(options); + if (!s.ok()) { + LOG(ERROR) << s; + return s; + } + *out_server = std::move(ret); + return Status::OK(); +} + +namespace { + +class ElasticGrpcServerFactory : public ServerFactory { + public: + bool AcceptsOptions(const ServerDef& server_def) override { + return server_def.protocol() == "elastic-grpc"; + } + + Status NewServer(const ServerDef& server_def, + std::unique_ptr* out_server) override { + return ElasticGrpcServer::Create(server_def, Env::Default(), out_server); + } +}; + +// Registers a `ServerFactory` for `ElasticGrpcServer` instances. 
+class ElasticGrpcServerRegistrar { + public: + ElasticGrpcServerRegistrar() { + gpr_allocation_functions alloc_fns; + memset(&alloc_fns, 0, sizeof(alloc_fns)); + alloc_fns.malloc_fn = port::Malloc; + alloc_fns.realloc_fn = port::Realloc; + alloc_fns.free_fn = port::Free; + gpr_set_allocation_functions(alloc_fns); + ServerFactory::Register("ELASTIC_GRPC_SERVER", new ElasticGrpcServerFactory()); + } +}; +static ElasticGrpcServerRegistrar registrar; + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h new file mode 100644 index 00000000000..8853ceb2819 --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h @@ -0,0 +1,66 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_GRPC_SERVER_LIB_H_ +#define TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_GRPC_SERVER_LIB_H_ + +#include + +#include "grpcpp/grpcpp.h" +#include "grpcpp/security/credentials.h" +#include "tensorflow/core/common_runtime/process_util.h" +#include "tensorflow/core/common_runtime/stats_publisher_interface.h" +#include "tensorflow/core/distributed_runtime/master_env.h" +#include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h" +#include "tensorflow/core/distributed_runtime/server_lib.h" +#include "tensorflow/core/distributed_runtime/session_mgr.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { + +class ElasticGrpcServer : public GrpcServer { + public: + ElasticGrpcServer(const ServerDef& server_def, Env* env); + + virtual ~ElasticGrpcServer() override; + + static Status Create(const ServerDef& server_def, Env* env, + std::unique_ptr* out_server); + static Status Create(const ServerDef& server_def, Env* env, + std::unique_ptr* out_server); + + Status Update(const string& cluster_def_str); + + void MaybeMutateBuilder(::grpc::ServerBuilder* builder) override; + + Status Start() override; + + Status Join() override; + + private: + Status UpdateServerDef(const string& cluster_def_str, int& before_part_num, int& after_part_num); + + private: + // TensorFlow Eager implementation, and RPC polling thread. 
+ AsyncServiceInterface* elastic_service_ = nullptr; + std::unique_ptr update_server_thread_ GUARDED_BY(mu_); + + std::unique_ptr<::grpc::Server> server_ GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_GRPC_SERVER_LIB_H_ \ No newline at end of file diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib_test.cc b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib_test.cc new file mode 100644 index 00000000000..e2db870a74a --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib_test.cc @@ -0,0 +1,77 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#include "tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +#include "gtest/gtest.h" + +namespace tensorflow { + +class ElasticGrpcServerTest : public ::testing::Test { + protected: + Status FillServerDef(const string& job_spec, ServerDef* options) { + options->set_protocol("elastic-grpc"); + options->set_job_name("chief"); + options->set_task_index(0); + + uint32 my_tasks_per_replica = 0; + for (const string& job_str : str_util::Split(job_spec, ',')) { + JobDef* job_def = options->mutable_cluster()->add_job(); + // Split each entry in the flag into 2 pieces, separated by "|". 
+ const std::vector job_pieces = str_util::Split(job_str, '|'); + CHECK_EQ(2, job_pieces.size()) << job_str; + job_def->set_name(job_pieces[0]); + // Does a bit more validation of the tasks_per_replica. + const StringPiece spec = job_pieces[1]; + // job_str is of form |. + const std::vector host_ports = str_util::Split(spec, ';'); + uint32 tasks_per_replica = host_ports.size(); + for (size_t i = 0; i < host_ports.size(); ++i) { + (*job_def->mutable_tasks())[i] = host_ports[i]; + } + if (job_def->name() == options->job_name()) { + my_tasks_per_replica = tasks_per_replica; + } + LOG(INFO) << "Peer " << job_def->name() << " " << tasks_per_replica << " {" + << absl::StrJoin(host_ports, ", ") << "}"; + } + if (my_tasks_per_replica == 0) { + return errors::InvalidArgument("Invalid job specification"); + } + return Status::OK(); + } +}; + +//Test Update Logic +TEST_F(ElasticGrpcServerTest, UpdateServer) { + Status s; + std::unique_ptr grpc_server; + ServerDef server_def; + std::string job_spec = "worker|localhost:2222,ps|localhost:10086;localhost:10087;localhost:10088,chief|localhost:2220"; + TF_ASSERT_OK(FillServerDef(job_spec, &server_def)); + s = ElasticGrpcServer::Create(server_def, Env::Default(), &grpc_server); + if (!s.ok()) { + LOG(ERROR) << "Could not create server: " << s.error_message(); + } + TF_ASSERT_OK(grpc_server->Start()); + // TF_QCHECK_OK(grpc_server->Join()); + LOG(INFO) << "SCALING DOWN"; + std::string tf_config_str = "{\"cluster\": {\"worker\": [\"localhost:2222\"],\"ps\": [\"localhost:10086\", \"localhost:10087\"],\"chief\": [\"localhost:2220\"]]}}"; + grpc_server->Update(tf_config_str); + LOG(INFO) << "SCALING UP"; + tf_config_str = "{\"cluster\": {\"worker\": [\"localhost:2222\"],\"ps\": [\"localhost:10086\", \"localhost:10087\", \"localhost:10088\"],\"chief\": [\"localhost:2220\"]]}}"; + grpc_server->Update(tf_config_str); + grpc_server.release(); +} + +} \ No newline at end of file diff --git 
a/tensorflow/contrib/elastic_grpc_server/elastic_service.cc b/tensorflow/contrib/elastic_grpc_server/elastic_service.cc new file mode 100644 index 00000000000..61aa6e662ec --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/elastic_service.cc @@ -0,0 +1,157 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#include "tensorflow/contrib/elastic_grpc_server/elastic_service.h" + +#include "tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h" +#include "tensorflow/core/protobuf/elastic_training.grpc.pb.h" +#include "tensorflow/core/protobuf/elastic_training.pb.h" +#include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h" + + +#include +#include +#include +#include +#include "grpcpp/server_builder.h" + +using namespace des; + +using grpc::Server; +using grpc::ServerAsyncResponseWriter; +using grpc::ServerBuilder; +using grpc::ServerCompletionQueue; +using grpc::ServerContext; + +namespace tensorflow { + +class GrpcElasticService : public AsyncServiceInterface { + public: + GrpcElasticService(ElasticGrpcServer* elastic_grpc_server, + ::grpc::ServerBuilder* builder): + elastic_grpc_server_(elastic_grpc_server), builder_(builder) { + builder_->RegisterService(&elastic_service_); + cq_ = builder_->AddCompletionQueue(); + } + + ~GrpcElasticService() override { } + + void Shutdown() override { + cq_->Shutdown(); + } + + void 
HandleRPCsLoop() override { + new CallData(&elastic_service_, elastic_grpc_server_, cq_.get()); + void* tag; + bool ok; + while (true) { + // Block waiting to read the next event from the completion queue. The + // event is uniquely identified by its tag, which in this case is the + // memory address of a CallData instance. + // The return value of Next should always be checked. This return value + // tells us whether there is any kind of event or cq_ is shutting down. + GPR_ASSERT(cq_->Next(&tag, &ok)); + GPR_ASSERT(ok); + static_cast(tag)->Proceed(); + } + } + + private: + // Class encompassing the state and logic needed to serve a request. + class CallData { + public: + // Take in the "service" instance (in this case representing an asynchronous + // server) and the completion queue "cq" used for asynchronous communication + // with the gRPC runtime. + CallData(ElasticTrainingService::AsyncService* service, ElasticGrpcServer* elastic_grpc_server, + ServerCompletionQueue* cq) + : service_(service), elastic_grpc_server_(elastic_grpc_server), + cq_(cq), responder_(&ctx_), status_(CREATE) { + // Invoke the serving logic right away. + Proceed(); + } + + void Proceed() { + if (status_ == CREATE) { + // Make this instance progress to the PROCESS state. + status_ = PROCESS; + + // As part of the initial CREATE state, we *request* that the system + // start processing UpdateServerDef requests. In this request, "this" acts as + // the tag uniquely identifying the request (so that different CallData + // instances can serve different requests concurrently), in this case + // the memory address of this CallData instance. + service_->RequestUpdateServerDef(&ctx_, &request_, &responder_, + cq_, cq_, this); + } else if (status_ == PROCESS) { + // Spawn a new CallData instance to serve new clients while we process + // the one for this CallData. The instance will deallocate itself as + // part of its FINISH state. 
+ new CallData(service_, elastic_grpc_server_, cq_); + + // The actual processing. + Status s = elastic_grpc_server_->Update(request_.cluster_def()); + if (s.ok()) { + reply_.set_code(Code::OK); + } else { + reply_.set_code(Code::INTERNAL); + reply_.set_msg(s.ToString()); + LOG(ERROR) << "error" << s.ToString(); + } + + // And we are done! Let the gRPC runtime know we've finished, using the + // memory address of this instance as the uniquely identifying tag for + // the event. + status_ = FINISH; + responder_.Finish(reply_, ::grpc::Status::OK, this); + } else { + GPR_ASSERT(status_ == FINISH); + // Once in the FINISH state, deallocate ourselves (CallData). + delete this; + } + } + private: + ElasticGrpcServer* elastic_grpc_server_; + // The means of communication with the gRPC runtime for an asynchronous + // server. + ElasticTrainingService::AsyncService* service_; + // The producer-consumer queue used for asynchronous server notifications. + ServerCompletionQueue* cq_; + // Context for the rpc, allowing us to tweak aspects of it such as the use + // of compression, authentication, as well as to send metadata back to the + // client. + ServerContext ctx_; + + // What we get from the client. + UpdateServerDefRequest request_; + // What we send back to the client. + UpdateServerDefResponse reply_; + + // The means to get back to the client. + ServerAsyncResponseWriter responder_; + + // Let's implement a tiny state machine with the following states. + enum CallStatus { CREATE, PROCESS, FINISH }; + CallStatus status_; // The current serving state. 
+ }; + + ElasticGrpcServer* elastic_grpc_server_; + ::grpc::ServerBuilder* builder_; + ElasticTrainingService::AsyncService elastic_service_; + std::unique_ptr<::grpc::ServerCompletionQueue> cq_; +}; + +AsyncServiceInterface* NewElasticGrpcService( + ElasticGrpcServer* elastic_grpc_server, ::grpc::ServerBuilder* builder) { + return reinterpret_cast(new GrpcElasticService(elastic_grpc_server, builder)); +} +} \ No newline at end of file diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_service.h b/tensorflow/contrib/elastic_grpc_server/elastic_service.h new file mode 100644 index 00000000000..9465a10c918 --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/elastic_service.h @@ -0,0 +1,31 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_SERVICE_H_ +#define TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_SERVICE_H_ + + +#include +#include "grpcpp/server_builder.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" +class ElasticGrpcServer; + +namespace tensorflow { + +class AsyncServiceInterface; +AsyncServiceInterface* NewElasticGrpcService( + ElasticGrpcServer* elastic_grpc_server, ::grpc::ServerBuilder* builder); + +} // namespace tensorflow + +#endif // TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_SERVICE_H_ \ No newline at end of file diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 95bbbab5624..0531200e7ab 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -139,6 +139,7 @@ load( "tf_lib_proto_parsing_deps", "tf_proto_library", "tf_proto_library_cc", + "tf_proto_library_py", "tf_protos_all", "tf_protos_all_impl", "tf_protos_grappler", @@ -2475,6 +2476,28 @@ tf_proto_library_cc( ], ) +tf_proto_library_cc( + name = "elastic_service_proto", + srcs = ["protobuf/elastic_training.proto"], + has_services = 1, + cc_api_version = 2, + cc_grpc_version = 1, + cc_stubby_versions = ["2"], + protodeps = tf_additional_all_protos(), + visibility = [ + "//tensorflow:internal", + ], +) + +tf_proto_library_py( + name = "elastic_service_pb", + srcs = ["protobuf/elastic_training.proto"], + use_grpc_plugin = True, + visibility = [ + "//tensorflow:internal", + ], +) + LIB_INTERNAL_PRIVATE_HEADERS = [ "framework/resource_handle.h", "//tensorflow/core/platform:legacy_lib_internal_headers", diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h index 521c8f206f8..79d6b0cd65e 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h @@ -127,14 +127,11 @@ class 
GrpcServer : public ServerInterface { const ServerDef& server_def() const { return server_def_; } GrpcWorker* worker_impl() const { return worker_impl_.get(); } - - private: - // The overall server configuration. - const ServerDef server_def_; + protected: + // The overall server configuration. It may be changed during scaling. + ServerDef server_def_; Env* env_; - - // The port to which this server is bound. - int bound_port_ = 0; + GrpcServerOptions opts_; // Guards state transitions. mutex mu_; @@ -151,6 +148,9 @@ class GrpcServer : public ServerInterface { enum State { NEW, STARTED, STOPPED }; State state_ GUARDED_BY(mu_); + private: + // The port to which this server is bound. + int bound_port_ = 0; // Implementation of a TensorFlow master, and RPC polling thread. MasterEnv master_env_; std::unique_ptr master_impl_; diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index 406285e7f0f..75d3c671562 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -769,6 +769,12 @@ def tf_additional_star_lib_defines(): "//conditions:default": [], }) +def tf_additional_elastic_server_lib_defines(): + return select({ + "//tensorflow:with_elastic_support": ["TENSORFLOW_USE_ELASTIC_SERVER"], + "//conditions:default": [], + }) + def tf_additional_api_compatible_defines(): return select({ "//tensorflow:with_api_compatible": ["TF_API_COMPATIBLE_1150"], diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl index 71651faf0b1..38191dea3c4 100644 --- a/tensorflow/core/platform/default/build_config_root.bzl +++ b/tensorflow/core/platform/default/build_config_root.bzl @@ -77,6 +77,14 @@ def tf_additional_star_deps(): "//conditions:default": [], }) +def tf_additional_elastic_deps(): + return select({ + str(Label("//tensorflow:with_elastic_support")): [ + 
str(Label("//tensorflow/contrib/elastic_grpc_server:elastic_grpc_server_lib")), + ], + "//conditions:default": [], + }) + # Include specific extra dependencies when building statically, or # another set of dependencies otherwise. If "macos" is provided, that # dependency list is used when using the framework_shared_object config diff --git a/tensorflow/core/protobuf/elastic_training.proto b/tensorflow/core/protobuf/elastic_training.proto new file mode 100644 index 00000000000..ee0d0bd10e0 --- /dev/null +++ b/tensorflow/core/protobuf/elastic_training.proto @@ -0,0 +1,76 @@ +syntax = "proto3"; + +package des; + +enum Code { + OK = 0; + CANCELLED = 1; + UNKNOWN = 2; + INVALID_ARGUMENT = 3; + DEADLINE_EXCEEDED = 4; + NOT_FOUND = 5; + ALREADY_EXISTS = 6; + PERMISSION_DENIED = 7; + RESOURCE_EXHAUSTED = 8; + FAILED_PRECONDITION = 9; + ABORTED = 10; + OUT_OF_RANGE = 11; + UNIMPLEMENTED = 12; + INTERNAL = 13; + UNAVAILABLE = 14; + DATA_LOSS = 15; + UNAUTHENTICATED = 16; + REQUEST_STOP = 17; +} + +enum ElasticTrainingState { + READY = 0; + SCALING = 1; + All_SESSION_CLOSED = 2; +} + +enum ScalingAction { + NONE = 0; + SCALING_UP = 1; + SCALING_DOWN = 2; +} + +message IsReadyScalingRequest { + int32 task_index = 1; +} + +message IsReadyScalingResponse { + Code code = 1; + string msg = 2; + ScalingAction scaling_action = 3; + int32 ps_num = 4; // updated ps_num; +} + +message ReadyToUpdateRequest {}; +message ReadyToUpdateResponse {}; + +message UpdateServerDefRequest { + string cluster_def = 1;//serialized cluster_def +} + +message UpdateServerDefResponse { + Code code = 1; + string msg = 2; +} + +message FetchParamsRequest { + repeated string names = 1; // vec of partitioned variables or ev +} + +message FetchParamsResponse { + Code code = 1; + string msg = 2; + map param_partition_map = 3; // per partition num of variable +} + +service ElasticTrainingService { + rpc IsReadyScaling(IsReadyScalingRequest) returns (IsReadyScalingResponse); + rpc 
ReadyToUpdate(ReadyToUpdateRequest) returns (ReadyToUpdateResponse); + rpc UpdateServerDef(UpdateServerDefRequest) returns (UpdateServerDefResponse); + rpc FetchParamsMeta(FetchParamsRequest) returns (FetchParamsResponse); +} \ No newline at end of file diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 68649078f5c..a740e0916d9 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -24,7 +24,7 @@ load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc") load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "cuda_py_tests") load("//tensorflow/core/platform:default/build_config.bzl", "pyx_library", "tf_additional_all_protos", "tf_additional_cupti_test_flags", "tf_additional_lib_deps", "tf_proto_library", "tf_proto_library_py", "tf_protos_grappler") # @unused -load("//tensorflow/core/platform:default/build_config_root.bzl", "if_static", "tf_additional_gdr_deps", "tf_additional_mpi_deps", "tf_additional_plugin_deps", "tf_additional_verbs_deps", "tf_additional_star_deps") +load("//tensorflow/core/platform:default/build_config_root.bzl", "if_static", "tf_additional_gdr_deps", "tf_additional_mpi_deps", "tf_additional_plugin_deps", "tf_additional_verbs_deps", "tf_additional_star_deps", "tf_additional_elastic_deps") load("//tensorflow/python:build_defs.bzl", "tf_gen_op_wrapper_private_py") load( "//third_party/ngraph:build_defs.bzl", @@ -5307,6 +5307,7 @@ tf_py_wrap_cc( tf_additional_verbs_deps() + tf_additional_mpi_deps() + tf_additional_gdr_deps() + + tf_additional_elastic_deps() + tf_additional_star_deps()) + if_ngraph([ "@ngraph_tf//:ngraph_tf", ]), From 2d31c8e37ea28d7c169879ebd9c3a89bd8d26cb5 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Thu, 26 Oct 2023 04:02:41 -0700 Subject: [PATCH 11/45] [Embedding] Add interface of EmbeddingVar for Elastic Training. 
(#933) Signed-off-by: JunqiHu --- configure.py | 2 +- tensorflow/contrib/elastic_grpc_server/BUILD | 3 +- tensorflow/core/BUILD | 5 +- .../framework/embedding/bloom_filter_policy.h | 2 +- .../embedding/counter_filter_policy.h | 2 +- .../framework/embedding/cpu_hash_map_kv.h | 22 +++++ .../framework/embedding/dense_hash_map_kv.h | 19 ++++ .../core/framework/embedding/embedding_var.h | 86 ++++++++++++++++++- .../embedding/embedding_var_ckpt_data.h | 1 - .../core/framework/embedding/filter_policy.h | 20 ++++- .../framework/embedding/gpu_hash_map_kv.h | 7 ++ .../core/framework/embedding/kv_interface.h | 5 ++ .../core/framework/embedding/leveldb_kv.h | 32 +++++++ .../framework/embedding/multi_tier_storage.h | 9 +- .../embedding/nullable_filter_policy.h | 2 +- .../framework/embedding/single_tier_storage.h | 13 ++- .../core/framework/embedding/ssd_hash_kv.h | 6 ++ tensorflow/core/framework/embedding/storage.h | 7 +- tensorflow/core/kernels/data/BUILD | 6 ++ tensorflow/core/kernels/data/iterator_ops.cc | 12 ++- tensorflow/python/ops/embedding_ops.py | 3 +- 21 files changed, 244 insertions(+), 20 deletions(-) diff --git a/configure.py b/configure.py index 6aeaf7d12af..4fb1c78c40b 100644 --- a/configure.py +++ b/configure.py @@ -1434,7 +1434,7 @@ def main(): True, 'star') set_build_var(environ_cp, 'TF_NEED_ELASTIC', 'ELASTIC TRAINING', 'with_elastic_support', - True, 'elastic') + False, 'elastic') set_build_var(environ_cp, 'TF_ENABLE_PMEM', 'PMEM', 'with_pmem_support', False, 'pmem') diff --git a/tensorflow/contrib/elastic_grpc_server/BUILD b/tensorflow/contrib/elastic_grpc_server/BUILD index ea4b87e3b58..16ec91f4435 100644 --- a/tensorflow/contrib/elastic_grpc_server/BUILD +++ b/tensorflow/contrib/elastic_grpc_server/BUILD @@ -56,7 +56,8 @@ cc_library( tf_cc_test( name = "elastic_grpc_test", size = "small", - srcs = ["elastic_grpc_server_lib_test.cc"], + srcs = select({"//tensorflow:with_elastic_support": ["elastic_grpc_server_lib_test.cc"], + "//conditions:default": []}), 
deps = [ ":elastic_grpc_server_lib", "//tensorflow/core/distributed_runtime/rpc:grpc_util", diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 0531200e7ab..ef1ebcb6dcf 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -128,6 +128,7 @@ load( "tf_additional_numa_deps", "tf_additional_numa_lib_defines", "tf_additional_star_lib_defines", + "tf_additional_elastic_server_lib_defines", "tf_additional_api_compatible_defines", "tf_additional_pmem_lib_defines", "tf_additional_test_deps", @@ -1441,6 +1442,7 @@ tf_cc_test( cc_library( name = "ops", visibility = ["//visibility:public"], + defines = tf_additional_elastic_server_lib_defines(), deps = [ ":array_ops_op_lib", ":parquet_ops_op_lib", @@ -2562,7 +2564,8 @@ LIB_INTERNAL_DEFINES = ( tf_additional_gdr_lib_defines() + tf_additional_numa_lib_defines() + tf_additional_star_lib_defines() + - tf_additional_pmem_lib_defines() + tf_additional_pmem_lib_defines() + + tf_additional_elastic_server_lib_defines() ) cc_library( diff --git a/tensorflow/core/framework/embedding/bloom_filter_policy.h b/tensorflow/core/framework/embedding/bloom_filter_policy.h index 781511578af..8019e70a312 100644 --- a/tensorflow/core/framework/embedding/bloom_filter_policy.h +++ b/tensorflow/core/framework/embedding/bloom_filter_policy.h @@ -333,7 +333,7 @@ class BloomFilterPolicy : public FilterPolicy { // this can describe by graph(Mod + DynamicPartition), // but memory waste and slow if (*(key_buff + i) % bucket_num % partition_num != partition_id) { - LOG(INFO) << "skip EV key:" << *(key_buff + i); + VLOG(1) << "skip EV key:" << *(key_buff + i); continue; } void* value_ptr = nullptr; diff --git a/tensorflow/core/framework/embedding/counter_filter_policy.h b/tensorflow/core/framework/embedding/counter_filter_policy.h index 19cd90ad01c..e53d574182c 100644 --- a/tensorflow/core/framework/embedding/counter_filter_policy.h +++ b/tensorflow/core/framework/embedding/counter_filter_policy.h @@ -159,7 +159,7 @@ class 
CounterFilterPolicy : public FilterPolicy { // this can describe by graph(Mod + DynamicPartition), // but memory waste and slow if (*(key_buff + i) % bucket_num % partition_num != partition_id) { - LOG(INFO) << "skip EV key:" << *(key_buff + i); + VLOG(1) << "skip EV key:" << *(key_buff + i); continue; } int64 import_freq = 0; diff --git a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h index 8476c399c40..750ba282285 100644 --- a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h @@ -137,6 +137,28 @@ class LocklessHashMap : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + std::pair *hash_map_dump; + int64 bucket_count; + auto it = hash_map_.GetSnapshot(); + hash_map_dump = it.first; + bucket_count = it.second; + for (int64 j = 0; j < bucket_count; j++) { + if (hash_map_dump[j].first != LocklessHashMap::EMPTY_KEY_ + && hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_ + && hash_map_dump[j].first % kSavedPartitionNum + % partition_nums != partition_id) { + key_list->emplace_back(hash_map_dump[j].first); + value_ptr_list->emplace_back(hash_map_dump[j].second); + } + } + + free(hash_map_dump); + return Status::OK(); + } + std::string DebugString() const override { LOG(INFO) << "map info size:" << Size() << "map info bucket_count:" << hash_map_.bucket_count() diff --git a/tensorflow/core/framework/embedding/dense_hash_map_kv.h b/tensorflow/core/framework/embedding/dense_hash_map_kv.h index ffaf2e335dc..8a27404b66f 100644 --- a/tensorflow/core/framework/embedding/dense_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/dense_hash_map_kv.h @@ -121,6 +121,25 @@ class DenseHashMap : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + 
int partition_id, int partition_nums) override { + dense_hash_map hash_map_dump[partition_num_]; + for (int i = 0; i< partition_num_; i++) { + spin_rd_lock l(hash_map_[i].mu); + hash_map_dump[i].hash_map = hash_map_[i].hash_map; + } + for (int i = 0; i< partition_num_; i++) { + for (const auto it : hash_map_dump[i].hash_map) { + if (it.first % kSavedPartitionNum % partition_nums != partition_id) { + key_list->push_back(it.first); + value_ptr_list->push_back(it.second); + } + } + } + return Status::OK(); + } + std::string DebugString() const override { return ""; } diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 487f595bf31..a66ec19fb97 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -435,6 +435,10 @@ class EmbeddingVar : public ResourceBase { return storage_->CacheSize(); } + int64 MemoryUsage() const { + return storage_->Size() * (sizeof(K) + feat_desc_->data_bytes()); + } + int64 MinFreq() { return emb_config_.filter_freq; } @@ -516,6 +520,85 @@ class EmbeddingVar : public ResourceBase { } } + Status GetShardedSnapshot(std::vector* key_list, + std::vector* value_ptr_list, + int partition_id, int partition_num) { + return storage_->GetShardedSnapshot(key_list, value_ptr_list, + partition_id, partition_num); + } + + void ExportAndRemove(K* key_list, V* value_list, + int64* version_list, int64* freq_list, + std::vector& tot_keys_list, + std::vector& tot_value_ptr_list) { + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar( + "TF_EV_SAVE_FILTERED_FEATURES", true, &save_unfiltered_features)); + + bool is_save_freq = emb_config_.is_save_freq(); + bool is_save_version = emb_config_.is_save_version(); + + for (int64 i = 0; i < tot_keys_list.size(); ++i) { + auto& value_ptr = tot_value_ptr_list[i]; + if((int64)value_ptr == embedding::ValuePtrStatus::IS_DELETED) + continue; + + bool is_admit = 
feat_desc_->IsAdmit(value_ptr); + bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); + + if (!is_admit) { + key_list[i] = tot_keys_list[i]; + + if (!is_in_dram) { + auto tmp_value = value_list + i * value_len_; + tmp_value = (V*)embedding::ValuePtrStatus::NOT_IN_DRAM; + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + } else if (feat_desc_->GetEmbedding(value_ptr, 0) == nullptr) { + memcpy(value_list + i * value_len_, default_value_, sizeof(V) * value_len_); + } else { + V* val = feat_desc_->GetEmbedding(value_ptr, emb_config_.emb_index); + memcpy(value_list + i * value_len_, val, sizeof(V) * value_len_); + } + + if(is_save_version) { + int64 dump_version = feat_desc_->GetVersion(value_ptr); + version_list[i] = dump_version; + } + + if(is_save_freq) { + int64 dump_freq = feat_desc_->GetFreq(value_ptr); + freq_list[i] = dump_freq; + } + } else { + if (!save_unfiltered_features) + return; + //TODO(JUNQI) : currently not export filtered keys + } + + if (emb_config_.is_primary()) { + Status s; + s = storage_->Remove(tot_keys_list[i]); + if (!s.ok()) { + LOG(ERROR) << "Remove keys error: " << s.error_message(); + } + feat_desc_->Deallocate(value_ptr); + } + } + } + + Status RestoreFromKeysAndValues(int64 key_num, int partition_id, + int partition_num, const K* key_list, + const V* value_list, const int64* version_list, + const int64* freq_list, + const Eigen::GpuDevice* device = nullptr) { + RestoreBuffer restore_buff((char*)key_list, (char*)value_list, + (char*)version_list, (char*)freq_list); + return storage_->RestoreFeatures(key_num, kSavedPartitionNum, + partition_id, partition_num, + value_len_, false/* is_filter*/, false/* is_incr*/, + emb_config_, device, filter_, restore_buff); + } + mutex* mu() { return &mu_; } @@ -537,6 +620,8 @@ class EmbeddingVar : public ResourceBase { } } + string Name() {return name_; } + V* GetDefaultValuePtr() { return default_value_; } @@ -645,7 +730,6 @@ class EmbeddingVar : public ResourceBase { 
GPUHashTable* HashTable() { return storage_->HashTable(); } - FilterPolicy>* GetFilter() const { return filter_; } diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h index 10bf0d0e43b..13072f9cdd1 100644 --- a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h @@ -20,7 +20,6 @@ limitations under the License. namespace tensorflow { class BundleWriter; namespace { - const int kSavedPartitionNum = 1000; const int kDramFlagOffset = 49; } diff --git a/tensorflow/core/framework/embedding/filter_policy.h b/tensorflow/core/framework/embedding/filter_policy.h index 256d3b044d4..c994829bafc 100644 --- a/tensorflow/core/framework/embedding/filter_policy.h +++ b/tensorflow/core/framework/embedding/filter_policy.h @@ -27,19 +27,31 @@ struct RestoreBuffer { char* value_buffer = nullptr; char* version_buffer = nullptr; char* freq_buffer = nullptr; + bool should_release = false; explicit RestoreBuffer(size_t buffer_size) { key_buffer = new char[buffer_size]; value_buffer = new char[buffer_size]; version_buffer = new char[buffer_size]; freq_buffer = new char[buffer_size]; + should_release = true; + } + + explicit RestoreBuffer(char* i_key_buffer, char* i_value_buffer, + char* i_version_buffer, char* i_freq_buffer) { + key_buffer = i_key_buffer; + value_buffer = i_value_buffer; + version_buffer = i_version_buffer; + freq_buffer = i_freq_buffer; } ~RestoreBuffer() { - delete []key_buffer; - delete []value_buffer; - delete []version_buffer; - delete []freq_buffer; + if (should_release) { + delete []key_buffer; + delete []value_buffer; + delete []version_buffer; + delete []freq_buffer; + } } }; diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h index fc4a2506313..e73839e3f76 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h +++ 
b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h @@ -252,6 +252,13 @@ class GPUHashMapKV : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + LOG(INFO) << "GPUHashMapKV do not support GetShardedSnapshot"; + return Status::OK(); + } + std::string DebugString() const override { return std::string(); } GPUHashTable* HashTable() override { return hash_table_; } diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index 3659187c825..dc603680138 100644 --- a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -23,6 +23,7 @@ limitations under the License. namespace tensorflow { namespace { const char* kInferenceMode = "INFERENCE_MODE"; +const int kSavedPartitionNum = 1000; } template @@ -89,6 +90,10 @@ class KVInterface { virtual Status GetSnapshot(std::vector* key_list, std::vector* value_ptr_list) = 0; + virtual Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) = 0; + virtual std::string DebugString() const = 0; virtual Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, diff --git a/tensorflow/core/framework/embedding/leveldb_kv.h b/tensorflow/core/framework/embedding/leveldb_kv.h index e488ab3776d..47c8a39dfbd 100644 --- a/tensorflow/core/framework/embedding/leveldb_kv.h +++ b/tensorflow/core/framework/embedding/leveldb_kv.h @@ -193,6 +193,38 @@ class LevelDBKV : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + ReadOptions options; + options.snapshot = db_->GetSnapshot(); + leveldb::Iterator* it = db_->NewIterator(options); + void* dram_value_ptr = feat_desc_->Allocate(); + for (it->SeekToFirst(); 
it->Valid(); it->Next()) { + K key; + memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); + if (key % kSavedPartitionNum % partition_nums == partition_id) continue; + key_list->emplace_back(key); + FeatureDescriptor hbm_feat_desc( + 1, 1, ev_allocator()/*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc.data_bytes()); + memcpy(dram_value_ptr, + it->value().ToString().data(), + feat_desc_->data_bytes()); + hbm_feat_desc.SetFreq( + value_ptr, feat_desc_->GetFreq(dram_value_ptr)); + hbm_feat_desc.UpdateVersion( + value_ptr, feat_desc_->GetVersion(dram_value_ptr)); + value_ptr_list->emplace_back(value_ptr); + } + delete it; + feat_desc_->Deallocate(dram_value_ptr); + return Status::OK(); + } + int64 Size() const override { return counter_->size(); } diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index 7955322aca6..f77fec8c85a 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -87,6 +87,14 @@ class MultiTierStorage : public Storage { Status GetSnapshot(std::vector* key_list, std::vector* value_ptr_list) override { LOG(FATAL)<<"Can't get snapshot of MultiTierStorage."; + return Status::OK(); + } + + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + LOG(FATAL)<<"Can't get sharded snapshot of MultiTierStorage."; + return Status::OK(); } void CopyEmbeddingsFromCPUToGPU( @@ -170,7 +178,6 @@ class MultiTierStorage : public Storage { }); } - protected: Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool is_incr, const EmbeddingConfig& emb_config, diff --git a/tensorflow/core/framework/embedding/nullable_filter_policy.h 
b/tensorflow/core/framework/embedding/nullable_filter_policy.h index 7e3ace0063d..55f718d7ca4 100644 --- a/tensorflow/core/framework/embedding/nullable_filter_policy.h +++ b/tensorflow/core/framework/embedding/nullable_filter_policy.h @@ -150,7 +150,7 @@ class NullableFilterPolicy : public FilterPolicy { // this can describe by graph(Mod + DynamicPartition), // but memory waste and slow if (*(key_buff + i) % bucket_num % partition_num != partition_id) { - LOG(INFO) << "skip EV key:" << *(key_buff + i); + VLOG(1) << "skip EV key:" << *(key_buff + i); continue; } int64 import_freq = 0; diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h index be08afd7f50..db96c807c5e 100644 --- a/tensorflow/core/framework/embedding/single_tier_storage.h +++ b/tensorflow/core/framework/embedding/single_tier_storage.h @@ -223,6 +223,14 @@ class SingleTierStorage : public Storage { return kv_->GetSnapshot(key_list, value_ptr_list); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + mutex_lock l(Storage::mu_); + return kv_->GetShardedSnapshot(key_list, value_ptr_list, + partition_id, partition_nums); + } + Status Save( const std::string& tensor_name, const std::string& prefix, @@ -286,7 +294,7 @@ class SingleTierStorage : public Storage { FeatureDescriptor* feature_descriptor() { return feat_desc_; } - protected: + virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool is_incr, const EmbeddingConfig& emb_config, @@ -298,7 +306,8 @@ class SingleTierStorage : public Storage { false/*to_dram*/, is_incr, restore_buff); return s; } - + + protected: virtual void Shrink(std::vector& key_list, std::vector& value_ptr_list, ShrinkArgs& shrink_args, diff --git a/tensorflow/core/framework/embedding/ssd_hash_kv.h 
b/tensorflow/core/framework/embedding/ssd_hash_kv.h index f51c6904a50..a56c9f73385 100644 --- a/tensorflow/core/framework/embedding/ssd_hash_kv.h +++ b/tensorflow/core/framework/embedding/ssd_hash_kv.h @@ -349,6 +349,12 @@ class SSDHashKV : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + return Status::OK(); + } + Status GetSnapshot( std::vector* key_list, std::vector* file_list) { diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index 1ffb435054b..a652de5fa5f 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -95,6 +95,9 @@ class Storage { virtual int64 Size(int level) const = 0; virtual Status GetSnapshot(std::vector* key_list, std::vector* value_ptr_list) = 0; + virtual Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) = 0; virtual Status Save( const string& tensor_name, const string& prefix, @@ -197,7 +200,6 @@ class Storage { int64 freq, int64 version, int emb_index) = 0; - protected: virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool is_incr, const EmbeddingConfig& emb_config, @@ -206,7 +208,8 @@ class Storage { RestoreBuffer& restore_buff) { return Status::OK(); } - + + protected: virtual Status RestoreSSD(int64 emb_index, int64 emb_slot_num, int64 value_len, const std::string& ssd_emb_file_name, diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 08445403b58..6878c5f8350 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -9,6 +9,11 @@ load( "transitive_hdrs", ) +load( + "//tensorflow/core/platform:default/build_config.bzl", + "tf_additional_elastic_server_lib_defines", +) 
+ package( default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 @@ -1119,6 +1124,7 @@ tf_kernel_library( name = "iterator_ops", srcs = ["iterator_ops.cc"], hdrs = ["iterator_ops.h"], + defines = tf_additional_elastic_server_lib_defines(), deps = [ ":captured_function", ":dataset_utils", diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index 08d9d936537..ed6b40a38a0 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -308,7 +308,11 @@ void IteratorHandleOp::Compute(OpKernelContext* context) LOCKS_EXCLUDED(mu_) { } ResourceMgr* mgr = context->resource_manager(); - OP_REQUIRES_OK(context, cinfo_.Init(mgr, def())); +#ifdef TENSORFLOW_USE_ELASTIC_SERVER + OP_REQUIRES_OK(context, cinfo_.Init(mgr, def(), true)); +#else + OP_REQUIRES_OK(context, cinfo_.Init(mgr, def(), false)); +#endif IteratorResource* resource; OP_REQUIRES_OK( @@ -783,7 +787,11 @@ class OneShotIteratorOp : public AsyncOpKernel { Status TryInit(OpKernelContext* ctx, IteratorResource** iterator, ContainerInfo* cinfo) { - TF_RETURN_IF_ERROR(cinfo->Init(ctx->resource_manager(), def())); +#ifdef TENSORFLOW_USE_ELASTIC_SERVER + TF_RETURN_IF_ERROR(cinfo->Init(ctx->resource_manager(), def(), true)); +#else + TF_RETURN_IF_ERROR(cinfo->Init(ctx->resource_manager(), def(), false)); +#endif FunctionLibraryRuntime* flr; std::unique_ptr flib_def(nullptr); diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py index cb2b7bb8154..e239c9ba8d5 100644 --- a/tensorflow/python/ops/embedding_ops.py +++ b/tensorflow/python/ops/embedding_ops.py @@ -44,6 +44,7 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import tf_export +SAVED_PARTITIONED_NUM = 1000 def _clip(params, ids, max_norm): """Helper function for _embedding_lookup_and_transform. 
@@ -216,7 +217,7 @@ def _embedding_lookup_and_transform(params, if isinstance(params[0], kv_variable_ops.EmbeddingVariable): new_ids = flat_ids - p_assignments = flat_ids % 1000 % np + p_assignments = flat_ids % SAVED_PARTITIONED_NUM % np elif partition_strategy == "mod": p_assignments = flat_ids % np new_ids = flat_ids // np From 89c7d63f50ed335ea14eb17f295b315a59e9f843 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 1 Nov 2023 19:48:48 +0800 Subject: [PATCH 12/45] [Runtime] Update log level in direct_session. (#935) Signed-off-by: candy.dc --- tensorflow/core/common_runtime/direct_session.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 9670e838f88..a3dd3eba2ed 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -2185,8 +2185,8 @@ Status DirectSession::GetOrCreateExecutors( auto insert_key_status = executors_.emplace(key, insert_result.first->second); *executors_and_keys = insert_result.first->second.get(); if (insert_key_status.second) { - LOG(INFO) << "Add new unsort key to executors_ map: " << executors_idx++ - << ", key: " << key << ", this: " << this; + VLOG(2) << "Add new unsort key to executors_ map: " << executors_idx++ + << ", key: " << key << ", this: " << this; } return Status::OK(); From c2e664aecaec18106350ec77dee946e45dbcf1fb Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 7 Nov 2023 19:10:14 -0800 Subject: [PATCH 13/45] [Embedding] Remove private header. 
(#943) Signed-off-by: JunqiHu --- tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h | 1 - tensorflow/core/framework/embedding/hbm_dram_storage.h | 1 - 2 files changed, 2 deletions(-) diff --git a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h index 1056f4bbd78..4bc3b7d3aa2 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h @@ -7,7 +7,6 @@ #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" #include "tensorflow/core/platform/stream_executor.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" namespace tensorflow { using se::DeviceMemoryBase; diff --git a/tensorflow/core/framework/embedding/hbm_dram_storage.h b/tensorflow/core/framework/embedding/hbm_dram_storage.h index d058d95f05b..15f6271fb4f 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_storage.h @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/hbm_storage_iterator.h" #include "tensorflow/core/framework/embedding/intra_thread_copy_id_allocator.h" #include "tensorflow/core/platform/stream_executor.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" namespace tensorflow { using se::DeviceMemoryBase; From fc4f9f5c48b3f84d1f945c6aa738253cac7acf95 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 7 Nov 2023 23:32:37 -0800 Subject: [PATCH 14/45] [Distributed] Fix ps address list sort by index. 
(#945) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- .../elastic_grpc_server_lib.cc | 17 +++++++++++------ .../elastic_grpc_server/elastic_service.cc | 2 +- tensorflow/core/protobuf/elastic_training.proto | 2 +- tensorflow/python/BUILD | 1 + 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc index d45d70d6c8c..66e237956e5 100644 --- a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc +++ b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include +#include #include "include/json/json.h" #include "grpc/support/alloc.h" #include "grpcpp/grpcpp.h" @@ -89,7 +90,7 @@ Status ElasticGrpcServer::UpdateServerDef(const string& cluster_def_str, int& be return errors::Internal("PARSE TF_CONFIG/cluster ERROR"); } - std::unordered_set ps_addrs_vec; + std::set ps_addrs_vec; //ordered after_part_num = cluster_json["cluster"]["ps"].size(); for (auto& value: cluster_json["cluster"]["ps"]) { ps_addrs_vec.emplace(value.asString()); @@ -111,21 +112,25 @@ Status ElasticGrpcServer::UpdateServerDef(const string& cluster_def_str, int& be } for (auto ps_addr: ps_addrs_vec) { if (target_string_set.find(ps_addr) == target_string_set.end()) { - job->mutable_tasks()->insert({idx, ps_addr}); + job->mutable_tasks()->insert({idx++, ps_addr}); tf_config_json["cluster"]["ps"].append(ps_addr); } } break; } else { LOG(INFO) << "SCALING DOWN, partition_num is: " << after_part_num; + google::protobuf::Map< google::protobuf::int32, std::string > tasks; + Json::Value arr_value(Json::arrayValue); + int idx = 0; for (int i = 0; i < before_part_num; ++i) { string tmp_string = tf_config_json["cluster"]["ps"][i].asString(); - if (ps_addrs_vec.find(tmp_string) == ps_addrs_vec.end()) { - Json::Value ps_addr; - 
tf_config_json["cluster"]["ps"].removeIndex(i, &ps_addr); - job->mutable_tasks()->erase(i); + if (ps_addrs_vec.find(tmp_string) != ps_addrs_vec.end()) { + arr_value.append(tf_config_json["cluster"]["ps"][i]); + tasks[idx++] = tmp_string; } } + tf_config_json["cluster"]["ps"].swap(arr_value); + job->mutable_tasks()->swap(tasks); } } } diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_service.cc b/tensorflow/contrib/elastic_grpc_server/elastic_service.cc index 61aa6e662ec..59f7fa473bd 100644 --- a/tensorflow/contrib/elastic_grpc_server/elastic_service.cc +++ b/tensorflow/contrib/elastic_grpc_server/elastic_service.cc @@ -24,7 +24,7 @@ limitations under the License. #include #include "grpcpp/server_builder.h" -using namespace des; +using namespace deeprec; using grpc::Server; using grpc::ServerAsyncResponseWriter; diff --git a/tensorflow/core/protobuf/elastic_training.proto b/tensorflow/core/protobuf/elastic_training.proto index ee0d0bd10e0..b6af4b139cf 100644 --- a/tensorflow/core/protobuf/elastic_training.proto +++ b/tensorflow/core/protobuf/elastic_training.proto @@ -1,6 +1,6 @@ syntax = "proto3"; -package des; +package deeprec; enum Code { OK = 0; diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index a740e0916d9..f9cc74743be 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -4747,6 +4747,7 @@ py_library( ":platform", ":protos_all_py", ":session_run_hook", + "//tensorflow/core:elastic_service_pb_py", ":training_util", ":util", ], From 29d9b464b55b571484ceae11947a6dfa25caba19 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Wed, 8 Nov 2023 19:17:25 -0800 Subject: [PATCH 15/45] [Op] Canonicalize SaveV2 Op device spec in distributed training. 
(#925) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- tensorflow/python/training/saver.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py index 981d01dd7be..acc9723c183 100644 --- a/tensorflow/python/training/saver.py +++ b/tensorflow/python/training/saver.py @@ -550,8 +550,12 @@ def _GroupByDevices(self, saveables): """ per_device = collections.defaultdict(lambda: []) for saveable in saveables: - canonical_device = set( - pydev.canonical_name(spec.tensor.device) for spec in saveable.specs) + canonical_device = set() + for spec in saveable.specs: + device_name = pydev.canonical_name(spec.tensor.device) + device_spec = pydev.DeviceSpec.from_string(device_name) + device_spec.device_type = "CPU" + canonical_device.add(device_spec.to_string()) if len(canonical_device) != 1: raise ValueError("All tensors of a saveable object must be " "on the same device: %s" % saveable.name) From feab52dd225b9838d41790f25abb0f2f0607b199 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 15 Nov 2023 10:24:34 +0800 Subject: [PATCH 16/45] [Embedding] Fix SharedEmbeddingColumn with PartitionedEmbedingVariable shape validation error. 
(#948) Signed-off-by: candy.dc --- .../python/feature_column/feature_column.py | 3 ++ .../feature_column/feature_column_v2_test.py | 35 +++++++++++++++++++ tensorflow/python/ops/variables.py | 3 ++ 3 files changed, 41 insertions(+) diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 3d5e7a71330..86a190cf86b 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -2675,6 +2675,9 @@ def create_embedding(self, embedding_weights = shared_embedding_collection[0] if isinstance(embedding_weights, kv_variable_ops.EmbeddingVariable): embedding_shape = (self.dimension) + elif isinstance(embedding_weights, variables.PartitionedVariable): + if isinstance(embedding_weights._get_variable_list()[0], kv_variable_ops.EmbeddingVariable): + embedding_shape = (self.dimension) if embedding_weights.get_shape() != embedding_shape: raise ValueError( 'Shared embedding collection {} contains variable {} of ' diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index ff5935b708f..7946aee1e1a 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -7705,6 +7705,41 @@ def testEmbeddingVariableForSharedEmbeddingColumnsMultiCol(self): for j in range(3): self.assertAlmostEqual(emb_r[i][j], emb_right[i][j]) + def testEmbeddingVariableForSharedPartitionedEmbeddingColumnsMultiCol(self): + columns_list=[] + columns_list.append(fc.categorical_column_with_embedding("col_emb", dtype=dtypes.string)) + columns_list.append(fc.categorical_column_with_embedding("col_emb2", dtype=dtypes.string)) + W = fc.shared_embedding_columns(columns_list, + dimension=3, + initializer=init_ops.ones_initializer(dtypes.float32), + shared_embedding_collection_name="xxxxx_shared") + + ids={} + ids["col_emb"] = 
sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0]], values=["aaaa","bbbbb","ccc","4nn","5b"], dense_shape=[5, 5]) + ids["col_emb2"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0]], values=["aaaa","bbbbb","ccc","4nn","5b"], dense_shape=[5, 5]) + with variable_scope.variable_scope("scope",partitioner=partitioned_variables.fixed_size_partitioner(4)): + emb = fc_old.input_layer(ids, W) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables_lib.global_variables_initializer() + + with self.test_session() as sess: + sess.run(init) + sess.run([emb, train_op,loss]) + sess.run([emb, train_op,loss]) + emb_r, _, _ = sess.run([emb, train_op,loss]) + emb_right = [[0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214]] + for i in range(5): + for j in range(3): + self.assertAlmostEqual(emb_r[i][j], emb_right[i][j]) + @test_util.run_deprecated_v1 def testEmbeddingVariableForSharedEmbeddingColumnsWithPartitionNum(self): columns_list=[] diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py index 6a3a1e0702b..8f92d091e68 100644 --- a/tensorflow/python/ops/variables.py +++ b/tensorflow/python/ops/variables.py @@ -3100,6 +3100,9 @@ def __init__(self, name, shape, dtype, variable_list, partitions): self._name = name self._shape = shape + from tensorflow.python.ops import kv_variable_ops + if isinstance(self._variable_list[0], kv_variable_ops.EmbeddingVariable): + self._shape = shape[1:] self._dtype = dtype self._partitions = partitions self._as_tensor = None From 37221b53ca3a90ea1a3f85cc787463fc3c9884fe Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 15 
Nov 2023 11:42:07 +0800 Subject: [PATCH 17/45] [Release] Update DeepRec release version to 1.15.5+deeprec2310. (#949) Signed-off-by: candy.dc --- tensorflow/tools/pip_package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index d5fa79bf2b1..e8635e1a298 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -47,7 +47,7 @@ # result for pip. # Also update tensorflow/tensorflow.bzl and # tensorflow/core/public/version.h -_VERSION = '1.15.5+deeprec2306' +_VERSION = '1.15.5+deeprec2310' REQUIRED_PACKAGES = [ 'absl-py >= 0.9.0', From 3bc98886262c496ffcacac54f02391c9818e75ae Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Thu, 16 Nov 2023 16:53:48 +0800 Subject: [PATCH 18/45] [Docs] Update deeprec2310 release images and notes in README.md & RELEASE.md. (#950) Signed-off-by: candy.dc --- README.md | 4 +- RELEASE.md | 41 +++++++++++++++++++ docs/docs_en/DeepRec-Compile-And-Install.md | 4 +- docs/docs_en/Estimator-Compile-And-Install.md | 2 +- docs/docs_en/TFServing-Compile-And-Install.md | 2 +- docs/docs_zh/DeepRec-Compile-And-Install.md | 4 +- docs/docs_zh/Estimator-Compile-And-Install.md | 2 +- docs/docs_zh/TFServing-Compile-And-Install.md | 2 +- 8 files changed, 51 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 53cca5c5c83..8f491e14665 100644 --- a/README.md +++ b/README.md @@ -95,13 +95,13 @@ $ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux #### Image for CPU ``` -alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 ``` #### Image for GPU CUDA11.6 ``` -alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 ``` *** diff --git a/RELEASE.md b/RELEASE.md index 43e03bc2b49..6b7e4a7fd79 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ 
-1,3 +1,44 @@ +# Release r1.15.5-deeprec2310 +## **Major Features and Improvements** + +### **Embedding** + +- Refactor the data structure of EmbeddingVariable. +- Add interface of EmbeddingVar for Elastic Training. +- Add GetSnapshot and Create API for EmbeddingVariable. +- Remove the dependency on private header file in EmbeddingVariable. + +### **Runtime Optimization** + +- Canonicalize SaveV2 Op device spec in distributed training. +- Update log level in direct_session. + +### **Distributed** + +- Add elastic-grpc server. + +### **BugFix** + +- Fix missing return value of RestoreSSD of DramSSDHashStorage. +- Fix incorrect frequency in shared-embedding. +- Fix set initialized flag too early in restore subgraph. +- Fix wgrad bug in Sparse Operation Kit. +- Fix hang bug for async embedding lookup. +- Fix ps address list sort by index. +- Fix SharedEmbeddingColumn with PartitionedEmbedingVariable shape validation error. + +More details of features: [https://deeprec.readthedocs.io/zh/latest/](url) + +## **Release Images** + +### **CPU Image** + +`alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04` + +### **GPU Image** + +`alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04` + # Release r1.15.5-deeprec2306 ## **Major Features and Improvements** diff --git a/docs/docs_en/DeepRec-Compile-And-Install.md b/docs/docs_en/DeepRec-Compile-And-Install.md index 83ba4854b9f..fdf3e295fdd 100644 --- a/docs/docs_en/DeepRec-Compile-And-Install.md +++ b/docs/docs_en/DeepRec-Compile-And-Install.md @@ -111,7 +111,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 ``` arm64: @@ -122,5 +122,5 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU Image with CUDA 11.6** ``` -alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 
+alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 ``` diff --git a/docs/docs_en/Estimator-Compile-And-Install.md b/docs/docs_en/Estimator-Compile-And-Install.md index 73b6a36f318..55f759a3c2a 100644 --- a/docs/docs_en/Estimator-Compile-And-Install.md +++ b/docs/docs_en/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ DeepRec provide new distributed protocols such as grpc++ and star_server, which Source Code: [https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -Develop Branch:master, Latest Release Branch: deeprec2306 +Develop Branch:master, Latest Release Branch: deeprec2310 ## Estimator Build diff --git a/docs/docs_en/TFServing-Compile-And-Install.md b/docs/docs_en/TFServing-Compile-And-Install.md index 346a848ca74..79a0944aa3e 100644 --- a/docs/docs_en/TFServing-Compile-And-Install.md +++ b/docs/docs_en/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ We provide optimized TFServing which could highly improve performance in inferen Source Code: [https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -Develop Branch: master, Latest Release Branch: deeprec2306 +Develop Branch: master, Latest Release Branch: deeprec2310 ## TFServing Build diff --git a/docs/docs_zh/DeepRec-Compile-And-Install.md b/docs/docs_zh/DeepRec-Compile-And-Install.md index 08d249f8eeb..ad8fd36dbf7 100644 --- a/docs/docs_zh/DeepRec-Compile-And-Install.md +++ b/docs/docs_zh/DeepRec-Compile-And-Install.md @@ -108,7 +108,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 ``` arm64: @@ -119,7 +119,7 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU CUDA11.6镜像** ``` -alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 ``` ## DeepRec Processor编译打包 diff --git 
a/docs/docs_zh/Estimator-Compile-And-Install.md b/docs/docs_zh/Estimator-Compile-And-Install.md index e5455aae91a..e54c8ddbd2f 100644 --- a/docs/docs_zh/Estimator-Compile-And-Install.md +++ b/docs/docs_zh/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ 代码库:[https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -开发分支:master,最新Release分支:deeprec2306 +开发分支:master,最新Release分支:deeprec2310 ## Estimator编译 diff --git a/docs/docs_zh/TFServing-Compile-And-Install.md b/docs/docs_zh/TFServing-Compile-And-Install.md index 0c76400e6c6..a43d2d517a6 100644 --- a/docs/docs_zh/TFServing-Compile-And-Install.md +++ b/docs/docs_zh/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ 代码库:[https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -开发分支:master,最新Release分支:deeprec2306 +开发分支:master,最新Release分支:deeprec2310 ## TFServing编译&打包 From d8149699bd8366ef7bb32ea049c4202b0c8d0c68 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:41:02 -0800 Subject: [PATCH 19/45] [ModelZoo] Set Saver's parameter sharded=True in distributed training. 
(#954) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- modelzoo/bst/train.py | 3 ++- modelzoo/dbmtl/train.py | 3 ++- modelzoo/dcn/train.py | 3 ++- modelzoo/dcnv2/train.py | 3 ++- modelzoo/deepfm/train.py | 3 ++- modelzoo/dien/train.py | 6 +++--- modelzoo/din/train.py | 6 +++--- modelzoo/dlrm/train.py | 3 ++- modelzoo/dssm/train.py | 3 ++- modelzoo/esmm/train.py | 5 +++-- modelzoo/masknet/train.py | 3 ++- modelzoo/mlperf/train.py | 3 ++- modelzoo/mmoe/train.py | 3 ++- modelzoo/ple/train.py | 3 ++- modelzoo/simple_multitask/train.py | 5 +++-- modelzoo/wide_and_deep/train.py | 3 ++- 16 files changed, 36 insertions(+), 22 deletions(-) diff --git a/modelzoo/bst/train.py b/modelzoo/bst/train.py index 2fb5e4e90f5..eeeb136678b 100644 --- a/modelzoo/bst/train.py +++ b/modelzoo/bst/train.py @@ -612,9 +612,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dbmtl/train.py b/modelzoo/dbmtl/train.py index 24595073b95..c848cbc76b2 100644 --- a/modelzoo/dbmtl/train.py +++ b/modelzoo/dbmtl/train.py @@ -527,9 +527,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dcn/train.py b/modelzoo/dcn/train.py index 
b8e1dba5d63..44701e22d9f 100644 --- a/modelzoo/dcn/train.py +++ b/modelzoo/dcn/train.py @@ -594,9 +594,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dcnv2/train.py b/modelzoo/dcnv2/train.py index 7ac4c1a0358..5b572af0425 100644 --- a/modelzoo/dcnv2/train.py +++ b/modelzoo/dcnv2/train.py @@ -610,9 +610,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/deepfm/train.py b/modelzoo/deepfm/train.py index 896295b0ae6..166bedec0d0 100644 --- a/modelzoo/deepfm/train.py +++ b/modelzoo/deepfm/train.py @@ -472,9 +472,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dien/train.py b/modelzoo/dien/train.py index 6c583c3ac19..190695f6ce0 100644 --- a/modelzoo/dien/train.py +++ b/modelzoo/dien/train.py @@ -776,10 +776,10 @@ def train(sess_config, hooks = 
[] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.tables_initializer(), - tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/din/train.py b/modelzoo/din/train.py index 6273e0d15a4..058583ce6fd 100644 --- a/modelzoo/din/train.py +++ b/modelzoo/din/train.py @@ -594,10 +594,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.tables_initializer(), - tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dlrm/train.py b/modelzoo/dlrm/train.py index 0789e9418b8..cc4c045c349 100644 --- a/modelzoo/dlrm/train.py +++ b/modelzoo/dlrm/train.py @@ -507,9 +507,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dssm/train.py b/modelzoo/dssm/train.py index a757851711c..db949aac5e8 100644 --- a/modelzoo/dssm/train.py +++ b/modelzoo/dssm/train.py @@ 
-478,9 +478,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/esmm/train.py b/modelzoo/esmm/train.py index 58219e19e3e..073b08814d4 100755 --- a/modelzoo/esmm/train.py +++ b/modelzoo/esmm/train.py @@ -534,9 +534,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.local_variables_initializer(), train_init_op), - saver=tf.train.Saver(max_to_keep=keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=train_steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/masknet/train.py b/modelzoo/masknet/train.py index 0790f200b21..bb96a467701 100644 --- a/modelzoo/masknet/train.py +++ b/modelzoo/masknet/train.py @@ -529,9 +529,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/mlperf/train.py b/modelzoo/mlperf/train.py index db7e077250b..ce34fe5e55c 100644 --- a/modelzoo/mlperf/train.py +++ b/modelzoo/mlperf/train.py @@ -522,9 +522,10 @@ def train(sess_config, hooks = [] 
hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/mmoe/train.py b/modelzoo/mmoe/train.py index 251e02c7a72..694eb45da80 100644 --- a/modelzoo/mmoe/train.py +++ b/modelzoo/mmoe/train.py @@ -523,9 +523,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/ple/train.py b/modelzoo/ple/train.py index 2ba98363bbf..b2d2f2057ec 100644 --- a/modelzoo/ple/train.py +++ b/modelzoo/ple/train.py @@ -592,9 +592,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/simple_multitask/train.py b/modelzoo/simple_multitask/train.py index ff90946c96d..4ef1874a521 100644 --- a/modelzoo/simple_multitask/train.py +++ b/modelzoo/simple_multitask/train.py @@ -427,9 +427,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - 
local_init_op=tf.group(tf.local_variables_initializer(), train_init_op), - saver=tf.train.Saver(max_to_keep=keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=train_steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/wide_and_deep/train.py b/modelzoo/wide_and_deep/train.py index b4f4dbc7a65..3024f58024e 100644 --- a/modelzoo/wide_and_deep/train.py +++ b/modelzoo/wide_and_deep/train.py @@ -543,9 +543,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( From 7ce84779b69d746111db5934bc90b94fc3ada6fa Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 5 Dec 2023 00:51:05 -0800 Subject: [PATCH 20/45] [Embedding] Refine KVInterface::GetShardedSnapshot API. 
(#953) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- .../core/framework/embedding/cpu_hash_map_kv.h | 14 ++++++++------ .../core/framework/embedding/dense_hash_map_kv.h | 10 ++++++---- .../core/framework/embedding/embedding_var.h | 9 +++++---- .../core/framework/embedding/gpu_hash_map_kv.h | 3 ++- tensorflow/core/framework/embedding/kv_interface.h | 3 ++- tensorflow/core/framework/embedding/leveldb_kv.h | 10 ++++++---- .../core/framework/embedding/multi_tier_storage.h | 3 ++- .../core/framework/embedding/single_tier_storage.h | 3 ++- tensorflow/core/framework/embedding/ssd_hash_kv.h | 3 ++- tensorflow/core/framework/embedding/storage.h | 3 ++- 10 files changed, 37 insertions(+), 24 deletions(-) diff --git a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h index 750ba282285..f9a6e1fff25 100644 --- a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h @@ -138,7 +138,8 @@ class LocklessHashMap : public KVInterface { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { std::pair *hash_map_dump; int64 bucket_count; @@ -147,11 +148,12 @@ class LocklessHashMap : public KVInterface { bucket_count = it.second; for (int64 j = 0; j < bucket_count; j++) { if (hash_map_dump[j].first != LocklessHashMap::EMPTY_KEY_ - && hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_ - && hash_map_dump[j].first % kSavedPartitionNum - % partition_nums != partition_id) { - key_list->emplace_back(hash_map_dump[j].first); - value_ptr_list->emplace_back(hash_map_dump[j].second); + && hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_) { + int part_id = hash_map_dump[j].first % kSavedPartitionNum % partition_nums; + if (part_id != partition_id) { + 
key_list[part_id].emplace_back(hash_map_dump[j].first); + value_ptr_list[part_id].emplace_back(hash_map_dump[j].second); + } } } diff --git a/tensorflow/core/framework/embedding/dense_hash_map_kv.h b/tensorflow/core/framework/embedding/dense_hash_map_kv.h index 8a27404b66f..12749a92e6e 100644 --- a/tensorflow/core/framework/embedding/dense_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/dense_hash_map_kv.h @@ -122,7 +122,8 @@ class DenseHashMap : public KVInterface { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { dense_hash_map hash_map_dump[partition_num_]; for (int i = 0; i< partition_num_; i++) { @@ -131,9 +132,10 @@ class DenseHashMap : public KVInterface { } for (int i = 0; i< partition_num_; i++) { for (const auto it : hash_map_dump[i].hash_map) { - if (it.first % kSavedPartitionNum % partition_nums != partition_id) { - key_list->push_back(it.first); - value_ptr_list->push_back(it.second); + int part_id = it.first % kSavedPartitionNum % partition_nums; + if (part_id != partition_id) { + key_list[part_id].emplace_back(it.first); + value_ptr_list[part_id].emplace_back(it.second); } } } diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index a66ec19fb97..df6ae6f1277 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -520,8 +520,8 @@ class EmbeddingVar : public ResourceBase { } } - Status GetShardedSnapshot(std::vector* key_list, - std::vector* value_ptr_list, + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_num) { return storage_->GetShardedSnapshot(key_list, value_ptr_list, partition_id, partition_num); @@ -546,7 +546,7 @@ class EmbeddingVar : public ResourceBase { bool is_admit = 
feat_desc_->IsAdmit(value_ptr); bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); - if (!is_admit) { + if (is_admit) { key_list[i] = tot_keys_list[i]; if (!is_in_dram) { @@ -571,7 +571,7 @@ class EmbeddingVar : public ResourceBase { } } else { if (!save_unfiltered_features) - return; + continue; //TODO(JUNQI) : currently not export filtered keys } @@ -584,6 +584,7 @@ class EmbeddingVar : public ResourceBase { feat_desc_->Deallocate(value_ptr); } } + return; } Status RestoreFromKeysAndValues(int64 key_num, int partition_id, diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h index e73839e3f76..68fecf690ba 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h @@ -253,7 +253,8 @@ class GPUHashMapKV : public KVInterface { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { LOG(INFO) << "GPUHashMapKV do not support GetShardedSnapshot"; return Status::OK(); diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index dc603680138..8480132a7d9 100644 --- a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -91,7 +91,8 @@ class KVInterface { std::vector* value_ptr_list) = 0; virtual Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) = 0; virtual std::string DebugString() const = 0; diff --git a/tensorflow/core/framework/embedding/leveldb_kv.h b/tensorflow/core/framework/embedding/leveldb_kv.h index 47c8a39dfbd..030a0969e5d 100644 --- a/tensorflow/core/framework/embedding/leveldb_kv.h +++ b/tensorflow/core/framework/embedding/leveldb_kv.h @@ 
-194,7 +194,8 @@ class LevelDBKV : public KVInterface { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { ReadOptions options; options.snapshot = db_->GetSnapshot(); @@ -203,8 +204,9 @@ class LevelDBKV : public KVInterface { for (it->SeekToFirst(); it->Valid(); it->Next()) { K key; memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); - if (key % kSavedPartitionNum % partition_nums == partition_id) continue; - key_list->emplace_back(key); + int part_id = key % kSavedPartitionNum % partition_nums; + if (part_id == partition_id) continue; + key_list[part_id].emplace_back(key); FeatureDescriptor hbm_feat_desc( 1, 1, ev_allocator()/*useless*/, StorageType::HBM_DRAM, true, true, @@ -218,7 +220,7 @@ class LevelDBKV : public KVInterface { value_ptr, feat_desc_->GetFreq(dram_value_ptr)); hbm_feat_desc.UpdateVersion( value_ptr, feat_desc_->GetVersion(dram_value_ptr)); - value_ptr_list->emplace_back(value_ptr); + value_ptr_list[part_id].emplace_back(value_ptr); } delete it; feat_desc_->Deallocate(dram_value_ptr); diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index f77fec8c85a..e27521f1a65 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -91,7 +91,8 @@ class MultiTierStorage : public Storage { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { LOG(FATAL)<<"Can't get sharded snapshot of MultiTierStorage."; return Status::OK(); diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h index db96c807c5e..1c6bdd90790 100644 --- 
a/tensorflow/core/framework/embedding/single_tier_storage.h +++ b/tensorflow/core/framework/embedding/single_tier_storage.h @@ -224,7 +224,8 @@ class SingleTierStorage : public Storage { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { mutex_lock l(Storage::mu_); return kv_->GetShardedSnapshot(key_list, value_ptr_list, diff --git a/tensorflow/core/framework/embedding/ssd_hash_kv.h b/tensorflow/core/framework/embedding/ssd_hash_kv.h index a56c9f73385..bdc38cc5d5e 100644 --- a/tensorflow/core/framework/embedding/ssd_hash_kv.h +++ b/tensorflow/core/framework/embedding/ssd_hash_kv.h @@ -350,7 +350,8 @@ class SSDHashKV : public KVInterface { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { return Status::OK(); } diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index a652de5fa5f..559588af7e1 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -96,7 +96,8 @@ class Storage { virtual Status GetSnapshot(std::vector* key_list, std::vector* value_ptr_list) = 0; virtual Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) = 0; virtual Status Save( const string& tensor_name, From a5c014f144f00b5d5606ffa1e47bda0c8e0a2478 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Sun, 10 Dec 2023 22:07:29 +0800 Subject: [PATCH 21/45] [IO] Fix tensor shape meta-data bug for DataFrame Value. (#958) * Revert "[IO] Add tensor shape meta-data support for ParquetDataset. (#849)" * [IO] Fix tensor shape meta-data bug for DataFrame Value. 
Signed-off-by: chenbangduo.cbd --- .../python/data/experimental/ops/dataframe.py | 26 ++++++++--------- .../experimental/ops/parquet_dataset_ops.py | 28 +++++++++++-------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/dataframe.py b/tensorflow/python/data/experimental/ops/dataframe.py index f3dc249653a..003f75259f1 100644 --- a/tensorflow/python/data/experimental/ops/dataframe.py +++ b/tensorflow/python/data/experimental/ops/dataframe.py @@ -59,17 +59,14 @@ def __init__(self, name, dtype=None, ragged_rank=None, shape=None): self._ragged_rank = ragged_rank if shape: shape = tensor_shape.TensorShape(shape) - shape_rank = 0 - for _ in shape: - shape_rank += 1 - if ragged_rank is not None and ragged_rank != shape_rank: + for d in shape: + if d.value is None: + raise ValueError( + f'Field {name} has incomplete shape: {shape}') + if ragged_rank is not None and ragged_rank > 1: raise ValueError( f'Field {name} is a nested list ({ragged_rank}) ' f'with shape {shape}') - self._ragged_rank = shape_rank - elif ragged_rank is not None: - shape = tensor_shape.TensorShape([None for _ in xrange(ragged_rank)]) - self._shape = shape @property @@ -134,16 +131,17 @@ def output_classes(self): def output_types(self): return self.map(lambda i: self._dtype if i == 0 else dtypes.int32) - def output_shapes(self, batch_size=None): + @property + def output_shapes(self): if self._shape is None: - return self.map(lambda i: tensor_shape.vector(batch_size) if i == 0 - else tensor_shape.vector(None)) + return self.map(lambda _: tensor_shape.vector(None)) return self.map( - lambda i: tensor_shape.vector(batch_size).concatenate(self._shape) if i == 0 + lambda i: tensor_shape.vector(None).concatenate(self._shape) if i == 0 else tensor_shape.vector(None)) - def output_specs(self, batch_size=None): - shape = tensor_shape.vector(batch_size) + @property + def output_specs(self): + shape = tensor_shape.vector(None) if self._shape is not None: 
shape = shape.concatenate(self._shape) specs = [tensor_spec.TensorSpec(shape, dtype=self._dtype)] diff --git a/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py b/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py index 719940d1beb..5bb790c331d 100644 --- a/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py +++ b/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py @@ -22,6 +22,7 @@ from tensorflow.python.data.ops import readers from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import type_spec from tensorflow.python.util import nest @@ -38,25 +39,23 @@ class DataFrameValueSpec(type_spec.BatchableTypeSpec): def value_type(self): return DataFrame.Value if self._ragged_rank > 0 else ops.Tensor - def __init__(self, field, batch_size=None): + def __init__(self, field): """Constructs a type specification for a `tf.RaggedTensor`. Args: field: The field definition. - batch_size: The batch_size of DataFrame. 
""" if field.incomplete: raise ValueError( f'Field {field} is incomplete, please specify dtype and ragged_rank') self._field = field - self._batch_size = batch_size def _serialize(self): return (self._field.dtype, self._field.ragged_rank) @property def _component_specs(self): - return self._field.output_specs(self._batch_size) + return self._field.output_specs def _to_components(self, value): if isinstance(value, DataFrame.Value): @@ -80,7 +79,7 @@ def _to_legacy_output_types(self): return self._field.output_types def _to_legacy_output_shapes(self): - return self._field.output_shapes(self._batch_size) + return self._field.output_shapes def _to_legacy_output_classes(self): return self._field.output_classes @@ -110,13 +109,18 @@ def __init__( self._batch_size = ops.convert_to_tensor( batch_size, dtype=dtypes.int64, name='batch_size') self._fields = fields - self._output_specs = { - f.name: ( - DataFrameValueSpec(f, batch_size if drop_remainder else None) - if f.ragged_rank > 0 - else tensor_spec.TensorSpec( - shape=[batch_size if drop_remainder else None], dtype=f.dtype)) - for f in self._fields} + self._output_specs = {} + for f in self._fields: + item = None + if f.ragged_rank > 0: + item = DataFrameValueSpec(f) + else: + shape = tensor_shape.vector(batch_size if drop_remainder else None) + if f.shape: + shape = shape.concatenate(f.shape) + item = tensor_spec.TensorSpec(shape=shape, dtype=f.dtype) + self._output_specs[f.name] = item + self._field_names = nest.flatten({f.name: f.name for f in self._fields}) self._field_dtypes = nest.flatten({f.name: f.dtype for f in self._fields}) self._field_ragged_ranks = nest.flatten( From 717f7c5e0840566c39739c321de024a88ddcc84f Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Wed, 13 Dec 2023 16:16:52 +0800 Subject: [PATCH 22/45] [Op] Implement of SliceSend/SliceRecv Op. 
(#947) Signed-off-by: chenbangduo.cbd --- tensorflow/core/BUILD | 2 + tensorflow/core/framework/rendezvous.h | 2 + tensorflow/core/graph/graph.cc | 2 + tensorflow/core/graph/graph.h | 12 +- tensorflow/core/grappler/op_types.cc | 8 +- tensorflow/core/grappler/op_types.h | 2 + tensorflow/core/kernels/BUILD | 27 +- tensorflow/core/kernels/slice_sendrecv_ops.cc | 562 ++++++++++++++++++ tensorflow/core/kernels/slice_sendrecv_ops.h | 89 +++ .../core/kernels/slice_sendrecv_ops_test.cc | 339 +++++++++++ tensorflow/core/ops/slice_sendrecv_ops.cc | 78 +++ 11 files changed, 1118 insertions(+), 5 deletions(-) create mode 100644 tensorflow/core/kernels/slice_sendrecv_ops.cc create mode 100644 tensorflow/core/kernels/slice_sendrecv_ops.h create mode 100644 tensorflow/core/kernels/slice_sendrecv_ops_test.cc create mode 100644 tensorflow/core/ops/slice_sendrecv_ops.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index ef1ebcb6dcf..ce6850eb9da 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1237,6 +1237,7 @@ tf_gen_op_libs( "set_ops", "script_ops", "sendrecv_ops", + "slice_sendrecv_ops", "sparse_ops", "spectral_ops", "state_ops", @@ -1497,6 +1498,7 @@ cc_library( ":sdca_ops_op_lib", ":sendrecv_ops_op_lib", ":set_ops_op_lib", + ":slice_sendrecv_ops_op_lib", ":sparse_ops_op_lib", ":star_run_graph_op_op_lib", ":summary_ops_op_lib", diff --git a/tensorflow/core/framework/rendezvous.h b/tensorflow/core/framework/rendezvous.h index 255c0326e02..3c2b20379c8 100644 --- a/tensorflow/core/framework/rendezvous.h +++ b/tensorflow/core/framework/rendezvous.h @@ -80,6 +80,8 @@ class Rendezvous : public core::RefCounted { friend class SendOp; friend class RecvOp; friend class FuseRecvOp; + friend class SliceSendOp; + friend class SliceRecvOp; friend class RefSendOp; friend class RefRecvOp; string buf_; diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index 8ba5d345837..d9709d39f3f 100644 --- a/tensorflow/core/graph/graph.cc +++ 
b/tensorflow/core/graph/graph.cc @@ -69,11 +69,13 @@ const std::unordered_map& Node::kNodeClassTable = {"_Send", NC_SEND}, {"_HostSend", NC_HOST_SEND}, {"_RefSend", NC_REF_SEND}, + {"_SliceSend", NC_SLICE_SEND}, {"_Recv", NC_RECV}, {"_HostRecv", NC_HOST_RECV}, {"_RefRecv", NC_REF_RECV}, {"_FuseRecv", NC_FUSE_RECV}, {"_HostFuseRecv", NC_HOST_FUSE_RECV}, + {"_SliceRecv", NC_SLICE_RECV}, {"Const", NC_CONSTANT}, {"HostConst", NC_CONSTANT}, {"Variable", NC_VARIABLE}, diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 0e7e032c9a5..0baf8f257a9 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -219,12 +219,16 @@ class Node { bool IsControlTrigger() const { return class_ == NC_CONTROL_TRIGGER; } bool IsSend() const { return class_ == NC_SEND || class_ == NC_HOST_SEND || - class_ == NC_REF_SEND; } + class_ == NC_REF_SEND || + class_ == NC_SLICE_SEND; } + bool IsSliceSend() const { return class_ == NC_SLICE_SEND; } bool IsRecv() const { return class_ == NC_RECV || class_ == NC_HOST_RECV || - class_ == NC_REF_RECV; } + class_ == NC_REF_RECV || + class_ == NC_SLICE_RECV; } bool IsFuseRecv() const { return class_ == NC_FUSE_RECV || class_ == NC_HOST_FUSE_RECV; } + bool IsSliceRecv() const {return class_ == NC_SLICE_RECV; } bool IsConstant() const { return class_ == NC_CONSTANT; } bool IsStage() const { return class_ == NC_TENSOR_BUFFER_PUT; } bool IsUnstage() const { return class_ == NC_TENSOR_BUFFER_TAKE; } @@ -334,11 +338,13 @@ class Node { NC_SEND, NC_HOST_SEND, NC_REF_SEND, + NC_SLICE_SEND, NC_RECV, NC_HOST_RECV, NC_REF_RECV, NC_FUSE_RECV, NC_HOST_FUSE_RECV, + NC_SLICE_RECV, NC_CONSTANT, NC_VARIABLE, NC_KV_VAR_HANDLE, @@ -844,7 +850,9 @@ inline bool IsNextIteration(const Node* n) { return n->IsNextIteration(); } inline bool IsLoopCond(const Node* node) { return node->IsLoopCond(); } inline bool IsControlTrigger(const Node* n) { return n->IsControlTrigger(); } inline bool IsSend(const Node* node) { return 
node->IsSend(); } +inline bool IsSliceSend(const Node* node) { return node->IsSliceSend(); } inline bool IsRecv(const Node* node) { return node->IsRecv(); } +inline bool IsSliceRecv(const Node* node) { return node->IsSliceRecv(); } inline bool IsFuseRecv(const Node* node) { return node->IsFuseRecv(); } inline bool IsHostSend(const Node* node) { return node->IsHostSend(); } inline bool IsHostRecv(const Node* node) { return node->IsHostRecv(); } diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc index a3a521fa123..1201623ffcd 100644 --- a/tensorflow/core/grappler/op_types.cc +++ b/tensorflow/core/grappler/op_types.cc @@ -454,7 +454,7 @@ bool IsReciprocalGrad(const NodeDef& node) { } bool IsRecv(const NodeDef& node) { - return node.op() == "_Recv" || node.op() == "_HostRecv"; + return node.op() == "_Recv" || node.op() == "_HostRecv" || IsSliceRecv(node); } bool IsFuseRecv(const NodeDef& node) { @@ -502,7 +502,7 @@ bool IsSelect(const NodeDef& node) { return node.op() == "Select"; } bool IsSeluGrad(const NodeDef& node) { return node.op() == "SeluGrad"; } bool IsSend(const NodeDef& node) { - return node.op() == "_Send" || node.op() == "_HostSend"; + return node.op() == "_Send" || node.op() == "_HostSend" || IsSliceSend(node); } bool IsShape(const NodeDef& node) { return node.op() == "Shape"; } @@ -517,6 +517,10 @@ bool IsSize(const NodeDef& node) { return node.op() == "Size"; } bool IsSlice(const NodeDef& node) { return node.op() == "Slice"; } +bool IsSliceRecv(const NodeDef& node) { return node.op() == "_SliceRecv"; } + +bool IsSliceSend(const NodeDef& node) { return node.op() == "_SliceSend"; } + bool IsSnapshot(const NodeDef& node) { return node.op() == "Snapshot"; } bool IsSoftmax(const NodeDef& node) { return node.op() == "Softmax"; } diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h index 19699ccb933..737581fd412 100644 --- a/tensorflow/core/grappler/op_types.h +++ 
b/tensorflow/core/grappler/op_types.h @@ -167,6 +167,8 @@ bool IsShuffle(const NodeDef& node); bool IsSigmoidGrad(const NodeDef& node); bool IsSize(const NodeDef& node); bool IsSlice(const NodeDef& node); +bool IsSliceRecv(const NodeDef& node); +bool IsSliceSend(const NodeDef& node); bool IsSnapshot(const NodeDef& node); bool IsSoftmax(const NodeDef& node); bool IsSoftplusGrad(const NodeDef& node); diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 0c08c30c30a..36721527cc2 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -5423,8 +5423,9 @@ cc_library( name = "required", deps = [ ":no_op", - ":sendrecv_ops", ":fuserecv_ops", + ":sendrecv_ops", + ":slice_sendrecv_ops", ], ) @@ -5445,6 +5446,12 @@ tf_kernel_library( deps = REQUIRED_DEPS, ) +tf_kernel_library( + name = "slice_sendrecv_ops", + prefix = "slice_sendrecv_ops", + deps = REQUIRED_DEPS, +) + tf_kernel_library( name = "group_embedding_ops", hdrs = ["group_embedding/group_embedding_lookup_sparse_forward_base_ops.h"], @@ -5509,6 +5516,24 @@ tf_cc_test( ], ) +tf_cc_test( + name = "slice_sendrecv_ops_test", + srcs = ["slice_sendrecv_ops_test.cc"], + linkstatic = tf_kernel_tests_linkstatic(), # Required for benchmarking + deps = [ + ":control_flow_ops", + ":cwise_op", + ":logging_ops", + ":ops_testutil", + ":ops_util", + ":slice_sendrecv_ops", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_kernel_library( name = "fuserecv_ops", prefix = "fuserecv_ops", diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.cc b/tensorflow/core/kernels/slice_sendrecv_ops.cc new file mode 100644 index 00000000000..f09f314ae10 --- /dev/null +++ b/tensorflow/core/kernels/slice_sendrecv_ops.cc @@ -0,0 +1,562 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/kernels/slice_sendrecv_ops.h" + +namespace tensorflow { + +//------------------------------------------------------------------------------ +// Utils. +static string GetSliceRendezvousKeyPrefix(const string& send_device, + const string& recv_device, + const uint64 send_device_incarnation, + const string& tensor_name) { + return strings::StrCat(send_device, ";", + strings::FpToString(send_device_incarnation), ";", + recv_device, ";", tensor_name); +} + +static void GetSliceRendezvousKey(const string& key_prefix, + const string& tensor_name_suffix, + const FrameAndIter& frame_iter, string* key) { + key->clear(); + strings::StrAppend(key, key_prefix, tensor_name_suffix, ";", + frame_iter.frame_id, ":", frame_iter.iter_id); +} + +static FrameAndIter GetFrameAndIter(OpKernelContext* ctx, + bool hostmem_sendrecv) { + if (hostmem_sendrecv && ctx->call_frame() != nullptr) { + // Host memory send/recv pairs are added by + // common_runtime/memory_types.cc. When the pair of nodes are + // added inside a function, we need to use the function call frame + // to formulate the unique rendezvous key. + return FrameAndIter(reinterpret_cast(ctx->call_frame()), 0); + } else { + return ctx->frame_iter(); + } +} + +//------------------------------------------------------------------------------ +// Functions of SliceSendOp. 
+ +SliceSendOp::SliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + string send_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device)); + string recv_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device)); + uint64 send_device_incarnation; + OP_REQUIRES_OK( + ctx, ctx->GetAttr("send_device_incarnation", + reinterpret_cast(&send_device_incarnation))); + string tensor_name; + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + key_prefix_ = \ + GetSliceRendezvousKeyPrefix(send_device, recv_device, + send_device_incarnation, tensor_name); + if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { + hostmem_sendrecv_ = false; + } + OP_REQUIRES_OK(ctx, ctx->GetAttr("slice_size", &slice_size_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); +} + +void SliceSendOp::Compute(OpKernelContext* ctx) { + OP_REQUIRES( + ctx, ctx->rendezvous() != nullptr, + errors::Internal("Op kernel context needs to provide a rendezvous.")); + + const Tensor& input_t = ctx->input(0); + FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); + + // send total_bytes. + OP_REQUIRES_OK(ctx, SendTotalBytes(ctx, frame_iter, input_t)); + // if input is dead, only send total_bytes dead tensor. + if (ctx->is_input_dead()) { + return; + } + + // if total bytes is smaller than slice size, send directly. + if (input_t.TotalBytes() <= slice_size_) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->input_alloc_attr(0); + + Rendezvous::ParsedKey parsed_key; + GetSliceRendezvousKey(key_prefix_, "_transfer_data", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + OP_REQUIRES_OK(ctx, ctx->rendezvous()->Send(parsed_key, args, input_t, + ctx->is_input_dead())); + return; + } + + // send shape. + OP_REQUIRES_OK(ctx, SendShape(ctx, frame_iter, input_t)); + + // send data. 
+ if (dtype_ == DT_STRING) { + OP_REQUIRES_OK(ctx, SendString(ctx, frame_iter, input_t)); + } else { + OP_REQUIRES_OK(ctx, SendBasicType(ctx, frame_iter, input_t)); + } +} + +Status SliceSendOp::SendTotalBytes(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& input_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + + Rendezvous::ParsedKey parsed_key; + Tensor total_bytes_t; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, TensorShape({}), + &total_bytes_t)); + total_bytes_t.scalar()() = input_t.TotalBytes(); + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_totalbytes", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + return ctx->rendezvous()->Send(parsed_key, args, total_bytes_t, + ctx->is_input_dead()); +} + +Status SliceSendOp::SendShape(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& input_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + Rendezvous::ParsedKey parsed_key; + + Tensor shape_t; + TensorShape shape = input_t.shape(); + const int rank = shape.dims(); + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, TensorShape({rank}), + &shape_t)); + auto shape_vec = shape_t.vec(); + for (int i = 0; i < rank; i++) { + shape_vec(i) = shape.dim_size(i); + } + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + return ctx->rendezvous()->Send(parsed_key, args, shape_t, + ctx->is_input_dead()); +} + +Status SliceSendOp::SendString(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& input_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = 
AllocatorAttributes(); + Rendezvous::ParsedKey parsed_key; + + // send elements size. + Tensor elements_size_t; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, input_t.shape(), + &elements_size_t)); + int64 num_elements = input_t.NumElements(); + auto input_flat = input_t.flat(); + auto elements_size_flat = elements_size_t.flat(); + for (int64 i = 0; i < num_elements; i++) { + elements_size_flat(i) = input_flat(i).size(); + } + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_elements_size", + frame_iter, &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, elements_size_t, + ctx->is_input_dead())); + + // send data. + args.alloc_attrs = ctx->input_alloc_attr(0); + Tensor data_t; + for (int64 i = 0; i < num_elements; i++) { + const std::string& elem = input_flat(i); + if (elem.size() <= slice_size_) { + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_STRING, TensorShape({}), + &data_t)); + data_t.scalar()() = elem; + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, + ctx->is_input_dead())); + } else { + TF_RETURN_IF_ERROR(SendStringSlice(ctx, frame_iter, elem, i)); + } + } + + return Status::OK(); +} + +Status SliceSendOp::SendStringSlice(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const std::string& elem, int64 index) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->input_alloc_attr(0); + Rendezvous::ParsedKey parsed_key; + + int64 slice_num = (elem.size() + slice_size_ - 1) / slice_size_; + Tensor data_t; + for (int64 i = 0; i < 
slice_num; i++) { + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_STRING, TensorShape({}), &data_t)); + size_t start = i * slice_size_; + size_t copy_size = slice_size_; + if (start > elem.size() - slice_size_) { + copy_size = elem.size() - start; + } + data_t.scalar()() = elem.substr(start, copy_size); + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(index), "_", + std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, + ctx->is_input_dead())); + } + + return Status::OK(); +} + +Status SliceSendOp::SendBasicType(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& input_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->input_alloc_attr(0); + Rendezvous::ParsedKey parsed_key; + + // send data. 
+ Tensor data_t; + int64 bytes_num = input_t.TotalBytes(); + int64 slice_num = (bytes_num + slice_size_ - 1) / slice_size_; + unsigned char* input_base = reinterpret_cast(input_t.data()); + for (int64 i = 0; i < slice_num; i++) { + int64 start = i * slice_size_; + int64 copy_size = slice_size_; + if (start > bytes_num - slice_size_) { + copy_size = bytes_num - start; + } + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT8, TensorShape({copy_size}), + &data_t)); + auto data_base = data_t.data(); + std::memcpy(data_base, input_base+start, copy_size); + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, + ctx->is_input_dead())); + } + + return Status::OK(); +} + +REGISTER_KERNEL_BUILDER(Name("_SliceSend").Device(DEVICE_CPU), SliceSendOp); +REGISTER_KERNEL_BUILDER(Name("_SliceSend").Device(DEVICE_DEFAULT), SliceSendOp); + +//------------------------------------------------------------------------------ +// Functions of SliceRecvOp. 
+ +SliceRecvOp::SliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + string send_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device)); + string recv_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device)); + uint64 send_device_incarnation; + OP_REQUIRES_OK( + ctx, ctx->GetAttr("send_device_incarnation", + reinterpret_cast(&send_device_incarnation))); + string tensor_name; + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + key_prefix_ = \ + GetSliceRendezvousKeyPrefix(send_device, recv_device, + send_device_incarnation, tensor_name); + if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { + hostmem_sendrecv_ = false; + } + OP_REQUIRES_OK(ctx, ctx->GetAttr("slice_size", &slice_size_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_type", &dtype_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("timeout_ms", &timeout_ms_)); +} + +void SliceRecvOp::Compute(OpKernelContext* ctx) { + OP_REQUIRES( + ctx, ctx->rendezvous() != nullptr, + errors::Internal("Op kernel context needs to provide a rendezvous.")); + + FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); + bool is_dead; + + // recv total_bytes. + int64 total_bytes; + OP_REQUIRES_OK(ctx, RecvTotalBytes(ctx, frame_iter, is_dead, total_bytes)); + if (is_dead) { + return; + } + + // if total bytes is smaller than slice size, recv directly. + if (total_bytes <= slice_size_) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->output_alloc_attr(0); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. 
+ args.cancellation_manager = ctx->cancellation_manager(); + } + + Rendezvous::ParsedKey parsed_key; + GetSliceRendezvousKey(key_prefix_, "_transfer_data", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceRecv " << parsed_key.buf_; + OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + Tensor data_t; + OP_REQUIRES_OK(ctx, ctx->rendezvous()->Recv(parsed_key, args, &data_t, + &is_dead, timeout_ms_)); + + // This shouldn't be a dead tensor. + CHECK_EQ(is_dead, false); + ctx->set_output(0, data_t); + return; + } + + // recv shape. + TensorShape shape; + OP_REQUIRES_OK(ctx, RecvShape(ctx, frame_iter, shape)); + + // recv data + Tensor* output_t = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, shape, &output_t)); + if (dtype_ == DT_STRING) { + OP_REQUIRES_OK(ctx, RecvString(ctx, frame_iter, shape, output_t)); + } else { + OP_REQUIRES_OK(ctx, RecvBasicType(ctx, frame_iter, total_bytes, output_t)); + } +} + +Status SliceRecvOp::RecvTotalBytes(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + bool& is_dead, int64& total_bytes) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. 
+ args.cancellation_manager = ctx->cancellation_manager(); + } + + Rendezvous::ParsedKey parsed_key; + Tensor total_bytes_t; + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_totalbytes", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &total_bytes_t, + &is_dead, timeout_ms_)); + if (!is_dead) { + total_bytes = total_bytes_t.scalar()(); + } + + return Status::OK(); +} + +Status SliceRecvOp::RecvShape(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + TensorShape& shape) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + + Rendezvous::ParsedKey parsed_key; + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + + Tensor shape_t; + bool is_dead; + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &shape_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. 
+ CHECK_EQ(is_dead, false); + auto shape_vec = shape_t.vec(); + const int64 num_elements = shape_t.NumElements(); + for (int64 i = 0; i < num_elements; i++) { + shape.AddDim(shape_vec(i)); + } + + return Status::OK(); +} + +Status SliceRecvOp::RecvString(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const TensorShape& shape, Tensor*& output_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + Rendezvous::ParsedKey parsed_key; + bool is_dead; + + // recv elements size. + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_elements_size", + frame_iter, &parsed_key.buf_); + VLOG(2) << "SliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + Tensor elements_size_t; + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &elements_size_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. + CHECK_EQ(is_dead, false); + auto elements_size_flat = elements_size_t.flat(); + int64 num_elements = shape.num_elements(); + args.alloc_attrs = ctx->output_alloc_attr(0); + Tensor data_t; + auto output_flat = output_t->flat(); + for (int64 i = 0; i < num_elements; i++) { + if (elements_size_flat(i) <= slice_size_) { + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. 
+ CHECK_EQ(is_dead, false); + output_flat(i) = data_t.scalar()(); + } else { + TF_RETURN_IF_ERROR(RecvStringSlice(ctx, frame_iter, i, + elements_size_flat(i), output_flat)); + } + } + + return Status::OK(); +} + +Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const int64 index, const int64 element_size, + TTypes::Flat& output_flat) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->output_alloc_attr(0); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + Rendezvous::ParsedKey parsed_key; + + int64 slice_num = (element_size + slice_size_ - 1) / slice_size_; + Tensor data_t; + bool is_dead = false; + for (int64 i = 0; i < slice_num; i++) { + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(index), "_", + std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. + CHECK_EQ(is_dead, false); + output_flat(index) += data_t.scalar()(); + } + + return Status::OK(); +} + +Status SliceRecvOp::RecvBasicType(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const int64 total_bytes, + Tensor*& output_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->output_alloc_attr(0); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. 
Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + Rendezvous::ParsedKey parsed_key; + + Tensor data_t; + bool is_dead = false; + int64 slice_num = (total_bytes + slice_size_ - 1) / slice_size_; + unsigned char* output_base = \ + reinterpret_cast(output_t->data()); + for (int64 i = 0; i < slice_num; i++) { + int64 start = i * slice_size_; + int64 copy_size = slice_size_; + if (start > total_bytes - slice_size_) { + copy_size = total_bytes - start; + } + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. + CHECK_EQ(is_dead, false); + auto data_base = data_t.data(); + std::memcpy(output_base+start, data_base, copy_size); + } + + return Status::OK(); + +} + +REGISTER_KERNEL_BUILDER(Name("_SliceRecv").Device(DEVICE_CPU), SliceRecvOp); +REGISTER_KERNEL_BUILDER(Name("_SliceRecv").Device(DEVICE_DEFAULT), SliceRecvOp); + +} // End of namespace tensorflow diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.h b/tensorflow/core/kernels/slice_sendrecv_ops.h new file mode 100644 index 00000000000..df55c080aa1 --- /dev/null +++ b/tensorflow/core/kernels/slice_sendrecv_ops.h @@ -0,0 +1,89 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +class SliceSendOp : public OpKernel { + public: + explicit SliceSendOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + + private: + // Variables. + string key_prefix_; + bool hostmem_sendrecv_; + int32 slice_size_; + DataType dtype_; + + // Functions. + Status SendTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const Tensor& input_t); + + Status SendShape(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const Tensor& input_t); + Status SendString(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const Tensor& input_t); + + Status SendStringSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const std::string& elem, int64 index); + + Status SendBasicType(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const Tensor& input_t); + + TF_DISALLOW_COPY_AND_ASSIGN(SliceSendOp); +}; + +class SliceRecvOp : public OpKernel { + public: + explicit SliceRecvOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + + private: + // Variable. + string key_prefix_; + bool hostmem_sendrecv_; + int32 slice_size_; + int64 timeout_ms_; + DataType dtype_; + + // Fucntions. 
+ Status RecvTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, + bool& is_dead, int64& total_bytes); + + Status RecvShape(OpKernelContext* ctx, const FrameAndIter& frame_iter, + TensorShape& shape); + + Status RecvString(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const TensorShape& shape, Tensor*& output_t); + + Status RecvStringSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const int64 index, const int64 element_size, + TTypes::Flat& output_flat); + + Status RecvBasicType(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const int64 total_bytes, Tensor*& output_t); + + TF_DISALLOW_COPY_AND_ASSIGN(SliceRecvOp); +}; + +} // End of namespace tensorflow + +#endif // End of TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_OPS_H_ diff --git a/tensorflow/core/kernels/slice_sendrecv_ops_test.cc b/tensorflow/core/kernels/slice_sendrecv_ops_test.cc new file mode 100644 index 00000000000..5693ed57918 --- /dev/null +++ b/tensorflow/core/kernels/slice_sendrecv_ops_test.cc @@ -0,0 +1,339 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { + +namespace { +// Implement a trivial version of the Rendezvous interface, to avoid +// clouding the benchmark results with the time spent in the various +// implementations, and to avoid the duplicate-send or duplicate-recv +// errors that would arise from running either benchmark in a loop. +class DummyRendezvous : public Rendezvous { + // Functions. + Status Send(const ParsedKey& key, const Args& args, const Tensor& val, + const bool is_dead) override { + std::string key_str = { key.FullKey().data(), key.FullKey().size() }; + mutex_lock l(mu_); + // consumer does not reach. + if (kv_.count(key_str) == 0) { + struct Var var; + var.type = send; + var.args = args; + var.data = val; + var.is_dead = is_dead; + + kv_[key_str] = var; + return Status::OK(); + } + + auto var = kv_[key_str]; + CHECK_EQ(var.type, recv); + var.done(Status::OK(), args, var.args, val, is_dead); + kv_.erase(key_str); + return Status::OK(); + } + void RecvAsync(const ParsedKey& key, const Args& args, + DoneCallback done) override { + std::string key_str = { key.FullKey().data(), key.FullKey().size() }; + + mutex_lock l(mu_); + // producer does not reach. + if (kv_.count(key_str) == 0) { + struct Var var; + var.type = recv; + var.args = args; + var.done = done; + + kv_[key_str] = var; + return; + } + + // auto var = kv_[key_str]; + auto var = kv_[key_str]; + CHECK_EQ(var.type, send); + done(Status::OK(), var.args, args, var.data, var.is_dead); + kv_.erase(key_str); + } + void StartAbort(const Status& status) override {} + + private: + enum RendezvousType { + send, + recv + }; + // Type define. 
+ struct Var { + RendezvousType type; + Args args; + Tensor data; + bool is_dead; + DoneCallback done; + }; + + // Variables. + mutex mu_; + std::unordered_map kv_ GUARDED_BY(mu_); +}; + +Node* SliceSend(Graph* g, Node* input, const string& tensor, + const string& sender, const uint64 sender_incarnation, + const string& receiver, const int32 slice_size) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_SliceSend") + .Input(input, 0) + .Attr("tensor_name", tensor) + .Attr("send_device", sender) + .Attr("send_device_incarnation", + static_cast(sender_incarnation)) + .Attr("recv_device", receiver) + .Attr("slice_size", slice_size) + .Finalize(g, &ret)); + return ret; +} + +Node* SliceRecv(Graph* g, const string& tensor, const string& type, + const string& sender, const uint64 sender_incarnation, + const string& receiver, const int32 slice_size, + const int64 timeout_ms) { + Node* ret; + DataType dtype; + CHECK(DataTypeFromString(type, &dtype)); + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_SliceRecv") + .Attr("tensor_type", dtype) + .Attr("tensor_name", tensor) + .Attr("send_device", sender) + .Attr("send_device_incarnation", + static_cast(sender_incarnation)) + .Attr("recv_device", receiver) + .Attr("slice_size", slice_size) + .Attr("timeout_ms", timeout_ms) + .Finalize(g, &ret)); + return ret; +} + +Node* Equal(Graph* g, Node* x, Node* y) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Equal") + .Input(x) + .Input(y) + .Finalize(g, &ret)); + return ret; +} + +Node* ReduceAll(Graph* g, Node* input, Node* axes) { + return test::graph::Reduce(g, "All", input, axes); +} + +Node* Assert(Graph* g, Node* condition, + std::vector& data) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Assert") + .Input(condition) + .Input(data) + .Finalize(g, &ret)); + return ret; +} + +static Graph* TransferStringTensor() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 1024; + const int64 timeout_ms = 5000; + std::string str = "The 
quick brown fox jumps over the lazy dog."; // 44 chars. + + Tensor input_t(DT_STRING, TensorShape({2, 4})); + input_t.flat().setConstant(str); // total bytes: 44*8=352 bytes. + Node* input_n = test::graph::Constant(g, input_t); + SliceSend(g, input_n, "T", "/cpu:0", 1, "/cpu:0", slice_size); + Node* recv_n = \ + SliceRecv(g, "T", "string", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + + Node* equal_n = Equal(g, input_n, recv_n); + + Tensor axes_t(DT_INT32, TensorShape({input_t.dims()})); + auto axes_flat = axes_t.flat(); + for (int i = 0; i < input_t.dims(); i++) { + axes_flat(i) = i; + } + Node* reduce_all_n = ReduceAll(g, equal_n, test::graph::Constant(g, axes_t)); + + std::vector data_out; + data_out.emplace_back(input_n, 0); + data_out.emplace_back(recv_n, 0); + Assert(g, reduce_all_n, data_out); + + return g; +} + +static Graph* TransferBasicTypeTensor() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 1024; + const int64 timeout_ms = 5000; + + Tensor input_t(DT_FLOAT, TensorShape({2, 8})); + input_t.flat().setConstant(2); // total bytes = 4*2*8=64 bytes. 
+ Node* input_n = test::graph::Constant(g, input_t); + SliceSend(g, input_n, "T", "/cpu:0", 1, "/cpu:0", slice_size); + Node* recv_n = \ + SliceRecv(g, "T", "float32", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + + Node* equal_n = Equal(g, input_n, recv_n); + + Tensor axes_t(DT_INT32, TensorShape({input_t.dims()})); + auto axes_flat = axes_t.flat(); + for (int i = 0; i < input_t.dims(); i++) { + axes_flat(i) = i; + } + Node* reduce_all_n = ReduceAll(g, equal_n, test::graph::Constant(g, axes_t)); + + std::vector data_out; + data_out.emplace_back(input_n, 0); + data_out.emplace_back(recv_n, 0); + Assert(g, reduce_all_n, data_out); + + return g; +} + +static Graph* TransferBigStringTensor() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 16; + const int64 timeout_ms = 5000; + std::string str = "The quick brown fox jumps over the lazy dog."; // 44 chars. + + Tensor input_t(DT_STRING, TensorShape({2, 4})); + input_t.flat().setConstant(str); + input_t.flat()(0) = "short str"; + Node* input_n = \ + test::graph::Constant(g, input_t); // total bytes: 44*7+9=317 bytes. 
+ SliceSend(g, input_n, "T", "/cpu:0", 1, "/cpu:0", slice_size); + Node* recv_n = \ + SliceRecv(g, "T", "string", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + + Node* equal_n = Equal(g, input_n, recv_n); + + Tensor axes_t(DT_INT32, TensorShape({input_t.dims()})); + auto axes_flat = axes_t.flat(); + for (int i = 0; i < input_t.dims(); i++) { + axes_flat(i) = i; + } + Node* reduce_all_n = ReduceAll(g, equal_n, test::graph::Constant(g, axes_t)); + + std::vector data_out; + data_out.emplace_back(input_n, 0); + data_out.emplace_back(recv_n, 0); + Assert(g, reduce_all_n, data_out); + + return g; +} + +static Graph* TransferBigBasicTypeTensor() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 16; + const int64 timeout_ms = 5000; + + Tensor input_t(DT_FLOAT, TensorShape({2, 8})); + input_t.flat().setConstant(2); // total bytes: 4*2*8=64 + Node* input_n = test::graph::Constant(g, input_t); + SliceSend(g, input_n, "T", "/cpu:0", 1, "/cpu:0", slice_size); + Node* recv_n = \ + SliceRecv(g, "T", "float32", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + + Node* equal_n = Equal(g, input_n, recv_n); + + Tensor axes_t(DT_INT32, TensorShape({input_t.dims()})); + auto axes_flat = axes_t.flat(); + for (int i = 0; i < input_t.dims(); i++) { + axes_flat(i) = i; + } + Node* reduce_all_n = ReduceAll(g, equal_n, test::graph::Constant(g, axes_t)); + + std::vector data_out; + data_out.emplace_back(input_n, 0); + data_out.emplace_back(recv_n, 0); + Assert(g, reduce_all_n, data_out); + + return g; +} + +static Graph* TransferDeadTensor() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 1024; + const int64 timeout_ms = 5000; + + // val + Tensor val_t(DT_FLOAT, TensorShape({})); + val_t.scalar()() = 2; + Node* val_n = test::graph::Constant(g, val_t); + + Tensor pred_t(DT_BOOL, TensorShape({})); + pred_t.scalar()() = true; + Node* pred_n = test::graph::Constant(g, pred_t); + + Node* switch_n = test::graph::Switch(g, val_n, pred_n); + 
SliceSend(g, switch_n, "T", "/cpu:0", 1, "/cpu:0", slice_size); + SliceRecv(g, "T", "float32", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + + return g; +} + +static void BM_TransferStringTensor(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferStringTensor(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_TransferBasicTypeTensor(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferBasicTypeTensor(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_TransferBigStringTensor(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferBigStringTensor(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_TransferBigBasicTypeTensor(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferBigBasicTypeTensor(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_TransferDeadTensor(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferDeadTensor(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +BENCHMARK(BM_TransferStringTensor); +BENCHMARK(BM_TransferBasicTypeTensor); +BENCHMARK(BM_TransferBigStringTensor); +BENCHMARK(BM_TransferBigBasicTypeTensor); +BENCHMARK(BM_TransferDeadTensor); + +} // End of anonymous namespace + +} // End of namespace tensorflow diff --git a/tensorflow/core/ops/slice_sendrecv_ops.cc b/tensorflow/core/ops/slice_sendrecv_ops.cc new file mode 100644 index 00000000000..11905712410 --- /dev/null +++ b/tensorflow/core/ops/slice_sendrecv_ops.cc @@ -0,0 +1,78 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/common_shape_fns.h" + +namespace tensorflow { + +REGISTER_OP("_SliceSend") + .Input("tensor: T") + .Attr("T: type") + .Attr("tensor_name: string") + .Attr("send_device: string") + .Attr("send_device_incarnation: int") + .Attr("recv_device: string") + .Attr("client_terminated: bool = false") + .Attr("slice_size: int >= 1") + .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +Sends the named tensor from send_device to recv_device. +Supports sending the tensor of any size. + +tensor: The tensor to send. +tensor_name: The name of the tensor to send. +send_device: The name of the device sending the tensor. +send_device_incarnation: The current incarnation of send_device. +recv_device: The name of the device receiving the tensor. +client_terminated: If set to true, this indicates that the node was added + to the graph as a result of a client-side feed or fetch of Tensor data, + in which case the corresponding send or recv is expected to be managed + locally by the caller. +slice_size: The maximum number of bytes transferred at one time. 
+)doc"); + +REGISTER_OP("_SliceRecv") + .Output("tensor: tensor_type") + .Attr("tensor_type: type") + .Attr("tensor_name: string") + .Attr("send_device: string") + .Attr("send_device_incarnation: int") + .Attr("recv_device: string") + .Attr("client_terminated: bool = false") + .Attr("slice_size: int >= 1") + .Attr("timeout_ms: int >= 0 = 300000") + .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +Receives the named tensor from send_device on recv_device. +Supports recving the tensor of any size. + +tensor: The tensor to receive. +tensor_name: The name of the tensor to receive. +send_device: The name of the device sending the tensor. +send_device_incarnation: The current incarnation of send_device. +recv_device: The name of the device receiving the tensor. +client_terminated: If set to true, this indicates that the node was added + to the graph as a result of a client-side feed or fetch of Tensor data, + in which case the corresponding send or recv is expected to be managed + locally by the caller. +slice_size: The maximum number of bytes transferred at one time. +timeout_ms: The maximum wait time for receiving a tensor. +)doc"); + +} // End of namespace tensorflow From 6bf562197efaedccc8026d1d05ac23e27d3b2521 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 20 Dec 2023 15:47:52 +0800 Subject: [PATCH 23/45] [Embedding] undefine EV GPU interface in CPU compile. 
(#956) Signed-off-by: candy.dc --- .../core/framework/embedding/embedding_var.h | 91 +++++++++---------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index df6ae6f1277..c0d26a2f4d8 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -140,13 +140,6 @@ class EmbeddingVar : public ResourceBase { return storage_->Get(key, value_ptr); } - void BatchLookupKey(const EmbeddingVarContext& ctx, - const K* keys, - void** value_ptr_list, - int64 num_of_keys) { - storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys); - } - Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, bool indices_as_pointer, int64 count = 1) { @@ -167,45 +160,6 @@ class EmbeddingVar : public ResourceBase { return Status::OK(); } - Status LookupOrCreateKey(const EmbeddingVarContext& context, - const K* keys, - void** value_ptrs, - int64 num_of_keys, - int64* indices_counts, - bool indices_as_pointer = false) { - if (indices_as_pointer) { - auto lookup_key_and_set_version_fn = [keys, value_ptrs] - (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - value_ptrs[i] = (void*)keys[i]; - } - }; - const int64 unit_cost = 1000; //very unreliable estimate for cost per step. - auto worker_threads = context.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, num_of_keys, unit_cost, - lookup_key_and_set_version_fn); - } else { - filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); - } - - if (indices_counts != nullptr) { - auto add_freq_fn = [this, value_ptrs, indices_counts] - (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - feat_desc_->AddFreq(value_ptrs[i], indices_counts[i]); - } - }; - const int64 unit_cost = 1000; //very unreliable estimate for cost per step. 
- auto worker_threads = context.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, num_of_keys, unit_cost, - add_freq_fn); - } - return Status::OK(); - } - - Status LookupOrCreateKey(K key, void** value_ptr) { Status s = storage_->GetOrCreate(key, value_ptr); TF_CHECK_OK(s); @@ -402,6 +356,51 @@ class EmbeddingVar : public ResourceBase { storage_->AddToCache(keys_tensor); } + + void BatchLookupKey(const EmbeddingVarContext& ctx, + const K* keys, + void** value_ptr_list, + int64 num_of_keys) { + storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys); + } + + Status LookupOrCreateKey(const EmbeddingVarContext& context, + const K* keys, + void** value_ptrs, + int64 num_of_keys, + int64* indices_counts, + bool indices_as_pointer = false) { + if (indices_as_pointer) { + auto lookup_key_and_set_version_fn = [keys, value_ptrs] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + value_ptrs[i] = (void*)keys[i]; + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + lookup_key_and_set_version_fn); + } else { + filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); + } + + if (indices_counts != nullptr) { + auto add_freq_fn = [this, value_ptrs, indices_counts] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + feat_desc_->AddFreq(value_ptrs[i], indices_counts[i]); + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. 
+ auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + add_freq_fn); + } + return Status::OK(); + } #endif #if GOOGLE_CUDA From 0f536a2849528e2c25dd7f496a00d810acd5e72c Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Tue, 26 Dec 2023 16:14:06 +0800 Subject: [PATCH 24/45] [Op] Implement FileSliceSend/FileSliceRecvOp. (#960) FileSliceSend/FileSliceRecv Op transfer scalar string Tensor to/from SliceRecv/SliceSend Op. Signed-off-by: chenbangduo.cbd --- tensorflow/core/BUILD | 2 + tensorflow/core/framework/rendezvous.h | 2 + tensorflow/core/graph/graph.cc | 2 + tensorflow/core/graph/graph.h | 12 +- tensorflow/core/grappler/op_types.cc | 10 +- tensorflow/core/grappler/op_types.h | 2 + tensorflow/core/kernels/BUILD | 46 +- .../core/kernels/file_slice_sendrecv_ops.cc | 482 +++++++++++++++++ .../core/kernels/file_slice_sendrecv_ops.h | 98 ++++ .../kernels/file_slice_sendrecv_ops_test.cc | 483 ++++++++++++++++++ tensorflow/core/kernels/slice_sendrecv_ops.cc | 175 +++---- tensorflow/core/kernels/slice_sendrecv_ops.h | 6 +- .../core/kernels/slice_sendrecv_utils.cc | 53 ++ .../core/kernels/slice_sendrecv_utils.h | 41 ++ .../core/ops/file_slice_sendrecv_ops.cc | 77 +++ 15 files changed, 1388 insertions(+), 103 deletions(-) create mode 100644 tensorflow/core/kernels/file_slice_sendrecv_ops.cc create mode 100644 tensorflow/core/kernels/file_slice_sendrecv_ops.h create mode 100644 tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc create mode 100644 tensorflow/core/kernels/slice_sendrecv_utils.cc create mode 100644 tensorflow/core/kernels/slice_sendrecv_utils.h create mode 100644 tensorflow/core/ops/file_slice_sendrecv_ops.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index ce6850eb9da..07115cfea3c 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1203,6 +1203,7 @@ tf_gen_op_libs( "encode_proto_ops", "experimental_dataset_ops", "feature_column_ops", + 
"file_slice_sendrecv_ops", "function_ops", "functional_ops", "fused_embedding_ops", @@ -1465,6 +1466,7 @@ cc_library( ":encode_proto_ops_op_lib", ":experimental_dataset_ops_op_lib", ":feature_column_ops_op_lib", + ":file_slice_sendrecv_ops_op_lib", ":function_ops_op_lib", ":functional_ops_op_lib", ":fused_embedding_ops_op_lib", diff --git a/tensorflow/core/framework/rendezvous.h b/tensorflow/core/framework/rendezvous.h index 3c2b20379c8..3aa65534272 100644 --- a/tensorflow/core/framework/rendezvous.h +++ b/tensorflow/core/framework/rendezvous.h @@ -82,6 +82,8 @@ class Rendezvous : public core::RefCounted { friend class FuseRecvOp; friend class SliceSendOp; friend class SliceRecvOp; + friend class FileSliceSendOp; + friend class FileSliceRecvOp; friend class RefSendOp; friend class RefRecvOp; string buf_; diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index d9709d39f3f..59b25ee7c36 100644 --- a/tensorflow/core/graph/graph.cc +++ b/tensorflow/core/graph/graph.cc @@ -70,12 +70,14 @@ const std::unordered_map& Node::kNodeClassTable = {"_HostSend", NC_HOST_SEND}, {"_RefSend", NC_REF_SEND}, {"_SliceSend", NC_SLICE_SEND}, + {"_FileSliceSend", NC_FILE_SLICE_SEND}, {"_Recv", NC_RECV}, {"_HostRecv", NC_HOST_RECV}, {"_RefRecv", NC_REF_RECV}, {"_FuseRecv", NC_FUSE_RECV}, {"_HostFuseRecv", NC_HOST_FUSE_RECV}, {"_SliceRecv", NC_SLICE_RECV}, + {"_FileSliceRecv", NC_FILE_SLICE_RECV}, {"Const", NC_CONSTANT}, {"HostConst", NC_CONSTANT}, {"Variable", NC_VARIABLE}, diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 0baf8f257a9..bd6d18cfc7c 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -220,15 +220,19 @@ class Node { bool IsSend() const { return class_ == NC_SEND || class_ == NC_HOST_SEND || class_ == NC_REF_SEND || - class_ == NC_SLICE_SEND; } + class_ == NC_SLICE_SEND || + class_ == NC_FILE_SLICE_SEND; } bool IsSliceSend() const { return class_ == NC_SLICE_SEND; } + bool IsFileSliceSend() 
const { return class_ == NC_FILE_SLICE_SEND; } bool IsRecv() const { return class_ == NC_RECV || class_ == NC_HOST_RECV || class_ == NC_REF_RECV || - class_ == NC_SLICE_RECV; } + class_ == NC_SLICE_RECV || + class_ == NC_FILE_SLICE_RECV; } bool IsFuseRecv() const { return class_ == NC_FUSE_RECV || class_ == NC_HOST_FUSE_RECV; } bool IsSliceRecv() const {return class_ == NC_SLICE_RECV; } + bool IsFileSliceRecv() const { return class_ == NC_FILE_SLICE_RECV; } bool IsConstant() const { return class_ == NC_CONSTANT; } bool IsStage() const { return class_ == NC_TENSOR_BUFFER_PUT; } bool IsUnstage() const { return class_ == NC_TENSOR_BUFFER_TAKE; } @@ -339,12 +343,14 @@ class Node { NC_HOST_SEND, NC_REF_SEND, NC_SLICE_SEND, + NC_FILE_SLICE_SEND, NC_RECV, NC_HOST_RECV, NC_REF_RECV, NC_FUSE_RECV, NC_HOST_FUSE_RECV, NC_SLICE_RECV, + NC_FILE_SLICE_RECV, NC_CONSTANT, NC_VARIABLE, NC_KV_VAR_HANDLE, @@ -851,8 +857,10 @@ inline bool IsLoopCond(const Node* node) { return node->IsLoopCond(); } inline bool IsControlTrigger(const Node* n) { return n->IsControlTrigger(); } inline bool IsSend(const Node* node) { return node->IsSend(); } inline bool IsSliceSend(const Node* node) { return node->IsSliceSend(); } +inline bool IsFileSliceSend(const Node* node) { return node->IsFileSliceSend(); } inline bool IsRecv(const Node* node) { return node->IsRecv(); } inline bool IsSliceRecv(const Node* node) { return node->IsSliceRecv(); } +inline bool IsFileSliceRecv(const Node* node) { return node->IsFileSliceRecv(); } inline bool IsFuseRecv(const Node* node) { return node->IsFuseRecv(); } inline bool IsHostSend(const Node* node) { return node->IsHostSend(); } inline bool IsHostRecv(const Node* node) { return node->IsHostRecv(); } diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc index 1201623ffcd..fd72927bd79 100644 --- a/tensorflow/core/grappler/op_types.cc +++ b/tensorflow/core/grappler/op_types.cc @@ -265,6 +265,10 @@ bool IsExp(const NodeDef& node) { 
return node.op() == "Exp"; } bool IsFakeParam(const NodeDef& node) { return node.op() == "FakeParam"; } +bool IsFileSliceRecv(const NodeDef& node) { return node.op() == "_FileSliceRecv"; } + +bool IsFileSliceSend(const NodeDef& node) { return node.op() == "_FileSliceSend"; } + bool IsFill(const NodeDef& node) { return node.op() == "Fill"; } bool IsFloorDiv(const NodeDef& node) { return node.op() == "FloorDiv"; } @@ -454,7 +458,8 @@ bool IsReciprocalGrad(const NodeDef& node) { } bool IsRecv(const NodeDef& node) { - return node.op() == "_Recv" || node.op() == "_HostRecv" || IsSliceRecv(node); + return node.op() == "_Recv" || node.op() == "_HostRecv" || + IsSliceRecv(node) || IsFileSliceRecv(node); } bool IsFuseRecv(const NodeDef& node) { @@ -502,7 +507,8 @@ bool IsSelect(const NodeDef& node) { return node.op() == "Select"; } bool IsSeluGrad(const NodeDef& node) { return node.op() == "SeluGrad"; } bool IsSend(const NodeDef& node) { - return node.op() == "_Send" || node.op() == "_HostSend" || IsSliceSend(node); + return node.op() == "_Send" || node.op() == "_HostSend" || + IsSliceSend(node) || IsFileSliceSend(node); } bool IsShape(const NodeDef& node) { return node.op() == "Shape"; } diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h index 737581fd412..10968ad2547 100644 --- a/tensorflow/core/grappler/op_types.h +++ b/tensorflow/core/grappler/op_types.h @@ -80,6 +80,8 @@ bool IsExit(const NodeDef& node); bool IsExp(const NodeDef& node); bool IsFakeParam(const NodeDef& node); bool IsFill(const NodeDef& node); +bool IsFileSliceRecv(const NodeDef& node); +bool IsFileSliceSend(const NodeDef& node); bool IsFloorDiv(const NodeDef& node); bool IsFloorMod(const NodeDef& node); bool IsFusedBatchNorm(const NodeDef& node); diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 36721527cc2..4e6868a9897 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -5423,6 +5423,7 @@ cc_library( name = 
"required", deps = [ ":no_op", + ":file_slice_sendrecv_ops", ":fuserecv_ops", ":sendrecv_ops", ":slice_sendrecv_ops", @@ -5446,10 +5447,33 @@ tf_kernel_library( deps = REQUIRED_DEPS, ) +cc_library( + name = "slice_sendrecv_utils", + hdrs = [ + "slice_sendrecv_utils.h" + ], + srcs = [ + "slice_sendrecv_utils.cc", + ], + deps = [ + "//tensorflow/core:framework", + ] +) + tf_kernel_library( name = "slice_sendrecv_ops", prefix = "slice_sendrecv_ops", - deps = REQUIRED_DEPS, + deps = REQUIRED_DEPS + [ + ":slice_sendrecv_utils", + ], +) + +tf_kernel_library( + name = "file_slice_sendrecv_ops", + prefix = "file_slice_sendrecv_ops", + deps = REQUIRED_DEPS + [ + ":slice_sendrecv_utils", + ], ) tf_kernel_library( @@ -5534,6 +5558,26 @@ tf_cc_test( ], ) +tf_cc_test( + name = "file_slice_sendrecv_ops_test", + srcs = ["file_slice_sendrecv_ops_test.cc"], + linkstatic = tf_kernel_tests_linkstatic(), # Required for benchmarking + deps = [ + ":control_flow_ops", + ":cwise_op", + ":file_slice_sendrecv_ops", + ":logging_ops", + ":ops_testutil", + ":ops_util", + ":slice_sendrecv_ops", + ":whole_file_read_ops", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_kernel_library( name = "fuserecv_ops", prefix = "fuserecv_ops", diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops.cc b/tensorflow/core/kernels/file_slice_sendrecv_ops.cc new file mode 100644 index 00000000000..6bfe54363f9 --- /dev/null +++ b/tensorflow/core/kernels/file_slice_sendrecv_ops.cc @@ -0,0 +1,482 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/kernels/file_slice_sendrecv_ops.h" +#include "tensorflow/core/kernels/slice_sendrecv_utils.h" +#include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { + +//------------------------------------------------------------------------------ +// Functions of FileSliceSendOp. + +FileSliceSendOp::FileSliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + string send_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device)); + string recv_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device)); + uint64 send_device_incarnation; + OP_REQUIRES_OK( + ctx, ctx->GetAttr("send_device_incarnation", + reinterpret_cast(&send_device_incarnation))); + string tensor_name; + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + key_prefix_ = \ + slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, + recv_device, send_device_incarnation, tensor_name); + + if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { + hostmem_sendrecv_ = false; + } + OP_REQUIRES_OK(ctx, ctx->GetAttr("slice_size", &slice_size_)); +} + +void FileSliceSendOp::Compute(OpKernelContext* ctx) { + OP_REQUIRES(ctx, ctx->rendezvous() != nullptr, + errors::Internal("Op kernel context needs to provide a rendezvous.")); + + const Tensor& file_path_t = ctx->input(0); + if (!ctx->is_input_dead()) { + OP_REQUIRES(ctx, 
TensorShapeUtils::IsScalar(file_path_t.shape()), + errors::InvalidArgument("file_path is not a scalar: ", + file_path_t.shape().DebugString())); + } + + FrameAndIter frame_iter = \ + slice_sendrecv::GetFrameAndIter(ctx, hostmem_sendrecv_); + + // get element_bytes. + uint64 element_bytes = 0; + OP_REQUIRES_OK(ctx, GetElementBytes(ctx, file_path_t, element_bytes)); + + // send total_bytes. + // total_bytes is the TotalBytes of the Tensor that contains the contents of + // the file. please refer Tensor::TotalBytes() + uint64 total_bytes = element_bytes + sizeof(tstring); + OP_REQUIRES_OK(ctx, SendTotalBytes(ctx, frame_iter, total_bytes)); + // if input is dead, only send total_bytes dead tensor. + if (ctx->is_input_dead()) { + return; + } + + // if total bytes is smaller than slice size, send directly. + if (total_bytes <= slice_size_) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->input_alloc_attr(0); + + Rendezvous::ParsedKey parsed_key; + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_transfer_data", + frame_iter, &parsed_key.buf_); + VLOG(2) << "FileSliceSend " << parsed_key.buf_; + OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + Tensor data_t; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DT_STRING, TensorShape({}), &data_t)); + if (element_bytes > 0) { + OP_REQUIRES_OK(ctx, ReadFileToString(Env::Default(), + file_path_t.scalar()(), data_t.scalar().data())); + } + OP_REQUIRES_OK(ctx, ctx->rendezvous()->Send(parsed_key,args, data_t, + ctx->is_input_dead())); + return; + } + + // send shape, in order to match the behavior of 'SliceSend'. + OP_REQUIRES_OK(ctx, SendScalarShape(ctx, frame_iter)); + + // send element bytes, in order to match the behavior of 'SliceSend'. + OP_REQUIRES_OK(ctx, SendElementBytes(ctx, frame_iter, element_bytes)); + + // send data. 
+ OP_REQUIRES_OK(ctx, SendFileSlice(ctx, frame_iter, file_path_t, element_bytes)); +} + +Status FileSliceSendOp::GetElementBytes(OpKernelContext* ctx, + const Tensor& file_path_t, + uint64& element_bytes) { + + if (ctx->is_input_dead()) { + element_bytes = 0; + return Status::OK(); + } + + const string& file_path = file_path_t.scalar()(); + Env* env = Env::Default(); + + if (env->FileExists(file_path) != Status::OK()) { + element_bytes = 0; + return Status::OK(); + } + + return env->GetFileSize(file_path, &element_bytes); +} + +Status FileSliceSendOp::SendUInt64MetaMsg(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const string& name, + const uint64 val) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + + Rendezvous::ParsedKey parsed_key; + Tensor val_t; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_UINT64, TensorShape({}), &val_t)); + val_t.scalar()() = val; + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, name, frame_iter, + &parsed_key.buf_); + VLOG(2) << "FileSliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + return ctx->rendezvous()->Send(parsed_key, args, val_t, ctx->is_input_dead()); +} + +Status FileSliceSendOp::SendTotalBytes(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const uint64 total_bytes) { + return SendUInt64MetaMsg(ctx, frame_iter, "_slice_transfer_totalbytes", + total_bytes); +} + +Status FileSliceSendOp::SendScalarShape(OpKernelContext* ctx, + const FrameAndIter& frame_iter) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + Rendezvous::ParsedKey parsed_key; + + Tensor shape_t; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, TensorShape({0}), &shape_t)); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, + "_slice_transfer_shape", frame_iter, &parsed_key.buf_); + VLOG(2) << "FileSliceSend " << parsed_key.buf_; + 
TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + + return ctx->rendezvous()->Send(parsed_key, args, shape_t, + ctx->is_input_dead()); +} + +Status FileSliceSendOp::SendElementBytes(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const uint64 element_bytes) { + return SendUInt64MetaMsg(ctx, frame_iter, "_slice_transfer_elements_bytes", + element_bytes); +} + +Status FileSliceSendOp::SendFileSlice(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& file_path_t, + const uint64 element_bytes) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + Rendezvous::ParsedKey parsed_key; + + std::unique_ptr file; + Env* env = Env::Default(); + const string& file_path = file_path_t.scalar()(); + TF_RETURN_IF_ERROR(env->NewRandomAccessFile(file_path, &file)); + + // Slice Send. + int64 slice_num = element_bytes / slice_size_; + if (element_bytes % slice_size_ != 0) { + slice_num += 1; + } + Tensor data_t; + for (int64 i = 0; i < slice_num; i++) { + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_STRING, TensorShape({}), &data_t)); + uint64 start = i * slice_size_; + uint64 copy_size = slice_size_; + if (start > element_bytes - slice_size_) { + copy_size = element_bytes - start; + } + TF_RETURN_IF_ERROR(ReadFileSlice(file, start, copy_size, data_t)); + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(0), "_", + std::to_string(i)); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); + VLOG(2) << "FileSliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, + ctx->is_input_dead())); + } + + + return Status::OK(); +} + +Status FileSliceSendOp::ReadFileSlice( + const std::unique_ptr& file, + const uint64 pos, const uint64 offset, + Tensor& data_t) { + 
string* data_s = data_t.scalar().data(); + gtl::STLStringResizeUninitialized(data_s, offset); + char* data_p = gtl::string_as_array(data_s); + StringPiece result; + TF_RETURN_IF_ERROR(file->Read(pos, offset, &result, data_p)); + if (result.data() != data_p) { + memmove(data_p, result.data(), result.size()); + } + + return Status::OK(); +} + +REGISTER_KERNEL_BUILDER(Name("_FileSliceSend").Device(DEVICE_CPU), + FileSliceSendOp); +REGISTER_KERNEL_BUILDER(Name("_FileSliceSend").Device(DEVICE_DEFAULT), + FileSliceSendOp); + +//------------------------------------------------------------------------------ +// Functions of FileSliceRecvOp. + +FileSliceRecvOp::FileSliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + string send_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device)); + string recv_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device)); + uint64 send_device_incarnation; + OP_REQUIRES_OK( + ctx, ctx->GetAttr("send_device_incarnation", + reinterpret_cast(&send_device_incarnation))); + string tensor_name; + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + key_prefix_ = \ + slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, + recv_device, send_device_incarnation, tensor_name); + if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { + hostmem_sendrecv_ = false; + } + OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_dir", &recv_dir_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("slice_size", &slice_size_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("timeout_ms", &timeout_ms_)); +} + +void FileSliceRecvOp::Compute(OpKernelContext* ctx) { + OP_REQUIRES(ctx, ctx->rendezvous() != nullptr, + errors::Internal("Op kernel context needs to provide a rendezvous.")); + + FrameAndIter frame_iter = \ + slice_sendrecv::GetFrameAndIter(ctx, hostmem_sendrecv_); + + bool is_dead = false; + uint64 total_bytes = 0; + OP_REQUIRES_OK(ctx, RecvTotalBytes(ctx, frame_iter, is_dead, total_bytes)); + if (is_dead) { + return; + } + + // 
Create file path output. + Env* env = Env::Default(); + if (!env->FileExists(recv_dir_).ok()) { + OP_REQUIRES_OK(ctx, env->RecursivelyCreateDir(recv_dir_)); + } + const string &filename = GenerateRecvFileName(ctx->op_kernel().name()); + const string &file_path = io::JoinPath(recv_dir_, "tempfilerecv-"+filename); + Tensor* file_path_t = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &file_path_t)); + file_path_t->scalar()() = file_path; + + // if total bytes is smaller than slice size, recv directly. + if (total_bytes <= slice_size_) { + OP_REQUIRES_OK(ctx, RecvFile(ctx, frame_iter, file_path)); + return; + } + + // recv shape, in order to match the behavior of 'SliceRecv'. + TensorShape shape; + OP_REQUIRES_OK(ctx, RecvShape(ctx, frame_iter, shape)); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(shape), + errors::InvalidArgument( + "FileSliceRecv only supports receiving a tensor with a scalar shape.")); + + // recv element_bytes, in order to match the behavior of 'SliceRecv'. + uint64 element_bytes = 0; + OP_REQUIRES_OK(ctx, RecvElementBytes(ctx, frame_iter, element_bytes)); + + // recv data. + OP_REQUIRES_OK(ctx, RecvFileSlice(ctx, frame_iter, element_bytes, file_path)); +} + +Status FileSliceRecvOp::RecvUInt64MetaMsg(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const string& name, bool &is_dead, + uint64& val) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. 
+ args.cancellation_manager = ctx->cancellation_manager(); + } + + Rendezvous::ParsedKey parsed_key; + Tensor val_t; + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, name, frame_iter, + &parsed_key.buf_); + VLOG(2) << "FileSliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->Recv(parsed_key, args, &val_t, &is_dead, timeout_ms_)); + if (!is_dead) { + val = val_t.scalar()(); + } + + return Status::OK(); +} + +Status FileSliceRecvOp::RecvTotalBytes(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + bool& is_dead, uint64& total_bytes) { + return RecvUInt64MetaMsg(ctx, frame_iter, "_slice_transfer_totalbytes", + is_dead, total_bytes); +} + +string FileSliceRecvOp::GenerateRecvFileName(const string& op_name) { + const std::vector& file_name_vec = absl::StrSplit(op_name, "/"); + return absl::StrJoin(file_name_vec, "_"); +} + +Status FileSliceRecvOp::RecvShape(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + TensorShape& shape) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + + Rendezvous::ParsedKey parsed_key; + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", + frame_iter, &parsed_key.buf_); + VLOG(2) << "FileSliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + + Tensor shape_t; + bool is_dead; + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &shape_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. 
+ CHECK_EQ(is_dead, false); + auto shape_vec = shape_t.vec(); + const int64 num_elements = shape_t.NumElements(); + for (int64 i = 0; i < num_elements; i++) { + shape.AddDim(shape_vec(i)); + } + + return Status::OK(); +} + +Status FileSliceRecvOp::RecvElementBytes(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + uint64& element_bytes) { + bool is_dead = false; + Status s = \ + RecvUInt64MetaMsg(ctx, frame_iter, "_slice_transfer_elements_bytes", is_dead, + element_bytes); + CHECK_EQ(is_dead, false); + + return s; +} + +Status FileSliceRecvOp::RecvFile(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const string& file_path) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->output_alloc_attr(0); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + + Rendezvous::ParsedKey parsed_key; + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_transfer_data", + frame_iter, &parsed_key.buf_); + VLOG(2) << "FileSliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + Tensor data_t; + bool is_dead = false; + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, + &is_dead, timeout_ms_)); + + // This shouldn't be a dead tensor. + CHECK_EQ(is_dead, false); + + // Write data_t to file. 
+ Env* env = Env::Default(); + return WriteStringToFile(env, file_path, data_t.scalar()()); +} + +Status FileSliceRecvOp::RecvFileSlice(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const uint64 element_bytes, + const string& file_path) { + // create file + Env* env = Env::Default(); + std::unique_ptr file_ptr; + TF_RETURN_IF_ERROR(env->NewWritableFile(file_path, &file_ptr)); + + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->output_alloc_attr(0); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + Rendezvous::ParsedKey parsed_key; + + int64 slice_num = element_bytes / slice_size_; + if (element_bytes % slice_size_ != 0) { + slice_num += 1; + } + Tensor data_t; + bool is_dead = false; + for (int64 i = 0; i < slice_num; i++) { + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(0), "_", + std::to_string(i)); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); + VLOG(2) << "FileSliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. 
+ CHECK_EQ(is_dead, false); + file_ptr->Append(data_t.scalar()()); + } + + return Status::OK(); +} + +REGISTER_KERNEL_BUILDER(Name("_FileSliceRecv").Device(DEVICE_CPU), + FileSliceRecvOp); +REGISTER_KERNEL_BUILDER(Name("_FileSliceRecv").Device(DEVICE_DEFAULT), + FileSliceRecvOp); + +}; // End of namespace tensorflow diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops.h b/tensorflow/core/kernels/file_slice_sendrecv_ops.h new file mode 100644 index 00000000000..6701196d481 --- /dev/null +++ b/tensorflow/core/kernels/file_slice_sendrecv_ops.h @@ -0,0 +1,98 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_FILE_SLICE_SENDRECV_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_FILE_SLICE_SENDRECV_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +class FileSliceSendOp : public OpKernel { + public: + explicit FileSliceSendOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + + private: + // Variables. + string key_prefix_; + bool hostmem_sendrecv_; + int32 slice_size_; + + // Functions. 
+ Status GetElementBytes(OpKernelContext* ctx, const Tensor& file_path_t, + uint64& element_bytes); + + Status SendUInt64MetaMsg(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const string& name, const uint64 val); + + Status SendTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const uint64 total_bytes); + + Status SendScalarShape(OpKernelContext* ctx, const FrameAndIter& frame_iter); + + Status SendElementBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const uint64 element_bytes); + + Status SendFileSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const Tensor& file_path_t, const uint64 element_bytes); + + Status ReadFileSlice(const std::unique_ptr& file, + const uint64 pos, const uint64 offset, Tensor& data_t); + + TF_DISALLOW_COPY_AND_ASSIGN(FileSliceSendOp); +}; + +class FileSliceRecvOp: public OpKernel { + public: + explicit FileSliceRecvOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + + private: + // Variables. + string key_prefix_; + bool hostmem_sendrecv_; + string recv_dir_; + int32 slice_size_; + int64 timeout_ms_; + + // Functions. 
+ Status RecvUInt64MetaMsg(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const string& name, bool &is_dead, uint64& val); + + Status RecvTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, + bool& is_dead, uint64& total_bytes); + + string GenerateRecvFileName(const string& op_name); + + Status RecvFile(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const string& file_path); + + Status RecvShape(OpKernelContext* ctx, const FrameAndIter& frame_iter, + TensorShape& shape); + + Status RecvElementBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, + uint64& element_bytes); + + Status RecvFileSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const uint64 element_bytes, const string& file_path); + + TF_DISALLOW_COPY_AND_ASSIGN(FileSliceRecvOp); +}; + +}; // End of namespace tensorflow + +#endif // End of macro TENSORFLOW_CORE_KERNELS_FILE_SLICE_SENDRECV_OPS_H_ diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc b/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc new file mode 100644 index 00000000000..931cd152253 --- /dev/null +++ b/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc @@ -0,0 +1,483 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { + +namespace { +// Implement a trivial version of the Rendezvous interface, to avoid +// clouding the benchmark results with the time spent in the various +// implementations, and to avoid the duplicate-send or duplicate-recv +// errors that would arise from running either benchmark in a loop. +class DummyRendezvous : public Rendezvous { + // Functions. + Status Send(const ParsedKey& key, const Args& args, const Tensor& val, + const bool is_dead) override { + std::string key_str = { key.FullKey().data(), key.FullKey().size() }; + mutex_lock l(mu_); + // consumer does not reach. + if (kv_.count(key_str) == 0) { + struct Var var; + var.type = send; + var.args = args; + var.data = val; + var.is_dead = is_dead; + + kv_[key_str] = var; + return Status::OK(); + } + + auto var = kv_[key_str]; + CHECK_EQ(var.type, recv); + var.done(Status::OK(), args, var.args, val, is_dead); + kv_.erase(key_str); + return Status::OK(); + } + void RecvAsync(const ParsedKey& key, const Args& args, + DoneCallback done) override { + std::string key_str = { key.FullKey().data(), key.FullKey().size() }; + + mutex_lock l(mu_); + // producer does not reach. + if (kv_.count(key_str) == 0) { + struct Var var; + var.type = recv; + var.args = args; + var.done = done; + + kv_[key_str] = var; + return; + } + + // auto var = kv_[key_str]; + auto var = kv_[key_str]; + CHECK_EQ(var.type, send); + done(Status::OK(), var.args, args, var.data, var.is_dead); + kv_.erase(key_str); + } + void StartAbort(const Status& status) override {} + + private: + enum RendezvousType { + send, + recv + }; + // Type define. 
+ struct Var { + RendezvousType type; + Args args; + Tensor data; + bool is_dead; + DoneCallback done; + }; + + // Variables. + mutex mu_; + std::unordered_map kv_ GUARDED_BY(mu_); +}; + +//------------------------------------------------------------------------------ +// Utils. +Node* FileSliceSend(Graph* g, Node* filename, const string& tensor, + const string& sender, const uint64 sender_incarnation, + const string& receiver, const int32 slice_size) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("FileSliceSend"), "_FileSliceSend") + .Input(filename, 0) + .Attr("tensor_name", tensor) + .Attr("send_device", sender) + .Attr("send_device_incarnation", + static_cast(sender_incarnation)) + .Attr("recv_device", receiver) + .Attr("slice_size", slice_size) + .Finalize(g, &ret)); + + return ret; +} + +Node* FileSliceRecv(Graph* g, const string& tensor, const string& sender, + const uint64 sender_incarnation, const string& receiver, + const string& recv_dir, const int32 slice_size, + const int64 timeout_ms) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("FileSliceRecv"), "_FileSliceRecv") + .Attr("tensor_name", tensor) + .Attr("send_device", sender) + .Attr("send_device_incarnation", + static_cast(sender_incarnation)) + .Attr("recv_device", receiver) + .Attr("recv_dir", recv_dir) + .Attr("slice_size", slice_size) + .Attr("timeout_ms", timeout_ms) + .Finalize(g, &ret)); + + return ret; +} + +Node* SliceSend(Graph* g, Node* input, const string& tensor, + const string& sender, const uint64 sender_incarnation, + const string& receiver, const int32 slice_size) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_SliceSend") + .Input(input, 0) + .Attr("tensor_name", tensor) + .Attr("send_device", sender) + .Attr("send_device_incarnation", + static_cast(sender_incarnation)) + .Attr("recv_device", receiver) + .Attr("slice_size", slice_size) + .Finalize(g, &ret)); + return ret; +} + +Node* SliceRecv(Graph* g, const string& tensor, const string& sender, + const 
uint64 sender_incarnation, const string& receiver, + const int32 slice_size, const int64 timeout_ms) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_SliceRecv") + .Attr("tensor_type", DT_STRING) + .Attr("tensor_name", tensor) + .Attr("send_device", sender) + .Attr("send_device_incarnation", + static_cast(sender_incarnation)) + .Attr("recv_device", receiver) + .Attr("slice_size", slice_size) + .Attr("timeout_ms", timeout_ms) + .Finalize(g, &ret)); + return ret; +} + +Node* ReadFile(Graph* g, Node* filename) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("ReadFile"), "ReadFile") + .Input(filename, 0) + .Finalize(g, &ret)); + + return ret; +} + +Node* WriteFile(Graph* g, Node* filename, Node* contents) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("WriteFile"), "WriteFile") + .Input(filename, 0) + .Input(contents, 0) + .Finalize(g, &ret)); + + return ret; +} + +Node* Equal(Graph* g, Node* x, Node* y) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("Equal"), "Equal") + .Input(x) + .Input(y) + .Finalize(g, &ret)); + return ret; +} + +Node* Assert(Graph* g, Node* condition, + std::vector& data) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Assert") + .Input(condition) + .Input(data) + .Finalize(g, &ret)); + return ret; +} + +//------------------------------------------------------------------------------ +// Graph Constructor. + +static Graph* TransferFile(const std::string& test_type, + const int32 slice_size) { + Graph* g = new Graph(OpRegistry::Global()); + const int64 timeout_ms = 5000; + std::string recv_dir = "/tmp/FileSliceTransferTestRecv"; + std::string filename = "/tmp/FileSliceTransferTestSend/send_" + test_type; + std::string contents = \ + "The quick brown fox jumps over the lazy dog."; // 44 chars. + + // send filename node. + Tensor filename_t(DT_STRING, TensorShape({})); + filename_t.scalar().setConstant(filename); + Node* filename_n = test::graph::Constant(g, filename_t); + + // contents node. 
+ Tensor contents_t(DT_STRING, TensorShape({})); + contents_t.scalar().setConstant(contents); + Node* contents_n = test::graph::Constant(g, contents_t); + + Node* write_file_n = WriteFile(g, filename_n, contents_n); + Node* send_n = \ + FileSliceSend(g, filename_n, test_type, "/cpu:0", 1, "/cpu:0", slice_size); + g->AddControlEdge(write_file_n, send_n); + + Node* recv_n = FileSliceRecv(g, test_type, "/cpu:0", 1, "/cpu:0", recv_dir, + slice_size, timeout_ms); + Node* read_file_n = ReadFile(g, recv_n); + Node* equal_n = Equal(g, contents_n, read_file_n); + + std::vector data_out; + data_out.emplace_back(contents_n, 0); + data_out.emplace_back(read_file_n, 0); + Assert(g, equal_n, data_out); + + return g; +} + +static Graph* FileSliceSendTransferFileToSliceRecv(const std::string& test_type, + const int32 slice_size) { + Graph* g = new Graph(OpRegistry::Global()); + const int64 timeout_ms = 5000; + std::string filename = "/tmp/FileSliceTransferTestSend/send_" + test_type; + std::string contents = \ + "The quick brown fox jumps over the lazy dog."; // 44 chars. + + // send filename node. + Tensor filename_t(DT_STRING, TensorShape({})); + filename_t.scalar().setConstant(filename); + Node* filename_n = test::graph::Constant(g, filename_t); + + // contents node. 
+ Tensor contents_t(DT_STRING, TensorShape({})); + contents_t.scalar().setConstant(contents); + Node* contents_n = test::graph::Constant(g, contents_t); + + Node* write_file_n = WriteFile(g, filename_n, contents_n); + Node* send_n = \ + FileSliceSend(g, filename_n, test_type, "/cpu:0", 1, "/cpu:0", slice_size); + g->AddControlEdge(write_file_n, send_n); + + Node* recv_n = \ + SliceRecv(g, test_type, "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + Node* equal_n = Equal(g, contents_n, recv_n); + + std::vector data_out; + data_out.emplace_back(contents_n, 0); + data_out.emplace_back(recv_n, 0); + Assert(g, equal_n, data_out); + + return g; +} + +static Graph* SliceSendTransferFileToFileSliceRecv(const std::string& test_type, + const int32 slice_size) { + Graph* g = new Graph(OpRegistry::Global()); + const int64 timeout_ms = 5000; + std::string recv_dir = "/tmp/FileSliceTransferTestRecv"; + std::string contents = \ + "The quick brown fox jumps over the lazy dog."; // 44 chars. + + // contents node. 
+ Tensor contents_t(DT_STRING, TensorShape({})); + contents_t.scalar().setConstant(contents); + Node* contents_n = test::graph::Constant(g, contents_t); + + Node* send_n = \ + SliceSend(g, contents_n, test_type, "/cpu:0", 1, "/cpu:0", slice_size); + + Node* recv_n = FileSliceRecv(g, test_type, "/cpu:0", 1, "/cpu:0", recv_dir, + slice_size, timeout_ms); + Node* read_file_n = ReadFile(g, recv_n); + Node* equal_n = Equal(g, contents_n, read_file_n); + + std::vector data_out; + data_out.emplace_back(contents_n, 0); + data_out.emplace_back(read_file_n, 0); + Assert(g, equal_n, data_out); + + return g; +} + +static Graph* TransferDeadTensor() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 1024; + const int64 timeout_ms = 5000; + std::string recv_dir = "/tmp/FileSliceTransferTestRecv"; + std::string filename = "/tmp/FileSliceTransferTestSend/send_dead_tensor"; + + // val + Tensor val_t(DT_STRING, TensorShape({})); + val_t.scalar()() = filename; + Node* val_n = test::graph::Constant(g, val_t); + + Tensor pred_t(DT_BOOL, TensorShape({})); + pred_t.scalar()() = true; + Node* pred_n = test::graph::Constant(g, pred_t); + + Node* switch_n = test::graph::Switch(g, val_n, pred_n); + FileSliceSend(g, switch_n, "dead_tensor", "/cpu:0", 1, "/cpu:0", slice_size); + FileSliceRecv(g, "dead_tensor", "/cpu:0", 1, "/cpu:0", recv_dir, slice_size, + timeout_ms); + + return g; +} + +static Graph* FileSliceSendTransferDeadTensorToSliceRecv() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 1024; + const int64 timeout_ms = 5000; + std::string recv_dir = "/tmp/FileSliceTransferTestRecv"; + std::string filename = "/tmp/FileSliceTransferTestSend/send_dead_tensor"; + + // val + Tensor val_t(DT_STRING, TensorShape({})); + val_t.scalar()() = filename; + Node* val_n = test::graph::Constant(g, val_t); + + Tensor pred_t(DT_BOOL, TensorShape({})); + pred_t.scalar()() = true; + Node* pred_n = test::graph::Constant(g, pred_t); + + Node* switch_n = 
test::graph::Switch(g, val_n, pred_n); + FileSliceSend(g, switch_n, "dead_tensor", "/cpu:0", 1, "/cpu:0", slice_size); + SliceRecv(g, "dead_tensor", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + + return g; +} + +static Graph* SliceSendTransferDeadTensorToFileSliceRecv() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 1024; + const int64 timeout_ms = 5000; + std::string recv_dir = "/tmp/FileSliceTransferTestRecv"; + std::string contents = \ + "The quick brown fox jumps over the lazy dog."; // 44 chars. + + // val + Tensor val_t(DT_STRING, TensorShape({})); + val_t.scalar()() = contents; + Node* val_n = test::graph::Constant(g, val_t); + + Tensor pred_t(DT_BOOL, TensorShape({})); + pred_t.scalar()() = true; + Node* pred_n = test::graph::Constant(g, pred_t); + + Node* switch_n = test::graph::Switch(g, val_n, pred_n); + SliceSend(g, switch_n, "dead_tensor", "/cpu:0", 1, "/cpu:0", slice_size); + FileSliceRecv(g, "dead_tensor", "/cpu:0", 1, "/cpu:0", recv_dir, slice_size, + timeout_ms); + + return g; +} + +static Graph* TransferSmallFile() { + return TransferFile("small_file", 1024); +} + +static Graph* TransferBigFile() { + return TransferFile("big_file", 16); +} + +static Graph* FileSliceSendTransferSmallFileToSliceRecv() { + return FileSliceSendTransferFileToSliceRecv("small_file", 1024); +} + +static Graph* FileSliceSendTransferBigFileToSliceRecv() { + return FileSliceSendTransferFileToSliceRecv("big_file", 16); +} + +static Graph* SliceSendTransferSmallFileToFileSliceRecv() { + return SliceSendTransferFileToFileSliceRecv("small_file", 1024); +} + +static Graph* SliceSendTransferBigFileToFileSliceRecv() { + return SliceSendTransferFileToFileSliceRecv("big_file", 16); +} + +//------------------------------------------------------------------------------ +// Test Function. 
+ +static void BM_TransferSmallFile(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferSmallFile(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_TransferBigFile(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferBigFile(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_FileSliceSendTransferSmallFileToSliceRecv(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", FileSliceSendTransferSmallFileToSliceRecv(), nullptr, + nullptr, new DummyRendezvous).Run(iters); +} + +static void BM_FileSliceSendTransferBigFileToSliceRecv(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", FileSliceSendTransferBigFileToSliceRecv(), nullptr, + nullptr, new DummyRendezvous).Run(iters); +} + +static void BM_SliceSendTransferSmallFileToFileSliceRecv(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", SliceSendTransferSmallFileToFileSliceRecv(), nullptr, + nullptr, new DummyRendezvous).Run(iters); +} + +static void BM_SliceSendTransferBigFileToFileSliceRecv(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", SliceSendTransferBigFileToFileSliceRecv(), nullptr, + nullptr, new DummyRendezvous).Run(iters); +} + +static void BM_TransferDeadTensor(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferDeadTensor(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_FileSliceSendTransferDeadTensorToSliceRecv(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", FileSliceSendTransferDeadTensorToSliceRecv(), nullptr, + nullptr, new 
DummyRendezvous).Run(iters); +} + +static void BM_SliceSendTransferDeadTensorToFileSliceRecv(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", SliceSendTransferDeadTensorToFileSliceRecv(), nullptr, + nullptr, new DummyRendezvous).Run(iters); +} + +BENCHMARK(BM_TransferSmallFile); +BENCHMARK(BM_TransferBigFile); +BENCHMARK(BM_FileSliceSendTransferSmallFileToSliceRecv); +BENCHMARK(BM_FileSliceSendTransferBigFileToSliceRecv); +BENCHMARK(BM_SliceSendTransferSmallFileToFileSliceRecv); +BENCHMARK(BM_SliceSendTransferBigFileToFileSliceRecv); +BENCHMARK(BM_TransferDeadTensor); +BENCHMARK(BM_FileSliceSendTransferDeadTensorToSliceRecv); +BENCHMARK(BM_SliceSendTransferDeadTensorToFileSliceRecv); + +} // End of anonymous namespace + +} // End of namespace tensorflow diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.cc b/tensorflow/core/kernels/slice_sendrecv_ops.cc index f09f314ae10..25f1a4e8738 100644 --- a/tensorflow/core/kernels/slice_sendrecv_ops.cc +++ b/tensorflow/core/kernels/slice_sendrecv_ops.cc @@ -14,41 +14,10 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/kernels/slice_sendrecv_ops.h" +#include "tensorflow/core/kernels/slice_sendrecv_utils.h" namespace tensorflow { -//------------------------------------------------------------------------------ -// Utils. 
-static string GetSliceRendezvousKeyPrefix(const string& send_device, - const string& recv_device, - const uint64 send_device_incarnation, - const string& tensor_name) { - return strings::StrCat(send_device, ";", - strings::FpToString(send_device_incarnation), ";", - recv_device, ";", tensor_name); -} - -static void GetSliceRendezvousKey(const string& key_prefix, - const string& tensor_name_suffix, - const FrameAndIter& frame_iter, string* key) { - key->clear(); - strings::StrAppend(key, key_prefix, tensor_name_suffix, ";", - frame_iter.frame_id, ":", frame_iter.iter_id); -} - -static FrameAndIter GetFrameAndIter(OpKernelContext* ctx, - bool hostmem_sendrecv) { - if (hostmem_sendrecv && ctx->call_frame() != nullptr) { - // Host memory send/recv pairs are added by - // common_runtime/memory_types.cc. When the pair of nodes are - // added inside a function, we need to use the function call frame - // to formulate the unique rendezvous key. - return FrameAndIter(reinterpret_cast(ctx->call_frame()), 0); - } else { - return ctx->frame_iter(); - } -} - //------------------------------------------------------------------------------ // Functions of SliceSendOp. 
@@ -64,8 +33,9 @@ SliceSendOp::SliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { string tensor_name; OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); key_prefix_ = \ - GetSliceRendezvousKeyPrefix(send_device, recv_device, - send_device_incarnation, tensor_name); + slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, + recv_device, send_device_incarnation, tensor_name); + if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { hostmem_sendrecv_ = false; } @@ -79,7 +49,8 @@ void SliceSendOp::Compute(OpKernelContext* ctx) { errors::Internal("Op kernel context needs to provide a rendezvous.")); const Tensor& input_t = ctx->input(0); - FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); + FrameAndIter frame_iter = \ + slice_sendrecv::GetFrameAndIter(ctx, hostmem_sendrecv_); // send total_bytes. OP_REQUIRES_OK(ctx, SendTotalBytes(ctx, frame_iter, input_t)); @@ -95,8 +66,8 @@ void SliceSendOp::Compute(OpKernelContext* ctx) { args.alloc_attrs = ctx->input_alloc_attr(0); Rendezvous::ParsedKey parsed_key; - GetSliceRendezvousKey(key_prefix_, "_transfer_data", frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_transfer_data", + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); OP_REQUIRES_OK(ctx, ctx->rendezvous()->Send(parsed_key, args, input_t, @@ -124,11 +95,11 @@ Status SliceSendOp::SendTotalBytes(OpKernelContext* ctx, Rendezvous::ParsedKey parsed_key; Tensor total_bytes_t; - TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, TensorShape({}), + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_UINT64, TensorShape({}), &total_bytes_t)); - total_bytes_t.scalar()() = input_t.TotalBytes(); - GetSliceRendezvousKey(key_prefix_, "_slice_transfer_totalbytes", frame_iter, - &parsed_key.buf_); + total_bytes_t.scalar()() = input_t.TotalBytes(); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, + 
"_slice_transfer_totalbytes", frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); return ctx->rendezvous()->Send(parsed_key, args, total_bytes_t, @@ -152,8 +123,8 @@ Status SliceSendOp::SendShape(OpKernelContext* ctx, for (int i = 0; i < rank; i++) { shape_vec(i) = shape.dim_size(i); } - GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, + "_slice_transfer_shape", frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); return ctx->rendezvous()->Send(parsed_key, args, shape_t, @@ -168,21 +139,21 @@ Status SliceSendOp::SendString(OpKernelContext* ctx, args.alloc_attrs = AllocatorAttributes(); Rendezvous::ParsedKey parsed_key; - // send elements size. - Tensor elements_size_t; - TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, input_t.shape(), - &elements_size_t)); + // send elements bytes. 
+ Tensor elements_bytes_t; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_UINT64, input_t.shape(), + &elements_bytes_t)); int64 num_elements = input_t.NumElements(); auto input_flat = input_t.flat(); - auto elements_size_flat = elements_size_t.flat(); + auto elements_bytes_flat = elements_bytes_t.flat(); for (int64 i = 0; i < num_elements; i++) { - elements_size_flat(i) = input_flat(i).size(); + elements_bytes_flat(i) = input_flat(i).size(); } - GetSliceRendezvousKey(key_prefix_, "_slice_transfer_elements_size", - frame_iter, &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, + "_slice_transfer_elements_bytes", frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, elements_size_t, + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, elements_bytes_t, ctx->is_input_dead())); // send data. @@ -196,8 +167,8 @@ Status SliceSendOp::SendString(OpKernelContext* ctx, data_t.scalar()() = elem; std::string tensor_name_suffix = \ strings::StrCat("_slice_transfer_data_", std::to_string(i)); - GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, @@ -218,7 +189,10 @@ Status SliceSendOp::SendStringSlice(OpKernelContext* ctx, args.alloc_attrs = ctx->input_alloc_attr(0); Rendezvous::ParsedKey parsed_key; - int64 slice_num = (elem.size() + slice_size_ - 1) / slice_size_; + int64 slice_num = elem.size() / slice_size_; + if (elem.size() % slice_size_ != 0) { + slice_num += 1; + } Tensor data_t; for (int64 i = 0; i < slice_num; i++) { TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_STRING, 
TensorShape({}), &data_t)); @@ -231,8 +205,8 @@ Status SliceSendOp::SendStringSlice(OpKernelContext* ctx, std::string tensor_name_suffix = \ strings::StrCat("_slice_transfer_data_", std::to_string(index), "_", std::to_string(i)); - GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, @@ -252,12 +226,15 @@ Status SliceSendOp::SendBasicType(OpKernelContext* ctx, // send data. Tensor data_t; - int64 bytes_num = input_t.TotalBytes(); - int64 slice_num = (bytes_num + slice_size_ - 1) / slice_size_; + size_t bytes_num = input_t.TotalBytes(); + int64 slice_num = bytes_num / slice_size_; + if (bytes_num % slice_size_ != 0) { + slice_num += 1; + } unsigned char* input_base = reinterpret_cast(input_t.data()); for (int64 i = 0; i < slice_num; i++) { - int64 start = i * slice_size_; - int64 copy_size = slice_size_; + size_t start = i * slice_size_; + size_t copy_size = slice_size_; if (start > bytes_num - slice_size_) { copy_size = bytes_num - start; } @@ -267,8 +244,8 @@ Status SliceSendOp::SendBasicType(OpKernelContext* ctx, std::memcpy(data_base, input_base+start, copy_size); std::string tensor_name_suffix = \ strings::StrCat("_slice_transfer_data_", std::to_string(i)); - GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, @@ -296,8 +273,8 @@ SliceRecvOp::SliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) { string tensor_name; 
OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); key_prefix_ = \ - GetSliceRendezvousKeyPrefix(send_device, recv_device, - send_device_incarnation, tensor_name); + slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, + recv_device, send_device_incarnation, tensor_name); if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { hostmem_sendrecv_ = false; } @@ -311,11 +288,12 @@ void SliceRecvOp::Compute(OpKernelContext* ctx) { ctx, ctx->rendezvous() != nullptr, errors::Internal("Op kernel context needs to provide a rendezvous.")); - FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); + FrameAndIter frame_iter = \ + slice_sendrecv::GetFrameAndIter(ctx, hostmem_sendrecv_); bool is_dead; // recv total_bytes. - int64 total_bytes; + uint64 total_bytes; OP_REQUIRES_OK(ctx, RecvTotalBytes(ctx, frame_iter, is_dead, total_bytes)); if (is_dead) { return; @@ -334,8 +312,8 @@ void SliceRecvOp::Compute(OpKernelContext* ctx) { } Rendezvous::ParsedKey parsed_key; - GetSliceRendezvousKey(key_prefix_, "_transfer_data", frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_transfer_data", + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); Tensor data_t; @@ -364,7 +342,7 @@ void SliceRecvOp::Compute(OpKernelContext* ctx) { Status SliceRecvOp::RecvTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, - bool& is_dead, int64& total_bytes) { + bool& is_dead, uint64& total_bytes) { Rendezvous::Args args; args.device_context = ctx->op_device_context(); args.alloc_attrs = AllocatorAttributes(); @@ -377,14 +355,14 @@ Status SliceRecvOp::RecvTotalBytes(OpKernelContext* ctx, Rendezvous::ParsedKey parsed_key; Tensor total_bytes_t; - GetSliceRendezvousKey(key_prefix_, "_slice_transfer_totalbytes", frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, + "_slice_transfer_totalbytes", 
frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &total_bytes_t, &is_dead, timeout_ms_)); if (!is_dead) { - total_bytes = total_bytes_t.scalar()(); + total_bytes = total_bytes_t.scalar()(); } return Status::OK(); @@ -404,8 +382,8 @@ Status SliceRecvOp::RecvShape(OpKernelContext* ctx, } Rendezvous::ParsedKey parsed_key; - GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); @@ -439,27 +417,27 @@ Status SliceRecvOp::RecvString(OpKernelContext* ctx, Rendezvous::ParsedKey parsed_key; bool is_dead; - // recv elements size. - GetSliceRendezvousKey(key_prefix_, "_slice_transfer_elements_size", - frame_iter, &parsed_key.buf_); + // recv elements bytes. + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, + "_slice_transfer_elements_bytes", frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - Tensor elements_size_t; - TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &elements_size_t, + Tensor elements_bytes_t; + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &elements_bytes_t, &is_dead, timeout_ms_)); // This shouldn't be a dead tensor. 
CHECK_EQ(is_dead, false); - auto elements_size_flat = elements_size_t.flat(); + auto elements_bytes_flat = elements_bytes_t.flat(); int64 num_elements = shape.num_elements(); args.alloc_attrs = ctx->output_alloc_attr(0); Tensor data_t; auto output_flat = output_t->flat(); for (int64 i = 0; i < num_elements; i++) { - if (elements_size_flat(i) <= slice_size_) { + if (elements_bytes_flat(i) <= slice_size_) { std::string tensor_name_suffix = \ strings::StrCat("_slice_transfer_data_", std::to_string(i)); - GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, @@ -469,7 +447,7 @@ Status SliceRecvOp::RecvString(OpKernelContext* ctx, output_flat(i) = data_t.scalar()(); } else { TF_RETURN_IF_ERROR(RecvStringSlice(ctx, frame_iter, i, - elements_size_flat(i), output_flat)); + elements_bytes_flat(i), output_flat)); } } @@ -478,7 +456,8 @@ Status SliceRecvOp::RecvString(OpKernelContext* ctx, Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter, - const int64 index, const int64 element_size, + const int64 index, + const uint64 element_bytes, TTypes::Flat& output_flat) { Rendezvous::Args args; args.device_context = ctx->op_device_context(); @@ -491,15 +470,18 @@ Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx, } Rendezvous::ParsedKey parsed_key; - int64 slice_num = (element_size + slice_size_ - 1) / slice_size_; + int64 slice_num = element_bytes / slice_size_; + if (element_bytes % slice_size_ != 0) { + slice_num += 1; + } Tensor data_t; bool is_dead = false; for (int64 i = 0; i < slice_num; i++) { std::string tensor_name_suffix = \ strings::StrCat("_slice_transfer_data_", std::to_string(index), "_", 
std::to_string(i)); - GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, @@ -514,7 +496,7 @@ Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx, Status SliceRecvOp::RecvBasicType(OpKernelContext* ctx, const FrameAndIter& frame_iter, - const int64 total_bytes, + const uint64 total_bytes, Tensor*& output_t) { Rendezvous::Args args; args.device_context = ctx->op_device_context(); @@ -529,19 +511,22 @@ Status SliceRecvOp::RecvBasicType(OpKernelContext* ctx, Tensor data_t; bool is_dead = false; - int64 slice_num = (total_bytes + slice_size_ - 1) / slice_size_; + int64 slice_num = total_bytes / slice_size_; + if (total_bytes % slice_size_ != 0) { + slice_num += 1; + } unsigned char* output_base = \ reinterpret_cast(output_t->data()); for (int64 i = 0; i < slice_num; i++) { - int64 start = i * slice_size_; - int64 copy_size = slice_size_; + uint64 start = i * slice_size_; + uint64 copy_size = slice_size_; if (start > total_bytes - slice_size_) { copy_size = total_bytes - start; } std::string tensor_name_suffix = \ strings::StrCat("_slice_transfer_data_", std::to_string(i)); - GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.h b/tensorflow/core/kernels/slice_sendrecv_ops.h index df55c080aa1..43429bff32f 100644 --- a/tensorflow/core/kernels/slice_sendrecv_ops.h +++ 
b/tensorflow/core/kernels/slice_sendrecv_ops.h @@ -66,7 +66,7 @@ class SliceRecvOp : public OpKernel { // Fucntions. Status RecvTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, - bool& is_dead, int64& total_bytes); + bool& is_dead, uint64& total_bytes); Status RecvShape(OpKernelContext* ctx, const FrameAndIter& frame_iter, TensorShape& shape); @@ -75,11 +75,11 @@ class SliceRecvOp : public OpKernel { const TensorShape& shape, Tensor*& output_t); Status RecvStringSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter, - const int64 index, const int64 element_size, + const int64 index, const uint64 element_bytes, TTypes::Flat& output_flat); Status RecvBasicType(OpKernelContext* ctx, const FrameAndIter& frame_iter, - const int64 total_bytes, Tensor*& output_t); + const uint64 total_bytes, Tensor*& output_t); TF_DISALLOW_COPY_AND_ASSIGN(SliceRecvOp); }; diff --git a/tensorflow/core/kernels/slice_sendrecv_utils.cc b/tensorflow/core/kernels/slice_sendrecv_utils.cc new file mode 100644 index 00000000000..56c2166c650 --- /dev/null +++ b/tensorflow/core/kernels/slice_sendrecv_utils.cc @@ -0,0 +1,53 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/kernels/slice_sendrecv_utils.h" + +namespace tensorflow { + +namespace slice_sendrecv { + +string GetSliceRendezvousKeyPrefix(const string& send_device, + const string& recv_device, + const uint64 send_device_incarnation, + const string& tensor_name) { + return strings::StrCat(send_device, ";", + strings::FpToString(send_device_incarnation), ";", + recv_device, ";", tensor_name); +} + +void GetSliceRendezvousKey(const string& key_prefix, + const string& tensor_name_suffix, + const FrameAndIter& frame_iter, string* key) { + key->clear(); + strings::StrAppend(key, key_prefix, tensor_name_suffix, ";", + frame_iter.frame_id, ":", frame_iter.iter_id); +} + +FrameAndIter GetFrameAndIter(OpKernelContext* ctx, bool hostmem_sendrecv) { + if (hostmem_sendrecv && ctx->call_frame() != nullptr) { + // Host memory send/recv pairs are added by + // common_runtime/memory_types.cc. When the pair of nodes are + // added inside a function, we need to use the function call frame + // to formulate the unique rendezvous key. + return FrameAndIter(reinterpret_cast(ctx->call_frame()), 0); + } else { + return ctx->frame_iter(); + } +} + +}; // End of namespace slice_sendrecv + +}; // End of namespace tensorflow diff --git a/tensorflow/core/kernels/slice_sendrecv_utils.h b/tensorflow/core/kernels/slice_sendrecv_utils.h new file mode 100644 index 00000000000..3605eece2ca --- /dev/null +++ b/tensorflow/core/kernels/slice_sendrecv_utils.h @@ -0,0 +1,41 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_UTILS_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +namespace slice_sendrecv { + +extern string GetSliceRendezvousKeyPrefix(const string& send_device, + const string& recv_device, + const uint64 send_device_incarnation, + const string& tensor_name); + +extern void GetSliceRendezvousKey(const string& key_prefix, + const string& tensor_name_suffix, + const FrameAndIter& frame_iter, string* key); + +extern FrameAndIter GetFrameAndIter(OpKernelContext* ctx, + bool hostmem_sendrecv); + +}; // End of namespace slice_sendrecv + +}; // End of namespace tensorflow + +#endif // End of macro TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_UTILS_H_ diff --git a/tensorflow/core/ops/file_slice_sendrecv_ops.cc b/tensorflow/core/ops/file_slice_sendrecv_ops.cc new file mode 100644 index 00000000000..c7eb20d1358 --- /dev/null +++ b/tensorflow/core/ops/file_slice_sendrecv_ops.cc @@ -0,0 +1,77 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/common_shape_fns.h" + +namespace tensorflow { +REGISTER_OP("_FileSliceSend") + .Input("file_path: string") + .Attr("tensor_name: string") + .Attr("send_device: string") + .Attr("send_device_incarnation: int") + .Attr("recv_device: string") + .Attr("client_terminated: bool = false") + .Attr("slice_size: int >= 1") + .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +Sends the file from send_device to recv_device. +Supports sending the file of any size. + +file_path: The file to send. +tensor_name: The name of the tensor to send. +send_device: The name of the device sending the tensor. +send_device_incarnation: The current incarnation of send_device. +recv_device: The name of the device receiving the tensor. +client_terminated: If set to true, this indicates that the node was added + to the graph as a result of a client-side feed or fetch of Tensor data, + in which case the corresponding send or recv is expected to be managed + locally by the caller. +slice_size: The maximum number of bytes transferred at one time. 
+)doc"); + +REGISTER_OP("_FileSliceRecv") + .Output("file_path: string") + .Attr("tensor_name: string") + .Attr("send_device: string") + .Attr("send_device_incarnation: int") + .Attr("recv_device: string") + .Attr("client_terminated: bool = false") + .Attr("recv_dir: string") + .Attr("slice_size: int >= 1") + .Attr("timeout_ms: int >= 0 = 300000") + .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +Receives the file from send_device on recv_device. +Supports recving the file of any size. + +file_path: The file to receive. +tensor_name: The name of the tensor to receive. +send_device: The name of the device sending the tensor. +send_device_incarnation: The current incarnation of send_device. +recv_device: The name of the device receiving the tensor. +client_terminated: If set to true, this indicates that the node was added + to the graph as a result of a client-side feed or fetch of Tensor data, + in which case the corresponding send or recv is expected to be managed + locally by the caller. +recv_dir: the directory to store received file. +slice_size: The maximum number of bytes transferred at one time. +timeout_ms: The maximum wait time for receiving a tensor. +)doc"); + +}; // End of namespace tensorflow From 2f938dc2a18e57c9a302f5a8b988f6cd39f89e2f Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 9 Jan 2024 17:46:11 -0800 Subject: [PATCH 25/45] [TensorRT] Fix Graph contains EmbeddingVariable compiling issue. 
(#964) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 Co-authored-by: 泊霆 --- tensorflow/python/compiler/tensorrt/trt_convert.py | 12 +++++------- tensorflow/python/framework/graph_util_impl.py | 12 +++++------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py index 2c8d603ba01..064e32c6984 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert.py @@ -539,13 +539,10 @@ def _gather_names(tensor_info): # EmbeddingVariable can not be convert to constant, so we need to # load ev varibles at runtime always. if self._use_ev: - global_step_collection_ops = sess.graph.get_collection("global_step") - global_step_name = global_step_collection_ops[0].name.split(":")[0] output_node_names.add(filename_tensor_name) output_node_names.add(save_tensor_name) output_node_names.add(restore_op_name) - tf_logging.info("TensorRT - global_step_name: %s" % str(global_step_name)) tf_logging.info("TensorRT - filename_tensor_name: %s" % str(filename_tensor_name)) tf_logging.info("TensorRT - save_tensor_name: %s" % str(save_tensor_name)) tf_logging.info("TensorRT - restore_op_name: %s" % str(restore_op_name)) @@ -559,18 +556,19 @@ def _gather_names(tensor_info): # Freeze the variables in the SavedModel graph and copy the frozen # graph over. 
- variable_names_blacklist = [] if self._use_ev: - variable_names_blacklist.append(global_step_name) + global_step_collection_ops = sess.graph.get_collection("global_step") + if len(global_step_collection_ops) > 0: + sess.run([sess.graph.get_operation_by_name("global_step/Assign")]) frozen_graph_def = graph_util.convert_variables_to_constants( sess, sess.graph.as_graph_def(add_shapes=True), - list(output_node_names), variable_names_blacklist=variable_names_blacklist) + list(output_node_names)) if self._use_ev: # Keep KV Variable in saver_def, these kv-vars will be initialized at runtime. frozen_graph_def = graph_util.create_kv_variable_init_graph( - frozen_graph_def, global_step_name, restore_op_name) + frozen_graph_def, restore_op_name) self._grappler_meta_graph_def = meta_graph_pb2.MetaGraphDef() self._grappler_meta_graph_def.graph_def.CopyFrom(frozen_graph_def) diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py index 76d69e886e7..c3fa37529c3 100644 --- a/tensorflow/python/framework/graph_util_impl.py +++ b/tensorflow/python/framework/graph_util_impl.py @@ -169,7 +169,7 @@ def _bfs_for_reachable_nodes(target_nodes, name_to_input_name): return nodes_to_keep @tf_export(v1=["graph_util.create_kv_variable_init_graph"]) -def create_kv_variable_init_graph(graph, global_step_name, restore_all_op_name): +def create_kv_variable_init_graph(graph, restore_all_op_name): name_to_input_name, name_to_node, name_to_seq_num = \ _extract_graph_summary(graph) @@ -184,8 +184,10 @@ def create_kv_variable_init_graph(graph, global_step_name, restore_all_op_name): " {} in current graph.".format(restore_all_op_name)) for restore_shard_input_full_name in restore_all_op.input: - restore_shard_input_name = re.sub(r"^\^", "", restore_shard_input_full_name) - restore_shard_input_op = name_to_node[restore_shard_input_name] + restore_shard_input_no_op_name = re.sub(r"^\^", "", restore_shard_input_full_name) + restore_shard_input_no_op = 
name_to_node[restore_shard_input_no_op_name] + restore_shard_input_op_name = re.sub(r"^\^", "",restore_shard_input_no_op.input[0]) + restore_shard_input_op = name_to_node[restore_shard_input_op_name] # go through all restore_shard ops new_node = node_def_pb2.NodeDef() new_node.CopyFrom(restore_shard_input_op) @@ -198,10 +200,6 @@ def create_kv_variable_init_graph(graph, global_step_name, restore_all_op_name): n_node.op == "KvResourceImportV2" or \ n_node.op == "KvResourceImport": new_node.input.append(n_full_name) - else: - # Keep global_step assign op in new save/restore_all - if n_node.input[0] == global_step_name: - new_node.input.append(n_full_name) graph.node.remove(restore_shard_input_op) graph.node.extend([new_node]) From 5eabe5fba8b08707020868c899b7cd63784a70f6 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Fri, 12 Jan 2024 00:24:52 -0800 Subject: [PATCH 26/45] [Embedding] Make Embedding backward compatible with previous saved_model. 
(#963) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 Co-authored-by: 泊霆 --- tensorflow/python/ops/kv_variable_ops.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 1ef9550ef6d..840aadf2541 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -530,11 +530,16 @@ def _init_from_proto(self, variable_def, import_scope=None): cache_op = op elif self._initializer_op.type == "InitializeKvVariableOp": init_op = self._initializer_op - - self._init_op_for_restore = g.as_graph_element( + if variable_def.initialize_op_for_restore: + self._init_op_for_restore = g.as_graph_element( ops.prepend_name_scope( variable_def.initialize_op_for_restore, import_scope=import_scope)) + else: #Backward compatibility with 2306 + self._init_op_for_restore = g.as_graph_element( + ops.prepend_name_scope( + variable_def.initializer_name, + import_scope=import_scope)) self._trainable = getattr(variable_def, "trainable", True) if variable_def.snapshot_name: self._cached_value = g.as_graph_element( From d84837fc3c589ea32aad9a3e6b6a272cbd92a079 Mon Sep 17 00:00:00 2001 From: dashingwu Date: Thu, 1 Feb 2024 12:22:01 +0800 Subject: [PATCH 27/45] [Runtime] fix a scheduling issue (#970) The original code assumes the last 4 bits of the CPU cycle count is uniformly distributed, but that is not true; at least on Intel IceLake (Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz), the CPU cycle count is always an ODD number. This fact will result in expensive ops being frequently scheduled to a single thread, which will greatly increase the RT time (in a customer scenario, from ~30ms to ~45ms). 
Signed-off-by: Xiaoguang Wu Co-authored-by: Xiaoguang Wu --- tensorflow/core/common_runtime/executor.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index fd38329a1fa..3df0d2a15be 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -730,15 +730,16 @@ Status ExecutorState::ProcessSync( } else if (kernel_stats_->HasExpensiveMarker(item)) { KernelTimer timer; + static uint64 update_counter = 0; device->Compute(op_kernel, &ctx); - // For expensive kernels, always update the cost estimate. For inexpensive - // kernels, update the cost estimate with ~1/16 probability. This assumes - // that the last 4 bits of the CPU cycle count is uniformly distributed. + constexpr int kKernelExecutionTrackingInvocationSkipCount = 16; if (is_expensive || - timer.start_cycles % kKernelExecutionTrackingInvocationSkipCount == 0) { + update_counter % kKernelExecutionTrackingInvocationSkipCount == 0) { kernel_stats_->UpdateCostEstimate(item, timer.ElapsedCycles()); } + + update_counter++; } else { device->Compute(op_kernel, &ctx); } From 2b15e8a13a7d17736366bb9600267f94465b72e8 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Sun, 4 Feb 2024 01:58:50 -0800 Subject: [PATCH 28/45] [Embedding] Fix shared embedding frequency counting problem. 
(#962) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 Co-authored-by: 泊霆 --- .../api_def_UniqueWithExtraCounts.pbtxt | 4 + .../api_def_UniqueWithExtraCounts.pbtxt | 3 + .../api_def_UniqueWithExtraCounts.pbtxt | 4 + tensorflow/core/kernels/unique_ali_op.cc | 121 ++++++++++++----- tensorflow/core/kernels/unique_ali_op_util.h | 122 +++++++++++++++--- tensorflow/core/ops/array_ops.cc | 20 +++ .../framework/python_op_gen_internal.cc | 1 + .../python/kernel_tests/unique_op_test.py | 68 ++++++++++ tensorflow/python/ops/array_ops.py | 1 - .../python/ops/embedding_variable_ops_test.py | 69 ++++++++++ .../python/training/gradient_descent.py | 23 +++- tensorflow/python/training/optimizer.py | 22 ++-- 12 files changed, 386 insertions(+), 72 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_UniqueWithExtraCounts.pbtxt create mode 100644 tensorflow/core/api_def/java_api/api_def_UniqueWithExtraCounts.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_UniqueWithExtraCounts.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_UniqueWithExtraCounts.pbtxt b/tensorflow/core/api_def/base_api/api_def_UniqueWithExtraCounts.pbtxt new file mode 100644 index 00000000000..b8fabfe75a9 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_UniqueWithExtraCounts.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "UniqueWithExtraCounts" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/java_api/api_def_UniqueWithExtraCounts.pbtxt b/tensorflow/core/api_def/java_api/api_def_UniqueWithExtraCounts.pbtxt new file mode 100644 index 00000000000..117b73ef185 --- /dev/null +++ b/tensorflow/core/api_def/java_api/api_def_UniqueWithExtraCounts.pbtxt @@ -0,0 +1,3 @@ +op { + graph_op_name: "UniqueWithExtraCounts" +} diff --git a/tensorflow/core/api_def/python_api/api_def_UniqueWithExtraCounts.pbtxt b/tensorflow/core/api_def/python_api/api_def_UniqueWithExtraCounts.pbtxt new file mode 100644 
index 00000000000..b8fabfe75a9 --- /dev/null +++ b/tensorflow/core/api_def/python_api/api_def_UniqueWithExtraCounts.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "UniqueWithExtraCounts" + visibility: HIDDEN +} diff --git a/tensorflow/core/kernels/unique_ali_op.cc b/tensorflow/core/kernels/unique_ali_op.cc index 28b5dad1990..efae935db12 100644 --- a/tensorflow/core/kernels/unique_ali_op.cc +++ b/tensorflow/core/kernels/unique_ali_op.cc @@ -25,8 +25,8 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/kernels/task_runner.h" #include "tensorflow/core/kernels/unique_ali_op_util.h" -#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/util/env_var.h" namespace tensorflow { @@ -41,40 +41,43 @@ const char* kStlHashMapString = "STL"; const char* kAbslHashMapString = "ABSL"; const char* kGoogleHashMapString = "GOOGLE"; const int64 kDefaultUniqueRatioHint = 4; -} +} // namespace template class UniqueAliOp : public OpKernel { public: explicit UniqueAliOp(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, ReadInt64FromEnvVar(kUniqueOpPartitionSizeEnv, - kPartitionSize, &partition_size_)); - OP_REQUIRES(context, partition_size_ > 0, - errors::InvalidArgument("Invaild PARTITION_SIZE=", - partition_size_)); + OP_REQUIRES_OK( + context, ReadInt64FromEnvVar(kUniqueOpPartitionSizeEnv, kPartitionSize, + &partition_size_)); + OP_REQUIRES( + context, partition_size_ > 0, + errors::InvalidArgument("Invaild PARTITION_SIZE=", partition_size_)); - OP_REQUIRES_OK(context, ReadBoolFromEnvVar(kUniqueOpSerialEnv, - false, &serial_)); + OP_REQUIRES_OK(context, + ReadBoolFromEnvVar(kUniqueOpSerialEnv, false, &serial_)); // NOTE(zycao>: Hash map insertion and lookup performance is dominating in // Unique Op. 
Based on benchmark results, 'google::dense_hash_map' will be // used as default for most key types except string. // - // By setting "DEEPREC_UNIQUE_OP_HASH_MAP" environment variable, a particular - // hash map could be seleteed to use. Possible choices are listed below: + // By setting "DEEPREC_UNIQUE_OP_HASH_MAP" environment variable, a + // particular hash map could be seleteed to use. Possible choices are listed + // below: // "MULTIMAP" for multimap parrallel process, // "STL" for std::unordred_map, // "ABSL" for absl::flat_hash_map, // "GOOGLE" for google::dense_hash_map. std::string hash_map_str; - OP_REQUIRES_OK(context, ReadStringFromEnvVar(kUniqueOpHashMapEnv, - kGoogleHashMapString, - &hash_map_str)); + OP_REQUIRES_OK( + context, ReadStringFromEnvVar(kUniqueOpHashMapEnv, kGoogleHashMapString, + &hash_map_str)); std::transform(hash_map_str.begin(), hash_map_str.end(), hash_map_str.begin(), ::toupper); OP_REQUIRES_OK(context, ReadInt64FromEnvVar(kUniqueOpUniqRatioHint, - kDefaultUniqueRatioHint, &unique_ratio_hint_)); + kDefaultUniqueRatioHint, + &unique_ratio_hint_)); OP_REQUIRES(context, unique_ratio_hint_ > 0, errors::InvalidArgument("Invaild ", kUniqueOpUniqRatioHint, "=", unique_ratio_hint_)); @@ -83,7 +86,8 @@ class UniqueAliOp : public OpKernel { map_flag_ = MULTIMAP; static char print_once = [] { LOG(INFO) << "MultiMapCompute preserved " - "dense hash map key: " << kPreseverdEmptyKey; + "dense hash map key: " + << kPreseverdEmptyKey; return '\0'; }(); } else if (!hash_map_str.compare(kStlHashMapString)) { @@ -95,7 +99,6 @@ class UniqueAliOp : public OpKernel { } else { map_flag_ = GOOGLE; } - } void Compute(OpKernelContext* context) override { @@ -110,16 +113,14 @@ class UniqueAliOp : public OpKernel { Tensor output; Tensor output_counter; if (context->num_inputs() == 1) { - UniqueWithoutAxis(context, input, - &idx, &output, &output_counter, num_outputs(), - partition_size_, serial_, unique_ratio_hint_, - map_flag_); + UniqueWithoutAxis( + context, 
input, &idx, &output, &output_counter, num_outputs(), + partition_size_, serial_, unique_ratio_hint_, map_flag_); } else { const Tensor& axis_tensor = context->input(1); - UniqueWithAxis(context, input, - axis_tensor, &idx, &output, &output_counter, - num_outputs(), partition_size_, serial_, - unique_ratio_hint_, map_flag_); + UniqueWithAxis(context, input, axis_tensor, &idx, &output, + &output_counter, num_outputs(), partition_size_, + serial_, unique_ratio_hint_, map_flag_); } context->set_output(0, output); context->set_output(1, idx); @@ -128,33 +129,65 @@ class UniqueAliOp : public OpKernel { } } + protected: bool serial_ = false; int64 partition_size_ = 0; int64 unique_ratio_hint_; UniqueMaps map_flag_ = GOOGLE; // "GOOGLE" dense hash map is default }; +template +class UniqueWithCountAliOp : public UniqueAliOp { + using UniqueAliOp::serial_; + using UniqueAliOp::partition_size_; + using UniqueAliOp::unique_ratio_hint_; + using UniqueAliOp::map_flag_; + using OpKernel::num_outputs; + + public: + explicit UniqueWithCountAliOp(OpKernelConstruction* context) + : UniqueAliOp(context) { + OP_REQUIRES_OK(context, context->GetAttr("N", &num_sparse_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + Tensor idx; + Tensor output; + Tensor output_counter; + UniqueWithExtraCounts( + context, input, &idx, &output, &output_counter, num_outputs(), + partition_size_, serial_, unique_ratio_hint_, num_sparse_, map_flag_); + context->set_output(0, output); + context->set_output(1, idx); + context->set_output(2, output_counter); + } + + private: + int num_sparse_; +}; + #define REGISTER_UNIQUE(type) \ REGISTER_KERNEL_BUILDER(Name("Unique") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("Unique") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ 
REGISTER_KERNEL_BUILDER(Name("UniqueV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("UniqueV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ @@ -164,7 +197,7 @@ class UniqueAliOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ @@ -174,7 +207,17 @@ class UniqueAliOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp) + UniqueAliOp) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithExtraCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithExtraCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); REGISTER_UNIQUE(string) #undef REGISTER_UNIQUE @@ -198,12 +241,22 @@ REGISTER_UNIQUE(string) .HostMemory("count") \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); + UniqueAliOp) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithExtraCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithExtraCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp); TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); REGISTER_UNIQUE(string) #undef REGISTER_UNIQUE -#endif //GOOGLE_CUDA - +#endif // GOOGLE_CUDA + #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("Unique") .Device(DEVICE_SYCL) diff --git 
a/tensorflow/core/kernels/unique_ali_op_util.h b/tensorflow/core/kernels/unique_ali_op_util.h index 6b59ba26e81..0a52d8864e9 100644 --- a/tensorflow/core/kernels/unique_ali_op_util.h +++ b/tensorflow/core/kernels/unique_ali_op_util.h @@ -191,7 +191,8 @@ void NewSizes(OpKernelContext* context, const Tensor& input, template void SerialComputeV1(OpKernelContext* context, const Tensor& input, - Tensor* idx, int64 axis, int64* uniq_size, Tensor* output) { + Tensor* idx, int64 axis, int64* uniq_size, int num_sparse, + google::dense_hash_map* counter_map, Tensor* output) { auto Tin = input.flat(); const int64 N = input.NumElements(); auto idx_vec = idx->template vec(); @@ -205,7 +206,23 @@ void SerialComputeV1(OpKernelContext* context, const Tensor& input, ++j; } } - + + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * N); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + auto idx_it = uniq.find(ids); + if (idx_it != uniq.end()) { + counter_map->emplace(idx_it->second, counter_vec(k)); + } + } + } + *uniq_size = static_cast(uniq.size()); TensorShape output_shape(input.shape()); output_shape.set_dim(axis, *uniq_size); @@ -223,7 +240,8 @@ void SerialComputeV1(OpKernelContext* context, const Tensor& input, template void ParallelComputeV1(OpKernelContext* context, const Tensor& input, - Tensor* idx, int64 axis, int64* uniq_size, Tensor* output) { + Tensor* idx, int64 axis, int64* uniq_size, int num_sparse, + google::dense_hash_map* counter_map, Tensor* output) { // Struct INode was used to store an inverse mapping for each node in the // hash map container. 
struct INode { @@ -415,6 +433,25 @@ void ParallelComputeV1(OpKernelContext* context, const Tensor& input, TaskRunner t3_runner(GlobalIndexTask, thread_pool, num_tasks_t1); t3_runner.Run(); + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * N); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + for (int j = 0; j < num_tasks_t1; ++j) { + const INode* inode = uniq_maps[j].GetINodeByKey(ids); + if (inode != nullptr) { + counter_map->emplace(inode->index_, counter_vec(k)); + continue; + } + } + } + } + // Parallel Step 4: Write output indicies Tensor. int32 max_tasks_t4 = (N + kPartitionSize - 1) / kPartitionSize; int32 num_tasks_t4 = std::max(std::min(max_threads, max_tasks_t4), 1); @@ -447,8 +484,8 @@ void ParallelComputeV1(OpKernelContext* context, const Tensor& input, template void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, int64 axis, int64* uniq_size_out, - int32 num_buckets, int64 unique_ratio_hint, - Tensor* output) { + int32 num_buckets, int64 unique_ratio_hint, int num_sparse, + google::dense_hash_map* counter_map, Tensor* output) { auto Tin = input.vec(); const int64 N = input.NumElements(); @@ -529,6 +566,24 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, } int64 uniq_size = global_offsets[num_buckets - 1] + uniq_maps[num_buckets - 1].size(); + + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * uniq_size); + + google::dense_hash_map extra_unique_id_map; + extra_unique_id_map.set_empty_key(std::numeric_limits::max()); + extra_unique_id_map.resize(2 * uniq_size); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = 
context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + auto counts = counter_vec(k); + extra_unique_id_map.emplace(ids, counts); + } + } *uniq_size_out = uniq_size; AllocatorAttributes attr; @@ -539,7 +594,7 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, auto key_output_vec = output->template vec(); auto OutputTask = [&key_output_vec, &uniq_maps, &global_offsets, - &Tin, &idx_vec, &map_parter] + &Tin, &idx_vec, &map_parter, &counter_map, extra_unique_id_map] (int32 task_id, int32 num_tasks) { TIndex offset = global_offsets[task_id]; for (auto iter = uniq_maps[task_id].begin(); iter != uniq_maps[task_id].end(); ++iter) { @@ -553,7 +608,10 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, next_idx = idx_vec(cur_idx); idx_vec(cur_idx) = offset; } - + auto it = extra_unique_id_map.find(iter->first); + if (it != extra_unique_id_map.end()) { + counter_map->emplace(offset, it->second); + } ++offset; } }; @@ -618,8 +676,9 @@ void MultipleElements(OpKernelContext* context, const Tensor& input, } template -void CheckCountOutput(OpKernelContext* context, Tensor* output_counter, - Tensor* idx, int num_outputs, int64 uniq_size) { +void CheckCountOutput(OpKernelContext* context, Tensor* output, Tensor* output_counter, + Tensor* idx, int num_outputs, int64 uniq_size, + int num_sparse, google::dense_hash_map counter_map) { if (num_outputs > 2) { auto idx_vec = idx->template vec(); AllocatorAttributes attr; @@ -633,12 +692,19 @@ void CheckCountOutput(OpKernelContext* context, Tensor* output_counter, for (int64 i = 0; i < N; ++i) { count_output_vec(idx_vec(i))++; } + if (num_sparse > 0) { + for (auto& it: counter_map) { + count_output_vec(it.first) += (it.second - 1); + } + } } + } template void 
ComputeInternalWithHashMap(OpKernelContext* context, const Tensor& input, - Tensor* idx, int64 axis, int64* uniq_size, int64 N, bool serial, Tensor* output) { + Tensor* idx, int64 axis, int64* uniq_size, int64 N, int num_sparse, bool serial, + google::dense_hash_map* counter_map, Tensor* output) { OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()), errors::InvalidArgument("unique expects a 1D vector.")); // TODO(dga): Make unique polymorphic for returning int32 and int64 @@ -651,10 +717,10 @@ void ComputeInternalWithHashMap(OpKernelContext* context, const Tensor& input, if (N >= kPartitionLimit && !serial) { ParallelComputeV1 - (context, input, idx, axis, uniq_size, output); + (context, input, idx, axis, uniq_size, num_sparse, counter_map, output); } else { SerialComputeV1 - (context, input, idx, axis, uniq_size, output); + (context, input, idx, axis, uniq_size, num_sparse, counter_map, output); } } @@ -662,7 +728,7 @@ template void UniqueInternal(OpKernelContext* context, const Tensor& input, Tensor* idx, Tensor* output, Tensor* output_counter, int num_outputs, int64 partition_size, bool serial, int64 axis, int64 unique_ratio_hint, - std::vector& new_sizes, UniqueMaps map_flag) { + std::vector& new_sizes, UniqueMaps map_flag, int num_sparse = 0) { typedef google::dense_hash_map DefaultHashMap; AllocatorAttributes attr; @@ -672,6 +738,7 @@ void UniqueInternal(OpKernelContext* context, const Tensor& input, TensorShape({new_sizes[1]}), idx, attr)); int64 uniq_size_out; + google::dense_hash_map counter_map; if (new_sizes[0] == 1 && new_sizes[2] == 1) { // Specialized and faster implementation when unique is run over single @@ -687,33 +754,34 @@ void UniqueInternal(OpKernelContext* context, const Tensor& input, case MULTIMAP: if (num_buckets > 1 && !serial) { MultiMapCompute> - (context, input, idx, axis, &uniq_size_out, num_buckets, unique_ratio_hint, output); + (context, input, idx, axis, &uniq_size_out, num_buckets, unique_ratio_hint, num_sparse, 
&counter_map, output); } else { SerialComputeV1 - (context, input, idx, axis, &uniq_size_out, output); + (context, input, idx, axis, &uniq_size_out, num_sparse, &counter_map, output); } break; case STL: ComputeInternalWithHashMap> - (context, input, idx, axis, &uniq_size_out, N, serial, output); + (context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, &counter_map, output); break; case ABSL: ComputeInternalWithHashMap> - (context, input, idx, axis, &uniq_size_out, N, serial, output); + (context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, &counter_map, output); break; case GOOGLE: ComputeInternalWithHashMap - (context, input, idx, axis, &uniq_size_out, N, serial, output); + (context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, &counter_map, output); break; default: ComputeInternalWithHashMap - (context, input, idx, axis, &uniq_size_out, N, serial, output); + (context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, &counter_map, output); } } else { MultipleElements(context, input, idx, output, &uniq_size_out, axis, new_sizes); } - CheckCountOutput(context, output_counter, idx, num_outputs, uniq_size_out); + CheckCountOutput(context, output, output_counter, idx, num_outputs, + uniq_size_out, num_sparse, counter_map); } template @@ -743,6 +811,20 @@ void UniqueWithAxis(OpKernelContext* context, const Tensor& input, axis, unique_ratio_hint, new_sizes, map_flag); } +template +void UniqueWithExtraCounts(OpKernelContext* context, const Tensor& input, + Tensor* idx, Tensor* output, Tensor* output_counter, int num_outputs, + int64 partition_size, bool serial, int64 unique_ratio_hint, + int num_sparse, UniqueMaps map_flag) { + int64 axis = 0; + std::vector new_sizes{1, input.NumElements(), 1}; + OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()), + errors::InvalidArgument("unique expects a 1D vector.")); + UniqueInternal(context, input, idx, output, + output_counter, num_outputs, partition_size, serial, + axis, 
unique_ratio_hint, new_sizes, map_flag, num_sparse); +} + } // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_UNIQUE_ALI_OP_UTIL_H_ diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index 27f6811fcff..306026977ef 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -1741,6 +1741,26 @@ REGISTER_OP("UniqueWithCountsV2") return Status::OK(); }); +// --------------------------------------------------- + +REGISTER_OP("UniqueWithExtraCounts") + .Input("x: T") + .Input("extra_indices: N * T") + .Input("extra_counts: N * out_idx") + .Output("y: T") + .Output("idx: out_idx") + .Output("count: out_idx") + .Attr("T: type") + .Attr("N: int >= 0") + .Attr("out_idx: {int32, int64} = DT_INT32") + .SetShapeFn([](InferenceContext* c) { + auto uniq = c->Vector(InferenceContext::kUnknownDim); + c->set_output(0, uniq); + c->set_output(1, c->input(0)); + c->set_output(2, uniq); + return Status::OK(); + }); + namespace { Status ShapeShapeFn(InferenceContext* c) { diff --git a/tensorflow/python/framework/python_op_gen_internal.cc b/tensorflow/python/framework/python_op_gen_internal.cc index 42ae4eacc77..d0370a09106 100644 --- a/tensorflow/python/framework/python_op_gen_internal.cc +++ b/tensorflow/python/framework/python_op_gen_internal.cc @@ -105,6 +105,7 @@ bool IsOpWithUnderscorePrefix(const string& s) { // TODO(annarev): reduce usage of '*' imports and remove these from the // list. 
"fused_batch_norm", "histogram_fixed_width", "stack", + "unique_with_extra_counts", "batch_norm_with_global_normalization", "clip_by_value"}); return kUnderscoreOps->count(s) > 0; } diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py index 9ec0ff74e3e..08ebcf0e8dd 100644 --- a/tensorflow/python/kernel_tests/unique_op_test.py +++ b/tensorflow/python/kernel_tests/unique_op_test.py @@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import constant_op from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.platform import test @@ -278,6 +279,73 @@ def testUniqueWithCountsAbslMap(self): def testUniqueWithCountsDenseHashMap(self): self.RunUniqueWithCountsWithDifferentMaps('GOOGLE') +class UniqueWithExtraCountsTest(test.TestCase): + + def testInt32(self): + x = np.random.randint(2, high=1000, size=700000) + extra_x = x[:5].tolist() + extra_x_tensor = [constant_op.constant(extra_x, dtypes.int64)] + extra_count = [500 for _ in range(5)] + extra_count_tensor = [constant_op.constant(extra_count, dtypes.int32)] + with self.cached_session() as sess: + y, idx, count = gen_array_ops._unique_with_extra_counts(x, extra_x_tensor, extra_count_tensor) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + if value in extra_x: + self.assertEqual(count, np.sum(x == value) + 499) + else: + self.assertEqual(count, np.sum(x == value)) + + def testInt32OutIdxInt64(self): + x = np.random.randint(2, high=1000, size=700000) + extra_x = x[:5].tolist() + extra_x_tensor = [constant_op.constant(extra_x, dtypes.int64)] + extra_count = [500 for _ in range(5)] + extra_count_tensor = 
[constant_op.constant(extra_count, dtypes.int64)] + with self.cached_session() as sess: + y, idx, count = gen_array_ops._unique_with_extra_counts(x, extra_x_tensor, extra_count_tensor) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + if value in extra_x: + self.assertEqual(count, np.sum(x == value) + 499) + else: + self.assertEqual(count, np.sum(x == value)) + + def RunUniqueWithCountsWithDifferentMaps(self, map_type): + recover_env = False + if 'DEEPREC_UNIQUE_OP_HASH_MAP' in os.environ: + recover_env = True + old_env = os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = map_type + self.testInt32() + self.testInt32OutIdxInt64() + + del os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + if recover_env: + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = old_env + + def testUniqueWithCountsMultiMap(self): + self.RunUniqueWithCountsWithDifferentMaps('MULTIMAP') + + def testUniqueWithCountsStlMap(self): + self.RunUniqueWithCountsWithDifferentMaps('STL') + + def testUniqueWithCountsAbslMap(self): + self.RunUniqueWithCountsWithDifferentMaps('ABSL') + + def testUniqueWithCountsDenseHashMap(self): + self.RunUniqueWithCountsWithDifferentMaps('GOOGLE') if __name__ == '__main__': test.main() diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index adadf3cc427..960dae9ac8c 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1627,7 +1627,6 @@ def unique_with_counts(x, out_idx=dtypes.int32, name=None): unique_with_counts.__doc__ = gen_array_ops.unique_with_counts.__doc__ - @tf_export("split") def split(value, num_or_size_splits, axis=0, num=None, name="split"): """Splits a tensor into sub tensors. 
diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index 81b315e2e43..dbf254d5f14 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -19,6 +19,7 @@ from tensorflow.core.framework import attr_value_pb2 from tensorflow.python.framework import ops from tensorflow.python.framework import test_util +from tensorflow.python.framework import constant_op from tensorflow.python.ops import string_ops from tensorflow.python.ops.check_ops import assert_equal from tensorflow.python.platform import googletest @@ -2871,6 +2872,39 @@ def testCountsTensor(self): value = checkpoint_utils.load_variable(ckpt_path, name) self.assertAllEqual(value, [3, 3, 1, 3, 2]) + def testCountsWithSparseAndDenseTensor(self): + os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + sp1 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + ids = constant_op.constant([3,3,3,4,4,1], dtype=dtypes.int64) + emb1 = embedding_ops.embedding_lookup_sparse(var, sp1, None) + emb2 = embedding_ops.embedding_lookup(var, ids) + emb = emb1 + emb2 + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + for name, shape in 
checkpoint_utils.list_variables(ckpt_path): + if name == "var_1-freqs": + value = checkpoint_utils.load_variable(ckpt_path, name) + self.assertAllEqual(value, [3, 3, 1, 3, 2]) + def testCountsTensorWithGradientDescent(self): os.environ["TF_RECORD_FREQ"] = "1" checkpoint_directory = self.get_temp_dir() @@ -2908,6 +2942,41 @@ def testCountsTensorWithGradientDescent(self): self.assertAllEqual(value, [3, 3, 1, 3, 2]) del os.environ["TF_RECORD_FREQ"] + + def testCountsDenseAndSparseTensorWithGradientDescent(self): + os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + sp1 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + ids = constant_op.constant([3,3,3,4,4,1], dtype=dtypes.int64) + emb1 = embedding_ops.embedding_lookup_sparse(var, sp1, None) + emb2 = embedding_ops.embedding_lookup(var, ids) + emb = emb1 + emb2 + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = gradient_descent.GradientDescentOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + for name, shape in checkpoint_utils.list_variables(ckpt_path): + if name == "var_1-freqs": + value = checkpoint_utils.load_variable(ckpt_path, name) + self.assertAllEqual(value, [3, 3, 1, 3, 2]) + + del os.environ["TF_RECORD_FREQ"] if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/training/gradient_descent.py 
b/tensorflow/python/training/gradient_descent.py index 799e3c5f5bd..bd16892c1c8 100644 --- a/tensorflow/python/training/gradient_descent.py +++ b/tensorflow/python/training/gradient_descent.py @@ -19,9 +19,12 @@ from __future__ import print_function from tensorflow.python.framework import ops +from tensorflow.python.framework import dtypes from tensorflow.python.ops import gen_hash_training_ops from tensorflow.python.ops import kv_variable_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops @@ -72,22 +75,28 @@ def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): if isinstance(handle, kv_variable_ops.EmbeddingVariable): global_step = training_util.get_or_create_global_step() if handle.need_counts() and len(handle._counts_tensor.keys()) != 0: + extra_counts, extra_indices = [], [] if indices.op.type == "ConcatV2": - total_counts = [] for tensor in indices.op.inputs: if tensor.op.type == "Reshape": indices_tensor = tensor.op.inputs[0] - total_counts.append(handle._counts_tensor[indices_tensor]) - from tensorflow.python.ops import array_ops - counts_tensor = array_ops.concat(total_counts, 0) + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) elif indices.op.type == "Reshape": indices_tensor = indices.op.inputs[0] - counts_tensor = handle._counts_tensor[indices_tensor] + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) + unique_indices, new_index_positions, indices_counts = \ + gen_array_ops._unique_with_extra_counts(indices, extra_indices, extra_counts) + summed_grads = math_ops.unsorted_segment_sum( + grad, 
new_index_positions, array_ops.shape(unique_indices)[0]) return training_ops.kv_resource_sparse_apply_gradient_descent_with_counts( handle.handle, math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype), - grad, indices, global_step, - counts_tensor, use_locking=self._use_locking) + summed_grads, unique_indices, global_step, + indices_counts, use_locking=self._use_locking) else: return training_ops.kv_resource_sparse_apply_gradient_descent( handle.handle, math_ops.cast(self._learning_rate_tensor, diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 7523604ccf9..95383a9d962 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -34,6 +34,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import smart_cond from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradients from tensorflow.python.ops import gen_io_ops @@ -93,16 +94,14 @@ def _deduplicate_indexed_slices_with_counts(values, indices): array_ops.shape(unique_indices)[0]) return (summed_values, unique_indices, indices_counts) -def _deduplicate_indexed_slices_with_counts_reduction(values, indices, counts): +def _deduplicate_indexed_slices_with_counts_reduction(values, indices, extra_counts, extra_indices): """Sums `values` associated with any non-unique `indices` and return counts of each count in `values`.""" - unique_indices, new_index_positions = array_ops.unique(indices) + unique_indices, new_index_positions, summed_counts = \ + gen_array_ops._unique_with_extra_counts(indices, extra_indices, extra_counts) summed_values = math_ops.unsorted_segment_sum( values, new_index_positions, array_ops.shape(unique_indices)[0]) - summed_counts = math_ops.unsorted_segment_sum( - counts, new_index_positions, - array_ops.shape(unique_indices)[0]) return (summed_values, 
unique_indices, summed_counts) def _var_key(var): @@ -1105,19 +1104,22 @@ def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): _deduplicate_indexed_slices_with_counts( values=grad, indices=indices) else: + extra_counts, extra_indices = [], [] if indices.op.type == "ConcatV2": - total_counts = [] for tensor in indices.op.inputs: if tensor.op.type == "Reshape": indices_tensor = tensor.op.inputs[0] - total_counts.append(handle._counts_tensor[indices_tensor]) - counts_tensor = array_ops.concat(total_counts, 0) + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) elif indices.op.type == "Reshape": indices_tensor = indices.op.inputs[0] - counts_tensor = handle._counts_tensor[indices_tensor] + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) summed_grad, unique_indices, indices_counts = \ _deduplicate_indexed_slices_with_counts_reduction( - grad, indices, counts_tensor) + grad, indices, extra_counts, extra_indices) return self._resource_apply_sparse( summed_grad, handle, unique_indices, indices_counts) else: From 70b32df83f0e7928d8894773fe2d5cf247ccf3d4 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Tue, 20 Feb 2024 19:20:27 +0800 Subject: [PATCH 29/45] [BUILD] Add build SDK package. 
(#972) Signed-off-by: candy.dc --- tensorflow/tools/sdk_package/BUILD | 43 ++++++ tensorflow/tools/sdk_package/README.md | 41 ++++++ .../tools/sdk_package/build_sdk_package.sh | 136 ++++++++++++++++++ 3 files changed, 220 insertions(+) create mode 100644 tensorflow/tools/sdk_package/BUILD create mode 100644 tensorflow/tools/sdk_package/README.md create mode 100755 tensorflow/tools/sdk_package/build_sdk_package.sh diff --git a/tensorflow/tools/sdk_package/BUILD b/tensorflow/tools/sdk_package/BUILD new file mode 100644 index 00000000000..b3dca82b9e3 --- /dev/null +++ b/tensorflow/tools/sdk_package/BUILD @@ -0,0 +1,43 @@ +# Description: +# TensorFlow is a computational framework, primarily for use in machine +# learning applications. +# +# Public targets: +# +# ":sdk_package" - Package the tensorflow dynamic library and necessry +# headers for developing. The script should be executed manually +# after 'bazel build'. + +package(default_visibility = ["//visibility:public"]) + +load("//tensorflow:tensorflow.bzl", "transitive_hdrs", "tf_binary_additional_srcs") +load("//tensorflow/core/platform:default/build_config_root.bzl", + "tf_additional_plugin_deps") + +transitive_hdrs( + name = "sdk_headers", + deps = [ + # Need to check definition of //tensorflow:libtensorflow_cc.so + # for updates. 
+ "//tensorflow/c:c_api", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:client_session", + "//tensorflow/cc:scope", + "//tensorflow/cc/saved_model:loader", + "//tensorflow/cc/saved_model:signature_constants", + "//tensorflow/cc/saved_model:tag_constants", + "//tensorflow/contrib/session_bundle:bundle_shim", + ] + tf_additional_plugin_deps(), + tags = ["manual"], +) + +sh_binary( + name = "build_sdk_package", + srcs = ["build_sdk_package.sh"], + data = [ + ":sdk_headers", + "@com_google_protobuf//:protoc", + "//tensorflow:libtensorflow_cc.so", + ] + tf_binary_additional_srcs(), + tags = ["manual"], +) diff --git a/tensorflow/tools/sdk_package/README.md b/tensorflow/tools/sdk_package/README.md new file mode 100644 index 00000000000..8dbac7bed92 --- /dev/null +++ b/tensorflow/tools/sdk_package/README.md @@ -0,0 +1,41 @@ +Bazel rules and bash scripts to package the DeepRec C/C++ APIs and +runtime library into '\/tensorflow_sdk.tar.gz' archive. + +## SDK Build + +First of all, edit and run the configurating script **'./configure'** under +DeeRec root directory (supposed '\'). + +Then simply run the following commands under '\' to build +the DeepRec SDK package: + +```sh +./build sdk +``` +_This command will put the SDK package named 'tensorflow\_sdk.tar.gz' into +the directory below:_ +> /built/sdk/[gpu|cpu] + +## SDK usage: + +To make use of DeepRec runtime SDK for C++ codes writting with original APIs +defined in TensorFlow, just decompress the SDK package into another work +directory (supposed '\') with the command at first: + +```sh +tar xzvf -C tensorflow_sdk.tar.gz +``` + +Then a directory named 'sdk' will be placed into the \, which +contains necessary header files in the 'include' sub-directory, keeping the +original hierarchy in TensorFlow, and the 'libtensorflow_cc.so' dynamic +runtime library in the 'lib' sub-directoy to support TensorFlow running. 
+ +Just append **'-I\/sdk/include'** to compiling arguments and +**'-L\/sdk/lib'** -ltensorflow_cc to linking arguments, in the +cases of building a project, that contains codes using original TensorFlow +C++ APIs, together with DeepRec SDK. + +Finally, to successfully run the binary building with DeepRec SDK, do not +forget to append '\/sdk/lib' to **'LD_LIBRARY_PATH'** environment +variable. diff --git a/tensorflow/tools/sdk_package/build_sdk_package.sh b/tensorflow/tools/sdk_package/build_sdk_package.sh new file mode 100755 index 00000000000..89b7d8e9195 --- /dev/null +++ b/tensorflow/tools/sdk_package/build_sdk_package.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +# Copyright 2024 The DeepRec Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# This script is used for packaging TensorFlow SDK files into a tarball. +# The processing flow took 'tensorflow/tools/pip_package/build_pip_package.sh' +# as the reference. 
+ +set -e + +PLATFORM="$(uname -s | tr 'A-Z' 'a-z')" +function is_windows() { + # On windows, the shell script is actually running in msys + if [[ "${PLATFORM}" =~ msys_nt* ]]; then + true + else + false + fi +} + +function main() { + if [ $# -lt 1 ] ; then + echo "No destination dir provided" + exit 1 + fi + + DEST=$1 + TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX) + mkdir -p "${TMPDIR}/sdk/bin" + mkdir -p "${TMPDIR}/sdk/include" + mkdir -p "${TMPDIR}/sdk/lib" + + echo $(date) : "=== Using tmpdir: ${TMPDIR}" + + if [ ! -d bazel-bin/tensorflow ]; then + echo "Could not find bazel-bin. Did you run from the root of the build tree?" + exit 1 + fi + + if is_windows; then + echo "Windows version TensorFlow SDK not supported..." + elif [ ! -d bazel-bin/tensorflow/tools/sdk_package/build_sdk_package.runfiles/org_tensorflow ]; then + # Really old (0.2.1-) runfiles, without workspace name. + echo "TensorFlow SDK does not support such old verions..." + else + RUNFILES=bazel-bin/tensorflow/tools/sdk_package/build_sdk_package.runfiles/org_tensorflow + if [ -d ${RUNFILES}/external ]; then + # Old-style runfiles structure (--legacy_external_runfiles). + cp -RL ${RUNFILES}/tensorflow "${TMPDIR}/sdk/include" + # Check LLVM headers for XLA support. + if [ -d ${RUNFILES}/external/llvm_archive ]; then + # Old-style runfiles structure (--legacy_external_runfiles). + mkdir -p ${TMPDIR}/sdk/include/external/llvm/include + cp -RL ${RUNFILES}/external/llvm_archive/include/llvm \ + "${TMPDIR}/sdk/include/external/llvm/include" + pushd ${TMPDIR}/sdk/include + ln -s external/llvm/include/llvm llvm + popd + fi + # Copy MKL libs over so they can be loaded at runtime + so_lib_dir=$(ls $RUNFILES | grep solib) || true + if [ -n "${so_lib_dir}" ]; then + mkl_so_dir=$(ls ${RUNFILES}/${so_lib_dir} | grep mkl) || true + if [ -n "${mkl_so_dir}" ]; then + cp -L ${RUNFILES}/${so_lib_dir}/${mkl_so_dir}/*.so "${TMPDIR}/sdk/lib" + fi + fi + else + # New-style runfiles structure (--nolegacy_external_runfiles). 
+ cp -RL ${RUNFILES}/tensorflow "${TMPDIR}/sdk/include" + # Check LLVM headers for XLA support. + if [ -d bazel-bin/tensorflow/tools/sdk_package/build_sdk_package.runfiles/llvm_archive ]; then + cp -RL \ + bazel-bin/tensorflow/tools/sdk_package/build_sdk_package.runfiles/llvm_archive/include/llvm \ + "${TMPDIR}/sdk/include" + fi + # Copy MKL libs over so they can be loaded at runtime + so_lib_dir=$(ls $RUNFILES | grep solib) || true + if [ -n "${so_lib_dir}" ]; then + mkl_so_dir=$(ls ${RUNFILES}/${so_lib_dir} | grep mkl) || true + if [ -n "${mkl_so_dir}" ]; then + cp -L ${RUNFILES}/${so_lib_dir}/${mkl_so_dir}/*.so "${TMPDIR}/sdk/lib" + fi + fi + fi + fi + + # move and strip the dynamic library file for packaging. + # at default the .so file was not writable for the owner, + # so using a 'chmod +w' to perform the strip command. + chmod +w ${TMPDIR}/sdk/include/tensorflow/libtensorflow_cc.so + chmod +w ${TMPDIR}/sdk/include/tensorflow/libtensorflow_framework.so.1 + strip ${TMPDIR}/sdk/include/tensorflow/libtensorflow_cc.so + strip ${TMPDIR}/sdk/include/tensorflow/libtensorflow_framework.so.1 + mv ${TMPDIR}/sdk/include/tensorflow/libtensorflow_*.so* ${TMPDIR}/sdk/lib + + # third party packages doesn't ship with header files. Copy the headers + # over so user defined ops can be compiled. + mkdir -p ${TMPDIR}/sdk/include/google + mkdir -p ${TMPDIR}/sdk/include/third_party + pushd ${RUNFILES%org_tensorflow}/com_google_protobuf/src/google + for header in $(find protobuf -name \*.h); do + mkdir -p "${TMPDIR}/sdk/include/google/$(dirname ${header})" + cp -L "$header" "${TMPDIR}/sdk/include/google/$(dirname ${header})/" + done + popd + cp -RL $RUNFILES/third_party/eigen3 ${TMPDIR}/sdk/include/third_party + cp -RL ${RUNFILES%org_tensorflow}/eigen_archive/* ${TMPDIR}/sdk/include/ + cp -RL ${RUNFILES%org_tensorflow}/nsync/public/* ${TMPDIR}/sdk/include + cp -L ${RUNFILES%org_tensorflow}/com_google_protobuf/protoc ${TMPDIR}/sdk/bin + + # package all files into the target file. 
+ pushd ${TMPDIR} + rm -f MANIFEST + echo $(date) : "=== Building sdk package" + tar czvf tensorflow_sdk.tar.gz sdk/ 1> /dev/null + popd + mkdir -p ${DEST} + mv ${TMPDIR}/tensorflow_sdk.tar.gz ${DEST} + rm -rf ${TMPDIR} + echo $(date) : "=== Output sdk package file is: ${DEST}/tensorflow_sdk.tar.gz" +} + +main "$@" From eb5f30db53ee41179a61a83c6ec9b54111c0257a Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Thu, 22 Feb 2024 15:08:16 +0800 Subject: [PATCH 30/45] [Embedding] Log error when EV has been initialized in EV Import OP. (#971) Signed-off-by: chenbangduo.cbd --- tensorflow/core/kernels/kv_variable_restore_ops.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index 3b10c2521b9..2eccf485ef8 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -373,6 +373,12 @@ class KvResourceImportV3Op: public AsyncOpKernel { core::ScopedUnref unref_me(ev); + // EV should not be initialized at this time. + if (ev->IsInitialized()) { + LOG(ERROR) << "Import parameter for EV (" << name_string + << ") failed, this EV has already been initialized."; + } + auto do_compute = [this, context, file_name_string, ev, name_string, done] () { BundleReader reader(Env::Default(), file_name_string); From 9a54aae7d5062330f4055c73401183b57650c7d2 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 28 Feb 2024 10:54:34 +0800 Subject: [PATCH 31/45] [Release] Update DeepRec release version to 1.15.5+deeprec2402. (#974) Signed-off-by: candy.dc --- tensorflow/tools/pip_package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index e8635e1a298..10132cab678 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -47,7 +47,7 @@ # result for pip. 
# Also update tensorflow/tensorflow.bzl and # tensorflow/core/public/version.h -_VERSION = '1.15.5+deeprec2310' +_VERSION = '1.15.5+deeprec2402' REQUIRED_PACKAGES = [ 'absl-py >= 0.9.0', From 8d4024406210dbcb0a99cc036606efcfa3671c3a Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 28 Feb 2024 16:57:59 +0800 Subject: [PATCH 32/45] [Docs] Update deeprec2402 release images and notes in README.md & RELEASE.md. (#975) Signed-off-by: candy.dc --- README.md | 4 +- RELEASE.md | 44 +++++++++++++++++++ docs/docs_en/DeepRec-Compile-And-Install.md | 4 +- docs/docs_en/Estimator-Compile-And-Install.md | 2 +- docs/docs_en/TFServing-Compile-And-Install.md | 2 +- docs/docs_zh/DeepRec-Compile-And-Install.md | 4 +- docs/docs_zh/Estimator-Compile-And-Install.md | 2 +- docs/docs_zh/TFServing-Compile-And-Install.md | 2 +- 8 files changed, 54 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 8f491e14665..b7d7b578c24 100644 --- a/README.md +++ b/README.md @@ -95,13 +95,13 @@ $ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux #### Image for CPU ``` -alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-cpu-py38-ubuntu20.04 ``` #### Image for GPU CUDA11.6 ``` -alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-gpu-py38-cu116-ubuntu20.04 ``` *** diff --git a/RELEASE.md b/RELEASE.md index 6b7e4a7fd79..b095351d2a0 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,4 +1,48 @@ +# Release r1.15.5-deeprec2402 + +## **Major Features and Improvements** + +### **Embedding** + +- Refine KVInterface::GetShardedSnapshot API. +- Undefine EV GPU interface in CPU compile. +- Make Embedding backward compatible with previous saved_model. +- Log error when EV has been initialized in EV Import OP. + +### **Op Implement** + +- Implement of SliceSend/SliceRecv Op. +- Implement FileSliceSend/FileSliceRecvOp. + +### **SDK** + +- Add build SDK package. 
+ +### **BugFix** + +- Fix shared embedding frequency counting problem. +- Fix Graph contains EmbeddingVariable compiling issue. +- Fix a scheduling issue. +- Fix tensor shape meta-data bug for DataFrame Value. + +### **ModelZoo** + +- Set Saver parameter sharded=True in distributed training. + +More details of features: [https://deeprec.readthedocs.io/zh/latest/](url) + +## **Release Images** + +### **CPU Image** + +`alideeprec/deeprec-release:deeprec2402-cpu-py38-ubuntu20.04` + +### **GPU Image** + +`alideeprec/deeprec-release:deeprec2402-gpu-py38-cu116-ubuntu20.04` + # Release r1.15.5-deeprec2310 + ## **Major Features and Improvements** ### **Embedding** diff --git a/docs/docs_en/DeepRec-Compile-And-Install.md b/docs/docs_en/DeepRec-Compile-And-Install.md index fdf3e295fdd..379526e5b24 100644 --- a/docs/docs_en/DeepRec-Compile-And-Install.md +++ b/docs/docs_en/DeepRec-Compile-And-Install.md @@ -111,7 +111,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-cpu-py38-ubuntu20.04 ``` arm64: @@ -122,5 +122,5 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU Image with CUDA 11.6** ``` -alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-gpu-py38-cu116-ubuntu20.04 ``` diff --git a/docs/docs_en/Estimator-Compile-And-Install.md b/docs/docs_en/Estimator-Compile-And-Install.md index 55f759a3c2a..6305d739571 100644 --- a/docs/docs_en/Estimator-Compile-And-Install.md +++ b/docs/docs_en/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ DeepRec provide new distributed protocols such as grpc++ and star_server, which Source Code: [https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -Develop Branch:master, Latest Release Branch: deeprec2310 +Develop Branch:master, Latest Release Branch: deeprec2402 ## Estimator Build diff 
--git a/docs/docs_en/TFServing-Compile-And-Install.md b/docs/docs_en/TFServing-Compile-And-Install.md index 79a0944aa3e..ea70f397c98 100644 --- a/docs/docs_en/TFServing-Compile-And-Install.md +++ b/docs/docs_en/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ We provide optimized TFServing which could highly improve performance in inferen Source Code: [https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -Develop Branch: master, Latest Release Branch: deeprec2310 +Develop Branch: master, Latest Release Branch: deeprec2402 ## TFServing Build diff --git a/docs/docs_zh/DeepRec-Compile-And-Install.md b/docs/docs_zh/DeepRec-Compile-And-Install.md index ad8fd36dbf7..0c11dca394f 100644 --- a/docs/docs_zh/DeepRec-Compile-And-Install.md +++ b/docs/docs_zh/DeepRec-Compile-And-Install.md @@ -108,7 +108,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-cpu-py38-ubuntu20.04 ``` arm64: @@ -119,7 +119,7 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU CUDA11.6镜像** ``` -alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-gpu-py38-cu116-ubuntu20.04 ``` ## DeepRec Processor编译打包 diff --git a/docs/docs_zh/Estimator-Compile-And-Install.md b/docs/docs_zh/Estimator-Compile-And-Install.md index e54c8ddbd2f..eeb4f66dc99 100644 --- a/docs/docs_zh/Estimator-Compile-And-Install.md +++ b/docs/docs_zh/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ 代码库:[https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -开发分支:master,最新Release分支:deeprec2310 +开发分支:master,最新Release分支:deeprec2402 ## Estimator编译 diff --git a/docs/docs_zh/TFServing-Compile-And-Install.md b/docs/docs_zh/TFServing-Compile-And-Install.md index a43d2d517a6..b0460934165 100644 --- a/docs/docs_zh/TFServing-Compile-And-Install.md +++ 
b/docs/docs_zh/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ 代码库:[https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -开发分支:master,最新Release分支:deeprec2310 +开发分支:master,最新Release分支:deeprec2402 ## TFServing编译&打包 From 8b58f9b93e144fa2d6517d5d370dc0df4fd3644b Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 28 Feb 2024 17:18:29 +0800 Subject: [PATCH 33/45] [Dockerfile] Add DeepRec release image dockerfile. (#976) Signed-off-by: candy.dc --- cibuild/dockerfiles/Dockerfile.release | 32 ++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 cibuild/dockerfiles/Dockerfile.release diff --git a/cibuild/dockerfiles/Dockerfile.release b/cibuild/dockerfiles/Dockerfile.release new file mode 100644 index 00000000000..77b013f840d --- /dev/null +++ b/cibuild/dockerfiles/Dockerfile.release @@ -0,0 +1,32 @@ +# build DeepRec & estimator wheel +FROM alideeprec/deeprec-base:deeprec-base-cpu-py38-ubuntu20.04 AS deeprec_build + +ARG TF_COMMIT=deeprec2402 + +RUN mkdir -p /src +RUN wget -nv -O /src/install_bazel.sh \ + http://pythonrun.oss-cn-zhangjiakou.aliyuncs.com/bazel-0.26.1-installer-linux-x86_64.sh && \ + bash /src/install_bazel.sh + +RUN git clone https://github.com/DeepRec-AI/DeepRec.git /src/DeepRec && \ + cd /src/DeepRec && \ + git checkout ${TF_COMMIT} +RUN cd /src/DeepRec && \ + yes "" | bash ./configure || true && \ + bazel build -c opt --config=opt //tensorflow/tools/pip_package:build_pip_package && \ + bazel-bin/tensorflow/tools/pip_package/build_pip_package /src/ + +RUN pip install /src/tensorflow-1.15.5+${TF_COMMIT}-cp38-cp38-linux_x86_64.whl + +RUN git clone https://github.com/DeepRec-AI/estimator.git /src/estimator && \ + cd /src/estimator && \ + git checkout ${TF_COMMIT} +RUN cd /src/estimator && \ + bazel build //tensorflow_estimator/tools/pip_package:build_pip_package && \ + bazel-bin/tensorflow_estimator/tools/pip_package/build_pip_package /src/ + +# build DeeepRec release image +FROM 
alideeprec/deeprec-base:deeprec-base-cpu-py38-ubuntu20.04 +COPY --from=deeprec_build /src/*.whl / +RUN pip install /tensorflow-1.15.5+${TF_COMMIT}-cp38-cp38-linux_x86_64.whl tensorflow_estimator-1.15.2+${TF_COMMIT}-py2.py3-none-any.whl +RUN rm -f /tensorflow-1.15.5+${TF_COMMIT}-cp38-cp38-linux_x86_64.whl /tensorflow_estimator-1.15.2+${TF_COMMIT}-py2.py3-none-any.whl From 186afd0479bb43c629cafa808be70b7f5ac33d83 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Thu, 29 Feb 2024 10:10:38 +0800 Subject: [PATCH 34/45] [Serving] Fix syntax error in generate timeline tool. (#977) Signed-off-by: candy.dc --- serving/tools/timeline/gen_timeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/serving/tools/timeline/gen_timeline.py b/serving/tools/timeline/gen_timeline.py index f055e473fa0..d56c1b39897 100644 --- a/serving/tools/timeline/gen_timeline.py +++ b/serving/tools/timeline/gen_timeline.py @@ -1,6 +1,6 @@ import sys -import config_pb2 -import timeline +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.client import timeline def gen_timeline(src_name, dest_name): run_metadata = config_pb2.RunMetadata() From 6dae552cb40e954cce59e125977f141c6a926ada Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Thu, 7 Mar 2024 14:35:36 +0800 Subject: [PATCH 35/45] [Embedding] Refine header file of embedding variable. (#978) Signed-off-by: chenbangduo.cbd --- tensorflow/core/framework/embedding/embedding_var.h | 1 - tensorflow/core/kernels/kv_variable_ops.cc | 1 + tensorflow/core/kernels/kv_variable_restore_ops.cc | 1 + tensorflow/core/kernels/training_ali_ops.cc | 8 ++++---- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index c0d26a2f4d8..81941bc9ff9 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -34,7 +34,6 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/gpu_hash_map_kv.h" #include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/storage.h" -#include "tensorflow/core/framework/embedding/storage_factory.h" #include "tensorflow/core/framework/typed_allocator.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/kv_variable_ops.cc b/tensorflow/core/kernels/kv_variable_ops.cc index 5cd0ef140bd..b7567ffe924 100644 --- a/tensorflow/core/kernels/kv_variable_ops.cc +++ b/tensorflow/core/kernels/kv_variable_ops.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/config.pb.h" #include "tensorflow/core/framework/embedding/embedding_var.h" +#include "tensorflow/core/framework/embedding/storage_factory.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/resource_mgr.h" diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index 2eccf485ef8..e16db9b4cd6 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/config.pb.h" #include "tensorflow/core/framework/embedding/embedding_var.h" +#include "tensorflow/core/framework/embedding/storage_factory.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/resource_mgr.h" diff --git a/tensorflow/core/kernels/training_ali_ops.cc b/tensorflow/core/kernels/training_ali_ops.cc index 546b30e29dd..fc21ab610cf 100644 --- a/tensorflow/core/kernels/training_ali_ops.cc +++ b/tensorflow/core/kernels/training_ali_ops.cc @@ -236,7 +236,7 @@ class KvSparseApplyAdagradGPUOp : public OpKernel { T** dev_a = dev_v + task_size; CHECK(dev_a); CHECK(dev_v); - DeviceMemoryBase dev_v_ptr(dev_v, sizeof(T*) * task_size * 2); + se::DeviceMemoryBase dev_v_ptr(dev_v, sizeof(T*) * task_size * 2); stream->ThenMemcpy(&dev_v_ptr, v, sizeof(T*) * task_size * 2); int block_size = 128; @@ -1606,7 +1606,7 @@ class KvSparseApplyAdamGPUOp : public OpKernel { CHECK(dev_m_ptr); CHECK(dev_v_ptr); - DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 3); int block_size = 128; @@ -2579,7 +2579,7 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { CHECK(dev_m_ptr); CHECK(dev_v_ptr); - DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 3); int block_size = 128; @@ -3236,7 +3236,7 @@ class KvSparseApplyAdamWGPUOp : public OpKernel { CHECK(dev_m_ptr); CHECK(dev_v_ptr); - DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 3); int 
block_size = 128; From cf16856d01551c9d1cb005722d7f62a448df7095 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Tue, 26 Mar 2024 17:15:18 +0800 Subject: [PATCH 36/45] [Incremental Checkpoint] Fix import incremental embedding variable. (#983) Signed-off-by: chenbangduo.cbd --- .../embedding/embedding_var_restore.cc | 50 +++++++++-------- tensorflow/python/training/incr_ckpt_test.py | 54 +++++++++++++++++++ 2 files changed, 82 insertions(+), 22 deletions(-) diff --git a/tensorflow/core/framework/embedding/embedding_var_restore.cc b/tensorflow/core/framework/embedding/embedding_var_restore.cc index 11c13008995..6ff07bf7e43 100644 --- a/tensorflow/core/framework/embedding/embedding_var_restore.cc +++ b/tensorflow/core/framework/embedding/embedding_var_restore.cc @@ -102,45 +102,48 @@ void CheckpointLoader::RestoreInternal( Tensor part_filter_offset_tensor; if (!restore_args_.m_is_oldform) { /****** InitPartOffsetTensor ******/ - TensorShape part_offset_shape, part_filter_offset_shape; - DataType part_offset_type, part_filter_offset_type; + TensorShape part_offset_shape; + DataType part_offset_type; string offset_tensor_name; if (!restore_args_.m_is_incr) { offset_tensor_name = name_string + kPartOffsetTensorSuffsix; } else { offset_tensor_name = name_string + kIncrPartOffsetTensorSuffsix; } - - string offset_filter_tensor_name = - name_string + kPartFilterOffsetTensorSuffsix; + Status s = reader_->LookupDtypeAndShape( offset_tensor_name, &part_offset_type, &part_offset_shape); if (!s.ok()) { LOG(ERROR) << "EV restoring fail:" << s.error_message(); } - s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, - &part_filter_offset_type, - &part_filter_offset_shape); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail: " << s.error_message(); - } part_offset_tensor = Tensor(cpu_allocator(), part_offset_type, part_offset_shape); - part_filter_offset_tensor = Tensor( - cpu_allocator(), part_filter_offset_type, part_filter_offset_shape); s = 
reader_->Lookup(offset_tensor_name, &part_offset_tensor); if (!s.ok()) { LOG(ERROR) << "EV restoring fail:" << s.error_message(); } - s = reader_->Lookup(offset_filter_tensor_name, - &part_filter_offset_tensor); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail: " << s.error_message(); + if (restore_args_.m_has_filter) { + TensorShape part_filter_offset_shape; + DataType part_filter_offset_type; + string offset_filter_tensor_name = + name_string + kPartFilterOffsetTensorSuffsix; + s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, + &part_filter_offset_type, + &part_filter_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } + part_filter_offset_tensor = \ + Tensor(cpu_allocator(), part_filter_offset_type, + part_filter_offset_shape); + s = reader_->Lookup(offset_filter_tensor_name, + &part_filter_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } } } - auto part_offset_flat = part_offset_tensor.flat(); - auto part_filter_offset_flat = part_filter_offset_tensor.flat(); if (restore_args_.m_is_oldform) { VLOG(1) << "old form, EV name:" << name_string @@ -164,6 +167,7 @@ void CheckpointLoader::RestoreInternal( VLOG(1) << "new form checkpoint... 
:" << name_string << " , partition_id:" << restore_args_.m_partition_id << " , partition_num:" << restore_args_.m_partition_num; + auto part_offset_flat = part_offset_tensor.flat(); for (size_t i = 0; i < restore_args_.m_loaded_parts.size(); i++) { int subpart_id = restore_args_.m_loaded_parts[i]; size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; @@ -183,6 +187,7 @@ void CheckpointLoader::RestoreInternal( new_dim, emb_config, device); if (restore_args_.m_has_filter) { + auto part_filter_offset_flat = part_filter_offset_tensor.flat(); Status s = EVRestoreFilteredFeatures( subpart_id, new_dim, restore_buff, part_filter_offset_flat, emb_config, device); @@ -444,7 +449,7 @@ Status CheckpointLoader::EVInitTensorNameAndShape( } st = reader_->LookupHeader(restore_args_.m_tensor_version + "_filtered", sizeof(K) * version_filter_shape.dim_size(0)); - if (!st.ok()) { + if (!st.ok() && st.code() != error::NOT_FOUND) { return st; } st = reader_->LookupTensorShape(restore_args_.m_tensor_freq + "_filtered", @@ -463,7 +468,8 @@ Status CheckpointLoader::EVInitTensorNameAndShape( return st; } } - return st; + + return Status::OK(); } #define REGISTER_KERNELS(ktype, vtype) \ template Status CheckpointLoader::EVInitTensorNameAndShape(\ @@ -644,4 +650,4 @@ TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) #undef REGISTER_KERNELS_ALL_INDEX #undef REGISTER_KERNELS -}// namespace tensorflow \ No newline at end of file +}// namespace tensorflow diff --git a/tensorflow/python/training/incr_ckpt_test.py b/tensorflow/python/training/incr_ckpt_test.py index b4f7ded3cea..55cf748a9d6 100644 --- a/tensorflow/python/training/incr_ckpt_test.py +++ b/tensorflow/python/training/incr_ckpt_test.py @@ -451,5 +451,59 @@ def testIncrementalSaverForResourceVariable(self): saver.build() incr_saver = incr_saver_module._get_incremental_saver(True, saver) + def testIncrementalSaverSaveAndRestore(self): + tmp_path = self.get_temp_dir() + full_ckpt_dir = os.path.join(tmp_path, "model.ckpt") + 
incr_ckpt_dir = os.path.join(tmp_path, "incr.ckpt") + full_ckpt_path = None + incr_ckpt_path = None + + # construct graph + emb_var = variable_scope.get_embedding_variable("emb", embedding_dim=3, + initializer = init_ops.ones_initializer(dtypes.float32)) + emb = embedding_ops.embedding_lookup(emb_var, + math_ops.cast([0, 1, 2, 3, 4], dtypes.int64)) + loss = math_ops.reduce_sum(emb, name = 'reduce_sum') + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + saver = saver_module.Saver(sharded=True, incremental_save_restore=True) + incr_saver = \ + incr_saver_module.IncrementalSaver(sharded=True, + saver_def=saver.saver_def, defer_build=True) + incr_saver.build(saver._builder.filename_tensor) + + # generate full ckpt and incr ckpt. + full_ckpt_value=None + incr_ckpt_value=None + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + sess.run([train_op]) + full_ckpt_path = saver.save(sess, full_ckpt_dir, global_step = 10) + full_ckpt_value = sess.run([emb]) + print("full_ckpt: {}".format(full_ckpt_value)) + sess.run([train_op]) + incr_ckpt_path = \ + incr_saver.incremental_save(sess, incr_ckpt_dir, global_step=20) + incr_ckpt_value = sess.run([emb]) + print("incr_ckpt: {}".format(incr_ckpt_value)) + + # check the value after restoring parameter. 
+ with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + saver.restore(sess, full_ckpt_path) + restore_full_ckpt_value = sess.run([emb]) + print("restore_full_ckpt: {}".format(restore_full_ckpt_value)) + incr_saver.incremental_restore(sess, full_ckpt_path, incr_ckpt_path) + restore_incr_ckpt_value = sess.run([emb]) + print("restore_incr_ckpt: {}".format(restore_incr_ckpt_value)) + self.assertAllClose(full_ckpt_value, restore_full_ckpt_value) + self.assertAllClose(incr_ckpt_value, restore_incr_ckpt_value) + if __name__ == "__main__": googletest.main() From d5f7f6ad77a59b70679835009dbe31add175dba3 Mon Sep 17 00:00:00 2001 From: "Secret.Sun" Date: Wed, 10 Apr 2024 14:41:50 +0800 Subject: [PATCH 37/45] [Runtime] Remove read limit of ReadBinaryProto. (#981) Signed-off-by: Secret.Sun --- tensorflow/core/platform/env.cc | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc index ac91b79a07f..b835677627a 100644 --- a/tensorflow/core/platform/env.cc +++ b/tensorflow/core/platform/env.cc @@ -508,14 +508,7 @@ Status ReadBinaryProto(Env* env, const string& fname, TF_RETURN_IF_ERROR(env->NewRandomAccessFile(fname, &file)); std::unique_ptr stream(new FileStream(file.get())); - // TODO(jiayq): the following coded stream is for debugging purposes to allow - // one to parse arbitrarily large messages for MessageLite. One most likely - // doesn't want to put protobufs larger than 64MB on Android, so we should - // eventually remove this and quit loud when a large protobuf is passed in. ::tensorflow::protobuf::io::CodedInputStream coded_stream(stream.get()); - // Total bytes hard limit / warning limit are set to 1GB and 512MB - // respectively. 
- coded_stream.SetTotalBytesLimit(1024LL << 20, 512LL << 20); if (!proto->ParseFromCodedStream(&coded_stream) || !coded_stream.ConsumedEntireMessage()) { From a4489e31a4b9bc8371198537a0a15af6011ef8ae Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Fri, 12 Apr 2024 14:22:32 +0800 Subject: [PATCH 38/45] [EVAllocator] Fix the bug in configuring ARENA_ARRAY_SIZE. (#986) Signed-off-by: chenbangduo.cbd --- tensorflow/core/framework/ev_allocator.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/framework/ev_allocator.h b/tensorflow/core/framework/ev_allocator.h index d3251b14782..5082ee04b72 100644 --- a/tensorflow/core/framework/ev_allocator.h +++ b/tensorflow/core/framework/ev_allocator.h @@ -546,15 +546,15 @@ class EVAllocatorImpl { page_map_ = new PageMap(); page_map_->Init(); - int64 arena_array_size = ARENA_ARRAY_SIZE; + arena_array_size_ = ARENA_ARRAY_SIZE; Status s = ReadInt64FromEnvVar("ARENA_ARRAY_SIZE", - ARENA_ARRAY_SIZE, &arena_array_size); + ARENA_ARRAY_SIZE, &arena_array_size_); if (!s.ok()) { LOG(ERROR) << "Read ARENA_ARRAY_SIZE env error: " << s.error_message(); } - LOG(INFO) << "EVAllocator set arena array size: " << arena_array_size; + LOG(INFO) << "EVAllocator set arena array size: " << arena_array_size_; - arenas_ = new std::vector>(arena_array_size, page_map_); + arenas_ = new std::vector>(arena_array_size_, page_map_); arena_cur_index = 0; } @@ -602,7 +602,7 @@ class EVAllocatorImpl { { mutex_lock l(mu_arena_index_); ret = &((*arenas_)[arena_cur_index]); - arena_cur_index = (arena_cur_index + 1) % ARENA_ARRAY_SIZE; + arena_cur_index = (arena_cur_index + 1) % arena_array_size_; } return ret; @@ -619,6 +619,7 @@ class EVAllocatorImpl { PageMap* page_map_ = nullptr; std::vector> *arenas_ = nullptr; int arena_cur_index GUARDED_BY(mu_arena_index_); + int64 arena_array_size_; }; template From 04413cf0ee6ca57f35446095c4e27bc1cfdf2b0d Mon Sep 17 00:00:00 2001 From: Chaofeng Guo Date: Thu, 18 Apr 2024 19:56:17 
+0800 Subject: [PATCH 39/45] [Embedding] Fix the issue of default_value type mismatch in the EV Gather op. (#989) Signed-off-by: Lyaction --- tensorflow/python/ops/kv_variable_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 840aadf2541..55e01537c0d 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -858,10 +858,10 @@ def sparse_read(self, indices, name=None, ev_init_value=None, counts=None): if self._trainable: tape.variable_accessed(self) if ev_init_value is not None: - default_value = ev_init_value + default_value = math_ops.cast(ev_init_value, self.dtype) is_use_default_value_tensor = True else: - default_value = ops.convert_to_tensor(1.0) + default_value = ops.convert_to_tensor(1.0, dtype=self.dtype) is_use_default_value_tensor = False if counts != None: value = gen_kv_variable_ops.kv_resource_gather_v1(self._handle, From fc08e1b605490e818cdf80bc2389b68028c19049 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Fri, 26 Apr 2024 11:33:59 +0800 Subject: [PATCH 40/45] [Hook] Add 'before_create_session' interface to SessionRunHook. (#991) Signed-off-by: chenbangduo.cbd --- tensorflow/python/training/monitored_session.py | 3 +++ tensorflow/python/training/session_run_hook.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py index 6eb204785dd..9492028a200 100644 --- a/tensorflow/python/training/monitored_session.py +++ b/tensorflow/python/training/monitored_session.py @@ -957,6 +957,8 @@ def __init__(self, session_creator, hooks, stop_grace_period_secs): def create_session(self): """Creates a coordinated session.""" # Keep the tf_sess for unit testing. 
+ for hook in self._hooks: + hook.before_create_session() self.tf_sess = self._session_creator.create_session() # We don't want coordinator to suppress any exception. self.coord = coordinator.Coordinator(clean_stop_exception_types=[]) @@ -1027,6 +1029,7 @@ class MonitoredSession(_MonitoredSession): in given order: * calls `hook.begin()` for each given hook + * calls `hook.before_create_session()` * finalizes the graph via `scaffold.finalize()` * create session * initializes the model via initialization ops provided by `Scaffold` diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py index e598bc2d98c..9d05d04c139 100644 --- a/tensorflow/python/training/session_run_hook.py +++ b/tensorflow/python/training/session_run_hook.py @@ -109,6 +109,20 @@ def begin(self): """ pass + def before_create_session(self): + """Called before new TensorFlow session is created. + + This has two essential differences with the situation in which `begin` is + called: + + * Do not modify the graph in this method, ops should not be added to graph. + The modification of the graph should take place within the begin + interface. + * This method will also be called prior to the recovery of a wrapped + session, not just at the beginning of the overall session. + """ + pass + def after_create_session(self, session, coord): # pylint: disable=unused-argument """Called when new TensorFlow session is created. From e10d4411dfb93ca47f6e1908ac878d1417c7db58 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Mon, 29 Apr 2024 17:18:35 +0800 Subject: [PATCH 41/45] [Docs] Fix readthedoc build fail. 
(#993) - Add configure file: docs/docs_zh/.readthedocs.yaml docs/docs_en/.readthedocs.yaml Signed-off-by: Chen Ding --- docs/docs_en/.readthedocs.yaml | 35 ++++++++++++++++++++++++++++++++++ docs/docs_zh/.readthedocs.yaml | 35 ++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 docs/docs_en/.readthedocs.yaml create mode 100644 docs/docs_zh/.readthedocs.yaml diff --git a/docs/docs_en/.readthedocs.yaml b/docs/docs_en/.readthedocs.yaml new file mode 100644 index 00000000000..c69bbd13812 --- /dev/null +++ b/docs/docs_en/.readthedocs.yaml @@ -0,0 +1,35 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/docs_en/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/docs_en/requirements.txt diff --git a/docs/docs_zh/.readthedocs.yaml b/docs/docs_zh/.readthedocs.yaml new file mode 100644 index 00000000000..859db8adfa5 --- /dev/null +++ b/docs/docs_zh/.readthedocs.yaml @@ -0,0 +1,35 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + 
+# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/docs_zh/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/docs_zh/requirements.txt From b2aed9686182124fca72f8093e74136cc13dcd39 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Tue, 14 May 2024 10:43:13 +0800 Subject: [PATCH 42/45] [Embedding] Change the log level for EV restore. (#995) Signed-off-by: chenbangduo.cbd --- tensorflow/core/kernels/kv_variable_restore_ops.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index e16db9b4cd6..0a0165595f0 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -376,8 +376,8 @@ class KvResourceImportV3Op: public AsyncOpKernel { // EV should not be initialized at this time. 
if (ev->IsInitialized()) { - LOG(ERROR) << "Import parameter for EV (" << name_string - << ") failed, this EV has already been initialized."; + LOG(WARNING) << "EV (" << name_string + << ") has already been initialized."; } auto do_compute = [this, context, file_name_string, ev, From 93c69ad9576d6ee0f7b9479bef9b091451e5b91a Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Tue, 21 May 2024 19:26:07 +0800 Subject: [PATCH 43/45] [Rendezvous] RemoteRendezvous supports FlowControl. (#994) Signed-off-by: chenbangduo.cbd --- .../base_rendezvous_mgr.cc | 213 ++++++++++++++- .../distributed_runtime/base_rendezvous_mgr.h | 45 ++++ .../rendezvous_mgr_interface.h | 11 +- .../rpc/grpc_remote_worker.cc | 10 + .../rpc/grpc_worker_interface.h | 6 + .../rpc/grpc_worker_service.cc | 162 ++++++++++++ .../rpc/grpc_worker_service.h | 4 + .../rpc/grpc_worker_service_impl.cc | 2 + .../rpc/grpc_worker_service_impl.h | 1 + .../rpc/rpc_rendezvous_mgr.cc | 245 ++++++++++++++++++ .../rpc/rpc_rendezvous_mgr_test.cc | 26 ++ tensorflow/core/framework/rendezvous.cc | 41 +++ tensorflow/core/framework/rendezvous.h | 26 ++ .../core/kernels/file_slice_sendrecv_ops.cc | 20 +- .../core/kernels/file_slice_sendrecv_ops.h | 2 + .../kernels/file_slice_sendrecv_ops_test.cc | 13 + tensorflow/core/kernels/slice_sendrecv_ops.cc | 40 +-- tensorflow/core/kernels/slice_sendrecv_ops.h | 2 + .../core/kernels/slice_sendrecv_ops_test.cc | 13 + tensorflow/core/protobuf/worker.proto | 46 ++++ tensorflow/core/protobuf/worker_service.proto | 5 + 21 files changed, 903 insertions(+), 30 deletions(-) diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc index 17935eb8982..ead121b30c8 100644 --- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc @@ -34,11 +34,13 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/env_var.h" namespace tensorflow { namespace { uint64 kGlobalStepId = 0x100000000000000uLL; + int64 kFlowControlMaxSize = 16; } // namespace anonymous static void StartAbortRendevous(Rendezvous* rendez, const Status& s) { @@ -127,6 +129,23 @@ void BaseRendezvousMgr::FuseRecvLocalAsync( rendez->FuseRecvLocalAsync(parsed_keys, std::move(done_cb)); } +void BaseRendezvousMgr::FlowControlRecvLocalAsync(int64 step_id, + const StringPiece& tag, const Rendezvous::ParsedKey& parsed, + Rendezvous::DoneCallback done) { + auto rendez = FindOrCreate(step_id); + using namespace std::placeholders; + Rendezvous::DoneCallback done_cb = std::bind( + [rendez](Rendezvous::DoneCallback done, + // Begin unbound arguments. + const Status& s, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& v, bool dead) { + rendez->Unref(); + done(s, send_args, recv_args, v, dead); + }, + std::move(done), _1, _2, _3, _4, _5); + rendez->FlowControlRecvLocalAsync(tag, parsed, std::move(done_cb)); +} + void BaseRendezvousMgr::Cleanup(int64 step_id) { Rendezvous* rendez = nullptr; { @@ -174,7 +193,17 @@ BaseRemoteRendezvous::BaseRemoteRendezvous(const WorkerEnv* env, int64 step_id) : env_(env), step_id_(step_id), local_(NewLocalRendezvous()), - session_(nullptr) {} + session_(nullptr), + flow_control_num_(0) { + Status s = ReadInt64FromEnvVar("REMOTE_RENDEZVOUS_FLOW_CONTROL_MAX_SIZE", + kFlowControlMaxSize, &flow_control_max_size_); + if (!s.ok()) { + LOG(ERROR) << "Read REMOTE_RENDEZVOUS_FLOW_CONTROL_MAX_SIZE env error: " + << s.error_message(); + } + VLOG(2) << "BaseRemoteRendezvous set flow control max size: " + << flow_control_max_size_; +} BaseRemoteRendezvous::~BaseRemoteRendezvous() { CHECK(active_.empty()); @@ -221,6 +250,16 @@ Status BaseRemoteRendezvous::Initialize(WorkerSession* session) { 
std::move(fuse_call.done)); } + std::vector deferred_flow_control_calls; + { + mutex_lock l(mu_); + std::swap(deferred_flow_control_calls, deferred_flow_control_calls_); + } + for (auto& fc_call : deferred_flow_control_calls) { + FlowControlRecvLocalAsyncInternal(fc_call.tag, fc_call.parsed, + std::move(fc_call.done)); + } + return Status::OK(); } @@ -271,6 +310,43 @@ Status BaseRemoteRendezvous::Send(const ParsedKey& parsed, return local_->Send(parsed, args, val, mu, is_dead); } +Status BaseRemoteRendezvous::FlowControlSend(const StringPiece& tag, + const ParsedKey& parsed, + const Args& args, + const Tensor& val, + const bool is_dead, + const int64 timeout_millis) { + VLOG(1) << "BaseRemoteRendezvous FlowControlSend " << this << " " + << parsed.FullKey(); + const std::string tag_string(tag.data(), tag.size()); + { + mutex_lock l(mu_); + while(status_.ok() && flow_control_num_ >= flow_control_max_size_) { + if (flow_control_cv_.wait_for( + l, std::chrono::milliseconds(timeout_millis)) == \ + std::cv_status::timeout) { + return errors::DeadlineExceeded("FlowControlSend has timed out."); + } + } + + if (!status_.ok()) return status_; + DCHECK(is_initialized_locked()); + if (!IsLocalDevice(session_->worker_name, parsed.src_device)) { + return errors::InvalidArgument( + "Invalid rendezvous key (src): ", parsed.FullKey(), " @ ", + session_->worker_name); + } + + flow_control_num_++; + if (flow_control_counters_.count(tag_string) == 0) { + flow_control_counters_[tag_string] = 0; + } + flow_control_counters_[tag_string]++; + } + // Buffers "val" and "device_context" in local_. 
+ return local_->Send(parsed, args, val, is_dead); +} + Status BaseRemoteRendezvous::ValidateDevices(const ParsedKey& parsed, bool is_src) { // Cache session pointer to avoid repeatedly taking & releasing the lock @@ -413,6 +489,63 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed, } } +void BaseRemoteRendezvous::FlowControlRecvAsync(const StringPiece& tag, + const ParsedKey& parsed, + const Args& recv_args, + DoneCallback done) { + VLOG(1) << "RemoteRendezvous FlowControlRecvAsync " << this + << " " << tag << " " << parsed.FullKey(); + + Status s = ValidateDevices(parsed, false /*!is_src*/); + if (s.ok() && !is_initialized()) { + s.Update(errors::Internal( + "FlowControlRecvAsync called when uninitialized (key:", + parsed.FullKey(), ").")); + } + if (!s.ok()) { + done(s, Args(), recv_args, Tensor(), false); + return; + } + + // Are src and dst in the same worker? + if (IsSameWorker(parsed.src, parsed.dst)) { + // Recv the tensor from local_. + local_->RecvAsync( + parsed, recv_args, + [this, tag, parsed, done]( + const Status& status, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& in, bool is_dead) { + VLOG(2) << "RemoteRendezvous Finished Recv " << this << " " + << parsed.FullKey(); + Tensor* out = new Tensor; + StatusCallback final_callback = [done, send_args, recv_args, out, + is_dead](const Status& s) { + done(s, send_args, recv_args, *out, is_dead); + delete out; + }; + + if (status.ok()) { + SameWorkerRecvDone(parsed, send_args, recv_args, in, out, + std::move(final_callback)); + const std::string tag_string(tag.data(), tag.size()); + { + mutex_lock l(mu_); + flow_control_num_--; + DCHECK(flow_control_counters_.count(tag_string) != 0); + flow_control_counters_[tag_string]--; + } + flow_control_cv_.notify_one(); + } else { + final_callback(status); + } + }); + return; + } else { + FlowControlRecvFromRemoteAsync(tag, parsed, recv_args, std::move(done)); + } + +} + void 
BaseRemoteRendezvous::RecvLocalAsync(const ParsedKey& parsed, DoneCallback done) { { @@ -600,6 +733,58 @@ void BaseRemoteRendezvous::FuseRecvLocalAsyncInternal( } } +void BaseRemoteRendezvous::FlowControlRecvLocalAsync(const StringPiece& tag, + const ParsedKey& parsed, + DoneCallback done) { + { + mutex_lock l(mu_); + if (!is_initialized_locked()) { + // FlowControlRecvLocalAsync can be called (due to an incoming RecvTensor + // RPC from a remote worker) before the RunStep (or PartialRunStep) RPC + // from the master arrives. RecvLocalAsync thus buffers the arguments + // until after the RemoteRendezvous is Initialize()'d, when it completes + // the rendezvous logic. At some point after Initialize() is called, a + // Tensor is produced locally that will then be sent in response to the + // incoming RPC. + DeferredFlowControlCall call(tag, parsed, std::move(done)); + deferred_flow_control_calls_.push_back(call); + return; + } + } + FlowControlRecvLocalAsyncInternal(tag, parsed, std::move(done)); +} + +void BaseRemoteRendezvous::FlowControlRecvLocalAsyncInternal( + const StringPiece& tag, const ParsedKey& parsed, DoneCallback done) { + Status s = ValidateDevices(parsed, true /* is_src */); + if (!s.ok()) { + done(s, Args(), Args(), Tensor(), false); + return; + } + + using namespace std::placeholders; + Rendezvous::DoneCallback done_cb = std::bind( + [this, tag](Rendezvous::DoneCallback done, + // Begin unbound arguments. 
+ const Status& s, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& v, bool dead) { + done(s, send_args, recv_args, v, dead); + if (s.ok()) { + const std::string tag_string(tag.data(), tag.size()); + { + mutex_lock l(mu_); + flow_control_num_--; + DCHECK(flow_control_counters_.count(tag_string) != 0); + flow_control_counters_[tag_string]--; + } + flow_control_cv_.notify_one(); + } + }, + std::move(done), _1, _2, _3, _4, _5); + + local_->RecvAsync(parsed, Args(), std::move(done_cb)); +} + void BaseRemoteRendezvous::FuseRecvFromRemoteAsync( const std::vector& parsed_keys, const Rendezvous::Args& args, @@ -607,6 +792,12 @@ void BaseRemoteRendezvous::FuseRecvFromRemoteAsync( CHECK(false) << "FuseRecvFromRemoteAsync Unimplemented"; } +void BaseRemoteRendezvous::FlowControlRecvFromRemoteAsync( + const StringPiece& tag, const Rendezvous::ParsedKey& parsed, + const Rendezvous::Args& args, DoneCallback done) { + CHECK(false) << "FlowControlRecvFromRemoteAsync Unimplemented."; +} + void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed, const Rendezvous::Args& recv_args, RefDoneCallback done) { @@ -636,6 +827,19 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed, } } +int64 BaseRemoteRendezvous::GetAllFlowControlItemNum() { + mutex_lock l(mu_); + return flow_control_num_; +} + +int64 BaseRemoteRendezvous::GetFlowControlItemNum(StringPiece tag) { + const std::string tag_string(tag.data(), tag.size()); + mutex_lock l(mu_); + if (flow_control_counters_.count(tag_string) == 0) + return 0; + return flow_control_counters_[tag_string]; +} + void BaseRemoteRendezvous::StartAbort(const Status& s) { CHECK(!s.ok()); // Use a "derived" status as the status for the rendezvous. 
Derived @@ -656,7 +860,10 @@ void BaseRemoteRendezvous::StartAbort(const Status& s) { } active_.clear(); } + flow_control_num_ = 0; + flow_control_counters_.clear(); } + flow_control_cv_.notify_all(); } void BaseRemoteRendezvous::RegisterCall(BaseRecvTensorCall* call, @@ -707,4 +914,8 @@ BaseRemoteRendezvous::DeferredFuseCall::DeferredFuseCall( const std::vector& parsed_keys, FuseDoneCallback done) : parsed_keys(parsed_keys), done(std::move(done)) {} +BaseRemoteRendezvous::DeferredFlowControlCall::DeferredFlowControlCall( + const StringPiece& tag, const ParsedKey& parsed, DoneCallback done) + : tag(tag), parsed(parsed), done(std::move(done)) {} + } // end namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h index b65e59436c0..fc72d9bedfc 100644 --- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h +++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_BASE_RENDEZVOUS_MGR_H_ #include +#include #include #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h" @@ -86,6 +87,10 @@ class BaseRendezvousMgr : public RendezvousMgrInterface { const std::vector& parsed_keys, Rendezvous::FuseDoneCallback done) override; + void FlowControlRecvLocalAsync(int64 step_id, const StringPiece& tag, + const Rendezvous::ParsedKey& parsed, + Rendezvous::DoneCallback done) override; + // Removes rendezvous for "step_id". 
// // TODO(zhifengc): Have a background thread in worker that @@ -140,6 +145,11 @@ class BaseRemoteRendezvous : public RemoteRendezvous { Status Send(const ParsedKey& key, const Rendezvous::Args& args, Tensor* val, mutex* mu, const bool is_dead) override; + Status FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead, + const int64 timeout_millis) override; + // This method is called only by the RecvOp. It tests to see // whether the value will be produced by a local or remote device // and handles accordingly. In the local case it forwards to @@ -147,6 +157,10 @@ class BaseRemoteRendezvous : public RemoteRendezvous { void RecvAsync(const ParsedKey& key, const Rendezvous::Args& args, DoneCallback done) override; + void FlowControlRecvAsync(const StringPiece& tag, + const ParsedKey& parsed_key, + const Args& args, DoneCallback done) override; + void StartAbort(const Status& status) override; // This method is called only by the local Worker, forwarded through @@ -171,10 +185,18 @@ class BaseRemoteRendezvous : public RemoteRendezvous { void FuseRecvLocalSync(const std::vector& parsed_keys, FuseDoneCallback done); + void FlowControlRecvLocalAsync(const StringPiece& tag, + const ParsedKey& parsed, DoneCallback done); + // For ref send/recv void RecvAsync(const ParsedKey& key, const Rendezvous::Args& args, RefDoneCallback done) override; + // Obtain statistical information + int64 GetAllFlowControlItemNum() override; + + int64 GetFlowControlItemNum(StringPiece tag) override; + protected: virtual void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& args, @@ -185,6 +207,10 @@ class BaseRemoteRendezvous : public RemoteRendezvous { const Rendezvous::Args& args, FuseDoneCallback done); + virtual void FlowControlRecvFromRemoteAsync(const StringPiece& tag, + const Rendezvous::ParsedKey& parsed, + const Rendezvous::Args& args, DoneCallback done); + // Returns true if "src" and 
"dst" are located in the same worker, // and hence may use a local rendezvous. virtual bool IsSameWorker(DeviceNameUtils::ParsedName src, @@ -210,6 +236,12 @@ class BaseRemoteRendezvous : public RemoteRendezvous { mutable mutex mu_; + // For Flow Control. + int64 flow_control_max_size_; + int64 flow_control_num_ GUARDED_BY(mu_); + std::unordered_map flow_control_counters_ GUARDED_BY(mu_); + tensorflow::condition_variable flow_control_cv_; + // Status given by StartAbort() if any. Status status_ GUARDED_BY(mu_); WorkerSession* session_ GUARDED_BY(mu_); // Not owned. @@ -233,6 +265,16 @@ class BaseRemoteRendezvous : public RemoteRendezvous { }; std::vector deferred_fuse_calls_ GUARDED_BY(mu_); + struct DeferredFlowControlCall { + const StringPiece tag; + const ParsedKey parsed; + DoneCallback done; + + DeferredFlowControlCall(const StringPiece& tag, const ParsedKey& parsed, + DoneCallback done); + }; + std::vector deferred_flow_control_calls_ GUARDED_BY(mu_); + typedef std::function InactiveCallback; // Active outstanding RecvTensor calls. @@ -262,6 +304,9 @@ class BaseRemoteRendezvous : public RemoteRendezvous { void FuseRecvLocalAsyncInternal(const std::vector& parsed_keys, FuseDoneCallback done); + void FlowControlRecvLocalAsyncInternal(const StringPiece& tag, + const ParsedKey& parsed, + DoneCallback done); TF_DISALLOW_COPY_AND_ASSIGN(BaseRemoteRendezvous); }; diff --git a/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h b/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h index caf4af97ac2..abc971c4552 100644 --- a/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h +++ b/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h @@ -40,6 +40,11 @@ class RemoteRendezvous : public Rendezvous { public: // Fully construct the RemoteRendezvous. 
virtual Status Initialize(WorkerSession* session) = 0; + + // Obtain statistical information + virtual int64 GetAllFlowControlItemNum() = 0; + + virtual int64 GetFlowControlItemNum(StringPiece tag) = 0; }; // RendezvousMgr keeps track of a set of local rendezvous instances. @@ -87,7 +92,11 @@ class RendezvousMgrInterface { virtual void FuseRecvLocalAsync( int64 step_id, const std::vector& parsed_keys, - Rendezvous::FuseDoneCallback done) = 0; + Rendezvous::FuseDoneCallback done) = 0; + + virtual void FlowControlRecvLocalAsync(int64 step_id, const StringPiece& tag, + const Rendezvous::ParsedKey& parsed, + Rendezvous::DoneCallback done) = 0; // Removes rendezvous for "step_id". // diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc index ba95e80b496..c3fb6a8ee6c 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc @@ -63,6 +63,7 @@ class GrpcRemoteWorker : cleanupall_(Method(GrpcWorkerMethod::kCleanupAll)), recvtensor_(Method(GrpcWorkerMethod::kRecvTensor)), fuserecvtensor_(Method(GrpcWorkerMethod::kFuseRecvTensor)), + flowcontrolrecvtensor_(Method(GrpcWorkerMethod::kFlowControlRecvTensor)), recvbuf_(Method(GrpcWorkerMethod::kRecvBuf)), logging_(Method(GrpcWorkerMethod::kLogging)), tracing_(Method(GrpcWorkerMethod::kTracing)), @@ -210,6 +211,14 @@ class GrpcRemoteWorker : IssueRequest(request, response, fuserecvtensor_, done, call_opts); } + void FlowControlRecvTensorAsync(CallOptions* call_opts, + const FlowControlRecvTensorRequest* request, + TensorResponse* response, + StatusCallback done) { + VLOG(1) << "FlowControlRecvTensorAsync req: " << request->DebugString(); + IssueRequest(request, response, flowcontrolrecvtensor_, done, call_opts); + } + void RecvTensorAsync(CallOptions* call_opts, const RecvTensorRequest* request, TensorResponse* response, StatusCallback done) override { VLOG(1) << 
"RecvTensorAsync req: " << request->DebugString(); @@ -341,6 +350,7 @@ class GrpcRemoteWorker : const ::grpc::string cleanupall_; const ::grpc::string recvtensor_; const ::grpc::string fuserecvtensor_; + const ::grpc::string flowcontrolrecvtensor_; const ::grpc::string recvbuf_; const ::grpc::string logging_; const ::grpc::string tracing_; diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_interface.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_interface.h index 20f1d2b5a62..2c885fec75d 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_interface.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_interface.h @@ -6,6 +6,8 @@ namespace tensorflow { class CallOptions; class FuseTensorResponse; class FuseRecvTensorRequest; +class FlowControlRecvTensorRequest; +class TensorResponse; class GrpcWorkerInterface { public: @@ -13,6 +15,10 @@ class GrpcWorkerInterface { const FuseRecvTensorRequest* request, FuseTensorResponse* response, StatusCallback done) = 0; + + virtual void FlowControlRecvTensorAsync(CallOptions* call_opts, + const FlowControlRecvTensorRequest* request, + TensorResponse* response, StatusCallback done) = 0; }; } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc index ef4fbeab438..3bdacc29a12 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc @@ -170,6 +170,15 @@ class GrpcWorkerServiceThread { EnqueueFuseRecvTensorRequestRaw(); } + // Support FlowControlRecv + for (int i = 0; + i < gtl::FindWithDefault( + queue_depth_, static_cast(GrpcWorkerMethod::kFlowControlRecvTensor), + 1000); + ++i) { + EnqueueFlowControlRecvTensorRequestRaw(); + } + void* tag; bool ok; @@ -312,6 +321,24 @@ class GrpcWorkerServiceThread { EnqueueFuseRecvTensorRequestRaw(); } + void FlowControlRecvTensorHandlerRaw( + WorkerCall* 
call) { + Schedule([this, call]() { + CallOptions* call_opts = new CallOptions; + call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); }); + + worker_->GrpcFlowControlRecvTensorAsync(call_opts, &call->request, + &call->response, + [call, call_opts + ](const Status& s) { + call->ClearCancelCallback(); + delete call_opts; + call->SendResponse(ToGrpcStatus(s)); + }); + }); + EnqueueFlowControlRecvTensorRequestRaw(); + } + void RecvBufHandler(WorkerCall* call) { Schedule([this, call]() { CallOptions* call_opts = new CallOptions; @@ -394,6 +421,19 @@ class GrpcWorkerServiceThread { } } + void EnqueueFlowControlRecvTensorRequestRaw() { + mutex_lock l(shutdown_mu_); + if (!is_shutdown_) { + Call:: + EnqueueRequestForMethod( + worker_service_, cq_.get(), + static_cast(GrpcWorkerMethod::kFlowControlRecvTensor), + &GrpcWorkerServiceThread::FlowControlRecvTensorHandlerRaw, + true /* supports cancel*/); + } + } + GrpcWorker* const worker_ = nullptr; // Not owned. std::unique_ptr<::grpc::ServerCompletionQueue> cq_; std::unique_ptr thread_; @@ -746,6 +786,128 @@ void GrpcWorker::GrpcFuseRecvTensorAsync(CallOptions* opts, }); } +// GrpcFlowControlRecvTensorAsync: unlike the other Worker methods, which use +// protocol buffers for a response object, to avoid extra protocol buffer +// serialization overhead we generate our response directly into a +// ::grpc::ByteBuffer object +void GrpcWorker::GrpcFlowControlRecvTensorAsync(CallOptions* opts, + const FlowControlRecvTensorRequest* request, + ::grpc::ByteBuffer* response, StatusCallback done) { + VLOG(1) << "GrpcFlowControlRecvTensorAsync req: " << request->DebugString(); + const int64 request_id = request->request_id(); + const int64 step_id = request->step_id(); + + bool cache_enabled = (response_cache_ != nullptr && request_id != 0); + + auto do_response = [response, done, cache_enabled](const Tensor& tensor, + bool is_dead, + const Status& status) { + if (status.ok()) { + grpc::EncodeTensorToByteBuffer(is_dead, 
tensor, cache_enabled, response); + } + done(status); + }; + + // If response cache is enabled and the response cache already contains the + // request, we delegate this retry request to the response cache. Otherwise, + // we add the request to the response cache and start the computation to + // retrieve the requested data. + if (cache_enabled && + response_cache_->QueueRequest(request_id, step_id, do_response)) { + return; + } + + auto rendezvous_done = [this, request_id, do_response, cache_enabled]( + const Tensor& tensor, bool is_dead, + const Status& status) { + if (cache_enabled) { + // Data is ready. Process all pending requests in the response cache. + response_cache_->OnRequestFinished(request_id, tensor, is_dead, status); + } else { + do_response(tensor, is_dead, status); + } + }; + + auto fail = [&rendezvous_done](const Status& status) { + rendezvous_done(Tensor(), false, status); + }; + + Status s = recent_request_ids_.TrackUnique( + request_id, "RecvTensor (GrpcWorker)", *request); + if (!s.ok()) { + fail(s); + return; + } + + const string& key = request->rendezvous_key(); + TRACEPRINTF("RecvTensor: %lld %s", step_id, key.c_str()); + Rendezvous::ParsedKey parsed; + s = Rendezvous::ParseKey(key, &parsed); + Device* src_dev = nullptr; + if (s.ok()) { + s = PrepareRecvTensor(parsed, &src_dev); + } + if (!s.ok()) { + fail(s); + return; + } + + // Request the tensor associated with the rendezvous key. + // Note that we log the cancellation here but do not abort the current step. + // gRPC can generate cancellations in response to transient network failures, + // and aborting the step eliminates the opportunity for client side retries. + // Repeated client failures will eventually cause the step to be aborted by + // the client. 
+ opts->SetCancelCallback( + [step_id]() { LOG(WARNING) << "RecvTensor cancelled for " << step_id; }); + StringPiece tag = request->tag(); + env_->rendezvous_mgr->FlowControlRecvLocalAsync( + step_id, tag, parsed, + [opts, rendezvous_done, src_dev, request]( + const Status& status, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& val, + const bool is_dead) { + opts->ClearCancelCallback(); + if (status.ok()) { + // DMA can only be used for Tensors that do not fall into + // the following three odd edge cases: 1) a zero-size + // buffer, 2) a dead tensor which has an uninit value, and + // 3) the tensor has the on_host allocation attribute, + // i.e. it's in CPU RAM *independent of its assigned + // device type*. + const bool on_host = send_args.alloc_attrs.on_host(); + { + // Non-DMA cases. + if (src_dev->tensorflow_gpu_device_info() && (!on_host)) { + DeviceContext* send_dev_context = send_args.device_context; + AllocatorAttributes alloc_attrs; + alloc_attrs.set_gpu_compatible(true); + alloc_attrs.set_on_host(true); + Allocator* alloc = src_dev->GetAllocator(alloc_attrs); + Tensor* copy = new Tensor(alloc, val.dtype(), val.shape()); + CHECK(send_dev_context) + << "send dev name: " << src_dev->name() + << " gpu_info: " << src_dev->tensorflow_gpu_device_info(); + // "val" is on an accelerator device. Uses the device_context to + // fill the copy on host. + StatusCallback copy_ready = [rendezvous_done, copy, + is_dead](const Status& s) { + // The value is now ready to be returned on the wire. 
+ rendezvous_done(*copy, is_dead, s); + delete copy; + }; + + CopyDeviceToHost(&val, alloc, alloc, request->rendezvous_key(), + src_dev, copy, send_dev_context, copy_ready); + return; + } + } + } + + rendezvous_done(val, is_dead, status); + }); +} + namespace { // If RecvBufRespExtra.tensor_content is a single large string, then gRPC // can stall on the recv side when the string buffer needs to be enlarged, diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h index 69759c420cc..48941d438c9 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h @@ -51,6 +51,10 @@ class GrpcWorker : public Worker { ::grpc::ByteBuffer* response, StatusCallback done); + virtual void GrpcFlowControlRecvTensorAsync(CallOptions* opts, + const FlowControlRecvTensorRequest* request, + ::grpc::ByteBuffer* response, StatusCallback done); + void LoggingAsync(const LoggingRequest* request, LoggingResponse* response, StatusCallback done) override; diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc index 515d6e90beb..2095540e36a 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc @@ -48,6 +48,8 @@ const char* GrpcWorkerMethodName(GrpcWorkerMethod id) { return "/tensorflow.WorkerService/RecvTensor"; case GrpcWorkerMethod::kFuseRecvTensor: return "/tensorflow.WorkerService/FuseRecvTensor"; + case GrpcWorkerMethod::kFlowControlRecvTensor: + return "/tensorflow.WorkerService/FlowControlRecvTensor"; case GrpcWorkerMethod::kRecvBuf: return "/tensorflow.WorkerService/RecvBuf"; case GrpcWorkerMethod::kLogging: diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h 
b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h index ff8e1c07cb4..ad77ee0fd80 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h @@ -80,6 +80,7 @@ enum class GrpcWorkerMethod { kCleanupAll, kRecvTensor, kFuseRecvTensor, + kFlowControlRecvTensor, kRecvBuf, kLogging, kTracing, diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc index 69f1481f59e..267bf09e66f 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc @@ -53,6 +53,10 @@ class RpcRemoteRendezvous : public BaseRemoteRendezvous { const Rendezvous::Args& args, FuseDoneCallback done) override; + void FlowControlRecvFromRemoteAsync(const StringPiece& tag, + const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& recv_args, + DoneCallback done) override; + private: ~RpcRemoteRendezvous() override {} @@ -529,6 +533,247 @@ void RpcRemoteRendezvous::FuseRecvFromRemoteAsync( }); } + + +class FlowControlRpcRecvTensorCall : public BaseRecvTensorCall { + public: + FlowControlRpcRecvTensorCall() + : wi_(nullptr), dst_device_(nullptr) {} + + void Init(WorkerInterface* wi, int64 step_id, const StringPiece& tag, + const StringPiece& key, AllocatorAttributes alloc_attrs, + Device* dst_device, const Rendezvous::Args& recv_args, + Rendezvous::DoneCallback done) { + wi_ = wi; + grpc_wi_ = dynamic_cast(wi_); + alloc_attrs_ = alloc_attrs; + dst_device_ = dst_device; + recv_args_ = recv_args; + done_ = std::move(done); + req_.set_step_id(step_id); + req_.set_tag(tag.data(), tag.size()); + req_.set_request_id(GetUniqueRequestId()); + req_.set_rendezvous_key(key.data(), key.size()); + } + + void Reset() { + // The FlowControlRpcRemoteRendezvous using this object is responsible for + // calling ReleaseWorker() before Reset(). 
+ DCHECK_EQ(static_cast(nullptr), wi_) + << "Leaking WorkerInterface in RpcRecvTensorCall::Reset()."; + + alloc_attrs_ = AllocatorAttributes(); + dst_device_ = nullptr; + // We don't clear opts_ and assume that Init will set up the state for + // opts_ appropriately. + req_.Clear(); + resp_.Clear(); + { + mutex_lock l(mu_); + status_ = Status::OK(); + } + done_ = nullptr; + } + + ~FlowControlRpcRecvTensorCall() override { + // Since only the FlowControlRpcRecvTensorFreeList will delete an + // FlowControlRpcRecvTensorCall, and it always sets this->wi_ to null when + // a call object is released to it, we can assert that this->wi_ is + // always null at the point of deletion. + CHECK_EQ(static_cast(nullptr), wi_) + << "Leaking WorkerInterface in FlowControlRpcRecvTensorCall destructor."; + } + + void Start(std::function recv_done) override { + StartRTCall(std::move(recv_done)); + } + + void StartAbort(const Status& s) override { + { + mutex_lock l(mu_); + status_.Update(s); + } + opts_.StartCancel(); + } + + Status status() const override { + mutex_lock l(mu_); + return status_; + } + + void ReleaseWorker(WorkerCacheInterface* worker_cache) { + DCHECK_NE(static_cast(nullptr), wi_) + << "FlowControlRpcRecvTensorCall::ReleaseWorker() called twice."; + worker_cache->ReleaseWorker(src_worker_, wi_); + wi_ = nullptr; + grpc_wi_ = nullptr; + } + + const Tensor& tensor() const { return resp_.tensor(); } + + bool is_dead() const { return resp_.metadata().is_dead(); } + + Device* dst_device() const { return dst_device_; } + const Rendezvous::Args recv_args() const { return recv_args_; } + const Rendezvous::DoneCallback& done() const { return done_; } + + private: + friend class RpcRemoteRendezvous; + + // Start the main RecvTensor call, checking for an async abort. 
+ void StartRTCall(std::function recv_done) { + resp_.InitAlloc(dst_device_, alloc_attrs_); + using namespace std::placeholders; + StatusCallback cb = std::bind( + [this](std::function recv_done, + // Begin unbound arguments. + const Status& s) { + if (!s.ok()) { + mutex_lock l(mu_); + status_.Update(s); + } + recv_done(); + }, + std::move(recv_done), _1); + grpc_wi_->FlowControlRecvTensorAsync(&opts_, &req_, &resp_, std::move(cb)); + } + + string src_worker_; + string src_rel_device_; + WorkerInterface* wi_; // Not owned. + GrpcWorkerInterface* grpc_wi_; + AllocatorAttributes alloc_attrs_; + Device* dst_device_; + CallOptions opts_; + FlowControlRecvTensorRequest req_; + TensorResponse resp_; + Rendezvous::Args recv_args_; + Rendezvous::DoneCallback done_; + + mutable mutex mu_; + Status status_ GUARDED_BY(mu_); + + TF_DISALLOW_COPY_AND_ASSIGN(FlowControlRpcRecvTensorCall); +}; + +class FlowControlRpcRecvTensorFreeList { + public: + FlowControlRpcRecvTensorFreeList() {} + ~FlowControlRpcRecvTensorFreeList() { + for (size_t i = 0; i < objects_.size(); i++) { + delete objects_[i]; + } + } + + FlowControlRpcRecvTensorCall* New() { + { + mutex_lock l(mu_); + if (!objects_.empty()) { + FlowControlRpcRecvTensorCall* result = objects_.back(); + objects_.pop_back(); + return result; + } + } + return new FlowControlRpcRecvTensorCall; + } + + void Release(FlowControlRpcRecvTensorCall* obj) { + obj->Reset(); + { + mutex_lock l(mu_); + if (objects_.size() < kMaxObjects) { + objects_.push_back(obj); + return; + } + } + delete obj; + } + + private: + static const int kMaxObjects = 1000; + + mutex mu_; + std::vector objects_ GUARDED_BY(mu_); +}; + +static FlowControlRpcRecvTensorFreeList* get_flow_control_call_freelist() { + static FlowControlRpcRecvTensorFreeList* call_freelist = \ + new FlowControlRpcRecvTensorFreeList(); + return call_freelist; +} + +void RpcRemoteRendezvous::FlowControlRecvFromRemoteAsync( + const StringPiece& tag, const Rendezvous::ParsedKey& parsed, + 
const Rendezvous::Args& recv_args, DoneCallback done) { + CHECK(is_initialized()); + Status s; + + // Prepare a FlowControlRecvTensor call that can handle being aborted. + FlowControlRpcRecvTensorCall* call = get_flow_control_call_freelist()->New(); + + // key.src_device identifies a remote device. + if (!DeviceNameUtils::SplitDeviceName(parsed.src_device, &call->src_worker_, + &call->src_rel_device_)) { + s = errors::Internal(parsed.src_device, + " is invalid remote source device."); + } + + WorkerSession* sess = session(); + WorkerInterface* rwi = + sess->worker_cache->GetOrCreateWorker(call->src_worker_); + if (s.ok() && rwi == nullptr) { + s = errors::Internal("No worker known as ", call->src_worker_); + } + + Device* dst_device; + if (s.ok()) { + s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device); + } + if (!s.ok()) { + if (rwi != nullptr) { + sess->worker_cache->ReleaseWorker(call->src_worker_, rwi); + } + get_flow_control_call_freelist()->Release(call); + done(s, Args(), recv_args, Tensor{}, false); + return; + } + + call->Init(rwi, step_id_, tag, parsed.FullKey(), recv_args.alloc_attrs, + dst_device, recv_args, std::move(done)); + + // Record "call" in active_ so that it can be aborted cleanly. + RegisterCall(call, recv_args); + + // RendezvousMgr already aborted, shouldn't send RPC call any more + if (!call->status().ok()) { + // NOTE: `*sess` can potentially be deleted before we return from + // `call->done()(...)`, so we must release the worker before calling the + // callback. + call->ReleaseWorker(sess->worker_cache.get()); + call->done()(call->status(), Args(), Args(), Tensor(), false); + get_flow_control_call_freelist()->Release(call); + return; + } + + // Start "call". + Ref(); + call->Start([this, call]() { + // Removes "call" from active_. Prevent StartAbort(). + DeregisterCall(call); + // If StartAbort was called prior to DeregisterCall, then the + // current status should be bad. 
+ Status s = call->status(); + // NOTE: `*session()` can potentially be deleted before we return from + // `call->done()(...)`, so we must release the worker before calling the + // callback. + call->ReleaseWorker(session()->worker_cache.get()); + call->done()(s, Args(), call->recv_args(), call->tensor(), call->is_dead()); + get_flow_control_call_freelist()->Release(call); + Unref(); + }); + +} + } // namespace RpcRendezvousMgr::RpcRendezvousMgr(const WorkerEnv* env) diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc index 5021853ce23..75f41ab3057 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc @@ -211,6 +211,32 @@ TEST_F(RpcRendezvousMgrTest, CleanupAll) { } } +TEST_F(RpcRendezvousMgrTest, FlowControlSend) { + setenv("REMOTE_RENDEZVOUS_FLOW_CONTROL_MAX_SIZE", "2", 1); + const int64 step_id = 123; + const Rendezvous::ParsedKey key = MakeKey(Rendezvous::CreateKey( + "/job:mnist/replica:1/task:2/cpu:0", 7890, + "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0))); + { + RemoteRendezvous* rendez = rmgr_.Find(step_id); + TF_ASSERT_OK(rendez->Initialize(&worker_session_)); + core::ScopedUnref unref(rendez); + Rendezvous::Args args; + TF_ASSERT_OK( + rendez->FlowControlSend("TEST", key, args, V("peach_0"), false)); + TF_ASSERT_OK( + rendez->FlowControlSend("TEST", key, args, V("peach_1"), false)); + + EXPECT_NE( + rendez->FlowControlSend("TEST", key, args, V("peach_2"), false, 100), + Status::OK()); + EXPECT_EQ(rendez->GetAllFlowControlItemNum(), 2); + EXPECT_EQ(rendez->GetFlowControlItemNum("TEST"), 2); + } + + unsetenv("REMOTE_RENDEZVOUS_FLOW_CONTROL_MAX_SIZE"); +} + class DummyDeviceContext : public DeviceContext { public: explicit DummyDeviceContext(int stream_id) : stream_id_(stream_id) {} diff --git a/tensorflow/core/framework/rendezvous.cc 
b/tensorflow/core/framework/rendezvous.cc index e4db066a562..4d1adf1a070 100644 --- a/tensorflow/core/framework/rendezvous.cc +++ b/tensorflow/core/framework/rendezvous.cc @@ -146,6 +146,47 @@ Status Rendezvous::Recv(const ParsedKey& key, const Args& args, Tensor* val, return Recv(key, args, val, is_dead, no_timeout); } +Status Rendezvous::FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead) { + int64 no_timeout = 300000; + return FlowControlSend(tag, key, args, val, is_dead, no_timeout); +} + +Status Rendezvous::FlowControlRecv(const StringPiece& tag, const ParsedKey& key, + const Args& args, Tensor* val, bool* is_dead, + int64 timeout_ms) { + Status ret; + Notification n; + FlowControlRecvAsync(tag, key, args, [&ret, &n, val, is_dead]( + const Status& s, const Args& send_args, + const Args& recv_args, const Tensor& v, + const bool dead) { + ret = s; + *val = v; + *is_dead = dead; + n.Notify(); + }); + if (timeout_ms > 0) { + int64 timeout_us = timeout_ms * 1000; + bool notified = WaitForNotificationWithTimeout(&n, timeout_us); + if (!notified) { + return Status(error::DEADLINE_EXCEEDED, + "Timed out waiting for notification"); + } + } else { + n.WaitForNotification(); + } + return ret; +} + +Status Rendezvous::FlowControlRecv(const StringPiece& tag, const ParsedKey& key, + const Args& args, Tensor* val, + bool* is_dead) { + const int64 no_timeout = 0; + return FlowControlRecv(tag, key, args, val, is_dead, no_timeout); +} + class LocalRendezvousImpl : public Rendezvous { public: explicit LocalRendezvousImpl() {} diff --git a/tensorflow/core/framework/rendezvous.h b/tensorflow/core/framework/rendezvous.h index 3aa65534272..106c0f26b32 100644 --- a/tensorflow/core/framework/rendezvous.h +++ b/tensorflow/core/framework/rendezvous.h @@ -108,6 +108,17 @@ class Rendezvous : public core::RefCounted { virtual Status Send(const ParsedKey& key, const Args& args, Tensor* ref_val, mutex* ref_mu, const bool 
is_dead) { return Status::OK(); } + virtual Status FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead, + const int64 timeout_millis) { + return errors::Unimplemented("[Rendezvous] unimplement FlowControlSend."); + } + + virtual Status FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead); + // Callback provided by a tensor consumer waiting on the rendezvous. // It will be invoked when the tensor is available, or when a non-OK // status arises in the production of that tensor. It also gets @@ -139,12 +150,27 @@ class Rendezvous : public core::RefCounted { virtual void FuseRecvAsync(const std::vector& parsed_keys, const Args& args, FuseDoneCallback done) {} + // Local rendezvous does not need this. + virtual void FlowControlRecvAsync(const StringPiece& tag, + const ParsedKey& parsed_key, const Args& args, + DoneCallback done) { + CHECK(false) << "[Rendezvous] unimplement FlowControlRecvAsync."; + } + // Synchronous wrapper for RecvAsync. Status Recv(const ParsedKey& key, const Args& args, Tensor* val, bool* is_dead, int64 timeout_ms); Status Recv(const ParsedKey& key, const Args& args, Tensor* val, bool* is_dead); + // Synchronous wrapper for FlowControlRecvAsync. + Status FlowControlRecv(const StringPiece& tag, const ParsedKey& key, + const Args& args, Tensor* val, bool* is_dead, + int64 timeout_ms); + + Status FlowControlRecv(const StringPiece& tag, const ParsedKey& key, + const Args& args, Tensor* val, bool* is_dead); + // Aborts all pending and future Send/Recv with the given "status". // // StartAbort() does not wait for ongoing calls to finish. 
diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops.cc b/tensorflow/core/kernels/file_slice_sendrecv_ops.cc index 6bfe54363f9..a919238a5ee 100644 --- a/tensorflow/core/kernels/file_slice_sendrecv_ops.cc +++ b/tensorflow/core/kernels/file_slice_sendrecv_ops.cc @@ -33,11 +33,10 @@ FileSliceSendOp::FileSliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK( ctx, ctx->GetAttr("send_device_incarnation", reinterpret_cast(&send_device_incarnation))); - string tensor_name; - OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_)); key_prefix_ = \ slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, - recv_device, send_device_incarnation, tensor_name); + recv_device, send_device_incarnation, tensor_name_); if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { hostmem_sendrecv_ = false; @@ -212,8 +211,9 @@ Status FileSliceSendOp::SendFileSlice(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "FileSliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, - ctx->is_input_dead())); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlSend(tensor_name_, parsed_key, args, data_t, + ctx->is_input_dead())); } @@ -253,11 +253,10 @@ FileSliceRecvOp::FileSliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK( ctx, ctx->GetAttr("send_device_incarnation", reinterpret_cast(&send_device_incarnation))); - string tensor_name; - OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_)); key_prefix_ = \ slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, - recv_device, send_device_incarnation, tensor_name); + recv_device, send_device_incarnation, tensor_name_); if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { hostmem_sendrecv_ = false; } 
@@ -464,8 +463,9 @@ Status FileSliceRecvOp::RecvFileSlice(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "FileSliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, - &is_dead, timeout_ms_)); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlRecv(tensor_name_, parsed_key, args, + &data_t, &is_dead, timeout_ms_)); // This shouldn't be a dead tensor. CHECK_EQ(is_dead, false); file_ptr->Append(data_t.scalar()()); diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops.h b/tensorflow/core/kernels/file_slice_sendrecv_ops.h index 6701196d481..df7e6c646f8 100644 --- a/tensorflow/core/kernels/file_slice_sendrecv_ops.h +++ b/tensorflow/core/kernels/file_slice_sendrecv_ops.h @@ -28,6 +28,7 @@ class FileSliceSendOp : public OpKernel { private: // Variables. + string tensor_name_; string key_prefix_; bool hostmem_sendrecv_; int32 slice_size_; @@ -63,6 +64,7 @@ class FileSliceRecvOp: public OpKernel { private: // Variables. 
+ string tensor_name_; string key_prefix_; bool hostmem_sendrecv_; string recv_dir_; diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc b/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc index 931cd152253..62f5596bb62 100644 --- a/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc +++ b/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc @@ -50,6 +50,13 @@ class DummyRendezvous : public Rendezvous { kv_.erase(key_str); return Status::OK(); } + + Status FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead) { + return Send(key, args, val, is_dead); + } + void RecvAsync(const ParsedKey& key, const Args& args, DoneCallback done) override { std::string key_str = { key.FullKey().data(), key.FullKey().size() }; @@ -72,6 +79,12 @@ class DummyRendezvous : public Rendezvous { done(Status::OK(), var.args, args, var.data, var.is_dead); kv_.erase(key_str); } + + void FlowControlRecvAsync(const StringPiece& tag, const ParsedKey& parsed_key, + const Args& args, DoneCallback done) { + RecvAsync(parsed_key, args, done); + } + void StartAbort(const Status& status) override {} private: diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.cc b/tensorflow/core/kernels/slice_sendrecv_ops.cc index 25f1a4e8738..ee0e5426cbc 100644 --- a/tensorflow/core/kernels/slice_sendrecv_ops.cc +++ b/tensorflow/core/kernels/slice_sendrecv_ops.cc @@ -30,11 +30,10 @@ SliceSendOp::SliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK( ctx, ctx->GetAttr("send_device_incarnation", reinterpret_cast(&send_device_incarnation))); - string tensor_name; - OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_)); key_prefix_ = \ slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, - recv_device, send_device_incarnation, tensor_name); + recv_device, send_device_incarnation, tensor_name_); if 
(!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { hostmem_sendrecv_ = false; @@ -171,8 +170,9 @@ Status SliceSendOp::SendString(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, - ctx->is_input_dead())); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlSend(tensor_name_, parsed_key, args, + data_t, ctx->is_input_dead())); } else { TF_RETURN_IF_ERROR(SendStringSlice(ctx, frame_iter, elem, i)); } @@ -209,8 +209,9 @@ Status SliceSendOp::SendStringSlice(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, - ctx->is_input_dead())); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlSend(tensor_name_, parsed_key, args, data_t, + ctx->is_input_dead())); } return Status::OK(); @@ -248,8 +249,9 @@ Status SliceSendOp::SendBasicType(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, - ctx->is_input_dead())); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlSend(tensor_name_, parsed_key, args, data_t, + ctx->is_input_dead())); } return Status::OK(); @@ -270,11 +272,10 @@ SliceRecvOp::SliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK( ctx, ctx->GetAttr("send_device_incarnation", reinterpret_cast(&send_device_incarnation))); - string tensor_name; - OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_)); key_prefix_ = \ slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, - recv_device, 
send_device_incarnation, tensor_name); + recv_device, send_device_incarnation, tensor_name_); if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { hostmem_sendrecv_ = false; } @@ -440,8 +441,9 @@ Status SliceRecvOp::RecvString(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, - &is_dead, timeout_ms_)); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlRecv(tensor_name_, parsed_key, args, + &data_t, &is_dead, timeout_ms_)); // This shouldn't be a dead tensor. CHECK_EQ(is_dead, false); output_flat(i) = data_t.scalar()(); @@ -484,8 +486,9 @@ Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, - &is_dead, timeout_ms_)); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlRecv(tensor_name_, parsed_key, args, + &data_t, &is_dead, timeout_ms_)); // This shouldn't be a dead tensor. CHECK_EQ(is_dead, false); output_flat(index) += data_t.scalar()(); @@ -529,8 +532,9 @@ Status SliceRecvOp::RecvBasicType(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, - &is_dead, timeout_ms_)); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlRecv(tensor_name_, parsed_key, args, + &data_t, &is_dead, timeout_ms_)); // This shouldn't be a dead tensor. 
CHECK_EQ(is_dead, false); auto data_base = data_t.data(); diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.h b/tensorflow/core/kernels/slice_sendrecv_ops.h index 43429bff32f..12e583e5551 100644 --- a/tensorflow/core/kernels/slice_sendrecv_ops.h +++ b/tensorflow/core/kernels/slice_sendrecv_ops.h @@ -28,6 +28,7 @@ class SliceSendOp : public OpKernel { private: // Variables. + string tensor_name_; string key_prefix_; bool hostmem_sendrecv_; int32 slice_size_; @@ -58,6 +59,7 @@ class SliceRecvOp : public OpKernel { private: // Variable. + string tensor_name_; string key_prefix_; bool hostmem_sendrecv_; int32 slice_size_; diff --git a/tensorflow/core/kernels/slice_sendrecv_ops_test.cc b/tensorflow/core/kernels/slice_sendrecv_ops_test.cc index 5693ed57918..0eeb6d98c36 100644 --- a/tensorflow/core/kernels/slice_sendrecv_ops_test.cc +++ b/tensorflow/core/kernels/slice_sendrecv_ops_test.cc @@ -50,6 +50,13 @@ class DummyRendezvous : public Rendezvous { kv_.erase(key_str); return Status::OK(); } + + Status FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead) { + return Send(key, args, val, is_dead); + } + void RecvAsync(const ParsedKey& key, const Args& args, DoneCallback done) override { std::string key_str = { key.FullKey().data(), key.FullKey().size() }; @@ -72,6 +79,12 @@ class DummyRendezvous : public Rendezvous { done(Status::OK(), var.args, args, var.data, var.is_dead); kv_.erase(key_str); } + + void FlowControlRecvAsync(const StringPiece& tag, const ParsedKey& parsed_key, + const Args& args, DoneCallback done) { + RecvAsync(parsed_key, args, done); + } + void StartAbort(const Status& status) override {} private: diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto index 65ec7ffe4bc..fa18fec180c 100644 --- a/tensorflow/core/protobuf/worker.proto +++ b/tensorflow/core/protobuf/worker.proto @@ -441,6 +441,52 @@ message MarkRecvFinishedRequest { message 
MarkRecvFinishedResponse {} +//////////////////////////////////////////////////////////////////////////////// +// +// FlowControlRecvTensor method request messages +// +//////////////////////////////////////////////////////////////////////////////// + +message FlowControlRecvTensorRequest { + // The step in which the tensor will be produced. + // + // REQUIRED: This must eventually correspond to the `step_id` passed + // into a RunGraph call on the same WorkerService. + int64 step_id = 1; + + string tag = 2; + + // A key identifying the channel to receive tensors from. A RecvTensor request + // retrieves one tensor from the channel, but multiple tensors can be sent and + // received over the same channel with multiple RecvTensor requests. See + // rendezvous.h for details. + string rendezvous_key = 3; + + // If true, use an out-of-band DMA mechanism to transfer the + // received tensor. + bool dma_ok = 4; + + // Optional information on client-side device locality. + DeviceLocality client_locality = 5; + + // Optional information on server-side device locality. + DeviceLocality server_locality = 6; + + // Optional information needed by the RPC subsystem. + google.protobuf.Any transport_options = 7; + + // Unique identifier for this request. Every RecvTensorRequest must have a + // unique request_id, and retried RecvTensorRequests must have the same + // request_id. If request_id is zero, retry detection and response cache + // are disabled. + // + // Retried RecvTensorRequests are problematic because a RecvTensor with no + // corresponding sender will wait forever, and the tensor may have been + // delivered to a previous retry. Workers use request_ids to reject retried + // RecvTensor requests instead of waiting forever. 
+ int64 request_id = 8; +} + //////////////////////////////////////////////////////////////////////////////// // // Logging method request/response messages diff --git a/tensorflow/core/protobuf/worker_service.proto b/tensorflow/core/protobuf/worker_service.proto index 07a64c55ad8..8591f2fe6ab 100644 --- a/tensorflow/core/protobuf/worker_service.proto +++ b/tensorflow/core/protobuf/worker_service.proto @@ -72,6 +72,11 @@ service WorkerService { // FuseRecvTensor Method } + // See worker.proto for details. + rpc FlowControlRecvTensor(FlowControlRecvTensorRequest) returns (RecvTensorResponse) { + // FlowControlRecvTensor Method + } + // See worker.proto for details. rpc Logging(LoggingRequest) returns (LoggingResponse); From 9e30ab604aa316359f249bc061b5fe87a5773604 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Thu, 23 May 2024 12:00:02 +0800 Subject: [PATCH 44/45] [Embedding] Check the sharded property of tf.train.Saver. (#996) Signed-off-by: chenbangduo.cbd --- modelzoo/bst/train.py | 3 +- modelzoo/dbmtl/train.py | 3 +- modelzoo/dcn/train.py | 3 +- modelzoo/dcnv2/train.py | 3 +- modelzoo/deepfm/train.py | 3 +- modelzoo/dien/train.py | 3 +- modelzoo/din/train.py | 3 +- modelzoo/dlrm/train.py | 3 +- modelzoo/dssm/train.py | 3 +- modelzoo/esmm/train.py | 3 +- modelzoo/masknet/train.py | 3 +- modelzoo/mlperf/train.py | 3 +- modelzoo/mmoe/train.py | 3 +- modelzoo/ple/train.py | 3 +- modelzoo/simple_multitask/train.py | 3 +- modelzoo/wide_and_deep/train.py | 3 +- .../feature_column/feature_column_v2_test.py | 6 +- .../ops/embedding_variable_ops_gpu_test.py | 7 +- .../python/ops/embedding_variable_ops_test.py | 64 ++++++++++--------- tensorflow/python/training/incr_ckpt_test.py | 5 +- tensorflow/python/training/saver.py | 11 ++++ tensorflow/python/training/saver_test.py | 6 ++ 22 files changed, 76 insertions(+), 71 deletions(-) diff --git a/modelzoo/bst/train.py b/modelzoo/bst/train.py index eeeb136678b..536ddbc6905 100644 --- a/modelzoo/bst/train.py +++ 
b/modelzoo/bst/train.py @@ -612,10 +612,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dbmtl/train.py b/modelzoo/dbmtl/train.py index c848cbc76b2..36f2685a175 100644 --- a/modelzoo/dbmtl/train.py +++ b/modelzoo/dbmtl/train.py @@ -527,10 +527,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dcn/train.py b/modelzoo/dcn/train.py index 44701e22d9f..5094a18bd85 100644 --- a/modelzoo/dcn/train.py +++ b/modelzoo/dcn/train.py @@ -594,10 +594,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dcnv2/train.py b/modelzoo/dcnv2/train.py index 5b572af0425..c1346ad6d7d 100644 --- a/modelzoo/dcnv2/train.py +++ b/modelzoo/dcnv2/train.py @@ -610,10 +610,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - 
sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/deepfm/train.py b/modelzoo/deepfm/train.py index 166bedec0d0..89b2b823a46 100644 --- a/modelzoo/deepfm/train.py +++ b/modelzoo/deepfm/train.py @@ -472,10 +472,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dien/train.py b/modelzoo/dien/train.py index 190695f6ce0..f43fd2f1e73 100644 --- a/modelzoo/dien/train.py +++ b/modelzoo/dien/train.py @@ -776,10 +776,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/din/train.py b/modelzoo/din/train.py index 058583ce6fd..34621dee45e 100644 --- a/modelzoo/din/train.py +++ b/modelzoo/din/train.py @@ -594,10 +594,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( 
local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dlrm/train.py b/modelzoo/dlrm/train.py index cc4c045c349..9dff32aca52 100644 --- a/modelzoo/dlrm/train.py +++ b/modelzoo/dlrm/train.py @@ -507,10 +507,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dssm/train.py b/modelzoo/dssm/train.py index db949aac5e8..9d2264d9ce9 100644 --- a/modelzoo/dssm/train.py +++ b/modelzoo/dssm/train.py @@ -478,10 +478,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/esmm/train.py b/modelzoo/esmm/train.py index 073b08814d4..1916ed76c27 100755 --- a/modelzoo/esmm/train.py +++ b/modelzoo/esmm/train.py @@ -534,10 +534,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - 
saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=train_steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/masknet/train.py b/modelzoo/masknet/train.py index bb96a467701..bb9eee0ec3f 100644 --- a/modelzoo/masknet/train.py +++ b/modelzoo/masknet/train.py @@ -529,10 +529,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/mlperf/train.py b/modelzoo/mlperf/train.py index ce34fe5e55c..559e4fb6efc 100644 --- a/modelzoo/mlperf/train.py +++ b/modelzoo/mlperf/train.py @@ -522,10 +522,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/mmoe/train.py b/modelzoo/mmoe/train.py index 694eb45da80..a3a6c9146d8 100644 --- a/modelzoo/mmoe/train.py +++ b/modelzoo/mmoe/train.py @@ -523,10 +523,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + 
saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/ple/train.py b/modelzoo/ple/train.py index b2d2f2057ec..33aa9a15e8e 100644 --- a/modelzoo/ple/train.py +++ b/modelzoo/ple/train.py @@ -592,10 +592,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/simple_multitask/train.py b/modelzoo/simple_multitask/train.py index 4ef1874a521..6eb51f7d4e9 100644 --- a/modelzoo/simple_multitask/train.py +++ b/modelzoo/simple_multitask/train.py @@ -427,10 +427,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=train_steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/wide_and_deep/train.py b/modelzoo/wide_and_deep/train.py index 3024f58024e..2d1c964e593 100644 --- a/modelzoo/wide_and_deep/train.py +++ b/modelzoo/wide_and_deep/train.py @@ -543,10 +543,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + 
saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index 7946aee1e1a..24f8a36daa4 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -7527,7 +7527,7 @@ def testEmbeddingVariableForL2FeatureEviction(self): opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables_lib.global_variables_initializer() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) @@ -7758,7 +7758,7 @@ def testEmbeddingVariableForSharedEmbeddingColumnsWithPartitionNum(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) init = variables_lib.global_variables_initializer() - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) @test_util.run_deprecated_v1 def testEmbeddingVariableForInt32ID(self): @@ -7783,7 +7783,7 @@ def testEmbeddingVariableForInt32ID(self): opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables_lib.global_variables_initializer() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) diff --git a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py index d47d94d0d99..3c69153ab1b 100644 --- a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py +++ 
b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py @@ -63,7 +63,8 @@ def testEmbeddingVariableForInitFromProto(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) graph = ops.get_default_graph() - meta_graph_def = saver_module.export_meta_graph() + saver = saver_module.Saver(sharded=True) + meta_graph_def = saver_module.export_meta_graph(saver_def=saver.as_saver_def()) ops.reset_default_graph() with self.test_session() as sess: res = saver_module.import_meta_graph(meta_graph_def) @@ -748,7 +749,7 @@ def testSaveV3(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) init = variables.global_variables_initializer() - saver = saver = saver_module.Saver() + saver = saver = saver_module.Saver(sharded=True) checkpoint_directory = self.get_temp_dir() model_path = os.path.join(checkpoint_directory, "model.ckpt") with self.test_session() as sess: @@ -816,7 +817,7 @@ def testEmbeddingVariableSaveAndRestoreOptimzierStatesForMultiTierWithHbm(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) graph = ops.get_default_graph() with self.test_session(graph = graph) as sess: saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index dbf254d5f14..1119fd1c194 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -162,7 +162,7 @@ def _RecordFreqTestTemplate(self, optimizer): opt = self._CreateOptimizer(optimizer) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, 
"model1.ckpt") @@ -194,7 +194,7 @@ def _RecordVersionTemplate(self, optimizer): opt = self._CreateOptimizer(optimizer) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -232,7 +232,7 @@ def testSaveVersionWithGlobalStepEviction(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) init = variables.global_variables_initializer() - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) model_path = os.path.join(checkpoint_directory, "model.ckpt") with self.test_session() as sess: sess.run([init]) @@ -269,7 +269,7 @@ def testFeatureColumnRecordFreqWithPartition(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -313,7 +313,7 @@ def testFeatureColumnRecordFreqSGDWithPartition(self): opt = gradient_descent.GradientDescentOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -387,7 +387,8 @@ def testDynamicEmbeddingVariableForInitFromProto(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) graph = ops.get_default_graph() - meta_graph_def = saver_module.export_meta_graph() + saver = saver_module.Saver(sharded=True) + meta_graph_def = saver_module.export_meta_graph(saver_def=saver.as_saver_def()) ops.reset_default_graph() with self.test_session() as sess: res = saver_module.import_meta_graph(meta_graph_def) @@ -406,7 +407,8 
@@ def testEmbeddingVariableForInitFromProto(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) graph = ops.get_default_graph() - meta_graph_def = saver_module.export_meta_graph() + saver = saver_module.Saver(sharded=True) + meta_graph_def = saver_module.export_meta_graph(saver_def=saver.as_saver_def()) ops.reset_default_graph() with self.test_session() as sess: res = saver_module.import_meta_graph(meta_graph_def) @@ -450,7 +452,7 @@ def testEmbeddingVariableForLookupInt32(self): opt = adam.AdamOptimizer(0.01) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) @@ -643,7 +645,7 @@ def testEmbeddingVariableForL2FeatureEvictionFromContribFeatureColumn(self): opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) @@ -682,7 +684,7 @@ def testEmbeddingVariableForGlobalStepEviction(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session() as sess: sess.run([init]) @@ -720,7 +722,7 @@ def testEmbeddingVariableForL2FeatureEviction(self): opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = 
variables.global_variables_initializer() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) @@ -1534,7 +1536,7 @@ def testEmbeddingVariableForSaveFreq(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) model_path = os.path.join(checkpoint_directory, "model.ckpt") with self.test_session() as sess: sess.run([init]) @@ -1567,7 +1569,7 @@ def testEmbeddingVariableForL2FeatureEvictionDRAM(self): opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) @@ -1724,7 +1726,7 @@ def runTestAdagrad(self, var, g): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -1778,7 +1780,7 @@ def runTestAdagrad(self, var, g): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -1849,7 +1851,7 @@ def runTestAdagrad(self, var, g): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() 
model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -1923,7 +1925,7 @@ def testEmbeddingVariableForRecordFreq(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -1963,7 +1965,7 @@ def testEmbeddingVariableForRecordFreqWithCounterFilter(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -2278,7 +2280,7 @@ def testEmbeddingVariableForContirbFeatureColumnWithPartitionNum(self): opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) def testSaveV3(self): print("testSaveV3") @@ -2295,7 +2297,7 @@ def testSaveV3(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) init = variables.global_variables_initializer() - saver = saver = saver_module.Saver() + saver = saver = saver_module.Saver(sharded=True) checkpoint_directory = self.get_temp_dir() model_path = os.path.join(checkpoint_directory, "model.ckpt") with self.test_session() as sess: @@ -2326,7 +2328,7 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -2359,7 +2361,7 @@ def 
testEmbeddingVariableForSaveUnfilterFeature(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -2390,7 +2392,7 @@ def testEmbeddingVariableForMultiTierInference(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session() as sess: sess.run([init]) @@ -2412,7 +2414,7 @@ def testEmbeddingVariableForMultiTierInference(self): emb = embedding_ops.embedding_lookup(emb_var, ids) tires = kv_variable_ops.lookup_tier(emb_var, math_ops.cast([1,2,3,4], dtypes.int64)) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) graph = ops.get_default_graph() with self.test_session(graph = graph) as sess: saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt")) @@ -2784,7 +2786,7 @@ def testSetInitializedWithoutRestore(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) with self.test_session() as sess: result = sess.run(var._is_initialized_op) self.assertEqual(False, result) @@ -2806,7 +2808,7 @@ def testSetInitializedWithRestore(self): opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: sess.run([init]) @@ -2823,7 +2825,7 @@ def testSetInitializedWithRestore(self): opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) g_v = 
opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: result = sess.run(var._is_initialized_op) @@ -2860,7 +2862,7 @@ def testCountsTensor(self): opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: sess.run([init]) @@ -2893,7 +2895,7 @@ def testCountsWithSparseAndDenseTensor(self): opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: sess.run([init]) @@ -2929,7 +2931,7 @@ def testCountsTensorWithGradientDescent(self): opt = gradient_descent.GradientDescentOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: sess.run([init]) @@ -2964,7 +2966,7 @@ def testCountsDenseAndSparseTensorWithGradientDescent(self): opt = gradient_descent.GradientDescentOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: sess.run([init]) diff --git a/tensorflow/python/training/incr_ckpt_test.py b/tensorflow/python/training/incr_ckpt_test.py index 55cf748a9d6..849c73a44dc 100644 --- a/tensorflow/python/training/incr_ckpt_test.py +++ b/tensorflow/python/training/incr_ckpt_test.py @@ 
-75,7 +75,7 @@ def testSparseEvIncrSaveRestore(self): emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) with ops.device("/device:CPU:0"): apply_incr = gen_io_ops.record_sparse_indices(math_ops.cast([0,1,2,5,6,7], dtypes.int64), "var_ev1") - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() ev_var_name = "var_ev1" incr_save_op = gen_io_ops.incr_save(incr_ckpt_path, [ev_var_name], [], [True],[var.handle]) @@ -178,7 +178,7 @@ def testMixIncrSaveRestore(self): activate_op = gen_io_ops. activate_sparse_recorder(["var_ev1","var_norm1"]) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() incr_save_op = gen_io_ops.incr_save(incr_ckpt_path, ["var_norm1", "var_ev1"], [], [True, True], [var_norm, var_ev.handle]) @@ -445,6 +445,7 @@ def testIncrementalSaverForResourceVariable(self): variable_scope.get_variable('var', shape=[100], use_resource=False) variable_scope.get_embedding_variable('ev', embedding_dim=100) saver = saver_module.Saver( + sharded=True, save_relative_paths=True, incremental_save_restore=True, ) diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py index acc9723c183..e70226f2968 100644 --- a/tensorflow/python/training/saver.py +++ b/tensorflow/python/training/saver.py @@ -1071,10 +1071,14 @@ def _build(self, checkpoint_path, build_save, build_restore): # pylint: disable=protected-access self._var_list = variables._all_saveable_objects() from tensorflow.python.ops import hash_table + from tensorflow.python.ops import kv_variable_ops if isinstance(self._var_list, dict): + ev = {} ht = {} lst = {} for name, x in self._var_list.items(): + if isinstance(x, kv_variable_ops.EmbeddingVariable): + ev[name] = x if isinstance(x, hash_table.HashTable): if x.hash_table not in ht: ht[x.hash_table] = [x] @@ -1084,15 +1088,20 @@ def _build(self, checkpoint_path, 
build_save, build_restore): lst[name] = BloomFilterSaveable(x) else: lst[name] = x + if len(ev) != 0 and not self._sharded: + raise ValueError("EmbeddingVariable can only use sharded saver") if len(ht) != 0 and not self._sharded: raise ValueError("HashTable can only use sharded saver") for x, y in ht.items(): lst[x.name] = HashTableSaveable(y) self._var_list = lst else: + ev = [] ht = {} lst = [] for x in self._var_list: + if isinstance(x, kv_variable_ops.EmbeddingVariable): + ev.append(x) if isinstance(x, hash_table.HashTable): if x.hash_table not in ht: ht[x.hash_table] = [x] @@ -1102,6 +1111,8 @@ def _build(self, checkpoint_path, build_save, build_restore): lst.append(BloomFilterSaveable(x)) else: lst.append(x) + if len(ev) != 0 and not self._sharded: + raise ValueError("EmbeddingVariable can only use sharded saver") if len(ht) != 0 and not self._sharded: raise ValueError("HashTable can only use sharded saver") for x, y in ht.items(): diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py index b48f00d0c14..365ef85af1d 100644 --- a/tensorflow/python/training/saver_test.py +++ b/tensorflow/python/training/saver_test.py @@ -852,6 +852,12 @@ def _model(): for orig, restored in zip(orig_vals, restored_vals): self.assertAllEqual(orig, restored) + def testEnableSaverShardedWhenUseEmbeddingVariable(self): + with ops_lib.Graph().as_default(): + emb_var = \ + variable_scope.get_embedding_variable(name="emb_var", embedding_dim=64) + with self.assertRaisesRegexp(ValueError, "EmbeddingVariable"): + saver_module.Saver([emb_var], sharded=False) class SaveRestoreShardedTest(test.TestCase): From d1c5a6e9aa2ec62da93f6719c6755293cf6406a5 Mon Sep 17 00:00:00 2001 From: LightWang4 <303176469@qq.com> Date: Tue, 21 Jan 2025 17:54:28 +0800 Subject: [PATCH 45/45] [Embedding] Fix op dependency in init_from_checkpoint API. 
(#1012) Signed-off-by: lightwang --- tensorflow/python/training/checkpoint_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py index db887fa12f1..d87a9f1b39b 100644 --- a/tensorflow/python/training/checkpoint_utils.py +++ b/tensorflow/python/training/checkpoint_utils.py @@ -443,7 +443,8 @@ def _set_checkpoint_initializer(variable, is_partitioned_ev = variable._save_slice_info is not None partition_id = variable._save_slice_info.var_offset[0] if is_partitioned_ev else 0 partition_num = variable._save_slice_info.full_shape[0] if is_partitioned_ev else 1 - with ops.control_dependencies([variable._initializer_op]): + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY)[0] + with ops.control_dependencies(restore_dependency[variable._primary_handle]): rank = variable.initial_value.get_shape().rank - 1 restore_op = gen_kv_variable_ops.kv_resource_import_v3( ckpt_file,