From 419162720dcfc543a84873b24772a262bc1de6b3 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Tue, 8 Aug 2023 20:15:06 +0800 Subject: [PATCH 01/45] [Docs] Update deeprec2306 release images and notes in README.md & RELEASE.md. (#922) Signed-off-by: candy.dc --- README.md | 4 +- RELEASE.md | 84 +++++++++++++++++++ docs/docs_en/DeepRec-Compile-And-Install.md | 4 +- docs/docs_en/Estimator-Compile-And-Install.md | 2 +- docs/docs_en/TFServing-Compile-And-Install.md | 2 +- docs/docs_zh/DeepRec-Compile-And-Install.md | 4 +- docs/docs_zh/Estimator-Compile-And-Install.md | 2 +- docs/docs_zh/TFServing-Compile-And-Install.md | 2 +- 8 files changed, 94 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 927afe31480..53cca5c5c83 100644 --- a/README.md +++ b/README.md @@ -95,13 +95,13 @@ $ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux #### Image for CPU ``` -alideeprec/deeprec-release:deeprec2304-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 ``` #### Image for GPU CUDA11.6 ``` -alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 ``` *** diff --git a/RELEASE.md b/RELEASE.md index d41d9e569ad..43e03bc2b49 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,87 @@ +# Release r1.15.5-deeprec2306 + +## **Major Features and Improvements** + +### **Embedding** + +- Support StaticGPUHashMap to optimize EmbeddingVariable in inference. +- Update logic of GroupEmbedding in feature_column API. +- Refine APIs for foward-backward optimization. +- Move insertions of new features into the backward process when lti-tier storage. +- Move insertion of new features into the backward ops. +- Modify calculation logic of embedding lookup sparse combiner. +- Add memory and performance tests of EmbeddingVariable. + +### **Graph & Grappler Optimization** + +- Support IteratorGetNext for SmartStage as a starting node for searching. 
+- Reimplement PrefetchRunner in C++. + +### **Runtime Optimization** + +- Dispatch expensive ops via multiple threads in theadpool. +- Enable multi-stream in session_group by default. +- Support for loading saved_model with device information when use p and multi_stream. +- Make ARENA_ARRAY_SIZE to be configurable. +- Optimize EV allocator performance. +- Integrate HybridBackend in collective training mode. + +### **Ops & Hardware Acceleration** + +- Disable MatMul fused with LeakyRule when MKL is disabled. + +### **Serving** + +- Clear virtual_device configurations before load new checkpoint. + +### **Environment & Build** + +- Update docker images in user documents. +- Update DEFAULT_CUDA_VERSION and DEFAULT_CUDNN_VERSION in configure.py. +- Move thirdparties from WORKSPACE to workspace.bzl. +- Update urls corresponding to colm, ragel, aliyun-oss-sdk and uuid. +- Update default TF_CUDA_COMPUTE_CAPABILITIES to 7.0,7.5,8.0,8.6. +- Update SparseOperationKit to v23.5.01 and docker file. + +### **BugFix** + +- Fix issue of missing params while constructing the ngScope. +- Fix memory leak to avoid OOM. +- Fix shape validation in API shared_embedding_columns. +- Fix the device placement bug of stage_subgraph_on_cpu in distributed. +- Fix hung issue when using both SOK and SmartStaged simultaneously. +- Fix bug: init global_step before saving variables +- Fix bug: reserve input nodes, clear saver devices on demand. +- Fix memory leak when a graph node is invalid. + +### **ModelZoo** + +- Add examples and docs to demonstrate Collective Training. +- Update documents and config files for modelzoo benchmark. +- Update modelzoo README. + +### **Tool & Documents** + +- Update cases of configure TF_CUDA_COMPUTE_CAPABILITIES for H100. +- Update COMMITTERS.md. +- Update device placement documents. +- Update document for SmartStage. +- Update session_group documents. +- Update the download link of the library that Processor depends on. +- Update sok to 1.20. 
+ +More details of features: [https://deeprec.readthedocs.io/zh/latest/](url) + +## **Release Images** + +### **CPU Image** + +`alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04` + +### **GPU Image** + +`alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04` + # Release r1.15.5-deeprec2304 ## **Major Features and Improvements** diff --git a/docs/docs_en/DeepRec-Compile-And-Install.md b/docs/docs_en/DeepRec-Compile-And-Install.md index 0a170177353..83ba4854b9f 100644 --- a/docs/docs_en/DeepRec-Compile-And-Install.md +++ b/docs/docs_en/DeepRec-Compile-And-Install.md @@ -111,7 +111,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2304-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 ``` arm64: @@ -122,5 +122,5 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU Image with CUDA 11.6** ``` -alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 ``` diff --git a/docs/docs_en/Estimator-Compile-And-Install.md b/docs/docs_en/Estimator-Compile-And-Install.md index cdc04044875..73b6a36f318 100644 --- a/docs/docs_en/Estimator-Compile-And-Install.md +++ b/docs/docs_en/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ DeepRec provide new distributed protocols such as grpc++ and star_server, which Source Code: [https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -Develop Branch:master, Latest Release Branch: deeprec2304 +Develop Branch:master, Latest Release Branch: deeprec2306 ## Estimator Build diff --git a/docs/docs_en/TFServing-Compile-And-Install.md b/docs/docs_en/TFServing-Compile-And-Install.md index 8ced3628673..346a848ca74 100644 --- a/docs/docs_en/TFServing-Compile-And-Install.md +++ b/docs/docs_en/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ We provide optimized TFServing which could highly 
improve performance in inferen Source Code: [https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -Develop Branch: master, Latest Release Branch: deeprec2304 +Develop Branch: master, Latest Release Branch: deeprec2306 ## TFServing Build diff --git a/docs/docs_zh/DeepRec-Compile-And-Install.md b/docs/docs_zh/DeepRec-Compile-And-Install.md index 20df07aa252..08d249f8eeb 100644 --- a/docs/docs_zh/DeepRec-Compile-And-Install.md +++ b/docs/docs_zh/DeepRec-Compile-And-Install.md @@ -108,7 +108,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2304-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 ``` arm64: @@ -119,7 +119,7 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU CUDA11.6镜像** ``` -alideeprec/deeprec-release:deeprec2304-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 ``` ## DeepRec Processor编译打包 diff --git a/docs/docs_zh/Estimator-Compile-And-Install.md b/docs/docs_zh/Estimator-Compile-And-Install.md index 332b96e6086..e5455aae91a 100644 --- a/docs/docs_zh/Estimator-Compile-And-Install.md +++ b/docs/docs_zh/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ 代码库:[https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -开发分支:master,最新Release分支:deeprec2304 +开发分支:master,最新Release分支:deeprec2306 ## Estimator编译 diff --git a/docs/docs_zh/TFServing-Compile-And-Install.md b/docs/docs_zh/TFServing-Compile-And-Install.md index 27bfc864e4e..0c76400e6c6 100644 --- a/docs/docs_zh/TFServing-Compile-And-Install.md +++ b/docs/docs_zh/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ 代码库:[https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -开发分支:master,最新Release分支:deeprec2304 +开发分支:master,最新Release分支:deeprec2306 ## TFServing编译&打包 From 4983e027e2eae258a82b34ee19b8ae2cb59e6c56 Mon Sep 17 00:00:00 2001 From: shijieliu Date: 
Wed, 9 Aug 2023 11:26:59 +0800 Subject: [PATCH 02/45] [Distributed] Fix wgrad bug in Sparse Operation Kit. (#918) Use new_git_repository to manage sok dependency,update sok for fixing localized mode wgrad bug. Signed-off-by: aleliu --- tensorflow/tools/pip_package/build_sok.sh | 3 +-- tensorflow/workspace.bzl | 11 +++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/tensorflow/tools/pip_package/build_sok.sh b/tensorflow/tools/pip_package/build_sok.sh index 2c99ceb5ac1..3860f5fdcff 100755 --- a/tensorflow/tools/pip_package/build_sok.sh +++ b/tensorflow/tools/pip_package/build_sok.sh @@ -16,5 +16,4 @@ export MAKEFLAGS=-j$(nproc) export SOK_COMPILE_GPU_SM="70;75;80" cd ./bazel-DeepRec/external/hugectr/sparse_operation_kit -"${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel -pip install ./dist/merlin_sok-1.2.0-cp38-cp38-linux_x86_64.whl +"${PYTHON_BIN_PATH:-python}" setup.py install diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 3495efd182d..540f733b2ea 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -1369,13 +1369,12 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ], ) - http_archive( + new_git_repository( name = "hugectr", # Apache License 2.0 - build_file = "//third_party:hugectr.BUILD", - strip_prefix = "HugeCTR-23.06.00", - urls = [ - "https://github.com/NVIDIA-Merlin/HugeCTR/archive/refs/tags/v23.06.00.tar.gz", - ], + build_file = "//third_party:hugectr.BUILD", + commit = "869028c1c32bdcda2f18efc88d54f0527ed28d6d", + init_submodules = True, + remote = "https://github.com/NVIDIA-Merlin/HugeCTR.git", ) def tf_bind(): From f09e5ec0c1a2424727f8ffc5eaf98b771c4b374e Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Fri, 11 Aug 2023 14:02:44 +0800 Subject: [PATCH 03/45] [Embedding] Add GetSnapshot and Create API for EmbeddingVariable. 
(#923) Signed-off-by: lixy9474 --- .../core/framework/embedding/embedding_var.h | 35 ++++++++++++ .../framework/embedding/eviction_manager.h | 5 +- .../kernels/embedding_variable_ops_test.cc | 54 +++++++++++++++++-- 3 files changed, 88 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 9a5b5cf9a19..b29493f2169 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -186,6 +186,13 @@ class EmbeddingVar : public ResourceBase { } } + Status Insert(K key, V* value) { + ValuePtr* value_ptr = nullptr; + CreateKey(key, &value_ptr, true); + LookupOrCreateEmb(value_ptr, value); + return Status::OK(); + } + Status LookupOrCreateKey(K key, ValuePtr** value_ptr) { Status s = storage_->GetOrCreate(key, value_ptr, emb_config_.total_num(storage_->GetAllocLen())); @@ -592,6 +599,34 @@ class EmbeddingVar : public ResourceBase { default_value_); } + void GetSnapshot(std::vector* key_list, + std::vector* value_list, + std::vector* version_list, + std::vector* freq_list) { + std::vector*> value_ptr_list; + storage_->GetSnapshot(key_list, &value_ptr_list); + bool is_save_freq = emb_config_.is_save_freq(); + bool is_save_version = emb_config_.is_save_version(); + for (int64 i = 0; i < key_list->size(); i++) { + V* val = value_ptr_list[i]->GetValue(emb_config_.emb_index, 0); + if (val != nullptr) { + value_list->emplace_back(val); + } else { + value_list->emplace_back(default_value_); + } + + if(is_save_version) { + int64 dump_version = value_ptr_list[i]->GetStep(); + version_list->emplace_back(dump_version); + } + + if(is_save_freq) { + int64 dump_freq = value_ptr_list[i]->GetFreq(); + freq_list->emplace_back(dump_freq); + } + } + } + mutex* mu() { return &mu_; } diff --git a/tensorflow/core/framework/embedding/eviction_manager.h b/tensorflow/core/framework/embedding/eviction_manager.h index b5a78765170..ca646c9b420 
100644 --- a/tensorflow/core/framework/embedding/eviction_manager.h +++ b/tensorflow/core/framework/embedding/eviction_manager.h @@ -47,8 +47,7 @@ class EvictionManager { "EVICTION_MANAGER", 3, /*low_latency_hint=*/false)); } - ~EvictionManager() { - } + ~EvictionManager() {} TF_DISALLOW_COPY_AND_ASSIGN(EvictionManager); @@ -124,8 +123,8 @@ class EvictionManager { int64 num_of_threads_; int64 num_of_active_threads_; std::atomic_flag flag_ = ATOMIC_FLAG_INIT; - std::unique_ptr thread_pool_; std::map*, StorageItem*> storage_table_; + std::unique_ptr thread_pool_; mutex mu_; }; diff --git a/tensorflow/core/kernels/embedding_variable_ops_test.cc b/tensorflow/core/kernels/embedding_variable_ops_test.cc index eff4b77c2dc..4839c171708 100644 --- a/tensorflow/core/kernels/embedding_variable_ops_test.cc +++ b/tensorflow/core/kernels/embedding_variable_ops_test.cc @@ -1191,6 +1191,7 @@ TEST(EmbeddingVariableTest, TestLFUCache) { } TEST(EmbeddingVariableTest, TestCacheRestore) { + setenv("TF_SSDHASH_ASYNC_COMPACTION", "false", 1); int64 value_size = 4; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); @@ -1237,8 +1238,11 @@ TEST(EmbeddingVariableTest, TestCacheRestore) { LOG(INFO) << "size:" << variable->Size(); BundleWriter writer(Env::Default(), Prefix("foo")); - DumpEmbeddingValues(variable, "var/part_0", &writer, &part_offset_tensor); - TF_ASSERT_OK(writer.Finish()); + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = 1; + variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args); + TF_ASSERT_OK(writer.Finish()); + variable->Unref(); auto imported_storage= embedding::StorageFactory::Create( embedding::StorageConfig(embedding::DRAM_SSDHASH, @@ -1258,6 +1262,7 @@ TEST(EmbeddingVariableTest, TestCacheRestore) { ASSERT_EQ(imported_storage->Size(0), ev_size - cache_size); ASSERT_EQ(imported_storage->Size(1), 2); + delete imported_storage; } void t1_gpu(KVInterface* hashmap) { @@ -1703,7 +1708,50 @@ 
TEST(EmbeddingVariableTest, TestLookupRemoveConcurrency) { for (auto &t : insert_threads) { t.join(); } - } +} + +TEST(EmbeddingVariableTest, TestInsertAndGetSnapshot) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + auto emb_config = EmbeddingConfig( + /*emb_index = */0, /*primary_emb_index = */0, + /*block_num = */1, /*slot_num = */0, + /*name = */"", /*steps_to_live = */0, + /*filter_freq = */0, /*max_freq = */999999, + /*l2_weight_threshold = */-1.0, /*layout = */"normal", + /*max_element_size = */0, /*false_positive_probability = */-1.0, + /*counter_type = */DT_UINT64); + auto storage = embedding::StorageFactory::Create( + embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); + auto var = new EmbeddingVar("EmbeddingVar", + storage, + emb_config, + cpu_allocator()); + var->Init(value, 1); + float* set_value = (float*)malloc(value_size * sizeof(float)); + //Insertion + for (int i = 0; i < 100; i++) { + for (int j = 0; j < value_size; j++) { + set_value[j] = i + j; + } + var->Insert(i, set_value); + } + free(set_value); + //GetSnapshot + std::vector key_list; + std::vector value_ptr_list; + std::vector version_list; + std::vector freq_list; + var->GetSnapshot(&key_list, &value_ptr_list, + &version_list, &freq_list); + for (int i = 0; i < key_list.size(); i++) { + ASSERT_EQ(key_list[i], i); + for (int j = 0; j < value_size; j++) { + ASSERT_EQ(value_ptr_list[i][j], i + j); + } + } +} } // namespace } // namespace embedding From 8d8e16aae66add22cf8a4812d549c83f3569ef13 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Fri, 11 Aug 2023 18:00:40 +0800 Subject: [PATCH 04/45] [Embedding] Fix set initialized flag too early in restore subgraph. 
(#920) Signed-off-by: lixy9474 --- .../core/framework/embedding/config.proto | 4 ++ .../framework/embedding/multi_tier_storage.h | 10 +-- tensorflow/core/framework/variable.proto | 2 + tensorflow/core/kernels/kv_variable_ops.cc | 28 ++++---- .../python/ops/embedding_variable_ops_test.py | 65 +++++++++++++++++++ tensorflow/python/ops/kv_variable_ops.py | 52 +++++++++++++++ tensorflow/python/training/optimizer.py | 3 +- .../training/saving/saveable_object_util.py | 2 +- tensorflow/python/training/slot_creator.py | 18 +++-- 9 files changed, 158 insertions(+), 26 deletions(-) diff --git a/tensorflow/core/framework/embedding/config.proto b/tensorflow/core/framework/embedding/config.proto index 3d5fae9f6ad..a8535347020 100644 --- a/tensorflow/core/framework/embedding/config.proto +++ b/tensorflow/core/framework/embedding/config.proto @@ -56,3 +56,7 @@ enum ValuePosition { IN_DRAM = 0; NOT_IN_DRAM = 1; } + +enum IsSetInitialized { + NOT_SET_INITAILIZED = 0; +} diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index ff18425ad9a..8239d109e64 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -81,10 +81,12 @@ class MultiTierStorage : public Storage { } void InitCache(embedding::CacheStrategy cache_strategy) override { - cache_ = CacheFactory::Create(cache_strategy, name_); - eviction_manager_ = EvictionManagerCreator::Create(); - eviction_manager_->AddStorage(this); - cache_thread_pool_ = CacheThreadPoolCreator::Create(); + if (cache_ == nullptr) { + cache_ = CacheFactory::Create(cache_strategy, name_); + eviction_manager_ = EvictionManagerCreator::Create(); + eviction_manager_->AddStorage(this); + cache_thread_pool_ = CacheThreadPoolCreator::Create(); + } } Status BatchCommit(const std::vector& keys, diff --git a/tensorflow/core/framework/variable.proto b/tensorflow/core/framework/variable.proto index 
79ccd107628..5f9e0f16b5d 100644 --- a/tensorflow/core/framework/variable.proto +++ b/tensorflow/core/framework/variable.proto @@ -74,6 +74,8 @@ message VariableDef { // EmebddingVariable bool is_embedding_var = 91; + + string initialize_op_for_restore = 92; } message SaveSliceInfoDef { diff --git a/tensorflow/core/kernels/kv_variable_ops.cc b/tensorflow/core/kernels/kv_variable_ops.cc index 20ea6d3cb61..8a01a7bf2cd 100644 --- a/tensorflow/core/kernels/kv_variable_ops.cc +++ b/tensorflow/core/kernels/kv_variable_ops.cc @@ -43,11 +43,6 @@ limitations under the License. namespace tensorflow { -namespace { -const int64 kEmbeddingVarUseDB = -214; -const int64 kInitializableEmbeddingVarUseDB = -215; -} - Status MoveMatchingFiles( Env* env, const tstring& pattern, @@ -207,6 +202,15 @@ class InitializeKvVariableOp : public OpKernel { (embedding_var_type == embedding::EmbeddingVariableType::IMMUTABLE); + //initial_num_buckets is useless, so is used to set is_set_initialized_. + int64 initial_num_buckets = 0; + OP_REQUIRES_OK(c, c->GetAttr("initial_num_buckets", &initial_num_buckets)); + is_set_initialized_ = true; + if (initial_num_buckets == + embedding::IsSetInitialized::NOT_SET_INITAILIZED) { + is_set_initialized_ = false; + } + int64 storage_type = 0; OP_REQUIRES_OK(c, c->GetAttr("storage_type", &storage_type)); storage_type_ = static_cast(storage_type); @@ -263,15 +267,10 @@ class InitializeKvVariableOp : public OpKernel { " should be DRAM when layout is 'compact'.")); } - if (steps_to_live_ == kEmbeddingVarUseDB || - steps_to_live_ == kInitializableEmbeddingVarUseDB) { - LOG(INFO) << "hashmap use db"; - //use_db_ = true; - } else { - OP_REQUIRES(c, steps_to_live_ >= 0, - errors::InvalidArgument( + OP_REQUIRES(c, steps_to_live_ >= 0, + errors::InvalidArgument( "steps_to_live must >= 0, ", std::to_string(steps_to_live_))); - } + OP_REQUIRES_OK(c, c->GetAttr("ht_type", &ht_type_)); if (embedding::StorageType::LEVELDB == storage_type_) { ht_type_ = "leveldb_kv"; @@ -406,7 
+405,7 @@ class InitializeKvVariableOp : public OpKernel { core::ScopedUnref unref_me(primary_variable); } core::ScopedUnref unref_me(ev); - if (steps_to_live_ != kEmbeddingVarUseDB) { + if (is_set_initialized_) { ev->SetInitialized(); } } @@ -436,6 +435,7 @@ class InitializeKvVariableOp : public OpKernel { bool record_freq_; bool record_version_; bool is_inference_; + bool is_set_initialized_; }; #define REGISTER_KERNELS(ktype, vtype) \ diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index d3e453df9d1..25a0cb6ff11 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -2751,5 +2751,70 @@ def testCPUFbjOptWithBloomFilter(self): self.assertNotEqual(val, 1.0) del os.environ["TF_EMBEDDING_FBJ_OPT"] + def testSetInitializedWithoutRestore(self): + print("testSetInitializedWithoutRestore") + with ops.device("/cpu:0"): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + saver = saver_module.Saver() + with self.test_session() as sess: + result = sess.run(var._is_initialized_op) + self.assertEqual(False, result) + sess.run([init]) + result = sess.run(var._is_initialized_op) + self.assertEqual(True, result) + + def testSetInitializedWithRestore(self): + print("testSetInitializedWitRestore") + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = 
variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,2 ,3], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1, 2, 3], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + result = sess.run(var._is_initialized_op) + self.assertEqual(False, result) + sess.run([var._initializer_for_restore]) + result = sess.run(var._is_initialized_op) + self.assertEqual(False, result) + + saver.restore(sess, ckpt_path) + result = sess.run(var._is_initialized_op) + self.assertEqual(True, result) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index e6140c9c149..701c03f6975 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -434,6 +434,8 @@ def is_multi_tier(storage_type): with ops.control_dependencies(set_attr_ops + [self._init_op]): self._initializer_op = 
control_flow_ops.no_op() + self.create_init_op_for_restore(name, initial_value, invalid_key, rank) + self._graph_element = self._handle self._cached_value = None if not context.executing_eagerly(): @@ -444,6 +446,49 @@ def is_multi_tier(storage_type): def export(self): return gen_kv_variable_ops.kv_resource_export(self._handle, Tkeys=self._invalid_key_type) + + def create_init_op_for_restore(self, name, initial_value, invalid_key, rank): + with ops.control_dependencies(None if self._is_primary else [self._primary._init_op_for_restore]): + self._initializer_for_restore = gen_kv_variable_ops.initialize_kv_variable_v2_op( + self._handle, + self._primary._handle, + variables._try_guard_against_uninitialized_dependencies(name, initial_value), + ops.convert_to_tensor(invalid_key), + initial_num_buckets=config_pb2.IsSetInitialized.NOT_SET_INITAILIZED, + slot_num = self._slot_num, + shape=initial_value.get_shape()[rank:], + steps_to_live=self._steps_to_live, + emb_index=self._emb_index, block_num=self.block_num, + slot_index=self._slot_index, + ht_type=self._ht_type, + ht_partition_num=self._ht_partition_num, + filter_freq = self._filter_freq, + l2_weight_threshold = self._l2_weight_threshold, + max_element_size = self._max_element_size, + false_positive_probability = self._false_positive_probability, + counter_type = self._counter_type, + max_freq = 99999, + layout = self._layout, + storage_type = self._storage_type, + storage_path = self._storage_path, + storage_size = self._storage_size, + default_value_dim = self._default_value_dim, + default_value_no_permission = self._default_value_no_permission, + record_freq = self._record_freq, + record_version = self._record_version, + embedding_variable_type=config_pb2.EmbeddingVariableType.IMMUTABLE) + set_attr_ops = [] + if self._is_primary and self._is_multi_tier: + with ops.control_dependencies([self._initializer_for_restore]): + set_cache_op = gen_kv_variable_ops.kv_resource_init_cache_strategy_op( + self._handle, + 
cache_strategy=self._storage_cache_strategy, + Tkeys=self._invalid_key_type, + dtype=self._dtype) + set_attr_ops.append(set_cache_op) + with ops.control_dependencies(set_attr_ops + [self._initializer_for_restore]): + self._init_op_for_restore = control_flow_ops.no_op() + def need_counts(self): return (self._record_freq or (self._filter_freq > 0) or self._is_multi_tier) @property @@ -482,6 +527,11 @@ def _init_from_proto(self, variable_def, import_scope=None): cache_op = op elif self._initializer_op.type == "InitializeKvVariableOp": init_op = self._initializer_op + + self._init_op_for_restore = g.as_graph_element( + ops.prepend_name_scope( + variable_def.initialize_op_for_restore, + import_scope=import_scope)) self._trainable = getattr(variable_def, "trainable", True) if variable_def.snapshot_name: self._cached_value = g.as_graph_element( @@ -842,6 +892,8 @@ def to_proto(self, export_scope=None): if self._save_slice_info: var_def.save_slice_info_def.MergeFrom( self._save_slice_info.to_proto(export_scope=export_scope)) + var_def.initialize_op_for_restore = ops.strip_name_scope( + self._init_op_for_restore.name, export_scope) return var_def else: return None diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 2b765814c0d..578d682cc11 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -243,8 +243,7 @@ def _get_processor(v): if v.op.type == "KvVarHandleOp": from tensorflow.core.framework import attr_value_pb2 from tensorflow.core.framework.embedding import config_pb2 - v._init_op._set_attr("embedding_variable_type", - attr_value_pb2.AttrValue(i=config_pb2.EmbeddingVariableType.MUTABLE)) + slot_creator._set_init_op_embedding_type_attr(v, config_pb2.EmbeddingVariableType.MUTABLE) return _DenseResourceVariableProcessor(v) if isinstance(v, variables.Variable): return _RefVariableProcessor(v) diff --git a/tensorflow/python/training/saving/saveable_object_util.py 
b/tensorflow/python/training/saving/saveable_object_util.py index cd3cba52676..0d8bfe87022 100644 --- a/tensorflow/python/training/saving/saveable_object_util.py +++ b/tensorflow/python/training/saving/saveable_object_util.py @@ -195,7 +195,7 @@ def restore(self, restored_tensors, unused_restored_shapes): if self.var._init_data_source is not None: return self.var.recover_from_init_data_source(self.var._init_data_source, self.partition_id, self.partition_num) else: - with ops.control_dependencies([self.var._initializer_op]): + with ops.control_dependencies([self.var._init_op_for_restore]): rank = self.op.initial_value.get_shape().rank - 1 restore_op = gen_kv_variable_ops.kv_resource_import_v3( restored_tensors[0], diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py index 90a820d82f6..6a359321c20 100644 --- a/tensorflow/python/training/slot_creator.py +++ b/tensorflow/python/training/slot_creator.py @@ -94,8 +94,7 @@ def _create_slot_var(primary, val, scope, validate_shape, shape, dtype, slot_con validate_shape=validate_shape, steps_to_live=primary._steps_to_live, ht_partition_num=primary._ht_partition_num) - slot._init_op._set_attr("embedding_variable_type", - attr_value_pb2.AttrValue(i=config_pb2.EmbeddingVariableType.MUTABLE)) + _set_init_op_embedding_type_attr(slot, config_pb2.EmbeddingVariableType.MUTABLE) else: filter_strategy = None if primary._filter_freq != 0: @@ -107,7 +106,7 @@ def _create_slot_var(primary, val, scope, validate_shape, shape, dtype, slot_con else: filter_strategy = variables.CounterFilter(filter_freq=primary._filter_freq) if slot_config.slot_type is config_pb2.SlotType.EMBEDDING_VARIABLE: - primary._init_op._set_attr("slot_num", attr_value_pb2.AttrValue(i=slot_config.slot_num)) + _set_init_op_slot_num_attr(primary, slot_config.slot_num) primary._slot_num = slot_config.slot_num emb_index = primary._emb_index if primary.block_num > 1: @@ -132,8 +131,7 @@ def _create_slot_var(primary, val, 
scope, validate_shape, shape, dtype, slot_con l2_weight_threshold=primary._l2_weight_threshold, filter_strategy=filter_strategy) ) - slot._init_op._set_attr("embedding_variable_type", - attr_value_pb2.AttrValue(i=config_pb2.EmbeddingVariableType.MUTABLE)) + _set_init_op_embedding_type_attr(slot, config_pb2.EmbeddingVariableType.MUTABLE) else: slot = variable_scope.get_variable( scope, @@ -300,3 +298,13 @@ def create_zeros_slot(primary, name, dtype=None, colocate_with_primary=True, slo return create_slot(primary, val, name, colocate_with_primary=colocate_with_primary, slot_config=slot_config) + +def _set_init_op_embedding_type_attr(var, embedding_type): + var._init_op._set_attr("embedding_variable_type", + attr_value_pb2.AttrValue(i=embedding_type)) + var._initializer_for_restore._set_attr("embedding_variable_type", + attr_value_pb2.AttrValue(i=embedding_type)) + +def _set_init_op_slot_num_attr(var, slot_num): + var._init_op._set_attr("slot_num", attr_value_pb2.AttrValue(i=slot_num)) + var._initializer_for_restore._set_attr("slot_num", attr_value_pb2.AttrValue(i=slot_num)) From 821d5e8d39156d477bacd2ede9f68f76ede0f77d Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Tue, 19 Sep 2023 09:56:20 +0800 Subject: [PATCH 05/45] [Embedding] Remove the dependency on private header file in EmbeddingVariable. 
(#927) Signed-off-by: lixy9474 --- tensorflow/core/BUILD | 5 +- .../framework/embedding/embedding_config.h | 3 + .../core/framework/embedding/embedding_var.h | 1 - .../embedding/embedding_var_ckpt_data.cc | 262 +++++++ .../embedding/embedding_var_ckpt_data.h | 190 +---- .../embedding/embedding_var_dump_iterator.h | 7 +- .../embedding/embedding_var_restore.cc | 647 ++++++++++++++++++ .../embedding/embedding_var_restore.h | 534 +-------------- .../core/framework/embedding/kv_interface.h | 8 +- .../embedding/ssd_record_descriptor.cc | 88 +++ .../embedding/ssd_record_descriptor.h | 49 +- tensorflow/core/framework/embedding/storage.h | 4 +- tensorflow/core/kernels/BUILD | 5 +- 13 files changed, 1041 insertions(+), 762 deletions(-) create mode 100644 tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc create mode 100644 tensorflow/core/framework/embedding/embedding_var_restore.cc create mode 100644 tensorflow/core/framework/embedding/ssd_record_descriptor.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 8ae5b4f156c..95bbbab5624 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -3026,7 +3026,10 @@ tf_cuda_library( "framework/embedding/gpu_hash_table.cu.cc", "framework/embedding/gpu_hash_table.h", "framework/embedding/embedding_var.cu.cc", - "framework/embedding/multi_tier_storage.cu.cc" + "framework/embedding/multi_tier_storage.cu.cc", + "framework/embedding/embedding_var_ckpt_data.cc", + "framework/embedding/embedding_var_restore.cc", + "framework/embedding/ssd_record_descriptor.cc" ], ) + select({ "//tensorflow:windows": [], diff --git a/tensorflow/core/framework/embedding/embedding_config.h b/tensorflow/core/framework/embedding/embedding_config.h index 0a50b492159..d47d07d4205 100644 --- a/tensorflow/core/framework/embedding/embedding_config.h +++ b/tensorflow/core/framework/embedding/embedding_config.h @@ -3,6 +3,9 @@ #include #include "tensorflow/core/framework/embedding/config.pb.h" +#include 
"tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/default/logging.h" namespace tensorflow { struct EmbeddingConfig { diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index b29493f2169..28ce5094d87 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -37,7 +37,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/storage.h" #include "tensorflow/core/framework/embedding/storage_factory.h" #include "tensorflow/core/framework/typed_allocator.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" namespace tensorflow { using CPUDevice = Eigen::ThreadPoolDevice; diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc new file mode 100644 index 00000000000..c1b43a608b5 --- /dev/null +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc @@ -0,0 +1,262 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#include "tensorflow/core/framework/embedding/embedding_var_ckpt_data.h" +#include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { +namespace embedding { +template +void EmbeddingVarCkptData::Emplace( + K key, ValuePtr* value_ptr, + const EmbeddingConfig& emb_config, + V* default_value, int64 value_offset, + bool is_save_freq, + bool is_save_version, + bool save_unfiltered_features) { + if((int64)value_ptr == ValuePtrStatus::IS_DELETED) + return; + + V* primary_val = value_ptr->GetValue(0, 0); + bool is_not_admit = + primary_val == nullptr + && emb_config.filter_freq != 0; + + if (!is_not_admit) { + key_vec_.emplace_back(key); + + if (primary_val == nullptr) { + value_ptr_vec_.emplace_back(default_value); + } else if ( + (int64)primary_val == ValuePosition::NOT_IN_DRAM) { + value_ptr_vec_.emplace_back((V*)ValuePosition::NOT_IN_DRAM); + } else { + V* val = value_ptr->GetValue(emb_config.emb_index, + value_offset); + value_ptr_vec_.emplace_back(val); + } + + + if(is_save_version) { + int64 dump_version = value_ptr->GetStep(); + version_vec_.emplace_back(dump_version); + } + + if(is_save_freq) { + int64 dump_freq = value_ptr->GetFreq(); + freq_vec_.emplace_back(dump_freq); + } + } else { + if (!save_unfiltered_features) + return; + + key_filter_vec_.emplace_back(key); + + if(is_save_version) { + int64 dump_version = value_ptr->GetStep(); + version_filter_vec_.emplace_back(dump_version); + } + + int64 dump_freq = value_ptr->GetFreq(); + freq_filter_vec_.emplace_back(dump_freq); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::Emplace( \ + ktype, ValuePtr*, const EmbeddingConfig&, \ + vtype*, int64, bool, bool, bool); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + 
REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + + +template +void EmbeddingVarCkptData::Emplace(K key, V* value_ptr) { + key_vec_.emplace_back(key); + value_ptr_vec_.emplace_back(value_ptr); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::Emplace( \ + ktype, vtype*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void EmbeddingVarCkptData::SetWithPartition( + std::vector>& ev_ckpt_data_parts) { + part_offset_.resize(kSavedPartitionNum + 1); + part_filter_offset_.resize(kSavedPartitionNum + 1); + part_offset_[0] = 0; + part_filter_offset_[0] = 0; + for (int i = 0; i < kSavedPartitionNum; i++) { + part_offset_[i + 1] = + part_offset_[i] + ev_ckpt_data_parts[i].key_vec_.size(); + + part_filter_offset_[i + 1] = + part_filter_offset_[i] + + ev_ckpt_data_parts[i].key_filter_vec_.size(); + + for (int64 j = 0; j < ev_ckpt_data_parts[i].key_vec_.size(); j++) { + key_vec_.emplace_back(ev_ckpt_data_parts[i].key_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].value_ptr_vec_.size(); j++) { + value_ptr_vec_.emplace_back(ev_ckpt_data_parts[i].value_ptr_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].version_vec_.size(); j++) { + version_vec_.emplace_back(ev_ckpt_data_parts[i].version_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_vec_.size(); j++) { + freq_vec_.emplace_back(ev_ckpt_data_parts[i].freq_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].key_filter_vec_.size(); j++) { + key_filter_vec_.emplace_back(ev_ckpt_data_parts[i].key_filter_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].version_filter_vec_.size(); j++) { + 
version_filter_vec_.emplace_back(ev_ckpt_data_parts[i].version_filter_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_filter_vec_.size(); j++) { + freq_filter_vec_.emplace_back(ev_ckpt_data_parts[i].freq_filter_vec_[j]); + } + } +} + +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::SetWithPartition( \ + std::vector>&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status EmbeddingVarCkptData::ExportToCkpt( + const string& tensor_name, + BundleWriter* writer, + int64 value_len, + ValueIterator* value_iter) { + size_t bytes_limit = 8 << 20; + std::unique_ptr dump_buffer(new char[bytes_limit]); + + EVVectorDataDumpIterator key_dump_iter(key_vec_); + Status s = SaveTensorWithFixedBuffer( + tensor_name + "-keys", writer, dump_buffer.get(), + bytes_limit, &key_dump_iter, + TensorShape({key_vec_.size()})); + if (!s.ok()) + return s; + + EV2dVectorDataDumpIterator value_dump_iter( + value_ptr_vec_, value_len, value_iter); + s = SaveTensorWithFixedBuffer( + tensor_name + "-values", writer, dump_buffer.get(), + bytes_limit, &value_dump_iter, + TensorShape({value_ptr_vec_.size(), value_len})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator version_dump_iter(version_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-versions", writer, dump_buffer.get(), + bytes_limit, &version_dump_iter, + TensorShape({version_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator freq_dump_iter(freq_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-freqs", writer, dump_buffer.get(), + bytes_limit, &freq_dump_iter, + TensorShape({freq_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator filtered_key_dump_iter(key_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-keys_filtered", writer, 
dump_buffer.get(), + bytes_limit, &filtered_key_dump_iter, + TensorShape({key_filter_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + filtered_version_dump_iter(version_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-versions_filtered", + writer, dump_buffer.get(), + bytes_limit, &filtered_version_dump_iter, + TensorShape({version_filter_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + filtered_freq_dump_iter(freq_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-freqs_filtered", + writer, dump_buffer.get(), + bytes_limit, &filtered_freq_dump_iter, + TensorShape({freq_filter_vec_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + part_offset_dump_iter(part_offset_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-partition_offset", + writer, dump_buffer.get(), + bytes_limit, &part_offset_dump_iter, + TensorShape({part_offset_.size()})); + if (!s.ok()) + return s; + + EVVectorDataDumpIterator + part_filter_offset_dump_iter(part_filter_offset_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-partition_filter_offset", + writer, dump_buffer.get(), + bytes_limit, &part_filter_offset_dump_iter, + TensorShape({part_filter_offset_.size()})); + if (!s.ok()) + return s; + + return Status::OK(); +} + +#define REGISTER_KERNELS(ktype, vtype) \ + template Status EmbeddingVarCkptData::ExportToCkpt( \ + const string&, BundleWriter*, int64, ValueIterator*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS +}// namespace embedding +}// namespace tensorflow \ No newline at end of file diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h index aa1a08cbcfd..6d7b09e70b0 100644 --- 
a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h @@ -15,11 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" -#include "tensorflow/core/kernels/save_restore_tensor.h" #include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" namespace tensorflow { +class BundleWriter; + namespace embedding { template @@ -30,195 +30,17 @@ class EmbeddingVarCkptData { V* default_value, int64 value_offset, bool is_save_freq, bool is_save_version, - bool save_unfiltered_features) { - if((int64)value_ptr == ValuePtrStatus::IS_DELETED) - return; - - V* primary_val = value_ptr->GetValue(0, 0); - bool is_not_admit = - primary_val == nullptr - && emb_config.filter_freq != 0; - - if (!is_not_admit) { - key_vec_.emplace_back(key); - - if (primary_val == nullptr) { - value_ptr_vec_.emplace_back(default_value); - } else if ( - (int64)primary_val == ValuePosition::NOT_IN_DRAM) { - value_ptr_vec_.emplace_back((V*)ValuePosition::NOT_IN_DRAM); - } else { - V* val = value_ptr->GetValue(emb_config.emb_index, - value_offset); - value_ptr_vec_.emplace_back(val); - } - - - if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); - version_vec_.emplace_back(dump_version); - } - - if(is_save_freq) { - int64 dump_freq = value_ptr->GetFreq(); - freq_vec_.emplace_back(dump_freq); - } - } else { - if (!save_unfiltered_features) - return; - - key_filter_vec_.emplace_back(key); + bool save_unfiltered_features); - if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); - version_filter_vec_.emplace_back(dump_version); - } - - int64 dump_freq = value_ptr->GetFreq(); - 
freq_filter_vec_.emplace_back(dump_freq); - } - } - - void Emplace(K key, V* value_ptr) { - key_vec_.emplace_back(key); - value_ptr_vec_.emplace_back(value_ptr); - } + void Emplace(K key, V* value_ptr); void SetWithPartition( - std::vector>& ev_ckpt_data_parts) { - part_offset_.resize(kSavedPartitionNum + 1); - part_filter_offset_.resize(kSavedPartitionNum + 1); - part_offset_[0] = 0; - part_filter_offset_[0] = 0; - for (int i = 0; i < kSavedPartitionNum; i++) { - part_offset_[i + 1] = - part_offset_[i] + ev_ckpt_data_parts[i].key_vec_.size(); - - part_filter_offset_[i + 1] = - part_filter_offset_[i] + - ev_ckpt_data_parts[i].key_filter_vec_.size(); - - for (int64 j = 0; j < ev_ckpt_data_parts[i].key_vec_.size(); j++) { - key_vec_.emplace_back(ev_ckpt_data_parts[i].key_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].value_ptr_vec_.size(); j++) { - value_ptr_vec_.emplace_back(ev_ckpt_data_parts[i].value_ptr_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].version_vec_.size(); j++) { - version_vec_.emplace_back(ev_ckpt_data_parts[i].version_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_vec_.size(); j++) { - freq_vec_.emplace_back(ev_ckpt_data_parts[i].freq_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].key_filter_vec_.size(); j++) { - key_filter_vec_.emplace_back(ev_ckpt_data_parts[i].key_filter_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].version_filter_vec_.size(); j++) { - version_filter_vec_.emplace_back(ev_ckpt_data_parts[i].version_filter_vec_[j]); - } - - for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_filter_vec_.size(); j++) { - freq_filter_vec_.emplace_back(ev_ckpt_data_parts[i].freq_filter_vec_[j]); - } - } - } + std::vector>& ev_ckpt_data_parts); Status ExportToCkpt(const string& tensor_name, BundleWriter* writer, int64 value_len, - ValueIterator* value_iter = nullptr) { - size_t bytes_limit = 8 << 20; - std::unique_ptr dump_buffer(new char[bytes_limit]); - - 
EVVectorDataDumpIterator key_dump_iter(key_vec_); - Status s = SaveTensorWithFixedBuffer( - tensor_name + "-keys", writer, dump_buffer.get(), - bytes_limit, &key_dump_iter, - TensorShape({key_vec_.size()})); - if (!s.ok()) - return s; - - EV2dVectorDataDumpIterator value_dump_iter( - value_ptr_vec_, value_len, value_iter); - s = SaveTensorWithFixedBuffer( - tensor_name + "-values", writer, dump_buffer.get(), - bytes_limit, &value_dump_iter, - TensorShape({value_ptr_vec_.size(), value_len})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator version_dump_iter(version_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-versions", writer, dump_buffer.get(), - bytes_limit, &version_dump_iter, - TensorShape({version_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator freq_dump_iter(freq_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-freqs", writer, dump_buffer.get(), - bytes_limit, &freq_dump_iter, - TensorShape({freq_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator filtered_key_dump_iter(key_filter_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-keys_filtered", writer, dump_buffer.get(), - bytes_limit, &filtered_key_dump_iter, - TensorShape({key_filter_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator - filtered_version_dump_iter(version_filter_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-versions_filtered", - writer, dump_buffer.get(), - bytes_limit, &filtered_version_dump_iter, - TensorShape({version_filter_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator - filtered_freq_dump_iter(freq_filter_vec_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-freqs_filtered", - writer, dump_buffer.get(), - bytes_limit, &filtered_freq_dump_iter, - TensorShape({freq_filter_vec_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator - part_offset_dump_iter(part_offset_); - s = SaveTensorWithFixedBuffer( - tensor_name + 
"-partition_offset", - writer, dump_buffer.get(), - bytes_limit, &part_offset_dump_iter, - TensorShape({part_offset_.size()})); - if (!s.ok()) - return s; - - EVVectorDataDumpIterator - part_filter_offset_dump_iter(part_filter_offset_); - s = SaveTensorWithFixedBuffer( - tensor_name + "-partition_filter_offset", - writer, dump_buffer.get(), - bytes_limit, &part_filter_offset_dump_iter, - TensorShape({part_filter_offset_.size()})); - if (!s.ok()) - return s; - - return Status::OK(); - } - + ValueIterator* value_iter = nullptr); private: std::vector key_vec_; std::vector value_ptr_vec_; diff --git a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h index 71ba054b873..84c823a90dc 100644 --- a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h +++ b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h @@ -15,9 +15,12 @@ limitations under the License. #ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" -#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/framework/embedding/embedding_config.h" +#include "tensorflow/core/framework/embedding/kv_interface.h" namespace tensorflow { +template +class DumpIterator; + namespace embedding { template class EVVectorDataDumpIterator: public DumpIterator { diff --git a/tensorflow/core/framework/embedding/embedding_var_restore.cc b/tensorflow/core/framework/embedding/embedding_var_restore.cc new file mode 100644 index 00000000000..11c13008995 --- /dev/null +++ b/tensorflow/core/framework/embedding/embedding_var_restore.cc @@ -0,0 +1,647 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/embedding/embedding_var_restore.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" + +namespace tensorflow { +template +int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer) { + TensorShape shape; + Status st; + st = reader->LookupTensorShape(record_key, &shape); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + st = reader->LookupHeader(record_key, sizeof(K) * shape.dim_size(0)); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + size_t bytes_read = 0; + *buffer = new K[shape.dim_size(0)]; + st = reader->LookupSegment(record_key, sizeof(K) * shape.dim_size(0), + (char*)*buffer, bytes_read); + if (!st.ok()) { + LOG(FATAL) << 
"Restore record " << record_key << " failed"; + } + return shape.dim_size(0); +} +#define REGISTER_KERNELS(ktype) \ + template int64 ReadRecord(BundleReader*, const string&, ktype**); +REGISTER_KERNELS(int32); +REGISTER_KERNELS(int64); +#undef REGISTER_KERNELS + +template +void CheckpointLoader::RestoreSSD() { + std::string name_string_temp(restore_args_.m_name_string); + std::string new_str = "_"; + int64 pos = name_string_temp.find("/"); + while (pos != std::string::npos) { + name_string_temp.replace(pos, 1, new_str.data(), 1); + pos = name_string_temp.find("/"); + } + std::string ssd_record_file_name = restore_args_.m_file_name_string + "-" + + name_string_temp + "-ssd_record"; + if (Env::Default()->FileExists(ssd_record_file_name + ".index").ok()) { + std::string ssd_emb_file_name = restore_args_.m_file_name_string + "-" + + name_string_temp + "-emb_files"; + BundleReader ssd_record_reader(Env::Default(), ssd_record_file_name); + RestoreSSDBuffer ssd_buffer(&ssd_record_reader); + VLOG(1) << "Loading SSD record... 
" << ssd_record_file_name; + storage_->RestoreSSD(ev_->GetEmbeddingIndex(), + ev_->GetEmbeddingSlotNum(), ev_->ValueLen(), + ssd_emb_file_name, ev_, ssd_buffer); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::RestoreSSD(); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void CheckpointLoader::RestoreInternal( + const std::string& name_string, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + RestoreBuffer& restore_buff) { + Status s = EVInitTensorNameAndShape(name_string); + if (!s.ok()) { + LOG(ERROR) << "EVInitTensorNameAndShape fail:" << s.ToString(); + return; + } + + Tensor part_offset_tensor; + Tensor part_filter_offset_tensor; + if (!restore_args_.m_is_oldform) { + /****** InitPartOffsetTensor ******/ + TensorShape part_offset_shape, part_filter_offset_shape; + DataType part_offset_type, part_filter_offset_type; + string offset_tensor_name; + if (!restore_args_.m_is_incr) { + offset_tensor_name = name_string + kPartOffsetTensorSuffsix; + } else { + offset_tensor_name = name_string + kIncrPartOffsetTensorSuffsix; + } + + string offset_filter_tensor_name = + name_string + kPartFilterOffsetTensorSuffsix; + Status s = reader_->LookupDtypeAndShape( + offset_tensor_name, &part_offset_type, &part_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail:" << s.error_message(); + } + s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, + &part_filter_offset_type, + &part_filter_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } + part_offset_tensor = + Tensor(cpu_allocator(), part_offset_type, part_offset_shape); + part_filter_offset_tensor = Tensor( + cpu_allocator(), part_filter_offset_type, part_filter_offset_shape); + s = reader_->Lookup(offset_tensor_name, 
&part_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail:" << s.error_message(); + } + + s = reader_->Lookup(offset_filter_tensor_name, + &part_filter_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } + } + auto part_offset_flat = part_offset_tensor.flat(); + auto part_filter_offset_flat = part_filter_offset_tensor.flat(); + + if (restore_args_.m_is_oldform) { + VLOG(1) << "old form, EV name:" << name_string + << ", partition_id:" << restore_args_.m_partition_id + << ", new partition num:" << restore_args_.m_partition_num; + int64 new_dim = ev_->ValueLen(); + TensorShape key_shape; + Status st = + reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); + if (!st.ok()) { + LOG(ERROR) << "EVRestoreFeaturesOld fail: " << st.error_message(); + } + int tot_key_num = key_shape.dim_size(0); + Status s = EVRestoreFeatures(tot_key_num, 0, 0, 0, 0, restore_buff, + new_dim, emb_config, device); + if (!s.ok()) { + LOG(ERROR) << "EVRestoreFeaturesOld fail: " << s.error_message(); + } + } else { + int64 new_dim = ev_->ValueLen(); + VLOG(1) << "new form checkpoint... 
:" << name_string + << " , partition_id:" << restore_args_.m_partition_id + << " , partition_num:" << restore_args_.m_partition_num; + for (size_t i = 0; i < restore_args_.m_loaded_parts.size(); i++) { + int subpart_id = restore_args_.m_loaded_parts[i]; + size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; + size_t value_unit_bytes_new = sizeof(V) * new_dim; + int subpart_offset = part_offset_flat(subpart_id); + int tot_key_num = part_offset_flat(subpart_id + 1) - subpart_offset; + int64 key_part_offset = subpart_offset * sizeof(K); + int64 value_part_offset = + subpart_offset * sizeof(V) * restore_args_.m_old_dim; + int64 version_part_offset = subpart_offset * sizeof(int64); + int64 freq_part_offset = subpart_offset * sizeof(int64); + VLOG(1) << "dynamically load ev : " << name_string + << ", subpartid:" << subpart_id; + + EVRestoreFeatures(tot_key_num, key_part_offset, value_part_offset, + version_part_offset, freq_part_offset, restore_buff, + new_dim, emb_config, device); + + if (restore_args_.m_has_filter) { + Status s = EVRestoreFilteredFeatures( + subpart_id, new_dim, restore_buff, part_filter_offset_flat, + emb_config, device); + if (!s.ok()) { + LOG(ERROR) << "EVRestoreFilteredFeatures fail: " << s.error_message(); + } + } + } + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::RestoreInternal( \ + const std::string&, const EmbeddingConfig&, \ + const Eigen::GpuDevice*, RestoreBuffer&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +bool CheckpointLoader::IsOldCheckpoint( + const std::string& curr_partid_str, + const std::string& kPartOffsetTensorSuffsix) { + if (restore_args_.m_name_string.find(kPartStr) == std::string::npos) { + string tensor_name = restore_args_.m_name_string; + TensorShape part_offset_shape; + DataType 
part_offset_type; + Status st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (st.ok()) return false; + + string part_id = std::to_string(0); + tensor_name = restore_args_.m_name_string + "/" + kPartStr + part_id; + + Status form_st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (form_st.ok()) return false; + } else { + string part_id = std::to_string(0); + size_t part_pos = restore_args_.m_name_string.find(kPartStr); + size_t part_size = strlen(kPartStr); + size_t cur_part_size = curr_partid_str.size(); + + string pre_subname = restore_args_.m_name_string.substr(0, part_pos); + string post_subname = restore_args_.m_name_string.substr( + part_pos + part_size + cur_part_size); + string tensor_name = pre_subname + kPartStr + part_id + post_subname; + + TensorShape part_offset_shape; + DataType part_offset_type; + Status form_st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (form_st.ok()) return false; + pre_subname = + restore_args_.m_name_string.substr(0, part_pos - 1); /* var1*/ + post_subname = restore_args_.m_name_string.substr(part_pos + part_size + + cur_part_size); + tensor_name = pre_subname + post_subname; + + Status st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (st.ok()) return false; + } + + return true; +} +#define REGISTER_KERNELS(ktype, vtype) \ + template bool CheckpointLoader::IsOldCheckpoint( \ + const std::string&, const std::string&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + + +template +void CheckpointLoader::InitPartNumAndLoadedParts( + std::vector& tensor_name_vec) { + 
std::string tmp_key_suffix; + std::string tmp_kPartOffsetTensorSuffsix; + if (!restore_args_.m_is_incr) { + tmp_key_suffix = kKeySuffix; + tmp_kPartOffsetTensorSuffsix = kPartOffsetTensorSuffsix; + } else { + tmp_key_suffix = kIncrKeySuffix; + tmp_kPartOffsetTensorSuffsix = kIncrPartOffsetTensorSuffsix; + } + + restore_args_.m_loaded_parts.reserve(kSavedPartitionNum); + int orig_partnum = 0; + const string& curr_partid_str = std::to_string(restore_args_.m_partition_id); + size_t part_pos = restore_args_.m_name_string.find(kPartStr); + + if (IsOldCheckpoint(curr_partid_str, tmp_kPartOffsetTensorSuffsix)) { + restore_args_.m_is_oldform = true; + } + + if (part_pos == std::string::npos) { + for (;; orig_partnum++) { + string part_id = std::to_string(orig_partnum); + string tensor_name = + restore_args_.m_name_string + "/" + kPartStr + part_id; + string tensor_key = tensor_name + tmp_key_suffix; + TensorShape key_shape; + Status st = reader_->LookupTensorShape(tensor_key, &key_shape); + if (!st.ok()) { + break; + } + tensor_name_vec.emplace_back(tensor_name); + } + if (orig_partnum == 0) { + tensor_name_vec.emplace_back(restore_args_.m_name_string); + } + for (int i = 0; i < kSavedPartitionNum; ++i) { + restore_args_.m_loaded_parts.push_back(i); + } + } else { + for (;; orig_partnum++) { + string part_id = std::to_string(orig_partnum); + string pre_subname = restore_args_.m_name_string.substr(0, part_pos); + string post_subname = restore_args_.m_name_string.substr( + part_pos + strlen(kPartStr) + curr_partid_str.size()); + string tensor_name = pre_subname + kPartStr + part_id + post_subname; + string tensor_key = tensor_name + tmp_key_suffix; + TensorShape key_shape; + Status st = reader_->LookupTensorShape(tensor_key, &key_shape); + if (!st.ok()) { + break; + } + tensor_name_vec.emplace_back(tensor_name); + } + if (orig_partnum == 0) { + string pre_subname = restore_args_.m_name_string.substr(0, part_pos - 1); + string post_subname = 
restore_args_.m_name_string.substr( + part_pos + strlen(kPartStr) + curr_partid_str.size()); + string tmp_name = pre_subname + post_subname; + tensor_name_vec.emplace_back(tmp_name); + } + for (int i = 0; i < kSavedPartitionNum; i++) { + if (i % restore_args_.m_partition_num == restore_args_.m_partition_id) { + restore_args_.m_loaded_parts.push_back(i); + } + } + } + for (auto& tensor_name : tensor_name_vec) { + VLOG(1) << "**** " << restore_args_.m_name_string << " " << tensor_name + << " ****"; + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::InitPartNumAndLoadedParts(\ + std::vector&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status CheckpointLoader::EVInitTensorNameAndShape( + const std::string& tensor_name) { + if (!restore_args_.m_is_incr) { + restore_args_.m_tensor_key = tensor_name + kKeySuffix; + restore_args_.m_tensor_value = tensor_name + kValueSuffix; + restore_args_.m_tensor_version = tensor_name + kVersionSuffix; + restore_args_.m_tensor_freq = tensor_name + kFreqSuffix; + } else { + restore_args_.m_tensor_key = tensor_name + kIncrKeySuffix; + restore_args_.m_tensor_value = tensor_name + kIncrValueSuffix; + restore_args_.m_tensor_version = tensor_name + kIncrVersionSuffix; + restore_args_.m_tensor_freq = tensor_name + kIncrFreqSuffix; + } + + TensorShape key_shape, value_shape, version_shape, freq_shape; + + Status st = + reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_value, &value_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_version, + &version_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_key, + sizeof(K) * 
key_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_value, + sizeof(V) * value_shape.dim_size(0) * + value_shape.dim_size(1)); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_version, + sizeof(int64) * version_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_freq, &freq_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + freq_shape = version_shape; + } else { + return st; + } + } + st = reader_->LookupHeader(restore_args_.m_tensor_freq, + sizeof(int64) * freq_shape.dim_size(0)); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + restore_args_.m_has_freq = false; + } else { + return st; + } + } + restore_args_.m_old_dim = value_shape.dim_size(1); + + if (!restore_args_.m_is_oldform) { + TensorShape key_filter_shape, version_filter_shape, freq_filter_shape; + st = reader_->LookupTensorShape(restore_args_.m_tensor_key + "_filtered", + &key_filter_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + key_filter_shape = key_shape; + restore_args_.m_has_filter = false; + } else { + return st; + } + } + st = reader_->LookupTensorShape( + restore_args_.m_tensor_version + "_filtered", &version_filter_shape); + if ((!st.ok()) && (st.code() != error::NOT_FOUND)) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_key + "_filtered", + sizeof(K) * key_filter_shape.dim_size(0)); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + restore_args_.m_has_filter = false; + } else { + return st; + } + } + st = reader_->LookupHeader(restore_args_.m_tensor_version + "_filtered", + sizeof(K) * version_filter_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_freq + "_filtered", + &freq_filter_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + freq_filter_shape = freq_shape; + } else { + return 
st; + } + } + + st = reader_->LookupHeader(restore_args_.m_tensor_freq + "_filtered", + sizeof(K) * freq_filter_shape.dim_size(0)); + if (!st.ok() && st.code() != error::NOT_FOUND) { + return st; + } + } + return st; +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVInitTensorNameAndShape(\ + const std::string&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status CheckpointLoader::EVRestoreFeatures( + int tot_key_num, int64 key_part_offset, + int64 value_part_offset, int64 version_part_offset, + int64 freq_part_offset, RestoreBuffer& restore_buff, + int64 new_dim, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device) { + size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; + size_t value_unit_bytes_new = sizeof(V) * new_dim; + int64 tot_key_bytes_read(0); + int64 tot_value_bytes_read(0); + int64 tot_version_bytes_read(0); + int64 tot_freq_bytes_read(0); + size_t key_bytes_read = 0; + size_t value_bytes_read = 0; + size_t version_bytes_read = 0; + size_t freq_bytes_read = 0; + + while (tot_key_num > 0) { + size_t read_key_num = std::min( + std::min(kBufferSize / sizeof(K), kBufferSize / value_unit_bytes), + kBufferSize / sizeof(int64)); + read_key_num = std::min(read_key_num, kBufferSize / value_unit_bytes_new); + read_key_num = std::min((int)read_key_num, tot_key_num); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_key, key_part_offset + tot_key_bytes_read, + read_key_num * sizeof(K), restore_buff.key_buffer, key_bytes_read); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_value, value_part_offset + tot_value_bytes_read, + read_key_num * value_unit_bytes, restore_buff.value_buffer, + value_bytes_read); + if (!restore_args_.m_reset_version) { + reader_->LookupSegmentOffset( + 
restore_args_.m_tensor_version, + version_part_offset + tot_version_bytes_read, + read_key_num * sizeof(int64), restore_buff.version_buffer, + version_bytes_read); + if (version_bytes_read == 0) { + memset(restore_buff.version_buffer, -1, sizeof(int64) * read_key_num); + } + } else { + int64* version_tmp = (int64*)restore_buff.version_buffer; + memset(version_tmp, 0, read_key_num * sizeof(int64)); + } + + if (restore_args_.m_has_freq) { + reader_->LookupSegmentOffset( + restore_args_.m_tensor_freq, freq_part_offset + tot_freq_bytes_read, + read_key_num * sizeof(int64), restore_buff.freq_buffer, + freq_bytes_read); + if (freq_bytes_read == 0) { + int64* freq_tmp = (int64*)restore_buff.freq_buffer; + for (int64 i = 0; i < read_key_num; i++) { + freq_tmp[i] = (ev_->MinFreq() == 0) ? 1 : ev_->MinFreq(); + } + } + } else { + int64* freq_tmp = (int64*)restore_buff.freq_buffer; + for (int64 i = 0; i < read_key_num; i++) { + freq_tmp[i] = (ev_->MinFreq() == 0) ? 1 : ev_->MinFreq(); + } + } + if (key_bytes_read > 0) { + read_key_num = key_bytes_read / sizeof(K); + Status st = RestoreCustomDim(new_dim, read_key_num, value_unit_bytes, + value_bytes_read, value_unit_bytes_new, + restore_buff); + if (!st.ok()) { + LOG(FATAL) << "EV Restore fail:" << st.ToString(); + } + + st = storage_->RestoreFeatures( + read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, + restore_args_.m_partition_num, new_dim, false, restore_args_.m_is_incr, + emb_config, device, + filter_, restore_buff); + if (!st.ok()) { + LOG(FATAL) << "EV Restore fail:" << st.ToString(); + } + } + + tot_key_num -= read_key_num; + tot_key_bytes_read += key_bytes_read; + tot_value_bytes_read += value_bytes_read; + tot_version_bytes_read += version_bytes_read; + tot_freq_bytes_read += freq_bytes_read; + } + + return Status::OK(); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVRestoreFeatures( \ + int, int64, int64, int64, int64, RestoreBuffer&, \ + int64, const 
EmbeddingConfig&, const Eigen::GpuDevice*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status CheckpointLoader::EVRestoreFilteredFeatures( + int64 subpart_id, int64 value_len, RestoreBuffer& restore_buff, + typename TTypes::Flat part_filter_offset_flat, + const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device) { + int subpart_filter_offset = part_filter_offset_flat(subpart_id); + int tot_key_filter_num = + part_filter_offset_flat(subpart_id + 1) - subpart_filter_offset; + int64 key_filter_part_offset = subpart_filter_offset * sizeof(K); + int64 version_filter_part_offset = subpart_filter_offset * sizeof(int64); + int64 freq_filter_part_offset = subpart_filter_offset * sizeof(int64); + + VLOG(1) << "key_filter_num: " << tot_key_filter_num + << ", subpart_filter_offset: " << subpart_filter_offset; + + size_t key_filter_bytes_read = 0; + size_t version_filter_bytes_read = 0; + size_t freq_filter_bytes_read = 0; + + while (tot_key_filter_num > 0) { + size_t read_key_num = + std::min(kBufferSize / sizeof(K), kBufferSize / sizeof(int64)); + read_key_num = std::min((int)read_key_num, tot_key_filter_num); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_key + "_filtered", + key_filter_part_offset + key_filter_bytes_read, + read_key_num * sizeof(K), restore_buff.key_buffer, + key_filter_bytes_read); + if (!restore_args_.m_reset_version) { + reader_->LookupSegmentOffset( + restore_args_.m_tensor_version + "_filtered", + version_filter_part_offset + version_filter_bytes_read, + read_key_num * sizeof(int64), restore_buff.version_buffer, + version_filter_bytes_read); + } else { + int64* version_tmp = (int64*)restore_buff.version_buffer; + memset(version_tmp, 0, read_key_num * sizeof(int64)); + } + reader_->LookupSegmentOffset( + restore_args_.m_tensor_freq + "_filtered", + 
freq_filter_part_offset + freq_filter_bytes_read, + read_key_num * sizeof(int64), restore_buff.freq_buffer, + freq_filter_bytes_read); + if (key_filter_bytes_read > 0) { + read_key_num = key_filter_bytes_read / sizeof(K); + VLOG(2) << "restore, read_key_num:" << read_key_num; + Status st = storage_->RestoreFeatures( + read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, + restore_args_.m_partition_num, value_len, true, restore_args_.m_is_incr, + emb_config, device, + filter_, restore_buff); + if (!st.ok()) return st; + tot_key_filter_num -= read_key_num; + } + } + return Status::OK(); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVRestoreFilteredFeatures( \ + int64, int64, RestoreBuffer&, typename TTypes::Flat, \ + const EmbeddingConfig&, const Eigen::GpuDevice*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +}// namespace tensorflow \ No newline at end of file diff --git a/tensorflow/core/framework/embedding/embedding_var_restore.h b/tensorflow/core/framework/embedding/embedding_var_restore.h index ec97566fbec..3016ba9eeb8 100644 --- a/tensorflow/core/framework/embedding/embedding_var_restore.h +++ b/tensorflow/core/framework/embedding/embedding_var_restore.h @@ -16,23 +16,11 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ -#include "tensorflow/core/framework/allocator.h" -#include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/embedding/embedding_var.h" #include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/filter_policy.h" #include "tensorflow/core/framework/embedding/storage.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/kernels/save_restore_tensor.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/random/philox_random.h" -#include "tensorflow/core/lib/random/random.h" -#include "tensorflow/core/lib/random/random_distributions.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" +#include "tensorflow/core/util/env_var.h" namespace tensorflow { using GPUDevice = Eigen::GpuDevice; @@ -60,26 +48,7 @@ namespace { } // namespace template -int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer) { - TensorShape shape; - Status st; - st = reader->LookupTensorShape(record_key, &shape); - if (!st.ok()) { - LOG(FATAL) << "Restore record " << record_key << " failed"; - } - st = reader->LookupHeader(record_key, sizeof(K) * shape.dim_size(0)); - if (!st.ok()) { - LOG(FATAL) << "Restore record " << record_key << " failed"; - } - size_t bytes_read = 0; - *buffer = new K[shape.dim_size(0)]; - st = reader->LookupSegment(record_key, sizeof(K) * shape.dim_size(0), - (char*)*buffer, bytes_read); - if (!st.ok()) { - LOG(FATAL) << "Restore record " << record_key << " failed"; - } - return shape.dim_size(0); -} +int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer); template 
struct RestoreSSDBuffer { @@ -178,513 +147,28 @@ class CheckpointLoader { void RestoreInternal(const std::string& name_string, const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device, - RestoreBuffer& restore_buff) { - Status s = EVInitTensorNameAndShape(name_string); - if (!s.ok()) { - LOG(ERROR) << "EVInitTensorNameAndShape fail:" << s.ToString(); - return; - } - - Tensor part_offset_tensor; - Tensor part_filter_offset_tensor; - if (!restore_args_.m_is_oldform) { - /****** InitPartOffsetTensor ******/ - TensorShape part_offset_shape, part_filter_offset_shape; - DataType part_offset_type, part_filter_offset_type; - string offset_tensor_name; - if (!restore_args_.m_is_incr) { - offset_tensor_name = name_string + kPartOffsetTensorSuffsix; - } else { - offset_tensor_name = name_string + kIncrPartOffsetTensorSuffsix; - } - - string offset_filter_tensor_name = - name_string + kPartFilterOffsetTensorSuffsix; - Status s = reader_->LookupDtypeAndShape( - offset_tensor_name, &part_offset_type, &part_offset_shape); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail:" << s.error_message(); - } - s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, - &part_filter_offset_type, - &part_filter_offset_shape); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail: " << s.error_message(); - } - part_offset_tensor = - Tensor(cpu_allocator(), part_offset_type, part_offset_shape); - part_filter_offset_tensor = Tensor( - cpu_allocator(), part_filter_offset_type, part_filter_offset_shape); - s = reader_->Lookup(offset_tensor_name, &part_offset_tensor); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail:" << s.error_message(); - } - - s = reader_->Lookup(offset_filter_tensor_name, - &part_filter_offset_tensor); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail: " << s.error_message(); - } - } - auto part_offset_flat = part_offset_tensor.flat(); - auto part_filter_offset_flat = part_filter_offset_tensor.flat(); - - if (restore_args_.m_is_oldform) { - VLOG(1) << "old 
form, EV name:" << name_string - << ", partition_id:" << restore_args_.m_partition_id - << ", new partition num:" << restore_args_.m_partition_num; - int64 new_dim = ev_->ValueLen(); - TensorShape key_shape; - Status st = - reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); - if (!st.ok()) { - } - int tot_key_num = key_shape.dim_size(0); - Status s = EVRestoreFeatures(tot_key_num, 0, 0, 0, 0, restore_buff, - new_dim, emb_config, device); - if (!s.ok()) { - LOG(ERROR) << "EVRestoreFeaturesOld fail: " << s.error_message(); - } - } else { - int64 new_dim = ev_->ValueLen(); - VLOG(1) << "new form checkpoint... :" << name_string - << " , partition_id:" << restore_args_.m_partition_id - << " , partition_num:" << restore_args_.m_partition_num; - for (size_t i = 0; i < restore_args_.m_loaded_parts.size(); i++) { - int subpart_id = restore_args_.m_loaded_parts[i]; - size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; - size_t value_unit_bytes_new = sizeof(V) * new_dim; - int subpart_offset = part_offset_flat(subpart_id); - int tot_key_num = part_offset_flat(subpart_id + 1) - subpart_offset; - int64 key_part_offset = subpart_offset * sizeof(K); - int64 value_part_offset = - subpart_offset * sizeof(V) * restore_args_.m_old_dim; - int64 version_part_offset = subpart_offset * sizeof(int64); - int64 freq_part_offset = subpart_offset * sizeof(int64); - VLOG(1) << "dynamically load ev : " << name_string - << ", subpartid:" << subpart_id; - - EVRestoreFeatures(tot_key_num, key_part_offset, value_part_offset, - version_part_offset, freq_part_offset, restore_buff, - new_dim, emb_config, device); - - if (restore_args_.m_has_filter) { - Status s = EVRestoreFilteredFeatures( - subpart_id, new_dim, restore_buff, part_filter_offset_flat, - emb_config, device); - if (!s.ok()) { - LOG(ERROR) << "EVRestoreFilteredFeatures fail: " << s.error_message(); - } - } - } - } - } + RestoreBuffer& restore_buff); private: - void RestoreSSD() { - std::string 
name_string_temp(restore_args_.m_name_string); - std::string new_str = "_"; - int64 pos = name_string_temp.find("/"); - while (pos != std::string::npos) { - name_string_temp.replace(pos, 1, new_str.data(), 1); - pos = name_string_temp.find("/"); - } - std::string ssd_record_file_name = restore_args_.m_file_name_string + "-" + - name_string_temp + "-ssd_record"; - if (Env::Default()->FileExists(ssd_record_file_name + ".index").ok()) { - std::string ssd_emb_file_name = restore_args_.m_file_name_string + "-" + - name_string_temp + "-emb_files"; - BundleReader ssd_record_reader(Env::Default(), ssd_record_file_name); - RestoreSSDBuffer ssd_buffer(&ssd_record_reader); - VLOG(1) << "Loading SSD record... " << ssd_record_file_name; - storage_->RestoreSSD(ev_->GetEmbeddingIndex(), - ev_->GetEmbeddingSlotNum(), ev_->ValueLen(), - ssd_emb_file_name, ev_, ssd_buffer); - } - } + void RestoreSSD(); bool IsOldCheckpoint(const std::string& curr_partid_str, - const std::string& kPartOffsetTensorSuffsix) { - if (restore_args_.m_name_string.find(kPartStr) == std::string::npos) { - string tensor_name = restore_args_.m_name_string; - TensorShape part_offset_shape; - DataType part_offset_type; - Status st = - reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, - &part_offset_type, &part_offset_shape); - if (st.ok()) return false; - - string part_id = std::to_string(0); - tensor_name = restore_args_.m_name_string + "/" + kPartStr + part_id; - - Status form_st = - reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, - &part_offset_type, &part_offset_shape); - if (form_st.ok()) return false; - } else { - string part_id = std::to_string(0); - size_t part_pos = restore_args_.m_name_string.find(kPartStr); - size_t part_size = strlen(kPartStr); - size_t cur_part_size = curr_partid_str.size(); - - string pre_subname = restore_args_.m_name_string.substr(0, part_pos); - string post_subname = restore_args_.m_name_string.substr( - part_pos + part_size + 
cur_part_size); - string tensor_name = pre_subname + kPartStr + part_id + post_subname; - - TensorShape part_offset_shape; - DataType part_offset_type; - Status form_st = - reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, - &part_offset_type, &part_offset_shape); - if (form_st.ok()) return false; - pre_subname = - restore_args_.m_name_string.substr(0, part_pos - 1); /* var1*/ - post_subname = restore_args_.m_name_string.substr(part_pos + part_size + - cur_part_size); - tensor_name = pre_subname + post_subname; - - Status st = - reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, - &part_offset_type, &part_offset_shape); - if (st.ok()) return false; - } - - return true; - } - - void InitPartNumAndLoadedParts(std::vector& tensor_name_vec) { - std::string tmp_key_suffix; - std::string tmp_kPartOffsetTensorSuffsix; - if (!restore_args_.m_is_incr) { - tmp_key_suffix = kKeySuffix; - tmp_kPartOffsetTensorSuffsix = kPartOffsetTensorSuffsix; - } else { - tmp_key_suffix = kIncrKeySuffix; - tmp_kPartOffsetTensorSuffsix = kIncrPartOffsetTensorSuffsix; - } - - restore_args_.m_loaded_parts.reserve(kSavedPartitionNum); - int orig_partnum = 0; - const string& curr_partid_str = std::to_string(restore_args_.m_partition_id); - size_t part_pos = restore_args_.m_name_string.find(kPartStr); - - if (IsOldCheckpoint(curr_partid_str, tmp_kPartOffsetTensorSuffsix)) { - restore_args_.m_is_oldform = true; - } - - if (part_pos == std::string::npos) { - for (;; orig_partnum++) { - string part_id = std::to_string(orig_partnum); - string tensor_name = - restore_args_.m_name_string + "/" + kPartStr + part_id; - string tensor_key = tensor_name + tmp_key_suffix; - TensorShape key_shape; - Status st = reader_->LookupTensorShape(tensor_key, &key_shape); - if (!st.ok()) { - break; - } - tensor_name_vec.emplace_back(tensor_name); - } - if (orig_partnum == 0) { - tensor_name_vec.emplace_back(restore_args_.m_name_string); - } - for (int i = 0; i < kSavedPartitionNum; 
++i) { - restore_args_.m_loaded_parts.push_back(i); - } - } else { - for (;; orig_partnum++) { - string part_id = std::to_string(orig_partnum); - string pre_subname = restore_args_.m_name_string.substr(0, part_pos); - string post_subname = restore_args_.m_name_string.substr( - part_pos + strlen(kPartStr) + curr_partid_str.size()); - string tensor_name = pre_subname + kPartStr + part_id + post_subname; - string tensor_key = tensor_name + tmp_key_suffix; - TensorShape key_shape; - Status st = reader_->LookupTensorShape(tensor_key, &key_shape); - if (!st.ok()) { - break; - } - tensor_name_vec.emplace_back(tensor_name); - } - if (orig_partnum == 0) { - string pre_subname = restore_args_.m_name_string.substr(0, part_pos - 1); - string post_subname = restore_args_.m_name_string.substr( - part_pos + strlen(kPartStr) + curr_partid_str.size()); - string tmp_name = pre_subname + post_subname; - tensor_name_vec.emplace_back(tmp_name); - } - for (int i = 0; i < kSavedPartitionNum; i++) { - if (i % restore_args_.m_partition_num == restore_args_.m_partition_id) { - restore_args_.m_loaded_parts.push_back(i); - } - } - } - for (auto& tensor_name : tensor_name_vec) { - VLOG(1) << "**** " << restore_args_.m_name_string << " " << tensor_name - << " ****"; - } - } + const std::string& kPartOffsetTensorSuffsix); - Status EVInitTensorNameAndShape(const std::string& tensor_name) { - if (!restore_args_.m_is_incr) { - restore_args_.m_tensor_key = tensor_name + kKeySuffix; - restore_args_.m_tensor_value = tensor_name + kValueSuffix; - restore_args_.m_tensor_version = tensor_name + kVersionSuffix; - restore_args_.m_tensor_freq = tensor_name + kFreqSuffix; - } else { - restore_args_.m_tensor_key = tensor_name + kIncrKeySuffix; - restore_args_.m_tensor_value = tensor_name + kIncrValueSuffix; - restore_args_.m_tensor_version = tensor_name + kIncrVersionSuffix; - restore_args_.m_tensor_freq = tensor_name + kIncrFreqSuffix; - } + void InitPartNumAndLoadedParts(std::vector& tensor_name_vec); - 
TensorShape key_shape, value_shape, version_shape, freq_shape; - - Status st = - reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); - if (!st.ok()) { - return st; - } - st = reader_->LookupTensorShape(restore_args_.m_tensor_value, &value_shape); - if (!st.ok()) { - return st; - } - st = reader_->LookupTensorShape(restore_args_.m_tensor_version, - &version_shape); - if (!st.ok()) { - return st; - } - st = reader_->LookupHeader(restore_args_.m_tensor_key, - sizeof(K) * key_shape.dim_size(0)); - if (!st.ok()) { - return st; - } - st = reader_->LookupHeader(restore_args_.m_tensor_value, - sizeof(V) * value_shape.dim_size(0) * - value_shape.dim_size(1)); - if (!st.ok()) { - return st; - } - st = reader_->LookupHeader(restore_args_.m_tensor_version, - sizeof(int64) * version_shape.dim_size(0)); - if (!st.ok()) { - return st; - } - st = reader_->LookupTensorShape(restore_args_.m_tensor_freq, &freq_shape); - if (!st.ok()) { - if (st.code() == error::NOT_FOUND) { - freq_shape = version_shape; - } else { - return st; - } - } - st = reader_->LookupHeader(restore_args_.m_tensor_freq, - sizeof(int64) * freq_shape.dim_size(0)); - if (!st.ok()) { - if (st.code() == error::NOT_FOUND) { - restore_args_.m_has_freq = false; - } else { - return st; - } - } - restore_args_.m_old_dim = value_shape.dim_size(1); - - if (!restore_args_.m_is_oldform) { - TensorShape key_filter_shape, version_filter_shape, freq_filter_shape; - st = reader_->LookupTensorShape(restore_args_.m_tensor_key + "_filtered", - &key_filter_shape); - if (!st.ok()) { - if (st.code() == error::NOT_FOUND) { - key_filter_shape = key_shape; - restore_args_.m_has_filter = false; - } else { - return st; - } - } - st = reader_->LookupTensorShape( - restore_args_.m_tensor_version + "_filtered", &version_filter_shape); - if ((!st.ok()) && (st.code() != error::NOT_FOUND)) { - return st; - } - st = reader_->LookupHeader(restore_args_.m_tensor_key + "_filtered", - sizeof(K) * key_filter_shape.dim_size(0)); - if 
(!st.ok()) { - if (st.code() == error::NOT_FOUND) { - restore_args_.m_has_filter = false; - } else { - return st; - } - } - st = reader_->LookupHeader(restore_args_.m_tensor_version + "_filtered", - sizeof(K) * version_filter_shape.dim_size(0)); - if (!st.ok()) { - return st; - } - st = reader_->LookupTensorShape(restore_args_.m_tensor_freq + "_filtered", - &freq_filter_shape); - if (!st.ok()) { - if (st.code() == error::NOT_FOUND) { - freq_filter_shape = freq_shape; - } else { - return st; - } - } - - st = reader_->LookupHeader(restore_args_.m_tensor_freq + "_filtered", - sizeof(K) * freq_filter_shape.dim_size(0)); - if (!st.ok() && st.code() != error::NOT_FOUND) { - return st; - } - } - return st; - } + Status EVInitTensorNameAndShape(const std::string& tensor_name); Status EVRestoreFeatures(int tot_key_num, int64 key_part_offset, int64 value_part_offset, int64 version_part_offset, int64 freq_part_offset, RestoreBuffer& restore_buff, int64 new_dim, const EmbeddingConfig& emb_config, - const Eigen::GpuDevice* device) { - size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; - size_t value_unit_bytes_new = sizeof(V) * new_dim; - int64 tot_key_bytes_read(0); - int64 tot_value_bytes_read(0); - int64 tot_version_bytes_read(0); - int64 tot_freq_bytes_read(0); - size_t key_bytes_read = 0; - size_t value_bytes_read = 0; - size_t version_bytes_read = 0; - size_t freq_bytes_read = 0; - - while (tot_key_num > 0) { - size_t read_key_num = std::min( - std::min(kBufferSize / sizeof(K), kBufferSize / value_unit_bytes), - kBufferSize / sizeof(int64)); - read_key_num = std::min(read_key_num, kBufferSize / value_unit_bytes_new); - read_key_num = std::min((int)read_key_num, tot_key_num); - reader_->LookupSegmentOffset( - restore_args_.m_tensor_key, key_part_offset + tot_key_bytes_read, - read_key_num * sizeof(K), restore_buff.key_buffer, key_bytes_read); - reader_->LookupSegmentOffset( - restore_args_.m_tensor_value, value_part_offset + tot_value_bytes_read, - read_key_num 
* value_unit_bytes, restore_buff.value_buffer, - value_bytes_read); - if (!restore_args_.m_reset_version) { - reader_->LookupSegmentOffset( - restore_args_.m_tensor_version, - version_part_offset + tot_version_bytes_read, - read_key_num * sizeof(int64), restore_buff.version_buffer, - version_bytes_read); - if (version_bytes_read == 0) { - memset(restore_buff.version_buffer, -1, sizeof(int64) * read_key_num); - } - } else { - int64* version_tmp = (int64*)restore_buff.version_buffer; - memset(version_tmp, 0, read_key_num * sizeof(int64)); - } - - if (restore_args_.m_has_freq) { - reader_->LookupSegmentOffset( - restore_args_.m_tensor_freq, freq_part_offset + tot_freq_bytes_read, - read_key_num * sizeof(int64), restore_buff.freq_buffer, - freq_bytes_read); - if (freq_bytes_read == 0) { - int64* freq_tmp = (int64*)restore_buff.freq_buffer; - for (int64 i = 0; i < read_key_num; i++) { - freq_tmp[i] = (ev_->MinFreq() == 0) ? 1 : ev_->MinFreq(); - } - } - } else { - int64* freq_tmp = (int64*)restore_buff.freq_buffer; - for (int64 i = 0; i < read_key_num; i++) { - freq_tmp[i] = (ev_->MinFreq() == 0) ? 
1 : ev_->MinFreq(); - } - } - if (key_bytes_read > 0) { - read_key_num = key_bytes_read / sizeof(K); - Status st = RestoreCustomDim(new_dim, read_key_num, value_unit_bytes, - value_bytes_read, value_unit_bytes_new, - restore_buff); - if (!st.ok()) { - LOG(FATAL) << "EV Restore fail:" << st.ToString(); - } - - st = storage_->RestoreFeatures( - read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, - restore_args_.m_partition_num, new_dim, false, restore_args_.m_is_incr, - emb_config, device, - filter_, restore_buff); - if (!st.ok()) { - LOG(FATAL) << "EV Restore fail:" << st.ToString(); - } - } - - tot_key_num -= read_key_num; - tot_key_bytes_read += key_bytes_read; - tot_value_bytes_read += value_bytes_read; - tot_version_bytes_read += version_bytes_read; - tot_freq_bytes_read += freq_bytes_read; - } - - return Status::OK(); - } + const Eigen::GpuDevice* device); Status EVRestoreFilteredFeatures( int64 subpart_id, int64 value_len, RestoreBuffer& restore_buff, typename TTypes::Flat part_filter_offset_flat, - const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device) { - int subpart_filter_offset = part_filter_offset_flat(subpart_id); - int tot_key_filter_num = - part_filter_offset_flat(subpart_id + 1) - subpart_filter_offset; - int64 key_filter_part_offset = subpart_filter_offset * sizeof(K); - int64 version_filter_part_offset = subpart_filter_offset * sizeof(int64); - int64 freq_filter_part_offset = subpart_filter_offset * sizeof(int64); - - VLOG(1) << "key_filter_num: " << tot_key_filter_num - << ", subpart_filter_offset: " << subpart_filter_offset; - - size_t key_filter_bytes_read = 0; - size_t version_filter_bytes_read = 0; - size_t freq_filter_bytes_read = 0; - - while (tot_key_filter_num > 0) { - size_t read_key_num = - std::min(kBufferSize / sizeof(K), kBufferSize / sizeof(int64)); - read_key_num = std::min((int)read_key_num, tot_key_filter_num); - reader_->LookupSegmentOffset( - restore_args_.m_tensor_key + "_filtered", - 
key_filter_part_offset + key_filter_bytes_read, - read_key_num * sizeof(K), restore_buff.key_buffer, - key_filter_bytes_read); - if (!restore_args_.m_reset_version) { - reader_->LookupSegmentOffset( - restore_args_.m_tensor_version + "_filtered", - version_filter_part_offset + version_filter_bytes_read, - read_key_num * sizeof(int64), restore_buff.version_buffer, - version_filter_bytes_read); - } else { - int64* version_tmp = (int64*)restore_buff.version_buffer; - memset(version_tmp, 0, read_key_num * sizeof(int64)); - } - reader_->LookupSegmentOffset( - restore_args_.m_tensor_freq + "_filtered", - freq_filter_part_offset + freq_filter_bytes_read, - read_key_num * sizeof(int64), restore_buff.freq_buffer, - freq_filter_bytes_read); - if (key_filter_bytes_read > 0) { - read_key_num = key_filter_bytes_read / sizeof(K); - VLOG(2) << "restore, read_key_num:" << read_key_num; - Status st = storage_->RestoreFeatures( - read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, - restore_args_.m_partition_num, value_len, true, restore_args_.m_is_incr, - emb_config, device, - filter_, restore_buff); - if (!st.ok()) return st; - tot_key_filter_num -= read_key_num; - } - } - return Status::OK(); - } + const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device); Status RestoreCustomDim(int new_dim, int read_key_num, size_t value_unit_bytes, size_t value_bytes_read, diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index 71667cf0917..5d1f20b581a 100644 --- a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -16,6 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ +#include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { @@ -29,6 +30,7 @@ class ValuePtr; template class GPUHashTable; +using GPUDevice = Eigen::GpuDevice; namespace embedding { template @@ -90,15 +92,15 @@ class KVInterface { virtual Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, int32 default_v_num, - size_t n, const Eigen::GpuDevice& device) { + size_t n, const GPUDevice& device) { return Status::OK(); } virtual Status BatchLookupOrCreateKeys(const K* keys, size_t n, - int32* item_idxs, const Eigen::GpuDevice& device) { + int32* item_idxs, const GPUDevice& device) { return Status::OK(); } - virtual Status BatchLookup(const Eigen::GpuDevice& device, + virtual Status BatchLookup(const GPUDevice& device, const K* keys, V* val, size_t n, const V* default_v) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookup in KVInterface."); diff --git a/tensorflow/core/framework/embedding/ssd_record_descriptor.cc b/tensorflow/core/framework/embedding/ssd_record_descriptor.cc new file mode 100644 index 00000000000..b224b24e856 --- /dev/null +++ b/tensorflow/core/framework/embedding/ssd_record_descriptor.cc @@ -0,0 +1,88 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ +#include "tensorflow/core/framework/embedding/ssd_record_descriptor.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/path.h" + +namespace tensorflow { +namespace embedding { +template +template +void SsdRecordDescriptor::DumpSection( + const std::vector& data_vec, + const std::string& section_str, + BundleWriter* writer, + std::vector& dump_buffer) { + EVVectorDataDumpIterator iter(data_vec); + SaveTensorWithFixedBuffer( + section_str, + writer, dump_buffer.data(), + dump_buffer.size(), &iter, + TensorShape({data_vec.size()})); +} +#define REGISTER_KERNELS(ktype, ttype) \ + template void SsdRecordDescriptor::DumpSection( \ + const std::vector&, const std::string&, \ + BundleWriter*, std::vector&); +REGISTER_KERNELS(int32, int32); +REGISTER_KERNELS(int32, int64); +REGISTER_KERNELS(int64, int32); +REGISTER_KERNELS(int64, int64); +#undef REGISTER_KERNELS + +template +void SsdRecordDescriptor::DumpSsdMeta( + const std::string& prefix, + const std::string& var_name) { + std::fstream fs; + std::string var_name_temp(var_name); + std::string new_str = "_"; + int64 pos = var_name_temp.find("/"); + while (pos != std::string::npos) { + var_name_temp.replace(pos, 1, new_str.data(), 1); + pos = var_name_temp.find("/"); + } + + std::string ssd_record_path = + prefix + "-" + var_name_temp + "-ssd_record"; + BundleWriter ssd_record_writer(Env::Default(), + ssd_record_path); + size_t bytes_limit = 8 << 20; + std::vector dump_buffer(bytes_limit); + + DumpSection(key_list, "keys", + &ssd_record_writer, dump_buffer); + DumpSection(key_file_id_list, "keys_file_id", + &ssd_record_writer, dump_buffer); + DumpSection(key_offset_list, "keys_offset", + &ssd_record_writer, dump_buffer); + DumpSection(file_list, "files", + &ssd_record_writer, dump_buffer); + DumpSection(invalid_record_count_list, "invalid_record_count", + 
&ssd_record_writer, dump_buffer); + DumpSection(record_count_list, "record_count", + &ssd_record_writer, dump_buffer); + + ssd_record_writer.Finish(); +} +#define REGISTER_KERNELS(ktype) \ + template void SsdRecordDescriptor::DumpSsdMeta( \ + const std::string&, const std::string&); +REGISTER_KERNELS(int32); +REGISTER_KERNELS(int64); +#undef REGISTER_KERNELS +}//namespace embedding +}//namespace tensorflow diff --git a/tensorflow/core/framework/embedding/ssd_record_descriptor.h b/tensorflow/core/framework/embedding/ssd_record_descriptor.h index 9d015236934..aeb8d324759 100644 --- a/tensorflow/core/framework/embedding/ssd_record_descriptor.h +++ b/tensorflow/core/framework/embedding/ssd_record_descriptor.h @@ -20,14 +20,13 @@ limitations under the License. #include #include #include - +#include #include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/platform/env.h" namespace tensorflow { +class BundleWriter; namespace embedding { template @@ -59,48 +58,10 @@ class SsdRecordDescriptor { void DumpSection(const std::vector& data_vec, const std::string& section_str, BundleWriter* writer, - std::vector& dump_buffer) { - EVVectorDataDumpIterator iter(data_vec); - SaveTensorWithFixedBuffer( - section_str, - writer, dump_buffer.data(), - dump_buffer.size(), &iter, - TensorShape({data_vec.size()})); - } + std::vector& dump_buffer); void DumpSsdMeta(const std::string& prefix, - const std::string& var_name) { - std::fstream fs; - std::string var_name_temp(var_name); - std::string new_str = "_"; - int64 pos = var_name_temp.find("/"); - while (pos != std::string::npos) { - var_name_temp.replace(pos, 1, new_str.data(), 1); - pos = var_name_temp.find("/"); - } - - std::string ssd_record_path = - prefix + "-" + var_name_temp + "-ssd_record"; - 
BundleWriter ssd_record_writer(Env::Default(), - ssd_record_path); - size_t bytes_limit = 8 << 20; - std::vector dump_buffer(bytes_limit); - - DumpSection(key_list, "keys", - &ssd_record_writer, dump_buffer); - DumpSection(key_file_id_list, "keys_file_id", - &ssd_record_writer, dump_buffer); - DumpSection(key_offset_list, "keys_offset", - &ssd_record_writer, dump_buffer); - DumpSection(file_list, "files", - &ssd_record_writer, dump_buffer); - DumpSection(invalid_record_count_list, "invalid_record_count", - &ssd_record_writer, dump_buffer); - DumpSection(record_count_list, "record_count", - &ssd_record_writer, dump_buffer); - - ssd_record_writer.Finish(); - } + const std::string& var_name); void CopyEmbeddingFilesToCkptDir( const std::string& prefix, diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index d212e5b9c77..bb949183492 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -26,7 +26,6 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/storage_config.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" #include "tensorflow/core/util/work_sharder.h" #include "tensorflow/core/framework/device_base.h" #if GOOGLE_CUDA @@ -53,6 +52,9 @@ struct SsdRecordDescriptor; template class GPUHashTable; +class BundleWriter; +class BundleReader; + template struct EmbeddingVarContext; namespace { diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index fc1b2cd9c67..115e3c4bae6 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -2907,7 +2907,10 @@ tf_kernel_library( hdrs = ["kv_variable_ops.h"], srcs = ["kv_variable_ops.cc", "kv_variable_lookup_ops.cc", - "kv_variable_restore_ops.cc"], + "kv_variable_restore_ops.cc", + "//tensorflow/core:framework/embedding/embedding_var_ckpt_data.cc", + "//tensorflow/core:framework/embedding/embedding_var_restore.cc", + "//tensorflow/core:framework/embedding/ssd_record_descriptor.cc"], copts = tf_copts() + ["-g"], deps = [ ":bounds_check", From fe194b0718f9cc4f30a31e721780da2a956b6df8 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Tue, 19 Sep 2023 09:57:00 +0800 Subject: [PATCH 06/45] [Embedding] Fix incorrect frequency in shared-embedding. 
(#931) Signed-off-by: lixy9474 --- .../python/ops/embedding_variable_ops_test.py | 74 +++++++++++++++++++ tensorflow/python/ops/kv_variable_ops.py | 4 +- .../python/training/gradient_descent.py | 15 +++- tensorflow/python/training/optimizer.py | 30 +++++++- 4 files changed, 115 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index 25a0cb6ff11..c6cdf951a1e 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -2816,5 +2816,79 @@ def testSetInitializedWithRestore(self): result = sess.run(var._is_initialized_op) self.assertEqual(True, result) + def testCountsTensor(self): + os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + sp1 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + sp2 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([3,3,3,4,4,1], dtypes.int64), + dense_shape=[6, 1]) + emb1 = embedding_ops.embedding_lookup_sparse(var, sp1, None) + emb2 = embedding_ops.embedding_lookup_sparse(var, sp2, None) + emb = emb1 + emb2 + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + for name, shape in 
checkpoint_utils.list_variables(ckpt_path): + if name == "var_1-freqs": + value = checkpoint_utils.load_variable(ckpt_path, name) + self.assertAllEqual(value, [3, 3, 1, 3, 2]) + + def testCountsTensorWithGradientDescent(self): + os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + sp1 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + sp2 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([3,3,3,4,4,1], dtypes.int64), + dense_shape=[6, 1]) + emb1 = embedding_ops.embedding_lookup_sparse(var, sp1, None) + emb2 = embedding_ops.embedding_lookup_sparse(var, sp2, None) + emb = emb1 + emb2 + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = gradient_descent.GradientDescentOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + for name, shape in checkpoint_utils.list_variables(ckpt_path): + if name == "var_1-freqs": + value = checkpoint_utils.load_variable(ckpt_path, name) + self.assertAllEqual(value, [3, 3, 1, 3, 2]) + + del os.environ["TF_RECORD_FREQ"] + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 701c03f6975..96329ca345b 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -368,7 +368,7 @@ def 
_init_from_args(self, self._dtype = initial_value.dtype.base_dtype self._constraint = constraint self._gather_op = None - self._counts_tensor = None + self._counts_tensor = {} if self._is_primary: self._slot_num = 0 else: @@ -850,7 +850,7 @@ def sparse_read(self, indices, name=None, ev_init_value=None, counts=None): default_value, counts, is_inference=True, name=name) - self._counts_tensor = counts + self._counts_tensor[indices] = counts else: value = gen_kv_variable_ops.kv_resource_gather(self._handle, indices, diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py index 32a12a0554f..799e3c5f5bd 100644 --- a/tensorflow/python/training/gradient_descent.py +++ b/tensorflow/python/training/gradient_descent.py @@ -71,12 +71,23 @@ def _resource_apply_dense(self, grad, handle): def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): if isinstance(handle, kv_variable_ops.EmbeddingVariable): global_step = training_util.get_or_create_global_step() - if handle.need_counts() and handle._counts_tensor is not None: + if handle.need_counts() and len(handle._counts_tensor.keys()) != 0: + if indices.op.type == "ConcatV2": + total_counts = [] + for tensor in indices.op.inputs: + if tensor.op.type == "Reshape": + indices_tensor = tensor.op.inputs[0] + total_counts.append(handle._counts_tensor[indices_tensor]) + from tensorflow.python.ops import array_ops + counts_tensor = array_ops.concat(total_counts, 0) + elif indices.op.type == "Reshape": + indices_tensor = indices.op.inputs[0] + counts_tensor = handle._counts_tensor[indices_tensor] return training_ops.kv_resource_sparse_apply_gradient_descent_with_counts( handle.handle, math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype), grad, indices, global_step, - handle._counts_tensor, use_locking=self._use_locking) + counts_tensor, use_locking=self._use_locking) else: return training_ops.kv_resource_sparse_apply_gradient_descent( handle.handle, 
math_ops.cast(self._learning_rate_tensor, diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 578d682cc11..7523604ccf9 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -93,6 +93,18 @@ def _deduplicate_indexed_slices_with_counts(values, indices): array_ops.shape(unique_indices)[0]) return (summed_values, unique_indices, indices_counts) +def _deduplicate_indexed_slices_with_counts_reduction(values, indices, counts): + """Sums `values` associated with any non-unique `indices` + and return counts of each count in `values`.""" + unique_indices, new_index_positions = array_ops.unique(indices) + summed_values = math_ops.unsorted_segment_sum( + values, new_index_positions, + array_ops.shape(unique_indices)[0]) + summed_counts = math_ops.unsorted_segment_sum( + counts, new_index_positions, + array_ops.shape(unique_indices)[0]) + return (summed_values, unique_indices, summed_counts) + def _var_key(var): # TODO(ashankar): Consolidate handling for eager and graph if hasattr(var, "op"): @@ -1088,14 +1100,24 @@ def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): """ from tensorflow.python.ops import kv_variable_ops if isinstance(handle, kv_variable_ops.EmbeddingVariable) and handle.need_counts(): - if handle._counts_tensor is None: + if len(handle._counts_tensor.keys()) == 0: summed_grad, unique_indices, indices_counts = \ _deduplicate_indexed_slices_with_counts( values=grad, indices=indices) else: - summed_grad, unique_indices = _deduplicate_indexed_slices( - values=grad, indices=indices) - indices_counts = handle._counts_tensor + if indices.op.type == "ConcatV2": + total_counts = [] + for tensor in indices.op.inputs: + if tensor.op.type == "Reshape": + indices_tensor = tensor.op.inputs[0] + total_counts.append(handle._counts_tensor[indices_tensor]) + counts_tensor = array_ops.concat(total_counts, 0) + elif indices.op.type == "Reshape": + indices_tensor = 
indices.op.inputs[0] + counts_tensor = handle._counts_tensor[indices_tensor] + summed_grad, unique_indices, indices_counts = \ + _deduplicate_indexed_slices_with_counts_reduction( + grad, indices, counts_tensor) return self._resource_apply_sparse( summed_grad, handle, unique_indices, indices_counts) else: From 29ecde4f6418cd3beca400a31e87e1e53d9567dc Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Wed, 20 Sep 2023 10:45:48 +0800 Subject: [PATCH 07/45] [Embedding] Fix missing return value of RestoreSSD of DramSSDHashStorage. (#926) Signed-off-by: lixy9474 --- tensorflow/core/framework/embedding/dram_ssd_storage.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/framework/embedding/dram_ssd_storage.h b/tensorflow/core/framework/embedding/dram_ssd_storage.h index 4243cc14eb3..356a61d865f 100644 --- a/tensorflow/core/framework/embedding/dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/dram_ssd_storage.h @@ -181,7 +181,9 @@ class DramSsdHashStorage : public MultiTierStorage { restore_buff.key_offset_list_buf, restore_buff.num_of_keys, file_id_map); + return Status::OK(); } + Status Eviction(K* evict_ids, int64 evict_size) override { ValuePtr* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { From 06f81cc7c26972d8d0851a652dc212976f54f592 Mon Sep 17 00:00:00 2001 From: lixy9474 Date: Tue, 17 Oct 2023 15:49:38 +0800 Subject: [PATCH 08/45] [Embedding] Refactor the data structure of EmbeddingVariable. 
(#924) Signed-off-by: lixy9474 --- .../framework/embedding/bloom_filter_policy.h | 77 ++- .../core/framework/embedding/config.proto | 6 +- .../counter_filter_descriptor_impl.h | 272 ++++++++ .../embedding/counter_filter_policy.h | 104 ++- .../framework/embedding/cpu_hash_map_kv.h | 91 ++- .../framework/embedding/dense_hash_map_kv.h | 15 +- .../embedding/dram_leveldb_storage.h | 75 +- .../framework/embedding/dram_pmem_storage.h | 88 +-- .../framework/embedding/dram_ssd_storage.h | 62 +- .../dynamic_dim_feature_descriptor_impl.h | 214 ++++++ .../framework/embedding/embedding_config.h | 17 +- .../embedding/embedding_memory_pool.h | 12 +- .../framework/embedding/embedding_var.cu.cc | 144 ---- .../core/framework/embedding/embedding_var.h | 345 +++------- .../embedding/embedding_var_ckpt_data.cc | 38 +- .../embedding/embedding_var_ckpt_data.h | 10 +- .../embedding/embedding_var_dump_iterator.h | 4 +- .../framework/embedding/feature_descriptor.h | 200 ++++++ .../embedding/feature_descriptor_impl.h | 317 +++++++++ .../core/framework/embedding/filter_factory.h | 12 +- .../core/framework/embedding/filter_policy.h | 48 +- .../embedding/globalstep_shrink_policy.h | 18 +- .../framework/embedding/gpu_hash_map_kv.h | 20 +- .../embedding/hbm_dram_ssd_storage.h | 458 ++++--------- .../framework/embedding/hbm_dram_storage.h | 411 ++++------- .../hbm_multi_tier_feature_descriptor.h | 122 ++++ .../embedding/hbm_storage_iterator.h | 7 +- .../core/framework/embedding/kv_interface.h | 29 +- .../embedding/l2weight_shrink_policy.h | 19 +- .../core/framework/embedding/layout_creator.h | 104 --- .../core/framework/embedding/leveldb_kv.h | 79 ++- .../embedding/lockless_hash_map_cpu.h | 243 ------- .../embedding/multi_tier_storage.cu.cc | 77 ++- .../framework/embedding/multi_tier_storage.h | 136 ++-- .../embedding/normal_feature_descriptor.h | 134 ++++ .../embedding/nullable_filter_policy.h | 99 ++- .../core/framework/embedding/shrink_policy.h | 21 +- 
.../framework/embedding/single_tier_storage.h | 237 +++---- .../core/framework/embedding/ssd_hash_kv.h | 112 +-- tensorflow/core/framework/embedding/storage.h | 170 +++-- .../core/framework/embedding/storage_config.h | 30 +- .../framework/embedding/storage_factory.h | 42 +- .../core/framework/embedding/value_ptr.h | 647 ------------------ tensorflow/core/kernels/BUILD | 5 +- .../kernels/embedding_variable_memory_test.cc | 20 +- .../kernels/embedding_variable_ops_test.cc | 632 ++++------------- .../embedding_variable_performance_test.cc | 25 +- .../core/kernels/embedding_variable_test.h | 43 +- .../group_embedding_lookup_ops_test.cc | 4 +- .../core/kernels/incr_save_restore_ops.h | 4 +- .../core/kernels/kv_variable_lookup_ops.cc | 4 +- tensorflow/core/kernels/kv_variable_ops.cc | 129 ++-- tensorflow/core/kernels/kv_variable_ops.h | 1 + .../core/kernels/kv_variable_restore_ops.cc | 72 +- tensorflow/core/kernels/save_restore_tensor.h | 1 - .../core/kernels/training_ali_op_helpers.h | 53 +- tensorflow/core/kernels/training_ali_ops.cc | 59 +- tensorflow/python/framework/ops.py | 2 + .../ops/embedding_variable_ops_gpu_test.py | 164 ++--- .../python/ops/embedding_variable_ops_test.py | 197 +++--- tensorflow/python/ops/kv_variable_ops.py | 14 + .../training/saving/saveable_object_util.py | 3 +- 62 files changed, 3060 insertions(+), 3738 deletions(-) create mode 100644 tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h create mode 100644 tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h create mode 100644 tensorflow/core/framework/embedding/feature_descriptor.h create mode 100644 tensorflow/core/framework/embedding/feature_descriptor_impl.h create mode 100644 tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h delete mode 100644 tensorflow/core/framework/embedding/layout_creator.h delete mode 100644 tensorflow/core/framework/embedding/lockless_hash_map_cpu.h create mode 100644 
tensorflow/core/framework/embedding/normal_feature_descriptor.h delete mode 100644 tensorflow/core/framework/embedding/value_ptr.h diff --git a/tensorflow/core/framework/embedding/bloom_filter_policy.h b/tensorflow/core/framework/embedding/bloom_filter_policy.h index 29b85e5bb4e..781511578af 100644 --- a/tensorflow/core/framework/embedding/bloom_filter_policy.h +++ b/tensorflow/core/framework/embedding/bloom_filter_policy.h @@ -35,9 +35,10 @@ class BloomFilterPolicy : public FilterPolicy { using FilterPolicy::config_; public: - BloomFilterPolicy(const EmbeddingConfig& config, EV* ev) : - FilterPolicy(config, ev) { - + BloomFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), + FilterPolicy(config, ev) { switch (config_.counter_type){ case DT_UINT64: VLOG(2) << "The type of bloom counter is uint64"; @@ -64,10 +65,10 @@ class BloomFilterPolicy : public FilterPolicy { Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); if (s.ok()) { - V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); @@ -81,17 +82,17 @@ class BloomFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* 
value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; if (value_ptr != nullptr) { embedding_ptr[i] = - ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission; } @@ -109,13 +110,13 @@ class BloomFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> lookup_or_create_ids(num_worker_threads); std::vector> lookup_or_create_cursor(num_worker_threads); - std::vector*>> + std::vector> lookup_or_create_ptrs(num_worker_threads); IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); std::vector> @@ -147,7 +148,7 @@ class BloomFilterPolicy : public FilterPolicy { 1000, do_work); std::vector total_ids(num_of_keys); - std::vector*> total_ptrs(num_of_keys); + std::vector total_ptrs(num_of_keys); std::vector total_cursors(num_of_keys); int num_of_admit_id = 0; for (int i = 0; i < num_worker_threads; i++) { @@ -157,7 +158,7 @@ class BloomFilterPolicy : public FilterPolicy { sizeof(K) * lookup_or_create_ids[i].size()); memcpy(total_ptrs.data() + num_of_admit_id, lookup_or_create_ptrs[i].data(), - sizeof(ValuePtr*) * lookup_or_create_ptrs[i].size()); + sizeof(void*) * lookup_or_create_ptrs[i].size()); memcpy(total_cursors.data() + num_of_admit_id, lookup_or_create_cursor[i].data(), sizeof(int) * lookup_or_create_cursor[i].size()); @@ -174,11 +175,12 @@ class BloomFilterPolicy : public FilterPolicy { #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { if (GetBloomFreq(key) >= config_.filter_freq) { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - V* mem_val = 
ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { AddFreq(key, count); @@ -186,19 +188,27 @@ class BloomFilterPolicy : public FilterPolicy { } } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { - *val = nullptr; - if ((GetFreq(key, *val) + count) >= config_.filter_freq) { + *value_ptr = nullptr; + if ((GetFreq(key, *value_ptr) + count) >= config_.filter_freq) { + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + ev_->storage()->Insert(key, value_ptr); + s = Status::OK(); + } *is_filter = true; - return ev_->LookupOrCreateKey(key, val); + feat_desc_->AddFreq(*value_ptr, count); + } else { + *is_filter = false; + AddFreq(key, count); } - *is_filter = false; - AddFreq(key, count); return Status::OK(); } - int64 GetFreq(K key, ValuePtr*) override { + int64 GetFreq(K key, void* val) override { return GetBloomFreq(key); } @@ -210,7 +220,7 @@ class BloomFilterPolicy : public FilterPolicy { return bloom_counter_; } - bool is_admit(K key, ValuePtr* value_ptr) override { + bool is_admit(K key, void* value_ptr) override { if (value_ptr == nullptr) { return false; } else { @@ -326,8 +336,12 @@ class BloomFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; int64 new_freq = freq_buff[i]; + int64 import_version = -1; + if (config_.steps_to_live != 0 || config_.record_version) { + import_version = version_buff[i]; + } if (!is_filter) { if (freq_buff[i] >= config_.filter_freq) { SetBloomFreq(key_buff[i], freq_buff[i]); @@ -339,17 +353,9 @@ class 
BloomFilterPolicy : public FilterPolicy { SetBloomFreq(key_buff[i], freq_buff[i]); } if (new_freq >= config_.filter_freq){ - ev_->CreateKey(key_buff[i], &value_ptr, to_dram); - if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); - } - if (!is_filter){ - ev_->LookupOrCreateEmb(value_ptr, - value_buff + i * ev_->ValueLen()); - } else { - ev_->LookupOrCreateEmb(value_ptr, - ev_->GetDefaultValue(key_buff[i])); - } + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + new_freq, import_version, config_.emb_index); } } return Status::OK(); @@ -449,6 +455,7 @@ class BloomFilterPolicy : public FilterPolicy { } private: void* bloom_counter_; + embedding::FeatureDescriptor* feat_desc_; std::vector seeds_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/config.proto b/tensorflow/core/framework/embedding/config.proto index a8535347020..424fc5e1a38 100644 --- a/tensorflow/core/framework/embedding/config.proto +++ b/tensorflow/core/framework/embedding/config.proto @@ -50,11 +50,7 @@ enum EmbeddingVariableType { enum ValuePtrStatus { OK = 0; IS_DELETED = 1; -} - -enum ValuePosition { - IN_DRAM = 0; - NOT_IN_DRAM = 1; + NOT_IN_DRAM = 2; } enum IsSetInitialized { diff --git a/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h b/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h new file mode 100644 index 00000000000..e51166a2895 --- /dev/null +++ b/tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h @@ -0,0 +1,272 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl: public FeatureDescriptorImpl { + public: + CounterFilterDescriptorImpl( + Allocator* alloc, + int64 slot_num, + bool need_record_freq, + bool need_record_version, + int64 filter_freq, + StorageType storage_type) + : filter_freq_(filter_freq), + is_record_freq_(need_record_freq), + FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version) { + if (filter_freq >= (1L << version_offset_bits_)) { + LOG(FATAL)<<"Filter freqeuncy threshold shouldn't bigger than 2^12."; + } + + if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { +#if GOOGLE_CUDA + feat_desc_impl_.reset( + new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); +#endif //GOOGLE_CUDA + } else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } + } + + CounterFilterDescriptorImpl(CounterFilterDescriptorImpl* feat_desc_impl) + : filter_freq_(feat_desc_impl->filter_freq_), + FeatureDescriptorImpl(feat_desc_impl) { +#if GOOGLE_CUDA + if 
(typeid(*(feat_desc_impl->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl*)){ + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); + } else { +#endif //GOOGLE_CUDA + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); +#if GOOGLE_CUDA + } +#endif //GOOGLE_CUDA + } + + ~CounterFilterDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return feat_desc_impl_->InitSlotInfo( + emb_index, embedding_dim, default_value); + } + + bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) override { + return feat_desc_impl_->InitSlotInfo(feat_desc_impl); + } + + V* GetEmbedding(void* val, int emb_index) override { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + bool IsAdmit(void* val) override { + return (GetFlag(val) == 0); + } + + void* Admit(void* val) override { + if (!IsAdmit(val)) { + return feat_desc_impl_->Allocate(); + } else { + LOG(FATAL)<<"Only unadmited feature could be admited."; + return nullptr; + } + } + + void* Allocate() override { + uint64* val = (uint64*)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + uint64 flag = 1L << flag_offset_bits_; + uint64 version = (0xffffffffffffffff << version_offset_bits_); + uint64 freq = 0; + *val = version + freq; + val = (uint64*)((uint64)val | flag); + return (void*)val; + } + + void* Allocate(int64 freq) override { + if (freq < filter_freq_) { + return Allocate(); + } else { + return feat_desc_impl_->Allocate(); + } + } + + void Deallocate(void* val) override { + if (IsAdmit(val)) { + feat_desc_impl_->Deallocate(val); + } else { + void* tmp = GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + + void Deallocate(const std::vector& vals) override { + for (auto val: vals) { + if (IsAdmit(val)) { + feat_desc_impl_->Deallocate(val); + } else { + void* tmp = 
GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + } + + void AddFreq(void* val, int64 count) override { + uint64* tmp = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + __sync_fetch_and_add(tmp, count); + } else { + feat_desc_impl_->AddFreq(val, count); + } + } + + void SetAllocator(Allocator* alloc) override { + feat_desc_impl_->SetAllocator(alloc); + } + + void SetValue(void* val, int64 emb_index, V* value) { + if (IsAdmit(val)) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + } + + void SetDefaultValue(void* val, int64 key) override { + feat_desc_impl_->SetDefaultValue(val, key); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + feat_desc_impl_->SetDefaultValues( + keys, init_cursor, + value_ptrs, compute_stream, + event_mgr, gpu_device); + } +#endif + + int64 GetFreq(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + return *((uint64*)tmp) & + ((1L << version_offset_bits_) - 1); + } else { + if (is_record_freq_) { + return feat_desc_impl_->GetFreq(val); + } else { + return filter_freq_; + } + } + } + + int64 GetVersion(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + int64 version = *(uint64*)tmp >> version_offset_bits_; + if (version == 0xffffffffffff) { + version = -1; + } + return version; + } else { + return feat_desc_impl_->GetVersion(val); + } + } + + void UpdateVersion(void* val, int64 version) override { + if (!IsAdmit(val)) { + void* tmp_ptr = GetPtr(val); + uint64 tmp_val = 0; + uint64 result = 0; + do { + tmp_val = *(uint64*)tmp_ptr; + version = version << version_offset_bits_; + uint64 freq = tmp_val & ((1L << version_offset_bits_) - 1); + result = version + freq; + } while(!__sync_bool_compare_and_swap((uint64*)tmp_ptr, tmp_val, result)); + } else { + feat_desc_impl_->UpdateVersion(val, version); + } + } + + void 
SetFreq(void* val, int64 freq) override { + uint64* tmp_ptr = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + uint64 tmp = *tmp_ptr; + tmp = ~((1L << version_offset_bits_) - 1) & tmp; + tmp += freq; + __sync_bool_compare_and_swap(tmp_ptr, *tmp_ptr, tmp); + } else { + feat_desc_impl_->SetFreq(val, freq); + } + } + + int data_bytes() override { + return alloc_bytes_; + } + private: + uint64 GetFlag(void* val) { + return (uint64)val >> flag_offset_bits_; + } + + void* GetPtr(void* val) { + return (void*)((uint64)val & ((1L << flag_offset_bits_) - 1)); + } + + int64 filter_freq_; + int alloc_bytes_ = 8; + Allocator* alloc_ = ev_allocator(); + const int freq_offset_bits_ = 0; + const int version_offset_bits_ = 16; + const int flag_offset_bits_ = 48; + std::unique_ptr> feat_desc_impl_; + bool is_record_freq_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/counter_filter_policy.h b/tensorflow/core/framework/embedding/counter_filter_policy.h index c9f19f34cd2..19cd90ad01c 100644 --- a/tensorflow/core/framework/embedding/counter_filter_policy.h +++ b/tensorflow/core/framework/embedding/counter_filter_policy.h @@ -25,18 +25,19 @@ template class CounterFilterPolicy : public FilterPolicy { using FilterPolicy::ev_; using FilterPolicy::config_; - using FilterPolicy::LookupOrCreateEmbInternal; public: - CounterFilterPolicy(const EmbeddingConfig& config, EV* ev) : - FilterPolicy(config, ev) {} + CounterFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), + FilterPolicy(config, ev) {} Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); - if (s.ok() && GetFreq(key, value_ptr) >= config_.filter_freq) { - V* mem_val = 
ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + if (s.ok() && feat_desc_->IsAdmit(value_ptr)) { + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); @@ -50,18 +51,18 @@ class CounterFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, keys, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; int64 freq = GetFreq(keys[i], value_ptr); - if (value_ptr != nullptr && freq >= config_.filter_freq) { + if (value_ptr != nullptr && feat_desc_->IsAdmit(value_ptr)) { embedding_ptr[i] = - ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission; } @@ -79,7 +80,7 @@ class CounterFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> @@ -90,36 +91,61 @@ class CounterFilterPolicy : public FilterPolicy { #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - if (GetFreq(key, *value_ptr) >= config_.filter_freq) { - 
V* mem_val = ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + if (is_filter) { + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); } } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { - Status s = ev_->LookupOrCreateKey(key, val); - *is_filter = (GetFreq(key, *val) + count) >= config_.filter_freq; + *is_filter = false; + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + if (count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + feat_desc_->Deallocate(*value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + ev_->storage()->Insert(key, value_ptr); + s = Status::OK(); + } else if (!feat_desc_->IsAdmit(*value_ptr)) { + int64 freq = feat_desc_->GetFreq(*value_ptr); + if (freq + count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetFreq(admit_value_ptr, freq); + feat_desc_->UpdateVersion( + admit_value_ptr, feat_desc_->GetVersion(*value_ptr)); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + ev_->storage()->UpdateValuePtr(key, admit_value_ptr, *value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + } else { + *is_filter = true; + } + feat_desc_->AddFreq(*value_ptr, count); return s; } - int64 GetFreq(K key, ValuePtr* value_ptr) override { - return value_ptr->GetFreq(); + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); } int64 GetFreq(K key) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; TF_CHECK_OK(ev_->LookupOrCreateKey(key, 
&value_ptr)); - return value_ptr->GetFreq(); - } - - bool is_admit(K key, ValuePtr* value_ptr) override { - return (GetFreq(key, value_ptr) >= config_.filter_freq); + return feat_desc_->GetFreq(value_ptr); } Status Restore(int64 key_num, int bucket_num, int64 partition_id, @@ -136,27 +162,33 @@ class CounterFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; - ev_->CreateKey(key_buff[i], &value_ptr, to_dram); + int64 import_freq = 0; + int64 import_version = -1; if (!is_filter) { if (freq_buff[i] >= config_.filter_freq) { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } else { - value_ptr->SetFreq(config_.filter_freq); + import_freq = config_.filter_freq; } } else { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); - } - if (value_ptr->GetFreq() >= config_.filter_freq) { - LookupOrCreateEmbInternal(is_filter, to_dram, i, value_len, - value_ptr, value_buff, key_buff); + import_version = version_buff[i]; } + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); } return Status::OK(); } + + bool is_admit(K key, void* value_ptr) override { + return feat_desc_->IsAdmit(value_ptr); + } + + private: + embedding::FeatureDescriptor* feat_desc_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h index 600f6c20e44..8476c399c40 100644 --- a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h @@ -21,25 +21,25 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class LocklessHashMap : public KVInterface { public: - LocklessHashMap() { + LocklessHashMap(FeatureDescriptor* feat_desc): feat_desc_(feat_desc) { hash_map_.max_load_factor(0.8); hash_map_.set_empty_key_and_value( LocklessHashMap::EMPTY_KEY_, nullptr); hash_map_.set_counternum(16); hash_map_.set_deleted_key(LocklessHashMap::DELETED_KEY_); + pthread_key_create(&key_, NULL); } - ~LocklessHashMap() override {} + ~LocklessHashMap() override { + pthread_key_delete(key_); + } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { auto iter = hash_map_.find_wait_free(key); if (iter.first == LocklessHashMap::EMPTY_KEY_) { return errors::NotFound( @@ -60,10 +60,10 @@ class LocklessHashMap : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { auto iter = hash_map_.insert_lockless( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); + std::move(std::pair(key, + const_cast(value_ptr)))); // insert fail, exist key if ((*(iter.first)).second != value_ptr){ return errors::AlreadyExists( @@ -88,14 +88,40 @@ class LocklessHashMap : public KVInterface { } } + Status Commit(K key, const void* value_ptr) override { + auto iter = hash_map_.insert_lockless(std::move( + std::pair(key, + const_cast(value_ptr)))); + if ((*(iter.first)).second != value_ptr) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap( + &((*(iter.first)).second), + (*(iter.first)).second, + value_ptr); + } + return Status::OK(); + } + Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { + for(int i = 0; i < keys.size(); ++i) { + auto iter = hash_map_.insert_lockless(std::move( + std::pair(keys[i], + const_cast(value_ptrs[i])))); + 
if ((*(iter.first)).second != value_ptrs[i]) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap( + &((*(iter.first)).second), + (*(iter.first)).second, + value_ptrs[i]); + } + } return Status::OK(); } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { - std::pair*> *hash_map_dump; + std::vector* value_ptr_list) override { + std::pair *hash_map_dump; int64 bucket_count; auto it = hash_map_.GetSnapshot(); hash_map_dump = it.first; @@ -120,11 +146,50 @@ class LocklessHashMap : public KVInterface { return ""; } + void UpdateValuePtr( + K key, void* new_value_ptr, + void* old_value_ptr) override { + auto iter = hash_map_.insert_lockless( + std::move(std::pair(key, old_value_ptr))); + bool flag = __sync_bool_compare_and_swap( + &((*(iter.first)).second), old_value_ptr, new_value_ptr); + if (flag) { + AppendToValuePtrQueue(old_value_ptr); + } else { + feat_desc_->Deallocate(new_value_ptr); + } + } + + private: + void AppendToValuePtrQueue(void* old_value_ptr) { + //A parameter that can be adjusted in the future + std::deque* value_ptr_queue = GetOutOfDateValuePtrQueue(); + if (value_ptr_queue->size() > CAP_INVALID_VALUEPTR) { + void* value_ptr = value_ptr_queue->front(); + feat_desc_->Deallocate(value_ptr); + value_ptr_queue->pop_front(); + } + value_ptr_queue->emplace_back(old_value_ptr); + } + + std::deque* GetOutOfDateValuePtrQueue() { + std::deque* value_ptr_queue = + static_cast*>(pthread_getspecific(key_)); + if (value_ptr_queue == nullptr) { + value_ptr_queue = new std::deque(); + pthread_setspecific(key_, value_ptr_queue); + } + return value_ptr_queue; + } + private: - typedef google::dense_hash_map_lockless*> LockLessHashMap; + typedef google::dense_hash_map_lockless LockLessHashMap; static const int EMPTY_KEY_; static const int DELETED_KEY_; LockLessHashMap hash_map_; + const int CAP_INVALID_VALUEPTR = 20000; + FeatureDescriptor* feat_desc_; + pthread_key_t key_; }; template const int 
LocklessHashMap::EMPTY_KEY_ = -1; diff --git a/tensorflow/core/framework/embedding/dense_hash_map_kv.h b/tensorflow/core/framework/embedding/dense_hash_map_kv.h index 92baf037721..ffaf2e335dc 100644 --- a/tensorflow/core/framework/embedding/dense_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/dense_hash_map_kv.h @@ -23,9 +23,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/kv_interface.h" namespace tensorflow { -template -class ValuePtr; - namespace embedding { template @@ -45,7 +42,7 @@ class DenseHashMap : public KVInterface { delete []hash_map_; } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { int64 l_id = std::abs(key)%partition_num_; spin_rd_lock l(hash_map_[l_id].mu); auto iter = hash_map_[l_id].hash_map.find(key); @@ -70,7 +67,7 @@ class DenseHashMap : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { int64 l_id = std::abs(key)%partition_num_; spin_wr_lock l(hash_map_[l_id].mu); auto iter = hash_map_[l_id].hash_map.find(key); @@ -80,8 +77,8 @@ class DenseHashMap : public KVInterface { "already exists Key: ", key, " in DenseHashMap."); } else { auto iter = hash_map_[l_id].hash_map.insert( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); + std::move(std::pair(key, + const_cast(value_ptr)))); return Status::OK(); } } @@ -109,7 +106,7 @@ class DenseHashMap : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector* >* value_ptr_list) override { + std::vector* value_ptr_list) override { dense_hash_map hash_map_dump[partition_num_]; for (int i = 0; i< partition_num_; i++) { spin_rd_lock l(hash_map_[i].mu); @@ -132,7 +129,7 @@ class DenseHashMap : public KVInterface { const int partition_num_ = 1000; struct dense_hash_map { mutable easy_spinrwlock_t mu = EASY_SPINRWLOCK_INITIALIZER; - google::dense_hash_map* > hash_map; + 
google::dense_hash_map hash_map; }; dense_hash_map* hash_map_; }; diff --git a/tensorflow/core/framework/embedding/dram_leveldb_storage.h b/tensorflow/core/framework/embedding/dram_leveldb_storage.h index fdb6697d541..2f9fbade6c5 100644 --- a/tensorflow/core/framework/embedding/dram_leveldb_storage.h +++ b/tensorflow/core/framework/embedding/dram_leveldb_storage.h @@ -21,9 +21,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/single_tier_storage.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,11 +28,12 @@ namespace embedding { template class DramLevelDBStore : public MultiTierStorage { public: - DramLevelDBStore(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_ = new DramStorage(sc, alloc, lc, new LocklessHashMap()); - leveldb_ = new LevelDBStore(sc, alloc, lc); + DramLevelDBStore(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + leveldb_ = new LevelDBStore(sc, feat_desc); } ~DramLevelDBStore() override { @@ -46,7 +44,7 @@ class DramLevelDBStore : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(DramLevelDBStore); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -63,23 +61,22 @@ class DramLevelDBStore : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramLevelDBStore."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) 
override { + dram_->CreateAndInsert(key, value_ptr); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramLevelDBStore can not be called."; + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -93,7 +90,7 @@ class DramLevelDBStore : public MultiTierStorage { leveldb_->DestroyValuePtr(*value_ptr); return dram_->Get(key, value_ptr); } - dram_->Insert(key, value_ptr, size); + dram_->CreateAndInsert(key, value_ptr); return Status::OK(); } @@ -146,15 +143,15 @@ class DramLevelDBStore : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_leveldb_key_list; - std::vector*> value_ptr_list, tmp_leveldb_value_list; + std::vector value_ptr_list, tmp_leveldb_value_list; TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); TF_CHECK_OK(leveldb_->GetSnapshot( &tmp_leveldb_key_list, &tmp_leveldb_value_list)); for (int64 i = 0; i < tmp_leveldb_value_list.size(); i++) { - tmp_leveldb_value_list[i]->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - tmp_leveldb_value_list[i]->SetInitialized(emb_config.primary_emb_index); + tmp_leveldb_value_list[i] = + (void*)((int64)tmp_leveldb_value_list[i] | (1L << kDramFlagOffset)); } std::vector leveldb_key_list; @@ -173,26 +170,34 @@ class DramLevelDBStore : public MultiTierStorage { { mutex_lock l(*(leveldb_->get_mutex())); + std::vector*> feat_desc_list(2); + FeatureDescriptor hbm_feat_desc( + 1, 1, ev_allocator()/*useless*/, + StorageType::HBM_DRAM, + true, true, + {false, 0}); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = 
&hbm_feat_desc; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, value_iter))); } for (auto it: tmp_leveldb_value_list) { - delete it; + cpu_allocator()->DeallocateRaw((void*)((int64)it & 0xffffffffffff)); } - delete value_iter; return Status::OK(); } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); @@ -206,8 +211,8 @@ class DramLevelDBStore : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(leveldb_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); @@ -218,14 +223,20 @@ class DramLevelDBStore : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + protected: - void SetTotalDims(int64 total_dims) override { - leveldb_->SetTotalDims(total_dims); + int total_dim() override { + return dram_feat_desc_->total_dim(); } private: DramStorage* dram_; LevelDBStore* leveldb_; + FeatureDescriptor* dram_feat_desc_ = nullptr; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/dram_pmem_storage.h b/tensorflow/core/framework/embedding/dram_pmem_storage.h index fd19f75ab4c..e58d9450d96 100644 --- a/tensorflow/core/framework/embedding/dram_pmem_storage.h +++ b/tensorflow/core/framework/embedding/dram_pmem_storage.h @@ 
-15,14 +15,12 @@ limitations under the License. #ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ +#include "tensorflow/core/framework/embedding/cpu_hash_map_kv.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" -#include "tensorflow/core/framework/embedding/cpu_hash_map_kv.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,36 +29,36 @@ namespace embedding { template class DramPmemStorage : public MultiTierStorage { public: - DramPmemStorage(const StorageConfig& sc, Allocator* dram_alloc, - Allocator* pmem_alloc, LayoutCreator* lc, + DramPmemStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_ = new DramStorage(sc, dram_alloc, lc, new LocklessHashMap()); - pmem_ = new PmemLibpmemStorage(sc, pmem_alloc, lc); - value_ptr_size_ = - const_cast(sc.embedding_config).total_num( - Storage::GetAllocLen()); + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + pmem_feat_desc_ = new FeatureDescriptor(feat_desc); + pmem_feat_desc_->SetAllocator(experimental_pmem_allocator(sc.path, sc.size[0])); + + pmem_ = new PmemLibpmemStorage(sc, pmem_feat_desc_); } ~DramPmemStorage() override { MultiTierStorage::DeleteFromEvictionManager(); delete dram_; delete pmem_; + delete pmem_feat_desc_; } TF_DISALLOW_COPY_AND_ASSIGN(DramPmemStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; } s = pmem_->Get(key, value_ptr); + void* new_value_ptr = dram_->CreateValuePtr(); if (s.ok()) { - ValuePtr* new_value_ptr = dram_->CreateValuePtr(value_ptr_size_); - 
memcpy(new_value_ptr->GetPtr(), (*value_ptr)->GetPtr(), - sizeof(FixedLengthHeader) + sizeof(V) * value_ptr_size_); - *value_ptr = new_value_ptr; + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); s = dram_->TryInsert(key, *value_ptr); if (s.ok()) { return s; @@ -71,19 +69,19 @@ class DramPmemStorage : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramPmemStorage."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramPmemStorage can not be called."; + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } bool IsUseHbm() override { @@ -94,18 +92,16 @@ class DramPmemStorage : public MultiTierStorage { return false; } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; } s = pmem_->Get(key, value_ptr); - ValuePtr* new_value_ptr = dram_->CreateValuePtr(size); + void* new_value_ptr = dram_->CreateValuePtr(); if (s.ok()) { - memcpy(new_value_ptr->GetPtr(), (*value_ptr)->GetPtr(), - sizeof(FixedLengthHeader) + sizeof(V) * size); + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); } *value_ptr = new_value_ptr; @@ -159,7 +155,7 @@ class DramPmemStorage : public MultiTierStorage { int64 value_len, V* 
default_value) override { std::vector key_list, tmp_pmem_key_list; - std::vector*> value_ptr_list, tmp_pmem_value_list; + std::vector value_ptr_list, tmp_pmem_value_list; TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); dram_->Shrink(key_list, value_ptr_list, shrink_args, value_len); @@ -182,13 +178,14 @@ class DramPmemStorage : public MultiTierStorage { emb_config, value_len, default_value, key_list, - value_ptr_list))); + value_ptr_list, + pmem_feat_desc_))); return Status::OK(); } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); @@ -202,8 +199,8 @@ class DramPmemStorage : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(pmem_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); @@ -214,13 +211,26 @@ class DramPmemStorage : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + pmem_feat_desc_->InitSlotInfo(dram_feat_desc_); + MultiTierStorage::Init(); + } + protected: - void SetTotalDims(int64 total_dims) override {} + int total_dim() override { + return pmem_feat_desc_->total_dim(); + } private: DramStorage* dram_; PmemLibpmemStorage* pmem_; - int64 value_ptr_size_; + FeatureDescriptor* dram_feat_desc_ = nullptr; + FeatureDescriptor* pmem_feat_desc_ = nullptr; }; } // embedding } // 
tensorflow diff --git a/tensorflow/core/framework/embedding/dram_ssd_storage.h b/tensorflow/core/framework/embedding/dram_ssd_storage.h index 356a61d865f..ddd2d782e03 100644 --- a/tensorflow/core/framework/embedding/dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/dram_ssd_storage.h @@ -21,9 +21,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/single_tier_storage.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -31,11 +28,12 @@ namespace embedding { template class DramSsdHashStorage : public MultiTierStorage { public: - DramSsdHashStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, const std::string& name) - : MultiTierStorage(sc, name) { - dram_= new DramStorage(sc, alloc, lc, new LocklessHashMap()); - ssd_hash_ = new SsdHashStorage(sc, alloc, lc); + DramSsdHashStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc, const std::string& name) + : dram_feat_desc_(feat_desc), + MultiTierStorage(sc, name) { + dram_= new DramStorage(sc, feat_desc); + ssd_hash_ = new SsdHashStorage(sc, feat_desc); } ~DramSsdHashStorage() override { @@ -46,7 +44,7 @@ class DramSsdHashStorage : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(DramSsdHashStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -64,24 +62,22 @@ class DramSsdHashStorage : public MultiTierStorage { return s; } - void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in DramSsdHashStorage."; + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - dram_->Insert(key, value_ptr, alloc_len); + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); } 
- Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - LOG(FATAL)<<"GetOrCreate(K key, ValuePtr** value_ptr, " - <<"size_t size, CopyBackFlag &need_copyback) " - <<"in DramSsdStorage can not be called."; + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = dram_->Get(key, value_ptr); if (s.ok()) { return s; @@ -96,7 +92,7 @@ class DramSsdHashStorage : public MultiTierStorage { ssd_hash_->DestroyValuePtr(*value_ptr); return dram_->Get(key, value_ptr); } - dram_->Insert(key, value_ptr, size); + dram_->CreateAndInsert(key, value_ptr); return Status::OK(); } @@ -164,7 +160,6 @@ class DramSsdHashStorage : public MultiTierStorage { Status RestoreSSD(int64 emb_index, int64 emb_slot_num, int64 value_len, const std::string& ssd_emb_file_name, EmbeddingVar* ev, RestoreSSDBuffer& restore_buff) override { - int64 alloc_len = Storage::ComputeAllocLen(value_len); std::map file_id_map; for (int64 i = 0; i < restore_buff.num_of_files; i++) { file_id_map[restore_buff.file_list_buf[i]] = i; @@ -185,7 +180,7 @@ class DramSsdHashStorage : public MultiTierStorage { } Status Eviction(K* evict_ids, int64 evict_size) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); @@ -199,8 +194,8 @@ class DramSsdHashStorage : public MultiTierStorage { Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { mutex_lock l(*(dram_->get_mutex())); mutex_lock l1(*(ssd_hash_->get_mutex())); - MultiTierStorage::ReleaseInvalidValuePtr(dram_->alloc_); - ValuePtr* value_ptr = nullptr; + 
MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; for (int64 i = 0; i < evict_size; ++i) { if (dram_->Get(evict_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); @@ -211,14 +206,25 @@ class DramSsdHashStorage : public MultiTierStorage { return Status::OK(); } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + ssd_hash_->Init(); + MultiTierStorage::Init(); + } + protected: - void SetTotalDims(int64 total_dims) override { - ssd_hash_->SetTotalDims(total_dims); + int total_dim() override { + return dram_feat_desc_->total_dim(); } private: DramStorage* dram_ = nullptr; SsdHashStorage* ssd_hash_ = nullptr; + FeatureDescriptor* dram_feat_desc_; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h b/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h new file mode 100644 index 00000000000..c1fa878788b --- /dev/null +++ b/tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h @@ -0,0 +1,214 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#include +#include +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +constexpr int COLUMN_BITSET_BYTES = 5; +constexpr int COLUMN_BITSET_SIZE = COLUMN_BITSET_BYTES * 8; + +struct MetaHeader { + volatile unsigned char embed_num; + unsigned char value_type; + unsigned char header_size; + unsigned char column_bitset[COLUMN_BITSET_BYTES]; + + static const int kEmbeddingNumStartIndex = 0; + static const int kValueTypeStartIndex = + kEmbeddingNumStartIndex + sizeof(char); + static const int kHeaderSizeStartIndex = + kValueTypeStartIndex + sizeof(char); + static const int kColumnBitsetIndex = + kHeaderSizeStartIndex + sizeof(char); + + inline unsigned int GetEmbeddingNum() { + return (unsigned int) embed_num; + } + + inline void SetEmbeddingNum(size_t s) { + embed_num = (unsigned char)s; + } + + inline std::bitset GetColumnBitset() { + unsigned long meta = ((unsigned long*)this)[0]; + std::bitset bs(meta >> (8 * kColumnBitsetIndex)); + return bs; + } + + inline void SetColumnBitset(const std::bitset& bs, + unsigned int embnum) { + ((unsigned long*)(this))[0] = + (bs.to_ulong() << (8 * kColumnBitsetIndex)) | + (header_size << (8 * kHeaderSizeStartIndex)) | + (value_type << (8 * kValueTypeStartIndex)) | + (embnum << (8 * kEmbeddingNumStartIndex)); + } + + inline unsigned int GetHeaderSize() { + return (unsigned int) header_size; + } + + inline void SetHeaderSize(size_t size) { + header_size = (unsigned char)size; + } +}; + +template +class DynmaicDimDescriptorImpl: public FeatureDescriptorImpl { +using FeatureDescriptorImpl::slot_infos_; + public: + DynmaicDimDescriptorImpl( + Allocator* alloc, + int64 slot_num) + : alloc_bytes_(sizeof(std::atomic_flag) + + sizeof(MetaHeader) + 
+ sizeof(V*) * slot_num), + header_offset_bytes_(sizeof(V*) * slot_num), + flag_offset_bytes_(sizeof(MetaHeader) + + sizeof(V*) * slot_num), + FeatureDescriptorImpl(slot_num, + false, + false) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&alloc_bytes_); + } + ~DynmaicDimDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + } + + V* GetEmbedding(void* val, int emb_index) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->embed_num; + auto metadata = meta->GetColumnBitset(); + + if (!metadata.test(emb_index)) { + std::atomic_flag* flag= (std::atomic_flag*)(val + flag_offset_bytes_); + while(flag->test_and_set(std::memory_order_acquire)); + metadata = meta->GetColumnBitset(); + if (metadata.test(emb_index)) { + flag->clear(std::memory_order_release); + return ((V**)val)[emb_index]; + } + embnum++ ; + int64 alloc_value_len = slot_infos_[emb_index].embedding_dim; + V* tensor_val = (V*)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * alloc_value_len); + V* default_v = (V*)slot_infos_[emb_index].default_value; + memcpy(tensor_val, default_v, + sizeof(V) * slot_infos_[emb_index].default_value_len); + ((V**)val)[emb_index] = tensor_val; + + metadata.set(emb_index); + // NOTE:if we use ((unsigned long*)((char*)ptr_ + 1))[0] = metadata.to_ulong(); + // the ptr_ will be occaionally modified from 0x7f18700912a0 to 0x700912a0 + // must use ((V**)ptr_ + 1 + 1)[emb_index] = tensor_val; to avoid + //LOG(INFO)<<"emb_num: "<SetColumnBitset(metadata, embnum); + flag->clear(std::memory_order_release); + return tensor_val; + } else { + return ((V**)val)[emb_index]; + } + } + + bool IsAdmit(void* val) override { + return true; + } + + void* Admit(void* val) override {} + + void* Allocate() override { + void* val = 
alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + memset(val, 0, alloc_bytes_); + new ((char*)val + header_offset_bytes_) MetaHeader(); + return val; + } + + void Deallocate(void* val) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->GetEmbeddingNum(); + //LOG(INFO)<<"emb_num in deallocate: "<GetColumnBitset(); + for (int i = 0; i< embnum; i++) { + if (metadata.test(i)) { + V* val_ptr = ((V**)((int64*)val + meta->GetHeaderSize()))[i]; + if (val_ptr != nullptr) { + alloc_->DeallocateRaw(val_ptr); + } + } + } + } + + void Deallocate(const std::vector& vals) override { + for (auto val: vals) { + Deallocate(val); + } + } + + void AddFreq(void* val, int64 count) override {} + + void SetAllocator(Allocator* alloc) override { + alloc_ = alloc; + } + + void SetDefaultValue(void* val, int64 key) override {} + + void SetValue(void* val, int64 emb_index, V* value) override { + V* val_ptr = GetEmbedding(val, emb_index); + memcpy(val_ptr, value, + sizeof(V) * FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + + int64 GetFreq(void* val) override {} + + int64 GetVersion(void* val) override {} + + void UpdateVersion(void* val, int64 version) override {} + + void SetFreq(void* val, int64 freq) override {} + + int data_bytes() override { + return alloc_bytes_; + } + private: + int alloc_bytes_ = 0; + int header_offset_bytes_ = 0; + int flag_offset_bytes_ = 0; + Allocator* alloc_ = ev_allocator(); +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/embedding_config.h b/tensorflow/core/framework/embedding/embedding_config.h 
index d47d07d4205..a39d2dca303 100644 --- a/tensorflow/core/framework/embedding/embedding_config.h +++ b/tensorflow/core/framework/embedding/embedding_config.h @@ -23,7 +23,6 @@ struct EmbeddingConfig { DataType counter_type; int64 default_value_dim; float default_value_no_permission; - int normal_fix_flag; bool record_freq; bool record_version; bool is_inference; @@ -37,7 +36,6 @@ struct EmbeddingConfig { int64 filter_freq = 0, int64 max_freq = 999999, float l2_weight_threshold = -1.0, - const std::string& layout = "normal", int64 max_element_size = 0, float false_positive_probability = -1.0, DataType counter_type = DT_UINT64, @@ -58,7 +56,6 @@ struct EmbeddingConfig { counter_type(counter_type), default_value_dim(default_value_dim), default_value_no_permission(default_value_no_permission), - normal_fix_flag(0), record_freq(record_freq), record_version(record_version), is_inference(is_inference) { @@ -70,10 +67,6 @@ struct EmbeddingConfig { kHashFunc = 0; num_counter = 0; } - if (layout == "normal_contiguous" || - layout == "normal_contiguous_gpu") { - normal_fix_flag = 1; - } } int64 calc_num_counter(int64 max_element_size, @@ -105,21 +98,13 @@ struct EmbeddingConfig { } bool is_save_freq() const { - return filter_freq != 0 || - record_freq || - normal_fix_flag == 1; + return filter_freq != 0 || record_freq; } bool is_save_version() const { return steps_to_live != 0 || record_version; } - int64 total_num(int alloc_len) { - return block_num * - (1 + (1 - normal_fix_flag) * slot_num) * - (1 + normal_fix_flag * (alloc_len * (slot_num + 1) - 1)); - } - int64 get_filter_freq() { return filter_freq; } diff --git a/tensorflow/core/framework/embedding/embedding_memory_pool.h b/tensorflow/core/framework/embedding/embedding_memory_pool.h index 27b31ce1ed7..ef175151b00 100644 --- a/tensorflow/core/framework/embedding/embedding_memory_pool.h +++ b/tensorflow/core/framework/embedding/embedding_memory_pool.h @@ -18,9 +18,6 @@ limitations under the License. 
#include namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class EmbeddingMemoryPool { @@ -50,7 +47,7 @@ class EmbeddingMemoryPool { return ptr; } - void Deallocate(std::vector*> value_ptrs) { + void Deallocate(std::vector value_ptrs) { int64 prev_size = value_ptrs_queue_.size(); for (auto it : value_ptrs) { value_ptrs_queue_.emplace_back(it); @@ -59,9 +56,8 @@ class EmbeddingMemoryPool { int64 n = value_ptrs_queue_.size() - embs_per_block_; n = std::min(prev_size, n); for (int64 i = 0; i < n; i++) { - ValuePtr* val = value_ptrs_queue_.front(); - free_ptr_queue_.emplace_back(val->GetValue(0, 0)); - delete val; + void* val = value_ptrs_queue_.front(); + free_ptr_queue_.emplace_back((V*)val); value_ptrs_queue_.pop_front(); } } @@ -88,7 +84,7 @@ class EmbeddingMemoryPool { int64 embs_per_block_; Allocator* alloc_; std::deque free_ptr_queue_; - std::deque*> value_ptrs_queue_; + std::deque value_ptrs_queue_; std::vector block_list_; }; } //embedding diff --git a/tensorflow/core/framework/embedding/embedding_var.cu.cc b/tensorflow/core/framework/embedding/embedding_var.cu.cc index 0c0be83ec1d..f7162fd2c22 100644 --- a/tensorflow/core/framework/embedding/embedding_var.cu.cc +++ b/tensorflow/core/framework/embedding/embedding_var.cu.cc @@ -42,71 +42,6 @@ void SyncWithEventMgr(se::Stream* stream, while(!is_kernel_finish) {} } -template -void EmbeddingVar::SetDefaultValueOfNewFeatures( - const K* keys, int64 size, const std::list& init_cursor, - V** memcpy_address, se::Stream* compute_stream, EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device) { - if (init_cursor.size() > 0) { - int64 total = init_cursor.size(); - V** value_address = nullptr; - value_address = TypedAllocator::Allocate(cpu_allocator(), total * 2, - AllocationAttributes()); - V** default_value_address = value_address + total; - V** dev_value_address = nullptr; - dev_value_address = - TypedAllocator::Allocate(alloc_, total * 2, AllocationAttributes()); - V** 
dev_default_value_address = dev_value_address + total; - int64 i = 0; - auto it = init_cursor.cbegin(); - for (; it != init_cursor.cend(); ++it, ++i) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_address[i] = - *((V**)((char*)(value_ptr->GetPtr()) + sizeof(FixedLengthHeader))) + - storage_->GetOffset(emb_config_.emb_index); - default_value_address[i] = - default_value_ + - (keys[i] % emb_config_.default_value_dim) % value_len_; - } - DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * 2 * sizeof(V*)); - compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, - total * 2 * sizeof(V*)); - int block_dim = 128; - TF_CHECK_OK(GpuLaunchKernel( - embedding::CopyEmbedding, - (total * value_len_ + block_dim - 1) / block_dim, - block_dim, 0, gpu_device.stream(), dev_default_value_address, - dev_value_address, value_len_, total)); - SyncWithEventMgr(compute_stream, event_mgr); - // Set init meta of ValuePtrs - for (auto it = init_cursor.cbegin(); it != init_cursor.cend(); ++it) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_ptr->SetInitialized(emb_config_.emb_index); - memcpy_address[*it] = value_ptr->GetValue( - emb_config_.emb_index, - storage_->GetOffset(emb_config_.emb_index)); - } - TypedAllocator::Deallocate(alloc_, dev_value_address, total * 2); - TypedAllocator::Deallocate(cpu_allocator(), value_address, total * 2); - } -} - -#define REGISTER_KERNELS(ktype, vtype) \ - template void EmbeddingVar::SetDefaultValueOfNewFeatures( \ - const ktype*, int64, const std::list&, vtype**, \ - se::Stream*, EventMgr*, const Eigen::GpuDevice& gpu_device); -#define REGISTER_KERNELS_ALL(type) \ - REGISTER_KERNELS(int32, type); \ - REGISTER_KERNELS(int64, type) -#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) -TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) -#undef REGISTER_KERNELS_CPU - -#undef REGISTER_KERNELS_ALL -#undef REGISTER_KERNELS - template void EmbeddingVar::CopyEmbeddingsToBuffer( V* val_base, int64 
size, V** memcpy_address, @@ -136,85 +71,6 @@ void EmbeddingVar::CopyEmbeddingsToBuffer( TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) #undef REGISTER_KERNELS_CPU -#undef REGISTER_KERNELS_ALL -#undef REGISTER_KERNELS - -template -void EmbeddingVar::CopyEmbeddingsFromCPUToGPU( - const K* keys, const std::list& copyback_cursor, V** memcpy_address, - se::Stream* compute_stream, EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device, - const DeviceBase::CpuWorkerThreads* worker_threads, - int64* output_value_ptrs) { - if (copyback_cursor.size() > 0) { - int64 total = copyback_cursor.size(); - size_t value_len = emb_config_.total_num(storage_->GetAllocLen()); - V* memcpy_buffer_gpu = nullptr; - ValuePtr** gpu_value_ptrs = new ValuePtr*[total]; - memcpy_buffer_gpu = (V*)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, - total * value_len * sizeof(V)); - storage_->CopyEmbeddingsFromCPUToGPU( - total, keys, copyback_cursor, memcpy_address, value_len, gpu_value_ptrs, - memcpy_buffer_gpu, compute_stream, event_mgr, worker_threads); - - V** value_address = (V**)cpu_allocator()->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V*) * total); - V** dev_value_address = (V**)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, - sizeof(V*) * total); - std::vector copyback_keys(total); - int64 i = 0; - auto it = copyback_cursor.cbegin(); - for (; it != copyback_cursor.cend(); ++it, ++i) { - bool init; - // Get the curosr - int64 cursor = *it & 0x0fffffffffffffff; - gpu_value_ptrs[i]->SetInitialized(emb_config_.emb_index); - memcpy_address[cursor] = LookupOrCreateEmb(gpu_value_ptrs[i], init); - value_address[i] = memcpy_address[cursor]; - copyback_keys[i] = keys[cursor]; - } - DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * sizeof(V*)); - compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, total * sizeof(V*)); - - int block_dim = 128; - TF_CHECK_OK(GpuLaunchKernel( - embedding::BatchUnpack, (total + block_dim - 1) / block_dim * value_len, - block_dim, 0, 
gpu_device.stream(), dev_value_address, memcpy_buffer_gpu, - value_len, total)); - - auto do_insert = [this, copyback_keys, gpu_value_ptrs, value_len]( - int64 start, int64 limit) { - for (int64 i = start; i < limit; i++) - storage_->Insert(copyback_keys[i], gpu_value_ptrs[i]); - }; - Shard(worker_threads->num_threads, worker_threads->workers, - copyback_keys.size(), 100000, do_insert); - if (output_value_ptrs != nullptr) { - auto it = copyback_cursor.cbegin(); - for (int64 i = 0; it != copyback_cursor.cend(); ++it, ++i) { - int64 cursor = *it & 0x0fffffffffffffff; - output_value_ptrs[cursor] = (int64)gpu_value_ptrs[i]; - } - } - SyncWithEventMgr(compute_stream, event_mgr); - - alloc_->DeallocateRaw(dev_value_address); - alloc_->DeallocateRaw(memcpy_buffer_gpu); - cpu_allocator()->DeallocateRaw(value_address); - delete[] gpu_value_ptrs; - } -} -#define REGISTER_KERNELS(ktype, vtype) \ - template void EmbeddingVar::CopyEmbeddingsFromCPUToGPU( \ - const ktype*, const std::list&, vtype**, se::Stream*, EventMgr*, \ - const Eigen::GpuDevice&, const DeviceBase::CpuWorkerThreads*, int64*); -#define REGISTER_KERNELS_ALL(type) \ - REGISTER_KERNELS(int32, type); \ - REGISTER_KERNELS(int64, type) -#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) -TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) -#undef REGISTER_KERNELS_CPU - #undef REGISTER_KERNELS_ALL #undef REGISTER_KERNELS } // namespace tensorflow diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 28ce5094d87..487f595bf31 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -30,7 +30,6 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/embedding_var_context.h" #include "tensorflow/core/framework/embedding/embedding_var_restore.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/framework/embedding/filter_factory.h" #include "tensorflow/core/framework/embedding/gpu_hash_map_kv.h" #include "tensorflow/core/framework/embedding/embedding_config.h" @@ -57,7 +56,8 @@ class EmbeddingVar : public ResourceBase { EmbeddingVar(const string& name, embedding::Storage* storage, EmbeddingConfig emb_cfg, - Allocator* alloc): + Allocator* alloc, + embedding::FeatureDescriptor* feat_desc): name_(name), storage_(storage), default_value_(nullptr), @@ -65,27 +65,8 @@ class EmbeddingVar : public ResourceBase { value_len_(0), alloc_(alloc), default_value_alloc_(alloc), - emb_config_(emb_cfg) { - if (IsMultiLevel() || emb_config_.record_freq) { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) { - value_ptr->AddFreq(freq); - }; - } else if (emb_config_.is_counter_filter()) { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) { - if (value_ptr->GetFreq() < filter_freq) - value_ptr->AddFreq(freq); - }; - } else { - add_freq_fn_ = [](ValuePtr* value_ptr, int64 freq, int64 filter_freq) {}; - } - if (emb_config_.steps_to_live != 0 || emb_config_.record_version) { - update_version_fn_ = [](ValuePtr* value_ptr, int64 gs) { - value_ptr->SetStep(gs); - }; - } else { - update_version_fn_ = [](ValuePtr* value_ptr, int64 gs) {}; - } - } + emb_config_(emb_cfg), + feat_desc_(feat_desc) {} Status Init(const Tensor& default_tensor, int64 default_value_dim) { if (storage_ == nullptr) { @@ -95,17 +76,11 @@ class EmbeddingVar : public ResourceBase { storage_type_ = storage_->GetStorageType(); filter_ = FilterFactory::CreateFilter>( - emb_config_, this, storage_); + emb_config_, this, storage_, feat_desc_); emb_config_.default_value_dim = default_value_dim; 
value_len_ = default_tensor.NumElements() / emb_config_.default_value_dim; - if (LayoutType::NORMAL_CONTIGUOUS == storage_->GetLayoutType() || - LayoutType::NORMAL_CONTIGUOUS_GPU == storage_->GetLayoutType() || - LayoutType::COMPACT == storage_->GetLayoutType()) { - storage_->SetAllocLen(value_len_, emb_config_.slot_num + 1); - } - if (storage_->IsUseHbm()) { #if GOOGLE_CUDA default_value_ = TypedAllocator::Allocate(alloc_, @@ -115,12 +90,6 @@ class EmbeddingVar : public ResourceBase { dev_addr_buffer_size_ = 0; cudaMemcpy(default_value_, &default_tensor_flat(0), default_tensor.TotalBytes(), cudaMemcpyDeviceToDevice); - storage_-> - CreateEmbeddingMemoryPool( - alloc_, - emb_config_.total_num( - storage_->GetAllocLen()), - 1024 * 1024 * 64); #endif // GOOGLE_CUDA } else if (storage_->IsSingleHbm()) { #if GOOGLE_CUDA @@ -147,6 +116,14 @@ class EmbeddingVar : public ResourceBase { emb_config_.default_value_no_permission); } } + bool is_all_slots_initialized = + feat_desc_->InitSlotInfo( + emb_config_.emb_index, value_len_, + std::pair( + default_value_, emb_config_.default_value_dim)); + if (is_all_slots_initialized) { + storage_->Init(); + } return Status::OK(); } @@ -159,57 +136,92 @@ class EmbeddingVar : public ResourceBase { return is_initialized_; } - Status LookupKey(K key, ValuePtr** value_ptr) { + Status LookupKey(K key, void** value_ptr) { return storage_->Get(key, value_ptr); } void BatchLookupKey(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys) { - storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys, - emb_config_.total_num(storage_->GetAllocLen())); + storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys); } - Status LookupOrCreateKey(K key, ValuePtr** value_ptr, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, bool indices_as_pointer, int64 count = 1) { if (indices_as_pointer) { - *value_ptr = (ValuePtr*)key; - *is_filter = (*value_ptr != nullptr); + 
*value_ptr = (void*)key; + *is_filter = filter_->is_admit(key, *value_ptr); return Status::OK(); } else { Status s = filter_->LookupOrCreateKey(key, value_ptr, is_filter, count); - add_freq_fn_(*value_ptr, count, emb_config_.filter_freq); return s; } } Status Insert(K key, V* value) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; CreateKey(key, &value_ptr, true); - LookupOrCreateEmb(value_ptr, value); + feat_desc_->SetValue(value_ptr, emb_config_.emb_index, value); return Status::OK(); } - Status LookupOrCreateKey(K key, ValuePtr** value_ptr) { - Status s = storage_->GetOrCreate(key, value_ptr, - emb_config_.total_num(storage_->GetAllocLen())); + Status LookupOrCreateKey(const EmbeddingVarContext& context, + const K* keys, + void** value_ptrs, + int64 num_of_keys, + int64* indices_counts, + bool indices_as_pointer = false) { + if (indices_as_pointer) { + auto lookup_key_and_set_version_fn = [keys, value_ptrs] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + value_ptrs[i] = (void*)keys[i]; + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + lookup_key_and_set_version_fn); + } else { + filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); + } + + if (indices_counts != nullptr) { + auto add_freq_fn = [this, value_ptrs, indices_counts] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + feat_desc_->AddFreq(value_ptrs[i], indices_counts[i]); + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. 
+ auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + add_freq_fn); + } + return Status::OK(); + } + + + Status LookupOrCreateKey(K key, void** value_ptr) { + Status s = storage_->GetOrCreate(key, value_ptr); TF_CHECK_OK(s); return s; } - void CreateKey(K key, ValuePtr** value_ptr, bool to_dram) { - storage_->Insert(key, value_ptr, - emb_config_.total_num(storage_->GetAllocLen()), to_dram); + void CreateKey(K key, void** value_ptr, bool to_dram) { + storage_->CreateAndInsert(key, value_ptr, to_dram); } - void UpdateVersion(ValuePtr* value_ptr, int64 gs) { - update_version_fn_(value_ptr, gs); + void UpdateVersion(void* value_ptr, int64 gs) { + feat_desc_->UpdateVersion(value_ptr, gs); } void BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { TF_CHECK_OK(storage_->BatchCommit(keys, value_ptrs)); } @@ -218,9 +230,9 @@ class EmbeddingVar : public ResourceBase { } int64 GetVersion(K key) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; TF_CHECK_OK(LookupOrCreateKey(key, &value_ptr)); - return value_ptr->GetStep(); + return feat_desc_->GetVersion(value_ptr); } int64 GetFreq(K key) { @@ -261,11 +273,11 @@ class EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { V* default_v = default_value + i * value_len_; - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; filter_->LookupOrCreate( keys[i], output + i * value_len_, default_v, &value_ptr, 1, default_value_no_permission_); - add_freq_fn_(value_ptr, 1, emb_config_.filter_freq); + feat_desc_->AddFreq(value_ptr, 1); } }; auto worker_threads = context.worker_threads; @@ -276,7 +288,7 @@ class EmbeddingVar : public ResourceBase { void GetOrCreateKey(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, int64 num_of_keys) { const K* keys = 
(K*)keys_tensor.data(); auto do_work = [this, keys, value_ptrs] (int64 start, int64 limit) { @@ -295,7 +307,7 @@ class EmbeddingVar : public ResourceBase { void GatherEmbeddings(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, V* output, int64 num_of_keys) { const K* keys = (K*)keys_tensor.data(); @@ -303,13 +315,10 @@ class EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); - add_freq_fn_(value_ptrs[i], 1, emb_config_.filter_freq); V* value = nullptr; if (is_admit) { - V* default_v = - default_value_ + - (keys[i] % emb_config_.default_value_dim) * value_len_; - value = LookupOrCreateEmb(value_ptrs[i], default_v); + value = feat_desc_->GetEmbedding( + value_ptrs[i], emb_config_.emb_index); } else { value = default_value_no_permission_; } @@ -341,8 +350,9 @@ class EmbeddingVar : public ResourceBase { void GetOrCreateKey(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, - int64 num_of_keys) { + void** value_ptrs, + int64 num_of_keys, + bool indices_as_pointer = false) { const K* keys = (K*)keys_tensor.data(); filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); storage_->AddToCachePrefetchList(keys_tensor); @@ -351,17 +361,17 @@ class EmbeddingVar : public ResourceBase { void BatchLookupOrCreateKey( const EmbeddingVarContext& context, const K* keys, - ValuePtr** value_ptrs, + void** value_ptrs, int64 num_of_keys, std::vector>& not_found_cursor_list) { storage_->BatchGetOrCreate(context, keys, value_ptrs, num_of_keys, - emb_config_.total_num(storage_->GetAllocLen()), + value_len_, not_found_cursor_list); } void GatherEmbeddings(const EmbeddingVarContext& context, const Tensor& keys_tensor, - ValuePtr** value_ptrs, + void** value_ptrs, V* output, int64 num_of_keys) { std::vector embedding_ptr(num_of_keys); @@ -370,12 +380,10 @@ class 
EmbeddingVar : public ResourceBase { (int64 start, int64 limit) { for (int64 i = start; i < limit; ++i) { bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); - add_freq_fn_(value_ptrs[i], 1, emb_config_.filter_freq); + feat_desc_->AddFreq(value_ptrs[i], 1); if (is_admit) { - V* default_v = - default_value_ + - (keys[i] % emb_config_.default_value_dim) * value_len_; - embedding_ptr[i] = LookupOrCreateEmb(value_ptrs[i], default_v); + embedding_ptr[i] = feat_desc_->GetEmbedding( + value_ptrs[i], emb_config_.emb_index); } else { embedding_ptr[i] = default_value_no_permission_; } @@ -394,72 +402,8 @@ class EmbeddingVar : public ResourceBase { storage_->AddToCache(keys_tensor); } - - void BatchLookupOrCreateEmb( - const EmbeddingVarContext& ctx, - V** var_ptr, - ValuePtr** value_ptrs, - const K* indices, - int64 num_of_keys, - IntraThreadCopyIdAllocator* thread_copy_id_alloc) { - int num_worker_threads = ctx.worker_threads->num_threads; - std::vector> init_cursor_list( - num_worker_threads + 1); - uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); - - auto do_work_get_ptrs = [this, value_ptrs, &init_cursor_list, - &thread_copy_id_alloc, main_thread_id, var_ptr] (int64 start, int64 limit) { - int copy_id = - thread_copy_id_alloc->GetCopyIdOfThread(main_thread_id); - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - var_ptr[i] = LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - if (is_need_set_default_value) { - init_cursor_list[copy_id].emplace_back(i); - } - } - }; - const int64 unit_cost = 1000; - auto worker_threads = ctx.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, - num_of_keys, unit_cost, do_work_get_ptrs); - - // Merge copies of init_cursor_list - for (int i = 1; i < (worker_threads->num_threads + 1); i++) { - if (init_cursor_list[i].size() > 0) { - init_cursor_list[0].splice(init_cursor_list[0].end(), - init_cursor_list[i]); - } - } - - auto stream = 
ctx.compute_stream; - auto event_mgr = ctx.event_mgr; - - SetDefaultValueOfNewFeatures( - indices, num_of_keys, - init_cursor_list[0], - var_ptr, stream, event_mgr, - ctx.gpu_device); - } #endif - void LookupOrCreate(K key, V* val, V* default_v, int count = 1) { - const V* default_value_ptr = - (default_v == nullptr) ? default_value_ : default_v; - ValuePtr* value_ptr = nullptr; - filter_->LookupOrCreate(key, val, default_value_ptr, &value_ptr, count, - default_value_no_permission_); - add_freq_fn_(value_ptr, count, emb_config_.filter_freq); - } - - void BatchInitEmb(int64 size, V** memcpy_address, V* default_value, - bool* init_flags, int64 value_len) { - filter_->BatchInitEmb(size, memcpy_address, default_value, - init_flags, value_len); - } - #if GOOGLE_CUDA void CopyEmbeddingsToBuffer( V* val_base, int64 size, @@ -467,73 +411,18 @@ class EmbeddingVar : public ResourceBase { se::Stream* compute_stream, EventMgr* event_mgr, const Eigen::GpuDevice& gpu_device); - - void SetDefaultValueOfNewFeatures( - const K* keys, int64 size, - const std::list& init_cursor, - V** memcpy_address, - se::Stream* compute_stream, - EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device); - - void CopyEmbeddingsFromCPUToGPU( - const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, - se::Stream* compute_stream, - EventMgr* event_mgr, - const Eigen::GpuDevice& gpu_device, - const DeviceBase::CpuWorkerThreads* worker_threads, - int64* output_value_ptrs = nullptr); - - void AllocateMemoryForNewFeatures( - V** memcpy_address, - const std::list& init_cursor) { - std::vector*> value_ptr_list; - for (auto it = init_cursor.cbegin(); - it != init_cursor.cend(); ++it) { - ValuePtr* value_ptr = - reinterpret_cast*>(memcpy_address[*it]); - value_ptr_list.emplace_back(value_ptr); - } - storage_->AllocateMemoryForNewFeatures(value_ptr_list); - } #endif // GOOGLE_CUDA - V* LookupOrCreateEmb(ValuePtr* value_ptr, const V* default_v) { - return value_ptr->GetOrAllocate(alloc_, 
value_len_, default_v, - emb_config_.emb_index, storage_->GetOffset( - emb_config_.emb_index)); - } - - V* LookupOrCreateEmb(ValuePtr* value_ptr, const V* default_v, - Allocator* alloc) { - return value_ptr->GetOrAllocate(alloc, value_len_, default_v, - emb_config_.emb_index, storage_->GetOffset( - emb_config_.emb_index)); - } - - V* LookupOrCreateEmb(ValuePtr* value_ptr, bool &need_initialize) { - return value_ptr->GetOrAllocate(alloc_, value_len_, nullptr, - emb_config_.emb_index, - storage_->GetOffset(emb_config_.emb_index), - need_initialize); - } - - V* LookupPrimaryEmb(ValuePtr* value_ptr) { - V* primary_val = value_ptr->GetValue(emb_config_.primary_emb_index, - storage_->GetOffset(emb_config_.primary_emb_index)); - return primary_val; - } - - typename TTypes::Flat flat(ValuePtr* value_ptr, int64 index) { - V* default_v = - default_value_ + (index % emb_config_.default_value_dim) * value_len_; - V* val = LookupOrCreateEmb(value_ptr, default_v); + typename TTypes::Flat flat(void* value_ptr) { + V* val = feat_desc_->GetEmbedding(value_ptr, emb_config_.emb_index); Eigen::array dims({value_len_}); return typename TTypes::Flat(val, dims); } + V* GetValuePtr(void* ptr) { + return feat_desc_->GetEmbedding(ptr, emb_config_.emb_index); + } + int64 ValueLen() const { return value_len_; } @@ -602,25 +491,26 @@ class EmbeddingVar : public ResourceBase { std::vector* value_list, std::vector* version_list, std::vector* freq_list) { - std::vector*> value_ptr_list; + std::vector value_ptr_list; storage_->GetSnapshot(key_list, &value_ptr_list); bool is_save_freq = emb_config_.is_save_freq(); bool is_save_version = emb_config_.is_save_version(); for (int64 i = 0; i < key_list->size(); i++) { - V* val = value_ptr_list[i]->GetValue(emb_config_.emb_index, 0); - if (val != nullptr) { + if (feat_desc_->IsAdmit(value_ptr_list[i])) { + V* val = feat_desc_->GetEmbedding( + value_ptr_list[i], emb_config_.emb_index); value_list->emplace_back(val); } else { 
value_list->emplace_back(default_value_); } if(is_save_version) { - int64 dump_version = value_ptr_list[i]->GetStep(); + int64 dump_version = feat_desc_->GetVersion(value_ptr_list[i]); version_list->emplace_back(dump_version); } if(is_save_freq) { - int64 dump_freq = value_ptr_list[i]->GetFreq(); + int64 dump_freq = feat_desc_->GetFreq(value_ptr_list[i]); freq_list->emplace_back(dump_freq); } } @@ -634,6 +524,10 @@ class EmbeddingVar : public ResourceBase { return storage_; } + embedding::FeatureDescriptor* feature_descriptor() { + return feat_desc_; + } + Status Shrink(embedding::ShrinkArgs& shrink_args) { if (emb_config_.is_primary()) { shrink_args.value_len = value_len_; @@ -671,10 +565,6 @@ class EmbeddingVar : public ResourceBase { return alloc_; } - int64 GetAllocLen() { - return emb_config_.total_num(storage_->GetAllocLen()); - } - V** GetBuffer(int64 size) { if (dev_addr_buffer_size_ >= size) { return dev_addr_buffer_; @@ -756,16 +646,17 @@ class EmbeddingVar : public ResourceBase { return storage_->HashTable(); } - protected: FilterPolicy>* GetFilter() const { return filter_; } + protected: ~EmbeddingVar() override { // When dynamic dimension embedding is used, // there will be more than one primary slot if (emb_config_.is_primary() && emb_config_.primary_emb_index == 0) { delete storage_; + delete feat_desc_; } if (embedding::StorageType::HBM_DRAM == storage_type_) { alloc_->DeallocateRaw(dev_addr_buffer_); @@ -804,35 +695,6 @@ class EmbeddingVar : public ResourceBase { value_len_ * sizeof(V), do_work); } - V* GetAddressOfGpuValuePtr(ValuePtr* value_ptr, - int64 index, - bool copyback_flag, - std::list& init_cursor, - std::list& copyback_cursor) { - V* mem_addr = nullptr; - bool init_flag = false; - if (!copyback_flag) { - mem_addr = LookupOrCreateEmb(value_ptr, init_flag); - } else { - mem_addr = value_ptr->GetValue(0,0); - if (copyback_flag == - embedding::CopyBackFlag::COPYBACK_AND_DESTROY) { - delete value_ptr; - // If the 64th bit of cursor is set to 
1, - // the corresponding valueptr need to be deleted later. - int64 tmp = 1; - tmp = tmp << 63; - copyback_cursor.emplace_back(index | tmp); - } else { - copyback_cursor.emplace_back(index); - } - } - if (init_flag) { - init_cursor.emplace_back(index); - } - return mem_addr; - } - std::string name_; bool is_initialized_ = false; @@ -849,8 +711,7 @@ class EmbeddingVar : public ResourceBase { embedding::StorageType storage_type_; EmbeddingConfig emb_config_; FilterPolicy>* filter_; - std::function*, int64, int64)> add_freq_fn_; - std::function*, int64)> update_version_fn_; + embedding::FeatureDescriptor* feat_desc_; TF_DISALLOW_COPY_AND_ASSIGN(EmbeddingVar); }; diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc index c1b43a608b5..7dddf714b6b 100644 --- a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.cc @@ -21,42 +21,38 @@ namespace tensorflow { namespace embedding { template void EmbeddingVarCkptData::Emplace( - K key, ValuePtr* value_ptr, + K key, void* value_ptr, const EmbeddingConfig& emb_config, - V* default_value, int64 value_offset, + V* default_value, + FeatureDescriptor* feat_desc, bool is_save_freq, bool is_save_version, bool save_unfiltered_features) { if((int64)value_ptr == ValuePtrStatus::IS_DELETED) return; - V* primary_val = value_ptr->GetValue(0, 0); - bool is_not_admit = - primary_val == nullptr - && emb_config.filter_freq != 0; + bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); + bool is_admit = feat_desc->IsAdmit(value_ptr); - if (!is_not_admit) { + if (is_admit) { key_vec_.emplace_back(key); - if (primary_val == nullptr) { + if (!is_in_dram) { + value_ptr_vec_.emplace_back((V*)ValuePtrStatus::NOT_IN_DRAM); + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + } else if (feat_desc->GetEmbedding(value_ptr, 0) == nullptr) { 
value_ptr_vec_.emplace_back(default_value); - } else if ( - (int64)primary_val == ValuePosition::NOT_IN_DRAM) { - value_ptr_vec_.emplace_back((V*)ValuePosition::NOT_IN_DRAM); } else { - V* val = value_ptr->GetValue(emb_config.emb_index, - value_offset); + V* val = feat_desc->GetEmbedding(value_ptr, emb_config.emb_index); value_ptr_vec_.emplace_back(val); } - - if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); + int64 dump_version = feat_desc->GetVersion(value_ptr); version_vec_.emplace_back(dump_version); } if(is_save_freq) { - int64 dump_freq = value_ptr->GetFreq(); + int64 dump_freq = feat_desc->GetFreq(value_ptr); freq_vec_.emplace_back(dump_freq); } } else { @@ -66,18 +62,18 @@ void EmbeddingVarCkptData::Emplace( key_filter_vec_.emplace_back(key); if(is_save_version) { - int64 dump_version = value_ptr->GetStep(); + int64 dump_version = feat_desc->GetVersion(value_ptr); version_filter_vec_.emplace_back(dump_version); } - int64 dump_freq = value_ptr->GetFreq(); + int64 dump_freq = feat_desc->GetFreq(value_ptr); freq_filter_vec_.emplace_back(dump_freq); } } #define REGISTER_KERNELS(ktype, vtype) \ template void EmbeddingVarCkptData::Emplace( \ - ktype, ValuePtr*, const EmbeddingConfig&, \ - vtype*, int64, bool, bool, bool); + ktype, void*, const EmbeddingConfig&, \ + vtype*, FeatureDescriptor*, bool, bool, bool); #define REGISTER_KERNELS_ALL_INDEX(type) \ REGISTER_KERNELS(int32, type) \ REGISTER_KERNELS(int64, type) diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h index 6d7b09e70b0..10bf0d0e43b 100644 --- a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h @@ -19,15 +19,19 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/embedding_var_dump_iterator.h" namespace tensorflow { class BundleWriter; +namespace { + const int kSavedPartitionNum = 1000; + const int kDramFlagOffset = 49; +} namespace embedding { - template class EmbeddingVarCkptData { public: - void Emplace(K key, ValuePtr* value_ptr, + void Emplace(K key, void* value_ptr, const EmbeddingConfig& emb_config, - V* default_value, int64 value_offset, + V* default_value, + FeatureDescriptor* feat_desc, bool is_save_freq, bool is_save_version, bool save_unfiltered_features); diff --git a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h index 84c823a90dc..4c052b43c7e 100644 --- a/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h +++ b/tensorflow/core/framework/embedding/embedding_var_dump_iterator.h @@ -57,7 +57,7 @@ class EV2dVectorDataDumpIterator: public DumpIterator { value_len_(value_len), col_idx_(0) { if (!valueptr_list.empty()) { - if ((int64)*curr_iter_ == ValuePosition::NOT_IN_DRAM) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { curr_ptr_ = val_iter_->Next(); } else { curr_ptr_ = *curr_iter_; @@ -75,7 +75,7 @@ class EV2dVectorDataDumpIterator: public DumpIterator { curr_iter_++; col_idx_ = 0; if (curr_iter_ != end_iter_) { - if ((int64)*curr_iter_ == ValuePosition::NOT_IN_DRAM) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { curr_ptr_ = val_iter_->Next(); } else { curr_ptr_ = *curr_iter_; diff --git a/tensorflow/core/framework/embedding/feature_descriptor.h b/tensorflow/core/framework/embedding/feature_descriptor.h new file mode 100644 index 00000000000..8808da353f4 --- /dev/null +++ b/tensorflow/core/framework/embedding/feature_descriptor.h @@ -0,0 +1,200 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/framework/embedding/config.pb.h" +#include "tensorflow/core/framework/embedding/counter_filter_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/dynamic_dim_feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h" +#include "tensorflow/core/framework/embedding/normal_feature_descriptor.h" +#include + +namespace tensorflow { +namespace embedding { + +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl; + +template +class FeatureDescriptor { + public: + FeatureDescriptor( + int64 block_num, + int64 slot_num, + Allocator* alloc, + StorageType storage_type, + bool need_record_freq, + bool need_record_version, + const std::pair& filter_info) { + if (block_num > 1) { + feat_desc_impl_.reset( + new DynmaicDimDescriptorImpl( + alloc, block_num * slot_num)); + } else if (filter_info.first) { + feat_desc_impl_.reset( + new CounterFilterDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version, + filter_info.second, + storage_type)); + } else if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { + 
feat_desc_impl_.reset( + new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + alloc, slot_num, + need_record_freq, + need_record_version)); + } + } + + FeatureDescriptor(FeatureDescriptor* feat_desc) { + if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(CounterFilterDescriptorImpl*)) { + feat_desc_impl_.reset( + new CounterFilterDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + else if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl)) { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + else { + feat_desc_impl_.reset( + new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + } + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + return feat_desc_impl_->InitSlotInfo( + emb_index, embedding_dim, default_value); + } + + bool InitSlotInfo(FeatureDescriptor* feat_desc) { + return feat_desc_impl_->InitSlotInfo(feat_desc->feat_desc_impl_.get()); + } + + V* GetEmbedding(void *val, int emb_index) { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + void* Allocate() { + return feat_desc_impl_->Allocate(); + } + + void* Allocate(int64 freq) { + return feat_desc_impl_->Allocate(freq); + } + + void Deallocate(void* val) { + feat_desc_impl_->Deallocate(val); + } + + void Deallocate(const std::vector& value_ptrs) { + feat_desc_impl_->Deallocate(value_ptrs); + } + + void SetDefaultValue(void* val, int64 index) { + feat_desc_impl_->SetDefaultValue(val, index); + } + + void SetValue(void* val, int64 emb_index, V* value) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* 
compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + reinterpret_cast*>(feat_desc_impl_.get())->SetDefaultValues( + keys, init_cursor, value_ptrs, + compute_stream, event_mgr, gpu_device); + } +#endif + + void SetAllocator(Allocator* alloc) { + feat_desc_impl_->SetAllocator(alloc); + } + + int data_bytes() { + return feat_desc_impl_->data_bytes(); + } + + int64 GetFreq(void* val) { + return feat_desc_impl_->GetFreq(val); + } + + int64 GetVersion(void* val) { + return feat_desc_impl_->GetVersion(val); + } + + void SetFreq(void* val, int64 freq) { + feat_desc_impl_->SetFreq(val, freq); + } + + void UpdateVersion(void* val, int64 version) { + feat_desc_impl_->UpdateVersion(val, version); + } + + void AddFreq(void* val, int64 freq) { + feat_desc_impl_->AddFreq(val, freq); + } + + int total_dim() { + return feat_desc_impl_->total_dim(); + } + + bool IsAdmit(void* val) { + return feat_desc_impl_->IsAdmit(val); + } + + void* Admit(void* val) { + return feat_desc_impl_->Admit(val); + } + + + protected: + std::unique_ptr> feat_desc_impl_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/feature_descriptor_impl.h b/tensorflow/core/framework/embedding/feature_descriptor_impl.h new file mode 100644 index 00000000000..6996d22f447 --- /dev/null +++ b/tensorflow/core/framework/embedding/feature_descriptor_impl.h @@ -0,0 +1,317 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#include "tensorflow/core/util/env_var.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { +namespace embedding { +struct SlotInfo { + int embedding_dim; + int embedding_offset; + void* default_value; + int64 default_value_dim; + int default_value_len; +}; + +class BaseFreqDescriptor { + public: + virtual int64 GetFreq(void* value_ptr) = 0; + virtual void AddFreq(void* value_ptr, int64 freq) {} + virtual void SetFreq(void* value_ptr, int64 freq) {} + virtual BaseFreqDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class FreqDescriptor: public BaseFreqDescriptor { + public: + explicit FreqDescriptor(int offset_byte) + : offset_byte_(offset_byte) {} + + int64 GetFreq(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void AddFreq(void* value_ptr, int64 freq) override { + __sync_fetch_and_add((int64*)(value_ptr + offset_byte_), freq); + } + + void SetFreq(void* value_ptr, int64 freq) override { + *(int64*)(value_ptr + offset_byte_) = freq; + } + + BaseFreqDescriptor* Clone() override { + return new FreqDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonFreqDescriptor: public BaseFreqDescriptor { + public: + int64 GetFreq(void* value_ptr) override { + LOG(FATAL)<<"Can not get freq from NonFreqCounter."; + } + + BaseFreqDescriptor* Clone() override { + return new NonFreqDescriptor(); + } +}; + +class BaseVersionDescriptor { + public: + virtual int64 
GetVersion(void* value_ptr) = 0; + virtual void UpdateVersion(void* value_ptr, int64 version) {} + virtual BaseVersionDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class VersionDescriptor: public BaseVersionDescriptor { + public: + explicit VersionDescriptor(int offset_byte) + : offset_byte_(offset_byte) {} + + int64 GetVersion(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void UpdateVersion(void* value_ptr, int64 version) override { + *(int64*)(value_ptr + offset_byte_) = version; + } + + BaseVersionDescriptor* Clone() override { + return new VersionDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonVersionDescriptor: public BaseVersionDescriptor { + public: + int64 GetVersion(void* value_ptr) override { + LOG(FATAL)<<"Can not get version from NonFreqCounter."; + } + + BaseVersionDescriptor* Clone() override { + return new NonVersionDescriptor(); + } +}; + +template +class FeatureDescriptorImpl { + public: + FeatureDescriptorImpl(int64 slot_num, + bool need_record_freq, + bool need_record_version) { + slot_infos_.resize(slot_num); + for (int i = 0; i < slot_infos_.size(); i++) { + slot_infos_[i].embedding_offset = EMPTY_OFFSET_VALUE; + } + + if (!need_record_freq) { + freq_desc_.reset(new NonFreqDescriptor()); + } + if (!need_record_version) { + version_desc_.reset(new NonVersionDescriptor()); + } + } + + FeatureDescriptorImpl(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + freq_desc_.reset( + feat_desc_impl->freq_desc_->Clone()); + version_desc_.reset( + feat_desc_impl->version_desc_->Clone()); + } + + virtual ~FeatureDescriptorImpl() {} + + virtual bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) = 0; + virtual bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + 
LOG(FATAL)<<"InitSlotInfo(feat_desc_impl) is not implemented."; + } + virtual V* GetEmbedding(void* val, int emb_index) = 0; + virtual void* Allocate() = 0; + virtual void* Allocate(int64 freq) {return Allocate();} + virtual void Deallocate(void* val) = 0; + virtual void Deallocate(const std::vector& val) = 0; + virtual void SetAllocator(Allocator* alloc) = 0; + virtual void SetDefaultValue(void* val, int64 key) = 0; + virtual void SetValue(void* val, int64 emb_index, V* value) {} + virtual bool IsAdmit(void* val) {return true;} + virtual void* Admit(void* val) {} +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + virtual int data_bytes() = 0; + + virtual int64 GetFreq(void* val) { + return freq_desc_->GetFreq(val); + } + + virtual int64 GetVersion(void* val) { + return version_desc_->GetVersion(val); + } + + virtual void SetFreq(void* val, int64 freq) { + freq_desc_->SetFreq(val, freq); + } + + virtual void UpdateVersion(void* val, int64 version) { + version_desc_->UpdateVersion(val, version); + } + + virtual void AddFreq(void* val, int64 freq) { + freq_desc_->AddFreq(val, freq); + } + + inline int total_dim() { + int64 slot_num = slot_infos_.size(); + return slot_infos_[slot_num - 1].embedding_offset + + slot_infos_[slot_num - 1].embedding_dim; + } + + protected: + bool SetEmbeddingInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + slot_infos_[emb_index].default_value = default_value.first; + slot_infos_[emb_index].default_value_dim = default_value.second; + slot_infos_[emb_index].default_value_len = embedding_dim; + + bool is_aligned = true; + TF_CHECK_OK(ReadBoolFromEnvVar("EV_DATA_ALIGNED", true, + &is_aligned)); + if (is_aligned) { + embedding_dim = ComputeAlignedDim(embedding_dim); + } + + //Avoid parallel consitency issue + __sync_bool_compare_and_swap( + 
&slot_infos_[emb_index].embedding_offset, + EMPTY_OFFSET_VALUE, embedding_dim); + slot_infos_[emb_index].embedding_dim = embedding_dim; + //Check whether all offsets are set + for (int i = 0; i < slot_infos_.size(); i++) { + if (slot_infos_[i].embedding_offset == EMPTY_OFFSET_VALUE) { + return false; + } + } + + ComputeEmbeddingOffsets(); + return true; + } + + void SetSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + } + + void ComputeAllocBytes(int* alloc_bytes) { + for(auto slot_info: slot_infos_) { + *alloc_bytes += slot_info.embedding_dim * sizeof(V); + } + } + + void CreateFreqAndVersionDescriptor(int* alloc_bytes) { + if (!freq_desc_) { + freq_desc_.reset(new FreqDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + if (!version_desc_) { + version_desc_.reset(new VersionDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + } + + void InitFreqAndVersion(void* val) { + freq_desc_->SetFreq(val, 0); + version_desc_->UpdateVersion(val, -1); + } + + void SetFreqAndVersionOffset(int* alloc_bytes) { + freq_desc_->SetOffset(alloc_bytes); + version_desc_->SetOffset(alloc_bytes); + } + + V* GetDefaultValuePtr(int64 emb_index, int64 key) { + V* default_value_base = (V*)slot_infos_[emb_index].default_value; + int64 default_value_offset = + (key % slot_infos_[emb_index].default_value_dim) * + slot_infos_[emb_index].default_value_len; + return default_value_base + default_value_offset; + } + + void SetDefaultValue(void* val, int64 emb_index, int64 key) { + memcpy(val, + GetDefaultValuePtr(emb_index, key), + slot_infos_[emb_index].default_value_len * sizeof(V)); + } + + private: + int64 ComputeAlignedDim(int64 embedding_dim) { + int padding_bytes = + ALIGN_BYTES - embedding_dim * sizeof(V) % ALIGN_BYTES; + if (padding_bytes == ALIGN_BYTES) { + return embedding_dim; + } else { + return embedding_dim + padding_bytes / sizeof(V); + } + } + + void ComputeEmbeddingOffsets() { + for (int i = slot_infos_.size() 
- 1 ; i >= 0; i--) { + slot_infos_[i].embedding_offset = 0; + for (int j = 0; j < i; j++) { + slot_infos_[i].embedding_offset += slot_infos_[j].embedding_offset; + } + } + } + + protected: + const int EMPTY_OFFSET_VALUE= -1; + const int ALIGN_BYTES = 16; + std::vector slot_infos_; + std::unique_ptr freq_desc_; + std::unique_ptr version_desc_; +}; + +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ diff --git a/tensorflow/core/framework/embedding/filter_factory.h b/tensorflow/core/framework/embedding/filter_factory.h index 5bb92467a51..0127e2c882a 100644 --- a/tensorflow/core/framework/embedding/filter_factory.h +++ b/tensorflow/core/framework/embedding/filter_factory.h @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/filter_policy.h" #include "tensorflow/core/framework/embedding/nullable_filter_policy.h" - namespace tensorflow { namespace embedding{ template @@ -34,22 +33,23 @@ class FilterFactory { template static FilterPolicy* CreateFilter( const EmbeddingConfig& config, EV* ev, - embedding::Storage* storage) { + embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) { if (config.filter_freq > 0) { if (config.kHashFunc != 0) { return new BloomFilterPolicy( - config, ev); + config, ev, feat_desc); } else { return new CounterFilterPolicy( - config, ev); + config, ev, feat_desc); } } else { return new NullableFilterPolicy( - config, ev, storage); + config, ev, storage, feat_desc); } } }; -} // tensorflow +} //namespace tensorflow #endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_FACTORY_H_ diff --git a/tensorflow/core/framework/embedding/filter_policy.h b/tensorflow/core/framework/embedding/filter_policy.h index 559a6796246..256d3b044d4 100644 --- a/tensorflow/core/framework/embedding/filter_policy.h +++ b/tensorflow/core/framework/embedding/filter_policy.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/emb_file.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" namespace tensorflow { @@ -45,9 +46,6 @@ struct RestoreBuffer { template class RestoreSSDBuffer; -template -class ValuePtr; - template class FilterPolicy { public: @@ -55,7 +53,7 @@ class FilterPolicy { config_(config), ev_(ev) {} virtual void LookupOrCreate(K key, V* val, - const V* default_value_ptr, ValuePtr** value_ptr, + const V* default_value_ptr, void** value_ptr, int count, const V* default_value_no_permission) = 0; virtual Status Lookup(K key, V* val, const V* default_value_ptr, @@ -70,53 +68,25 @@ class FilterPolicy { virtual void BatchLookupOrCreateKey( const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs_list, + const K* keys, void** value_ptrs_list, int64 num_of_keys) = 0; #endif //GOOGLE_CUDA - virtual Status LookupOrCreateKey(K key, ValuePtr** val, + virtual Status LookupOrCreateKey(K key, void** val, bool* is_filter, int64 count) = 0; + + virtual Status LookupKey(K key, void** val, + bool* is_filter, int64 count) {} - virtual int64 GetFreq(K key, ValuePtr* value_ptr) = 0; - + virtual int64 GetFreq(K key, void* value_ptr) = 0; virtual int64 GetFreq(K key) = 0; - virtual bool is_admit(K key, ValuePtr* value_ptr) = 0; + virtual bool is_admit(K key, void* value_ptr) = 0; virtual Status Restore(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool to_dram, bool is_incr, RestoreBuffer& restore_buff) = 0; - protected: - void LookupOrCreateEmbInternal(bool is_filter, bool to_dram, - int i, int value_len, - ValuePtr* value_ptr, - V* value_src, K* key_src) { - - if (!is_filter) { - ev_->LookupOrCreateEmb(value_ptr, value_src + i * ev_->ValueLen()); - return; - } else { - if (to_dram) { -#if GOOGLE_CUDA - std::vector default_value_host; - default_value_host.resize(config_.default_value_dim * 
value_len); - cudaMemcpy(default_value_host.data(), ev_->GetDefaultValuePtr(), - sizeof(V) * config_.default_value_dim * value_len, - cudaMemcpyDeviceToHost); - ev_->LookupOrCreateEmb(value_ptr, - default_value_host.data() + - (key_src[i] % config_.default_value_dim) - * ev_->ValueLen()); -#endif - return; - } else { - ev_->LookupOrCreateEmb(value_ptr, ev_->GetDefaultValue(key_src[i])); - return; - } - } - } - protected: EmbeddingConfig config_; EV* ev_; diff --git a/tensorflow/core/framework/embedding/globalstep_shrink_policy.h b/tensorflow/core/framework/embedding/globalstep_shrink_policy.h index a2af6a2430a..b0950eff22d 100644 --- a/tensorflow/core/framework/embedding/globalstep_shrink_policy.h +++ b/tensorflow/core/framework/embedding/globalstep_shrink_policy.h @@ -18,25 +18,21 @@ limitations under the License. #include "tensorflow/core/framework/embedding/shrink_policy.h" namespace tensorflow { - -template -class ValuePtr; - namespace embedding { template class GlobalStepShrinkPolicy : public ShrinkPolicy { public: GlobalStepShrinkPolicy(int64 steps_to_live, - Allocator* alloc, + FeatureDescriptor* feat_desc, KVInterface* kv) : steps_to_live_(steps_to_live), kv_(kv), - ShrinkPolicy(alloc) {} + ShrinkPolicy(feat_desc) {} TF_DISALLOW_COPY_AND_ASSIGN(GlobalStepShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override { ShrinkPolicy::ReleaseValuePtrs(); FilterToDelete(shrink_args.global_step, @@ -46,16 +42,16 @@ class GlobalStepShrinkPolicy : public ShrinkPolicy { private: void FilterToDelete(int64 global_step, std::vector& key_list, - std::vector*>& value_list) { + std::vector& value_list) { for (int64 i = 0; i < key_list.size(); ++i) { - int64 version = value_list[i]->GetStep(); + int64 version = ShrinkPolicy::feat_desc_->GetVersion(value_list[i]); if (version == -1) { - value_list[i]->SetStep(global_step); + ShrinkPolicy::feat_desc_->UpdateVersion(value_list[i], 
global_step); } else { if (global_step - version > steps_to_live_) { kv_->Remove(key_list[i]); ShrinkPolicy::EmplacePointer(value_list[i]); - value_list[i] = (ValuePtr*)ValuePtrStatus::IS_DELETED; + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; } } } diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h index 1dd90d63a6e..fc4a2506313 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h @@ -204,29 +204,29 @@ class GPUHashMapKV : public KVInterface { } Status BatchLookupOrCreate(const K* keys, size_t n, - ValuePtr** value_ptrs) override { + void** value_ptrs) override { return Status::OK(); } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { return Status::OK(); } Status Contains(K key) override { return Status::OK(); } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { return Status::OK(); } Status Remove(K key) override { return Status::OK(); } Status BatchLookup(const K* keys, size_t size, - ValuePtr** value_ptrs) override { + void** value_ptrs) override { return Status::OK(); } Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return Status::OK(); } @@ -235,22 +235,20 @@ class GPUHashMapKV : public KVInterface { } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return Status::OK(); } int64 Size() const override { return 0; } - void SetTotalDims(int total_dims) override {} + void FreeValuePtr(void* value_ptr) override {} - void FreeValuePtr(ValuePtr* value_ptr) override {} - - Status Commit(K key, const ValuePtr* value_ptr) override { + Status Commit(K key, const void* value_ptr) override { return Status::OK(); } Status 
GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { return Status::OK(); } diff --git a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h index 581f1f1cfaf..1056f4bbd78 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h @@ -3,7 +3,6 @@ #if GOOGLE_CUDA #define EIGEN_USE_GPU -#include "tensorflow/core/framework/embedding/lockless_hash_map_cpu.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" @@ -14,9 +13,6 @@ namespace tensorflow { using se::DeviceMemoryBase; using se::Stream; -template -class ValuePtr; - template class CheckpointLoader; @@ -26,15 +22,17 @@ namespace embedding { template class HbmDramSsdStorage : public MultiTierStorage { public: - HbmDramSsdStorage(const StorageConfig& sc, Allocator* gpu_alloc, - Allocator* cpu_alloc, LayoutCreator* lc, const std::string& name) - : cpu_alloc_(cpu_alloc), gpu_alloc_(gpu_alloc), + HbmDramSsdStorage(const StorageConfig& sc, + Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name), dram_capacity_(-1) { - hbm_ = new HbmStorageWithCpuKv(sc, gpu_alloc_, lc); - dram_ = new DramStorage(sc, cpu_alloc_, lc, - new LocklessHashMapCPU(gpu_alloc_)); - ssd_ = new SsdHashStorage(sc, cpu_alloc_, lc); + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); + ssd_ = new SsdHashStorage(sc, dram_feat_desc_); } ~HbmDramSsdStorage() override { @@ -46,29 +44,20 @@ class HbmDramSsdStorage : public MultiTierStorage { TF_DISALLOW_COPY_AND_ASSIGN(HbmDramSsdStorage); - void 
SetAllocLen(int64 value_len, int slot_num) override { - while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. - Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + ssd_->Init(); - MultiTierStorage::cache_capacity_ = - Storage::storage_config_.size[0] - / (Storage::total_dims_ * sizeof(V)); + MultiTierStorage::cache_capacity_ = + Storage::storage_config_.size[0] + / (total_dim() * sizeof(V)); - dram_capacity_ = Storage::storage_config_.size[1] - / (Storage::total_dims_ * sizeof(V)); - MultiTierStorage::ready_eviction_ = true; - } - Storage::flag_.clear(std::memory_order_release); + dram_capacity_ = Storage::storage_config_.size[1] + / (total_dim() * sizeof(V)); + MultiTierStorage::ready_eviction_ = true; } - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = hbm_->Get(key, value_ptr); if (s.ok()) { return s; @@ -88,13 +77,12 @@ class HbmDramSsdStorage : public MultiTierStorage { void BatchGet(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) override { + void** value_ptr_list, + int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); - std::vector*>> + std::vector> ssd_value_ptr_list(num_worker_threads + 1); BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, @@ -102,20 +90,20 @@ class HbmDramSsdStorage : public MultiTierStorage { CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursor_list[0], - ssd_value_ptr_list[0], value_len); + ssd_value_ptr_list[0]); } void 
BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_fountd_cursor_list) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); - std::vector*>> + std::vector> ssd_value_ptr_list(num_worker_threads + 1); BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, @@ -124,70 +112,27 @@ class HbmDramSsdStorage : public MultiTierStorage { CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursor_list[0], - ssd_value_ptr_list[0], value_len); + ssd_value_ptr_list[0]); CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], value_len); } - void Insert(K key, ValuePtr* value_ptr) override { + void Insert(K key, void** value_ptr) override { hbm_->Insert(key, value_ptr); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { if (to_dram) { - dram_->Insert(key, value_ptr, alloc_len); + dram_->Insert(key, value_ptr); } else { - hbm_->Insert(key, value_ptr, alloc_len); + hbm_->Insert(key, value_ptr); } } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(size); - { - mutex_lock l(memory_pool_mu_); - gpu_value_ptr->SetPtr(embedding_mem_pool_->Allocate()); - *value_ptr = gpu_value_ptr; - } - s = hbm_->TryInsert(key, *value_ptr); - // Insert Failed - if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate((*value_ptr)->GetValue(0, 0)); - } - delete *value_ptr; - return hbm_->Get(key, value_ptr); - } else { - return s; - } - } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - 
Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - s = dram_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK; - return s; - } - s = ssd_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK_AND_DESTROY; - return s; - } - hbm_->Insert(key, value_ptr, size); - return Status::OK(); + Status GetOrCreate(K key, void** value_ptr) override { + LOG(FATAL)<<"Stroage with HBM only suppotrs batch APIs."; } void InitCache(embedding::CacheStrategy cache_strategy) override { @@ -195,66 +140,6 @@ class HbmDramSsdStorage : public MultiTierStorage { dram_cache_ = new LRUCache(); } - void CopyEmbeddingsFromCPUToGPU( - int total, const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, V* memcpy_buffer_gpu, - se::Stream* compute_stream, - EventMgr* event_mgr, - const DeviceBase::CpuWorkerThreads* worker_threads) override { - auto memcpy_buffer_cpu = TypedAllocator::Allocate(cpu_allocator(), - total * value_len, AllocationAttributes()); - int64* memory_index = new int64[total]; - int64 i = 0; - auto it = copyback_cursor.cbegin(); - { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursor.cend(); ++it, ++i) { - int64 j = *it & 0x0fffffffffffffff; - memory_index[i] = *it; - ValuePtr* gpu_value_ptr = - hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)memcpy_address[j] - sizeof(FixedLengthHeader), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - } - } - - auto do_work = [memory_index, memcpy_address, - memcpy_buffer_cpu, gpu_value_ptrs, - value_len, this] (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - int64 j = memory_index[i] & 0x0fffffffffffffff; - bool destroy_flag = (memory_index[i] >> 63) & 0x1; - 
memcpy(memcpy_buffer_cpu + i * value_len, - memcpy_address[j], value_len * sizeof(V)); - if (destroy_flag) { - ssd_->DestroyValuePtr(reinterpret_cast*>( - (char *)memcpy_address[j] - sizeof(FixedLengthHeader))); - } - } - }; - Shard(worker_threads->num_threads, worker_threads->workers, total, - 1000, do_work); - - DeviceMemoryBase gpu_dst_ptr( - memcpy_buffer_gpu, total * value_len * sizeof(V)); - compute_stream->ThenMemcpy( - &gpu_dst_ptr, memcpy_buffer_cpu, total * value_len * sizeof(V)); - SyncWithEventMgr(compute_stream, event_mgr); - TypedAllocator::Deallocate( - cpu_allocator(), memcpy_buffer_cpu, total * value_len); - delete[] memory_index; - } - Status Remove(K key) override { hbm_->Remove(key); dram_->Remove(key); @@ -311,25 +196,23 @@ class HbmDramSsdStorage : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_dram_key_list; - std::vector*> value_ptr_list, tmp_dram_value_list; + std::vector value_ptr_list, tmp_dram_value_list; TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); HbmValueIterator hbm_value_iter( key_list, value_ptr_list, - emb_config.emb_index, Storage::alloc_len_, - gpu_alloc_); + emb_config.emb_index, value_len, + gpu_alloc_, hbm_feat_desc_); - std::vector*> tmp_hbm_value_ptrs(value_ptr_list.size()); for (int64 i = 0; i < value_ptr_list.size(); i++) { - ValuePtr* value_ptr = hbm_->CreateValuePtr(value_len); - memcpy((char *)value_ptr->GetPtr(), - (char *)value_ptr_list[i]->GetPtr(), - sizeof(FixedLengthHeader)); - value_ptr->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - value_ptr->SetInitialized(emb_config.primary_emb_index); - tmp_hbm_value_ptrs[i] = value_ptr; - value_ptr_list[i] = value_ptr; + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq( + value_ptr, hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + 
value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); } TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, @@ -347,17 +230,24 @@ class HbmDramSsdStorage : public MultiTierStorage { { mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, &hbm_value_iter))); } - for (auto it: tmp_hbm_value_ptrs) { - delete it; + for (auto value_ptr: value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); + } } ssd_->Save(tensor_name, prefix, writer, emb_config, @@ -368,7 +258,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status DramToSsdBatchCommit(std::shared_ptr> keys) { MultiTierStorage::ReleaseValuePtrs(dram_value_ptr_out_of_date_, - dram_->alloc_); + dram_feat_desc_); mutex_lock l(*(ssd_->get_mutex())); mutex_lock l1(*(dram_->get_mutex())); @@ -380,7 +270,7 @@ class HbmDramSsdStorage : public MultiTierStorage { k_size = std::min(k_size, DramEvictionSize); K dram_evic_ids[DramEvictionSize]; size_t true_size = dram_cache_->get_evic_ids(dram_evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; for (int64 i = 0; i < true_size; ++i) { if (dram_->Get(dram_evic_ids[i], &value_ptr).ok()) { TF_CHECK_OK(ssd_->Commit(dram_evic_ids[i], value_ptr)); @@ -408,22 +298,31 @@ class HbmDramSsdStorage : public MultiTierStorage { k_size = std::min(k_size, EvictionSize); size_t true_size = MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; std::shared_ptr> keys(new std::vector()); - std::vector*> value_ptrs; + std::vector hbm_value_ptrs; + std::vector dram_value_ptrs; for (int64 i = 0; i < 
true_size; ++i) { if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { keys->emplace_back(evic_ids[i]); - value_ptrs.emplace_back(value_ptr); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); } } - dram_->BatchCommit(*keys, value_ptrs); - { - //Mutex with main thread - mutex_lock l_mem(memory_pool_mu_); - embedding_mem_pool_->Deallocate(value_ptrs); - } + + CopyEmbeddingFromHbmToDram( + hbm_value_ptrs, + dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + dram_->BatchCommit(*keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); for (auto it : *keys) { TF_CHECK_OK(hbm_->Remove(it)); } @@ -435,58 +334,14 @@ class HbmDramSsdStorage : public MultiTierStorage { } } - void CreateEmbeddingMemoryPool( - Allocator* alloc, - int64 value_len, - int64 block_size) override { - embedding_mem_pool_ = - new EmbeddingMemoryPool(alloc, value_len, block_size); - } - - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for (auto it : value_ptr_list) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = it->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - //Mutex with other ImportOps - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < num_of_value_ptrs; i++) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = value_ptr_list[i]->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, 
new_value_ptr, old_value_ptr); } protected: - void SetTotalDims(int64 total_dims) override { - dram_->SetTotalDims(total_dims); - ssd_->SetTotalDims(total_dims); - } - - void CopyToGpuValuePtr( - ValuePtr* gpu_ptr, - ValuePtr* cpu_ptr, - int64 size) { - V* cpu_data_address = cpu_ptr->GetValue(0, 0); - V* gpu_data_address = gpu_ptr->GetValue(0, 0); - cudaMemcpy(gpu_data_address, cpu_data_address, - size * sizeof(V), cudaMemcpyHostToDevice); - memcpy(gpu_ptr->GetPtr(), - cpu_ptr->GetPtr(), - sizeof(FixedLengthHeader)); + int total_dim() override { + return hbm_feat_desc_->total_dim(); } void Restore(const std::string& name_string, @@ -539,6 +394,10 @@ class HbmDramSsdStorage : public MultiTierStorage { (int64*)restore_buff.freq_buffer); return s; } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override {} private: void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { V* memcpy_buffer_cpu = new V[size * value_len]; @@ -551,46 +410,30 @@ class HbmDramSsdStorage : public MultiTierStorage { (V*)gpu_alloc_->AllocateRaw( Allocator::kAllocatorAlignment, size * sizeof(V*)); - ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; - ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; - { - //Mutex with other Import Ops - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < size; i++) { - dram_->Get(ids[i], &cpu_value_ptrs[i]); - gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - gpu_value_ptrs[i]->SetPtr(val_ptr); - memcpy((char *)gpu_value_ptrs[i]->GetPtr(), - (char *)cpu_value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], 
&gpu_value_ptrs[i]); } } //Split from above for loop for minize the cost of mutex lock //TODO: Speed up with intra parallelism - std::vector*> invalid_value_ptrs; + for (int64 i = 0; i < size; i++) { memcpy(memcpy_buffer_cpu + i * value_len, - cpu_value_ptrs[i]->GetValue(emb_index, - Storage::GetOffset(emb_index)), - value_len * sizeof(V)); - Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); - if (!s.ok()) { - invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); - hbm_->Get(ids[i], &gpu_value_ptrs[i]); - } - gpu_value_ptrs[i]->SetInitialized(emb_index); - value_address[i] = gpu_value_ptrs[i]->GetValue( - emb_index, Storage::GetOffset(emb_index)); + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); } cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, size * value_len * sizeof(V), cudaMemcpyHostToDevice); cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), cudaMemcpyHostToDevice); - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate(invalid_value_ptrs); - } int block_dim = 128; void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, (void*)&value_len, (void*)&size}; @@ -611,10 +454,10 @@ class HbmDramSsdStorage : public MultiTierStorage { void BatchGetValuePtrs( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, std::vector>& copyback_cursor_list, - std::vector*>>& ssd_value_ptr_list, + std::vector>& ssd_value_ptr_list, std::vector>* not_found_cursor_list = nullptr) { int num_worker_threads = ctx.worker_threads->num_threads; IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); @@ -688,39 +531,32 @@ class HbmDramSsdStorage : public MultiTierStorage { void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursors, - 
std::list*>& ssd_value_ptrs, - int64 value_len) { + std::list& ssd_value_ptrs) { int64 total = copyback_cursors.size(); - std::vector*> gpu_value_ptrs(total); + std::vector gpu_value_ptrs(total); std::vector copyback_keys(total); std::vector memory_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = copyback_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursors.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)value_ptr_list[j]->GetPtr(), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - copyback_keys[i] = keys[*it]; - } + int64 i = 0; + auto it = copyback_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion(gpu_value_ptr, + dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; } MultiTierStorage::CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursors, - memory_index, gpu_value_ptrs, value_len); + memory_index, gpu_value_ptrs, hbm_feat_desc_->total_dim(), + hbm_feat_desc_, dram_feat_desc_); //Insert copyback ids to hbm hash table. 
auto do_insert = [this, copyback_keys, gpu_value_ptrs, @@ -730,12 +566,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status s = hbm_->TryInsert( copyback_keys[i], gpu_value_ptrs[i]); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - gpu_value_ptrs[i]->GetValue(0, 0)); - } - delete gpu_value_ptrs[i]; + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); } } @@ -752,34 +583,31 @@ class HbmDramSsdStorage : public MultiTierStorage { void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& not_found_cursors, int64 value_len) { int64 total = not_found_cursors.size(); if (total > 0) { - std::vector*>> insert_pairs(total); + std::vector> insert_pairs(total); std::vector cursor_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = not_found_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != not_found_cursors.cend(); ++it, ++i) { - int64 j = *it; - cursor_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - value_ptr_list[j] = gpu_value_ptr; - insert_pairs[i].first = keys[j]; - insert_pairs[i].second = value_ptr_list[j]; - } + + int64 i = 0; + auto it = not_found_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; } + hbm_feat_desc_->SetDefaultValues( + keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, ctx.gpu_device); + //Insert copyback ids to hbm hash table. 
auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index] (int64 start, int64 limit) { @@ -787,12 +615,7 @@ class HbmDramSsdStorage : public MultiTierStorage { Status s = hbm_->TryInsert( insert_pairs[i].first, insert_pairs[i].second); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - insert_pairs[i].second->GetValue(0, 0)); - } - delete insert_pairs[i].second; + hbm_->DestroyValuePtr(insert_pairs[i].second); hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); } } @@ -804,29 +627,28 @@ class HbmDramSsdStorage : public MultiTierStorage { } void AddCopyBackFlagToValuePtr( - ValuePtr** value_ptr, CopyBackFlag copyback_flag) { + void** value_ptr, CopyBackFlag copyback_flag) { int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; tmp = ((int64)*value_ptr) | tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } - void RemoveCopyBackFlagInValuePtr(ValuePtr** value_ptr) { + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; tmp = ((int64)*value_ptr) & tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } private: HbmStorageWithCpuKv* hbm_ = nullptr; DramStorage* dram_ = nullptr; SsdHashStorage* ssd_ = nullptr; - EmbeddingMemoryPool* embedding_mem_pool_; Allocator* gpu_alloc_; - Allocator* cpu_alloc_; BatchCache* dram_cache_; int64 dram_capacity_; - std::deque*> dram_value_ptr_out_of_date_; - mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ + std::deque dram_value_ptr_out_of_date_; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; const int copyback_flag_offset_bits_ = 60; }; } // embedding diff --git a/tensorflow/core/framework/embedding/hbm_dram_storage.h b/tensorflow/core/framework/embedding/hbm_dram_storage.h index 518c39287e0..d058d95f05b 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_storage.h +++ 
b/tensorflow/core/framework/embedding/hbm_dram_storage.h @@ -17,7 +17,6 @@ limitations under the License. #if GOOGLE_CUDA #define EIGEN_USE_GPU -#include "tensorflow/core/framework/embedding/lockless_hash_map_cpu.h" #include "tensorflow/core/framework/embedding/multi_tier_storage.h" #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/framework/embedding/hbm_storage_iterator.h" @@ -29,9 +28,6 @@ namespace tensorflow { using se::DeviceMemoryBase; using se::Stream; -template -class ValuePtr; - template class CheckpointLoader; @@ -41,27 +37,27 @@ namespace embedding { template class HbmDramStorage : public MultiTierStorage { public: - HbmDramStorage(const StorageConfig& sc, Allocator* gpu_alloc, - Allocator* cpu_alloc, LayoutCreator* lc, - const std::string& name) - : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name) { - hbm_ = new HbmStorageWithCpuKv(sc, gpu_alloc, lc); - StorageConfig storage_config = StorageConfig(); - storage_config.layout_type = LayoutType::NORMAL_CONTIGUOUS; - dram_ = new DramStorage(sc, cpu_alloc, - LayoutCreatorFactory::Create(storage_config), - new LocklessHashMapCPU(gpu_alloc)); + HbmDramStorage(const StorageConfig& sc, + Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), + MultiTierStorage(sc, name) { + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); } ~HbmDramStorage() override { MultiTierStorage::DeleteFromEvictionManager(); delete hbm_; delete dram_; + delete dram_feat_desc_; } TF_DISALLOW_COPY_AND_ASSIGN(HbmDramStorage); - Status Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { Status s = hbm_->Get(key, value_ptr); if (s.ok()) { return s; @@ -76,9 +72,8 @@ class HbmDramStorage : public MultiTierStorage { void BatchGet(const EmbeddingVarContext& ctx, const K* keys, - 
ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) override { + void** value_ptr_list, + int64 num_of_keys) override { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> copyback_cursor_list(num_worker_threads + 1); @@ -87,18 +82,17 @@ class HbmDramStorage : public MultiTierStorage { copyback_cursor_list); CopyEmbeddingsFromDramToHbm( - ctx, keys, value_ptr_list, copyback_cursor_list[0], - value_len); + ctx, keys, value_ptr_list, copyback_cursor_list[0]); } - void Insert(K key, ValuePtr* value_ptr) override { + void Insert(K key, void** value_ptr) override { hbm_->Insert(key, value_ptr); } void BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_fountd_cursor_list) override { @@ -110,115 +104,22 @@ class HbmDramStorage : public MultiTierStorage { copyback_cursor_list, ¬_fountd_cursor_list); CopyEmbeddingsFromDramToHbm( - ctx, keys, value_ptr_list, copyback_cursor_list[0], - value_len); - + ctx, keys, value_ptr_list, copyback_cursor_list[0]); CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], value_len); } - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) override { if (to_dram) { - dram_->Insert(key, value_ptr, alloc_len); + dram_->CreateAndInsert(key, value_ptr); } else { - hbm_->Insert(key, value_ptr, alloc_len); + hbm_->CreateAndInsert(key, value_ptr); } } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(size); - { - mutex_lock l(memory_pool_mu_); - gpu_value_ptr->SetPtr(embedding_mem_pool_->Allocate()); - *value_ptr = gpu_value_ptr; - } - s = hbm_->TryInsert(key, *value_ptr); - if (s.ok()) { - return s; - } - // Insert Failed, 
key already exist - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate((*value_ptr)->GetValue(0, 0)); - } - delete *value_ptr; - return hbm_->Get(key, value_ptr); - } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - Status s = hbm_->Get(key, value_ptr); - if (s.ok()) { - return s; - } - s = dram_->Get(key, value_ptr); - if (s.ok()) { - need_copyback = COPYBACK; - return s; - } - - hbm_->Insert(key, value_ptr, size); - return Status::OK(); - } - - void CopyEmbeddingsFromCPUToGPU( - int total, const K* keys, - const std::list& copyback_cursor, - V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, V* memcpy_buffer_gpu, - se::Stream* compute_stream, - EventMgr* event_mgr, - const DeviceBase::CpuWorkerThreads* worker_threads) override { - auto memcpy_buffer_cpu = TypedAllocator::Allocate(cpu_allocator(), - total * value_len, AllocationAttributes()); - int64* memory_index = new int64[total]; - int64 i = 0; - auto it = copyback_cursor.cbegin(); - { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursor.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)memcpy_address[j] - sizeof(FixedLengthHeader), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - } - } - //Split from above for loop for minize the cost of mutex lock - auto do_work = [memory_index, memcpy_address, - memcpy_buffer_cpu, gpu_value_ptrs, - value_len, this] (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - int j = memory_index[i]; - memcpy(memcpy_buffer_cpu + i * value_len, - memcpy_address[j], value_len * sizeof(V)); - } - }; - 
Shard(worker_threads->num_threads, worker_threads->workers, total, - 1000, do_work); - DeviceMemoryBase gpu_dst_ptr( - memcpy_buffer_gpu, total * value_len * sizeof(V)); - compute_stream->ThenMemcpy( - &gpu_dst_ptr, memcpy_buffer_cpu, total * value_len * sizeof(V)); - SyncWithEventMgr(compute_stream, event_mgr); - TypedAllocator::Deallocate( - cpu_allocator(), memcpy_buffer_cpu, total * value_len); - delete[] memory_index; + Status GetOrCreate(K key, void** value_ptr) override { + LOG(FATAL)<<"Stroage with HBM only suppotrs batch APIs."; } Status Remove(K key) override { @@ -270,25 +171,23 @@ class HbmDramStorage : public MultiTierStorage { int64 value_len, V* default_value) override { std::vector key_list, tmp_dram_key_list; - std::vector*> value_ptr_list, tmp_dram_value_list; + std::vector value_ptr_list, tmp_dram_value_list; TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); HbmValueIterator hbm_value_iter( key_list, value_ptr_list, - emb_config.emb_index, Storage::alloc_len_, - gpu_alloc_); - - std::vector*> tmp_hbm_value_ptrs(value_ptr_list.size()); + emb_config.emb_index, value_len, + gpu_alloc_, hbm_feat_desc_); + for (int64 i = 0; i < value_ptr_list.size(); i++) { - ValuePtr* value_ptr = hbm_->CreateValuePtr(value_len); - memcpy((char *)value_ptr->GetPtr(), - (char *)value_ptr_list[i]->GetPtr(), - sizeof(FixedLengthHeader)); - value_ptr->SetPtr((V*)ValuePosition::NOT_IN_DRAM); - value_ptr->SetInitialized(emb_config.primary_emb_index); - tmp_hbm_value_ptrs[i] = value_ptr; - value_ptr_list[i] = value_ptr; + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq( + value_ptr, hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); } 
TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, @@ -306,54 +205,26 @@ class HbmDramStorage : public MultiTierStorage { { mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list, value_ptr_list, + feat_desc_list, &hbm_value_iter))); } - for (auto it: tmp_hbm_value_ptrs) { - delete it; - } - return Status::OK(); - } - - void CreateEmbeddingMemoryPool( - Allocator* alloc, - int64 value_len, - int64 block_size) override { - embedding_mem_pool_ = - new EmbeddingMemoryPool(alloc, value_len, block_size); - } - - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for (auto it : value_ptr_list) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = it->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - } - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - //Mutex with other ImportOps - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < num_of_value_ptrs; i++) { - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = value_ptr_list[i]->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); + for (auto value_ptr: value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); } } + return Status::OK(); } void BatchEviction() override { @@ -372,22 +243,31 @@ class HbmDramStorage : public MultiTierStorage { k_size = std::min(k_size, EvictionSize); size_t true_size = MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); - ValuePtr* value_ptr; + void* value_ptr; std::vector keys; - std::vector*> value_ptrs; + std::vector 
hbm_value_ptrs; + std::vector dram_value_ptrs; for (int64 i = 0; i < true_size; ++i) { if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { keys.emplace_back(evic_ids[i]); - value_ptrs.emplace_back(value_ptr); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); } } - dram_->BatchCommit(keys, value_ptrs); - { - //Mutex with main thread - mutex_lock l_mem(memory_pool_mu_); - embedding_mem_pool_->Deallocate(value_ptrs); - } + + CopyEmbeddingFromHbmToDram( + hbm_value_ptrs, + dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + dram_->BatchCommit(keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); for (auto it : keys) { TF_CHECK_OK(hbm_->Remove(it)); } @@ -430,6 +310,16 @@ class HbmDramStorage : public MultiTierStorage { } } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + MultiTierStorage::Init(); + } + protected: Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, @@ -447,14 +337,14 @@ class HbmDramStorage : public MultiTierStorage { return s; } - void SetTotalDims(int64 total_dims) override { - dram_->SetTotalDims(total_dims); + int total_dim() override { + return hbm_feat_desc_->total_dim(); } private: void BatchGetValuePtrs( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, std::vector>& copyback_cursor_list, std::vector>* not_found_cursor_list = nullptr) { @@ -522,38 +412,31 @@ class HbmDramStorage : public MultiTierStorage { void CopyEmbeddingsFromDramToHbm(const 
EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, - std::list& copyback_cursors, - int64 value_len) { + void** value_ptr_list, + std::list& copyback_cursors) { int64 total = copyback_cursors.size(); - std::vector*> gpu_value_ptrs(total); + std::vector gpu_value_ptrs(total); std::vector copyback_keys(total); std::vector memory_index(total); //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = copyback_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != copyback_cursors.cend(); ++it, ++i) { - int64 j = *it; - memory_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - memcpy((char *)gpu_value_ptr->GetPtr(), - (char *)value_ptr_list[j]->GetPtr(), - sizeof(FixedLengthHeader)); - gpu_value_ptrs[i] = gpu_value_ptr; - copyback_keys[i] = keys[*it]; - } + int64 i = 0; + auto it = copyback_cursors.cbegin(); + //Mutex with eviction thread + for ( ; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion(gpu_value_ptr, + dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; } MultiTierStorage::CopyEmbeddingsFromDramToHbm( ctx, keys, value_ptr_list, copyback_cursors, - memory_index, gpu_value_ptrs, value_len); + memory_index, gpu_value_ptrs, hbm_feat_desc_->total_dim(), + hbm_feat_desc_, dram_feat_desc_); //Insert copyback ids to hbm hash table. 
auto do_insert = [this, copyback_keys, gpu_value_ptrs, @@ -563,12 +446,7 @@ class HbmDramStorage : public MultiTierStorage { Status s = hbm_->TryInsert( copyback_keys[i], gpu_value_ptrs[i]); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - gpu_value_ptrs[i]->GetValue(0, 0)); - } - delete gpu_value_ptrs[i]; + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); } } @@ -580,34 +458,29 @@ class HbmDramStorage : public MultiTierStorage { void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& not_found_cursors, int64 value_len) { int64 total = not_found_cursors.size(); if (total > 0) { - std::vector*>> insert_pairs(total); + std::vector> insert_pairs(total); std::vector cursor_index(total); - //Create Hbm ValuePtrs. - { - int64 i = 0; - auto it = not_found_cursors.cbegin(); - //Mutex with eviction thread - mutex_lock l(memory_pool_mu_); - for ( ; it != not_found_cursors.cend(); ++it, ++i) { - int64 j = *it; - cursor_index[i] = j; - ValuePtr* gpu_value_ptr = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - bool flag = gpu_value_ptr->SetPtr(val_ptr); - if (!flag) { - embedding_mem_pool_->Deallocate(val_ptr); - } - value_ptr_list[j] = gpu_value_ptr; - insert_pairs[i].first = keys[j]; - insert_pairs[i].second = value_ptr_list[j]; - } + //Create Hbm ValuePtrs. + int64 i = 0; + auto it = not_found_cursors.cbegin(); + for ( ; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; } + hbm_feat_desc_->SetDefaultValues( + keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, ctx.gpu_device); + //Insert copyback ids to hbm hash table. 
auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index] (int64 start, int64 limit) { @@ -615,12 +488,7 @@ class HbmDramStorage : public MultiTierStorage { Status s = hbm_->TryInsert( insert_pairs[i].first, insert_pairs[i].second); if (!s.ok()) { - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate( - insert_pairs[i].second->GetValue(0, 0)); - } - delete insert_pairs[i].second; + hbm_->DestroyValuePtr(insert_pairs[i].second); hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); } } @@ -632,16 +500,22 @@ class HbmDramStorage : public MultiTierStorage { } void AddCopyBackFlagToValuePtr( - ValuePtr** value_ptr, CopyBackFlag copyback_flag) { + void** value_ptr, CopyBackFlag copyback_flag) { int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; tmp = ((int64)*value_ptr) | tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); } - void RemoveCopyBackFlagInValuePtr(ValuePtr** value_ptr) { + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; tmp = ((int64)*value_ptr) & tmp; - *value_ptr = reinterpret_cast*>(tmp); + *value_ptr = reinterpret_cast(tmp); + } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); } void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { @@ -655,45 +529,30 @@ class HbmDramStorage : public MultiTierStorage { (V*)gpu_alloc_->AllocateRaw( Allocator::kAllocatorAlignment, size * sizeof(V*)); - ValuePtr** gpu_value_ptrs = new ValuePtr*[size]; - ValuePtr** cpu_value_ptrs = new ValuePtr*[size]; - { - //Mutex with other Import Ops - mutex_lock l(memory_pool_mu_); - for (int64 i = 0; i < size; i++) { - dram_->Get(ids[i], &cpu_value_ptrs[i]); - gpu_value_ptrs[i] = hbm_->CreateValuePtr(value_len); - V* val_ptr = embedding_mem_pool_->Allocate(); - gpu_value_ptrs[i]->SetPtr(val_ptr); - memcpy((char 
*)gpu_value_ptrs[i]->GetPtr(), - (char *)cpu_value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], &gpu_value_ptrs[i]); } } //Split from above for loop for minize the cost of mutex lock //TODO: Speed up with intra parallelism - std::vector*> invalid_value_ptrs; + for (int64 i = 0; i < size; i++) { memcpy(memcpy_buffer_cpu + i * value_len, - cpu_value_ptrs[i]->GetValue(emb_index, - Storage::GetOffset(emb_index)), value_len * sizeof(V)); - Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); - if (!s.ok()) { - invalid_value_ptrs.emplace_back(gpu_value_ptrs[i]); - hbm_->Get(ids[i], &gpu_value_ptrs[i]); - } - gpu_value_ptrs[i]->SetInitialized(emb_index); - value_address[i] = gpu_value_ptrs[i]->GetValue( - emb_index, Storage::GetOffset(emb_index)); + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); } cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, size * value_len * sizeof(V), cudaMemcpyHostToDevice); cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), cudaMemcpyHostToDevice); - { - mutex_lock l(memory_pool_mu_); - embedding_mem_pool_->Deallocate(invalid_value_ptrs); - } int block_dim = 128; void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, (void*)&value_len, (void*)&size}; @@ -714,9 +573,9 @@ class HbmDramStorage : public MultiTierStorage { private: HbmStorageWithCpuKv* hbm_ = nullptr; DramStorage* dram_ = nullptr; - EmbeddingMemoryPool* embedding_mem_pool_ = nullptr; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; 
Allocator* gpu_alloc_; - mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ const int copyback_flag_offset_bits_ = 60; }; } // embedding diff --git a/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h b/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h new file mode 100644 index 00000000000..a3603a61550 --- /dev/null +++ b/tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h @@ -0,0 +1,122 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" +#include "tensorflow/core/framework/embedding/embedding_memory_pool.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { +namespace embedding { +template +class NormalFeatureDescriptorImpl; + +template +class HbmMultiTierFeatureDescriptorImpl + : public FeatureDescriptorImpl { + public: + HbmMultiTierFeatureDescriptorImpl( + Allocator* alloc, int64 slot_num, + bool need_record_freq, + bool need_record_version) + : dram_alloc_bytes_(sizeof(V*)), + hbm_alloc_(alloc), + dram_alloc_(ev_allocator()), + FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&dram_alloc_bytes_); + } + + ~HbmMultiTierFeatureDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + bool is_compute_alloc_bytes = + FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + if (is_compute_alloc_bytes) { + FeatureDescriptorImpl::ComputeAllocBytes(&hbm_alloc_bytes_); + embedding_mem_pool_.reset( + new EmbeddingMemoryPool(hbm_alloc_, + hbm_alloc_bytes_ / sizeof(V), + 1024 * 1024 * 64)); + } + return is_compute_alloc_bytes; + } + + V* GetEmbedding(void *val, int emb_index) override { + return *((V**)val) + + FeatureDescriptorImpl::slot_infos_[emb_index].embedding_offset; + } + + void* Allocate() override { + void* val = dram_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, dram_alloc_bytes_); + mutex_lock l(memory_pool_mu_); + *((V**)val) = embedding_mem_pool_->Allocate(); + FeatureDescriptorImpl::InitFreqAndVersion(val); + return val; + } + + void 
Deallocate(void* val) override { + mutex_lock l(memory_pool_mu_); + embedding_mem_pool_->Deallocate(*((V**)val)); + dram_alloc_->DeallocateRaw(val); + } + + void Deallocate(const std::vector& value_ptrs) override { + mutex_lock l(memory_pool_mu_); + for (auto ptr: value_ptrs) { + embedding_mem_pool_->Deallocate(*((V**)ptr)); + dram_alloc_->DeallocateRaw(ptr); + } + } + void SetDefaultValue(void* val, int64 key) override { + LOG(FATAL)<<"Can't call SetDefaultValue(void* val, int64 key," + <<"int default_value_len) in HbmMultiTierFeatureDescriptor."; + } + + void SetAllocator(Allocator* alloc) override { + hbm_alloc_ = alloc; + } + + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device); + + int data_bytes() override { + return dram_alloc_bytes_; + } + public: + friend class NormalFeatureDescriptorImpl; + protected: + int dram_alloc_bytes_; + int hbm_alloc_bytes_ = 0; + mutex memory_pool_mu_; //ensure thread safety of embedding_mem_pool_ + Allocator* hbm_alloc_; + Allocator* dram_alloc_; + std::unique_ptr> embedding_mem_pool_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/hbm_storage_iterator.h b/tensorflow/core/framework/embedding/hbm_storage_iterator.h index 36d331e74aa..31dc4459a13 100644 --- a/tensorflow/core/framework/embedding/hbm_storage_iterator.h +++ b/tensorflow/core/framework/embedding/hbm_storage_iterator.h @@ -28,10 +28,11 @@ class HbmValueIterator: public ValueIterator { public: HbmValueIterator( const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, int64 emb_index, int64 value_len, - Allocator* alloc) + Allocator* alloc, + FeatureDescriptor* feat_desc) : value_len_(value_len), alloc_(alloc) { int64 emb_offset = value_len_ * 
emb_index; @@ -40,7 +41,7 @@ class HbmValueIterator: public ValueIterator { for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { if (key_list[i] % kSavedPartitionNum == part_id) { value_parts_vec[part_id].emplace_back( - value_ptr_list[i]->GetValue(emb_index, emb_offset)); + feat_desc->GetEmbedding(value_ptr_list[i], emb_index)); break; } } diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index 5d1f20b581a..3659187c825 100644 --- a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ #include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { @@ -24,9 +25,6 @@ namespace { const char* kInferenceMode = "INFERENCE_MODE"; } -template -class ValuePtr; - template class GPUHashTable; @@ -43,19 +41,19 @@ template class KVInterface { public: virtual ~KVInterface() {} - virtual Status Lookup(K key, ValuePtr** value_ptr) = 0; + virtual Status Lookup(K key, void** value_ptr) = 0; virtual Status Contains(K key) = 0; - virtual Status Insert(K key, const ValuePtr* value_ptr) = 0; + virtual Status Insert(K key, const void* value_ptr) = 0; virtual Status Remove(K key) = 0; virtual Status BatchLookup(const K* keys, size_t size, - ValuePtr** value_ptrs) { + void** value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookup in KVInterface."); } // KV Batch Insert virtual Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchInsert in KVInterface."); } @@ -66,27 +64,30 @@ class KVInterface { } virtual Status BatchLookupOrCreate(const K* keys, size_t size, - ValuePtr** 
value_ptrs) { + void** value_ptrs) { return Status(error::Code::UNIMPLEMENTED, "Unimplemented for BatchLookupOrInsert in KVInterface."); } + virtual void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) { + LOG(FATAL)<<"Unimplemented for UpdateValuePtr in KVInterface."; + } + virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) = 0; + const std::vector& value_ptrs) = 0; // KV Size virtual int64 Size() const = 0; - virtual void SetTotalDims(int total_dims) {} - - virtual void FreeValuePtr(ValuePtr* value_ptr) {} + virtual void FreeValuePtr(void* value_ptr) {} - virtual Status Commit(K key, const ValuePtr* value_ptr) { + virtual Status Commit(K key, const void* value_ptr) { return Status::OK(); } virtual Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) = 0; + std::vector* value_ptr_list) = 0; virtual std::string DebugString() const = 0; diff --git a/tensorflow/core/framework/embedding/l2weight_shrink_policy.h b/tensorflow/core/framework/embedding/l2weight_shrink_policy.h index 2af6b58f94b..9b0ea8aba3f 100644 --- a/tensorflow/core/framework/embedding/l2weight_shrink_policy.h +++ b/tensorflow/core/framework/embedding/l2weight_shrink_policy.h @@ -19,28 +19,23 @@ limitations under the License. 
namespace tensorflow { -template -class ValuePtr; - namespace embedding { template class L2WeightShrinkPolicy : public ShrinkPolicy { public: L2WeightShrinkPolicy(float l2_weight_threshold, int64 index, - int64 offset, - Allocator* alloc, + FeatureDescriptor* feat_desc, KVInterface* kv) : index_(index), - offset_(offset), kv_(kv), l2_weight_threshold_(l2_weight_threshold), - ShrinkPolicy(alloc) {} + ShrinkPolicy(feat_desc) {} TF_DISALLOW_COPY_AND_ASSIGN(L2WeightShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override { ShrinkPolicy::ReleaseValuePtrs(); FilterToDelete(shrink_args.value_len, @@ -50,9 +45,9 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { private: void FilterToDelete(int64 value_len, std::vector& key_list, - std::vector*>& value_list) { + std::vector& value_list) { for (int64 i = 0; i < key_list.size(); ++i) { - V* val = value_list[i]->GetValue(index_, offset_); + V* val = ShrinkPolicy::feat_desc_->GetEmbedding(value_list[i], index_); if (val != nullptr) { V l2_weight = (V)0.0; for (int64 j = 0; j < value_len; j++) { @@ -61,7 +56,7 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { l2_weight *= (V)0.5; if (l2_weight < (V)l2_weight_threshold_) { kv_->Remove(key_list[i]); - value_list[i] = (ValuePtr*)ValuePtrStatus::IS_DELETED; + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; ShrinkPolicy::EmplacePointer(value_list[i]); } } @@ -70,7 +65,7 @@ class L2WeightShrinkPolicy : public ShrinkPolicy { private: int64 index_; - int64 offset_; + //int64 offset_; KVInterface* kv_; float l2_weight_threshold_; }; diff --git a/tensorflow/core/framework/embedding/layout_creator.h b/tensorflow/core/framework/embedding/layout_creator.h deleted file mode 100644 index 07d50451bf0..00000000000 --- a/tensorflow/core/framework/embedding/layout_creator.h +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright 2022 The DeepRec Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -======================================================================*/ -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ - -#include "tensorflow/core/framework/embedding/cache.h" -#include "tensorflow/core/framework/embedding/config.pb.h" -#include "tensorflow/core/framework/embedding/storage_config.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { -template -class ValuePtr; - -namespace embedding { -template -class LayoutCreator { - public: - virtual ValuePtr* Create(Allocator* alloc, size_t size) = 0; -}; - -template -class NormalLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalValuePtr(alloc, size); - } -}; - -template -class LightLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new LightValuePtr(alloc, size); - } -}; - -template -class NormalContiguousLayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalContiguousValuePtr(alloc, size); - } -}; - -template -class NormalContiguousGPULayoutCreator : public LayoutCreator { - public: - ValuePtr* Create(Allocator* alloc, size_t size) override { - return new NormalGPUValuePtr(alloc, size); - } -}; - -template -class CompactLayoutCreator : public LayoutCreator { - public: 
- ValuePtr* Create(Allocator* alloc, size_t size) override { - return new CompactValuePtr(alloc, size); - } -}; - -class LayoutCreatorFactory { - public: - template - static LayoutCreator* Create(const StorageConfig& sc) { - switch (sc.layout_type) { - case LayoutType::NORMAL: - static NormalLayoutCreator normal_creator; - return &normal_creator; - case LayoutType::LIGHT: - static LightLayoutCreator light_creator; - return &light_creator; - case LayoutType::NORMAL_CONTIGUOUS: - static NormalContiguousLayoutCreator normal_contiguous_creator; - return &normal_contiguous_creator; - case LayoutType::NORMAL_CONTIGUOUS_GPU: - static NormalContiguousGPULayoutCreator - normal_contiguous_gpu_creator; - return &normal_contiguous_gpu_creator; - case LayoutType::COMPACT: - static CompactLayoutCreator compact_creator; - return &compact_creator; - default: - static NormalLayoutCreator default_creator; - return &default_creator; - } - } -}; -} // embedding -} // tensorflow - -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LAYOUT_CREATOR_H_ diff --git a/tensorflow/core/framework/embedding/leveldb_kv.h b/tensorflow/core/framework/embedding/leveldb_kv.h index 8ea1fa63fc2..e488ab3776d 100644 --- a/tensorflow/core/framework/embedding/leveldb_kv.h +++ b/tensorflow/core/framework/embedding/leveldb_kv.h @@ -17,9 +17,7 @@ limitations under the License. 
#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LEVELDB_KV_H_ #include "tensorflow/core/lib/io/path.h" - #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/lib/core/status.h" #include "leveldb/db.h" @@ -35,9 +33,6 @@ using leveldb::WriteBatch; using leveldb::WriteOptions; namespace tensorflow { -template -class ValuePtr; - namespace embedding { template @@ -76,28 +71,21 @@ class SizeCounter { template class LevelDBKV : public KVInterface { public: - LevelDBKV(std::string path) { + LevelDBKV(std::string path, FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { path_ = io::JoinPath(path, "level_db_" + std::to_string(Env::Default()->NowMicros()));; options_.create_if_missing = true; leveldb::Status s = leveldb::DB::Open(options_, path_, &db_); CHECK(s.ok()); counter_ = new SizeCounter(8); - new_value_ptr_fn_ = [] (size_t size) { - return new NormalContiguousValuePtr(ev_allocator(), size); - }; - total_dims_ = 0; - } - - void SetTotalDims(int total_dims) { - total_dims_ = total_dims; } ~LevelDBKV() override { delete db_; } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { std::string val_str; leveldb::Slice db_key((char*)(&key), sizeof(void*)); leveldb::ReadOptions options; @@ -106,8 +94,8 @@ class LevelDBKV : public KVInterface { return errors::NotFound( "Unable to find Key: ", key, " in LevelDB."); } else { - ValuePtr* val = new_value_ptr_fn_(total_dims_); - memcpy((int64 *)(val->GetPtr()), &val_str[0], val_str.length()); + void* val = feat_desc_->Allocate(); + memcpy((int64 *)val, &val_str[0], val_str.length()); *value_ptr = val; return Status::OK(); } @@ -126,22 +114,22 @@ class LevelDBKV : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { counter_->add(key, 1); return Status::OK(); } Status BatchInsert(const 
std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return BatchCommit(keys, value_ptrs); } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { WriteBatch batch; for (int i = 0; i < keys.size(); i++) { - std::string value_res((char*)value_ptrs[i]->GetPtr(), - sizeof(FixedLengthHeader) + total_dims_ * sizeof(V)); + std::string value_res((char*)value_ptrs[i], + feat_desc_->data_bytes()); leveldb::Slice db_key((char*)(&keys[i]), sizeof(void*)); batch.Put(db_key, value_res); delete value_ptrs[i]; @@ -150,9 +138,9 @@ class LevelDBKV : public KVInterface { return Status::OK(); } - Status Commit(K key, const ValuePtr* value_ptr) override { - std::string value_res((char*)value_ptr->GetPtr(), - sizeof(FixedLengthHeader) + total_dims_ * sizeof(V)); + Status Commit(K key, const void* value_ptr) override { + std::string value_res((char*)value_ptr, + feat_desc_->data_bytes()); leveldb::Slice db_key((char*)(&key), sizeof(void*)); leveldb::Status s = db_->Put(WriteOptions(), db_key, value_res); if (!s.ok()){ @@ -176,22 +164,32 @@ class LevelDBKV : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { ReadOptions options; options.snapshot = db_->GetSnapshot(); leveldb::Iterator* it = db_->NewIterator(options); + void* dram_value_ptr = feat_desc_->Allocate(); for (it->SeekToFirst(); it->Valid(); it->Next()) { K key; memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); key_list->emplace_back(key); - ValuePtr* value_ptr = - new NormalGPUValuePtr(ev_allocator(), 1); - memcpy((char *)value_ptr->GetPtr(), + FeatureDescriptor hbm_feat_desc( + 1, 1, ev_allocator()/*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc.data_bytes()); + 
memcpy(dram_value_ptr, it->value().ToString().data(), - sizeof(FixedLengthHeader)); + feat_desc_->data_bytes()); + hbm_feat_desc.SetFreq( + value_ptr, feat_desc_->GetFreq(dram_value_ptr)); + hbm_feat_desc.UpdateVersion( + value_ptr, feat_desc_->GetVersion(dram_value_ptr)); value_ptr_list->emplace_back(value_ptr); } delete it; + feat_desc_->Deallocate(dram_value_ptr); return Status::OK(); } @@ -199,8 +197,8 @@ class LevelDBKV : public KVInterface { return counter_->size(); } - void FreeValuePtr(ValuePtr* value_ptr) override { - delete value_ptr; + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); } std::string DebugString() const override{ @@ -212,8 +210,7 @@ class LevelDBKV : public KVInterface { SizeCounter* counter_; Options options_; std::string path_; - std::function*(size_t)> new_value_ptr_fn_; - int total_dims_; + FeatureDescriptor* feat_desc_; }; template @@ -223,10 +220,12 @@ class DBValueIterator: public ValueIterator { const std::vector& key_list, int64 emb_index, int64 value_len, - LevelDBKV* leveldb_kv) + LevelDBKV* leveldb_kv, + FeatureDescriptor* feat_desc) : value_len_(value_len), emb_index_(emb_index), - leveldb_kv_(leveldb_kv) { + leveldb_kv_(leveldb_kv), + feat_desc_(feat_desc) { int64 emb_offset = value_len_ * emb_index; std::vector> keys_parts_vec(kSavedPartitionNum); for (int64 i = 0; i < key_list.size(); i++) { @@ -251,8 +250,7 @@ class DBValueIterator: public ValueIterator { V* Next() { if (value_ptr_ != nullptr) { - value_ptr_->Destroy(ev_allocator()); - delete value_ptr_; + feat_desc_->Deallocate(value_ptr_); } K key = *(keys_iter_++); @@ -260,16 +258,17 @@ class DBValueIterator: public ValueIterator { if (!s.ok()) { LOG(FATAL)<<"Not found value in LevelDB when Save."; } - return value_ptr_->GetValue(emb_index_, value_len_ * emb_index_); + return feat_desc_->GetEmbedding(value_ptr_, emb_index_); } private: int64 value_len_; int64 emb_index_; LevelDBKV* leveldb_kv_; + FeatureDescriptor* feat_desc_; std::list 
keys_; typename std::list::const_iterator keys_iter_; - ValuePtr* value_ptr_ = nullptr; + void* value_ptr_ = nullptr; int64 key_cursor_ = 0; }; diff --git a/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h b/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h deleted file mode 100644 index 8dcea81d4a1..00000000000 --- a/tensorflow/core/framework/embedding/lockless_hash_map_cpu.h +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright 2022 The DeepRec Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-=======================================================================*/ - -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ -#if GOOGLE_CUDA -#define EIGEN_USE_GPU - -#include "sparsehash/dense_hash_map_lockless" -#include "tensorflow/core/framework/embedding/batch.h" -#include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/platform/stream_executor.h" - -namespace tensorflow { -using se::DeviceMemoryBase; -using se::Stream; - -namespace embedding { - -template -class LocklessHashMapCPU : public KVInterface { - public: - LocklessHashMapCPU(Allocator* gpu_alloc): gpu_alloc_(gpu_alloc) { - hash_map_.max_load_factor(0.8); - hash_map_.set_empty_key_and_value(EMPTY_KEY_, nullptr); - hash_map_.set_counternum(16); - hash_map_.set_deleted_key(DELETED_KEY_); - cudaEventCreate(&is_finish_); - } - - ~LocklessHashMapCPU() override { - cudaEventDestroy(is_finish_); - } - - Status Lookup(K key, ValuePtr** value_ptr) override { - auto iter = hash_map_.find_wait_free(key); - if (iter.first == EMPTY_KEY_) { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } else { - *value_ptr = iter.second; - return Status::OK(); - } - } - - Status Contains(K key) override { - auto iter = hash_map_.find_wait_free(key); - if (iter.first == EMPTY_KEY_) { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } else { - return Status::OK(); - } - } - - Status Insert(K key, const ValuePtr* value_ptr) override { - auto iter = hash_map_.insert_lockless( - std::move(std::pair*>(key, - const_cast*>(value_ptr)))); - // insert fail, exist key - if ((*(iter.first)).second != value_ptr){ - return errors::AlreadyExists( - "already exists Key: ", key, " in LocklessHashMap."); - } else { - return Status::OK(); - } - } - 
- // Other Method - int64 Size() const override { - return hash_map_.size_lockless(); - } - - // Remove KV - Status Remove(K key) override { - if (hash_map_.erase_lockless(key)) { - return Status::OK(); - } else { - return errors::NotFound( - "Unable to find Key: ", key, " in LocklessHashMap."); - } - } - - void SetTotalDims(int total_dims) override { - total_dims_ = total_dims; - } - - void AppendToValuePtrQueue(ValuePtr* old_value_ptr) { - //A parameter that can be adjusted in the future - if (value_ptr_out_of_date_.size() > CAP_INVALID_VALUEPTR) { - ValuePtr* value_ptr = value_ptr_out_of_date_.front(); - delete value_ptr; - value_ptr_out_of_date_.pop_front(); - } - value_ptr_out_of_date_.emplace_back(old_value_ptr); - } - - Status Commit(K key, const ValuePtr* value_ptr) override { - ValuePtr* cpu_value_ptr = - new NormalContiguousValuePtr(ev_allocator(), total_dims_); - cudaMemcpy((char *)cpu_value_ptr->GetPtr() + sizeof(FixedLengthHeader), - *(char **)((char*)value_ptr->GetPtr() + sizeof(FixedLengthHeader)), - total_dims_ * sizeof(V), - cudaMemcpyDeviceToHost); - memcpy((char *)cpu_value_ptr->GetPtr(), - (char*)value_ptr->GetPtr(), sizeof(FixedLengthHeader)); - auto iter = hash_map_.insert_lockless(std::move( - std::pair*>(key, - const_cast*>(cpu_value_ptr)))); - if ((*(iter.first)).second != cpu_value_ptr) { - AppendToValuePtrQueue((*(iter.first)).second); - (*(iter.first)).second = cpu_value_ptr; - } - return Status::OK(); - } - - Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { - int batch_size = keys.size(); - Allocator* cpu_alloc = cpu_allocator(); - V** value_address = (V **)cpu_alloc->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V*) * batch_size); - V** dev_value_address; - V* batch_data_place; - V* dev_batch_data_place; - dev_value_address = (V**)gpu_alloc_->AllocateRaw( - Allocator::kAllocatorAlignment, batch_size * sizeof(V*)); - dev_batch_data_place = (V*)gpu_alloc_->AllocateRaw( - 
Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dims_); - batch_data_place = (V *)cpu_alloc->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dims_); - - // Copy GPU addresses V* - for(int i = 0;i < batch_size;++i) { - value_address[i] = - *(V **)((char*)value_ptrs[i]->GetPtr() + sizeof(FixedLengthHeader)); - } - - cudaMemcpyAsync(dev_value_address, value_address, - sizeof(V*) * batch_size, - cudaMemcpyHostToDevice); - - // Launch Kernel,Copy data to continuous place - int block_dim = 128; - void* args[] = { (void*)&dev_value_address, - (void*)&dev_batch_data_place, (void*)&total_dims_, - (void*)&batch_size}; - - cudaLaunchKernel((void *)BatchCopy, - (batch_size * total_dims_ + block_dim - 1) / block_dim, - block_dim, args, 0, NULL); - - cudaMemcpyAsync(batch_data_place, dev_batch_data_place, - sizeof(V) * batch_size * total_dims_, - cudaMemcpyDeviceToHost); - - cudaEventRecord(is_finish_); - cudaEventSynchronize(is_finish_); - - // Copy data to ValuePtrs in memory;Insert it into hashmap - for(int i = 0; i < batch_size; ++i) { - ValuePtr* cpu_value_ptr = - new NormalContiguousValuePtr(ev_allocator(), total_dims_); - memcpy((char *)cpu_value_ptr->GetPtr() + sizeof(FixedLengthHeader), - &batch_data_place[i * total_dims_], total_dims_ * sizeof(V)); - memcpy((char *)cpu_value_ptr->GetPtr(), - (char *)value_ptrs[i]->GetPtr(), sizeof(FixedLengthHeader)); - auto iter = hash_map_.insert_lockless(std::move( - std::pair*>(keys[i], - const_cast*>(cpu_value_ptr)))); - if ((*(iter.first)).second != cpu_value_ptr) { - AppendToValuePtrQueue((*(iter.first)).second); - (*(iter.first)).second = cpu_value_ptr; - } - } - - gpu_alloc_->DeallocateRaw(dev_value_address); - gpu_alloc_->DeallocateRaw(dev_batch_data_place); - - cpu_alloc->DeallocateRaw(batch_data_place); - cpu_alloc->DeallocateRaw(value_address); - - return Status::OK(); - } - - Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { - std::pair*> 
*hash_map_dump; - int64 bucket_count; - auto it = hash_map_.GetSnapshot(); - hash_map_dump = it.first; - bucket_count = it.second; - for (int64 j = 0; j < bucket_count; j++) { - if (hash_map_dump[j].first != EMPTY_KEY_ && - hash_map_dump[j].first != DELETED_KEY_) { - key_list->emplace_back(hash_map_dump[j].first); - value_ptr_list->emplace_back(hash_map_dump[j].second); - } - } - free(hash_map_dump); - return Status::OK(); - } - - std::string DebugString() const override { - LOG(INFO) << "map info size:" << Size() - << "map info bucket_count:" << hash_map_.bucket_count() - << "map info load_factor:" << hash_map_.load_factor() - << "map info max_load_factor:" << hash_map_.max_load_factor() - << "map info min_load_factor:" << hash_map_.min_load_factor(); - return ""; - } - - private: - typedef google::dense_hash_map_lockless* > - LockLessHashMap; - static const int EMPTY_KEY_ = -1; - static const int DELETED_KEY_ = -2; - static constexpr int CAP_INVALID_VALUEPTR = 200000; - LockLessHashMap hash_map_; - std::deque*> value_ptr_out_of_date_; - int total_dims_; - Allocator* gpu_alloc_; - cudaEvent_t is_finish_; -}; -} // namespace embedding -} // namespace tensorflow - -#endif //GOOGLE_CUDA -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LOCKLESS_HASH_MAP_CPU_H_ diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc b/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc index de275183d22..9745ab5fcc3 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc +++ b/tensorflow/core/framework/embedding/multi_tier_storage.cu.cc @@ -15,8 +15,7 @@ limitations under the License. 
#if GOOGLE_CUDA #define EIGEN_USE_GPU #include "tensorflow/core/framework/embedding/multi_tier_storage.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" -#include "tensorflow/core/framework/embedding/batch.h" +#include "tensorflow/core/framework/embedding/hbm_multi_tier_feature_descriptor.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/op_kernel.h" @@ -44,11 +43,13 @@ template void MultiTierStorage::CopyEmbeddingsFromDramToHbm( const EmbeddingVarContext& ctx, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursor, const std::vector& memory_index, - const std::vector*>& gpu_value_ptrs, - int value_len) { + const std::vector& gpu_value_ptrs, + int value_len, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc) { if (copyback_cursor.size() > 0) { int total = copyback_cursor.size(); //Alocate memcpy buffer on CPU and GPU. @@ -64,11 +65,13 @@ void MultiTierStorage::CopyEmbeddingsFromDramToHbm( auto do_work = [memory_index, memcpy_buffer_cpu, value_ptr_list, gpu_value_ptrs, + dram_feat_desc, value_len, this] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { int j = memory_index[i]; memcpy(memcpy_buffer_cpu + i * value_len, - value_ptr_list[j]->GetValue(0, 0), value_len * sizeof(V)); + dram_feat_desc->GetEmbedding(value_ptr_list[j], 0), + value_len * sizeof(V)); value_ptr_list[j] = gpu_value_ptrs[i]; } }; @@ -96,8 +99,7 @@ void MultiTierStorage::CopyEmbeddingsFromDramToHbm( for (; it != copyback_cursor.cend(); ++it, ++i) { // Get the cursor int64 cursor = *it; - gpu_value_ptrs[i]->SetInitialized(0); - value_address[i] = gpu_value_ptrs[i]->GetValue(0, 0); + value_address[i] = hbm_feat_desc->GetEmbedding(gpu_value_ptrs[i], 0); } DeviceMemoryBase gpu_addr_dst_ptr(dev_value_address, total * sizeof(V*)); compute_stream->ThenMemcpy(&gpu_addr_dst_ptr, value_address, total * sizeof(V*)); @@ -119,16 +121,71 @@ void 
MultiTierStorage::CopyEmbeddingsFromDramToHbm( } #define REGISTER_KERNELS(ktype, vtype) \ template void MultiTierStorage::CopyEmbeddingsFromDramToHbm( \ - const EmbeddingVarContext&, const ktype*, ValuePtr**,\ + const EmbeddingVarContext&, const ktype*, void**,\ std::list&, const std::vector&,\ - const std::vector*>&, int); + const std::vector&, int, FeatureDescriptor*,\ + FeatureDescriptor*); #define REGISTER_KERNELS_ALL(type) \ REGISTER_KERNELS(int32, type); \ REGISTER_KERNELS(int64, type) #define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) #undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +template +void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( + const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + if (init_cursor.size() > 0) { + int64 total = init_cursor.size(); + TValue** value_address = nullptr; + value_address = TypedAllocator::Allocate(cpu_allocator(), total * 2, + AllocationAttributes()); + TValue** default_value_address = value_address + total; + TValue** dev_value_address = nullptr; + dev_value_address = + TypedAllocator::Allocate(hbm_alloc_, total * 2, AllocationAttributes()); + TValue** dev_default_value_address = dev_value_address + total; + for (int emb_index = 0; emb_index < FeatureDescriptorImpl::slot_infos_.size(); emb_index++) { + int64 i = 0; + auto it = init_cursor.cbegin(); + for (; it != init_cursor.cend(); ++it, ++i) { + value_address[i] = GetEmbedding(value_ptrs[*it], emb_index); + default_value_address[i] = + FeatureDescriptorImpl::GetDefaultValuePtr(emb_index, keys[i]); + } + DeviceMemoryBase gpu_dst_ptr(dev_value_address, total * 2 * sizeof(TValue*)); + compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, + total * 2 * sizeof(TValue*)); + int block_dim = 128; + int value_len = 
FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len; + TF_CHECK_OK(GpuLaunchKernel( + embedding::CopyEmbedding, + (total * value_len + block_dim - 1) / block_dim, + block_dim, 0, gpu_device.stream(), dev_default_value_address, + dev_value_address, value_len, total)); + SyncWithEventMgr(compute_stream, event_mgr); + } + + TypedAllocator::Deallocate(hbm_alloc_, dev_value_address, total * 2); + TypedAllocator::Deallocate(cpu_allocator(), value_address, total * 2); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( \ + const ktype*, const std::list&, void**,\ + se::Stream*, EventMgr*, const Eigen::GpuDevice& gpu_device); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU #undef REGISTER_KERNELS_ALL #undef REGISTER_KERNELS } // namespace embedding diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index 8239d109e64..7955322aca6 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -31,10 +31,11 @@ limitations under the License. 
#include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/core/status.h" -namespace tensorflow { -template -class ValuePtr; +#if GOOGLE_CUDA +#include "tensorflow/core/framework/embedding/batch.h" +#endif +namespace tensorflow { template class EmbeddingVar; @@ -54,22 +55,10 @@ class MultiTierStorage : public Storage { TF_DISALLOW_COPY_AND_ASSIGN(MultiTierStorage); - void SetAllocLen(int64 value_len, int slot_num) override { - while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. - Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); - - cache_capacity_ = Storage::storage_config_.size[0] - / (Storage::total_dims_ * sizeof(V)); - ready_eviction_ = true; - } - Storage::flag_.clear(std::memory_order_release); + virtual void Init() override { + cache_capacity_ = Storage::storage_config_.size[0] + / (total_dim() * sizeof(V)); + ready_eviction_ = true; } int64 CacheSize() const override { @@ -90,13 +79,13 @@ class MultiTierStorage : public Storage { } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { LOG(FATAL)<<"BatchCommit isn't supported by MultiTierStorage."; return Status::OK(); } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { LOG(FATAL)<<"Can't get snapshot of MultiTierStorage."; } @@ -104,7 +93,7 @@ class MultiTierStorage : public Storage { int total, const K* keys, const std::list& copyback_cursor, V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -128,17 +117,6 @@ class 
MultiTierStorage : public Storage { return; } - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - return; - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - return; - } - void Schedule(std::function fn) override { cache_thread_pool_->Schedule(std::move(fn)); } @@ -223,50 +201,50 @@ class MultiTierStorage : public Storage { } return s; } - - virtual void SetTotalDims(int64 total_dims) = 0; + virtual int total_dim() = 0; void DeleteFromEvictionManager() { eviction_manager_->DeleteStorage(this); } - void ReleaseValuePtrs(std::deque*>& value_ptrs, - Allocator* allocator) { + void ReleaseValuePtrs(std::deque& value_ptrs, + FeatureDescriptor* feat_desc) { constexpr int CAP_INVALID_VALUEPTR = 64 * 1024; if (value_ptrs.size() > CAP_INVALID_VALUEPTR) { int64 num_of_deleted_value_ptrs = value_ptrs.size() - CAP_INVALID_VALUEPTR; for (int i = 0; i < num_of_deleted_value_ptrs; i++) { - ValuePtr* value_ptr = value_ptrs.front(); - value_ptr->Destroy(allocator); - delete value_ptr; + void* value_ptr = value_ptrs.front(); + feat_desc->Deallocate(value_ptr); value_ptrs.pop_front(); } } } - void ReleaseInvalidValuePtr(Allocator* allocator) { - ReleaseValuePtrs(value_ptr_out_of_date_, allocator); + void ReleaseInvalidValuePtr(FeatureDescriptor* feat_desc) { + ReleaseValuePtrs(value_ptr_out_of_date_, feat_desc); } - void KeepInvalidValuePtr(ValuePtr* value_ptr) { + void KeepInvalidValuePtr(void* value_ptr) { value_ptr_out_of_date_.emplace_back(value_ptr); } #if GOOGLE_CUDA void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& context, const K* keys, - ValuePtr** value_ptr_list, + void** value_ptr_list, std::list& copyback_cursors, const std::vector& memory_index, - const std::vector*>& gpu_value_ptrs, - int value_len); + const std::vector& gpu_value_ptrs, + int value_len, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc); #endif //GOOGL_CUDA private: virtual 
Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) {} protected: - std::deque*> value_ptr_out_of_date_; + std::deque value_ptr_out_of_date_; BatchCache* cache_ = nullptr; EvictionManager* eviction_manager_; @@ -281,6 +259,70 @@ class MultiTierStorage : public Storage { std::string name_; std::vector mu_list_; }; + +#if GOOGLE_CUDA +template +void CopyEmbeddingFromHbmToDram( + const std::vector& hbm_value_ptrs, + const std::vector& dram_value_ptrs, + Allocator* gpu_alloc, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc) { + int batch_size = hbm_value_ptrs.size(); + V** dev_value_address; + + dev_value_address = (V**)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, batch_size * sizeof(V*)); + Allocator* cpu_alloc = ev_allocator(); + V** value_address = (V**)cpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V*) * batch_size); + + V* batch_data_place; + V* dev_batch_data_place; + int total_dim = dram_feat_desc->total_dim(); + dev_batch_data_place = (V*)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim); + batch_data_place = (V *)cpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim); + // Copy GPU addresses V* + for(int i = 0; i < batch_size; ++i) { + value_address[i] = hbm_feat_desc->GetEmbedding(hbm_value_ptrs[i], 0); + } + cudaMemcpyAsync(dev_value_address, value_address, + sizeof(V*) * batch_size, + cudaMemcpyHostToDevice); + + // Launch Kernel,Copy data to continuous place + int block_dim = 128; + void* args[] = { (void*)&dev_value_address, + (void*)&dev_batch_data_place, (void*)&total_dim, + (void*)&batch_size}; + + cudaLaunchKernel((void *)BatchCopy, + (batch_size * total_dim + block_dim - 1) / block_dim, + block_dim, args, 0, NULL); + + cudaMemcpyAsync(batch_data_place, dev_batch_data_place, + sizeof(V) * batch_size * total_dim, + cudaMemcpyDeviceToHost); + + cudaEvent_t is_finish_; + cudaEventCreate(&is_finish_); + 
cudaEventRecord(is_finish_); + cudaEventSynchronize(is_finish_); + cudaEventDestroy(is_finish_); + + for(int i = 0; i < batch_size; ++i) { + memcpy(dram_feat_desc->GetEmbedding(dram_value_ptrs[i], 0), + &batch_data_place[i * total_dim], total_dim * sizeof(V)); + } + + cpu_alloc->DeallocateRaw(value_address); + cpu_alloc->DeallocateRaw(batch_data_place); + gpu_alloc->DeallocateRaw(dev_value_address); + gpu_alloc->DeallocateRaw(dev_batch_data_place); +} +#endif //GOOGL_CUDA } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/normal_feature_descriptor.h b/tensorflow/core/framework/embedding/normal_feature_descriptor.h new file mode 100644 index 00000000000..817b33d058b --- /dev/null +++ b/tensorflow/core/framework/embedding/normal_feature_descriptor.h @@ -0,0 +1,134 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ +#include +#include "tensorflow/core/framework/embedding/feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +#if GOOGLE_CUDA +template +class HbmMultiTierFeatureDescriptorImpl; +#endif + +template +class NormalFeatureDescriptorImpl: public FeatureDescriptorImpl { + public: + NormalFeatureDescriptorImpl(Allocator* alloc, int64 slot_num, + bool need_record_freq, + bool need_record_version) + : alloc_bytes_(0), + alloc_(alloc), + FeatureDescriptorImpl(slot_num, + need_record_freq, + need_record_version) {} + + NormalFeatureDescriptorImpl(NormalFeatureDescriptorImpl* feat_desc_impl) + : alloc_(feat_desc_impl->alloc_), + FeatureDescriptorImpl(feat_desc_impl) {} + + NormalFeatureDescriptorImpl( + HbmMultiTierFeatureDescriptorImpl* feat_desc_impl) + : alloc_bytes_(0), + alloc_(feat_desc_impl->dram_alloc_), + FeatureDescriptorImpl(feat_desc_impl) {} + + ~NormalFeatureDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + bool is_compute_alloc_bytes = FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + if (is_compute_alloc_bytes) { + FeatureDescriptorImpl::ComputeAllocBytes(&alloc_bytes_); + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&alloc_bytes_); + } + return is_compute_alloc_bytes; + } + + bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) override { + FeatureDescriptorImpl::SetSlotInfo(feat_desc_impl); + FeatureDescriptorImpl::ComputeAllocBytes(&alloc_bytes_); + FeatureDescriptorImpl::SetFreqAndVersionOffset(&alloc_bytes_); + return true; + } + + V* GetEmbedding(void *val, int emb_index) override { + return reinterpret_cast(val) + + FeatureDescriptorImpl::slot_infos_[emb_index].embedding_offset; + } + + 
void* Allocate() override { + void* val = alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, alloc_bytes_); + FeatureDescriptorImpl::InitFreqAndVersion(val); + return val; + } + + void Deallocate(void* val) override { + alloc_->DeallocateRaw(val); + } + + void Deallocate(const std::vector& value_ptrs) override { + for (auto val: value_ptrs) { + Deallocate(val); + } + } + + void SetValue(void* val, int64 emb_index, V* value) override { + V* val_ptr = GetEmbedding(val, emb_index); + memcpy(val_ptr, value, + sizeof(V) * FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len); + } + + void SetDefaultValue(void* val, int64 index) override { + for (int i = 0; i < FeatureDescriptorImpl::slot_infos_.size(); i++) { + V* val_ptr = GetEmbedding(val, i); + FeatureDescriptorImpl::SetDefaultValue((void*)val_ptr, i, index); + } + } + +#if GOOGLE_CUDA + template + void SetDefaultValues( + const K* keys, + const std::list& init_cursor, + void** value_ptrs, + se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + LOG(FATAL)<<"Can't call SetDefaultValue(const K*, const std::list&," + <<"void**, se::Stream*, EventMgr*, const Eigen::GpuDevice&)" + <<" in HbmMultiTierFeatureDescriptor."; + } +#endif + + void SetAllocator(Allocator* alloc) override { + alloc_ = alloc; + } + + int data_bytes() override { + return alloc_bytes_; + } + + private: + int alloc_bytes_; + Allocator* alloc_; +}; +} //namespace embedding +} //namespace tensorflow + +#endif //TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ diff --git a/tensorflow/core/framework/embedding/nullable_filter_policy.h b/tensorflow/core/framework/embedding/nullable_filter_policy.h index 0c5ce80886a..7e3ace0063d 100644 --- a/tensorflow/core/framework/embedding/nullable_filter_policy.h +++ b/tensorflow/core/framework/embedding/nullable_filter_policy.h @@ -30,19 +30,21 @@ template class NullableFilterPolicy : public FilterPolicy { using FilterPolicy::ev_; using 
FilterPolicy::config_; - using FilterPolicy::LookupOrCreateEmbInternal; public: NullableFilterPolicy(const EmbeddingConfig& config, - EV* ev, embedding::Storage* storage) : - FilterPolicy(config, ev), storage_(storage) {} + EV* ev, embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) + : storage_(storage), feat_desc_(feat_desc), + FilterPolicy(config, ev) {} Status Lookup(K key, V* val, const V* default_value_ptr, const V* default_value_no_permission) override { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = ev_->LookupKey(key, &value_ptr); if (s.ok()) { - V* mem_val = ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + V* mem_val = feat_desc_->GetEmbedding( + value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } else { memcpy(val, default_value_ptr, @@ -57,17 +59,17 @@ class NullableFilterPolicy : public FilterPolicy { int64 num_of_keys, V* default_value_ptr, V* default_value_no_permission) override { - std::vector*> value_ptr_list(num_of_keys, nullptr); + std::vector value_ptr_list(num_of_keys, nullptr); ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); std::vector embedding_ptr(num_of_keys, nullptr); auto do_work = [this, keys, value_ptr_list, &embedding_ptr, default_value_ptr, default_value_no_permission] (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - ValuePtr* value_ptr = value_ptr_list[i]; + void* value_ptr = value_ptr_list[i]; if (value_ptr != nullptr) { embedding_ptr[i] = - ev_->LookupOrCreateEmb(value_ptr, default_value_ptr); + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); } else { embedding_ptr[i] = default_value_ptr; } @@ -85,65 +87,55 @@ class NullableFilterPolicy : public FilterPolicy { } void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, - const K* keys, ValuePtr** value_ptrs, + const K* keys, void** value_ptrs, int64 num_of_keys) { int num_worker_threads = ctx.worker_threads->num_threads; std::vector> 
not_found_cursor_list(num_worker_threads + 1); ev_->BatchLookupOrCreateKey(ctx, keys, value_ptrs, num_of_keys, not_found_cursor_list); - std::vector var_ptrs(num_of_keys); - auto do_work = [this, value_ptrs, &var_ptrs] - (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - bool is_need_set_default_value = false; - var_ptrs[i] = ev_->LookupOrCreateEmb( - value_ptrs[i], is_need_set_default_value); - } - }; - auto worker_threads = ctx.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, num_of_keys, - 1000, do_work); - - ev_->SetDefaultValueOfNewFeatures( - keys, num_of_keys, - not_found_cursor_list[0], - var_ptrs.data(), ctx.compute_stream, - ctx.event_mgr, ctx.gpu_device); } #endif //GOOGLE_CUDA void LookupOrCreate(K key, V* val, const V* default_value_ptr, - ValuePtr** value_ptr, int count, + void** value_ptr, int count, const V* default_value_no_permission) override { - TF_CHECK_OK(ev_->LookupOrCreateKey(key, value_ptr)); - V* mem_val = ev_->LookupOrCreateEmb(*value_ptr, default_value_ptr); + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); } - Status LookupOrCreateKey(K key, ValuePtr** val, + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, int64 count) override { *is_filter = true; - return ev_->LookupOrCreateKey(key, val); + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + storage_->Insert(key, value_ptr); + s = Status::OK(); + } + feat_desc_->AddFreq(*value_ptr, count); + return s; } - int64 GetFreq(K key, ValuePtr* value_ptr) override { - if (storage_->GetLayoutType() != LayoutType::LIGHT) { - return value_ptr->GetFreq(); - }else { - return 0; - } + Status LookupKey(K key, void** val, + bool* is_filter, int64 count) override { + 
*is_filter = true; + return ev_->LookupKey(key, val); + } + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); } int64 GetFreq(K key) override { - if (storage_->GetLayoutType() != LayoutType::LIGHT) { - ValuePtr* value_ptr = nullptr; - TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); - return value_ptr->GetFreq(); - }else { + if (!config_.is_save_freq()) return 0; - } + void* value_ptr = nullptr; + TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); + return feat_desc_->GetFreq(value_ptr); } Status Restore(int64 key_num, int bucket_num, int64 partition_id, @@ -161,27 +153,30 @@ class NullableFilterPolicy : public FilterPolicy { LOG(INFO) << "skip EV key:" << *(key_buff + i); continue; } - ValuePtr* value_ptr = nullptr; - ev_->CreateKey(key_buff[i], &value_ptr, to_dram); + int64 import_freq = 0; + int64 import_version = -1; + if (config_.filter_freq !=0 || ev_->IsMultiLevel() || config_.record_freq) { - value_ptr->SetFreq(freq_buff[i]); + import_freq = freq_buff[i]; } if (config_.steps_to_live != 0 || config_.record_version) { - value_ptr->SetStep(version_buff[i]); + import_version = version_buff[i]; } - LookupOrCreateEmbInternal(is_filter, to_dram, i, value_len, - value_ptr, value_buff, key_buff); + ev_->storage()->Import(key_buff[i], + value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); } return Status::OK(); } - bool is_admit(K key, ValuePtr* value_ptr) override { + bool is_admit(K key, void* value_ptr) override { return true; } private: embedding::Storage* storage_; + embedding::FeatureDescriptor* feat_desc_; }; } // tensorflow diff --git a/tensorflow/core/framework/embedding/shrink_policy.h b/tensorflow/core/framework/embedding/shrink_policy.h index ea063a113a3..a8d0d9ada75 100644 --- a/tensorflow/core/framework/embedding/shrink_policy.h +++ b/tensorflow/core/framework/embedding/shrink_policy.h @@ -15,14 +15,11 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ #define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ +#include "tensorflow/core/framework/embedding/feature_descriptor.h" #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { - -template -class ValuePtr; - class Allocator; namespace embedding { @@ -40,31 +37,29 @@ struct ShrinkArgs { template class ShrinkPolicy { public: - ShrinkPolicy(Allocator* alloc): alloc_(alloc) {} + ShrinkPolicy(FeatureDescriptor* feat_desc): feat_desc_(feat_desc) {} virtual ~ShrinkPolicy() {} TF_DISALLOW_COPY_AND_ASSIGN(ShrinkPolicy); virtual void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) = 0; protected: - void EmplacePointer(ValuePtr* value_ptr) { + void EmplacePointer(void* value_ptr) { to_delete_.emplace_back(value_ptr); } void ReleaseValuePtrs() { for (auto it : to_delete_) { - it->Destroy(alloc_); - delete it; + feat_desc_->Deallocate(it); } to_delete_.clear(); } protected: - std::vector*> to_delete_; - private: - Allocator* alloc_; + std::vector to_delete_; + FeatureDescriptor* feat_desc_; }; template @@ -74,7 +69,7 @@ class NonShrinkPolicy: public ShrinkPolicy { TF_DISALLOW_COPY_AND_ASSIGN(NonShrinkPolicy); void Shrink(std::vector& key_list, - std::vector*>& value_list, + std::vector& value_list, const ShrinkArgs& shrink_args) override {} }; } // embedding diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h index f9de65df588..be08afd7f50 100644 --- a/tensorflow/core/framework/embedding/single_tier_storage.h +++ b/tensorflow/core/framework/embedding/single_tier_storage.h @@ -24,7 +24,6 @@ limitations under the License. 
#endif // GOOGLE_CUDA #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/framework/embedding/l2weight_shrink_policy.h" -#include "tensorflow/core/framework/embedding/layout_creator.h" #include "tensorflow/core/framework/embedding/leveldb_kv.h" #include "tensorflow/core/framework/embedding/ssd_hash_kv.h" #include "tensorflow/core/framework/embedding/storage_config.h" @@ -32,9 +31,6 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" namespace tensorflow { -template -class ValuePtr; - template class EmbeddingVar; @@ -62,24 +58,22 @@ class HbmDramSsdStorage; template class SingleTierStorage : public Storage { public: - SingleTierStorage(const StorageConfig& sc, Allocator* alloc, - KVInterface* kv, LayoutCreator* lc) - : kv_(kv), alloc_(alloc), layout_creator_(lc), + SingleTierStorage(const StorageConfig& sc, + KVInterface* kv, FeatureDescriptor* feat_desc) + : kv_(kv), feat_desc_(feat_desc), Storage(sc) { if (sc.embedding_config.steps_to_live != 0) { shrink_policy_ = new GlobalStepShrinkPolicy( sc.embedding_config.steps_to_live, - alloc_, + feat_desc_, kv_); } else if (sc.embedding_config.l2_weight_threshold != -1.0) { shrink_policy_ = new L2WeightShrinkPolicy( sc.embedding_config.l2_weight_threshold, sc.embedding_config.primary_emb_index, - Storage::GetOffset( - sc.embedding_config.primary_emb_index), - alloc_, + feat_desc_, kv_); } else { shrink_policy_ = new NonShrinkPolicy(); @@ -89,11 +83,10 @@ class SingleTierStorage : public Storage { ~SingleTierStorage() override { mutex_lock l(Storage::mu_); std::vector key_list; - std::vector*> value_ptr_list; + std::vector value_ptr_list; kv_->GetSnapshot(&key_list, &value_ptr_list); for (auto value_ptr : value_ptr_list) { - value_ptr->Destroy(alloc_); - delete value_ptr; + feat_desc_->Deallocate(value_ptr); } delete kv_; delete shrink_policy_; @@ -101,7 +94,7 @@ class SingleTierStorage : public Storage { TF_DISALLOW_COPY_AND_ASSIGN(SingleTierStorage); - Status 
Get(K key, ValuePtr** value_ptr) override { + Status Get(K key, void** value_ptr) override { return kv_->Lookup(key, value_ptr); } @@ -109,47 +102,45 @@ class SingleTierStorage : public Storage { return kv_->Contains(key); } - virtual void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) override { do { - *value_ptr = layout_creator_->Create(alloc_, alloc_len); + *value_ptr = feat_desc_->Allocate(); Status s = kv_->Insert(key, *value_ptr); if (s.ok()) { break; } else { - (*value_ptr)->Destroy(alloc_); - delete *value_ptr; + feat_desc_->Deallocate(*value_ptr); } } while (!(kv_->Lookup(key, value_ptr)).ok()); } - virtual void Insert(K key, ValuePtr* value_ptr) override { - LOG(FATAL)<<"Unsupport Insert(K, ValuePtr*) in SingleTireStorage."; + virtual void Insert(K key, void** value_ptr) override { + do { + Status s = kv_->Insert(key, *value_ptr); + if (s.ok()) { + break; + } else { + feat_desc_->Deallocate(*value_ptr); + } + } while (!(kv_->Lookup(key, value_ptr)).ok()); } - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) override { + Status GetOrCreate(K key, void** value_ptr) override { Status s = kv_->Lookup(key, value_ptr); if (s.ok()) { return s; } - *value_ptr = layout_creator_->Create(alloc_, size); + *value_ptr = feat_desc_->Allocate(); s = kv_->Insert(key, *value_ptr); if (s.ok()) { return s; } // Insert Failed, key already exist - (*value_ptr)->Destroy(alloc_); - delete *value_ptr; + feat_desc_->Deallocate(*value_ptr); return kv_->Lookup(key, value_ptr); } - - Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) override { - need_copyback = NOT_COPYBACK; - return GetOrCreate(key, value_ptr, size); - } Status Remove(K key) override { return kv_->Remove(key); @@ -180,7 +171,7 @@ class SingleTierStorage : public Storage { int total, const K* keys, const std::list& copyback_cursor, V** 
memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -198,13 +189,13 @@ class SingleTierStorage : public Storage { } virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { LOG(FATAL) << "Unsupport BatchCommit in Storage: " << typeid(this).name(); return Status::OK(); } - virtual Status Commit(K keys, const ValuePtr* value_ptr) { + virtual Status Commit(K keys, const void* value_ptr) { LOG(FATAL) << "Unsupport Commit in Storage: " << typeid(this).name(); return Status::OK(); @@ -222,19 +213,12 @@ class SingleTierStorage : public Storage { return; } - void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) override { - return; - } - - void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, - int64 num_of_value_ptrs) override { - return; - } + virtual void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override {} Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { mutex_lock l(Storage::mu_); return kv_->GetSnapshot(key_list, value_ptr_list); } @@ -247,7 +231,7 @@ class SingleTierStorage : public Storage { ShrinkArgs& shrink_args, int64 value_len, V* default_value) override { - std::vector*> value_ptr_list; + std::vector value_ptr_list; std::vector key_list_tmp; TF_CHECK_OK(kv_->GetSnapshot( &key_list_tmp, &value_ptr_list)); @@ -255,30 +239,16 @@ class SingleTierStorage : public Storage { if (emb_config.is_primary()) { Shrink(key_list_tmp, value_ptr_list, shrink_args, value_len); } - TF_CHECK_OK((Storage::SaveToCheckpoint( tensor_name, writer, emb_config, value_len, default_value, key_list_tmp, - value_ptr_list))); + value_ptr_list, + SingleTierStorage::feat_desc_))); return Status::OK(); } - void SetAllocLen(int64 value_len, int slot_num) override { - 
while (Storage::flag_.test_and_set(std::memory_order_acquire)); - // The start address of every slot should be aligned to 16 bytes, - // otherwise a coredump will happen in the ApplyOp. - Storage::alloc_len_ = Storage::ComputeAllocLen(value_len); - - int64 temp = Storage::alloc_len_ * slot_num; - if (temp > Storage::total_dims_) { - Storage::total_dims_ = temp; - SetTotalDims(Storage::total_dims_); - } - Storage::flag_.clear(std::memory_order_release); - } - bool IsMultiLevel() override { return false; } @@ -299,16 +269,22 @@ class SingleTierStorage : public Storage { LOG(FATAL) << "Unsupport Schedule in SingleTierStorage."; } + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + kv_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + protected: - virtual void SetTotalDims(int64 total_dims) = 0; + virtual void* CreateValuePtr() { + return feat_desc_->Allocate(); + } - virtual ValuePtr* CreateValuePtr(int64 size) { - return layout_creator_->Create(alloc_, size); + virtual void DestroyValuePtr(void* value_ptr) { + feat_desc_->Deallocate(value_ptr); } - virtual void DestroyValuePtr(ValuePtr* value_ptr) { - value_ptr->Destroy(alloc_); - delete value_ptr; + FeatureDescriptor* feature_descriptor() { + return feat_desc_; } protected: virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, @@ -324,7 +300,7 @@ class SingleTierStorage : public Storage { } virtual void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) { mutex_lock l(Storage::mu_); @@ -339,31 +315,40 @@ class SingleTierStorage : public Storage { KVInterface* kv_; ShrinkPolicy* shrink_policy_; Allocator* alloc_; - LayoutCreator* layout_creator_; + FeatureDescriptor* feat_desc_; }; template class DramStorage : public SingleTierStorage { public: - DramStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc, - KVInterface* kv) - : SingleTierStorage(sc, 
alloc, kv, lc) {} + DramStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LocklessHashMap(feat_desc), feat_desc) {} ~DramStorage() override {} Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) { + const std::vector& value_ptrs) { return SingleTierStorage::kv_->BatchCommit(keys, value_ptrs); } - Status TryInsert(K key, ValuePtr* value_ptr) { + Status TryInsert(K key, void* value_ptr) { return SingleTierStorage::kv_->Insert(key, value_ptr); } - Status Commit(K keys, const ValuePtr* value_ptr) override{ + Status Commit(K keys, const void* value_ptr) override{ return SingleTierStorage::kv_->Commit(keys, value_ptr); } + + void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) override { + void* value_ptr = SingleTierStorage::feat_desc_->Allocate(freq); + SingleTierStorage::Insert(key, &value_ptr); + SingleTierStorage::feat_desc_->SetValue(value_ptr, emb_index, value); + SingleTierStorage::feat_desc_->SetFreq(value_ptr, freq); + SingleTierStorage::feat_desc_->UpdateVersion(value_ptr, version); + } TF_DISALLOW_COPY_AND_ASSIGN(DramStorage); public: @@ -375,12 +360,8 @@ class DramStorage : public SingleTierStorage { friend class HbmDramSsdStorage; #endif protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); - } - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -395,9 +376,10 @@ class DramStorage : public SingleTierStorage { template class HbmStorage : public SingleTierStorage { public: - HbmStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new GPUHashMapKV(sc.embedding_config, alloc), lc) { + HbmStorage(const StorageConfig& sc, Allocator* gpu_allocator, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new GPUHashMapKV( + 
sc.embedding_config, gpu_allocator), feat_desc) { } ~HbmStorage() override {} @@ -488,48 +470,27 @@ class HbmStorage : public SingleTierStorage { gpu_kv->Import(key_import, value_import, device, emb_config); return Status::OK(); } - - void SetTotalDims(int64 total_dims) override {} }; template class HbmStorageWithCpuKv: public SingleTierStorage { public: - HbmStorageWithCpuKv(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + HbmStorageWithCpuKv(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~HbmStorageWithCpuKv() override {} - void Insert(K key, ValuePtr* value_ptr) override { - do { - Status s = SingleTierStorage::kv_->Insert(key, value_ptr); - if (s.ok()) { - break; - } else { - value_ptr->Destroy(SingleTierStorage::alloc_); - delete value_ptr; - } - } while (!(SingleTierStorage::kv_->Lookup(key, &value_ptr)).ok()); - } - - void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) override { - SingleTierStorage::Insert(key, value_ptr, alloc_len, to_dram); - } - - Status TryInsert(K key, ValuePtr* value_ptr) { + Status TryInsert(K key, void* value_ptr) { return SingleTierStorage::kv_->Insert(key, value_ptr); } public: friend class HbmDramStorage; friend class HbmDramSsdStorage; protected: - void SetTotalDims(int64 total_dims) override {} - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -544,28 +505,25 @@ class HbmStorageWithCpuKv: public SingleTierStorage { template class PmemMemkindStorage : public SingleTierStorage { public: - PmemMemkindStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + PmemMemkindStorage(const StorageConfig& sc, + FeatureDescriptor* 
feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~PmemMemkindStorage() override {} TF_DISALLOW_COPY_AND_ASSIGN(PmemMemkindStorage); - - protected: - void SetTotalDims(int64 total_dims) override {} }; template class PmemLibpmemStorage : public SingleTierStorage { public: - PmemLibpmemStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LocklessHashMap(), lc) { + PmemLibpmemStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LocklessHashMap(feat_desc), feat_desc) { } ~PmemLibpmemStorage() override {} - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -573,10 +531,8 @@ class PmemLibpmemStorage : public SingleTierStorage { protected: friend class DramPmemStorage; - void SetTotalDims(int64 total_dims) override {} - void Shrink(std::vector& key_list, - std::vector*>& value_ptr_list, + std::vector& value_ptr_list, ShrinkArgs& shrink_args, int64 value_len) override { SingleTierStorage::Shrink( @@ -590,15 +546,15 @@ class PmemLibpmemStorage : public SingleTierStorage { template class LevelDBStore : public SingleTierStorage { public: - LevelDBStore(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new LevelDBKV(sc.path), lc) { + LevelDBStore(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new LevelDBKV(sc.path, feat_desc), feat_desc) { } ~LevelDBStore() override {} TF_DISALLOW_COPY_AND_ASSIGN(LevelDBStore); - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -608,29 +564,25 @@ class LevelDBStore : public SingleTierStorage { LevelDBKV* leveldb_kv = reinterpret_cast*>(SingleTierStorage::kv_); return new DBValueIterator( - key_list, 
emb_index, value_len, leveldb_kv); + key_list, emb_index, value_len, + leveldb_kv, SingleTierStorage::feat_desc_); } public: friend class DramLevelDBStore; - - protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); - } }; template class SsdHashStorage : public SingleTierStorage { public: - SsdHashStorage(const StorageConfig& sc, Allocator* alloc, - LayoutCreator* lc) : SingleTierStorage( - sc, alloc, new SSDHashKV(sc.path, alloc), lc) { + SsdHashStorage(const StorageConfig& sc, + FeatureDescriptor* feat_desc) : SingleTierStorage( + sc, new SSDHashKV(sc.path, feat_desc), feat_desc) { } ~SsdHashStorage() override {} TF_DISALLOW_COPY_AND_ASSIGN(SsdHashStorage); - Status Commit(K keys, const ValuePtr* value_ptr) { + Status Commit(K keys, const void* value_ptr) { return SingleTierStorage::kv_->Commit(keys, value_ptr); } @@ -691,8 +643,9 @@ class SsdHashStorage : public SingleTierStorage { #endif protected: - void SetTotalDims(int64 total_dims) override { - SingleTierStorage::kv_->SetTotalDims(total_dims); + void Init() override { + dynamic_cast*>( + SingleTierStorage::kv_)->Init(); } }; } // embedding diff --git a/tensorflow/core/framework/embedding/ssd_hash_kv.h b/tensorflow/core/framework/embedding/ssd_hash_kv.h index 8040421233e..f51c6904a50 100644 --- a/tensorflow/core/framework/embedding/ssd_hash_kv.h +++ b/tensorflow/core/framework/embedding/ssd_hash_kv.h @@ -25,17 +25,12 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/ssd_record_descriptor.h" #include "tensorflow/core/framework/embedding/emb_file_creator.h" #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/util/env_var.h" namespace tensorflow { - -template -class ValuePtr; - namespace embedding { class EmbPosition { public: @@ -115,55 +110,6 @@ class SSDIterator { } } - virtual void Key(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - memcpy((char*)val, &((file_map_[f_id])[curr_vec_].first), dim); - } - - virtual void Value(char* val, int64 dim, int64 value_offset) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, dim, - posi->offset_ + value_offset + sizeof(FixedLengthHeader)); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_ + - value_offset + sizeof(FixedLengthHeader), dim); - } - } - - virtual void Freq(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, sizeof(FixedLengthHeader), - posi->offset_); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_, - sizeof(FixedLengthHeader)); - } - *((int64*)val) = - reinterpret_cast(val)->GetFreqCounter(); - } - - virtual void Version(char* val, int64 dim) { - int64 f_id = file_id_vec_[curr_file_]; - EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; - - if (posi->flushed_) { - emb_files_[posi->version_]-> - ReadWithMemcpy(val, sizeof(FixedLengthHeader), - posi->offset_); - } else { - memcpy(val, write_buffer_ + posi->buffer_offset_, - sizeof(FixedLengthHeader)); - } - *((int64*)val) = - 
reinterpret_cast(val)->GetGlobalStep(); - } - virtual K Key() { int64 f_id = file_id_vec_[curr_file_]; return (file_map_[f_id])[curr_vec_].first; @@ -192,8 +138,9 @@ class SSDIterator { template class SSDHashKV : public KVInterface { public: - explicit SSDHashKV(const std::string& path, Allocator* alloc) - : alloc_(alloc) { + explicit SSDHashKV(const std::string& path, + FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { path_ = io::JoinPath( path, "ssd_kv_" + std::to_string(Env::Default()->NowMicros()) + "_"); hash_map_.max_load_factor(0.8); @@ -205,9 +152,6 @@ class SSDHashKV : public KVInterface { evict_file_set_.set_counternum(16); evict_file_set_.set_deleted_key(DELETED_KEY); - new_value_ptr_fn_ = [this](size_t size) { - return new NormalContiguousValuePtr(alloc_, size); - }; is_async_compaction_ = true; TF_CHECK_OK(ReadBoolFromEnvVar("TF_SSDHASH_ASYNC_COMPACTION", true, &is_async_compaction_)); @@ -224,7 +168,7 @@ class SSDHashKV : public KVInterface { "Use Sync Compactor in SSDHashKV of Multi-tier Embedding Storage!"; compaction_fn_ = [this](){Compaction();}; check_buffer_fn_ = [this](){CheckBuffer();}; - save_kv_fn_ = [this](K key, const ValuePtr* value_ptr, + save_kv_fn_ = [this](K key, const void* value_ptr, bool is_compaction=false) { SaveKV(key, value_ptr, is_compaction); }; @@ -233,7 +177,7 @@ class SSDHashKV : public KVInterface { "Use Async Compactor in SSDHashKV of Multi-tier Embedding Storage!"; compaction_fn_ = [](){}; check_buffer_fn_ = [this](){CheckBufferAsync();}; - save_kv_fn_ = [this](K key, const ValuePtr* value_ptr, + save_kv_fn_ = [this](K key, const void* value_ptr, bool is_compaction=false) { SaveKVAsync(key, value_ptr, is_compaction); }; @@ -244,9 +188,8 @@ class SSDHashKV : public KVInterface { } } - void SetTotalDims(int total_dims) override { - total_dims_ = total_dims; - val_len_ = sizeof(FixedLengthHeader) + total_dims_ * sizeof(V); + void Init() { + val_len_ = feat_desc_->data_bytes(); max_app_count_ = BUFFER_SIZE / 
val_len_; write_buffer_ = new char[BUFFER_SIZE]; unsigned int max_key_count = 1 + int(BUFFER_SIZE / val_len_); @@ -334,18 +277,18 @@ class SSDHashKV : public KVInterface { return Status::OK(); } - Status Lookup(K key, ValuePtr** value_ptr) override { + Status Lookup(K key, void** value_ptr) override { auto iter = hash_map_.find_wait_free(key); if (iter.first == EMPTY_KEY) { return errors::NotFound("Unable to find Key: ", key, " in SSDHashKV."); } else { - ValuePtr* val = new_value_ptr_fn_(total_dims_); + void* val = feat_desc_->Allocate(); EmbPosition* posi = iter.second; if (posi->flushed_) { - emb_files_[posi->version_]->Read((char*)(val->GetPtr()), + emb_files_[posi->version_]->Read((char*)val, val_len_, posi->offset_); } else { - memcpy((char*)val->GetPtr(), + memcpy((char*)val, write_buffer_ + posi->buffer_offset_, val_len_); } *value_ptr = val; @@ -363,17 +306,17 @@ class SSDHashKV : public KVInterface { } } - Status Insert(K key, const ValuePtr* value_ptr) override { + Status Insert(K key, const void* value_ptr) override { return Status::OK(); } Status BatchInsert(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { return BatchCommit(keys, value_ptrs); } Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) override { + const std::vector& value_ptrs) override { compaction_fn_(); __sync_fetch_and_add(&total_app_count_, keys.size()); for (int i = 0; i < keys.size(); i++) { @@ -384,7 +327,7 @@ class SSDHashKV : public KVInterface { return Status::OK(); } - Status Commit(K key, const ValuePtr* value_ptr) override { + Status Commit(K key, const void* value_ptr) override { compaction_fn_(); __sync_fetch_and_add(&total_app_count_, 1); check_buffer_fn_(); @@ -402,7 +345,7 @@ class SSDHashKV : public KVInterface { } Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) override { + std::vector* value_ptr_list) override { return Status::OK(); } @@ -467,8 +410,8 
@@ class SSDHashKV : public KVInterface { int64 Size() const override { return hash_map_.size_lockless(); } - void FreeValuePtr(ValuePtr* value_ptr) override { - delete value_ptr; + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); } private: @@ -555,10 +498,10 @@ class SSDHashKV : public KVInterface { } void AppendToWriteBuffer(size_t curr_buffer_offset, K key, - const ValuePtr* value_ptr) { + const void* value_ptr) { current_offset_ += val_len_; memcpy(write_buffer_ + curr_buffer_offset, - (char*)value_ptr->GetPtr(), val_len_); + (char*)value_ptr, val_len_); key_buffer_[buffer_cur_] = key; ++buffer_cur_; } @@ -582,7 +525,7 @@ class SSDHashKV : public KVInterface { return flag; } - void SaveKV(K key, const ValuePtr* value_ptr, + void SaveKV(K key, const void* value_ptr, bool is_compaction = false) { size_t curr_buffer_offset = buffer_cur_ * val_len_; EmbPosition* ep = new EmbPosition(current_offset_, current_version_, @@ -608,7 +551,7 @@ class SSDHashKV : public KVInterface { } } - void SaveKVAsync(K key, const ValuePtr* value_ptr, + void SaveKVAsync(K key, const void* value_ptr, bool is_compaction = false) { size_t curr_buffer_offset = buffer_cur_ * val_len_; EmbPosition* ep = new EmbPosition(current_offset_, evict_version_, @@ -681,21 +624,21 @@ class SSDHashKV : public KVInterface { } void MoveToNewFile() { - ValuePtr* val = new_value_ptr_fn_(total_dims_); + void* val = feat_desc_->Allocate(); for (auto it : evict_file_map_) { EmbFile* file = emb_files_[it.first]; total_app_count_ -= file->InvalidCount(); file->MapForRead(); for (auto it_vec : it.second) { EmbPosition* posi = it_vec.second; - file->ReadWithMemcpy((char*)(val->GetPtr()), val_len_, + file->ReadWithMemcpy((char*)val, val_len_, posi->offset_); CheckBuffer(); SaveKV(it_vec.first, val, true); } file->UnmapForRead(); } - delete val; + feat_desc_->Deallocate(val); } void MoveToNewFileAsync() { @@ -825,11 +768,10 @@ class SSDHashKV : public KVInterface { char* 
write_buffer_ = nullptr; K* key_buffer_ = nullptr; bool is_async_compaction_; - Allocator* alloc_ = nullptr; + FeatureDescriptor* feat_desc_; int total_dims_; std::string path_; - std::function*(size_t)> new_value_ptr_fn_; typedef google::dense_hash_map_lockless LockLessHashMap; LockLessHashMap hash_map_; @@ -857,7 +799,7 @@ class SSDHashKV : public KVInterface { std::function compaction_fn_; std::function check_buffer_fn_; - std::function*, bool)> save_kv_fn_; + std::function save_kv_fn_; EmbFileCreator* emb_file_creator_ = nullptr; }; template diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index bb949183492..1ffb435054b 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -40,9 +40,6 @@ using GPUDevice = Eigen::GpuDevice; template class CheckpointLoader; -template -class ValuePtr; - template class EmbeddingVar; @@ -57,9 +54,6 @@ class BundleReader; template struct EmbeddingVarContext; -namespace { - const int kSavedPartitionNum = 1000; -} namespace embedding { template @@ -67,42 +61,40 @@ class Storage { friend class CheckpointLoader; public: explicit Storage(const StorageConfig& storage_config) - : storage_config_(storage_config) {} + : storage_config_(storage_config) { + initialize_value_.resize(storage_config.embedding_config.slot_num + 1); + } virtual ~Storage() {} TF_DISALLOW_COPY_AND_ASSIGN(Storage); - virtual Status Get(K key, ValuePtr** value_ptr) = 0; + virtual Status Get(K key, void** value_ptr) = 0; #if GOOGLE_CUDA virtual void BatchGet(const EmbeddingVarContext& ctx, const K* key, - ValuePtr** value_ptr_list, - int64 num_of_keys, - int64 value_len) {} + void** value_ptr_list, + int64 num_of_keys) {} virtual void BatchGetOrCreate( const EmbeddingVarContext& ctx, const K* key, - ValuePtr** value_ptr_list, + void** value_ptr_list, int64 num_of_keys, int64 value_len, std::vector>& not_found_cursor_list) {} #endif //GOOGLE_CUDA virtual 
Status Contains(K key) = 0; - virtual void Insert(K key, ValuePtr** value_ptr, - size_t alloc_len, bool to_dram = false) = 0; - virtual void Insert(K key, ValuePtr* value_ptr) = 0; - virtual void SetAllocLen(int64 value_len, int slot_num) = 0; + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram=false) = 0; + virtual void Insert(K key, void** value_ptr) = 0; + virtual void Init() {} virtual void SetValueLen(int64 value_len) {} - virtual Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size) = 0; - virtual Status GetOrCreate(K key, ValuePtr** value_ptr, - size_t size, CopyBackFlag &need_copyback) = 0; + virtual Status GetOrCreate(K key, void** value_ptr) = 0; virtual int LookupTier(K key) const = 0; virtual Status Remove(K key) = 0; virtual int64 Size() const = 0; virtual int64 Size(int level) const = 0; virtual Status GetSnapshot(std::vector* key_list, - std::vector*>* value_ptr_list) = 0; + std::vector* value_ptr_list) = 0; virtual Status Save( const string& tensor_name, const string& prefix, @@ -113,7 +105,7 @@ class Storage { V* default_value) = 0; virtual Status BatchCommit(const std::vector& keys, - const std::vector*>& value_ptrs) = 0; + const std::vector& value_ptrs) = 0; virtual Status Eviction(K* evict_ids, int64 evict_size) = 0; @@ -121,7 +113,7 @@ class Storage { int total, const K* keys, const std::list& copyback_cursor, V** memcpy_address, size_t value_len, - ValuePtr **gpu_value_ptrs, + void **gpu_value_ptrs, V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, @@ -149,25 +141,11 @@ class Storage { Allocator* alloc, int64 value_len, int64 block_size) = 0; - virtual void AllocateMemoryForNewFeatures( - const std::vector*>& value_ptr_list) = 0; - virtual void AllocateMemoryForNewFeatures( - ValuePtr** value_ptr_list, int64 num_of_value_ptrs) = 0; inline mutex* get_mutex() { return &mu_; } inline int64 GetAllocLen() { return alloc_len_; } inline int64 GetOffset(int64 index) { return alloc_len_ * index; } inline 
int64 GetTotalDims() { return total_dims_; } - inline int64 ComputeAllocLen(int64 value_len) { - if (LayoutType::COMPACT == storage_config_.layout_type) { - return value_len; - } else { - return (value_len * sizeof(V) % 16 == 0) - ? value_len - : value_len + (16 - (sizeof(V) * value_len) % 16) / sizeof(V); - } - } - inline LayoutType GetLayoutType() { return storage_config_.layout_type; } inline embedding::StorageType GetStorageType() { return storage_config_.type; } inline std::string GetStoragePath() { return storage_config_.path; } inline embedding::CacheStrategy @@ -183,7 +161,7 @@ class Storage { } inline void Insert(const std::vector& keys, - ValuePtr** value_ptrs) { + void** value_ptrs) { for (size_t i = 0; i < keys.size(); i++) { Insert(keys[i], value_ptrs[i]); } @@ -211,6 +189,13 @@ class Storage { reset_version, reader); restorer.RestoreCkpt(emb_config, device); }; + + virtual void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) = 0; + + virtual void Import(K key, V* value, + int64 freq, int64 version, + int emb_index) = 0; protected: virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, @@ -227,12 +212,7 @@ class Storage { const std::string& ssd_emb_file_name, EmbeddingVar* ev, RestoreSSDBuffer& restore_buff) { - int64 alloc_len = Storage::ComputeAllocLen(value_len); - auto* alloc = ev->GetAllocator(); for (int64 i = 0; i < restore_buff.num_of_keys; i++) { - ValuePtr* value_ptr = nullptr; - ev->LookupOrCreateKey(restore_buff.key_list_buf[i], &value_ptr); - value_ptr->SetInitialized(emb_index); int64 file_id = restore_buff.key_file_id_list_buf[i]; int64 key_offset = restore_buff.key_offset_list_buf[i]; // Read data from embedding files on SSD. 
Data are stored in @@ -240,32 +220,29 @@ class Storage { std::stringstream ss; ss << ssd_emb_file_name << "/" << file_id << ".emb"; int fd = open(ss.str().data(), O_RDONLY); + EmbeddingConfig& emb_config = storage_config_.embedding_config; + FeatureDescriptor normal_feat_desc( + emb_config.block_num, emb_config.slot_num + 1, + ev_allocator(), StorageType::DRAM, true, + true, {false, 0}); + void* value_ptr = normal_feat_desc.Allocate(); char* file_addr = (char*)mmap(nullptr, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1) + + normal_feat_desc.data_bytes() + key_offset, PROT_READ, MAP_PRIVATE, fd, 0); - - NormalContiguousValuePtr tmp_value_ptr(alloc, - alloc_len * (emb_slot_num + 1)); - void* ptr = tmp_value_ptr.GetPtr(); - memcpy(ptr, file_addr + key_offset, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1)); + memcpy(value_ptr, file_addr + key_offset, + normal_feat_desc.data_bytes()); munmap(file_addr, - sizeof(FixedLengthHeader) + - alloc_len * sizeof(V) * (emb_slot_num + 1) + + normal_feat_desc.data_bytes() + key_offset); close(fd); // Copy Data to ValuePtr, data of slots are set by primary here. 
- for (int j = 0; j < emb_slot_num + 1; j++) { - V* value = tmp_value_ptr.GetValue(j, alloc_len * j); - if (value != nullptr) { - value_ptr->GetOrAllocate(alloc, value_len, value, j, alloc_len * j); - } - } - value_ptr->SetFreq(tmp_value_ptr.GetFreq()); - value_ptr->SetStep(tmp_value_ptr.GetStep()); + int64 import_freq = normal_feat_desc.GetFreq(value_ptr); + int64 import_version = normal_feat_desc.GetVersion(value_ptr); + V* value = normal_feat_desc.GetEmbedding(value_ptr, emb_index); + Import(restore_buff.key_list_buf[i], value, + import_freq, import_version, emb_index); + normal_feat_desc.Deallocate(value_ptr); } return Status::OK(); } @@ -273,10 +250,11 @@ class Storage { private: void GeneratePartitionedCkptData( const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, EmbeddingVarCkptData* partitioned_ckpt_data, const EmbeddingConfig& emb_config, - V* default_value) { + V* default_value, + FeatureDescriptor* feat_desc) { std::vector> ev_ckpt_data_parts(kSavedPartitionNum); @@ -293,7 +271,43 @@ class Storage { ev_ckpt_data_parts[part_id].Emplace( key_list[i], value_ptr_list[i], emb_config, default_value, - GetOffset(emb_config.emb_index), + feat_desc, + is_save_freq, + is_save_version, + save_unfiltered_features); + break; + } + } + } + + partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts); + } + + void GeneratePartitionedCkptData( + const std::vector& key_list, + const std::vector& value_ptr_list, + EmbeddingVarCkptData* partitioned_ckpt_data, + const EmbeddingConfig& emb_config, + V* default_value, + const std::vector*>& feat_desc) { + std::vector> + ev_ckpt_data_parts(kSavedPartitionNum); + + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar( + "TF_EV_SAVE_FILTERED_FEATURES", true, &save_unfiltered_features)); + + bool is_save_freq = emb_config.is_save_freq(); + bool is_save_version = emb_config.is_save_version(); + + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id 
= 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + int feat_desc_type = (int64)value_ptr_list[i] >> kDramFlagOffset; + ev_ckpt_data_parts[part_id].Emplace( + key_list[i], value_ptr_list[i], + emb_config, default_value, + feat_desc[feat_desc_type], is_save_freq, is_save_version, save_unfiltered_features); @@ -333,12 +347,33 @@ class Storage { int64 value_len, V* default_value, const std::vector& key_list, - const std::vector*>& value_ptr_list, + const std::vector& value_ptr_list, + FeatureDescriptor* feat_desc, + ValueIterator* value_iter = nullptr) { + EmbeddingVarCkptData partitioned_ckpt_data; + GeneratePartitionedCkptData(key_list, value_ptr_list, + &partitioned_ckpt_data, emb_config, + default_value, feat_desc); + Status s = + partitioned_ckpt_data.ExportToCkpt( + tensor_name, writer, value_len, value_iter); + return Status::OK(); + } + + Status SaveToCheckpoint( + const string& tensor_name, + BundleWriter* writer, + const EmbeddingConfig& emb_config, + int64 value_len, + V* default_value, + const std::vector& key_list, + const std::vector& value_ptr_list, + const std::vector*>& feat_desc, ValueIterator* value_iter = nullptr) { EmbeddingVarCkptData partitioned_ckpt_data; GeneratePartitionedCkptData(key_list, value_ptr_list, &partitioned_ckpt_data, emb_config, - default_value); + default_value, feat_desc); Status s = partitioned_ckpt_data.ExportToCkpt( tensor_name, writer, value_len, value_iter); @@ -366,6 +401,7 @@ class Storage { mutex mu_; std::atomic_flag flag_ = ATOMIC_FLAG_INIT; + std::vector initialize_value_; }; } // embedding } // tensorflow diff --git a/tensorflow/core/framework/embedding/storage_config.h b/tensorflow/core/framework/embedding/storage_config.h index 85e44879dcb..23babc9ef08 100644 --- a/tensorflow/core/framework/embedding/storage_config.h +++ b/tensorflow/core/framework/embedding/storage_config.h @@ -17,13 +17,11 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/embedding_config.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" namespace tensorflow { namespace embedding { struct StorageConfig { StorageConfig() : type(StorageType::DEFAULT), path(""), - layout_type(LayoutType::NORMAL), cache_strategy(CacheStrategy::LFU) { size = {1<<30,1<<30,1<<30,1<<30}; } @@ -31,32 +29,14 @@ struct StorageConfig { StorageConfig(StorageType t, const std::string& p, const std::vector& s, - const std::string& layout, const EmbeddingConfig& ec, const CacheStrategy cache_strategy_ = CacheStrategy::LFU) - : type(t), - path(p), - embedding_config(ec), - cache_strategy(cache_strategy_) { - if ("normal" == layout) { - layout_type = LayoutType::NORMAL; - } else if ("light" == layout) { - layout_type = LayoutType::LIGHT; - } else if ("normal_contiguous" == layout){ - layout_type = LayoutType::NORMAL_CONTIGUOUS; - } else if ("normal_contiguous_gpu" == layout){ - layout_type = LayoutType::NORMAL_CONTIGUOUS_GPU; - } else if ("compact" == layout){ - layout_type = LayoutType::COMPACT; - } else { - LOG(WARNING) << "Unknown layout: " - << layout << ", use LayoutType::NORMAL by default."; - layout_type = LayoutType::NORMAL; - } - size = s; - } + : type(t), + path(p), + size(s), + embedding_config(ec), + cache_strategy(cache_strategy_) {} StorageType type; - LayoutType layout_type; std::string path; std::vector size; CacheStrategy cache_strategy; diff --git a/tensorflow/core/framework/embedding/storage_factory.h b/tensorflow/core/framework/embedding/storage_factory.h index 10d2d52b83f..c585b058470 100644 --- a/tensorflow/core/framework/embedding/storage_factory.h +++ b/tensorflow/core/framework/embedding/storage_factory.h @@ -16,7 +16,6 @@ limitations under the License. 
#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ #include "tensorflow/core/framework/embedding/config.pb.h" -#include "tensorflow/core/framework/embedding/layout_creator.h" #include "tensorflow/core/framework/embedding/dram_leveldb_storage.h" #include "tensorflow/core/framework/embedding/dram_pmem_storage.h" #include "tensorflow/core/framework/embedding/dram_ssd_storage.h" @@ -34,50 +33,41 @@ class StorageFactory { public: template static Storage* Create(const StorageConfig& sc, - Allocator* gpu_allocator, const string& name) { - auto layout_creator = LayoutCreatorFactory::Create(sc); - + Allocator* gpu_allocator, FeatureDescriptor* feat_desc, + const string& name) { switch (sc.type) { case StorageType::DRAM: - return new DramStorage(sc, ev_allocator(), - layout_creator, new LocklessHashMap()); + return new DramStorage(sc, feat_desc); case StorageType::PMEM_MEMKIND: - return new PmemMemkindStorage(sc, pmem_allocator(), - layout_creator); + feat_desc->SetAllocator(pmem_allocator()); + return new PmemMemkindStorage(sc, feat_desc); case StorageType::PMEM_LIBPMEM: - return new PmemLibpmemStorage(sc, - experimental_pmem_allocator(sc.path, sc.size[0]), - layout_creator); + feat_desc->SetAllocator( + experimental_pmem_allocator(sc.path, sc.size[0])); + return new PmemLibpmemStorage(sc, feat_desc); case StorageType::DRAM_PMEM: - return new DramPmemStorage(sc, ev_allocator(), - experimental_pmem_allocator(sc.path, sc.size[0]), - layout_creator, name); + return new DramPmemStorage(sc, + feat_desc, name); case StorageType::LEVELDB: case StorageType::DRAM_LEVELDB: - return new DramLevelDBStore(sc, ev_allocator(), - layout_creator, name); + return new DramLevelDBStore(sc, feat_desc, name); case StorageType::SSDHASH: case StorageType::DRAM_SSDHASH: - return new DramSsdHashStorage(sc, ev_allocator(), - layout_creator, name); + return new DramSsdHashStorage(sc, feat_desc, name); case StorageType::HBM: #if GOOGLE_CUDA - return new HbmStorage(sc, gpu_allocator, - 
layout_creator); + return new HbmStorage(sc, gpu_allocator, feat_desc); #endif // GOOGLE_CUDA case StorageType::HBM_DRAM: #if GOOGLE_CUDA - return new HbmDramStorage(sc, gpu_allocator, - ev_allocator(), layout_creator, name); + return new HbmDramStorage(sc, gpu_allocator, feat_desc, name); #endif // GOOGLE_CUDA case StorageType::HBM_DRAM_SSDHASH: #if GOOGLE_CUDA - return new HbmDramSsdStorage(sc, gpu_allocator, - ev_allocator(), layout_creator, name); + return new HbmDramSsdStorage(sc, gpu_allocator, feat_desc, name); #endif // GOOGLE_CUDA default: - return new DramStorage(sc, ev_allocator(), - layout_creator, new LocklessHashMap()); + return new DramStorage(sc, feat_desc); } } }; diff --git a/tensorflow/core/framework/embedding/value_ptr.h b/tensorflow/core/framework/embedding/value_ptr.h deleted file mode 100644 index ca7d234ed61..00000000000 --- a/tensorflow/core/framework/embedding/value_ptr.h +++ /dev/null @@ -1,647 +0,0 @@ -#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ -#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ - -#include -#include -#include -#include - -#include "tensorflow/core/framework/typed_allocator.h" -#if GOOGLE_CUDA -#include -#endif // GOOGLE_CUDA - -namespace tensorflow { - -enum class LayoutType { - LIGHT, - NORMAL, - LEVELDB, - NORMAL_CONTIGUOUS, - NORMAL_CONTIGUOUS_GPU, - COMPACT, -}; - -namespace { -constexpr int COLUMN_BITSET_BYTES = 5; -constexpr int COLUMN_BITSET_SIZE = COLUMN_BITSET_BYTES * 8; - -struct MetaHeader { - unsigned char embed_num; - unsigned char value_type; - unsigned char header_size; - unsigned char column_bitset[COLUMN_BITSET_BYTES]; - - static const int kEmbeddingNumStartIndex = 0; - static const int kValueTypeStartIndex = - kEmbeddingNumStartIndex + sizeof(char); - static const int kHeaderSizeStartIndex = - kValueTypeStartIndex + sizeof(char); - static const int kColumnBitsetIndex = - kHeaderSizeStartIndex + sizeof(char); - - inline unsigned int GetEmbeddingNum() { - return (unsigned int) 
embed_num; - } - - inline void SetEmbeddingNum(size_t s) { - embed_num = (unsigned char)s; - } - - inline std::bitset GetColumnBitset() { - unsigned long meta = ((unsigned long*)this)[0]; - std::bitset bs(meta >> (8 * kColumnBitsetIndex)); - return bs; - } - - inline void SetColumnBitset(const std::bitset& bs, - unsigned int embnum) { - ((unsigned long*)(this))[0] = - (bs.to_ulong() << (8 * kColumnBitsetIndex)) | - (header_size << (8 * kHeaderSizeStartIndex)) | - (value_type << (8 * kValueTypeStartIndex)) | - (embnum << (8 * kEmbeddingNumStartIndex)); - } - - inline unsigned int GetHeaderSize() { - return (unsigned int) header_size; - } - - inline void SetHeaderSize(size_t size) { - header_size = (unsigned char)size; - } - - inline void SetLayoutType(LayoutType vt) { - value_type = (unsigned char)vt; - } - - inline LayoutType GetLayoutType() { - return (LayoutType)value_type; - } -}; - -struct LightHeader { -/*__________________________________________________________________________________________ - | | | | | embedding | slot | - | number of | valueptr | header | each bit a V* | V* | V* | - | embedding | type | size | 1 valid | actually pointer | actually pointer |... - | columns | | | 0 no-valid | by alloctor | by alloctor | - | (8 bits) | (8 bits) | (8 bits) | (40 bits) | (8 bytes) | (8 bytes) | - -------------------------------------------------------------------------------------------- -*/ - MetaHeader meta; - LightHeader() { - memset(this, 0, sizeof(LightHeader)); - meta.SetLayoutType(LayoutType::LIGHT); - meta.SetHeaderSize(sizeof(LightHeader) / sizeof(int64)); - } -}; - -struct NormalHeader { -/*_________________________________________________________________________________________________________________________ - | | | | | | | embedding | slot | - | number of | valueptr | header | each bit a V* | global step | freq counter | V* | V* | - | embedding | type | size | 1 valid | | | actually pointer | actually pointer |... 
- | columns | | | 0 no-valid | int64 | int64 | by alloctor | by alloctor | - | (8 bits) | (8 bits) | (8 bits) | (40 bits) | (8 bytes) | (8 bytes) | (8 bytes) | (8 bytes) | - -------------------------------------------------------------------------------------------------------------------------- - */ - MetaHeader meta; - int64 global_step; - int64 freq_counter; - - NormalHeader() { - memset(this, 0, sizeof(NormalHeader)); - meta.SetLayoutType(LayoutType::NORMAL); - meta.SetHeaderSize(sizeof(NormalHeader) / sizeof(int64)); - SetGlobalStep(-1); - } - - inline int64 GetGlobalStep() { - return global_step; - } - - inline void SetGlobalStep(int64 gs) { - global_step = gs; - } - - inline int64 GetFreqCounter() { - return freq_counter; - } - - inline void SetFreqCounter(int64 fc) { - freq_counter = fc; - } - - inline void AddFreq() { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + 1); - } - - inline void AddFreq(int64 count) { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + count); - } -}; - -struct FixedLengthHeader { -/*_________________________________________________________________________________ - | | | embeddings | - | slotflag + global step | freq counter | V | - | | | actually value | - | int64 | int64 | by alloctor | - | (8 bytes) | (8 bytes) | (4 * slot_num * emb_dim bytes) | - --------------------------------------------------------------------------------- -*/ - int64 global_step; - int64 freq_counter; - - FixedLengthHeader() { - memset(this, 0, sizeof(FixedLengthHeader)); - SetGlobalStep(-1); - } - - inline int64 GetGlobalStep() { - return global_step & 0x0000ffffffffffff; - } - - inline void SetGlobalStep(int64 gs) { - int64 temp = global_step; - temp &= 0xffff000000000000; - gs &= 0x0000ffffffffffff; - temp |= gs; - global_step = temp; - } - - inline void SetInitialized(int64 emb_index) { - int64 temp = 1; - temp = temp << (48 + emb_index); - global_step |= temp; - } - - inline int64 
GetFreqCounter() { - return freq_counter; - } - - inline void SetFreqCounter(int64 fc) { - freq_counter = fc; - } - - inline void AddFreq() { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + 1); - } - - inline void AddFreq(int64 count) { - __sync_bool_compare_and_swap(&freq_counter, - freq_counter, freq_counter + count); - } -}; -} // namespace - -template -class ValuePtr { - public: - virtual ~ValuePtr() {} - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) = 0; - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) = 0; - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) = 0; - - virtual void Destroy(Allocator* allocator) = 0; - - virtual void* GetPtr() const = 0; - - // Global Step - virtual int64 GetStep() { - LOG(FATAL) << "Unsupport GlobalStep in subclass of ValuePtrBase"; - return 0; - } - - virtual void SetStep(int64 gs) {} - - // Frequency Counter - virtual int64 GetFreq() { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - return 0; - } - - virtual void SetFreq(int64 freq) {} - - virtual void AddFreq() { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - } - - virtual void AddFreq(int64 count) { - LOG(FATAL) << "Unsupport FreqCounter in subclass of ValuePtrBase"; - } - - virtual void SetValue(V val, size_t size) { - LOG(FATAL) << "Unsupport SetValue in subclass of ValuePtrBase"; - } - - virtual void SetInitialized(int64 emb_index) { - LOG(FATAL) << "Unsupport SetInitialized in subclass of ValuePtrBase"; - } - - virtual bool SetPtr(V* ptr) { - LOG(FATAL) << "Unsupport SetInitialized in subclass of ValuePtrBase"; - return false; - } - -}; - -template -class LooseValuePtr : public ValuePtr { - public: - virtual ~LooseValuePtr() {} - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const 
V* default_v, int emb_index, int offset) { - MetaHeader* meta = (MetaHeader*)ptr_; - unsigned int embnum = (unsigned int)meta->embed_num; - auto metadata = meta->GetColumnBitset(); - - if (!metadata.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - metadata = meta->GetColumnBitset(); - if (metadata.test(emb_index)) { - this->flag_.clear(std::memory_order_release); - return ((V**)((int64*)ptr_ + - (unsigned int)meta->header_size))[emb_index]; - } - embnum++ ; - int64 alloc_value_len = value_len; - V* tensor_val = (V*)allocator->AllocateRaw( - Allocator::kAllocatorAlignment, sizeof(V) * alloc_value_len); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index] = tensor_val; - - metadata.set(emb_index); - // NOTE:if we use ((unsigned long*)((char*)ptr_ + 1))[0] = metadata.to_ulong(); - // the ptr_ will be occaionally modified from 0x7f18700912a0 to 0x700912a0 - // must use ((V**)ptr_ + 1 + 1)[emb_index] = tensor_val; to avoid - meta->SetColumnBitset(metadata, embnum); - this->flag_.clear(std::memory_order_release); - return tensor_val; - } else { - return ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index]; - } - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) { - return nullptr; - } - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) { - MetaHeader* meta = (MetaHeader*)ptr_; - auto metadata = meta->GetColumnBitset(); - if (metadata.test(emb_index)) { - return ((V**)((int64*)ptr_ + meta->GetHeaderSize()))[emb_index]; - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - MetaHeader* meta = (MetaHeader*)ptr_; - unsigned int embnum = (unsigned int)meta->embed_num; - auto metadata = meta->GetColumnBitset(); - for (int i = 0; i< embnum; i++) { - if (metadata.test(i)) { - V* val = ((V**)((int64*)ptr_ + 
meta->GetHeaderSize()))[i]; - if (val != nullptr) { - allocator->DeallocateRaw(val); - } - } - } - } - - virtual void* GetPtr() const { - return ptr_; - } - - protected: - void* ptr_; - std::atomic_flag flag_ = ATOMIC_FLAG_INIT; -}; - -template -class LightValuePtr : public LooseValuePtr { - public: - LightValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*)malloc( - sizeof(LightHeader) + sizeof(int64) * size); - memset(static_cast(this->ptr_) + sizeof(LightHeader), 0, sizeof(int64) * size); - new ((char*)this->ptr_) LightHeader(); - } - - ~LightValuePtr() { - free(this->ptr_); - } -}; - -template -class NormalValuePtr : public LooseValuePtr { - public: - NormalValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*) malloc(sizeof(NormalHeader) + sizeof(int64) * size); - memset(static_cast(this->ptr_) + sizeof(NormalHeader), 0, sizeof(int64) * size); - new ((char*)this->ptr_) NormalHeader(); - } - - ~NormalValuePtr() { - free(this->ptr_); - } - - int64 GetStep() { - return ((NormalHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((NormalHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((NormalHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((NormalHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - return ((NormalHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - return ((NormalHeader*)this->ptr_)->AddFreq(count); - } -}; - -template -class NormalContiguousValuePtr : public LooseValuePtr { - public: - NormalContiguousValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = allocator->AllocateRaw(Allocator::kAllocatorAlignment, - sizeof(FixedLengthHeader) + sizeof(V) * size); - memset(static_cast(this->ptr_) + sizeof(FixedLengthHeader), 0, sizeof(V) * size); - new ((char*)this->ptr_) FixedLengthHeader(); - } - - ~NormalContiguousValuePtr() { - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, 
- const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset); - } - V* tensor_val = - ((V*)this->ptr_ + sizeof(FixedLengthHeader) / sizeof(V) + offset); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - return tensor_val; - } else { - return (V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset; - } - } - - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(FixedLengthHeader) / - sizeof(V) + offset); - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - allocator->DeallocateRaw(this->ptr_); - } - - int64 GetStep() { - return ((FixedLengthHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((FixedLengthHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((FixedLengthHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((FixedLengthHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - ((FixedLengthHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - ((FixedLengthHeader*)this->ptr_)->AddFreq(count); - } - - void SetValue(V val, size_t size) { - for (int i = 0; i < size; ++i) { - *((V*)this->ptr_ + sizeof(FixedLengthHeader) / sizeof(V) + i) = val; - } - } -}; - -template -class NormalGPUValuePtr : public LooseValuePtr { - public: - NormalGPUValuePtr(Allocator* allocator, size_t size) { - this->ptr_ = (void*) malloc(sizeof(FixedLengthHeader) + sizeof(V *)); - *(V**)((char *)this->ptr_ + 
sizeof(FixedLengthHeader)) = nullptr; - new ((char*)this->ptr_) FixedLengthHeader(); - } - - ~NormalGPUValuePtr() { - free(this->ptr_); - } - -#if GOOGLE_CUDA - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - V* tensor_val = - *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - cudaMemcpy(tensor_val, default_v, value_len * sizeof(V), - cudaMemcpyDeviceToDevice); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - } - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } -#endif // GOOGLE_CUDA - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, - bool &need_initialize) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - need_initialize = 1; - this->flag_.clear(std::memory_order_release); - return reinterpret_cast(this); - } - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } - - // simple getter for V* and version - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) + offset; - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - return; - } - - int64 GetStep() { - return 
((FixedLengthHeader*)this->ptr_)->GetGlobalStep(); - } - - void SetStep(int64 gs) { - ((FixedLengthHeader*)this->ptr_)->SetGlobalStep(gs); - } - - int64 GetFreq() { - return ((FixedLengthHeader*)this->ptr_)->GetFreqCounter(); - } - - void SetFreq(int64 freq) { - ((FixedLengthHeader*)this->ptr_)->SetFreqCounter(freq); - } - - void AddFreq() { - ((FixedLengthHeader*)this->ptr_)->AddFreq(); - } - - void AddFreq(int64 count) override { - ((FixedLengthHeader*)this->ptr_)->AddFreq(count); - } - - bool SetPtr(V* ptr) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - V* value_ptr = *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)); - if (value_ptr == nullptr) { - *(V**)((char *)this->ptr_ + sizeof(FixedLengthHeader)) = ptr; - this->flag_.clear(std::memory_order_release); - return true; - } else { - this->flag_.clear(std::memory_order_release); - return false; - } - } - - void SetInitialized(int64 emb_index) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - ((FixedLengthHeader*)this->ptr_)->SetInitialized(emb_index); - this->flag_.clear(std::memory_order_release); - } - -}; - -template -class CompactValuePtr : public ValuePtr { - public: - CompactValuePtr(Allocator* allocator, size_t size) { - memset(static_cast(this->ptr_), 0, sizeof(V) * size + sizeof(int64)); - } - - ~CompactValuePtr() { - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset) override { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (!bs.test(emb_index)) { - while(this->flag_.test_and_set(std::memory_order_acquire)); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset); - } - V* tensor_val = - ((V*)this->ptr_ + sizeof(int64) / sizeof(V) + offset); - memcpy(tensor_val, default_v, sizeof(V) * value_len); - int8* m = (int8*)((char*)this->ptr_ + 6); - *m |= (1 << emb_index); - this->flag_.clear(std::memory_order_release); - return 
tensor_val; - } else { - return (V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset; - } - } - - virtual V* GetOrAllocate(Allocator* allocator, int64 value_len, - const V* default_v, int emb_index, int offset, bool &need_initialize) { - return nullptr; - } - - virtual V* GetValue(int emb_index, int offset) { - int8 meta = *((int8*)((char*)this->ptr_ + 6)); - std::bitset<8> bs(meta); - if (bs.test(emb_index)) { - return ((V*)this->ptr_ + sizeof(int64) / - sizeof(V) + offset); - } else { - return nullptr; - } - } - - virtual void Destroy(Allocator* allocator) { - allocator->DeallocateRaw(this->ptr_); - } - - virtual void* GetPtr() const { - return (void*)ptr_; - } - - private: - char ptr_[23]; - std::atomic_flag flag_ = ATOMIC_FLAG_INIT; -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_VALUE_PTR_H_ diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 115e3c4bae6..0c08c30c30a 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -439,7 +439,8 @@ tf_cc_test( tf_cuda_cc_test( name = "embedding_variable_ops_test", - srcs = ["embedding_variable_ops_test.cc"], + srcs = ["embedding_variable_ops_test.cc", + "embedding_variable_test.h"], extra_copts = ["-fexceptions", "-g"], deps = [ ":io", @@ -6497,7 +6498,7 @@ tf_kernel_library( "training_ali_ops_gpu.h", "training_ali_ops.h" ], - copts = tf_copts(), + copts = tf_copts() + ["-g"], deps = [ ":bounds_check", ":training_op_helpers", diff --git a/tensorflow/core/kernels/embedding_variable_memory_test.cc b/tensorflow/core/kernels/embedding_variable_memory_test.cc index 7ec6b1cf109..393e9a9754b 100644 --- a/tensorflow/core/kernels/embedding_variable_memory_test.cc +++ b/tensorflow/core/kernels/embedding_variable_memory_test.cc @@ -19,17 +19,22 @@ namespace embedding { float PerfMemory(Tensor& default_value, const std::vector& id_list, int value_size, int64 default_value_dim, - int64 filter_freq = 0) { + int64 filter_freq = 0, int64 
steps_to_live = 0, + int64 record_freq = false) { auto ev = CreateEmbeddingVar(value_size, default_value, - default_value_dim, filter_freq); - ValuePtr* value_ptr = nullptr; + default_value_dim, filter_freq, + steps_to_live, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + record_freq); + void* value_ptr = nullptr; bool is_filter = false; double start_mem, end_mem; start_mem = getResident() * getpagesize(); for (int i = 0; i < id_list.size(); i++) { ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); if (is_filter) - ev->flat(value_ptr, id_list[i]); + ev->flat(value_ptr); } end_mem = getResident() * getpagesize(); double used_mb = (end_mem - start_mem)/1000000; @@ -58,7 +63,7 @@ TEST(EmbeddingVariabelMemoryTest, TestMemory) { float used_mb = PerfMemory(default_value, id_list, value_size, default_value_dim); float theoritical_mb = - 50 + num_of_ids * (32 + 32 + value_size * sizeof(float))/ 1000000; + 50 + num_of_ids * (value_size * sizeof(float)) / 1000000; EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && (used_mb < theoritical_mb * 1.01)); @@ -68,9 +73,10 @@ TEST(EmbeddingVariabelMemoryTest, TestMemory) { used_mb = PerfMemory(default_value, id_list, value_size, default_value_dim, filter_freq); theoritical_mb = - 50 + num_of_ids * (32 + 32 + 16 + value_size * sizeof(float)/2)/ 1000000; + 50 + num_of_ids * (8 + value_size * sizeof(float) / 2 + + 4/*memory for ids_list*/) / 1000000; EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && - (used_mb < theoritical_mb * 1.01)); + (used_mb < theoritical_mb * 1.02)); } } //namespace embedding } //namespace tensorflow diff --git a/tensorflow/core/kernels/embedding_variable_ops_test.cc b/tensorflow/core/kernels/embedding_variable_ops_test.cc index 4839c171708..e30381fef07 100644 --- a/tensorflow/core/kernels/embedding_variable_ops_test.cc +++ b/tensorflow/core/kernels/embedding_variable_ops_test.cc @@ -21,6 +21,7 @@ #include "tensorflow/core/framework/tensor.h" #include 
"tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/embedding_variable_test.h" #include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/io/path.h" @@ -48,18 +49,6 @@ namespace { const int THREADNUM = 16; const int64 max = 2147483647; -template -class TestableEmbeddingVar : public EmbeddingVar { - public: - TestableEmbeddingVar(const string& name, - embedding::Storage* storage, - EmbeddingConfig emb_cfg = EmbeddingConfig(), - Allocator* alloc = nullptr) : EmbeddingVar( - name, storage, emb_cfg, alloc) {} - - using EmbeddingVar::GetFilter; -}; - struct ProcMemory { long size; // total program size long resident; // resident set size @@ -123,11 +112,7 @@ TEST(EmbeddingVariableTest, TestEmptyEV) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); { - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); LOG(INFO) << "size:" << variable->Size(); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); @@ -191,19 +176,14 @@ TEST(EmbeddingVariableTest, TestEVExportSmallLockless) { int64 value_size = 8; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddigVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(0, 0, 1, 1, "", 5), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); for (int64 i = 0; i 
< 5; i++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); vflat(i) = 5.0; } @@ -269,20 +249,15 @@ TEST(EmbeddingVariableTest, TestEVExportLargeLockless) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(0, 0, 1, 1, "", 5), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); int64 ev_size = 10048576; for (int64 i = 0; i < ev_size; i++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); } LOG(INFO) << "size:" << variable->Size(); @@ -344,9 +319,9 @@ TEST(EmbeddingVariableTest, TestEVExportLargeLockless) { void multi_insertion(EmbeddingVar* variable, int64 value_size){ for (long j = 0; j < 5; j++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(j, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, j); + typename TTypes::Flat vflat = variable->flat(value_ptr); } } @@ -355,12 +330,7 @@ TEST(EmbeddingVariableTest, TestMultiInsertion) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); 
- auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); std::vector insert_threads(THREADNUM); for (size_t i = 0 ; i < THREADNUM; i++) { @@ -375,54 +345,45 @@ TEST(EmbeddingVariableTest, TestMultiInsertion) { void InsertAndLookup(EmbeddingVar* variable, int64 *keys, long ReadLoops, int value_size){ - float *default_value_fake = (float *)malloc((value_size)*sizeof(float)); - for (int j = 0; j < value_size; j++) { - default_value_fake[j] = -1.0; - } for (long j = 0; j < ReadLoops; j++) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - float *default_value = (float *)malloc((value_size)*sizeof(float)); - for (int k = 0; k < value_size; k++) { - default_value[k] = (float)keys[j]; - } - variable->LookupOrCreate(keys[j], val, default_value); - variable->LookupOrCreate(keys[j], val, default_value_fake); - ASSERT_EQ(default_value[0] , val[0]); - free(val); - free(default_value); + void* val = nullptr; + void* val_1 = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(keys[j], &val, &is_filter, false); + variable->LookupOrCreateKey(keys[j], &val_1, &is_filter, false); + ASSERT_EQ(val, val_1); } - free(default_value_fake); } void MultiBloomFilter(EmbeddingVar* var, int value_size, int64 i) { for (long j = 0; j < 1; j++) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - var->LookupOrCreate(i+1, val, nullptr); + void* val = nullptr; + bool is_filter = true; + var->LookupOrCreateKey(i+1, &val, &is_filter, false); } } TEST(EmbeddingVariableTest, TestBloomFilter) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); - test::FillValues(&value, std::vector(value_size, 10.0)); - float* fill_v = (float*)malloc(value_size * sizeof(float)); - - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new 
EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, "normal", 10, 0.01), - cpu_allocator()); - - var->Init(value, 1); - - float *val = (float *)malloc((value_size+1)*sizeof(float)); - float *default_value = (float *)malloc((value_size+1)*sizeof(float)); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(1, val, default_value); - var->LookupOrCreate(2, val, default_value); + std::vector default_value = + {0.0 ,1.0 ,2.0 ,3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; + test::FillValues(&value, default_value); + + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01); + + //float *val = (float *)malloc((value_size+1)*sizeof(float)); + void* val = nullptr; + bool is_filter = true; + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(2, &val, &is_filter, false); std::vector keylist; std::vector valuelist; @@ -437,14 +398,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt64) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal", 10, 0.01, DT_UINT64), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT64); float *val = (float 
*)malloc((value_size+1)*sizeof(float)); @@ -509,14 +467,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt32) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal", 10, 0.01, DT_UINT32), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT32); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -581,14 +536,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt16) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal_contiguous", 10, 0.01, DT_UINT16), cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT16); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -654,14 +606,11 @@ TEST(EmbeddingVariableTest, TestBloomCounterInt8) { test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new TestableEmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 3, 99999, -1.0, - "normal_contiguous", 10, 0.01, DT_UINT8), cpu_allocator()); - - 
var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, + 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, + false, 10, 0.01, DT_UINT8); float *val = (float *)malloc((value_size+1)*sizeof(float)); @@ -725,12 +674,7 @@ TEST(EmbeddingVariableTest, TestInsertAndLookup) { int64 value_size = 128; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); - - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); int64 InsertLoops = 1000; bool* flag = (bool *)malloc(sizeof(bool)*max); @@ -765,8 +709,9 @@ TEST(EmbeddingVariableTest, TestInsertAndLookup) { } void MultiFilter(EmbeddingVar* variable, int value_size) { - float *val = (float *)malloc((value_size+1)*sizeof(float)); - variable->LookupOrCreate(20, val, nullptr); + bool is_filter = true; + void* val; + variable->LookupOrCreateKey(20, &val, &is_filter, false); } TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { @@ -774,14 +719,8 @@ TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(0, 0, 1, 1, "", 5, 7), - cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1, 7, 5); + float *val = (float *)malloc((value_size+1)*sizeof(float)); int thread_num = 5; std::vector insert_threads(thread_num); @@ -792,20 +731,16 @@ TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { t.join(); } - 
ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; var->LookupOrCreateKey(20, &value_ptr); - ASSERT_EQ(value_ptr->GetFreq(), thread_num); + ASSERT_EQ(var->GetFreq(20), thread_num); } EmbeddingVar* InitEV_Lockless(int64 value_size) { Tensor value(DT_INT64, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, EmbeddingConfig(), cpu_allocator()); + auto variable = CreateEmbeddingVar(value_size, value, 1); - variable->Init(value, 1); return variable; } @@ -813,7 +748,7 @@ void MultiLookup(EmbeddingVar* variable, int64 InsertLoop, int thread_num, int i) { for (int64 j = i * InsertLoop/thread_num; j < (i+1)*InsertLoop/thread_num; j++) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(j, &value_ptr); } } @@ -829,9 +764,9 @@ void BM_MULTIREAD_LOCKLESS(int iters, int thread_num) { float* fill_v = (float*)malloc(value_size * sizeof(float)); for (int64 i = 0; i < InsertLoop; i++){ - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); + typename TTypes::Flat vflat = variable->flat(value_ptr); } testing::StartTiming(); @@ -848,58 +783,6 @@ void BM_MULTIREAD_LOCKLESS(int iters, int thread_num) { } -void hybrid_process(EmbeddingVar* variable, - int64* keys, int64 InsertLoop, int thread_num, - int64 i, int64 value_size) { - float *val = (float *)malloc(sizeof(float)*(value_size + 1)); - for (int64 j = i * InsertLoop/thread_num; - j < (i+1) * InsertLoop/thread_num; j++) { - variable->LookupOrCreate(keys[j], val, nullptr); - } -} - -void BM_HYBRID_LOCKLESS(int iters, int thread_num) { - testing::StopTiming(); - testing::UseRealTime(); - - int64 value_size = 128; - auto variable = InitEV_Lockless(value_size); - int64 
InsertLoop = 1000000; - - srand((unsigned)time(NULL)); - int64 *keys = (int64 *)malloc(sizeof(int64)*InsertLoop); - - for (int64 i = 0; i < InsertLoop; i++) { - keys[i] = rand() % 1000; - } - - testing::StartTiming(); - while (iters--) { - std::vector insert_threads(thread_num); - for (size_t i = 0 ; i < thread_num; i++) { - insert_threads[i] = std::thread(hybrid_process, - variable, keys, InsertLoop, thread_num, i, value_size); - } - for (auto &t : insert_threads) { - t.join(); - } - } -} - -BENCHMARK(BM_MULTIREAD_LOCKLESS) - ->Arg(1) - ->Arg(2) - ->Arg(4) - ->Arg(8) - ->Arg(16); - -BENCHMARK(BM_HYBRID_LOCKLESS) - ->Arg(1) - ->Arg(2) - ->Arg(4) - ->Arg(8) - ->Arg(16); - TEST(EmbeddingVariableTest, TestAllocate) { int value_len = 8; @@ -923,23 +806,13 @@ TEST(EmbeddingVariableTest, TestEVStorageType_DRAM) { Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 9.0)); float* fill_v = (float*)malloc(value_size * sizeof(float)); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, - EmbeddingConfig(/*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */1, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64), - cpu_allocator()); - variable->Init(value, 1); + auto variable = CreateEmbeddingVar(value_size, value, 1); int64 ev_size = 100; for (int64 i = 0; i < ev_size; i++) { - variable->LookupOrCreate(i, fill_v, nullptr); + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(i, &val, &is_filter, false); } LOG(INFO) << "size:" << variable->Size(); @@ -947,59 +820,20 @@ TEST(EmbeddingVariableTest, TestEVStorageType_DRAM) { void t1(KVInterface* hashmap) { for (int i = 0; 
i< 100; ++i) { - hashmap->Insert(i, new NormalValuePtr(ev_allocator(), 100)); + hashmap->Insert(i, nullptr); } } TEST(EmbeddingVariableTest, TestRemoveLockless) { - KVInterface* hashmap = new LocklessHashMap(); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(t1, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} - -TEST(EmbeddingVariableTest, TestBatchCommitofDBKV) { - int64 value_size = 4; + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM, + false, false, {false, 0}); KVInterface* hashmap = - new LevelDBKV(testing::TmpDir()); - hashmap->SetTotalDims(value_size); - - for (int64 i = 0; i < 6; ++i) { - const ValuePtr* tmp = - new NormalContiguousValuePtr(ev_allocator(), value_size); - hashmap->Commit(i, tmp); - } - - for(int64 i = 0; i < 6; i++) { - ValuePtr* tmp = nullptr; - Status s = hashmap->Lookup(i, &tmp); - ASSERT_EQ(s.ok(), true); - } -} - -void InsertAndCommit(KVInterface* hashmap) { - for (int64 i = 0; i< 100; ++i) { - const ValuePtr* tmp = - new NormalContiguousValuePtr(ev_allocator(), 100); - hashmap->Insert(i, tmp); - hashmap->Commit(i, tmp); - } -} - -TEST(EmbeddingVariableTest, TestSizeDBKV) { - KVInterface* hashmap = - new LevelDBKV(testing::TmpDir()); - hashmap->SetTotalDims(100); + new LocklessHashMap(feat_desc); + feat_desc->InitSlotInfo(0, 100, {nullptr, 1}); ASSERT_EQ(hashmap->Size(), 0); LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(InsertAndCommit, hashmap); + auto t = std::thread(t1, hashmap); t.join(); LOG(INFO) << "hashmap size: " << hashmap->Size(); ASSERT_EQ(hashmap->Size(), 100); @@ -1190,213 +1024,6 @@ TEST(EmbeddingVariableTest, TestLFUCache) { } } -TEST(EmbeddingVariableTest, 
TestCacheRestore) { - setenv("TF_SSDHASH_ASYNC_COMPACTION", "false", 1); - int64 value_size = 4; - Tensor value(DT_FLOAT, TensorShape({value_size})); - test::FillValues(&value, std::vector(value_size, 9.0)); - float* fill_v = (float*)malloc(value_size * sizeof(float)); - std::vector size; - size.emplace_back(64); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal_contiguous", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage= embedding::StorageFactory::Create( - embedding::StorageConfig(embedding::DRAM_SSDHASH, - testing::TmpDir(), - size, "normal_contiguous", - emb_config), - cpu_allocator(), - "EmbeddingVar"); - auto variable = new EmbeddingVar("EmbeddingVar", - storage, emb_config, cpu_allocator()); - variable->Init(value, 1); - variable->InitCache(CacheStrategy::LFU); - - Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); - - int64 ev_size = 7; - int64 cache_size = 3; - for (int64 i = 1; i < cache_size; i++) { - ValuePtr* value_ptr = nullptr; - variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); - value_ptr->AddFreq(2); - } - for (int64 i = cache_size; i < ev_size; i++) { - ValuePtr* value_ptr = nullptr; - variable->LookupOrCreateKey(i, &value_ptr); - typename TTypes::Flat vflat = variable->flat(value_ptr, i); - value_ptr->AddFreq(1); - } - - LOG(INFO) << "size:" << variable->Size(); - - BundleWriter writer(Env::Default(), Prefix("foo")); - embedding::ShrinkArgs shrink_args; - shrink_args.global_step = 1; - variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args); - TF_ASSERT_OK(writer.Finish()); - variable->Unref(); - - auto imported_storage= embedding::StorageFactory::Create( - 
embedding::StorageConfig(embedding::DRAM_SSDHASH, - testing::TmpDir(), - size, "normal_contiguous", - emb_config), - cpu_allocator(), - "EmbeddingVar1"); - auto imported_variable = new EmbeddingVar("EmbeddingVar1", - imported_storage, emb_config, cpu_allocator()); - imported_variable->Init(value, 1); - imported_variable->InitCache(CacheStrategy::LFU); - - BundleReader reader(Env::Default(), Prefix("foo")); - std::string name_string("var"); - imported_variable->Restore(name_string, Prefix("foo"), 0, 1, false, &reader, false); - - ASSERT_EQ(imported_storage->Size(0), ev_size - cache_size); - ASSERT_EQ(imported_storage->Size(1), 2); - delete imported_storage; -} - -void t1_gpu(KVInterface* hashmap) { - for (int i = 0; i< 100; ++i) { - hashmap->Insert(i, new NormalGPUValuePtr(ev_allocator(), 100)); - } -} - -#if GOOGLE_CUDA -TEST(EmbeddingVariableTest,TestRemoveLocklessCPU) { - SessionOptions sops; - std::unique_ptr device = - DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0"); - Allocator* gpu_allocator = GPUProcessState::singleton()->GetGPUAllocator( - GPUOptions(), TfGpuId(0), 1 << 26); - KVInterface* hashmap = - new LocklessHashMapCPU(gpu_allocator); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = std::thread(t1, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} -#endif // GOOGLE_CUDA - -/*void CommitGPU(KVInterface* hashmap) { - for (int64 i = 0; i< 100; ++i) { - ValuePtr* tmp= new NormalGPUValuePtr(ev_allocator(), 100); - hashmap->Commit(i, tmp); - } -} - -TEST(EmbeddingVariableTest, TestCommitHashMapCPU) { - KVInterface* hashmap = new LocklessHashMapCPU(); - hashmap->SetTotalDims(100); - ASSERT_EQ(hashmap->Size(), 0); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - auto t = 
std::thread(CommitGPU, hashmap); - t.join(); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - ASSERT_EQ(hashmap->Size(), 100); - TF_CHECK_OK(hashmap->Remove(1)); - TF_CHECK_OK(hashmap->Remove(2)); - ASSERT_EQ(hashmap->Size(), 98); - LOG(INFO) << "2 size:" << hashmap->Size(); -} - -TEST(EmbeddingVariableTest, TestGPUValuePtr) { - int ev_list_size = 32; - ValuePtr* ptr_ = new NormalGPUValuePtr(ev_allocator(), ev_list_size); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float host_data[ev_list_size]; - float initial_data[ev_list_size]; - for(int i = 0;i < ev_list_size;++i){ - initial_data[i] = 10; - } - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << initial_data[i]; - } - cudaMemcpy(address, initial_data, ev_list_size * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(host_data, address, ev_list_size * sizeof(float), cudaMemcpyDeviceToHost); - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << host_data[i]; - } -}//Forbidden, due to no gpu allocator at that time - -TEST(EmbeddingVariableTest, TestCommitValue) { - int ev_list_size = 32; - ValuePtr* ptr_ = new NormalGPUValuePtr(ev_allocator(),ev_list_size); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float initial_data[ev_list_size]; - for(int i = 0;i < ev_list_size;++i){ - initial_data[i] = 10; - } - cudaMemcpy(address, initial_data, ev_list_size * sizeof(float), cudaMemcpyHostToDevice); - KVInterface* hashmap = new LocklessHashMapCPU(); - hashmap->SetTotalDims(ev_list_size); - hashmap->Commit(1, ptr_); - ValuePtr* check; - hashmap->Lookup(1,&check); - LOG(INFO) << "hashmap size: " << hashmap->Size(); - float* tmp = (float *)((char *)check->GetPtr() + sizeof(FixedLengthHeader)); - - for(int i = 0;i < ev_list_size;++i){ - LOG(INFO) << i << " " << tmp[i]; - //ASSERT_EQ(tmp[i], 10); - }// -} - -TEST(EmbeddingVariableTest, TestBatchCommitofLocklessHashMapCPU) { - KVInterface* hashmap = new 
LocklessHashMapCPU(); - const int EmbeddingSize = 16; - const int BatchSize = 16; - - hashmap->SetTotalDims(EmbeddingSize); - std::vector*> value_ptr_list; - std::vector key_list; - - for(int64 i = 0; i < BatchSize; i++) { - key_list.emplace_back(i); - ValuePtr* ptr_ = new NormalGPUValuePtr(EmbeddingSize); - float* address = *(float **)((char *)ptr_->GetPtr() + sizeof(FixedLengthHeader)); - float initial_data[EmbeddingSize]; - for(int j = 0;j < EmbeddingSize;++j){ - initial_data[j] = i; - //LOG(INFO) << "initial[" << i << "][" << j << "]=" << initial_data[j]; - } - cudaMemcpy(address, initial_data, EmbeddingSize * sizeof(float), cudaMemcpyHostToDevice); - value_ptr_list.emplace_back(ptr_); - }//initialize V on GPU - - timespec start,end; - clock_gettime(CLOCK_MONOTONIC, &start); - hashmap->BatchCommit(key_list, value_ptr_list); - clock_gettime(CLOCK_MONOTONIC, &end); - std::cout << "time: " << ((double)(end.tv_sec - start.tv_sec)*1000000000 + end.tv_nsec - start.tv_nsec)/1000000 << "ms" << std::endl; - - for(int64 i = 0; i < BatchSize; i++) { - ValuePtr* check; - hashmap->Lookup(i,&check); - float* tmp = (float *)((char *)check->GetPtr() + sizeof(FixedLengthHeader)); - for(int j = 0;j < EmbeddingSize;++j){ - LOG(INFO) << "batch[" << i << "][" << j << "]=" << tmp[j]; - //ASSERT_EQ(tmp[j], i); - } - }//compare value after BatchCommit -} -*/ - const int total_size = 1024 * 8; const int th_num = 1; const int malloc_size = total_size / th_num; @@ -1466,17 +1093,11 @@ TEST(EmbeddingVariableTest, TestCPUGPUMalloc) { auto mem_pool = new EmbeddingMemoryPool(gpu_allocator, 256, 1024); float* ptr_1 = mem_pool->Allocate(); float* ptr_2 = mem_pool->Allocate(); - ValuePtr* value_ptr1 = new NormalGPUValuePtr(gpu_allocator, 256); - ValuePtr* value_ptr2 = new NormalGPUValuePtr(gpu_allocator, 256); - value_ptr1->SetPtr(ptr_1); - value_ptr2->SetPtr(ptr_2); - value_ptr1->SetInitialized(0); - value_ptr2->SetInitialized(0); - std::vector*> value_ptrs; - 
value_ptrs.emplace_back(value_ptr1); + std::vector value_ptrs; + value_ptrs.emplace_back(ptr_1); mem_pool->Deallocate(value_ptrs); value_ptrs.clear(); - value_ptrs.emplace_back(value_ptr2); + value_ptrs.emplace_back(ptr_2); mem_pool->Deallocate(value_ptrs); float* ptr_3 = mem_pool->Allocate(); ASSERT_EQ(ptr_1, ptr_3); @@ -1539,16 +1160,16 @@ TEST(EmbeddingVariableTest, TestEVMallocFree) { void SingleCommit(KVInterface* hashmap, std::vector keys, int bias) { - std::vector*> value_ptrs; + std::vector value_ptrs; for (int64 i = 0; i < keys.size(); ++i) { - ValuePtr* tmp = - new NormalContiguousValuePtr(cpu_allocator(), 124); - tmp->SetValue(float(keys[i] + bias), 124); + void* tmp = cpu_allocator()->AllocateRaw(0, 124 * sizeof(float) + 16); + for (int j = 0; j < 124; j++) { + ((float*)tmp)[j] = keys[i] + bias; + } value_ptrs.push_back(tmp); } ASSERT_EQ(keys.size(), value_ptrs.size()); uint64 start = Env::Default()->NowNanos(); - for (int64 i = 0; i < keys.size(); i++) { hashmap->Commit(keys[i], value_ptrs[i]); } @@ -1558,9 +1179,13 @@ void SingleCommit(KVInterface* hashmap, void TestCompaction() { std::string temp_dir = testing::TmpDir(); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, + true, true, {false, 0}); auto hashmap = new SSDHashKV( - temp_dir, cpu_allocator()); - hashmap->SetTotalDims(124); + temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); ASSERT_EQ(hashmap->Size(), 0); std::vector ids; for (int i = 0; i < 262144; i++) { @@ -1576,12 +1201,12 @@ void TestCompaction() { t1.join(); ids.clear(); sleep(1); - ValuePtr* val = nullptr; + void* val = nullptr; for (int i = 131073; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i+3); + ASSERT_EQ(v[j], i+3); } } for (int i = 131073; i < 262144; i++) { @@ -1596,16 +1221,16 @@ void TestCompaction() { 
sleep(1); for (int i = 0; i < 131073; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i + 1); + ASSERT_EQ(v[j], i + 1); } } for (int i = 131073; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i + 2); + ASSERT_EQ(v[j], i + 2); } } delete hashmap; @@ -1622,10 +1247,14 @@ TEST(KVInterfaceTest, TestSSDKVSyncCompaction) { } void TestReadEmbFile() { + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, + true, true, {false, 0}); std::string temp_dir = testing::TmpDir(); auto hashmap = new SSDHashKV( - temp_dir, cpu_allocator()); - hashmap->SetTotalDims(124); + temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); ASSERT_EQ(hashmap->Size(), 0); std::vector ids; for (int i = 0; i < 262145; i++) { @@ -1634,12 +1263,12 @@ void TestReadEmbFile() { SingleCommit(hashmap, ids, 3); sleep(1); ids.clear(); - ValuePtr* val = nullptr; + void* val = nullptr; for (int i = 0; i < 262144; i++) { hashmap->Lookup(i, &val); - float* v = (float*)val->GetPtr(); + float* v = (float*)val; for (int j = 0; j < 124; j++){ - ASSERT_EQ(v[4+j], i+3); + ASSERT_EQ(v[j], i+3); } } delete hashmap; @@ -1666,9 +1295,10 @@ TEST(KVInterfaceTest, TestDirectIoFile) { void InsertKey(EmbeddingVar* variable, int value_size) { float *val = (float *)malloc((value_size+1)*sizeof(float)); for (int64 i = 0; i < 100000000; i++) { - variable->LookupOrCreate(20, val, nullptr); + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(20, &val, &is_filter, false); } - LOG(INFO)<<"Finish Insert"; } void RemoveKey(EmbeddingVar* variable) { @@ -1676,29 +1306,13 @@ void RemoveKey(EmbeddingVar* variable) { sleep(1); variable->storage()->Remove(20); } - LOG(INFO)<<"Remove thread finish"; } 
TEST(EmbeddingVariableTest, TestLookupRemoveConcurrency) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */2, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - emb_config, - cpu_allocator()); - - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1); int thread_num = 5; std::vector insert_threads(thread_num); for (size_t i = 0 ; i < thread_num - 1; i++) { @@ -1714,21 +1328,7 @@ TEST(EmbeddingVariableTest, TestInsertAndGetSnapshot) { int value_size = 10; Tensor value(DT_FLOAT, TensorShape({value_size})); test::FillValues(&value, std::vector(value_size, 10.0)); - auto emb_config = EmbeddingConfig( - /*emb_index = */0, /*primary_emb_index = */0, - /*block_num = */1, /*slot_num = */0, - /*name = */"", /*steps_to_live = */0, - /*filter_freq = */0, /*max_freq = */999999, - /*l2_weight_threshold = */-1.0, /*layout = */"normal", - /*max_element_size = */0, /*false_positive_probability = */-1.0, - /*counter_type = */DT_UINT64); - auto storage = embedding::StorageFactory::Create( - embedding::StorageConfig(), cpu_allocator(), "EmbeddingVar"); - auto var = new EmbeddingVar("EmbeddingVar", - storage, - emb_config, - cpu_allocator()); - var->Init(value, 1); + auto var = CreateEmbeddingVar(value_size, value, 1); float* set_value = (float*)malloc(value_size * sizeof(float)); //Insertion for (int i = 0; i < 100; i++) { diff --git a/tensorflow/core/kernels/embedding_variable_performance_test.cc 
b/tensorflow/core/kernels/embedding_variable_performance_test.cc index 9b01e35840b..16f4a894858 100644 --- a/tensorflow/core/kernels/embedding_variable_performance_test.cc +++ b/tensorflow/core/kernels/embedding_variable_performance_test.cc @@ -90,14 +90,21 @@ void GenerateSkewInput(int num_of_ids, float skew_factor, void thread_lookup_or_create( EmbeddingVar* ev, const int64* input_batch, + float* default_value, + int default_value_dim, float** outputs, int value_size, int start, int end) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = start; i < end; i++) { ev->LookupOrCreateKey(input_batch[i], &value_ptr, &is_filter, false); - auto val = ev->flat(value_ptr, input_batch[i]); - memcpy(outputs[i], &val(0), sizeof(float) * value_size); + if (is_filter) { + auto val = ev->flat(value_ptr); + memcpy(outputs[i], &val(0), sizeof(float) * value_size); + } else { + int default_value_index = input_batch[i] % default_value_dim; + memcpy(outputs[i], default_value + default_value_index * value_size, sizeof(float) * value_size); + } } } @@ -138,6 +145,8 @@ double PerfLookupOrCreate( for (int i = 0; i < num_thread; i++) { worker_threads[i] = std::thread(thread_lookup_or_create, ev, input_batches[k].data(), + default_value_matrix.data(), + default_value_dim, outputs.data(), value_size, thread_task_range[i].first, thread_task_range[i].second); @@ -201,11 +210,11 @@ void thread_lookup( const int64* input_batch, float** outputs, int value_size, int start, int end) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = start; i < end; i++) { ev->LookupKey(input_batch[i], &value_ptr); - auto val = ev->flat(value_ptr, input_batch[i]); + auto val = ev->flat(value_ptr); memcpy(outputs[i], &val(0), sizeof(float) * value_size); } } @@ -293,7 +302,7 @@ TEST(EmbeddingVariablePerformanceTest, TestLookup) { } } auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim); - 
ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; for (int i = 0; i < hot_ids_list.size(); i++) { ev->LookupOrCreateKey(hot_ids_list[i], &value_ptr, &is_filter, false); @@ -339,13 +348,13 @@ void PerfSave(Tensor& default_value, value_size, default_value, default_value_dim, 0, steps_to_live, l2_weight_threshold); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; srand((unsigned)time(NULL)); for (int i = 0; i < id_list.size(); i++) { ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); - ev->flat(value_ptr, id_list[i]); + ev->flat(value_ptr); int64 global_step = rand() % 100; ev->UpdateVersion(value_ptr, global_step); } diff --git a/tensorflow/core/kernels/embedding_variable_test.h b/tensorflow/core/kernels/embedding_variable_test.h index d06304fb78a..07c34764fb0 100644 --- a/tensorflow/core/kernels/embedding_variable_test.h +++ b/tensorflow/core/kernels/embedding_variable_test.h @@ -107,35 +107,42 @@ EmbeddingVar* CreateEmbeddingVar( int value_size, Tensor& default_value, int64 default_value_dim, int64 filter_freq = 0, int64 steps_to_live = 0, - float l2_weight_threshold=-1.0) { - std::string layout_type = "light"; - if (filter_freq != 0) { - layout_type = "normal"; - } - - if (steps_to_live != 0) { - if (layout_type == "light") { - layout_type = "normal_contiguous"; - } - } + float l2_weight_threshold=-1.0, + embedding::StorageType storage_type = embedding::StorageType::DRAM, + std::vector storage_size = {1024*1024*1024, + 1024*1024*1024, + 1024*1024*1024, + 1024*1024*1024}, + bool record_freq = false, + int64 max_element_size = 0, + float false_positive_probability = -1.0, + DataType counter_type = DT_UINT64) { auto embedding_config = EmbeddingConfig( - 0, 0, 1, 0, "emb_var", steps_to_live, - filter_freq, 999999, l2_weight_threshold, layout_type, - 0, -1.0, DT_UINT64, default_value_dim, - 0.0, false, false, false); + 0, 0, 1, 0, "emb_var", steps_to_live, + filter_freq, 999999, 
l2_weight_threshold, + max_element_size, false_positive_probability, + counter_type, default_value_dim, + 0.0, record_freq, false, false); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), storage_type, + record_freq, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( - embedding::StorageType::DRAM, "", - {1024, 1024, 1024, 1024}, layout_type, + storage_type, "", + storage_size, embedding_config), cpu_allocator(), + feat_desc, "emb_var"); auto ev = new EmbeddingVar( "emb_var", storage, embedding_config, - cpu_allocator()); + cpu_allocator(), + feat_desc); ev->Init(default_value, default_value_dim); return ev; } diff --git a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc index 55dd40176a8..2f07e2ef537 100644 --- a/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc +++ b/tensorflow/core/kernels/group_embedding/group_embedding_lookup_ops_test.cc @@ -774,7 +774,7 @@ class GroupEmbeddingVariableForWardOpTest : public OpsTestBase { embedding_var->Init(value, 1); for (int64 j = 0; j < nnz; ++j) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); typename TTypes::Flat vflat = embedding_var->flat(value_ptr); @@ -958,7 +958,7 @@ class GroupEmbeddingVariableBackWardOpTest : public OpsTestBase { embedding_var->Init(value, 1); for (int64 j = 0; j < nnz; ++j) { - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; Status s = embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); typename TTypes::Flat vflat = embedding_var->flat(value_ptr); diff --git a/tensorflow/core/kernels/incr_save_restore_ops.h b/tensorflow/core/kernels/incr_save_restore_ops.h index 0582697ad16..d84838ae413 100644 --- 
a/tensorflow/core/kernels/incr_save_restore_ops.h +++ b/tensorflow/core/kernels/incr_save_restore_ops.h @@ -225,9 +225,9 @@ class IncrEVValueDumpIterator : public DumpIterator { keys_idx_++; col_idx_ = 0; } - ValuePtr* value_ptr = NULL; + void* value_ptr = NULL; TF_CHECK_OK(emb_var_->LookupOrCreateKey(*keys_iter_, &value_ptr)); - return emb_var_->flat(value_ptr, *keys_iter_)(col_idx_++); + return emb_var_->flat(value_ptr)(col_idx_++); } private: diff --git a/tensorflow/core/kernels/kv_variable_lookup_ops.cc b/tensorflow/core/kernels/kv_variable_lookup_ops.cc index c69aec8ebb9..7e40dfff7ac 100644 --- a/tensorflow/core/kernels/kv_variable_lookup_ops.cc +++ b/tensorflow/core/kernels/kv_variable_lookup_ops.cc @@ -121,7 +121,7 @@ class KvResourceLookupIDOp : public OpKernel { const int64 indices_size = static_cast(indices_flat.dimension(0)); EmbeddingVarContext ev_ctx(c); ev->GetOrCreateKey(ev_ctx, indices, - reinterpret_cast**>(out_base), + reinterpret_cast(out_base), indices_size); } } @@ -203,7 +203,7 @@ class KvResourceCollectEmbeddingOp : public OpKernel { const size_t slice_bytes = slice_elems * sizeof(TValue); EmbeddingVarContext ev_ctx(c); ev->GatherEmbeddings(ev_ctx, indices, - (ValuePtr**)pointer.data(), + (void**)pointer.data(), out_base, N); } } diff --git a/tensorflow/core/kernels/kv_variable_ops.cc b/tensorflow/core/kernels/kv_variable_ops.cc index 8a01a7bf2cd..5cd0ef140bd 100644 --- a/tensorflow/core/kernels/kv_variable_ops.cc +++ b/tensorflow/core/kernels/kv_variable_ops.cc @@ -214,16 +214,16 @@ class InitializeKvVariableOp : public OpKernel { int64 storage_type = 0; OP_REQUIRES_OK(c, c->GetAttr("storage_type", &storage_type)); storage_type_ = static_cast(storage_type); - auto device_type_str = c->device_type().type_string(); + device_type_str_ = c->device_type().type_string(); if (storage_type_ == embedding::DEFAULT) { - if (device_type_str == "CPU") { + if (device_type_str_ == "CPU") { storage_type_ = embedding::DRAM; } else { storage_type_ = 
embedding::HBM; } } - bool if_op_on_gpu = (device_type_str == "GPU"); + bool if_op_on_gpu = (device_type_str_ == "GPU"); bool if_embedding_on_hbm = (storage_type_ == embedding::HBM || storage_type_ == embedding::HBM_DRAM || storage_type_ == embedding::HBM_DRAM_SSDHASH); @@ -238,57 +238,14 @@ class InitializeKvVariableOp : public OpKernel { filter_freq_ = 0; } - OP_REQUIRES_OK(c, c->GetAttr("layout", &layout_)); - if (!layout_.empty()) { - // use layout by user configuration - } else if ((filter_freq_ != 0 && max_element_size_ == 0) - || steps_to_live_ != 0 || record_freq_ - || record_version_ || storage_type > 5) { - if (block_num_ > 1 || (filter_freq_ != 0 && storage_type <= 5)) { - layout_ = "normal"; - } else { - if (storage_type == embedding::HBM_DRAM || - storage_type == embedding::HBM_DRAM_SSDHASH) { - layout_ = "normal_contiguous_gpu"; - } else { - layout_ = "normal_contiguous"; - } - } - } else { - layout_ = "light"; - } - - CHECK(block_num_ == 1 || layout_ != "normal_contiguous"); - - if ("compact" == layout_) { - OP_REQUIRES(c, shape_.dim_size(0) == 1 && - storage_type_ == embedding::StorageType::DRAM, - errors::InvalidArgument("embedding_dim must be 1 and storage type" - " should be DRAM when layout is 'compact'.")); - } + record_freq_ |= (storage_type > 5); + record_version_ |= (storage_type > 5); OP_REQUIRES(c, steps_to_live_ >= 0, errors::InvalidArgument( "steps_to_live must >= 0, ", std::to_string(steps_to_live_))); OP_REQUIRES_OK(c, c->GetAttr("ht_type", &ht_type_)); - if (embedding::StorageType::LEVELDB == storage_type_) { - ht_type_ = "leveldb_kv"; - if (layout_ != "normal_contiguous") - LOG(WARNING) - << "layout must be NORAML_CONTIGUOUS when storage type is LEVELDB"; - layout_ = "normal_contiguous"; - } - - if (embedding::StorageType::PMEM_LIBPMEM == storage_type_ || - embedding::StorageType::PMEM_MEMKIND == storage_type_){ - if (layout_ != "normal_contiguous"){ - LOG(WARNING) - << "layout must be NORAML_CONTIGUOUS" - << " when storage type is 
PMEM_LIBPMEM or PMEM_MEMKIND"; - } - layout_ = "normal_contiguous"; - } OP_REQUIRES_OK(c, c->GetAttr("ht_partition_num", &ht_partition_num_)); } @@ -314,35 +271,43 @@ class InitializeKvVariableOp : public OpKernel { context, handle_self, &ev, [this, default_values, opname, context, handle_self](EmbeddingVar** ptr) { - Allocator* gpu_allocator = + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); auto embedding_config = EmbeddingConfig( emb_index_ + block_num_ * slot_index_, emb_index_, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, + l2_weight_threshold_, max_element_size_, false_positive_probability_, counter_type_, default_value_dim_, default_value_no_permission_, record_freq_, record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - gpu_allocator, + alloc_for_ev, + feat_desc, handle_self.name()); *ptr = new EmbeddingVar( handle_self.name(), storage, embedding_config, - gpu_allocator); - return Status::OK(); - })); - ev->Init(default_values, default_value_dim_); + alloc_for_ev, + feat_desc); + return (*ptr)->Init(default_values, default_value_dim_); + })); } else { EmbeddingVar* primary_variable = nullptr; OP_REQUIRES_OK( @@ -352,30 +317,38 @@ class InitializeKvVariableOp : public OpKernel { [this, default_values, opname, handle_primary, context](EmbeddingVar** ptr) { int64 primary_slot_index(0), primary_emb_index(0); - Allocator* gpu_allocator = context->device()->GetAllocator(AllocatorAttributes()); - //Allocator* 
gpu_allocator = context->get_allocator(AllocatorAttributes()); + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); auto embedding_config = EmbeddingConfig( primary_emb_index + block_num_ * primary_slot_index, primary_emb_index, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, + l2_weight_threshold_, max_element_size_, false_positive_probability_, counter_type_, 0, record_freq_, record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - gpu_allocator, + alloc_for_ev, + feat_desc, handle_primary.name()); *ptr = new EmbeddingVar( handle_primary.name(), storage, embedding_config, - gpu_allocator); + alloc_for_ev, + feat_desc); // default_values is slot value, should not to initialize primary value return Status::OK(); })); @@ -386,20 +359,26 @@ class InitializeKvVariableOp : public OpKernel { context, handle_self, &ev, [this, default_values, opname, primary_variable, handle_self, context](EmbeddingVar** ptr) { + Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, + emb_index_, + block_num_, slot_num_, opname, + steps_to_live_, filter_freq_, + max_freq_, l2_weight_threshold_, + max_element_size_, + false_positive_probability_, + counter_type_, default_value_dim_, + default_value_no_permission_, + record_freq_, record_version_, + is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; *ptr = new EmbeddingVar(handle_self.name(), primary_variable->storage(), - EmbeddingConfig(emb_index_ + block_num_ * slot_index_, - emb_index_, - block_num_, slot_num_, opname, - steps_to_live_, filter_freq_, - max_freq_, l2_weight_threshold_, - layout_, max_element_size_, - false_positive_probability_, - counter_type_, default_value_dim_, - default_value_no_permission_, - record_freq_, record_version_, - is_inference_), - primary_variable->GetAllocator()); + embedding_config, + alloc_for_ev, + primary_variable->feature_descriptor()); return (*ptr)->Init(default_values, default_value_dim_); })); core::ScopedUnref unref_me(primary_variable); @@ -424,7 +403,6 @@ class InitializeKvVariableOp : public OpKernel { int64 filter_freq_; int64 max_freq_; float l2_weight_threshold_; - std::string layout_; int64 max_element_size_; float false_positive_probability_; embedding::StorageType storage_type_; @@ -436,6 +414,7 @@ class InitializeKvVariableOp : public OpKernel { bool record_version_; bool is_inference_; bool is_set_initialized_; + std::string device_type_str_; }; #define REGISTER_KERNELS(ktype, vtype) \ diff --git a/tensorflow/core/kernels/kv_variable_ops.h b/tensorflow/core/kernels/kv_variable_ops.h index 8e3572443ba..3202e6d12bf 100644 --- a/tensorflow/core/kernels/kv_variable_ops.h +++ b/tensorflow/core/kernels/kv_variable_ops.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/embedding/cache_factory.h" #include "tensorflow/core/framework/embedding/embedding_var.h" #include "tensorflow/core/framework/embedding/kv_interface.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index 23a504eea5d..3b10c2521b9 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -120,20 +120,6 @@ class KvResourceImportV2Op: public AsyncOpKernel { OP_REQUIRES_OK(c, c->GetAttr("record_version", &record_version_)); OP_REQUIRES_OK(c, c->GetAttr("reset_version", &reset_version_)); - if ((filter_freq_ != 0 && max_element_size_ == 0) - || steps_to_live_ != -1 || record_freq_ - || record_version_ || storage_type > 5) { - if (block_num_ > 1 || (filter_freq_ != 0 && storage_type <= 5)) { - layout_ = "normal"; - } else { - layout_ = "normal_contiguous"; - } - } else { - layout_ = "light"; - } - - CHECK(block_num_ == 1 || layout_ != "normal_contiguous"); - TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_EV_ASYNC_RESTORE", true, &ev_async_restore_)); } @@ -170,24 +156,33 @@ class KvResourceImportV2Op: public AsyncOpKernel { block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, l2_weight_threshold_, - layout_, max_element_size_, + max_element_size_, false_positive_probability_, counter_type_, default_value_dim_, default_value_no_permission_, record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - allocator, + alloc_for_ev, + feat_desc, handle_self.name()); *ptr = new EmbeddingVar( handle_self.name(), storage, embedding_config, - allocator); + alloc_for_ev, + feat_desc); return Status::OK(); })); ev->Init(default_values, default_value_dim_); @@ -207,19 +202,27 @@ class KvResourceImportV2Op: public AsyncOpKernel { primary_emb_index, block_num_, slot_num_, opname + "-primary", steps_to_live_, filter_freq_, max_freq_, l2_weight_threshold_, - layout_, max_element_size_, + max_element_size_, false_positive_probability_, counter_type_, 0, record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); auto storage = embedding::StorageFactory::Create( embedding::StorageConfig( storage_type_, storage_path_, - storage_size_, layout_, + storage_size_, embedding_config), - allocator, + alloc_for_ev, + feat_desc, handle_primary.name()); *ptr = new EmbeddingVar(handle_primary.name(), - storage, embedding_config, allocator); + storage, embedding_config, alloc_for_ev, feat_desc); // default_values is slot value, should not to initialize primary value return Status::OK(); })); @@ -232,17 +235,22 @@ class KvResourceImportV2Op: public AsyncOpKernel { handle_self, context](EmbeddingVar** ptr) { Allocator* allocator = context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, + emb_index_, block_num_, slot_num_, opname, + steps_to_live_, filter_freq_, max_freq_, + l2_weight_threshold_, max_element_size_, + false_positive_probability_, + counter_type_, default_value_dim_, + default_value_no_permission_, + record_freq_, record_version_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; *ptr = new EmbeddingVar(handle_self.name(), primary_variable->storage(), - EmbeddingConfig(emb_index_ + block_num_ * slot_index_, - emb_index_, block_num_, slot_num_, opname, - steps_to_live_, filter_freq_, max_freq_, - l2_weight_threshold_, layout_, max_element_size_, - false_positive_probability_, - counter_type_, default_value_dim_, - default_value_no_permission_, - record_freq_, record_version_), - allocator); + embedding_config, + alloc_for_ev, + primary_variable->feature_descriptor()); return (*ptr)->Init(default_values, default_value_dim_); })); core::ScopedUnref unref_me(primary_variable); @@ -290,7 +298,6 @@ class KvResourceImportV2Op: public AsyncOpKernel { int64 slot_num_; int64 filter_freq_; float l2_weight_threshold_; - std::string layout_; int64 max_freq_; embedding::StorageType storage_type_; std::string storage_path_; @@ -301,6 +308,7 @@ class KvResourceImportV2Op: public AsyncOpKernel { bool record_version_; bool reset_version_; bool ev_async_restore_; + std::string device_type_str_; }; #define REGISTER_KERNELS(dev, ktype, vtype) \ diff --git a/tensorflow/core/kernels/save_restore_tensor.h b/tensorflow/core/kernels/save_restore_tensor.h index 4f69ebe3fb5..da58e17e1bb 100644 --- a/tensorflow/core/kernels/save_restore_tensor.h +++ b/tensorflow/core/kernels/save_restore_tensor.h @@ -23,7 +23,6 @@ limitations under the License. 
#include "tensorflow/core/framework/hash_table/hash_table.h" #include "tensorflow/core/framework/hash_table/bloom_filter_strategy.h" #include "tensorflow/core/framework/embedding/kv_interface.h" -#include "tensorflow/core/framework/embedding/value_ptr.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/training_ali_op_helpers.h b/tensorflow/core/kernels/training_ali_op_helpers.h index e013a6a2bae..12948de24a4 100644 --- a/tensorflow/core/kernels/training_ali_op_helpers.h +++ b/tensorflow/core/kernels/training_ali_op_helpers.h @@ -121,55 +121,54 @@ EmbeddingVariableInputLockHolder MaybeLockEmbeddingVariableInputMutexesInO template void LookupKeyAndSetVersion( OpKernelContext* ctx, EmbeddingVar* var, - ValuePtr** value_ptrs, Tstep gs, const K* indices, + void** value_ptrs, Tstep gs, const K* indices, int64 task_size, bool indices_as_pointer, int counts_index) { + EmbeddingVarContext ev_ctx(ctx); int64* indices_counts = nullptr; std::function get_count_fn = 0; if (counts_index != -1) { const Tensor& counts_tensor = ctx->input(counts_index); indices_counts = (int64*)counts_tensor.data(); - get_count_fn = [](int64* counts, int64 index) { - return counts[index];}; - } else { - get_count_fn = [](int64* counts, int64 index) {return 1;}; } + var->LookupOrCreateKey(ev_ctx, indices, value_ptrs, + task_size, indices_counts, + indices_as_pointer); - auto lookup_key_and_set_version_fn = [var, value_ptrs, gs, - indices, indices_as_pointer, - indices_counts, get_count_fn] (int64 start, int64 limit) { - ValuePtr* value_ptr = nullptr; + auto update_version_fn = [var, value_ptrs, gs] + (int64 start, int64 limit) { for (int i = start; i < limit; i++) { - bool is_filter = false; - int64 count = get_count_fn(indices_counts, i); - var->LookupOrCreateKey(indices[i], &value_ptr, - &is_filter, indices_as_pointer, count); - value_ptrs[i] = value_ptr; - var->UpdateVersion(value_ptr, gs); + var->UpdateVersion(value_ptrs[i], gs); } }; const int64 unit_cost = 1000; //very unreliable 
estimate for cost per step. auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); Shard(worker_threads->num_threads, worker_threads->workers, task_size, unit_cost, - lookup_key_and_set_version_fn); + update_version_fn); } template -void LookupOrCreateEmbedding( +void LookupEmbedding( OpKernelContext* ctx, std::vector*, V**>>& vars, - ValuePtr** value_ptrs, + void** value_ptrs, const K* indices, - int64 num_of_keys, - IntraThreadCopyIdAllocator* thread_copy_id_alloc) { + int64 num_of_keys) { for (auto it: vars) { EmbeddingVar* var = it.first; V** var_ptr = it.second; - EmbeddingVarContext ev_ctx(ctx); - var->BatchLookupOrCreateEmb( - ev_ctx, var_ptr, value_ptrs, - indices, num_of_keys, thread_copy_id_alloc); + auto lookup_emb_fn = [var, var_ptr, value_ptrs] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + var_ptr[i] = var->GetValuePtr(value_ptrs[i]); + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + lookup_emb_fn); } } @@ -180,12 +179,12 @@ void GetEmbeddingPointers( const K* indices, Tstep gs, bool indices_as_pointer, int counts_index, int64 num_of_keys, IntraThreadCopyIdAllocator* thread_copy_id_alloc) { - std::vector*> value_ptrs(num_of_keys); + std::vector value_ptrs(num_of_keys); LookupKeyAndSetVersion(ctx, vars[0].first, value_ptrs.data(), gs, indices, num_of_keys, indices_as_pointer, counts_index); - LookupOrCreateEmbedding(ctx, vars, value_ptrs.data(), - indices, num_of_keys, thread_copy_id_alloc); + LookupEmbedding(ctx, vars, value_ptrs.data(), + indices, num_of_keys); } } // end namespace tensorflow diff --git a/tensorflow/core/kernels/training_ali_ops.cc b/tensorflow/core/kernels/training_ali_ops.cc index 839ce82feef..546b30e29dd 100644 --- a/tensorflow/core/kernels/training_ali_ops.cc +++ 
b/tensorflow/core/kernels/training_ali_ops.cc @@ -141,16 +141,16 @@ class KvSparseApplyAdagradOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const TKey index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto a = accum->flat(value_ptr, index); + auto a = accum->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); a += g.square(); v -= g.constant(lr_scalar) * g * a.rsqrt(); } @@ -542,15 +542,15 @@ class KvSparseApplyFtrlOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const TKey index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var_->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); if (is_filter) { - auto var = var_->flat(value_ptr, index); - auto accum = accum_->flat(value_ptr, index); - auto linear = linear_->flat(value_ptr, index); + auto var = var_->flat(value_ptr); + auto accum = accum_->flat(value_ptr); + auto linear = linear_->flat(value_ptr); auto grad = grad_flat.template chip<0>(i); // Use a macro to implement the computation here due to the templating of the @@ -1301,19 +1301,19 @@ class KvSparseApplyAdagradDecayOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, 
indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto a = accum->flat(value_ptr, index); + auto a = accum->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); - auto accum_decay_power = accum_decay_power_var->flat(value_ptr, index); + auto v = var->flat(value_ptr); + auto accum_decay_power = accum_decay_power_var->flat(value_ptr); if (gs / decay_step_scalar > accum_decay_power(0)) { a *= a.constant(decay_rate_scalar); @@ -1505,19 +1505,18 @@ class KvSparseApplyAdamOp : public OpKernel { auto indices_vec = indices.vec(); int64 gs = global_step.scalar()(); - for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter =false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto var_i = var->flat(value_ptr, index); - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); m_a += (g - m_a) * (static_cast(1) - beta1_scalar); @@ -2412,15 +2411,15 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { Tstep gs = global_step.scalar()(); for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto v_ = v->flat(value_ptr, index); - auto m_ = m->flat(value_ptr, index); + auto v_ = v->flat(value_ptr); + auto m_ = m->flat(value_ptr); auto grad_ = grad_flat.template chip<0>(i); v_ = 
v_ * v_.constant(beta2_scalar) + @@ -2429,7 +2428,7 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { (v_ + v_.constant(epsilon_scalar)).rsqrt() * v_.constant(lr_scalar) * grad_; - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); v -= m_; } } @@ -2461,17 +2460,17 @@ class KvSparseApplyAdamAsyncOp : public OpKernel { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); - auto var_i = var->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); m_a = m_a * beta1_scalar + g * (static_cast(1) - beta1_scalar); v_a = v_a * beta2_scalar + g.square() * (static_cast(1) - beta2_scalar); @@ -2939,7 +2938,7 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { (int64 start_i, int64 limit_i) { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* value_ptr = nullptr; bool is_filter = false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, @@ -2947,7 +2946,7 @@ class KvResourceSparseApplyGradientDescentOp : public OpKernel { var->UpdateVersion(value_ptr, gs); if (is_filter) { auto g = grad_flat.template chip<0>(i); - auto v = var->flat(value_ptr, index); + auto v = var->flat(value_ptr); v -= g.constant(lr_scalar) * g; } } @@ -3136,16 +3135,16 @@ class KvSparseApplyAdamWOp : public OpKernel { for (int64 i = start_i; i < limit_i; i++) { const Tindex index = indices_vec(i); - ValuePtr* value_ptr = nullptr; + void* 
value_ptr = nullptr; bool is_filter =false; int64 count = get_count_fn(indices_counts, i); OP_REQUIRES_OK(ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, indices_as_pointer, count)); var->UpdateVersion(value_ptr, gs); if (is_filter) { - auto var_i = var->flat(value_ptr, index); - auto m_a = m->flat(value_ptr, index); - auto v_a = v->flat(value_ptr, index); + auto var_i = var->flat(value_ptr); + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); auto g = grad_flat.template chip<0>(i); // m_a = beta1 * m + (1 - beta1) * g m_a += (g - m_a) * (static_cast(1) - beta1_scalar); diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 2a56634206c..e89b095aff1 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -6132,6 +6132,8 @@ class GraphKeys(object): TRAINABLE_VARIABLES = "trainable_variables" # Indicate EmbeddingVariable in CollectionDef EMBEDDING_VARIABLES = "embedding_variables" + # Collection for dependencies of EmbeddingVariable's restore op + EMBEDDING_VARIABLE_RESTORE_DEPENDENCY = "embedding_variable_restore_dependency" # Key to collect summaries. SUMMARIES = "summaries" # Key to collect QueueRunners. 
diff --git a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py index 240938e8675..d47d94d0d99 100644 --- a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py @@ -47,69 +47,6 @@ class EmbeddingVariableGpuTest(test_util.TensorFlowTestCase): - def testDynamicDimensionEmbeddingVariable(self): - print("testDynamicDimensionEmbeddingVariable") - with ops.device('/gpu:0'): - def runTestAdagrad(self, var, g): - if isinstance(var, kv_variable_ops.EmbeddingVariable): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - else: - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64), blocknums=[2,2,2,2,2,2]) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad.AdagradOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session(graph=g) as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.device('/gpu:0'), ops.Graph().as_default() as g: - emb_var = variable_scope.get_embedding_variable("var_1", - initializer=init_ops.ones_initializer(dtypes.float32), - embedding_dim = 8, - ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.HBM)), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=4)) - emb1 = runTestAdagrad(self, emb_var, g) - with 
ops.device('/gpu:0'), ops.Graph().as_default() as g: - var = variable_scope.get_dynamic_dimension_embedding_variable("var_dist", - embedding_block_dimension=4, - embedding_block_num=2, - storage_type=config_pb2.StorageType.HBM, - initializer=init_ops.ones_initializer(dtypes.float32)) - emb2 = runTestAdagrad(self, var, g) - for i in range(0, 6): - for j in range(0, 8): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - def testDynamicEmbeddingVariableForInitFromProto(self): - print("testDynamicEmbeddingVariableForInitFromProto") - with ops.device('/gpu:0'): - embedding = variable_scope.get_dynamic_dimension_embedding_variable("var_dist", - embedding_block_dimension=4, - embedding_block_num=2, - storage_type=config_pb2.StorageType.HBM, - initializer=init_ops.ones_initializer(dtypes.float32)) - emb = embedding_ops.embedding_lookup(embedding, math_ops.cast([0,1,2,5,6,7], dtypes.int64), blocknums=[2,2,2,2,2,2]) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - graph = ops.get_default_graph() - meta_graph_def = saver_module.export_meta_graph() - ops.reset_default_graph() - with self.test_session() as sess: - res = saver_module.import_meta_graph(meta_graph_def) - def testEmbeddingVariableForInitFromProto(self): print("testEmbeddingVariableForInitFromProto") with ops.device('/gpu:0'): @@ -235,43 +172,6 @@ def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): print(sess.run([emb, train_op,loss])) print(sess.run([emb, train_op,loss])) - def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): - print("testEmbeddingVariableForFeatureFilterFromContribFeatureColumn") - columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, - ev_option = 
variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) - with ops.device("/gpu:0"): - W = feature_column.embedding_column(sparse_id_column=columns, - dimension=3, - initializer=init_ops.ones_initializer(dtypes.float32)) - ids={} - ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) - emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) - - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - - opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - - with self.test_session() as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - emb1, top, l = sess.run([emb, train_op, loss]) - for val1 in emb1.tolist(): - for val in val1: - self.assertEqual(val, .0) - emb1, top, l = sess.run([emb, train_op, loss]) - for index, val1 in enumerate(emb1.tolist()): - if index < 7: - for val in val1: - self.assertNotEqual(val, 1.0) - else: - for val in val1: - self.assertEqual(val, .0) - def testEmbeddingVariableForSparseColumnEmbeddingCol(self): columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.HBM))) @@ -870,6 +770,66 @@ def testSaveV3(self): result = sess.run([emb1]) print(result) + def testEmbeddingVariableSaveAndRestoreOptimzierStatesForMultiTierWithHbm(self): + print("testEmbeddingVariableSaveAndRestoreOptimzierStatesForMultiTierWithHbm") + checkpoint_directory = self.get_temp_dir() + 
with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + fun = math_ops.multiply(emb, 1.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver(sharded=True) + init = variables.global_variables_initializer() + graph = ops.get_default_graph() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + sess.run(train_op) + emb_ori = sess.run(emb) + save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) + + with ops.Graph().as_default() as g, ops.device('/gpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + ev_option = variables.EmbeddingVariableOption( + storage_option=variables.StorageOption( + storage_type=config_pb2.StorageType.HBM_DRAM))) + + emb = embedding_ops.embedding_lookup(var, + math_ops.cast([0,1,2,5,6,7], + dtypes.int64)) + fun = math_ops.multiply(emb, 1.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver() + graph = ops.get_default_graph() + with self.test_session(graph = graph) as sess: + saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) + emb_val = sess.run(emb) + self.assertAllEqual(emb_ori, emb_val) + 
save_path = saver.save(sess, os.path.join(checkpoint_directory, "model.ckpt"), global_step=12345) + for name, shape in checkpoint_utils.list_variables(checkpoint_directory): + if "Adagrad-values" in name: + value = checkpoint_utils.load_variable(checkpoint_directory, name) + for i in range(0, shape[0]): + for j in range(0, shape[1]): + self.assertAlmostEqual(1.1, value[i][j]) + def testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm(self): print("testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm") checkpoint_directory = self.get_temp_dir() @@ -894,8 +854,8 @@ def testEmbeddingVariableSaveAndRestoreForMultiTierWithHbm(self): emb2 = embedding_ops.embedding_lookup(var2, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 0.0, name='multiply') - fun1 = math_ops.multiply(emb2, 0.0, name='multiply_1') + fun = math_ops.multiply(emb, 1.0, name='multiply') + fun1 = math_ops.multiply(emb2, 1.0, name='multiply_1') loss = math_ops.reduce_sum(fun + fun1, name='reduce_sum') gs = training_util.get_or_create_global_step() opt = adagrad.AdagradOptimizer(0.1) diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index c6cdf951a1e..81b315e2e43 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -120,7 +120,7 @@ def _CounterFilterTestTemplate(self, optimizer): initializer=init_ops.ones_initializer(dtypes.float32), ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) - emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1], dtypes.int64)) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64)) fun = math_ops.multiply(emb, 2.0, name='multiply') loss = math_ops.reduce_sum(fun, name='reduce_sum') gs = training_util.get_or_create_global_step() @@ 
-133,11 +133,18 @@ def _CounterFilterTestTemplate(self, optimizer): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertEqual(val, .0) + + for val1 in emb1.tolist(): + for val in val1: + self.assertEqual(val, .0) emb1, top, l = sess.run([emb, train_op, loss]) - for val in emb1.tolist()[0]: - self.assertNotEqual(val, 1.0) + for index, val1 in enumerate(emb1.tolist()): + if index < 7: + for val in val1: + self.assertNotEqual(val, 1.0) + else: + for val in val1: + self.assertEqual(val, .0) def _RecordFreqTestTemplate(self, optimizer): checkpoint_directory = self.get_temp_dir() @@ -720,20 +727,11 @@ def testEmbeddingVariableForL2FeatureEviction(self): sess.run([init]) emb_ori = sess.run([emb, train_op]) save_path = saver.save(sess, os.path.join(checkpoint_directory, "model1.ckpt"), global_step=12345) - #for name, shape in checkpoint_utils.list_variables(checkpoint_directory): - # print('loading... 
', name, shape) - with self.test_session() as sess: - saver.restore(sess, os.path.join(checkpoint_directory, "model1.ckpt-12345")) - emb_right = [[0.8282884, 0.8282884, 0.8282884], - [0.8282884, 0.8282884, 0.8282884], - [0.8282884, 0.8282884, 0.8282884], - [0.7927219, 0.7927219, 0.7927219], - [0.7927219, 0.7927219, 0.7927219], - [1.0, 1.0, 1.0]] - emb_ori = sess.run(emb) - for i in range(6): - for j in range(3): - self.assertAlmostEqual(emb_ori[i][j], emb_right[i][j]) + for name, shape in checkpoint_utils.list_variables(checkpoint_directory): + if name == "var_1-keys": + self.assertEqual(shape[0], 2) + keys = checkpoint_utils.load_variable(checkpoint_directory, name) + self.assertAllEqual(keys, [0, 1]) def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): columns_list=[] @@ -764,14 +762,15 @@ def testEmbeddingVariableForSparseColumnSharedEmbeddingCol(self): def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): print("testEmbeddingVariableForFeatureFilterFromContribFeatureColumn") - columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) - W = feature_column.embedding_column(sparse_id_column=columns, - dimension=3, - initializer=init_ops.ones_initializer(dtypes.float32)) - ids={} - ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) - emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) + with ops.device("/cpu:0"): + columns = feature_column.sparse_column_with_embedding(column_name="col_emb", dtype=dtypes.int64, + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) + W = feature_column.embedding_column(sparse_id_column=columns, + dimension=3, + 
initializer=init_ops.ones_initializer(dtypes.float32)) + ids={} + ids["col_emb"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0]], values=math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64), dense_shape=[10, 1]) + emb = feature_column_ops.input_from_feature_columns(columns_to_tensors=ids, feature_columns=[W]) fun = math_ops.multiply(emb, 2.0, name='multiply') loss = math_ops.reduce_sum(fun, name='reduce_sum') @@ -786,6 +785,7 @@ def testEmbeddingVariableForFeatureFilterFromContribFeatureColumn(self): sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) sess.run([init]) emb1, top, l = sess.run([emb, train_op, loss]) + for val1 in emb1.tolist(): for val in val1: self.assertEqual(val, .0) @@ -1328,66 +1328,6 @@ def testEmbeddingVariableForHTPartitionNum(self): print(sess.run([emb, train_op,loss])) print(sess.run([emb, train_op,loss])) - def testEmbeddingVariableForLayout(self): - print("testEmbeddingVariableForLayout") - def runTestAdagrad(self, var, g): - emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) - fun = math_ops.multiply(emb, 2.0, name='multiply') - loss = math_ops.reduce_sum(fun, name='reduce_sum') - gs = training_util.get_or_create_global_step() - opt = adagrad.AdagradOptimizer(0.1) - g_v = opt.compute_gradients(loss) - train_op = opt.apply_gradients(g_v) - init = variables.global_variables_initializer() - with self.test_session(graph=g) as sess: - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) - sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) - sess.run([init]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - r, _, _ = sess.run([emb, train_op,loss]) - return r - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - 
initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdagrad(self, emb_var, g) - emb2 = runTestAdagrad(self, var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1), - steps_to_live=5) - var = variable_scope.get_variable("var_2", shape=[100, 3], initializer=init_ops.ones_initializer(dtypes.float32)) - emb1 = runTestAdagrad(self, emb_var, g) - emb2 = runTestAdagrad(self, var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], emb2.tolist()[i][j]) - - with ops.Graph().as_default() as g, ops.device('/cpu:0'): - emb_var = variable_scope.get_embedding_variable("var_1", - embedding_dim = 3, - initializer=init_ops.ones_initializer(dtypes.float32), - partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1), - ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=5))) - emb1 = runTestAdagrad(self, emb_var, g) - - for i in range(0, 6): - for j in range(0, 3): - self.assertEqual(emb1.tolist()[i][j], .0) - def testEVInitializerWithKeyFetch(self): print("testEVInitializerWithKeyFetch") with ops.Graph().as_default() as g, ops.device('/cpu:0'): @@ -2391,7 +2331,7 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self): "model1.ckpt") with self.test_session() as sess: sess.run([init]) - sess.run([emb, train_op]) + sess.run([train_op]) save_path = saver.save(sess, model_path) for name, shape in checkpoint_utils.list_variables(model_path): if name == 
"var_1-keys": @@ -2403,6 +2343,37 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self): name == "var_1-freqs_filtered": self.assertEqual(0, shape[0]) del os.environ["TF_EV_SAVE_FILTERED_FEATURES"] + + def testEmbeddingVariableForSaveUnfilterFeature(self): + checkpoint_directory = self.get_temp_dir() + with ops.device("/cpu:0"): + emb_var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3))) + emb = embedding_ops.embedding_lookup(emb_var, math_ops.cast([1, 1, 1, 2, 2, 3], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v, gs) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + model_path = os.path.join(checkpoint_directory, + "model1.ckpt") + with self.test_session() as sess: + sess.run([init]) + sess.run([train_op]) + save_path = saver.save(sess, model_path) + for name, shape in checkpoint_utils.list_variables(model_path): + if name == "var_1-keys": + keys = checkpoint_utils.load_variable(model_path, name) + self.assertEqual(1, len(keys)) + self.assertEqual(1, keys[0]) + if name == "var_1-keys_filtered" or \ + name == "var_1-freqs_filtered": + self.assertEqual(2, shape[0]) def testEmbeddingVariableForMultiTierInference(self): print("testEmbeddingVariableForMultiTierInference") @@ -2716,7 +2687,55 @@ def testCPUFbjOpt(self): def testCPUFbjOptWithCounterFilter(self): print("testCPUFbjOpt") os.environ["TF_EMBEDDING_FBJ_OPT"] = "True" - self._CounterFilterTestTemplate("Adagrad") + with ops.device("/cpu:0"): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3, + 
initializer=init_ops.ones_initializer(dtypes.float32), + ev_option = variables.EmbeddingVariableOption(filter_option=variables.CounterFilter(filter_freq=3)), + partitioner=partitioned_variables.fixed_size_partitioner(num_shards=1)) + emb = embedding_ops.embedding_lookup(var, math_ops.cast([1,1,1,1,2,2,2,3,3,4], dtypes.int64)) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = self._CreateOptimizer("Adagrad") + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + emb1, top, l = sess.run([emb, train_op, loss]) + emb_list = emb1.tolist() + emb_right = [[.0, .0, .0], + [.0, .0, .0], + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + [.0, .0, .0], + [.0, .0, .0], + [1.0, 1.0, 1.0], + [.0, .0, .0], + [.0, .0, .0], + [.0, .0, .0]] + + for i in range(6): + for j in range(3): + self.assertAlmostEqual(emb_list[i][j], emb_right[i][j]) + + emb1= sess.run(emb) + emb_right = [[0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90031105, 0.90031105, 0.90031105], + [0.90122706, 0.90122706, 0.90122706], + [0.90122706, 0.90122706, 0.90122706], + [0.90122706, 0.90122706, 0.90122706], + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + [.0, .0, .0]] + for i in range(6): + for j in range(3): + self.assertAlmostEqual(emb1[i][j], emb_right[i][j]) del os.environ["TF_EMBEDDING_FBJ_OPT"] def testCPUFbjOptWithBloomFilter(self): diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 96329ca345b..1ef9550ef6d 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -373,6 +373,8 @@ def 
_init_from_args(self, self._slot_num = 0 else: self._slot_num = evconfig.slot_num + if self._is_primary: + self._import_dependency_ops = [] with ops.name_scope("IsInitialized"): self._is_initialized_op = ( gen_kv_variable_ops.kv_var_is_initialized_op(self._handle, @@ -488,6 +490,7 @@ def create_init_op_for_restore(self, name, initial_value, invalid_key, rank): set_attr_ops.append(set_cache_op) with ops.control_dependencies(set_attr_ops + [self._initializer_for_restore]): self._init_op_for_restore = control_flow_ops.no_op() + self.collect_restore_denpendencies() def need_counts(self): return (self._record_freq or (self._filter_freq > 0) or self._is_multi_tier) @@ -612,8 +615,19 @@ def _init_from_proto(self, variable_def, import_scope=None): else: self._is_primary = False + self.collect_restore_denpendencies() # LINT.ThenChange(//tensorflow/python/eager/graph_callable.py) + def collect_restore_denpendencies(self): + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY) + if len(restore_dependency) == 0: + ops.add_to_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY, {}) + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY) + dependency_dict = restore_dependency[0] + if not dependency_dict.__contains__(self._primary_handle): + dependency_dict[self._primary_handle] = [] + dependency_dict[self._primary_handle].append(self._init_op_for_restore) + def set_init_data_source_initializer(self, init_data_source): import pkgutil try: diff --git a/tensorflow/python/training/saving/saveable_object_util.py b/tensorflow/python/training/saving/saveable_object_util.py index 0d8bfe87022..650b1a5e272 100644 --- a/tensorflow/python/training/saving/saveable_object_util.py +++ b/tensorflow/python/training/saving/saveable_object_util.py @@ -195,7 +195,8 @@ def restore(self, restored_tensors, unused_restored_shapes): if self.var._init_data_source is not None: return 
self.var.recover_from_init_data_source(self.var._init_data_source, self.partition_id, self.partition_num) else: - with ops.control_dependencies([self.var._init_op_for_restore]): + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY)[0] + with ops.control_dependencies(restore_dependency[self.var._primary_handle]): rank = self.op.initial_value.get_shape().rank - 1 restore_op = gen_kv_variable_ops.kv_resource_import_v3( restored_tensors[0], From be62ec312595b51b74260f96a6c0872ce5f1540c Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Wed, 18 Oct 2023 10:11:16 +0800 Subject: [PATCH 09/45] [Graph] Fix hang bug for async embedding lookup. (#934) Skip edges to 'SaveV3' Op. Signed-off-by: chenbangduo.cbd --- tensorflow/python/training/async_embedding_stage.py | 7 ++++++- tensorflow/python/training/monitored_session.py | 10 ++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/training/async_embedding_stage.py b/tensorflow/python/training/async_embedding_stage.py index 32433387c1c..858025bdab7 100644 --- a/tensorflow/python/training/async_embedding_stage.py +++ b/tensorflow/python/training/async_embedding_stage.py @@ -49,13 +49,14 @@ def __init__(self, options, checkpoint_dir = None): self._checkpoint_dir = checkpoint_dir if checkpoint_dir else "" self._use_stage_subgraph_thread_pool = options.use_stage_subgraph_thread_pool self._stage_subgraph_thread_pool_id = options.stage_subgraph_thread_pool_id + self._is_staged = False self._control_flow_ops = ['Switch', '_SwitchN', 'Merge', '_XlaMerge', 'Enter', 'Exit'] self._variable_ops = ['Variable', 'VariableV2', 'VarHandleOp', 'KvVarHandleOp', 'HashTableV2'] self._variable_is_init_ops = ['IsVariableInitialized', 'VarIsInitializedOp', 'KvVarIsInitializedOp'] - self._saver_ops = ['SaveV2'] + self._saver_ops = ['SaveV2', 'SaveV3'] self._no_data_input_ops = self._variable_ops + ['Placeholder', 'PlaceholderV2', 'Const'] self._boundary_ops = set() for tensor 
in ops.get_collection(ops.GraphKeys.ASYNC_EMBEDDING_OUTPUT_TENSORS): @@ -74,6 +75,10 @@ def __init__(self, options, checkpoint_dir = None): def stage(self, graph): """ add async embedding stage node to graph """ + if self._is_staged: + return + self._is_staged = True + logging.info('async embedding stage begin') logging.info('async embedding thread num: ' + str(self._threads_num)) logging.info('async embedding capacity: ' + str(self._capacity)) diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py index 09c05a02627..6eb204785dd 100644 --- a/tensorflow/python/training/monitored_session.py +++ b/tensorflow/python/training/monitored_session.py @@ -185,6 +185,7 @@ def __init__(self, self._saver = saver self._incremental_save_restore = incremental_save_restore self._incr_saver = None + self._async_embedding_stage = None self._enable_async_embedding = False self._async_embedding_checkpoint_dir = None self._async_embedding_options = None @@ -247,10 +248,11 @@ def default_ready_for_local_init_op(): self._incr_saver = incr_saver._get_incremental_saver(self._incremental_save_restore, self._saver) if self._enable_async_embedding: - async_embedding_stage = async_embedding.AsyncEmbeddingStage( - self._async_embedding_options, - self._async_embedding_checkpoint_dir) - async_embedding_stage.stage(ops.get_default_graph()) + if self._async_embedding_stage is None: + self._async_embedding_stage = async_embedding.AsyncEmbeddingStage( + self._async_embedding_options, + self._async_embedding_checkpoint_dir) + self._async_embedding_stage.stage(ops.get_default_graph()) ops.get_default_graph().finalize() logging.info('Graph was finalized.') From 0e8127a2cc9b2529ec2ab2f6f361d6c536280d60 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Wed, 25 Oct 2023 05:10:05 -0700 Subject: [PATCH 10/45] [Distribute] Add elastic-grpc server. 
(#936) Signed-off-by: JunqiHu --- configure.py | 3 + tensorflow/BUILD | 6 + tensorflow/contrib/elastic_grpc_server/BUILD | 70 ++++ .../elastic_grpc_server_lib.cc | 317 ++++++++++++++++++ .../elastic_grpc_server_lib.h | 66 ++++ .../elastic_grpc_server_lib_test.cc | 77 +++++ .../elastic_grpc_server/elastic_service.cc | 157 +++++++++ .../elastic_grpc_server/elastic_service.h | 31 ++ tensorflow/core/BUILD | 23 ++ .../distributed_runtime/rpc/grpc_server_lib.h | 14 +- .../core/platform/default/build_config.bzl | 6 + .../platform/default/build_config_root.bzl | 8 + .../core/protobuf/elastic_training.proto | 76 +++++ tensorflow/python/BUILD | 3 +- 14 files changed, 849 insertions(+), 8 deletions(-) create mode 100644 tensorflow/contrib/elastic_grpc_server/BUILD create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib_test.cc create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_service.cc create mode 100644 tensorflow/contrib/elastic_grpc_server/elastic_service.h create mode 100644 tensorflow/core/protobuf/elastic_training.proto diff --git a/configure.py b/configure.py index 362479981b2..6aeaf7d12af 100644 --- a/configure.py +++ b/configure.py @@ -1433,6 +1433,9 @@ def main(): set_build_var(environ_cp, 'TF_NEED_STAR', 'STAR', 'with_star_support', True, 'star') + set_build_var(environ_cp, 'TF_NEED_ELASTIC', 'ELASTIC TRAINING', 'with_elastic_support', + True, 'elastic') + set_build_var(environ_cp, 'TF_ENABLE_PMEM', 'PMEM', 'with_pmem_support', False, 'pmem') diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 493247a2162..8b4190ea680 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -434,6 +434,12 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "with_elastic_support", + values = {"define": "with_elastic_support=true"}, + 
visibility = ["//visibility:public"], +) + config_setting( name = "with_pmem_support", values = {"define": "with_pmem_support=true"}, diff --git a/tensorflow/contrib/elastic_grpc_server/BUILD b/tensorflow/contrib/elastic_grpc_server/BUILD new file mode 100644 index 00000000000..ea4b87e3b58 --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/BUILD @@ -0,0 +1,70 @@ +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +package(default_visibility = [ + "//tensorflow:internal", +]) + +load( + "//tensorflow:tensorflow.bzl", "tf_cc_test", +) + +cc_library( + name = "elastic_grpc_server_lib", + srcs = select({"//tensorflow:with_elastic_support": ["elastic_service.cc", + "elastic_grpc_server_lib.cc"], + "//conditions:default": []}), + hdrs = ["elastic_service.h", + "elastic_grpc_server_lib.h"], + linkstatic = 1, # Seems to be needed since alwayslink is broken in bazel + deps = [ + "//tensorflow/core:elastic_service_proto_cc", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + "//tensorflow/core/distributed_runtime/rpc:async_service_interface", + "//tensorflow/core/distributed_runtime/rpc:grpc_channel", + "//tensorflow/core/distributed_runtime/rpc:grpc_master_service", + "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache", + "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service", + "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr", + "//tensorflow:grpc", + "//tensorflow:grpc++", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core/common_runtime/eager:context", + "//tensorflow/core/distributed_runtime:collective_param_resolver_distributed", + "//tensorflow/core/distributed_runtime:device_resolver_distributed", + "//tensorflow/core/distributed_runtime:graph_mgr", + "//tensorflow/core/distributed_runtime:local_master", + "//tensorflow/core/distributed_runtime:master", + 
"//tensorflow/core/distributed_runtime:master_env", + "//tensorflow/core/distributed_runtime:master_session", + "//tensorflow/core/distributed_runtime:rpc_collective_executor_mgr", + "//tensorflow/core/distributed_runtime:server_lib", + "//tensorflow/core/distributed_runtime:session_mgr", + "//tensorflow/core/distributed_runtime:worker_cache_wrapper", + "//tensorflow/core/distributed_runtime:worker_env", + "//tensorflow/core/distributed_runtime:worker_resource", + "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_service_impl", + ], + alwayslink = 1, +) + +tf_cc_test( + name = "elastic_grpc_test", + size = "small", + srcs = ["elastic_grpc_server_lib_test.cc"], + deps = [ + ":elastic_grpc_server_lib", + "//tensorflow/core/distributed_runtime/rpc:grpc_util", + "//tensorflow:grpc", + "//tensorflow:grpc++", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:worker_proto_cc", + ], + linkstatic = 1, +) diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc new file mode 100644 index 00000000000..d45d70d6c8c --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc @@ -0,0 +1,317 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#include "tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h" + +#include +#include +#include +#include + +#include "include/json/json.h" +#include "grpc/support/alloc.h" +#include "grpcpp/grpcpp.h" +#include "grpcpp/security/credentials.h" +#include "grpcpp/server_builder.h" +#include "tensorflow/core/util/env_var.h" + +#include "tensorflow/contrib/elastic_grpc_server/elastic_service.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/process_util.h" +#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h" +#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h" +#include "tensorflow/core/distributed_runtime/graph_mgr.h" +#include "tensorflow/core/distributed_runtime/local_master.h" +#include "tensorflow/core/distributed_runtime/master.h" +#include "tensorflow/core/distributed_runtime/master_env.h" +#include "tensorflow/core/distributed_runtime/master_session.h" +#include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h" +#include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h" +#include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h" +#include "tensorflow/core/distributed_runtime/server_lib.h" +#include "tensorflow/core/distributed_runtime/worker_cache_wrapper.h" +#include "tensorflow/core/distributed_runtime/worker_env.h" +#include "tensorflow/core/distributed_runtime/worker_resource.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/public/session_options.h" +#include 
"tensorflow/core/protobuf/cluster.pb.h" + +namespace tensorflow { + +namespace { + +// static utility function +RendezvousMgrInterface* NewRpcRendezvousMgr(const WorkerEnv* env) { + return new RpcRendezvousMgr(env); +} + +} // namespace + +ElasticGrpcServer::ElasticGrpcServer(const ServerDef& server_def, Env* env) + : GrpcServer(server_def, env) {} + +ElasticGrpcServer::~ElasticGrpcServer() { + delete elastic_service_; +} + +Status ElasticGrpcServer::UpdateServerDef(const string& cluster_def_str, int& before_part_num, int& after_part_num) { + std::string tf_config; + ReadStringFromEnvVar("TF_CONFIG", "", &tf_config); + if (!tf_config.empty()) { + Json::Reader reader; + Json::Value tf_config_json; + if(!reader.parse(tf_config, tf_config_json)) { + return errors::Internal("PARSE TF_CONFIG ERROR"); + } + if ((tf_config_json["cluster"].isNull()) || + (tf_config_json["cluster"]["ps"].isNull())) { + return errors::Internal("PARSE PS FROM TF_CONFIG ERROR"); + } + + Json::Value cluster_json; + if (!reader.parse(cluster_def_str, cluster_json)) { + LOG(ERROR) << "cluster_def is not correct with " << cluster_def_str; + return errors::Internal("PARSE TF_CONFIG/cluster ERROR"); + } + + std::unordered_set ps_addrs_vec; + after_part_num = cluster_json["cluster"]["ps"].size(); + for (auto& value: cluster_json["cluster"]["ps"]) { + ps_addrs_vec.emplace(value.asString()); + } + + int job_size = server_def_.cluster().job_size(); + for (int j = 0; j < job_size; ++j) { + auto* job = server_def_.mutable_cluster()->mutable_job(j); + if (job->name() == "ps") { + before_part_num = job->tasks_size(); + if (before_part_num == after_part_num) { + return Status::OK(); + } else if (after_part_num > before_part_num) { + int idx = before_part_num; + LOG(INFO) << "SCALING UP, partition_num is: " << after_part_num; + std::unordered_set target_string_set; + for (auto& value: tf_config_json["cluster"]["ps"]) { + target_string_set.emplace(value.asString()); + } + for (auto ps_addr: ps_addrs_vec) { + 
if (target_string_set.find(ps_addr) == target_string_set.end()) { + job->mutable_tasks()->insert({idx, ps_addr}); + tf_config_json["cluster"]["ps"].append(ps_addr); + } + } + break; + } else { + LOG(INFO) << "SCALING DOWN, partition_num is: " << after_part_num; + for (int i = 0; i < before_part_num; ++i) { + string tmp_string = tf_config_json["cluster"]["ps"][i].asString(); + if (ps_addrs_vec.find(tmp_string) == ps_addrs_vec.end()) { + Json::Value ps_addr; + tf_config_json["cluster"]["ps"].removeIndex(i, &ps_addr); + job->mutable_tasks()->erase(i); + } + } + } + } + } + Json::FastWriter writer; + std::string new_tf_config = writer.write(tf_config_json); + LOG(INFO) << "new TF_CONFIG " << new_tf_config; + setenv("TF_CONFIG", new_tf_config.c_str(), 1); + } + return Status::OK(); +} + +Status ElasticGrpcServer::Update(const string& cluster_def_str) { + int before_part_num, after_part_num; + Status s = UpdateServerDef(cluster_def_str, before_part_num, after_part_num); + if (!s.ok()) { + LOG(ERROR) << s.error_message(); + return Status::OK(); + } + + if (after_part_num == before_part_num) { + return Status::OK(); + } + + WorkerCacheInterface* worker_cache; + WorkerCacheFactoryOptions worker_cache_factory_options(server_def_); + TF_RETURN_IF_ERROR( + WorkerCacheFactory(worker_cache_factory_options, &worker_cache)); + CHECK_NE(nullptr, worker_cache); + ConfigProto config = server_def_.default_session_config(); + string unused; + string default_worker_name; + if (!DeviceNameUtils::SplitDeviceName(master_env()->local_devices[0]->name(), + &default_worker_name, &unused)) { + return errors::Internal("Could not parse worker name."); + } + std::unique_ptr dev_resolver( + new DeviceResolverDistributed(worker_env()->device_mgr, worker_cache, + default_worker_name)); + std::unique_ptr param_resolver( + new CollectiveParamResolverDistributed(config, worker_env()->device_mgr, + dev_resolver.get(), worker_cache, + default_worker_name)); + worker_env()->collective_executor_mgr = new 
RpcCollectiveExecutorMgr( + config, worker_env()->device_mgr, std::move(dev_resolver), + std::move(param_resolver), worker_cache, default_worker_name); + + if (worker_env()->session_mgr != nullptr) { + delete worker_env()->session_mgr; // Deletes graph_mgr's. + } + + // Set up worker environment. + worker_env()->session_mgr = new SessionMgr( + worker_env(), SessionMgr::WorkerNameFromServerDef(server_def_), + std::unique_ptr(worker_cache), + [this](const ServerDef& server_def, WorkerCacheInterface** worker_cache) { + WorkerCacheFactoryOptions options(server_def); + return WorkerCacheFactory(options, worker_cache); + }); + master_env()->worker_cache = worker_cache; + // Finish setting up master environment. + + StatsPublisherFactory stats_factory = opts_.stats_factory; + master_env()->master_session_factory = + [config, stats_factory]( + SessionOptions options, const MasterEnv* env, + std::unique_ptr>> remote_devs, + std::unique_ptr worker_cache, + std::unique_ptr device_set, + std::vector filtered_worker_list) { + options.config.MergeFrom(config); + return new MasterSession(options, env, std::move(remote_devs), + std::move(worker_cache), std::move(device_set), + std::move(filtered_worker_list), + stats_factory); + }; + master_env()->worker_cache_factory = + [this](const WorkerCacheFactoryOptions& options, + WorkerCacheInterface** worker_cache) { + return WorkerCacheFactory(options, worker_cache); + }; + return Status::OK(); +} + +void ElasticGrpcServer::MaybeMutateBuilder(::grpc::ServerBuilder* builder) { + elastic_service_ = NewElasticGrpcService(this, builder); +} + +Status ElasticGrpcServer::Start() { + { + mutex_lock l(mu_); + switch (state_) { + case NEW: { + update_server_thread_.reset( + env_->StartThread(ThreadOptions(), "TF_elastic_service", + [this] { elastic_service_->HandleRPCsLoop(); })); + LOG(INFO) << "Started server with target: " << target(); + break; + } + case STARTED: + LOG(INFO) << "Server already started (target: " << target() << ")"; + return 
Status::OK(); + case STOPPED: + return errors::FailedPrecondition("Server has stopped."); + default: + LOG(FATAL); + } + } + return GrpcServer::Start(); +} + +Status ElasticGrpcServer::Join() { + GrpcServer::Join(); + mutex_lock l(mu_); + switch (state_) { + case NEW: + LOG(FATAL) << "Server shoud already closed"; + case STARTED: + case STOPPED: + update_server_thread_.reset(); + return Status::OK(); + default: + LOG(FATAL); + } +} + +/* static */ +Status ElasticGrpcServer::Create(const ServerDef& server_def, Env* env, + std::unique_ptr* out_server) { + std::unique_ptr ret( + new ElasticGrpcServer(server_def, env == nullptr ? Env::Default() : env)); + ServiceInitFunction service_func = nullptr; + GrpcServerOptions options; + options.rendezvous_mgr_func = NewRpcRendezvousMgr; + Status s = ret->Init(options); + if (!s.ok()) { + LOG(ERROR) << s; + return s; + } + *out_server = std::move(ret); + return Status::OK(); +} + +/* static */ +Status ElasticGrpcServer::Create(const ServerDef& server_def, Env* env, + std::unique_ptr* out_server) { + std::unique_ptr ret( + new ElasticGrpcServer(server_def, env == nullptr ? Env::Default() : env)); + GrpcServerOptions options; + options.rendezvous_mgr_func = NewRpcRendezvousMgr; + Status s = ret->Init(options); + if (!s.ok()) { + LOG(ERROR) << s; + return s; + } + *out_server = std::move(ret); + return Status::OK(); +} + +namespace { + +class ElasticGrpcServerFactory : public ServerFactory { + public: + bool AcceptsOptions(const ServerDef& server_def) override { + return server_def.protocol() == "elastic-grpc"; + } + + Status NewServer(const ServerDef& server_def, + std::unique_ptr* out_server) override { + return ElasticGrpcServer::Create(server_def, Env::Default(), out_server); + } +}; + +// Registers a `ServerFactory` for `ElasticGrpcServer` instances. 
+class ElasticGrpcServerRegistrar { + public: + ElasticGrpcServerRegistrar() { + gpr_allocation_functions alloc_fns; + memset(&alloc_fns, 0, sizeof(alloc_fns)); + alloc_fns.malloc_fn = port::Malloc; + alloc_fns.realloc_fn = port::Realloc; + alloc_fns.free_fn = port::Free; + gpr_set_allocation_functions(alloc_fns); + ServerFactory::Register("ELASTIC_GRPC_SERVER", new ElasticGrpcServerFactory()); + } +}; +static ElasticGrpcServerRegistrar registrar; + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h new file mode 100644 index 00000000000..8853ceb2819 --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h @@ -0,0 +1,66 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_GRPC_SERVER_LIB_H_ +#define TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_GRPC_SERVER_LIB_H_ + +#include + +#include "grpcpp/grpcpp.h" +#include "grpcpp/security/credentials.h" +#include "tensorflow/core/common_runtime/process_util.h" +#include "tensorflow/core/common_runtime/stats_publisher_interface.h" +#include "tensorflow/core/distributed_runtime/master_env.h" +#include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h" +#include "tensorflow/core/distributed_runtime/server_lib.h" +#include "tensorflow/core/distributed_runtime/session_mgr.h" +#include "tensorflow/core/framework/collective.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { + +class ElasticGrpcServer : public GrpcServer { + public: + ElasticGrpcServer(const ServerDef& server_def, Env* env); + + virtual ~ElasticGrpcServer() override; + + static Status Create(const ServerDef& server_def, Env* env, + std::unique_ptr* out_server); + static Status Create(const ServerDef& server_def, Env* env, + std::unique_ptr* out_server); + + Status Update(const string& cluster_def_str); + + void MaybeMutateBuilder(::grpc::ServerBuilder* builder) override; + + Status Start() override; + + Status Join() override; + + private: + Status UpdateServerDef(const string& cluster_def_str, int& before_part_num, int& after_part_num); + + private: + // TensorFlow Eager implementation, and RPC polling thread. 
+ AsyncServiceInterface* elastic_service_ = nullptr; + std::unique_ptr update_server_thread_ GUARDED_BY(mu_); + + std::unique_ptr<::grpc::Server> server_ GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_GRPC_SERVER_LIB_H_ \ No newline at end of file diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib_test.cc b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib_test.cc new file mode 100644 index 00000000000..e2db870a74a --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib_test.cc @@ -0,0 +1,77 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#include "tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +#include "gtest/gtest.h" + +namespace tensorflow { + +class ElasticGrpcServerTest : public ::testing::Test { + protected: + Status FillServerDef(const string& job_spec, ServerDef* options) { + options->set_protocol("elastic-grpc"); + options->set_job_name("chief"); + options->set_task_index(0); + + uint32 my_tasks_per_replica = 0; + for (const string& job_str : str_util::Split(job_spec, ',')) { + JobDef* job_def = options->mutable_cluster()->add_job(); + // Split each entry in the flag into 2 pieces, separated by "|". 
+ const std::vector job_pieces = str_util::Split(job_str, '|'); + CHECK_EQ(2, job_pieces.size()) << job_str; + job_def->set_name(job_pieces[0]); + // Does a bit more validation of the tasks_per_replica. + const StringPiece spec = job_pieces[1]; + // job_str is of form |. + const std::vector host_ports = str_util::Split(spec, ';'); + uint32 tasks_per_replica = host_ports.size(); + for (size_t i = 0; i < host_ports.size(); ++i) { + (*job_def->mutable_tasks())[i] = host_ports[i]; + } + if (job_def->name() == options->job_name()) { + my_tasks_per_replica = tasks_per_replica; + } + LOG(INFO) << "Peer " << job_def->name() << " " << tasks_per_replica << " {" + << absl::StrJoin(host_ports, ", ") << "}"; + } + if (my_tasks_per_replica == 0) { + return errors::InvalidArgument("Invalid job specification"); + } + return Status::OK(); + } +}; + +//Test Update Logic +TEST_F(ElasticGrpcServerTest, UpdateServer) { + Status s; + std::unique_ptr grpc_server; + ServerDef server_def; + std::string job_spec = "worker|localhost:2222,ps|localhost:10086;localhost:10087;localhost:10088,chief|localhost:2220"; + TF_ASSERT_OK(FillServerDef(job_spec, &server_def)); + s = ElasticGrpcServer::Create(server_def, Env::Default(), &grpc_server); + if (!s.ok()) { + LOG(ERROR) << "Could not create server: " << s.error_message(); + } + TF_ASSERT_OK(grpc_server->Start()); + // TF_QCHECK_OK(grpc_server->Join()); + LOG(INFO) << "SCALING DOWN"; + std::string tf_config_str = "{\"cluster\": {\"worker\": [\"localhost:2222\"],\"ps\": [\"localhost:10086\", \"localhost:10087\"],\"chief\": [\"localhost:2220\"]]}}"; + grpc_server->Update(tf_config_str); + LOG(INFO) << "SCALING UP"; + tf_config_str = "{\"cluster\": {\"worker\": [\"localhost:2222\"],\"ps\": [\"localhost:10086\", \"localhost:10087\", \"localhost:10088\"],\"chief\": [\"localhost:2220\"]]}}"; + grpc_server->Update(tf_config_str); + grpc_server.release(); +} + +} \ No newline at end of file diff --git 
a/tensorflow/contrib/elastic_grpc_server/elastic_service.cc b/tensorflow/contrib/elastic_grpc_server/elastic_service.cc new file mode 100644 index 00000000000..61aa6e662ec --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/elastic_service.cc @@ -0,0 +1,157 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#include "tensorflow/contrib/elastic_grpc_server/elastic_service.h" + +#include "tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.h" +#include "tensorflow/core/protobuf/elastic_training.grpc.pb.h" +#include "tensorflow/core/protobuf/elastic_training.pb.h" +#include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h" + + +#include +#include +#include +#include +#include "grpcpp/server_builder.h" + +using namespace des; + +using grpc::Server; +using grpc::ServerAsyncResponseWriter; +using grpc::ServerBuilder; +using grpc::ServerCompletionQueue; +using grpc::ServerContext; + +namespace tensorflow { + +class GrpcElasticService : public AsyncServiceInterface { + public: + GrpcElasticService(ElasticGrpcServer* elastic_grpc_server, + ::grpc::ServerBuilder* builder): + elastic_grpc_server_(elastic_grpc_server), builder_(builder) { + builder_->RegisterService(&elastic_service_); + cq_ = builder_->AddCompletionQueue(); + } + + ~GrpcElasticService() override { } + + void Shutdown() override { + cq_->Shutdown(); + } + + void 
HandleRPCsLoop() override { + new CallData(&elastic_service_, elastic_grpc_server_, cq_.get()); + void* tag; + bool ok; + while (true) { + // Block waiting to read the next event from the completion queue. The + // event is uniquely identified by its tag, which in this case is the + // memory address of a CallData instance. + // The return value of Next should always be checked. This return value + // tells us whether there is any kind of event or cq_ is shutting down. + GPR_ASSERT(cq_->Next(&tag, &ok)); + GPR_ASSERT(ok); + static_cast(tag)->Proceed(); + } + } + + private: + // Class encompassing the state and logic needed to serve a request. + class CallData { + public: + // Take in the "service" instance (in this case representing an asynchronous + // server) and the completion queue "cq" used for asynchronous communication + // with the gRPC runtime. + CallData(ElasticTrainingService::AsyncService* service, ElasticGrpcServer* elastic_grpc_server, + ServerCompletionQueue* cq) + : service_(service), elastic_grpc_server_(elastic_grpc_server), + cq_(cq), responder_(&ctx_), status_(CREATE) { + // Invoke the serving logic right away. + Proceed(); + } + + void Proceed() { + if (status_ == CREATE) { + // Make this instance progress to the PROCESS state. + status_ = PROCESS; + + // As part of the initial CREATE state, we *request* that the system + // start processing UpdateServerDef requests. In this request, "this" acts as + // the tag uniquely identifying the request (so that different CallData + // instances can serve different requests concurrently), in this case + // the memory address of this CallData instance. + service_->RequestUpdateServerDef(&ctx_, &request_, &responder_, + cq_, cq_, this); + } else if (status_ == PROCESS) { + // Spawn a new CallData instance to serve new clients while we process + // the one for this CallData. The instance will deallocate itself as + // part of its FINISH state. 
+ new CallData(service_, elastic_grpc_server_, cq_); + + // The actual processing. + Status s = elastic_grpc_server_->Update(request_.cluster_def()); + if (s.ok()) { + reply_.set_code(Code::OK); + } else { + reply_.set_code(Code::INTERNAL); + reply_.set_msg(s.ToString()); + LOG(ERROR) << "error" << s.ToString(); + } + + // And we are done! Let the gRPC runtime know we've finished, using the + // memory address of this instance as the uniquely identifying tag for + // the event. + status_ = FINISH; + responder_.Finish(reply_, ::grpc::Status::OK, this); + } else { + GPR_ASSERT(status_ == FINISH); + // Once in the FINISH state, deallocate ourselves (CallData). + delete this; + } + } + private: + ElasticGrpcServer* elastic_grpc_server_; + // The means of communication with the gRPC runtime for an asynchronous + // server. + ElasticTrainingService::AsyncService* service_; + // The producer-consumer queue used for asynchronous server notifications. + ServerCompletionQueue* cq_; + // Context for the rpc, allowing us to tweak aspects of it such as the use + // of compression, authentication, as well as to send metadata back to the + // client. + ServerContext ctx_; + + // What we get from the client. + UpdateServerDefRequest request_; + // What we send back to the client. + UpdateServerDefResponse reply_; + + // The means to get back to the client. + ServerAsyncResponseWriter responder_; + + // Let's implement a tiny state machine with the following states. + enum CallStatus { CREATE, PROCESS, FINISH }; + CallStatus status_; // The current serving state. 
+ }; + + ElasticGrpcServer* elastic_grpc_server_; + ::grpc::ServerBuilder* builder_; + ElasticTrainingService::AsyncService elastic_service_; + std::unique_ptr<::grpc::ServerCompletionQueue> cq_; +}; + +AsyncServiceInterface* NewElasticGrpcService( + ElasticGrpcServer* elastic_grpc_server, ::grpc::ServerBuilder* builder) { + return reinterpret_cast(new GrpcElasticService(elastic_grpc_server, builder)); +} +} \ No newline at end of file diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_service.h b/tensorflow/contrib/elastic_grpc_server/elastic_service.h new file mode 100644 index 00000000000..9465a10c918 --- /dev/null +++ b/tensorflow/contrib/elastic_grpc_server/elastic_service.h @@ -0,0 +1,31 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_SERVICE_H_ +#define TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_SERVICE_H_ + + +#include +#include "grpcpp/server_builder.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" +class ElasticGrpcServer; + +namespace tensorflow { + +class AsyncServiceInterface; +AsyncServiceInterface* NewElasticGrpcService( + ElasticGrpcServer* elastic_grpc_server, ::grpc::ServerBuilder* builder); + +} // namespace tensorflow + +#endif // TENSORFLOW_CONTRIB_ELASTIC_GRPC_SERVER_ELASTIC_SERVICE_H_ \ No newline at end of file diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 95bbbab5624..0531200e7ab 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -139,6 +139,7 @@ load( "tf_lib_proto_parsing_deps", "tf_proto_library", "tf_proto_library_cc", + "tf_proto_library_py", "tf_protos_all", "tf_protos_all_impl", "tf_protos_grappler", @@ -2475,6 +2476,28 @@ tf_proto_library_cc( ], ) +tf_proto_library_cc( + name = "elastic_service_proto", + srcs = ["protobuf/elastic_training.proto"], + has_services = 1, + cc_api_version = 2, + cc_grpc_version = 1, + cc_stubby_versions = ["2"], + protodeps = tf_additional_all_protos(), + visibility = [ + "//tensorflow:internal", + ], +) + +tf_proto_library_py( + name = "elastic_service_pb", + srcs = ["protobuf/elastic_training.proto"], + use_grpc_plugin = True, + visibility = [ + "//tensorflow:internal", + ], +) + LIB_INTERNAL_PRIVATE_HEADERS = [ "framework/resource_handle.h", "//tensorflow/core/platform:legacy_lib_internal_headers", diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h index 521c8f206f8..79d6b0cd65e 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h @@ -127,14 +127,11 @@ class 
GrpcServer : public ServerInterface { const ServerDef& server_def() const { return server_def_; } GrpcWorker* worker_impl() const { return worker_impl_.get(); } - - private: - // The overall server configuration. - const ServerDef server_def_; + protected: + // The overall server configuration. It may be changed during scaling. + ServerDef server_def_; Env* env_; - - // The port to which this server is bound. - int bound_port_ = 0; + GrpcServerOptions opts_; // Guards state transitions. mutex mu_; @@ -151,6 +148,9 @@ class GrpcServer : public ServerInterface { enum State { NEW, STARTED, STOPPED }; State state_ GUARDED_BY(mu_); + private: + // The port to which this server is bound. + int bound_port_ = 0; // Implementation of a TensorFlow master, and RPC polling thread. MasterEnv master_env_; std::unique_ptr master_impl_; diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index 406285e7f0f..75d3c671562 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -769,6 +769,12 @@ def tf_additional_star_lib_defines(): "//conditions:default": [], }) +def tf_additional_elastic_server_lib_defines(): + return select({ + "//tensorflow:with_elastic_support": ["TENSORFLOW_USE_ELASTIC_SERVER"], + "//conditions:default": [], + }) + def tf_additional_api_compatible_defines(): return select({ "//tensorflow:with_api_compatible": ["TF_API_COMPATIBLE_1150"], diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl index 71651faf0b1..38191dea3c4 100644 --- a/tensorflow/core/platform/default/build_config_root.bzl +++ b/tensorflow/core/platform/default/build_config_root.bzl @@ -77,6 +77,14 @@ def tf_additional_star_deps(): "//conditions:default": [], }) +def tf_additional_elastic_deps(): + return select({ + str(Label("//tensorflow:with_elastic_support")): [ + 
str(Label("//tensorflow/contrib/elastic_grpc_server:elastic_grpc_server_lib")), + ], + "//conditions:default": [], + }) + # Include specific extra dependencies when building statically, or # another set of dependencies otherwise. If "macos" is provided, that # dependency list is used when using the framework_shared_object config diff --git a/tensorflow/core/protobuf/elastic_training.proto b/tensorflow/core/protobuf/elastic_training.proto new file mode 100644 index 00000000000..ee0d0bd10e0 --- /dev/null +++ b/tensorflow/core/protobuf/elastic_training.proto @@ -0,0 +1,76 @@ +syntax = "proto3"; + +package des; + +enum Code { + OK = 0; + CANCELLED = 1; + UNKNOWN = 2; + INVALID_ARGUMENT = 3; + DEADLINE_EXCEEDED = 4; + NOT_FOUND = 5; + ALREADY_EXISTS = 6; + PERMISSION_DENIED = 7; + RESOURCE_EXHAUSTED = 8; + FAILED_PRECONDITION = 9; + ABORTED = 10; + OUT_OF_RANGE = 11; + UNIMPLEMENTED = 12; + INTERNAL = 13; + UNAVAILABLE = 14; + DATA_LOSS = 15; + UNAUTHENTICATED = 16; + REQUEST_STOP = 17; +} + +enum ElasticTrainingState { + READY = 0; + SCALING = 1; + All_SESSION_CLOSED = 2; +} + +enum ScalingAction { + NONE = 0; + SCALING_UP = 1; + SCALING_DOWN = 2; +} + +message IsReadyScalingRequest { + int32 task_index = 1; +} + +message IsReadyScalingResponse { + Code code = 1; + string msg = 2; + ScalingAction scaling_action = 3; + int32 ps_num = 4; // updated ps_num; +} + +message ReadyToUpdateRequest {}; +message ReadyToUpdateResponse {}; + +message UpdateServerDefRequest { + string cluster_def = 1;//serialized cluster_def +} + +message UpdateServerDefResponse { + Code code = 1; + string msg = 2; +} + +message FetchParamsRequest { + repeated string names = 1; // vec of partitioned variables or ev +} + +message FetchParamsResponse { + Code code = 1; + string msg = 2; + map param_partition_map = 3; // per partition num of variable +} + +service ElasticTrainingService { + rpc IsReadyScaling(IsReadyScalingRequest) returns (IsReadyScalingResponse); + rpc 
ReadyToUpdate(ReadyToUpdateRequest) returns (ReadyToUpdateResponse); + rpc UpdateServerDef(UpdateServerDefRequest) returns (UpdateServerDefResponse); + rpc FetchParamsMeta(FetchParamsRequest) returns (FetchParamsResponse); +} \ No newline at end of file diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 68649078f5c..a740e0916d9 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -24,7 +24,7 @@ load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc") load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "cuda_py_tests") load("//tensorflow/core/platform:default/build_config.bzl", "pyx_library", "tf_additional_all_protos", "tf_additional_cupti_test_flags", "tf_additional_lib_deps", "tf_proto_library", "tf_proto_library_py", "tf_protos_grappler") # @unused -load("//tensorflow/core/platform:default/build_config_root.bzl", "if_static", "tf_additional_gdr_deps", "tf_additional_mpi_deps", "tf_additional_plugin_deps", "tf_additional_verbs_deps", "tf_additional_star_deps") +load("//tensorflow/core/platform:default/build_config_root.bzl", "if_static", "tf_additional_gdr_deps", "tf_additional_mpi_deps", "tf_additional_plugin_deps", "tf_additional_verbs_deps", "tf_additional_star_deps", "tf_additional_elastic_deps") load("//tensorflow/python:build_defs.bzl", "tf_gen_op_wrapper_private_py") load( "//third_party/ngraph:build_defs.bzl", @@ -5307,6 +5307,7 @@ tf_py_wrap_cc( tf_additional_verbs_deps() + tf_additional_mpi_deps() + tf_additional_gdr_deps() + + tf_additional_elastic_deps() + tf_additional_star_deps()) + if_ngraph([ "@ngraph_tf//:ngraph_tf", ]), From 2d31c8e37ea28d7c169879ebd9c3a89bd8d26cb5 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Thu, 26 Oct 2023 04:02:41 -0700 Subject: [PATCH 11/45] [Embedding] Add interface of EmbeddingVar for Elastic Training. 
(#933) Signed-off-by: JunqiHu --- configure.py | 2 +- tensorflow/contrib/elastic_grpc_server/BUILD | 3 +- tensorflow/core/BUILD | 5 +- .../framework/embedding/bloom_filter_policy.h | 2 +- .../embedding/counter_filter_policy.h | 2 +- .../framework/embedding/cpu_hash_map_kv.h | 22 +++++ .../framework/embedding/dense_hash_map_kv.h | 19 ++++ .../core/framework/embedding/embedding_var.h | 86 ++++++++++++++++++- .../embedding/embedding_var_ckpt_data.h | 1 - .../core/framework/embedding/filter_policy.h | 20 ++++- .../framework/embedding/gpu_hash_map_kv.h | 7 ++ .../core/framework/embedding/kv_interface.h | 5 ++ .../core/framework/embedding/leveldb_kv.h | 32 +++++++ .../framework/embedding/multi_tier_storage.h | 9 +- .../embedding/nullable_filter_policy.h | 2 +- .../framework/embedding/single_tier_storage.h | 13 ++- .../core/framework/embedding/ssd_hash_kv.h | 6 ++ tensorflow/core/framework/embedding/storage.h | 7 +- tensorflow/core/kernels/data/BUILD | 6 ++ tensorflow/core/kernels/data/iterator_ops.cc | 12 ++- tensorflow/python/ops/embedding_ops.py | 3 +- 21 files changed, 244 insertions(+), 20 deletions(-) diff --git a/configure.py b/configure.py index 6aeaf7d12af..4fb1c78c40b 100644 --- a/configure.py +++ b/configure.py @@ -1434,7 +1434,7 @@ def main(): True, 'star') set_build_var(environ_cp, 'TF_NEED_ELASTIC', 'ELASTIC TRAINING', 'with_elastic_support', - True, 'elastic') + False, 'elastic') set_build_var(environ_cp, 'TF_ENABLE_PMEM', 'PMEM', 'with_pmem_support', False, 'pmem') diff --git a/tensorflow/contrib/elastic_grpc_server/BUILD b/tensorflow/contrib/elastic_grpc_server/BUILD index ea4b87e3b58..16ec91f4435 100644 --- a/tensorflow/contrib/elastic_grpc_server/BUILD +++ b/tensorflow/contrib/elastic_grpc_server/BUILD @@ -56,7 +56,8 @@ cc_library( tf_cc_test( name = "elastic_grpc_test", size = "small", - srcs = ["elastic_grpc_server_lib_test.cc"], + srcs = select({"//tensorflow:with_elastic_support": ["elastic_grpc_server_lib_test.cc"], + "//conditions:default": []}), 
deps = [ ":elastic_grpc_server_lib", "//tensorflow/core/distributed_runtime/rpc:grpc_util", diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 0531200e7ab..ef1ebcb6dcf 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -128,6 +128,7 @@ load( "tf_additional_numa_deps", "tf_additional_numa_lib_defines", "tf_additional_star_lib_defines", + "tf_additional_elastic_server_lib_defines", "tf_additional_api_compatible_defines", "tf_additional_pmem_lib_defines", "tf_additional_test_deps", @@ -1441,6 +1442,7 @@ tf_cc_test( cc_library( name = "ops", visibility = ["//visibility:public"], + defines = tf_additional_elastic_server_lib_defines(), deps = [ ":array_ops_op_lib", ":parquet_ops_op_lib", @@ -2562,7 +2564,8 @@ LIB_INTERNAL_DEFINES = ( tf_additional_gdr_lib_defines() + tf_additional_numa_lib_defines() + tf_additional_star_lib_defines() + - tf_additional_pmem_lib_defines() + tf_additional_pmem_lib_defines() + + tf_additional_elastic_server_lib_defines() ) cc_library( diff --git a/tensorflow/core/framework/embedding/bloom_filter_policy.h b/tensorflow/core/framework/embedding/bloom_filter_policy.h index 781511578af..8019e70a312 100644 --- a/tensorflow/core/framework/embedding/bloom_filter_policy.h +++ b/tensorflow/core/framework/embedding/bloom_filter_policy.h @@ -333,7 +333,7 @@ class BloomFilterPolicy : public FilterPolicy { // this can describe by graph(Mod + DynamicPartition), // but memory waste and slow if (*(key_buff + i) % bucket_num % partition_num != partition_id) { - LOG(INFO) << "skip EV key:" << *(key_buff + i); + VLOG(1) << "skip EV key:" << *(key_buff + i); continue; } void* value_ptr = nullptr; diff --git a/tensorflow/core/framework/embedding/counter_filter_policy.h b/tensorflow/core/framework/embedding/counter_filter_policy.h index 19cd90ad01c..e53d574182c 100644 --- a/tensorflow/core/framework/embedding/counter_filter_policy.h +++ b/tensorflow/core/framework/embedding/counter_filter_policy.h @@ -159,7 +159,7 @@ class 
CounterFilterPolicy : public FilterPolicy { // this can describe by graph(Mod + DynamicPartition), // but memory waste and slow if (*(key_buff + i) % bucket_num % partition_num != partition_id) { - LOG(INFO) << "skip EV key:" << *(key_buff + i); + VLOG(1) << "skip EV key:" << *(key_buff + i); continue; } int64 import_freq = 0; diff --git a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h index 8476c399c40..750ba282285 100644 --- a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h @@ -137,6 +137,28 @@ class LocklessHashMap : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + std::pair *hash_map_dump; + int64 bucket_count; + auto it = hash_map_.GetSnapshot(); + hash_map_dump = it.first; + bucket_count = it.second; + for (int64 j = 0; j < bucket_count; j++) { + if (hash_map_dump[j].first != LocklessHashMap::EMPTY_KEY_ + && hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_ + && hash_map_dump[j].first % kSavedPartitionNum + % partition_nums != partition_id) { + key_list->emplace_back(hash_map_dump[j].first); + value_ptr_list->emplace_back(hash_map_dump[j].second); + } + } + + free(hash_map_dump); + return Status::OK(); + } + std::string DebugString() const override { LOG(INFO) << "map info size:" << Size() << "map info bucket_count:" << hash_map_.bucket_count() diff --git a/tensorflow/core/framework/embedding/dense_hash_map_kv.h b/tensorflow/core/framework/embedding/dense_hash_map_kv.h index ffaf2e335dc..8a27404b66f 100644 --- a/tensorflow/core/framework/embedding/dense_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/dense_hash_map_kv.h @@ -121,6 +121,25 @@ class DenseHashMap : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + 
int partition_id, int partition_nums) override { + dense_hash_map hash_map_dump[partition_num_]; + for (int i = 0; i< partition_num_; i++) { + spin_rd_lock l(hash_map_[i].mu); + hash_map_dump[i].hash_map = hash_map_[i].hash_map; + } + for (int i = 0; i< partition_num_; i++) { + for (const auto it : hash_map_dump[i].hash_map) { + if (it.first % kSavedPartitionNum % partition_nums != partition_id) { + key_list->push_back(it.first); + value_ptr_list->push_back(it.second); + } + } + } + return Status::OK(); + } + std::string DebugString() const override { return ""; } diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index 487f595bf31..a66ec19fb97 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -435,6 +435,10 @@ class EmbeddingVar : public ResourceBase { return storage_->CacheSize(); } + int64 MemoryUsage() const { + return storage_->Size() * (sizeof(K) + feat_desc_->data_bytes()); + } + int64 MinFreq() { return emb_config_.filter_freq; } @@ -516,6 +520,85 @@ class EmbeddingVar : public ResourceBase { } } + Status GetShardedSnapshot(std::vector* key_list, + std::vector* value_ptr_list, + int partition_id, int partition_num) { + return storage_->GetShardedSnapshot(key_list, value_ptr_list, + partition_id, partition_num); + } + + void ExportAndRemove(K* key_list, V* value_list, + int64* version_list, int64* freq_list, + std::vector& tot_keys_list, + std::vector& tot_value_ptr_list) { + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar( + "TF_EV_SAVE_FILTERED_FEATURES", true, &save_unfiltered_features)); + + bool is_save_freq = emb_config_.is_save_freq(); + bool is_save_version = emb_config_.is_save_version(); + + for (int64 i = 0; i < tot_keys_list.size(); ++i) { + auto& value_ptr = tot_value_ptr_list[i]; + if((int64)value_ptr == embedding::ValuePtrStatus::IS_DELETED) + continue; + + bool is_admit = 
feat_desc_->IsAdmit(value_ptr); + bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); + + if (!is_admit) { + key_list[i] = tot_keys_list[i]; + + if (!is_in_dram) { + auto tmp_value = value_list + i * value_len_; + tmp_value = (V*)embedding::ValuePtrStatus::NOT_IN_DRAM; + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + } else if (feat_desc_->GetEmbedding(value_ptr, 0) == nullptr) { + memcpy(value_list + i * value_len_, default_value_, sizeof(V) * value_len_); + } else { + V* val = feat_desc_->GetEmbedding(value_ptr, emb_config_.emb_index); + memcpy(value_list + i * value_len_, val, sizeof(V) * value_len_); + } + + if(is_save_version) { + int64 dump_version = feat_desc_->GetVersion(value_ptr); + version_list[i] = dump_version; + } + + if(is_save_freq) { + int64 dump_freq = feat_desc_->GetFreq(value_ptr); + freq_list[i] = dump_freq; + } + } else { + if (!save_unfiltered_features) + return; + //TODO(JUNQI) : currently not export filtered keys + } + + if (emb_config_.is_primary()) { + Status s; + s = storage_->Remove(tot_keys_list[i]); + if (!s.ok()) { + LOG(ERROR) << "Remove keys error: " << s.error_message(); + } + feat_desc_->Deallocate(value_ptr); + } + } + } + + Status RestoreFromKeysAndValues(int64 key_num, int partition_id, + int partition_num, const K* key_list, + const V* value_list, const int64* version_list, + const int64* freq_list, + const Eigen::GpuDevice* device = nullptr) { + RestoreBuffer restore_buff((char*)key_list, (char*)value_list, + (char*)version_list, (char*)freq_list); + return storage_->RestoreFeatures(key_num, kSavedPartitionNum, + partition_id, partition_num, + value_len_, false/* is_filter*/, false/* is_incr*/, + emb_config_, device, filter_, restore_buff); + } + mutex* mu() { return &mu_; } @@ -537,6 +620,8 @@ class EmbeddingVar : public ResourceBase { } } + string Name() {return name_; } + V* GetDefaultValuePtr() { return default_value_; } @@ -645,7 +730,6 @@ class EmbeddingVar : public ResourceBase { 
GPUHashTable* HashTable() { return storage_->HashTable(); } - FilterPolicy>* GetFilter() const { return filter_; } diff --git a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h index 10bf0d0e43b..13072f9cdd1 100644 --- a/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h +++ b/tensorflow/core/framework/embedding/embedding_var_ckpt_data.h @@ -20,7 +20,6 @@ limitations under the License. namespace tensorflow { class BundleWriter; namespace { - const int kSavedPartitionNum = 1000; const int kDramFlagOffset = 49; } diff --git a/tensorflow/core/framework/embedding/filter_policy.h b/tensorflow/core/framework/embedding/filter_policy.h index 256d3b044d4..c994829bafc 100644 --- a/tensorflow/core/framework/embedding/filter_policy.h +++ b/tensorflow/core/framework/embedding/filter_policy.h @@ -27,19 +27,31 @@ struct RestoreBuffer { char* value_buffer = nullptr; char* version_buffer = nullptr; char* freq_buffer = nullptr; + bool should_release = false; explicit RestoreBuffer(size_t buffer_size) { key_buffer = new char[buffer_size]; value_buffer = new char[buffer_size]; version_buffer = new char[buffer_size]; freq_buffer = new char[buffer_size]; + should_release = true; + } + + explicit RestoreBuffer(char* i_key_buffer, char* i_value_buffer, + char* i_version_buffer, char* i_freq_buffer) { + key_buffer = i_key_buffer; + value_buffer = i_value_buffer; + version_buffer = i_version_buffer; + freq_buffer = i_freq_buffer; } ~RestoreBuffer() { - delete []key_buffer; - delete []value_buffer; - delete []version_buffer; - delete []freq_buffer; + if (should_release) { + delete []key_buffer; + delete []value_buffer; + delete []version_buffer; + delete []freq_buffer; + } } }; diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h index fc4a2506313..e73839e3f76 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h +++ 
b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h @@ -252,6 +252,13 @@ class GPUHashMapKV : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + LOG(INFO) << "GPUHashMapKV do not support GetShardedSnapshot"; + return Status::OK(); + } + std::string DebugString() const override { return std::string(); } GPUHashTable* HashTable() override { return hash_table_; } diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index 3659187c825..dc603680138 100644 --- a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -23,6 +23,7 @@ limitations under the License. namespace tensorflow { namespace { const char* kInferenceMode = "INFERENCE_MODE"; +const int kSavedPartitionNum = 1000; } template @@ -89,6 +90,10 @@ class KVInterface { virtual Status GetSnapshot(std::vector* key_list, std::vector* value_ptr_list) = 0; + virtual Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) = 0; + virtual std::string DebugString() const = 0; virtual Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, diff --git a/tensorflow/core/framework/embedding/leveldb_kv.h b/tensorflow/core/framework/embedding/leveldb_kv.h index e488ab3776d..47c8a39dfbd 100644 --- a/tensorflow/core/framework/embedding/leveldb_kv.h +++ b/tensorflow/core/framework/embedding/leveldb_kv.h @@ -193,6 +193,38 @@ class LevelDBKV : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + ReadOptions options; + options.snapshot = db_->GetSnapshot(); + leveldb::Iterator* it = db_->NewIterator(options); + void* dram_value_ptr = feat_desc_->Allocate(); + for (it->SeekToFirst(); 
it->Valid(); it->Next()) { + K key; + memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); + if (key % kSavedPartitionNum % partition_nums == partition_id) continue; + key_list->emplace_back(key); + FeatureDescriptor hbm_feat_desc( + 1, 1, ev_allocator()/*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc.data_bytes()); + memcpy(dram_value_ptr, + it->value().ToString().data(), + feat_desc_->data_bytes()); + hbm_feat_desc.SetFreq( + value_ptr, feat_desc_->GetFreq(dram_value_ptr)); + hbm_feat_desc.UpdateVersion( + value_ptr, feat_desc_->GetVersion(dram_value_ptr)); + value_ptr_list->emplace_back(value_ptr); + } + delete it; + feat_desc_->Deallocate(dram_value_ptr); + return Status::OK(); + } + int64 Size() const override { return counter_->size(); } diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index 7955322aca6..f77fec8c85a 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -87,6 +87,14 @@ class MultiTierStorage : public Storage { Status GetSnapshot(std::vector* key_list, std::vector* value_ptr_list) override { LOG(FATAL)<<"Can't get snapshot of MultiTierStorage."; + return Status::OK(); + } + + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + LOG(FATAL)<<"Can't get sharded snapshot of MultiTierStorage."; + return Status::OK(); } void CopyEmbeddingsFromCPUToGPU( @@ -170,7 +178,6 @@ class MultiTierStorage : public Storage { }); } - protected: Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool is_incr, const EmbeddingConfig& emb_config, diff --git a/tensorflow/core/framework/embedding/nullable_filter_policy.h 
b/tensorflow/core/framework/embedding/nullable_filter_policy.h index 7e3ace0063d..55f718d7ca4 100644 --- a/tensorflow/core/framework/embedding/nullable_filter_policy.h +++ b/tensorflow/core/framework/embedding/nullable_filter_policy.h @@ -150,7 +150,7 @@ class NullableFilterPolicy : public FilterPolicy { // this can describe by graph(Mod + DynamicPartition), // but memory waste and slow if (*(key_buff + i) % bucket_num % partition_num != partition_id) { - LOG(INFO) << "skip EV key:" << *(key_buff + i); + VLOG(1) << "skip EV key:" << *(key_buff + i); continue; } int64 import_freq = 0; diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h index be08afd7f50..db96c807c5e 100644 --- a/tensorflow/core/framework/embedding/single_tier_storage.h +++ b/tensorflow/core/framework/embedding/single_tier_storage.h @@ -223,6 +223,14 @@ class SingleTierStorage : public Storage { return kv_->GetSnapshot(key_list, value_ptr_list); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + mutex_lock l(Storage::mu_); + return kv_->GetShardedSnapshot(key_list, value_ptr_list, + partition_id, partition_nums); + } + Status Save( const std::string& tensor_name, const std::string& prefix, @@ -286,7 +294,7 @@ class SingleTierStorage : public Storage { FeatureDescriptor* feature_descriptor() { return feat_desc_; } - protected: + virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool is_incr, const EmbeddingConfig& emb_config, @@ -298,7 +306,8 @@ class SingleTierStorage : public Storage { false/*to_dram*/, is_incr, restore_buff); return s; } - + + protected: virtual void Shrink(std::vector& key_list, std::vector& value_ptr_list, ShrinkArgs& shrink_args, diff --git a/tensorflow/core/framework/embedding/ssd_hash_kv.h 
b/tensorflow/core/framework/embedding/ssd_hash_kv.h index f51c6904a50..a56c9f73385 100644 --- a/tensorflow/core/framework/embedding/ssd_hash_kv.h +++ b/tensorflow/core/framework/embedding/ssd_hash_kv.h @@ -349,6 +349,12 @@ class SSDHashKV : public KVInterface { return Status::OK(); } + Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) override { + return Status::OK(); + } + Status GetSnapshot( std::vector* key_list, std::vector* file_list) { diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index 1ffb435054b..a652de5fa5f 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -95,6 +95,9 @@ class Storage { virtual int64 Size(int level) const = 0; virtual Status GetSnapshot(std::vector* key_list, std::vector* value_ptr_list) = 0; + virtual Status GetShardedSnapshot( + std::vector* key_list, std::vector* value_ptr_list, + int partition_id, int partition_nums) = 0; virtual Status Save( const string& tensor_name, const string& prefix, @@ -197,7 +200,6 @@ class Storage { int64 freq, int64 version, int emb_index) = 0; - protected: virtual Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, int64 partition_num, int64 value_len, bool is_filter, bool is_incr, const EmbeddingConfig& emb_config, @@ -206,7 +208,8 @@ class Storage { RestoreBuffer& restore_buff) { return Status::OK(); } - + + protected: virtual Status RestoreSSD(int64 emb_index, int64 emb_slot_num, int64 value_len, const std::string& ssd_emb_file_name, diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 08445403b58..6878c5f8350 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -9,6 +9,11 @@ load( "transitive_hdrs", ) +load( + "//tensorflow/core/platform:default/build_config.bzl", + "tf_additional_elastic_server_lib_defines", +) 
+ package( default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 @@ -1119,6 +1124,7 @@ tf_kernel_library( name = "iterator_ops", srcs = ["iterator_ops.cc"], hdrs = ["iterator_ops.h"], + defines = tf_additional_elastic_server_lib_defines(), deps = [ ":captured_function", ":dataset_utils", diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index 08d9d936537..ed6b40a38a0 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -308,7 +308,11 @@ void IteratorHandleOp::Compute(OpKernelContext* context) LOCKS_EXCLUDED(mu_) { } ResourceMgr* mgr = context->resource_manager(); - OP_REQUIRES_OK(context, cinfo_.Init(mgr, def())); +#ifdef TENSORFLOW_USE_ELASTIC_SERVER + OP_REQUIRES_OK(context, cinfo_.Init(mgr, def(), true)); +#else + OP_REQUIRES_OK(context, cinfo_.Init(mgr, def(), false)); +#endif IteratorResource* resource; OP_REQUIRES_OK( @@ -783,7 +787,11 @@ class OneShotIteratorOp : public AsyncOpKernel { Status TryInit(OpKernelContext* ctx, IteratorResource** iterator, ContainerInfo* cinfo) { - TF_RETURN_IF_ERROR(cinfo->Init(ctx->resource_manager(), def())); +#ifdef TENSORFLOW_USE_ELASTIC_SERVER + TF_RETURN_IF_ERROR(cinfo->Init(ctx->resource_manager(), def(), true)); +#else + TF_RETURN_IF_ERROR(cinfo->Init(ctx->resource_manager(), def(), false)); +#endif FunctionLibraryRuntime* flr; std::unique_ptr flib_def(nullptr); diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py index cb2b7bb8154..e239c9ba8d5 100644 --- a/tensorflow/python/ops/embedding_ops.py +++ b/tensorflow/python/ops/embedding_ops.py @@ -44,6 +44,7 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import tf_export +SAVED_PARTITIONED_NUM = 1000 def _clip(params, ids, max_norm): """Helper function for _embedding_lookup_and_transform. 
@@ -216,7 +217,7 @@ def _embedding_lookup_and_transform(params, if isinstance(params[0], kv_variable_ops.EmbeddingVariable): new_ids = flat_ids - p_assignments = flat_ids % 1000 % np + p_assignments = flat_ids % SAVED_PARTITIONED_NUM % np elif partition_strategy == "mod": p_assignments = flat_ids % np new_ids = flat_ids // np From 89c7d63f50ed335ea14eb17f295b315a59e9f843 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 1 Nov 2023 19:48:48 +0800 Subject: [PATCH 12/45] [Runtime] Update log level in direct_session. (#935) Signed-off-by: candy.dc --- tensorflow/core/common_runtime/direct_session.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 9670e838f88..a3dd3eba2ed 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -2185,8 +2185,8 @@ Status DirectSession::GetOrCreateExecutors( auto insert_key_status = executors_.emplace(key, insert_result.first->second); *executors_and_keys = insert_result.first->second.get(); if (insert_key_status.second) { - LOG(INFO) << "Add new unsort key to executors_ map: " << executors_idx++ - << ", key: " << key << ", this: " << this; + VLOG(2) << "Add new unsort key to executors_ map: " << executors_idx++ + << ", key: " << key << ", this: " << this; } return Status::OK(); From c2e664aecaec18106350ec77dee946e45dbcf1fb Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 7 Nov 2023 19:10:14 -0800 Subject: [PATCH 13/45] [Embedding] Remove private header. 
(#943) Signed-off-by: JunqiHu --- tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h | 1 - tensorflow/core/framework/embedding/hbm_dram_storage.h | 1 - 2 files changed, 2 deletions(-) diff --git a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h index 1056f4bbd78..4bc3b7d3aa2 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_ssd_storage.h @@ -7,7 +7,6 @@ #include "tensorflow/core/framework/embedding/single_tier_storage.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" #include "tensorflow/core/platform/stream_executor.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" namespace tensorflow { using se::DeviceMemoryBase; diff --git a/tensorflow/core/framework/embedding/hbm_dram_storage.h b/tensorflow/core/framework/embedding/hbm_dram_storage.h index d058d95f05b..15f6271fb4f 100644 --- a/tensorflow/core/framework/embedding/hbm_dram_storage.h +++ b/tensorflow/core/framework/embedding/hbm_dram_storage.h @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/framework/embedding/hbm_storage_iterator.h" #include "tensorflow/core/framework/embedding/intra_thread_copy_id_allocator.h" #include "tensorflow/core/platform/stream_executor.h" -#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h" namespace tensorflow { using se::DeviceMemoryBase; From fc4f9f5c48b3f84d1f945c6aa738253cac7acf95 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 7 Nov 2023 23:32:37 -0800 Subject: [PATCH 14/45] [Distributed] Fix ps address list sort by index. 
(#945) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- .../elastic_grpc_server_lib.cc | 17 +++++++++++------ .../elastic_grpc_server/elastic_service.cc | 2 +- tensorflow/core/protobuf/elastic_training.proto | 2 +- tensorflow/python/BUILD | 1 + 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc index d45d70d6c8c..66e237956e5 100644 --- a/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc +++ b/tensorflow/contrib/elastic_grpc_server/elastic_grpc_server_lib.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include +#include #include "include/json/json.h" #include "grpc/support/alloc.h" #include "grpcpp/grpcpp.h" @@ -89,7 +90,7 @@ Status ElasticGrpcServer::UpdateServerDef(const string& cluster_def_str, int& be return errors::Internal("PARSE TF_CONFIG/cluster ERROR"); } - std::unordered_set ps_addrs_vec; + std::set ps_addrs_vec; //ordered after_part_num = cluster_json["cluster"]["ps"].size(); for (auto& value: cluster_json["cluster"]["ps"]) { ps_addrs_vec.emplace(value.asString()); @@ -111,21 +112,25 @@ Status ElasticGrpcServer::UpdateServerDef(const string& cluster_def_str, int& be } for (auto ps_addr: ps_addrs_vec) { if (target_string_set.find(ps_addr) == target_string_set.end()) { - job->mutable_tasks()->insert({idx, ps_addr}); + job->mutable_tasks()->insert({idx++, ps_addr}); tf_config_json["cluster"]["ps"].append(ps_addr); } } break; } else { LOG(INFO) << "SCALING DOWN, partition_num is: " << after_part_num; + google::protobuf::Map< google::protobuf::int32, std::string > tasks; + Json::Value arr_value(Json::arrayValue); + int idx = 0; for (int i = 0; i < before_part_num; ++i) { string tmp_string = tf_config_json["cluster"]["ps"][i].asString(); - if (ps_addrs_vec.find(tmp_string) == ps_addrs_vec.end()) { - Json::Value ps_addr; - 
tf_config_json["cluster"]["ps"].removeIndex(i, &ps_addr); - job->mutable_tasks()->erase(i); + if (ps_addrs_vec.find(tmp_string) != ps_addrs_vec.end()) { + arr_value.append(tf_config_json["cluster"]["ps"][i]); + tasks[idx++] = tmp_string; } } + tf_config_json["cluster"]["ps"].swap(arr_value); + job->mutable_tasks()->swap(tasks); } } } diff --git a/tensorflow/contrib/elastic_grpc_server/elastic_service.cc b/tensorflow/contrib/elastic_grpc_server/elastic_service.cc index 61aa6e662ec..59f7fa473bd 100644 --- a/tensorflow/contrib/elastic_grpc_server/elastic_service.cc +++ b/tensorflow/contrib/elastic_grpc_server/elastic_service.cc @@ -24,7 +24,7 @@ limitations under the License. #include #include "grpcpp/server_builder.h" -using namespace des; +using namespace deeprec; using grpc::Server; using grpc::ServerAsyncResponseWriter; diff --git a/tensorflow/core/protobuf/elastic_training.proto b/tensorflow/core/protobuf/elastic_training.proto index ee0d0bd10e0..b6af4b139cf 100644 --- a/tensorflow/core/protobuf/elastic_training.proto +++ b/tensorflow/core/protobuf/elastic_training.proto @@ -1,6 +1,6 @@ syntax = "proto3"; -package des; +package deeprec; enum Code { OK = 0; diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index a740e0916d9..f9cc74743be 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -4747,6 +4747,7 @@ py_library( ":platform", ":protos_all_py", ":session_run_hook", + "//tensorflow/core:elastic_service_pb_py", ":training_util", ":util", ], From 29d9b464b55b571484ceae11947a6dfa25caba19 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Wed, 8 Nov 2023 19:17:25 -0800 Subject: [PATCH 15/45] [Op] Canonicalize SaveV2 Op device spec in distributed training. 
(#925) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- tensorflow/python/training/saver.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py index 981d01dd7be..acc9723c183 100644 --- a/tensorflow/python/training/saver.py +++ b/tensorflow/python/training/saver.py @@ -550,8 +550,12 @@ def _GroupByDevices(self, saveables): """ per_device = collections.defaultdict(lambda: []) for saveable in saveables: - canonical_device = set( - pydev.canonical_name(spec.tensor.device) for spec in saveable.specs) + canonical_device = set() + for spec in saveable.specs: + device_name = pydev.canonical_name(spec.tensor.device) + device_spec = pydev.DeviceSpec.from_string(device_name) + device_spec.device_type = "CPU" + canonical_device.add(device_spec.to_string()) if len(canonical_device) != 1: raise ValueError("All tensors of a saveable object must be " "on the same device: %s" % saveable.name) From feab52dd225b9838d41790f25abb0f2f0607b199 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 15 Nov 2023 10:24:34 +0800 Subject: [PATCH 16/45] [Embedding] Fix SharedEmbeddingColumn with PartitionedEmbedingVariable shape validation error. 
(#948) Signed-off-by: candy.dc --- .../python/feature_column/feature_column.py | 3 ++ .../feature_column/feature_column_v2_test.py | 35 +++++++++++++++++++ tensorflow/python/ops/variables.py | 3 ++ 3 files changed, 41 insertions(+) diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 3d5e7a71330..86a190cf86b 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -2675,6 +2675,9 @@ def create_embedding(self, embedding_weights = shared_embedding_collection[0] if isinstance(embedding_weights, kv_variable_ops.EmbeddingVariable): embedding_shape = (self.dimension) + elif isinstance(embedding_weights, variables.PartitionedVariable): + if isinstance(embedding_weights._get_variable_list()[0], kv_variable_ops.EmbeddingVariable): + embedding_shape = (self.dimension) if embedding_weights.get_shape() != embedding_shape: raise ValueError( 'Shared embedding collection {} contains variable {} of ' diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index ff5935b708f..7946aee1e1a 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -7705,6 +7705,41 @@ def testEmbeddingVariableForSharedEmbeddingColumnsMultiCol(self): for j in range(3): self.assertAlmostEqual(emb_r[i][j], emb_right[i][j]) + def testEmbeddingVariableForSharedPartitionedEmbeddingColumnsMultiCol(self): + columns_list=[] + columns_list.append(fc.categorical_column_with_embedding("col_emb", dtype=dtypes.string)) + columns_list.append(fc.categorical_column_with_embedding("col_emb2", dtype=dtypes.string)) + W = fc.shared_embedding_columns(columns_list, + dimension=3, + initializer=init_ops.ones_initializer(dtypes.float32), + shared_embedding_collection_name="xxxxx_shared") + + ids={} + ids["col_emb"] = 
sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0]], values=["aaaa","bbbbb","ccc","4nn","5b"], dense_shape=[5, 5]) + ids["col_emb2"] = sparse_tensor.SparseTensor(indices=[[0,0],[1,0],[2,0],[3,0],[4,0]], values=["aaaa","bbbbb","ccc","4nn","5b"], dense_shape=[5, 5]) + with variable_scope.variable_scope("scope",partitioner=partitioned_variables.fixed_size_partitioner(4)): + emb = fc_old.input_layer(ids, W) + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables_lib.global_variables_initializer() + + with self.test_session() as sess: + sess.run(init) + sess.run([emb, train_op,loss]) + sess.run([emb, train_op,loss]) + emb_r, _, _ = sess.run([emb, train_op,loss]) + emb_right = [[0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214], + [0.7221214, 0.7221214, 0.7221214]] + for i in range(5): + for j in range(3): + self.assertAlmostEqual(emb_r[i][j], emb_right[i][j]) + @test_util.run_deprecated_v1 def testEmbeddingVariableForSharedEmbeddingColumnsWithPartitionNum(self): columns_list=[] diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py index 6a3a1e0702b..8f92d091e68 100644 --- a/tensorflow/python/ops/variables.py +++ b/tensorflow/python/ops/variables.py @@ -3100,6 +3100,9 @@ def __init__(self, name, shape, dtype, variable_list, partitions): self._name = name self._shape = shape + from tensorflow.python.ops import kv_variable_ops + if isinstance(self._variable_list[0], kv_variable_ops.EmbeddingVariable): + self._shape = shape[1:] self._dtype = dtype self._partitions = partitions self._as_tensor = None From 37221b53ca3a90ea1a3f85cc787463fc3c9884fe Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 15 
Nov 2023 11:42:07 +0800 Subject: [PATCH 17/45] [Release] Update DeepRec release version to 1.15.5+deeprec2310. (#949) Signed-off-by: candy.dc --- tensorflow/tools/pip_package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index d5fa79bf2b1..e8635e1a298 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -47,7 +47,7 @@ # result for pip. # Also update tensorflow/tensorflow.bzl and # tensorflow/core/public/version.h -_VERSION = '1.15.5+deeprec2306' +_VERSION = '1.15.5+deeprec2310' REQUIRED_PACKAGES = [ 'absl-py >= 0.9.0', From 3bc98886262c496ffcacac54f02391c9818e75ae Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Thu, 16 Nov 2023 16:53:48 +0800 Subject: [PATCH 18/45] [Docs] Update deeprec2310 release images and notes in README.md & RELEASE.md. (#950) Signed-off-by: candy.dc --- README.md | 4 +- RELEASE.md | 41 +++++++++++++++++++ docs/docs_en/DeepRec-Compile-And-Install.md | 4 +- docs/docs_en/Estimator-Compile-And-Install.md | 2 +- docs/docs_en/TFServing-Compile-And-Install.md | 2 +- docs/docs_zh/DeepRec-Compile-And-Install.md | 4 +- docs/docs_zh/Estimator-Compile-And-Install.md | 2 +- docs/docs_zh/TFServing-Compile-And-Install.md | 2 +- 8 files changed, 51 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 53cca5c5c83..8f491e14665 100644 --- a/README.md +++ b/README.md @@ -95,13 +95,13 @@ $ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux #### Image for CPU ``` -alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 ``` #### Image for GPU CUDA11.6 ``` -alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 ``` *** diff --git a/RELEASE.md b/RELEASE.md index 43e03bc2b49..6b7e4a7fd79 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ 
-1,3 +1,44 @@ +# Release r1.15.5-deeprec2310 +## **Major Features and Improvements** + +### **Embedding** + +- Refactor the data structure of EmbeddingVariable. +- Add interface of EmbeddingVar for Elastic Training. +- Add GetSnapshot and Create API for EmbeddingVariable. +- Remove the dependency on private header file in EmbeddingVariable. + +### **Runtime Optimization** + +- Canonicalize SaveV2 Op device spec in distributed training. +- Update log level in direct_session. + +### **Distributed** + +- Add elastic-grpc server. + +### **BugFix** + +- Fix missing return value of RestoreSSD of DramSSDHashStorage. +- Fix incorrect frequency in shared-embedding. +- Fix set initialized flag too early in restore subgraph. +- Fix wgrad bug in Sparse Operation Kit. +- Fix hang bug for async embedding lookup. +- Fix ps address list sort by index. +- Fix SharedEmbeddingColumn with PartitionedEmbedingVariable shape validation error. + +More details of features: [https://deeprec.readthedocs.io/zh/latest/](url) + +## **Release Images** + +### **CPU Image** + +`alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04` + +### **GPU Image** + +`alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04` + # Release r1.15.5-deeprec2306 ## **Major Features and Improvements** diff --git a/docs/docs_en/DeepRec-Compile-And-Install.md b/docs/docs_en/DeepRec-Compile-And-Install.md index 83ba4854b9f..fdf3e295fdd 100644 --- a/docs/docs_en/DeepRec-Compile-And-Install.md +++ b/docs/docs_en/DeepRec-Compile-And-Install.md @@ -111,7 +111,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 ``` arm64: @@ -122,5 +122,5 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU Image with CUDA 11.6** ``` -alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 
+alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 ``` diff --git a/docs/docs_en/Estimator-Compile-And-Install.md b/docs/docs_en/Estimator-Compile-And-Install.md index 73b6a36f318..55f759a3c2a 100644 --- a/docs/docs_en/Estimator-Compile-And-Install.md +++ b/docs/docs_en/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ DeepRec provide new distributed protocols such as grpc++ and star_server, which Source Code: [https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -Develop Branch:master, Latest Release Branch: deeprec2306 +Develop Branch:master, Latest Release Branch: deeprec2310 ## Estimator Build diff --git a/docs/docs_en/TFServing-Compile-And-Install.md b/docs/docs_en/TFServing-Compile-And-Install.md index 346a848ca74..79a0944aa3e 100644 --- a/docs/docs_en/TFServing-Compile-And-Install.md +++ b/docs/docs_en/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ We provide optimized TFServing which could highly improve performance in inferen Source Code: [https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -Develop Branch: master, Latest Release Branch: deeprec2306 +Develop Branch: master, Latest Release Branch: deeprec2310 ## TFServing Build diff --git a/docs/docs_zh/DeepRec-Compile-And-Install.md b/docs/docs_zh/DeepRec-Compile-And-Install.md index 08d249f8eeb..ad8fd36dbf7 100644 --- a/docs/docs_zh/DeepRec-Compile-And-Install.md +++ b/docs/docs_zh/DeepRec-Compile-And-Install.md @@ -108,7 +108,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2306-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 ``` arm64: @@ -119,7 +119,7 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU CUDA11.6镜像** ``` -alideeprec/deeprec-release:deeprec2306-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 ``` ## DeepRec Processor编译打包 diff --git 
a/docs/docs_zh/Estimator-Compile-And-Install.md b/docs/docs_zh/Estimator-Compile-And-Install.md index e5455aae91a..e54c8ddbd2f 100644 --- a/docs/docs_zh/Estimator-Compile-And-Install.md +++ b/docs/docs_zh/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ 代码库:[https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -开发分支:master,最新Release分支:deeprec2306 +开发分支:master,最新Release分支:deeprec2310 ## Estimator编译 diff --git a/docs/docs_zh/TFServing-Compile-And-Install.md b/docs/docs_zh/TFServing-Compile-And-Install.md index 0c76400e6c6..a43d2d517a6 100644 --- a/docs/docs_zh/TFServing-Compile-And-Install.md +++ b/docs/docs_zh/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ 代码库:[https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -开发分支:master,最新Release分支:deeprec2306 +开发分支:master,最新Release分支:deeprec2310 ## TFServing编译&打包 From d8149699bd8366ef7bb32ea049c4202b0c8d0c68 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:41:02 -0800 Subject: [PATCH 19/45] [ModelZoo] Set Saver's parameter sharded=True in distributed training. 
(#954) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- modelzoo/bst/train.py | 3 ++- modelzoo/dbmtl/train.py | 3 ++- modelzoo/dcn/train.py | 3 ++- modelzoo/dcnv2/train.py | 3 ++- modelzoo/deepfm/train.py | 3 ++- modelzoo/dien/train.py | 6 +++--- modelzoo/din/train.py | 6 +++--- modelzoo/dlrm/train.py | 3 ++- modelzoo/dssm/train.py | 3 ++- modelzoo/esmm/train.py | 5 +++-- modelzoo/masknet/train.py | 3 ++- modelzoo/mlperf/train.py | 3 ++- modelzoo/mmoe/train.py | 3 ++- modelzoo/ple/train.py | 3 ++- modelzoo/simple_multitask/train.py | 5 +++-- modelzoo/wide_and_deep/train.py | 3 ++- 16 files changed, 36 insertions(+), 22 deletions(-) diff --git a/modelzoo/bst/train.py b/modelzoo/bst/train.py index 2fb5e4e90f5..eeeb136678b 100644 --- a/modelzoo/bst/train.py +++ b/modelzoo/bst/train.py @@ -612,9 +612,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dbmtl/train.py b/modelzoo/dbmtl/train.py index 24595073b95..c848cbc76b2 100644 --- a/modelzoo/dbmtl/train.py +++ b/modelzoo/dbmtl/train.py @@ -527,9 +527,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dcn/train.py b/modelzoo/dcn/train.py index 
b8e1dba5d63..44701e22d9f 100644 --- a/modelzoo/dcn/train.py +++ b/modelzoo/dcn/train.py @@ -594,9 +594,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dcnv2/train.py b/modelzoo/dcnv2/train.py index 7ac4c1a0358..5b572af0425 100644 --- a/modelzoo/dcnv2/train.py +++ b/modelzoo/dcnv2/train.py @@ -610,9 +610,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/deepfm/train.py b/modelzoo/deepfm/train.py index 896295b0ae6..166bedec0d0 100644 --- a/modelzoo/deepfm/train.py +++ b/modelzoo/deepfm/train.py @@ -472,9 +472,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dien/train.py b/modelzoo/dien/train.py index 6c583c3ac19..190695f6ce0 100644 --- a/modelzoo/dien/train.py +++ b/modelzoo/dien/train.py @@ -776,10 +776,10 @@ def train(sess_config, hooks = 
[] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.tables_initializer(), - tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/din/train.py b/modelzoo/din/train.py index 6273e0d15a4..058583ce6fd 100644 --- a/modelzoo/din/train.py +++ b/modelzoo/din/train.py @@ -594,10 +594,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.tables_initializer(), - tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dlrm/train.py b/modelzoo/dlrm/train.py index 0789e9418b8..cc4c045c349 100644 --- a/modelzoo/dlrm/train.py +++ b/modelzoo/dlrm/train.py @@ -507,9 +507,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dssm/train.py b/modelzoo/dssm/train.py index a757851711c..db949aac5e8 100644 --- a/modelzoo/dssm/train.py +++ b/modelzoo/dssm/train.py @@ 
-478,9 +478,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/esmm/train.py b/modelzoo/esmm/train.py index 58219e19e3e..073b08814d4 100755 --- a/modelzoo/esmm/train.py +++ b/modelzoo/esmm/train.py @@ -534,9 +534,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - local_init_op=tf.group(tf.local_variables_initializer(), train_init_op), - saver=tf.train.Saver(max_to_keep=keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=train_steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/masknet/train.py b/modelzoo/masknet/train.py index 0790f200b21..bb96a467701 100644 --- a/modelzoo/masknet/train.py +++ b/modelzoo/masknet/train.py @@ -529,9 +529,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/mlperf/train.py b/modelzoo/mlperf/train.py index db7e077250b..ce34fe5e55c 100644 --- a/modelzoo/mlperf/train.py +++ b/modelzoo/mlperf/train.py @@ -522,9 +522,10 @@ def train(sess_config, hooks = [] 
hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/mmoe/train.py b/modelzoo/mmoe/train.py index 251e02c7a72..694eb45da80 100644 --- a/modelzoo/mmoe/train.py +++ b/modelzoo/mmoe/train.py @@ -523,9 +523,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/ple/train.py b/modelzoo/ple/train.py index 2ba98363bbf..b2d2f2057ec 100644 --- a/modelzoo/ple/train.py +++ b/modelzoo/ple/train.py @@ -592,9 +592,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/simple_multitask/train.py b/modelzoo/simple_multitask/train.py index ff90946c96d..4ef1874a521 100644 --- a/modelzoo/simple_multitask/train.py +++ b/modelzoo/simple_multitask/train.py @@ -427,9 +427,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( - 
local_init_op=tf.group(tf.local_variables_initializer(), train_init_op), - saver=tf.train.Saver(max_to_keep=keep_checkpoint_max)) + local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=train_steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/wide_and_deep/train.py b/modelzoo/wide_and_deep/train.py index b4f4dbc7a65..3024f58024e 100644 --- a/modelzoo/wide_and_deep/train.py +++ b/modelzoo/wide_and_deep/train.py @@ -543,9 +543,10 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) + sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( From 7ce84779b69d746111db5934bc90b94fc3ada6fa Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 5 Dec 2023 00:51:05 -0800 Subject: [PATCH 20/45] [Embedding] Refine KVInterface::GetShardedSnapshot API. 
(#953) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 --- .../core/framework/embedding/cpu_hash_map_kv.h | 14 ++++++++------ .../core/framework/embedding/dense_hash_map_kv.h | 10 ++++++---- .../core/framework/embedding/embedding_var.h | 9 +++++---- .../core/framework/embedding/gpu_hash_map_kv.h | 3 ++- tensorflow/core/framework/embedding/kv_interface.h | 3 ++- tensorflow/core/framework/embedding/leveldb_kv.h | 10 ++++++---- .../core/framework/embedding/multi_tier_storage.h | 3 ++- .../core/framework/embedding/single_tier_storage.h | 3 ++- tensorflow/core/framework/embedding/ssd_hash_kv.h | 3 ++- tensorflow/core/framework/embedding/storage.h | 3 ++- 10 files changed, 37 insertions(+), 24 deletions(-) diff --git a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h index 750ba282285..f9a6e1fff25 100644 --- a/tensorflow/core/framework/embedding/cpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/cpu_hash_map_kv.h @@ -138,7 +138,8 @@ class LocklessHashMap : public KVInterface { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { std::pair *hash_map_dump; int64 bucket_count; @@ -147,11 +148,12 @@ class LocklessHashMap : public KVInterface { bucket_count = it.second; for (int64 j = 0; j < bucket_count; j++) { if (hash_map_dump[j].first != LocklessHashMap::EMPTY_KEY_ - && hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_ - && hash_map_dump[j].first % kSavedPartitionNum - % partition_nums != partition_id) { - key_list->emplace_back(hash_map_dump[j].first); - value_ptr_list->emplace_back(hash_map_dump[j].second); + && hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_) { + int part_id = hash_map_dump[j].first % kSavedPartitionNum % partition_nums; + if (part_id != partition_id) { + 
key_list[part_id].emplace_back(hash_map_dump[j].first); + value_ptr_list[part_id].emplace_back(hash_map_dump[j].second); + } } } diff --git a/tensorflow/core/framework/embedding/dense_hash_map_kv.h b/tensorflow/core/framework/embedding/dense_hash_map_kv.h index 8a27404b66f..12749a92e6e 100644 --- a/tensorflow/core/framework/embedding/dense_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/dense_hash_map_kv.h @@ -122,7 +122,8 @@ class DenseHashMap : public KVInterface { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { dense_hash_map hash_map_dump[partition_num_]; for (int i = 0; i< partition_num_; i++) { @@ -131,9 +132,10 @@ class DenseHashMap : public KVInterface { } for (int i = 0; i< partition_num_; i++) { for (const auto it : hash_map_dump[i].hash_map) { - if (it.first % kSavedPartitionNum % partition_nums != partition_id) { - key_list->push_back(it.first); - value_ptr_list->push_back(it.second); + int part_id = it.first % kSavedPartitionNum % partition_nums; + if (part_id != partition_id) { + key_list[part_id].emplace_back(it.first); + value_ptr_list[part_id].emplace_back(it.second); } } } diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index a66ec19fb97..df6ae6f1277 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -520,8 +520,8 @@ class EmbeddingVar : public ResourceBase { } } - Status GetShardedSnapshot(std::vector* key_list, - std::vector* value_ptr_list, + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_num) { return storage_->GetShardedSnapshot(key_list, value_ptr_list, partition_id, partition_num); @@ -546,7 +546,7 @@ class EmbeddingVar : public ResourceBase { bool is_admit = 
feat_desc_->IsAdmit(value_ptr); bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); - if (!is_admit) { + if (is_admit) { key_list[i] = tot_keys_list[i]; if (!is_in_dram) { @@ -571,7 +571,7 @@ class EmbeddingVar : public ResourceBase { } } else { if (!save_unfiltered_features) - return; + continue; //TODO(JUNQI) : currently not export filtered keys } @@ -584,6 +584,7 @@ class EmbeddingVar : public ResourceBase { feat_desc_->Deallocate(value_ptr); } } + return; } Status RestoreFromKeysAndValues(int64 key_num, int partition_id, diff --git a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h index e73839e3f76..68fecf690ba 100644 --- a/tensorflow/core/framework/embedding/gpu_hash_map_kv.h +++ b/tensorflow/core/framework/embedding/gpu_hash_map_kv.h @@ -253,7 +253,8 @@ class GPUHashMapKV : public KVInterface { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { LOG(INFO) << "GPUHashMapKV do not support GetShardedSnapshot"; return Status::OK(); diff --git a/tensorflow/core/framework/embedding/kv_interface.h b/tensorflow/core/framework/embedding/kv_interface.h index dc603680138..8480132a7d9 100644 --- a/tensorflow/core/framework/embedding/kv_interface.h +++ b/tensorflow/core/framework/embedding/kv_interface.h @@ -91,7 +91,8 @@ class KVInterface { std::vector* value_ptr_list) = 0; virtual Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) = 0; virtual std::string DebugString() const = 0; diff --git a/tensorflow/core/framework/embedding/leveldb_kv.h b/tensorflow/core/framework/embedding/leveldb_kv.h index 47c8a39dfbd..030a0969e5d 100644 --- a/tensorflow/core/framework/embedding/leveldb_kv.h +++ b/tensorflow/core/framework/embedding/leveldb_kv.h @@ 
-194,7 +194,8 @@ class LevelDBKV : public KVInterface { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { ReadOptions options; options.snapshot = db_->GetSnapshot(); @@ -203,8 +204,9 @@ class LevelDBKV : public KVInterface { for (it->SeekToFirst(); it->Valid(); it->Next()) { K key; memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); - if (key % kSavedPartitionNum % partition_nums == partition_id) continue; - key_list->emplace_back(key); + int part_id = key % kSavedPartitionNum % partition_nums; + if (part_id == partition_id) continue; + key_list[part_id].emplace_back(key); FeatureDescriptor hbm_feat_desc( 1, 1, ev_allocator()/*useless*/, StorageType::HBM_DRAM, true, true, @@ -218,7 +220,7 @@ class LevelDBKV : public KVInterface { value_ptr, feat_desc_->GetFreq(dram_value_ptr)); hbm_feat_desc.UpdateVersion( value_ptr, feat_desc_->GetVersion(dram_value_ptr)); - value_ptr_list->emplace_back(value_ptr); + value_ptr_list[part_id].emplace_back(value_ptr); } delete it; feat_desc_->Deallocate(dram_value_ptr); diff --git a/tensorflow/core/framework/embedding/multi_tier_storage.h b/tensorflow/core/framework/embedding/multi_tier_storage.h index f77fec8c85a..e27521f1a65 100644 --- a/tensorflow/core/framework/embedding/multi_tier_storage.h +++ b/tensorflow/core/framework/embedding/multi_tier_storage.h @@ -91,7 +91,8 @@ class MultiTierStorage : public Storage { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { LOG(FATAL)<<"Can't get sharded snapshot of MultiTierStorage."; return Status::OK(); diff --git a/tensorflow/core/framework/embedding/single_tier_storage.h b/tensorflow/core/framework/embedding/single_tier_storage.h index db96c807c5e..1c6bdd90790 100644 --- 
a/tensorflow/core/framework/embedding/single_tier_storage.h +++ b/tensorflow/core/framework/embedding/single_tier_storage.h @@ -224,7 +224,8 @@ class SingleTierStorage : public Storage { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { mutex_lock l(Storage::mu_); return kv_->GetShardedSnapshot(key_list, value_ptr_list, diff --git a/tensorflow/core/framework/embedding/ssd_hash_kv.h b/tensorflow/core/framework/embedding/ssd_hash_kv.h index a56c9f73385..bdc38cc5d5e 100644 --- a/tensorflow/core/framework/embedding/ssd_hash_kv.h +++ b/tensorflow/core/framework/embedding/ssd_hash_kv.h @@ -350,7 +350,8 @@ class SSDHashKV : public KVInterface { } Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) override { return Status::OK(); } diff --git a/tensorflow/core/framework/embedding/storage.h b/tensorflow/core/framework/embedding/storage.h index a652de5fa5f..559588af7e1 100644 --- a/tensorflow/core/framework/embedding/storage.h +++ b/tensorflow/core/framework/embedding/storage.h @@ -96,7 +96,8 @@ class Storage { virtual Status GetSnapshot(std::vector* key_list, std::vector* value_ptr_list) = 0; virtual Status GetShardedSnapshot( - std::vector* key_list, std::vector* value_ptr_list, + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, int partition_nums) = 0; virtual Status Save( const string& tensor_name, From a5c014f144f00b5d5606ffa1e47bda0c8e0a2478 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Sun, 10 Dec 2023 22:07:29 +0800 Subject: [PATCH 21/45] [IO] Fix tensor shape meta-data bug for DataFrame Value. (#958) * Revert "[IO] Add tensor shape meta-data support for ParquetDataset. (#849)" * [IO] Fix tensor shape meta-data bug for DataFrame Value. 
Signed-off-by: chenbangduo.cbd --- .../python/data/experimental/ops/dataframe.py | 26 ++++++++--------- .../experimental/ops/parquet_dataset_ops.py | 28 +++++++++++-------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/dataframe.py b/tensorflow/python/data/experimental/ops/dataframe.py index f3dc249653a..003f75259f1 100644 --- a/tensorflow/python/data/experimental/ops/dataframe.py +++ b/tensorflow/python/data/experimental/ops/dataframe.py @@ -59,17 +59,14 @@ def __init__(self, name, dtype=None, ragged_rank=None, shape=None): self._ragged_rank = ragged_rank if shape: shape = tensor_shape.TensorShape(shape) - shape_rank = 0 - for _ in shape: - shape_rank += 1 - if ragged_rank is not None and ragged_rank != shape_rank: + for d in shape: + if d.value is None: + raise ValueError( + f'Field {name} has incomplete shape: {shape}') + if ragged_rank is not None and ragged_rank > 1: raise ValueError( f'Field {name} is a nested list ({ragged_rank}) ' f'with shape {shape}') - self._ragged_rank = shape_rank - elif ragged_rank is not None: - shape = tensor_shape.TensorShape([None for _ in xrange(ragged_rank)]) - self._shape = shape @property @@ -134,16 +131,17 @@ def output_classes(self): def output_types(self): return self.map(lambda i: self._dtype if i == 0 else dtypes.int32) - def output_shapes(self, batch_size=None): + @property + def output_shapes(self): if self._shape is None: - return self.map(lambda i: tensor_shape.vector(batch_size) if i == 0 - else tensor_shape.vector(None)) + return self.map(lambda _: tensor_shape.vector(None)) return self.map( - lambda i: tensor_shape.vector(batch_size).concatenate(self._shape) if i == 0 + lambda i: tensor_shape.vector(None).concatenate(self._shape) if i == 0 else tensor_shape.vector(None)) - def output_specs(self, batch_size=None): - shape = tensor_shape.vector(batch_size) + @property + def output_specs(self): + shape = tensor_shape.vector(None) if self._shape is not None: 
shape = shape.concatenate(self._shape) specs = [tensor_spec.TensorSpec(shape, dtype=self._dtype)] diff --git a/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py b/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py index 719940d1beb..5bb790c331d 100644 --- a/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py +++ b/tensorflow/python/data/experimental/ops/parquet_dataset_ops.py @@ -22,6 +22,7 @@ from tensorflow.python.data.ops import readers from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import type_spec from tensorflow.python.util import nest @@ -38,25 +39,23 @@ class DataFrameValueSpec(type_spec.BatchableTypeSpec): def value_type(self): return DataFrame.Value if self._ragged_rank > 0 else ops.Tensor - def __init__(self, field, batch_size=None): + def __init__(self, field): """Constructs a type specification for a `tf.RaggedTensor`. Args: field: The field definition. - batch_size: The batch_size of DataFrame. 
""" if field.incomplete: raise ValueError( f'Field {field} is incomplete, please specify dtype and ragged_rank') self._field = field - self._batch_size = batch_size def _serialize(self): return (self._field.dtype, self._field.ragged_rank) @property def _component_specs(self): - return self._field.output_specs(self._batch_size) + return self._field.output_specs def _to_components(self, value): if isinstance(value, DataFrame.Value): @@ -80,7 +79,7 @@ def _to_legacy_output_types(self): return self._field.output_types def _to_legacy_output_shapes(self): - return self._field.output_shapes(self._batch_size) + return self._field.output_shapes def _to_legacy_output_classes(self): return self._field.output_classes @@ -110,13 +109,18 @@ def __init__( self._batch_size = ops.convert_to_tensor( batch_size, dtype=dtypes.int64, name='batch_size') self._fields = fields - self._output_specs = { - f.name: ( - DataFrameValueSpec(f, batch_size if drop_remainder else None) - if f.ragged_rank > 0 - else tensor_spec.TensorSpec( - shape=[batch_size if drop_remainder else None], dtype=f.dtype)) - for f in self._fields} + self._output_specs = {} + for f in self._fields: + item = None + if f.ragged_rank > 0: + item = DataFrameValueSpec(f) + else: + shape = tensor_shape.vector(batch_size if drop_remainder else None) + if f.shape: + shape = shape.concatenate(f.shape) + item = tensor_spec.TensorSpec(shape=shape, dtype=f.dtype) + self._output_specs[f.name] = item + self._field_names = nest.flatten({f.name: f.name for f in self._fields}) self._field_dtypes = nest.flatten({f.name: f.dtype for f in self._fields}) self._field_ragged_ranks = nest.flatten( From 717f7c5e0840566c39739c321de024a88ddcc84f Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Wed, 13 Dec 2023 16:16:52 +0800 Subject: [PATCH 22/45] [Op] Implement of SliceSend/SliceRecv Op. 
(#947) Signed-off-by: chenbangduo.cbd --- tensorflow/core/BUILD | 2 + tensorflow/core/framework/rendezvous.h | 2 + tensorflow/core/graph/graph.cc | 2 + tensorflow/core/graph/graph.h | 12 +- tensorflow/core/grappler/op_types.cc | 8 +- tensorflow/core/grappler/op_types.h | 2 + tensorflow/core/kernels/BUILD | 27 +- tensorflow/core/kernels/slice_sendrecv_ops.cc | 562 ++++++++++++++++++ tensorflow/core/kernels/slice_sendrecv_ops.h | 89 +++ .../core/kernels/slice_sendrecv_ops_test.cc | 339 +++++++++++ tensorflow/core/ops/slice_sendrecv_ops.cc | 78 +++ 11 files changed, 1118 insertions(+), 5 deletions(-) create mode 100644 tensorflow/core/kernels/slice_sendrecv_ops.cc create mode 100644 tensorflow/core/kernels/slice_sendrecv_ops.h create mode 100644 tensorflow/core/kernels/slice_sendrecv_ops_test.cc create mode 100644 tensorflow/core/ops/slice_sendrecv_ops.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index ef1ebcb6dcf..ce6850eb9da 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1237,6 +1237,7 @@ tf_gen_op_libs( "set_ops", "script_ops", "sendrecv_ops", + "slice_sendrecv_ops", "sparse_ops", "spectral_ops", "state_ops", @@ -1497,6 +1498,7 @@ cc_library( ":sdca_ops_op_lib", ":sendrecv_ops_op_lib", ":set_ops_op_lib", + ":slice_sendrecv_ops_op_lib", ":sparse_ops_op_lib", ":star_run_graph_op_op_lib", ":summary_ops_op_lib", diff --git a/tensorflow/core/framework/rendezvous.h b/tensorflow/core/framework/rendezvous.h index 255c0326e02..3c2b20379c8 100644 --- a/tensorflow/core/framework/rendezvous.h +++ b/tensorflow/core/framework/rendezvous.h @@ -80,6 +80,8 @@ class Rendezvous : public core::RefCounted { friend class SendOp; friend class RecvOp; friend class FuseRecvOp; + friend class SliceSendOp; + friend class SliceRecvOp; friend class RefSendOp; friend class RefRecvOp; string buf_; diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index 8ba5d345837..d9709d39f3f 100644 --- a/tensorflow/core/graph/graph.cc +++ 
b/tensorflow/core/graph/graph.cc @@ -69,11 +69,13 @@ const std::unordered_map& Node::kNodeClassTable = {"_Send", NC_SEND}, {"_HostSend", NC_HOST_SEND}, {"_RefSend", NC_REF_SEND}, + {"_SliceSend", NC_SLICE_SEND}, {"_Recv", NC_RECV}, {"_HostRecv", NC_HOST_RECV}, {"_RefRecv", NC_REF_RECV}, {"_FuseRecv", NC_FUSE_RECV}, {"_HostFuseRecv", NC_HOST_FUSE_RECV}, + {"_SliceRecv", NC_SLICE_RECV}, {"Const", NC_CONSTANT}, {"HostConst", NC_CONSTANT}, {"Variable", NC_VARIABLE}, diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 0e7e032c9a5..0baf8f257a9 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -219,12 +219,16 @@ class Node { bool IsControlTrigger() const { return class_ == NC_CONTROL_TRIGGER; } bool IsSend() const { return class_ == NC_SEND || class_ == NC_HOST_SEND || - class_ == NC_REF_SEND; } + class_ == NC_REF_SEND || + class_ == NC_SLICE_SEND; } + bool IsSliceSend() const { return class_ == NC_SLICE_SEND; } bool IsRecv() const { return class_ == NC_RECV || class_ == NC_HOST_RECV || - class_ == NC_REF_RECV; } + class_ == NC_REF_RECV || + class_ == NC_SLICE_RECV; } bool IsFuseRecv() const { return class_ == NC_FUSE_RECV || class_ == NC_HOST_FUSE_RECV; } + bool IsSliceRecv() const {return class_ == NC_SLICE_RECV; } bool IsConstant() const { return class_ == NC_CONSTANT; } bool IsStage() const { return class_ == NC_TENSOR_BUFFER_PUT; } bool IsUnstage() const { return class_ == NC_TENSOR_BUFFER_TAKE; } @@ -334,11 +338,13 @@ class Node { NC_SEND, NC_HOST_SEND, NC_REF_SEND, + NC_SLICE_SEND, NC_RECV, NC_HOST_RECV, NC_REF_RECV, NC_FUSE_RECV, NC_HOST_FUSE_RECV, + NC_SLICE_RECV, NC_CONSTANT, NC_VARIABLE, NC_KV_VAR_HANDLE, @@ -844,7 +850,9 @@ inline bool IsNextIteration(const Node* n) { return n->IsNextIteration(); } inline bool IsLoopCond(const Node* node) { return node->IsLoopCond(); } inline bool IsControlTrigger(const Node* n) { return n->IsControlTrigger(); } inline bool IsSend(const Node* node) { return 
node->IsSend(); } +inline bool IsSliceSend(const Node* node) { return node->IsSliceSend(); } inline bool IsRecv(const Node* node) { return node->IsRecv(); } +inline bool IsSliceRecv(const Node* node) { return node->IsSliceRecv(); } inline bool IsFuseRecv(const Node* node) { return node->IsFuseRecv(); } inline bool IsHostSend(const Node* node) { return node->IsHostSend(); } inline bool IsHostRecv(const Node* node) { return node->IsHostRecv(); } diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc index a3a521fa123..1201623ffcd 100644 --- a/tensorflow/core/grappler/op_types.cc +++ b/tensorflow/core/grappler/op_types.cc @@ -454,7 +454,7 @@ bool IsReciprocalGrad(const NodeDef& node) { } bool IsRecv(const NodeDef& node) { - return node.op() == "_Recv" || node.op() == "_HostRecv"; + return node.op() == "_Recv" || node.op() == "_HostRecv" || IsSliceRecv(node); } bool IsFuseRecv(const NodeDef& node) { @@ -502,7 +502,7 @@ bool IsSelect(const NodeDef& node) { return node.op() == "Select"; } bool IsSeluGrad(const NodeDef& node) { return node.op() == "SeluGrad"; } bool IsSend(const NodeDef& node) { - return node.op() == "_Send" || node.op() == "_HostSend"; + return node.op() == "_Send" || node.op() == "_HostSend" || IsSliceSend(node); } bool IsShape(const NodeDef& node) { return node.op() == "Shape"; } @@ -517,6 +517,10 @@ bool IsSize(const NodeDef& node) { return node.op() == "Size"; } bool IsSlice(const NodeDef& node) { return node.op() == "Slice"; } +bool IsSliceRecv(const NodeDef& node) { return node.op() == "_SliceRecv"; } + +bool IsSliceSend(const NodeDef& node) { return node.op() == "_SliceSend"; } + bool IsSnapshot(const NodeDef& node) { return node.op() == "Snapshot"; } bool IsSoftmax(const NodeDef& node) { return node.op() == "Softmax"; } diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h index 19699ccb933..737581fd412 100644 --- a/tensorflow/core/grappler/op_types.h +++ 
b/tensorflow/core/grappler/op_types.h @@ -167,6 +167,8 @@ bool IsShuffle(const NodeDef& node); bool IsSigmoidGrad(const NodeDef& node); bool IsSize(const NodeDef& node); bool IsSlice(const NodeDef& node); +bool IsSliceRecv(const NodeDef& node); +bool IsSliceSend(const NodeDef& node); bool IsSnapshot(const NodeDef& node); bool IsSoftmax(const NodeDef& node); bool IsSoftplusGrad(const NodeDef& node); diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 0c08c30c30a..36721527cc2 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -5423,8 +5423,9 @@ cc_library( name = "required", deps = [ ":no_op", - ":sendrecv_ops", ":fuserecv_ops", + ":sendrecv_ops", + ":slice_sendrecv_ops", ], ) @@ -5445,6 +5446,12 @@ tf_kernel_library( deps = REQUIRED_DEPS, ) +tf_kernel_library( + name = "slice_sendrecv_ops", + prefix = "slice_sendrecv_ops", + deps = REQUIRED_DEPS, +) + tf_kernel_library( name = "group_embedding_ops", hdrs = ["group_embedding/group_embedding_lookup_sparse_forward_base_ops.h"], @@ -5509,6 +5516,24 @@ tf_cc_test( ], ) +tf_cc_test( + name = "slice_sendrecv_ops_test", + srcs = ["slice_sendrecv_ops_test.cc"], + linkstatic = tf_kernel_tests_linkstatic(), # Required for benchmarking + deps = [ + ":control_flow_ops", + ":cwise_op", + ":logging_ops", + ":ops_testutil", + ":ops_util", + ":slice_sendrecv_ops", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_kernel_library( name = "fuserecv_ops", prefix = "fuserecv_ops", diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.cc b/tensorflow/core/kernels/slice_sendrecv_ops.cc new file mode 100644 index 00000000000..f09f314ae10 --- /dev/null +++ b/tensorflow/core/kernels/slice_sendrecv_ops.cc @@ -0,0 +1,562 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/kernels/slice_sendrecv_ops.h" + +namespace tensorflow { + +//------------------------------------------------------------------------------ +// Utils. +static string GetSliceRendezvousKeyPrefix(const string& send_device, + const string& recv_device, + const uint64 send_device_incarnation, + const string& tensor_name) { + return strings::StrCat(send_device, ";", + strings::FpToString(send_device_incarnation), ";", + recv_device, ";", tensor_name); +} + +static void GetSliceRendezvousKey(const string& key_prefix, + const string& tensor_name_suffix, + const FrameAndIter& frame_iter, string* key) { + key->clear(); + strings::StrAppend(key, key_prefix, tensor_name_suffix, ";", + frame_iter.frame_id, ":", frame_iter.iter_id); +} + +static FrameAndIter GetFrameAndIter(OpKernelContext* ctx, + bool hostmem_sendrecv) { + if (hostmem_sendrecv && ctx->call_frame() != nullptr) { + // Host memory send/recv pairs are added by + // common_runtime/memory_types.cc. When the pair of nodes are + // added inside a function, we need to use the function call frame + // to formulate the unique rendezvous key. + return FrameAndIter(reinterpret_cast(ctx->call_frame()), 0); + } else { + return ctx->frame_iter(); + } +} + +//------------------------------------------------------------------------------ +// Functions of SliceSendOp. 
+ +SliceSendOp::SliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + string send_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device)); + string recv_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device)); + uint64 send_device_incarnation; + OP_REQUIRES_OK( + ctx, ctx->GetAttr("send_device_incarnation", + reinterpret_cast(&send_device_incarnation))); + string tensor_name; + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + key_prefix_ = \ + GetSliceRendezvousKeyPrefix(send_device, recv_device, + send_device_incarnation, tensor_name); + if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { + hostmem_sendrecv_ = false; + } + OP_REQUIRES_OK(ctx, ctx->GetAttr("slice_size", &slice_size_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); +} + +void SliceSendOp::Compute(OpKernelContext* ctx) { + OP_REQUIRES( + ctx, ctx->rendezvous() != nullptr, + errors::Internal("Op kernel context needs to provide a rendezvous.")); + + const Tensor& input_t = ctx->input(0); + FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); + + // send total_bytes. + OP_REQUIRES_OK(ctx, SendTotalBytes(ctx, frame_iter, input_t)); + // if input is dead, only send total_bytes dead tensor. + if (ctx->is_input_dead()) { + return; + } + + // if total bytes is smaller than slice size, send directly. + if (input_t.TotalBytes() <= slice_size_) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->input_alloc_attr(0); + + Rendezvous::ParsedKey parsed_key; + GetSliceRendezvousKey(key_prefix_, "_transfer_data", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + OP_REQUIRES_OK(ctx, ctx->rendezvous()->Send(parsed_key, args, input_t, + ctx->is_input_dead())); + return; + } + + // send shape. + OP_REQUIRES_OK(ctx, SendShape(ctx, frame_iter, input_t)); + + // send data. 
+ if (dtype_ == DT_STRING) { + OP_REQUIRES_OK(ctx, SendString(ctx, frame_iter, input_t)); + } else { + OP_REQUIRES_OK(ctx, SendBasicType(ctx, frame_iter, input_t)); + } +} + +Status SliceSendOp::SendTotalBytes(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& input_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + + Rendezvous::ParsedKey parsed_key; + Tensor total_bytes_t; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, TensorShape({}), + &total_bytes_t)); + total_bytes_t.scalar()() = input_t.TotalBytes(); + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_totalbytes", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + return ctx->rendezvous()->Send(parsed_key, args, total_bytes_t, + ctx->is_input_dead()); +} + +Status SliceSendOp::SendShape(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& input_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + Rendezvous::ParsedKey parsed_key; + + Tensor shape_t; + TensorShape shape = input_t.shape(); + const int rank = shape.dims(); + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, TensorShape({rank}), + &shape_t)); + auto shape_vec = shape_t.vec(); + for (int i = 0; i < rank; i++) { + shape_vec(i) = shape.dim_size(i); + } + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + return ctx->rendezvous()->Send(parsed_key, args, shape_t, + ctx->is_input_dead()); +} + +Status SliceSendOp::SendString(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& input_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = 
AllocatorAttributes(); + Rendezvous::ParsedKey parsed_key; + + // send elements size. + Tensor elements_size_t; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, input_t.shape(), + &elements_size_t)); + int64 num_elements = input_t.NumElements(); + auto input_flat = input_t.flat(); + auto elements_size_flat = elements_size_t.flat(); + for (int64 i = 0; i < num_elements; i++) { + elements_size_flat(i) = input_flat(i).size(); + } + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_elements_size", + frame_iter, &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, elements_size_t, + ctx->is_input_dead())); + + // send data. + args.alloc_attrs = ctx->input_alloc_attr(0); + Tensor data_t; + for (int64 i = 0; i < num_elements; i++) { + const std::string& elem = input_flat(i); + if (elem.size() <= slice_size_) { + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_STRING, TensorShape({}), + &data_t)); + data_t.scalar()() = elem; + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, + ctx->is_input_dead())); + } else { + TF_RETURN_IF_ERROR(SendStringSlice(ctx, frame_iter, elem, i)); + } + } + + return Status::OK(); +} + +Status SliceSendOp::SendStringSlice(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const std::string& elem, int64 index) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->input_alloc_attr(0); + Rendezvous::ParsedKey parsed_key; + + int64 slice_num = (elem.size() + slice_size_ - 1) / slice_size_; + Tensor data_t; + for (int64 i = 0; i < 
slice_num; i++) { + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_STRING, TensorShape({}), &data_t)); + size_t start = i * slice_size_; + size_t copy_size = slice_size_; + if (start > elem.size() - slice_size_) { + copy_size = elem.size() - start; + } + data_t.scalar()() = elem.substr(start, copy_size); + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(index), "_", + std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, + ctx->is_input_dead())); + } + + return Status::OK(); +} + +Status SliceSendOp::SendBasicType(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& input_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->input_alloc_attr(0); + Rendezvous::ParsedKey parsed_key; + + // send data. 
+ Tensor data_t; + int64 bytes_num = input_t.TotalBytes(); + int64 slice_num = (bytes_num + slice_size_ - 1) / slice_size_; + unsigned char* input_base = reinterpret_cast(input_t.data()); + for (int64 i = 0; i < slice_num; i++) { + int64 start = i * slice_size_; + int64 copy_size = slice_size_; + if (start > bytes_num - slice_size_) { + copy_size = bytes_num - start; + } + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT8, TensorShape({copy_size}), + &data_t)); + auto data_base = data_t.data(); + std::memcpy(data_base, input_base+start, copy_size); + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, + ctx->is_input_dead())); + } + + return Status::OK(); +} + +REGISTER_KERNEL_BUILDER(Name("_SliceSend").Device(DEVICE_CPU), SliceSendOp); +REGISTER_KERNEL_BUILDER(Name("_SliceSend").Device(DEVICE_DEFAULT), SliceSendOp); + +//------------------------------------------------------------------------------ +// Functions of SliceRecvOp. 
+ +SliceRecvOp::SliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + string send_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device)); + string recv_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device)); + uint64 send_device_incarnation; + OP_REQUIRES_OK( + ctx, ctx->GetAttr("send_device_incarnation", + reinterpret_cast(&send_device_incarnation))); + string tensor_name; + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + key_prefix_ = \ + GetSliceRendezvousKeyPrefix(send_device, recv_device, + send_device_incarnation, tensor_name); + if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { + hostmem_sendrecv_ = false; + } + OP_REQUIRES_OK(ctx, ctx->GetAttr("slice_size", &slice_size_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_type", &dtype_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("timeout_ms", &timeout_ms_)); +} + +void SliceRecvOp::Compute(OpKernelContext* ctx) { + OP_REQUIRES( + ctx, ctx->rendezvous() != nullptr, + errors::Internal("Op kernel context needs to provide a rendezvous.")); + + FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); + bool is_dead; + + // recv total_bytes. + int64 total_bytes; + OP_REQUIRES_OK(ctx, RecvTotalBytes(ctx, frame_iter, is_dead, total_bytes)); + if (is_dead) { + return; + } + + // if total bytes is smaller than slice size, recv directly. + if (total_bytes <= slice_size_) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->output_alloc_attr(0); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. 
+ args.cancellation_manager = ctx->cancellation_manager(); + } + + Rendezvous::ParsedKey parsed_key; + GetSliceRendezvousKey(key_prefix_, "_transfer_data", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceRecv " << parsed_key.buf_; + OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + Tensor data_t; + OP_REQUIRES_OK(ctx, ctx->rendezvous()->Recv(parsed_key, args, &data_t, + &is_dead, timeout_ms_)); + + // This shouldn't be a dead tensor. + CHECK_EQ(is_dead, false); + ctx->set_output(0, data_t); + return; + } + + // recv shape. + TensorShape shape; + OP_REQUIRES_OK(ctx, RecvShape(ctx, frame_iter, shape)); + + // recv data + Tensor* output_t = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, shape, &output_t)); + if (dtype_ == DT_STRING) { + OP_REQUIRES_OK(ctx, RecvString(ctx, frame_iter, shape, output_t)); + } else { + OP_REQUIRES_OK(ctx, RecvBasicType(ctx, frame_iter, total_bytes, output_t)); + } +} + +Status SliceRecvOp::RecvTotalBytes(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + bool& is_dead, int64& total_bytes) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. 
+ args.cancellation_manager = ctx->cancellation_manager(); + } + + Rendezvous::ParsedKey parsed_key; + Tensor total_bytes_t; + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_totalbytes", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &total_bytes_t, + &is_dead, timeout_ms_)); + if (!is_dead) { + total_bytes = total_bytes_t.scalar()(); + } + + return Status::OK(); +} + +Status SliceRecvOp::RecvShape(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + TensorShape& shape) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + + Rendezvous::ParsedKey parsed_key; + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + + Tensor shape_t; + bool is_dead; + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &shape_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. 
+ CHECK_EQ(is_dead, false); + auto shape_vec = shape_t.vec(); + const int64 num_elements = shape_t.NumElements(); + for (int64 i = 0; i < num_elements; i++) { + shape.AddDim(shape_vec(i)); + } + + return Status::OK(); +} + +Status SliceRecvOp::RecvString(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const TensorShape& shape, Tensor*& output_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + Rendezvous::ParsedKey parsed_key; + bool is_dead; + + // recv elements size. + GetSliceRendezvousKey(key_prefix_, "_slice_transfer_elements_size", + frame_iter, &parsed_key.buf_); + VLOG(2) << "SliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + Tensor elements_size_t; + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &elements_size_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. + CHECK_EQ(is_dead, false); + auto elements_size_flat = elements_size_t.flat(); + int64 num_elements = shape.num_elements(); + args.alloc_attrs = ctx->output_alloc_attr(0); + Tensor data_t; + auto output_flat = output_t->flat(); + for (int64 i = 0; i < num_elements; i++) { + if (elements_size_flat(i) <= slice_size_) { + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. 
+ CHECK_EQ(is_dead, false); + output_flat(i) = data_t.scalar()(); + } else { + TF_RETURN_IF_ERROR(RecvStringSlice(ctx, frame_iter, i, + elements_size_flat(i), output_flat)); + } + } + + return Status::OK(); +} + +Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const int64 index, const int64 element_size, + TTypes::Flat& output_flat) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->output_alloc_attr(0); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + Rendezvous::ParsedKey parsed_key; + + int64 slice_num = (element_size + slice_size_ - 1) / slice_size_; + Tensor data_t; + bool is_dead = false; + for (int64 i = 0; i < slice_num; i++) { + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(index), "_", + std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. + CHECK_EQ(is_dead, false); + output_flat(index) += data_t.scalar()(); + } + + return Status::OK(); +} + +Status SliceRecvOp::RecvBasicType(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const int64 total_bytes, + Tensor*& output_t) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->output_alloc_attr(0); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. 
Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + Rendezvous::ParsedKey parsed_key; + + Tensor data_t; + bool is_dead = false; + int64 slice_num = (total_bytes + slice_size_ - 1) / slice_size_; + unsigned char* output_base = \ + reinterpret_cast(output_t->data()); + for (int64 i = 0; i < slice_num; i++) { + int64 start = i * slice_size_; + int64 copy_size = slice_size_; + if (start > total_bytes - slice_size_) { + copy_size = total_bytes - start; + } + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(i)); + GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, + &parsed_key.buf_); + VLOG(2) << "SliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. + CHECK_EQ(is_dead, false); + auto data_base = data_t.data(); + std::memcpy(output_base+start, data_base, copy_size); + } + + return Status::OK(); + +} + +REGISTER_KERNEL_BUILDER(Name("_SliceRecv").Device(DEVICE_CPU), SliceRecvOp); +REGISTER_KERNEL_BUILDER(Name("_SliceRecv").Device(DEVICE_DEFAULT), SliceRecvOp); + +} // End of namespace tensorflow diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.h b/tensorflow/core/kernels/slice_sendrecv_ops.h new file mode 100644 index 00000000000..df55c080aa1 --- /dev/null +++ b/tensorflow/core/kernels/slice_sendrecv_ops.h @@ -0,0 +1,89 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +class SliceSendOp : public OpKernel { + public: + explicit SliceSendOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + + private: + // Variables. + string key_prefix_; + bool hostmem_sendrecv_; + int32 slice_size_; + DataType dtype_; + + // Functions. + Status SendTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const Tensor& input_t); + + Status SendShape(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const Tensor& input_t); + Status SendString(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const Tensor& input_t); + + Status SendStringSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const std::string& elem, int64 index); + + Status SendBasicType(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const Tensor& input_t); + + TF_DISALLOW_COPY_AND_ASSIGN(SliceSendOp); +}; + +class SliceRecvOp : public OpKernel { + public: + explicit SliceRecvOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + + private: + // Variable. + string key_prefix_; + bool hostmem_sendrecv_; + int32 slice_size_; + int64 timeout_ms_; + DataType dtype_; + + // Fucntions. 
+ Status RecvTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, + bool& is_dead, int64& total_bytes); + + Status RecvShape(OpKernelContext* ctx, const FrameAndIter& frame_iter, + TensorShape& shape); + + Status RecvString(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const TensorShape& shape, Tensor*& output_t); + + Status RecvStringSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const int64 index, const int64 element_size, + TTypes::Flat& output_flat); + + Status RecvBasicType(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const int64 total_bytes, Tensor*& output_t); + + TF_DISALLOW_COPY_AND_ASSIGN(SliceRecvOp); +}; + +} // End of namespace tensorflow + +#endif // End of TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_OPS_H_ diff --git a/tensorflow/core/kernels/slice_sendrecv_ops_test.cc b/tensorflow/core/kernels/slice_sendrecv_ops_test.cc new file mode 100644 index 00000000000..5693ed57918 --- /dev/null +++ b/tensorflow/core/kernels/slice_sendrecv_ops_test.cc @@ -0,0 +1,339 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { + +namespace { +// Implement a trivial version of the Rendezvous interface, to avoid +// clouding the benchmark results with the time spent in the various +// implementations, and to avoid the duplicate-send or duplicate-recv +// errors that would arise from running either benchmark in a loop. +class DummyRendezvous : public Rendezvous { + // Functions. + Status Send(const ParsedKey& key, const Args& args, const Tensor& val, + const bool is_dead) override { + std::string key_str = { key.FullKey().data(), key.FullKey().size() }; + mutex_lock l(mu_); + // consumer does not reach. + if (kv_.count(key_str) == 0) { + struct Var var; + var.type = send; + var.args = args; + var.data = val; + var.is_dead = is_dead; + + kv_[key_str] = var; + return Status::OK(); + } + + auto var = kv_[key_str]; + CHECK_EQ(var.type, recv); + var.done(Status::OK(), args, var.args, val, is_dead); + kv_.erase(key_str); + return Status::OK(); + } + void RecvAsync(const ParsedKey& key, const Args& args, + DoneCallback done) override { + std::string key_str = { key.FullKey().data(), key.FullKey().size() }; + + mutex_lock l(mu_); + // producer does not reach. + if (kv_.count(key_str) == 0) { + struct Var var; + var.type = recv; + var.args = args; + var.done = done; + + kv_[key_str] = var; + return; + } + + // auto var = kv_[key_str]; + auto var = kv_[key_str]; + CHECK_EQ(var.type, send); + done(Status::OK(), var.args, args, var.data, var.is_dead); + kv_.erase(key_str); + } + void StartAbort(const Status& status) override {} + + private: + enum RendezvousType { + send, + recv + }; + // Type define. 
+ struct Var { + RendezvousType type; + Args args; + Tensor data; + bool is_dead; + DoneCallback done; + }; + + // Variables. + mutex mu_; + std::unordered_map kv_ GUARDED_BY(mu_); +}; + +Node* SliceSend(Graph* g, Node* input, const string& tensor, + const string& sender, const uint64 sender_incarnation, + const string& receiver, const int32 slice_size) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_SliceSend") + .Input(input, 0) + .Attr("tensor_name", tensor) + .Attr("send_device", sender) + .Attr("send_device_incarnation", + static_cast(sender_incarnation)) + .Attr("recv_device", receiver) + .Attr("slice_size", slice_size) + .Finalize(g, &ret)); + return ret; +} + +Node* SliceRecv(Graph* g, const string& tensor, const string& type, + const string& sender, const uint64 sender_incarnation, + const string& receiver, const int32 slice_size, + const int64 timeout_ms) { + Node* ret; + DataType dtype; + CHECK(DataTypeFromString(type, &dtype)); + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_SliceRecv") + .Attr("tensor_type", dtype) + .Attr("tensor_name", tensor) + .Attr("send_device", sender) + .Attr("send_device_incarnation", + static_cast(sender_incarnation)) + .Attr("recv_device", receiver) + .Attr("slice_size", slice_size) + .Attr("timeout_ms", timeout_ms) + .Finalize(g, &ret)); + return ret; +} + +Node* Equal(Graph* g, Node* x, Node* y) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Equal") + .Input(x) + .Input(y) + .Finalize(g, &ret)); + return ret; +} + +Node* ReduceAll(Graph* g, Node* input, Node* axes) { + return test::graph::Reduce(g, "All", input, axes); +} + +Node* Assert(Graph* g, Node* condition, + std::vector& data) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Assert") + .Input(condition) + .Input(data) + .Finalize(g, &ret)); + return ret; +} + +static Graph* TransferStringTensor() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 1024; + const int64 timeout_ms = 5000; + std::string str = "The 
quick brown fox jumps over the lazy dog."; // 44 chars. + + Tensor input_t(DT_STRING, TensorShape({2, 4})); + input_t.flat().setConstant(str); // total bytes: 44*8=352 bytes. + Node* input_n = test::graph::Constant(g, input_t); + SliceSend(g, input_n, "T", "/cpu:0", 1, "/cpu:0", slice_size); + Node* recv_n = \ + SliceRecv(g, "T", "string", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + + Node* equal_n = Equal(g, input_n, recv_n); + + Tensor axes_t(DT_INT32, TensorShape({input_t.dims()})); + auto axes_flat = axes_t.flat(); + for (int i = 0; i < input_t.dims(); i++) { + axes_flat(i) = i; + } + Node* reduce_all_n = ReduceAll(g, equal_n, test::graph::Constant(g, axes_t)); + + std::vector data_out; + data_out.emplace_back(input_n, 0); + data_out.emplace_back(recv_n, 0); + Assert(g, reduce_all_n, data_out); + + return g; +} + +static Graph* TransferBasicTypeTensor() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 1024; + const int64 timeout_ms = 5000; + + Tensor input_t(DT_FLOAT, TensorShape({2, 8})); + input_t.flat().setConstant(2); // total bytes = 4*2*8=64 bytes. 
+ Node* input_n = test::graph::Constant(g, input_t); + SliceSend(g, input_n, "T", "/cpu:0", 1, "/cpu:0", slice_size); + Node* recv_n = \ + SliceRecv(g, "T", "float32", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + + Node* equal_n = Equal(g, input_n, recv_n); + + Tensor axes_t(DT_INT32, TensorShape({input_t.dims()})); + auto axes_flat = axes_t.flat(); + for (int i = 0; i < input_t.dims(); i++) { + axes_flat(i) = i; + } + Node* reduce_all_n = ReduceAll(g, equal_n, test::graph::Constant(g, axes_t)); + + std::vector data_out; + data_out.emplace_back(input_n, 0); + data_out.emplace_back(recv_n, 0); + Assert(g, reduce_all_n, data_out); + + return g; +} + +static Graph* TransferBigStringTensor() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 16; + const int64 timeout_ms = 5000; + std::string str = "The quick brown fox jumps over the lazy dog."; // 44 chars. + + Tensor input_t(DT_STRING, TensorShape({2, 4})); + input_t.flat().setConstant(str); + input_t.flat()(0) = "short str"; + Node* input_n = \ + test::graph::Constant(g, input_t); // total bytes: 44*7+9=317 bytes. 
+ SliceSend(g, input_n, "T", "/cpu:0", 1, "/cpu:0", slice_size); + Node* recv_n = \ + SliceRecv(g, "T", "string", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + + Node* equal_n = Equal(g, input_n, recv_n); + + Tensor axes_t(DT_INT32, TensorShape({input_t.dims()})); + auto axes_flat = axes_t.flat(); + for (int i = 0; i < input_t.dims(); i++) { + axes_flat(i) = i; + } + Node* reduce_all_n = ReduceAll(g, equal_n, test::graph::Constant(g, axes_t)); + + std::vector data_out; + data_out.emplace_back(input_n, 0); + data_out.emplace_back(recv_n, 0); + Assert(g, reduce_all_n, data_out); + + return g; +} + +static Graph* TransferBigBasicTypeTensor() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 16; + const int64 timeout_ms = 5000; + + Tensor input_t(DT_FLOAT, TensorShape({2, 8})); + input_t.flat().setConstant(2); // total bytes: 4*2*8=64 + Node* input_n = test::graph::Constant(g, input_t); + SliceSend(g, input_n, "T", "/cpu:0", 1, "/cpu:0", slice_size); + Node* recv_n = \ + SliceRecv(g, "T", "float32", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + + Node* equal_n = Equal(g, input_n, recv_n); + + Tensor axes_t(DT_INT32, TensorShape({input_t.dims()})); + auto axes_flat = axes_t.flat(); + for (int i = 0; i < input_t.dims(); i++) { + axes_flat(i) = i; + } + Node* reduce_all_n = ReduceAll(g, equal_n, test::graph::Constant(g, axes_t)); + + std::vector data_out; + data_out.emplace_back(input_n, 0); + data_out.emplace_back(recv_n, 0); + Assert(g, reduce_all_n, data_out); + + return g; +} + +static Graph* TransferDeadTensor() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 1024; + const int64 timeout_ms = 5000; + + // val + Tensor val_t(DT_FLOAT, TensorShape({})); + val_t.scalar()() = 2; + Node* val_n = test::graph::Constant(g, val_t); + + Tensor pred_t(DT_BOOL, TensorShape({})); + pred_t.scalar()() = true; + Node* pred_n = test::graph::Constant(g, pred_t); + + Node* switch_n = test::graph::Switch(g, val_n, pred_n); + 
SliceSend(g, switch_n, "T", "/cpu:0", 1, "/cpu:0", slice_size); + SliceRecv(g, "T", "float32", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + + return g; +} + +static void BM_TransferStringTensor(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferStringTensor(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_TransferBasicTypeTensor(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferBasicTypeTensor(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_TransferBigStringTensor(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferBigStringTensor(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_TransferBigBasicTypeTensor(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferBigBasicTypeTensor(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_TransferDeadTensor(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferDeadTensor(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +BENCHMARK(BM_TransferStringTensor); +BENCHMARK(BM_TransferBasicTypeTensor); +BENCHMARK(BM_TransferBigStringTensor); +BENCHMARK(BM_TransferBigBasicTypeTensor); +BENCHMARK(BM_TransferDeadTensor); + +} // End of anonymous namespace + +} // End of namespace tensorflow diff --git a/tensorflow/core/ops/slice_sendrecv_ops.cc b/tensorflow/core/ops/slice_sendrecv_ops.cc new file mode 100644 index 00000000000..11905712410 --- /dev/null +++ b/tensorflow/core/ops/slice_sendrecv_ops.cc @@ -0,0 +1,78 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/common_shape_fns.h" + +namespace tensorflow { + +REGISTER_OP("_SliceSend") + .Input("tensor: T") + .Attr("T: type") + .Attr("tensor_name: string") + .Attr("send_device: string") + .Attr("send_device_incarnation: int") + .Attr("recv_device: string") + .Attr("client_terminated: bool = false") + .Attr("slice_size: int >= 1") + .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +Sends the named tensor from send_device to recv_device. +Supports sending the tensor of any size. + +tensor: The tensor to send. +tensor_name: The name of the tensor to send. +send_device: The name of the device sending the tensor. +send_device_incarnation: The current incarnation of send_device. +recv_device: The name of the device receiving the tensor. +client_terminated: If set to true, this indicates that the node was added + to the graph as a result of a client-side feed or fetch of Tensor data, + in which case the corresponding send or recv is expected to be managed + locally by the caller. +slice_size: The maximum number of bytes transferred at one time. 
+)doc"); + +REGISTER_OP("_SliceRecv") + .Output("tensor: tensor_type") + .Attr("tensor_type: type") + .Attr("tensor_name: string") + .Attr("send_device: string") + .Attr("send_device_incarnation: int") + .Attr("recv_device: string") + .Attr("client_terminated: bool = false") + .Attr("slice_size: int >= 1") + .Attr("timeout_ms: int >= 0 = 300000") + .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +Receives the named tensor from send_device on recv_device. +Supports recving the tensor of any size. + +tensor: The tensor to receive. +tensor_name: The name of the tensor to receive. +send_device: The name of the device sending the tensor. +send_device_incarnation: The current incarnation of send_device. +recv_device: The name of the device receiving the tensor. +client_terminated: If set to true, this indicates that the node was added + to the graph as a result of a client-side feed or fetch of Tensor data, + in which case the corresponding send or recv is expected to be managed + locally by the caller. +slice_size: The maximum number of bytes transferred at one time. +timeout_ms: The maximum wait time for receiving a tensor. +)doc"); + +} // End of namespace tensorflow From 6bf562197efaedccc8026d1d05ac23e27d3b2521 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 20 Dec 2023 15:47:52 +0800 Subject: [PATCH 23/45] [Embedding] undefine EV GPU interface in CPU compile. 
(#956) Signed-off-by: candy.dc --- .../core/framework/embedding/embedding_var.h | 91 +++++++++---------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index df6ae6f1277..c0d26a2f4d8 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -140,13 +140,6 @@ class EmbeddingVar : public ResourceBase { return storage_->Get(key, value_ptr); } - void BatchLookupKey(const EmbeddingVarContext& ctx, - const K* keys, - void** value_ptr_list, - int64 num_of_keys) { - storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys); - } - Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, bool indices_as_pointer, int64 count = 1) { @@ -167,45 +160,6 @@ class EmbeddingVar : public ResourceBase { return Status::OK(); } - Status LookupOrCreateKey(const EmbeddingVarContext& context, - const K* keys, - void** value_ptrs, - int64 num_of_keys, - int64* indices_counts, - bool indices_as_pointer = false) { - if (indices_as_pointer) { - auto lookup_key_and_set_version_fn = [keys, value_ptrs] - (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - value_ptrs[i] = (void*)keys[i]; - } - }; - const int64 unit_cost = 1000; //very unreliable estimate for cost per step. - auto worker_threads = context.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, num_of_keys, unit_cost, - lookup_key_and_set_version_fn); - } else { - filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); - } - - if (indices_counts != nullptr) { - auto add_freq_fn = [this, value_ptrs, indices_counts] - (int64 start, int64 limit) { - for (int i = start; i < limit; i++) { - feat_desc_->AddFreq(value_ptrs[i], indices_counts[i]); - } - }; - const int64 unit_cost = 1000; //very unreliable estimate for cost per step. 
- auto worker_threads = context.worker_threads; - Shard(worker_threads->num_threads, - worker_threads->workers, num_of_keys, unit_cost, - add_freq_fn); - } - return Status::OK(); - } - - Status LookupOrCreateKey(K key, void** value_ptr) { Status s = storage_->GetOrCreate(key, value_ptr); TF_CHECK_OK(s); @@ -402,6 +356,51 @@ class EmbeddingVar : public ResourceBase { storage_->AddToCache(keys_tensor); } + + void BatchLookupKey(const EmbeddingVarContext& ctx, + const K* keys, + void** value_ptr_list, + int64 num_of_keys) { + storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys); + } + + Status LookupOrCreateKey(const EmbeddingVarContext& context, + const K* keys, + void** value_ptrs, + int64 num_of_keys, + int64* indices_counts, + bool indices_as_pointer = false) { + if (indices_as_pointer) { + auto lookup_key_and_set_version_fn = [keys, value_ptrs] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + value_ptrs[i] = (void*)keys[i]; + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + lookup_key_and_set_version_fn); + } else { + filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); + } + + if (indices_counts != nullptr) { + auto add_freq_fn = [this, value_ptrs, indices_counts] + (int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + feat_desc_->AddFreq(value_ptrs[i], indices_counts[i]); + } + }; + const int64 unit_cost = 1000; //very unreliable estimate for cost per step. 
+ auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, + worker_threads->workers, num_of_keys, unit_cost, + add_freq_fn); + } + return Status::OK(); + } #endif #if GOOGLE_CUDA From 0f536a2849528e2c25dd7f496a00d810acd5e72c Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Tue, 26 Dec 2023 16:14:06 +0800 Subject: [PATCH 24/45] [Op] Implement FileSliceSend/FileSliceRecvOp. (#960) FileSliceSend/FileSliceRecv Op transfer scalar string Tensor to/from SliceRecv/SliceSend Op. Signed-off-by: chenbangduo.cbd --- tensorflow/core/BUILD | 2 + tensorflow/core/framework/rendezvous.h | 2 + tensorflow/core/graph/graph.cc | 2 + tensorflow/core/graph/graph.h | 12 +- tensorflow/core/grappler/op_types.cc | 10 +- tensorflow/core/grappler/op_types.h | 2 + tensorflow/core/kernels/BUILD | 46 +- .../core/kernels/file_slice_sendrecv_ops.cc | 482 +++++++++++++++++ .../core/kernels/file_slice_sendrecv_ops.h | 98 ++++ .../kernels/file_slice_sendrecv_ops_test.cc | 483 ++++++++++++++++++ tensorflow/core/kernels/slice_sendrecv_ops.cc | 175 +++---- tensorflow/core/kernels/slice_sendrecv_ops.h | 6 +- .../core/kernels/slice_sendrecv_utils.cc | 53 ++ .../core/kernels/slice_sendrecv_utils.h | 41 ++ .../core/ops/file_slice_sendrecv_ops.cc | 77 +++ 15 files changed, 1388 insertions(+), 103 deletions(-) create mode 100644 tensorflow/core/kernels/file_slice_sendrecv_ops.cc create mode 100644 tensorflow/core/kernels/file_slice_sendrecv_ops.h create mode 100644 tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc create mode 100644 tensorflow/core/kernels/slice_sendrecv_utils.cc create mode 100644 tensorflow/core/kernels/slice_sendrecv_utils.h create mode 100644 tensorflow/core/ops/file_slice_sendrecv_ops.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index ce6850eb9da..07115cfea3c 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1203,6 +1203,7 @@ tf_gen_op_libs( "encode_proto_ops", "experimental_dataset_ops", "feature_column_ops", + 
"file_slice_sendrecv_ops", "function_ops", "functional_ops", "fused_embedding_ops", @@ -1465,6 +1466,7 @@ cc_library( ":encode_proto_ops_op_lib", ":experimental_dataset_ops_op_lib", ":feature_column_ops_op_lib", + ":file_slice_sendrecv_ops_op_lib", ":function_ops_op_lib", ":functional_ops_op_lib", ":fused_embedding_ops_op_lib", diff --git a/tensorflow/core/framework/rendezvous.h b/tensorflow/core/framework/rendezvous.h index 3c2b20379c8..3aa65534272 100644 --- a/tensorflow/core/framework/rendezvous.h +++ b/tensorflow/core/framework/rendezvous.h @@ -82,6 +82,8 @@ class Rendezvous : public core::RefCounted { friend class FuseRecvOp; friend class SliceSendOp; friend class SliceRecvOp; + friend class FileSliceSendOp; + friend class FileSliceRecvOp; friend class RefSendOp; friend class RefRecvOp; string buf_; diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index d9709d39f3f..59b25ee7c36 100644 --- a/tensorflow/core/graph/graph.cc +++ b/tensorflow/core/graph/graph.cc @@ -70,12 +70,14 @@ const std::unordered_map& Node::kNodeClassTable = {"_HostSend", NC_HOST_SEND}, {"_RefSend", NC_REF_SEND}, {"_SliceSend", NC_SLICE_SEND}, + {"_FileSliceSend", NC_FILE_SLICE_SEND}, {"_Recv", NC_RECV}, {"_HostRecv", NC_HOST_RECV}, {"_RefRecv", NC_REF_RECV}, {"_FuseRecv", NC_FUSE_RECV}, {"_HostFuseRecv", NC_HOST_FUSE_RECV}, {"_SliceRecv", NC_SLICE_RECV}, + {"_FileSliceRecv", NC_FILE_SLICE_RECV}, {"Const", NC_CONSTANT}, {"HostConst", NC_CONSTANT}, {"Variable", NC_VARIABLE}, diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 0baf8f257a9..bd6d18cfc7c 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -220,15 +220,19 @@ class Node { bool IsSend() const { return class_ == NC_SEND || class_ == NC_HOST_SEND || class_ == NC_REF_SEND || - class_ == NC_SLICE_SEND; } + class_ == NC_SLICE_SEND || + class_ == NC_FILE_SLICE_SEND; } bool IsSliceSend() const { return class_ == NC_SLICE_SEND; } + bool IsFileSliceSend() 
const { return class_ == NC_FILE_SLICE_SEND; } bool IsRecv() const { return class_ == NC_RECV || class_ == NC_HOST_RECV || class_ == NC_REF_RECV || - class_ == NC_SLICE_RECV; } + class_ == NC_SLICE_RECV || + class_ == NC_FILE_SLICE_RECV; } bool IsFuseRecv() const { return class_ == NC_FUSE_RECV || class_ == NC_HOST_FUSE_RECV; } bool IsSliceRecv() const {return class_ == NC_SLICE_RECV; } + bool IsFileSliceRecv() const { return class_ == NC_FILE_SLICE_RECV; } bool IsConstant() const { return class_ == NC_CONSTANT; } bool IsStage() const { return class_ == NC_TENSOR_BUFFER_PUT; } bool IsUnstage() const { return class_ == NC_TENSOR_BUFFER_TAKE; } @@ -339,12 +343,14 @@ class Node { NC_HOST_SEND, NC_REF_SEND, NC_SLICE_SEND, + NC_FILE_SLICE_SEND, NC_RECV, NC_HOST_RECV, NC_REF_RECV, NC_FUSE_RECV, NC_HOST_FUSE_RECV, NC_SLICE_RECV, + NC_FILE_SLICE_RECV, NC_CONSTANT, NC_VARIABLE, NC_KV_VAR_HANDLE, @@ -851,8 +857,10 @@ inline bool IsLoopCond(const Node* node) { return node->IsLoopCond(); } inline bool IsControlTrigger(const Node* n) { return n->IsControlTrigger(); } inline bool IsSend(const Node* node) { return node->IsSend(); } inline bool IsSliceSend(const Node* node) { return node->IsSliceSend(); } +inline bool IsFileSliceSend(const Node* node) { return node->IsFileSliceSend(); } inline bool IsRecv(const Node* node) { return node->IsRecv(); } inline bool IsSliceRecv(const Node* node) { return node->IsSliceRecv(); } +inline bool IsFileSliceRecv(const Node* node) { return node->IsFileSliceRecv(); } inline bool IsFuseRecv(const Node* node) { return node->IsFuseRecv(); } inline bool IsHostSend(const Node* node) { return node->IsHostSend(); } inline bool IsHostRecv(const Node* node) { return node->IsHostRecv(); } diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc index 1201623ffcd..fd72927bd79 100644 --- a/tensorflow/core/grappler/op_types.cc +++ b/tensorflow/core/grappler/op_types.cc @@ -265,6 +265,10 @@ bool IsExp(const NodeDef& node) { 
return node.op() == "Exp"; } bool IsFakeParam(const NodeDef& node) { return node.op() == "FakeParam"; } +bool IsFileSliceRecv(const NodeDef& node) { return node.op() == "_FileSliceRecv"; } + +bool IsFileSliceSend(const NodeDef& node) { return node.op() == "_FileSliceSend"; } + bool IsFill(const NodeDef& node) { return node.op() == "Fill"; } bool IsFloorDiv(const NodeDef& node) { return node.op() == "FloorDiv"; } @@ -454,7 +458,8 @@ bool IsReciprocalGrad(const NodeDef& node) { } bool IsRecv(const NodeDef& node) { - return node.op() == "_Recv" || node.op() == "_HostRecv" || IsSliceRecv(node); + return node.op() == "_Recv" || node.op() == "_HostRecv" || + IsSliceRecv(node) || IsFileSliceRecv(node); } bool IsFuseRecv(const NodeDef& node) { @@ -502,7 +507,8 @@ bool IsSelect(const NodeDef& node) { return node.op() == "Select"; } bool IsSeluGrad(const NodeDef& node) { return node.op() == "SeluGrad"; } bool IsSend(const NodeDef& node) { - return node.op() == "_Send" || node.op() == "_HostSend" || IsSliceSend(node); + return node.op() == "_Send" || node.op() == "_HostSend" || + IsSliceSend(node) || IsFileSliceSend(node); } bool IsShape(const NodeDef& node) { return node.op() == "Shape"; } diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h index 737581fd412..10968ad2547 100644 --- a/tensorflow/core/grappler/op_types.h +++ b/tensorflow/core/grappler/op_types.h @@ -80,6 +80,8 @@ bool IsExit(const NodeDef& node); bool IsExp(const NodeDef& node); bool IsFakeParam(const NodeDef& node); bool IsFill(const NodeDef& node); +bool IsFileSliceRecv(const NodeDef& node); +bool IsFileSliceSend(const NodeDef& node); bool IsFloorDiv(const NodeDef& node); bool IsFloorMod(const NodeDef& node); bool IsFusedBatchNorm(const NodeDef& node); diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 36721527cc2..4e6868a9897 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -5423,6 +5423,7 @@ cc_library( name = 
"required", deps = [ ":no_op", + ":file_slice_sendrecv_ops", ":fuserecv_ops", ":sendrecv_ops", ":slice_sendrecv_ops", @@ -5446,10 +5447,33 @@ tf_kernel_library( deps = REQUIRED_DEPS, ) +cc_library( + name = "slice_sendrecv_utils", + hdrs = [ + "slice_sendrecv_utils.h" + ], + srcs = [ + "slice_sendrecv_utils.cc", + ], + deps = [ + "//tensorflow/core:framework", + ] +) + tf_kernel_library( name = "slice_sendrecv_ops", prefix = "slice_sendrecv_ops", - deps = REQUIRED_DEPS, + deps = REQUIRED_DEPS + [ + ":slice_sendrecv_utils", + ], +) + +tf_kernel_library( + name = "file_slice_sendrecv_ops", + prefix = "file_slice_sendrecv_ops", + deps = REQUIRED_DEPS + [ + ":slice_sendrecv_utils", + ], ) tf_kernel_library( @@ -5534,6 +5558,26 @@ tf_cc_test( ], ) +tf_cc_test( + name = "file_slice_sendrecv_ops_test", + srcs = ["file_slice_sendrecv_ops_test.cc"], + linkstatic = tf_kernel_tests_linkstatic(), # Required for benchmarking + deps = [ + ":control_flow_ops", + ":cwise_op", + ":file_slice_sendrecv_ops", + ":logging_ops", + ":ops_testutil", + ":ops_util", + ":slice_sendrecv_ops", + ":whole_file_read_ops", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_kernel_library( name = "fuserecv_ops", prefix = "fuserecv_ops", diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops.cc b/tensorflow/core/kernels/file_slice_sendrecv_ops.cc new file mode 100644 index 00000000000..6bfe54363f9 --- /dev/null +++ b/tensorflow/core/kernels/file_slice_sendrecv_ops.cc @@ -0,0 +1,482 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/kernels/file_slice_sendrecv_ops.h" +#include "tensorflow/core/kernels/slice_sendrecv_utils.h" +#include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { + +//------------------------------------------------------------------------------ +// Functions of FileSliceSendOp. + +FileSliceSendOp::FileSliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + string send_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device)); + string recv_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device)); + uint64 send_device_incarnation; + OP_REQUIRES_OK( + ctx, ctx->GetAttr("send_device_incarnation", + reinterpret_cast(&send_device_incarnation))); + string tensor_name; + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + key_prefix_ = \ + slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, + recv_device, send_device_incarnation, tensor_name); + + if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { + hostmem_sendrecv_ = false; + } + OP_REQUIRES_OK(ctx, ctx->GetAttr("slice_size", &slice_size_)); +} + +void FileSliceSendOp::Compute(OpKernelContext* ctx) { + OP_REQUIRES(ctx, ctx->rendezvous() != nullptr, + errors::Internal("Op kernel context needs to provide a rendezvous.")); + + const Tensor& file_path_t = ctx->input(0); + if (!ctx->is_input_dead()) { + OP_REQUIRES(ctx, 
TensorShapeUtils::IsScalar(file_path_t.shape()), + errors::InvalidArgument("file_path is not a scalar: ", + file_path_t.shape().DebugString())); + } + + FrameAndIter frame_iter = \ + slice_sendrecv::GetFrameAndIter(ctx, hostmem_sendrecv_); + + // get element_bytes. + uint64 element_bytes = 0; + OP_REQUIRES_OK(ctx, GetElementBytes(ctx, file_path_t, element_bytes)); + + // send total_bytes. + // total_bytes is the TotalBytes of the Tensor that contains the contents of + // the file. please refer Tensor::TotalBytes() + uint64 total_bytes = element_bytes + sizeof(tstring); + OP_REQUIRES_OK(ctx, SendTotalBytes(ctx, frame_iter, total_bytes)); + // if input is dead, only send total_bytes dead tensor. + if (ctx->is_input_dead()) { + return; + } + + // if total bytes is smaller than slice size, send directly. + if (total_bytes <= slice_size_) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->input_alloc_attr(0); + + Rendezvous::ParsedKey parsed_key; + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_transfer_data", + frame_iter, &parsed_key.buf_); + VLOG(2) << "FileSliceSend " << parsed_key.buf_; + OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + Tensor data_t; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DT_STRING, TensorShape({}), &data_t)); + if (element_bytes > 0) { + OP_REQUIRES_OK(ctx, ReadFileToString(Env::Default(), + file_path_t.scalar()(), data_t.scalar().data())); + } + OP_REQUIRES_OK(ctx, ctx->rendezvous()->Send(parsed_key,args, data_t, + ctx->is_input_dead())); + return; + } + + // send shape, in order to match the behavior of 'SliceSend'. + OP_REQUIRES_OK(ctx, SendScalarShape(ctx, frame_iter)); + + // send element bytes, in order to match the behavior of 'SliceSend'. + OP_REQUIRES_OK(ctx, SendElementBytes(ctx, frame_iter, element_bytes)); + + // send data. 
+ OP_REQUIRES_OK(ctx, SendFileSlice(ctx, frame_iter, file_path_t, element_bytes)); +} + +Status FileSliceSendOp::GetElementBytes(OpKernelContext* ctx, + const Tensor& file_path_t, + uint64& element_bytes) { + + if (ctx->is_input_dead()) { + element_bytes = 0; + return Status::OK(); + } + + const string& file_path = file_path_t.scalar()(); + Env* env = Env::Default(); + + if (env->FileExists(file_path) != Status::OK()) { + element_bytes = 0; + return Status::OK(); + } + + return env->GetFileSize(file_path, &element_bytes); +} + +Status FileSliceSendOp::SendUInt64MetaMsg(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const string& name, + const uint64 val) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + + Rendezvous::ParsedKey parsed_key; + Tensor val_t; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_UINT64, TensorShape({}), &val_t)); + val_t.scalar()() = val; + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, name, frame_iter, + &parsed_key.buf_); + VLOG(2) << "FileSliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + return ctx->rendezvous()->Send(parsed_key, args, val_t, ctx->is_input_dead()); +} + +Status FileSliceSendOp::SendTotalBytes(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const uint64 total_bytes) { + return SendUInt64MetaMsg(ctx, frame_iter, "_slice_transfer_totalbytes", + total_bytes); +} + +Status FileSliceSendOp::SendScalarShape(OpKernelContext* ctx, + const FrameAndIter& frame_iter) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + Rendezvous::ParsedKey parsed_key; + + Tensor shape_t; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, TensorShape({0}), &shape_t)); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, + "_slice_transfer_shape", frame_iter, &parsed_key.buf_); + VLOG(2) << "FileSliceSend " << parsed_key.buf_; + 
TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + + return ctx->rendezvous()->Send(parsed_key, args, shape_t, + ctx->is_input_dead()); +} + +Status FileSliceSendOp::SendElementBytes(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const uint64 element_bytes) { + return SendUInt64MetaMsg(ctx, frame_iter, "_slice_transfer_elements_bytes", + element_bytes); +} + +Status FileSliceSendOp::SendFileSlice(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const Tensor& file_path_t, + const uint64 element_bytes) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + Rendezvous::ParsedKey parsed_key; + + std::unique_ptr file; + Env* env = Env::Default(); + const string& file_path = file_path_t.scalar()(); + TF_RETURN_IF_ERROR(env->NewRandomAccessFile(file_path, &file)); + + // Slice Send. + int64 slice_num = element_bytes / slice_size_; + if (element_bytes % slice_size_ != 0) { + slice_num += 1; + } + Tensor data_t; + for (int64 i = 0; i < slice_num; i++) { + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_STRING, TensorShape({}), &data_t)); + uint64 start = i * slice_size_; + uint64 copy_size = slice_size_; + if (start > element_bytes - slice_size_) { + copy_size = element_bytes - start; + } + TF_RETURN_IF_ERROR(ReadFileSlice(file, start, copy_size, data_t)); + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(0), "_", + std::to_string(i)); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); + VLOG(2) << "FileSliceSend " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, + ctx->is_input_dead())); + } + + + return Status::OK(); +} + +Status FileSliceSendOp::ReadFileSlice( + const std::unique_ptr& file, + const uint64 pos, const uint64 offset, + Tensor& data_t) { + 
string* data_s = data_t.scalar().data(); + gtl::STLStringResizeUninitialized(data_s, offset); + char* data_p = gtl::string_as_array(data_s); + StringPiece result; + TF_RETURN_IF_ERROR(file->Read(pos, offset, &result, data_p)); + if (result.data() != data_p) { + memmove(data_p, result.data(), result.size()); + } + + return Status::OK(); +} + +REGISTER_KERNEL_BUILDER(Name("_FileSliceSend").Device(DEVICE_CPU), + FileSliceSendOp); +REGISTER_KERNEL_BUILDER(Name("_FileSliceSend").Device(DEVICE_DEFAULT), + FileSliceSendOp); + +//------------------------------------------------------------------------------ +// Functions of FileSliceRecvOp. + +FileSliceRecvOp::FileSliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + string send_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device)); + string recv_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device)); + uint64 send_device_incarnation; + OP_REQUIRES_OK( + ctx, ctx->GetAttr("send_device_incarnation", + reinterpret_cast(&send_device_incarnation))); + string tensor_name; + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + key_prefix_ = \ + slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, + recv_device, send_device_incarnation, tensor_name); + if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { + hostmem_sendrecv_ = false; + } + OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_dir", &recv_dir_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("slice_size", &slice_size_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("timeout_ms", &timeout_ms_)); +} + +void FileSliceRecvOp::Compute(OpKernelContext* ctx) { + OP_REQUIRES(ctx, ctx->rendezvous() != nullptr, + errors::Internal("Op kernel context needs to provide a rendezvous.")); + + FrameAndIter frame_iter = \ + slice_sendrecv::GetFrameAndIter(ctx, hostmem_sendrecv_); + + bool is_dead = false; + uint64 total_bytes = 0; + OP_REQUIRES_OK(ctx, RecvTotalBytes(ctx, frame_iter, is_dead, total_bytes)); + if (is_dead) { + return; + } + + // 
Create file path output. + Env* env = Env::Default(); + if (!env->FileExists(recv_dir_).ok()) { + OP_REQUIRES_OK(ctx, env->RecursivelyCreateDir(recv_dir_)); + } + const string &filename = GenerateRecvFileName(ctx->op_kernel().name()); + const string &file_path = io::JoinPath(recv_dir_, "tempfilerecv-"+filename); + Tensor* file_path_t = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &file_path_t)); + file_path_t->scalar()() = file_path; + + // if total bytes is smaller than slice size, recv directly. + if (total_bytes <= slice_size_) { + OP_REQUIRES_OK(ctx, RecvFile(ctx, frame_iter, file_path)); + return; + } + + // recv shape, in order to match the behavior of 'SliceRecv'. + TensorShape shape; + OP_REQUIRES_OK(ctx, RecvShape(ctx, frame_iter, shape)); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(shape), + errors::InvalidArgument( + "FileSliceRecv only supports receiving a tensor with a scalar shape.")); + + // recv element_bytes, in order to match the behavior of 'SliceRecv'. + uint64 element_bytes = 0; + OP_REQUIRES_OK(ctx, RecvElementBytes(ctx, frame_iter, element_bytes)); + + // recv data. + OP_REQUIRES_OK(ctx, RecvFileSlice(ctx, frame_iter, element_bytes, file_path)); +} + +Status FileSliceRecvOp::RecvUInt64MetaMsg(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const string& name, bool &is_dead, + uint64& val) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. 
+ args.cancellation_manager = ctx->cancellation_manager(); + } + + Rendezvous::ParsedKey parsed_key; + Tensor val_t; + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, name, frame_iter, + &parsed_key.buf_); + VLOG(2) << "FileSliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->Recv(parsed_key, args, &val_t, &is_dead, timeout_ms_)); + if (!is_dead) { + val = val_t.scalar()(); + } + + return Status::OK(); +} + +Status FileSliceRecvOp::RecvTotalBytes(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + bool& is_dead, uint64& total_bytes) { + return RecvUInt64MetaMsg(ctx, frame_iter, "_slice_transfer_totalbytes", + is_dead, total_bytes); +} + +string FileSliceRecvOp::GenerateRecvFileName(const string& op_name) { + const std::vector& file_name_vec = absl::StrSplit(op_name, "/"); + return absl::StrJoin(file_name_vec, "_"); +} + +Status FileSliceRecvOp::RecvShape(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + TensorShape& shape) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = AllocatorAttributes(); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + + Rendezvous::ParsedKey parsed_key; + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", + frame_iter, &parsed_key.buf_); + VLOG(2) << "FileSliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + + Tensor shape_t; + bool is_dead; + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &shape_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. 
+ CHECK_EQ(is_dead, false); + auto shape_vec = shape_t.vec(); + const int64 num_elements = shape_t.NumElements(); + for (int64 i = 0; i < num_elements; i++) { + shape.AddDim(shape_vec(i)); + } + + return Status::OK(); +} + +Status FileSliceRecvOp::RecvElementBytes(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + uint64& element_bytes) { + bool is_dead = false; + Status s = \ + RecvUInt64MetaMsg(ctx, frame_iter, "_slice_transfer_elements_bytes", is_dead, + element_bytes); + CHECK_EQ(is_dead, false); + + return s; +} + +Status FileSliceRecvOp::RecvFile(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const string& file_path) { + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->output_alloc_attr(0); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + + Rendezvous::ParsedKey parsed_key; + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_transfer_data", + frame_iter, &parsed_key.buf_); + VLOG(2) << "FileSliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + Tensor data_t; + bool is_dead = false; + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, + &is_dead, timeout_ms_)); + + // This shouldn't be a dead tensor. + CHECK_EQ(is_dead, false); + + // Write data_t to file. 
+ Env* env = Env::Default(); + return WriteStringToFile(env, file_path, data_t.scalar()()); +} + +Status FileSliceRecvOp::RecvFileSlice(OpKernelContext* ctx, + const FrameAndIter& frame_iter, + const uint64 element_bytes, + const string& file_path) { + // create file + Env* env = Env::Default(); + std::unique_ptr file_ptr; + TF_RETURN_IF_ERROR(env->NewWritableFile(file_path, &file_ptr)); + + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->output_alloc_attr(0); + if (ctx->is_eager()) { + // NOTE(fishx): Only set cancellation_manager in eager mode. Because in + // Tensorflow 1.x, session (or graph_mgr) will abort the underlying + // rendezvous if it encounters any error. + args.cancellation_manager = ctx->cancellation_manager(); + } + Rendezvous::ParsedKey parsed_key; + + int64 slice_num = element_bytes / slice_size_; + if (element_bytes % slice_size_ != 0) { + slice_num += 1; + } + Tensor data_t; + bool is_dead = false; + for (int64 i = 0; i < slice_num; i++) { + std::string tensor_name_suffix = \ + strings::StrCat("_slice_transfer_data_", std::to_string(0), "_", + std::to_string(i)); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); + VLOG(2) << "FileSliceRecv " << parsed_key.buf_; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, + &is_dead, timeout_ms_)); + // This shouldn't be a dead tensor. 
+ CHECK_EQ(is_dead, false); + file_ptr->Append(data_t.scalar()()); + } + + return Status::OK(); +} + +REGISTER_KERNEL_BUILDER(Name("_FileSliceRecv").Device(DEVICE_CPU), + FileSliceRecvOp); +REGISTER_KERNEL_BUILDER(Name("_FileSliceRecv").Device(DEVICE_DEFAULT), + FileSliceRecvOp); + +}; // End of namespace tensorflow diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops.h b/tensorflow/core/kernels/file_slice_sendrecv_ops.h new file mode 100644 index 00000000000..6701196d481 --- /dev/null +++ b/tensorflow/core/kernels/file_slice_sendrecv_ops.h @@ -0,0 +1,98 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_FILE_SLICE_SENDRECV_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_FILE_SLICE_SENDRECV_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +class FileSliceSendOp : public OpKernel { + public: + explicit FileSliceSendOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + + private: + // Variables. + string key_prefix_; + bool hostmem_sendrecv_; + int32 slice_size_; + + // Functions. 
+ Status GetElementBytes(OpKernelContext* ctx, const Tensor& file_path_t, + uint64& element_bytes); + + Status SendUInt64MetaMsg(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const string& name, const uint64 val); + + Status SendTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const uint64 total_bytes); + + Status SendScalarShape(OpKernelContext* ctx, const FrameAndIter& frame_iter); + + Status SendElementBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const uint64 element_bytes); + + Status SendFileSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const Tensor& file_path_t, const uint64 element_bytes); + + Status ReadFileSlice(const std::unique_ptr& file, + const uint64 pos, const uint64 offset, Tensor& data_t); + + TF_DISALLOW_COPY_AND_ASSIGN(FileSliceSendOp); +}; + +class FileSliceRecvOp: public OpKernel { + public: + explicit FileSliceRecvOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + + private: + // Variables. + string key_prefix_; + bool hostmem_sendrecv_; + string recv_dir_; + int32 slice_size_; + int64 timeout_ms_; + + // Functions. 
+ Status RecvUInt64MetaMsg(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const string& name, bool &is_dead, uint64& val); + + Status RecvTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, + bool& is_dead, uint64& total_bytes); + + string GenerateRecvFileName(const string& op_name); + + Status RecvFile(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const string& file_path); + + Status RecvShape(OpKernelContext* ctx, const FrameAndIter& frame_iter, + TensorShape& shape); + + Status RecvElementBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, + uint64& element_bytes); + + Status RecvFileSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter, + const uint64 element_bytes, const string& file_path); + + TF_DISALLOW_COPY_AND_ASSIGN(FileSliceRecvOp); +}; + +}; // End of namespace tensorflow + +#endif // End of macro TENSORFLOW_CORE_KERNELS_FILE_SLICE_SENDRECV_OPS_H_ diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc b/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc new file mode 100644 index 00000000000..931cd152253 --- /dev/null +++ b/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc @@ -0,0 +1,483 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { + +namespace { +// Implement a trivial version of the Rendezvous interface, to avoid +// clouding the benchmark results with the time spent in the various +// implementations, and to avoid the duplicate-send or duplicate-recv +// errors that would arise from running either benchmark in a loop. +class DummyRendezvous : public Rendezvous { + // Functions. + Status Send(const ParsedKey& key, const Args& args, const Tensor& val, + const bool is_dead) override { + std::string key_str = { key.FullKey().data(), key.FullKey().size() }; + mutex_lock l(mu_); + // consumer does not reach. + if (kv_.count(key_str) == 0) { + struct Var var; + var.type = send; + var.args = args; + var.data = val; + var.is_dead = is_dead; + + kv_[key_str] = var; + return Status::OK(); + } + + auto var = kv_[key_str]; + CHECK_EQ(var.type, recv); + var.done(Status::OK(), args, var.args, val, is_dead); + kv_.erase(key_str); + return Status::OK(); + } + void RecvAsync(const ParsedKey& key, const Args& args, + DoneCallback done) override { + std::string key_str = { key.FullKey().data(), key.FullKey().size() }; + + mutex_lock l(mu_); + // producer does not reach. + if (kv_.count(key_str) == 0) { + struct Var var; + var.type = recv; + var.args = args; + var.done = done; + + kv_[key_str] = var; + return; + } + + // auto var = kv_[key_str]; + auto var = kv_[key_str]; + CHECK_EQ(var.type, send); + done(Status::OK(), var.args, args, var.data, var.is_dead); + kv_.erase(key_str); + } + void StartAbort(const Status& status) override {} + + private: + enum RendezvousType { + send, + recv + }; + // Type define. 
+ struct Var { + RendezvousType type; + Args args; + Tensor data; + bool is_dead; + DoneCallback done; + }; + + // Variables. + mutex mu_; + std::unordered_map kv_ GUARDED_BY(mu_); +}; + +//------------------------------------------------------------------------------ +// Utils. +Node* FileSliceSend(Graph* g, Node* filename, const string& tensor, + const string& sender, const uint64 sender_incarnation, + const string& receiver, const int32 slice_size) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("FileSliceSend"), "_FileSliceSend") + .Input(filename, 0) + .Attr("tensor_name", tensor) + .Attr("send_device", sender) + .Attr("send_device_incarnation", + static_cast(sender_incarnation)) + .Attr("recv_device", receiver) + .Attr("slice_size", slice_size) + .Finalize(g, &ret)); + + return ret; +} + +Node* FileSliceRecv(Graph* g, const string& tensor, const string& sender, + const uint64 sender_incarnation, const string& receiver, + const string& recv_dir, const int32 slice_size, + const int64 timeout_ms) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("FileSliceRecv"), "_FileSliceRecv") + .Attr("tensor_name", tensor) + .Attr("send_device", sender) + .Attr("send_device_incarnation", + static_cast(sender_incarnation)) + .Attr("recv_device", receiver) + .Attr("recv_dir", recv_dir) + .Attr("slice_size", slice_size) + .Attr("timeout_ms", timeout_ms) + .Finalize(g, &ret)); + + return ret; +} + +Node* SliceSend(Graph* g, Node* input, const string& tensor, + const string& sender, const uint64 sender_incarnation, + const string& receiver, const int32 slice_size) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_SliceSend") + .Input(input, 0) + .Attr("tensor_name", tensor) + .Attr("send_device", sender) + .Attr("send_device_incarnation", + static_cast(sender_incarnation)) + .Attr("recv_device", receiver) + .Attr("slice_size", slice_size) + .Finalize(g, &ret)); + return ret; +} + +Node* SliceRecv(Graph* g, const string& tensor, const string& sender, + const 
uint64 sender_incarnation, const string& receiver, + const int32 slice_size, const int64 timeout_ms) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_SliceRecv") + .Attr("tensor_type", DT_STRING) + .Attr("tensor_name", tensor) + .Attr("send_device", sender) + .Attr("send_device_incarnation", + static_cast(sender_incarnation)) + .Attr("recv_device", receiver) + .Attr("slice_size", slice_size) + .Attr("timeout_ms", timeout_ms) + .Finalize(g, &ret)); + return ret; +} + +Node* ReadFile(Graph* g, Node* filename) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("ReadFile"), "ReadFile") + .Input(filename, 0) + .Finalize(g, &ret)); + + return ret; +} + +Node* WriteFile(Graph* g, Node* filename, Node* contents) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("WriteFile"), "WriteFile") + .Input(filename, 0) + .Input(contents, 0) + .Finalize(g, &ret)); + + return ret; +} + +Node* Equal(Graph* g, Node* x, Node* y) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("Equal"), "Equal") + .Input(x) + .Input(y) + .Finalize(g, &ret)); + return ret; +} + +Node* Assert(Graph* g, Node* condition, + std::vector& data) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Assert") + .Input(condition) + .Input(data) + .Finalize(g, &ret)); + return ret; +} + +//------------------------------------------------------------------------------ +// Graph Constructor. + +static Graph* TransferFile(const std::string& test_type, + const int32 slice_size) { + Graph* g = new Graph(OpRegistry::Global()); + const int64 timeout_ms = 5000; + std::string recv_dir = "/tmp/FileSliceTransferTestRecv"; + std::string filename = "/tmp/FileSliceTransferTestSend/send_" + test_type; + std::string contents = \ + "The quick brown fox jumps over the lazy dog."; // 44 chars. + + // send filename node. + Tensor filename_t(DT_STRING, TensorShape({})); + filename_t.scalar().setConstant(filename); + Node* filename_n = test::graph::Constant(g, filename_t); + + // contents node. 
+ Tensor contents_t(DT_STRING, TensorShape({})); + contents_t.scalar().setConstant(contents); + Node* contents_n = test::graph::Constant(g, contents_t); + + Node* write_file_n = WriteFile(g, filename_n, contents_n); + Node* send_n = \ + FileSliceSend(g, filename_n, test_type, "/cpu:0", 1, "/cpu:0", slice_size); + g->AddControlEdge(write_file_n, send_n); + + Node* recv_n = FileSliceRecv(g, test_type, "/cpu:0", 1, "/cpu:0", recv_dir, + slice_size, timeout_ms); + Node* read_file_n = ReadFile(g, recv_n); + Node* equal_n = Equal(g, contents_n, read_file_n); + + std::vector data_out; + data_out.emplace_back(contents_n, 0); + data_out.emplace_back(read_file_n, 0); + Assert(g, equal_n, data_out); + + return g; +} + +static Graph* FileSliceSendTransferFileToSliceRecv(const std::string& test_type, + const int32 slice_size) { + Graph* g = new Graph(OpRegistry::Global()); + const int64 timeout_ms = 5000; + std::string filename = "/tmp/FileSliceTransferTestSend/send_" + test_type; + std::string contents = \ + "The quick brown fox jumps over the lazy dog."; // 44 chars. + + // send filename node. + Tensor filename_t(DT_STRING, TensorShape({})); + filename_t.scalar().setConstant(filename); + Node* filename_n = test::graph::Constant(g, filename_t); + + // contents node. 
+ Tensor contents_t(DT_STRING, TensorShape({})); + contents_t.scalar().setConstant(contents); + Node* contents_n = test::graph::Constant(g, contents_t); + + Node* write_file_n = WriteFile(g, filename_n, contents_n); + Node* send_n = \ + FileSliceSend(g, filename_n, test_type, "/cpu:0", 1, "/cpu:0", slice_size); + g->AddControlEdge(write_file_n, send_n); + + Node* recv_n = \ + SliceRecv(g, test_type, "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + Node* equal_n = Equal(g, contents_n, recv_n); + + std::vector data_out; + data_out.emplace_back(contents_n, 0); + data_out.emplace_back(recv_n, 0); + Assert(g, equal_n, data_out); + + return g; +} + +static Graph* SliceSendTransferFileToFileSliceRecv(const std::string& test_type, + const int32 slice_size) { + Graph* g = new Graph(OpRegistry::Global()); + const int64 timeout_ms = 5000; + std::string recv_dir = "/tmp/FileSliceTransferTestRecv"; + std::string contents = \ + "The quick brown fox jumps over the lazy dog."; // 44 chars. + + // contents node. 
+ Tensor contents_t(DT_STRING, TensorShape({})); + contents_t.scalar().setConstant(contents); + Node* contents_n = test::graph::Constant(g, contents_t); + + Node* send_n = \ + SliceSend(g, contents_n, test_type, "/cpu:0", 1, "/cpu:0", slice_size); + + Node* recv_n = FileSliceRecv(g, test_type, "/cpu:0", 1, "/cpu:0", recv_dir, + slice_size, timeout_ms); + Node* read_file_n = ReadFile(g, recv_n); + Node* equal_n = Equal(g, contents_n, read_file_n); + + std::vector data_out; + data_out.emplace_back(contents_n, 0); + data_out.emplace_back(read_file_n, 0); + Assert(g, equal_n, data_out); + + return g; +} + +static Graph* TransferDeadTensor() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 1024; + const int64 timeout_ms = 5000; + std::string recv_dir = "/tmp/FileSliceTransferTestRecv"; + std::string filename = "/tmp/FileSliceTransferTestSend/send_dead_tensor"; + + // val + Tensor val_t(DT_STRING, TensorShape({})); + val_t.scalar()() = filename; + Node* val_n = test::graph::Constant(g, val_t); + + Tensor pred_t(DT_BOOL, TensorShape({})); + pred_t.scalar()() = true; + Node* pred_n = test::graph::Constant(g, pred_t); + + Node* switch_n = test::graph::Switch(g, val_n, pred_n); + FileSliceSend(g, switch_n, "dead_tensor", "/cpu:0", 1, "/cpu:0", slice_size); + FileSliceRecv(g, "dead_tensor", "/cpu:0", 1, "/cpu:0", recv_dir, slice_size, + timeout_ms); + + return g; +} + +static Graph* FileSliceSendTransferDeadTensorToSliceRecv() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 1024; + const int64 timeout_ms = 5000; + std::string recv_dir = "/tmp/FileSliceTransferTestRecv"; + std::string filename = "/tmp/FileSliceTransferTestSend/send_dead_tensor"; + + // val + Tensor val_t(DT_STRING, TensorShape({})); + val_t.scalar()() = filename; + Node* val_n = test::graph::Constant(g, val_t); + + Tensor pred_t(DT_BOOL, TensorShape({})); + pred_t.scalar()() = true; + Node* pred_n = test::graph::Constant(g, pred_t); + + Node* switch_n = 
test::graph::Switch(g, val_n, pred_n); + FileSliceSend(g, switch_n, "dead_tensor", "/cpu:0", 1, "/cpu:0", slice_size); + SliceRecv(g, "dead_tensor", "/cpu:0", 1, "/cpu:0", slice_size, timeout_ms); + + return g; +} + +static Graph* SliceSendTransferDeadTensorToFileSliceRecv() { + Graph* g = new Graph(OpRegistry::Global()); + const int32 slice_size = 1024; + const int64 timeout_ms = 5000; + std::string recv_dir = "/tmp/FileSliceTransferTestRecv"; + std::string contents = \ + "The quick brown fox jumps over the lazy dog."; // 44 chars. + + // val + Tensor val_t(DT_STRING, TensorShape({})); + val_t.scalar()() = contents; + Node* val_n = test::graph::Constant(g, val_t); + + Tensor pred_t(DT_BOOL, TensorShape({})); + pred_t.scalar()() = true; + Node* pred_n = test::graph::Constant(g, pred_t); + + Node* switch_n = test::graph::Switch(g, val_n, pred_n); + SliceSend(g, switch_n, "dead_tensor", "/cpu:0", 1, "/cpu:0", slice_size); + FileSliceRecv(g, "dead_tensor", "/cpu:0", 1, "/cpu:0", recv_dir, slice_size, + timeout_ms); + + return g; +} + +static Graph* TransferSmallFile() { + return TransferFile("small_file", 1024); +} + +static Graph* TransferBigFile() { + return TransferFile("big_file", 16); +} + +static Graph* FileSliceSendTransferSmallFileToSliceRecv() { + return FileSliceSendTransferFileToSliceRecv("small_file", 1024); +} + +static Graph* FileSliceSendTransferBigFileToSliceRecv() { + return FileSliceSendTransferFileToSliceRecv("big_file", 16); +} + +static Graph* SliceSendTransferSmallFileToFileSliceRecv() { + return SliceSendTransferFileToFileSliceRecv("small_file", 1024); +} + +static Graph* SliceSendTransferBigFileToFileSliceRecv() { + return SliceSendTransferFileToFileSliceRecv("big_file", 16); +} + +//------------------------------------------------------------------------------ +// Test Function. 
+ +static void BM_TransferSmallFile(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferSmallFile(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_TransferBigFile(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferBigFile(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_FileSliceSendTransferSmallFileToSliceRecv(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", FileSliceSendTransferSmallFileToSliceRecv(), nullptr, + nullptr, new DummyRendezvous).Run(iters); +} + +static void BM_FileSliceSendTransferBigFileToSliceRecv(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", FileSliceSendTransferBigFileToSliceRecv(), nullptr, + nullptr, new DummyRendezvous).Run(iters); +} + +static void BM_SliceSendTransferSmallFileToFileSliceRecv(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", SliceSendTransferSmallFileToFileSliceRecv(), nullptr, + nullptr, new DummyRendezvous).Run(iters); +} + +static void BM_SliceSendTransferBigFileToFileSliceRecv(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", SliceSendTransferBigFileToFileSliceRecv(), nullptr, + nullptr, new DummyRendezvous).Run(iters); +} + +static void BM_TransferDeadTensor(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", TransferDeadTensor(), nullptr, nullptr, + new DummyRendezvous).Run(iters); +} + +static void BM_FileSliceSendTransferDeadTensorToSliceRecv(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", FileSliceSendTransferDeadTensorToSliceRecv(), nullptr, + nullptr, new 
DummyRendezvous).Run(iters); +} + +static void BM_SliceSendTransferDeadTensorToFileSliceRecv(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast(iters)); + test::Benchmark("cpu", SliceSendTransferDeadTensorToFileSliceRecv(), nullptr, + nullptr, new DummyRendezvous).Run(iters); +} + +BENCHMARK(BM_TransferSmallFile); +BENCHMARK(BM_TransferBigFile); +BENCHMARK(BM_FileSliceSendTransferSmallFileToSliceRecv); +BENCHMARK(BM_FileSliceSendTransferBigFileToSliceRecv); +BENCHMARK(BM_SliceSendTransferSmallFileToFileSliceRecv); +BENCHMARK(BM_SliceSendTransferBigFileToFileSliceRecv); +BENCHMARK(BM_TransferDeadTensor); +BENCHMARK(BM_FileSliceSendTransferDeadTensorToSliceRecv); +BENCHMARK(BM_SliceSendTransferDeadTensorToFileSliceRecv); + +} // End of anonymous namespace + +} // End of namespace tensorflow diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.cc b/tensorflow/core/kernels/slice_sendrecv_ops.cc index f09f314ae10..25f1a4e8738 100644 --- a/tensorflow/core/kernels/slice_sendrecv_ops.cc +++ b/tensorflow/core/kernels/slice_sendrecv_ops.cc @@ -14,41 +14,10 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/kernels/slice_sendrecv_ops.h" +#include "tensorflow/core/kernels/slice_sendrecv_utils.h" namespace tensorflow { -//------------------------------------------------------------------------------ -// Utils. 
-static string GetSliceRendezvousKeyPrefix(const string& send_device, - const string& recv_device, - const uint64 send_device_incarnation, - const string& tensor_name) { - return strings::StrCat(send_device, ";", - strings::FpToString(send_device_incarnation), ";", - recv_device, ";", tensor_name); -} - -static void GetSliceRendezvousKey(const string& key_prefix, - const string& tensor_name_suffix, - const FrameAndIter& frame_iter, string* key) { - key->clear(); - strings::StrAppend(key, key_prefix, tensor_name_suffix, ";", - frame_iter.frame_id, ":", frame_iter.iter_id); -} - -static FrameAndIter GetFrameAndIter(OpKernelContext* ctx, - bool hostmem_sendrecv) { - if (hostmem_sendrecv && ctx->call_frame() != nullptr) { - // Host memory send/recv pairs are added by - // common_runtime/memory_types.cc. When the pair of nodes are - // added inside a function, we need to use the function call frame - // to formulate the unique rendezvous key. - return FrameAndIter(reinterpret_cast(ctx->call_frame()), 0); - } else { - return ctx->frame_iter(); - } -} - //------------------------------------------------------------------------------ // Functions of SliceSendOp. 
@@ -64,8 +33,9 @@ SliceSendOp::SliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { string tensor_name; OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); key_prefix_ = \ - GetSliceRendezvousKeyPrefix(send_device, recv_device, - send_device_incarnation, tensor_name); + slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, + recv_device, send_device_incarnation, tensor_name); + if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { hostmem_sendrecv_ = false; } @@ -79,7 +49,8 @@ void SliceSendOp::Compute(OpKernelContext* ctx) { errors::Internal("Op kernel context needs to provide a rendezvous.")); const Tensor& input_t = ctx->input(0); - FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); + FrameAndIter frame_iter = \ + slice_sendrecv::GetFrameAndIter(ctx, hostmem_sendrecv_); // send total_bytes. OP_REQUIRES_OK(ctx, SendTotalBytes(ctx, frame_iter, input_t)); @@ -95,8 +66,8 @@ void SliceSendOp::Compute(OpKernelContext* ctx) { args.alloc_attrs = ctx->input_alloc_attr(0); Rendezvous::ParsedKey parsed_key; - GetSliceRendezvousKey(key_prefix_, "_transfer_data", frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_transfer_data", + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); OP_REQUIRES_OK(ctx, ctx->rendezvous()->Send(parsed_key, args, input_t, @@ -124,11 +95,11 @@ Status SliceSendOp::SendTotalBytes(OpKernelContext* ctx, Rendezvous::ParsedKey parsed_key; Tensor total_bytes_t; - TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, TensorShape({}), + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_UINT64, TensorShape({}), &total_bytes_t)); - total_bytes_t.scalar()() = input_t.TotalBytes(); - GetSliceRendezvousKey(key_prefix_, "_slice_transfer_totalbytes", frame_iter, - &parsed_key.buf_); + total_bytes_t.scalar()() = input_t.TotalBytes(); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, + 
"_slice_transfer_totalbytes", frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); return ctx->rendezvous()->Send(parsed_key, args, total_bytes_t, @@ -152,8 +123,8 @@ Status SliceSendOp::SendShape(OpKernelContext* ctx, for (int i = 0; i < rank; i++) { shape_vec(i) = shape.dim_size(i); } - GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, + "_slice_transfer_shape", frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); return ctx->rendezvous()->Send(parsed_key, args, shape_t, @@ -168,21 +139,21 @@ Status SliceSendOp::SendString(OpKernelContext* ctx, args.alloc_attrs = AllocatorAttributes(); Rendezvous::ParsedKey parsed_key; - // send elements size. - Tensor elements_size_t; - TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_INT64, input_t.shape(), - &elements_size_t)); + // send elements bytes. 
+ Tensor elements_bytes_t; + TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_UINT64, input_t.shape(), + &elements_bytes_t)); int64 num_elements = input_t.NumElements(); auto input_flat = input_t.flat(); - auto elements_size_flat = elements_size_t.flat(); + auto elements_bytes_flat = elements_bytes_t.flat(); for (int64 i = 0; i < num_elements; i++) { - elements_size_flat(i) = input_flat(i).size(); + elements_bytes_flat(i) = input_flat(i).size(); } - GetSliceRendezvousKey(key_prefix_, "_slice_transfer_elements_size", - frame_iter, &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, + "_slice_transfer_elements_bytes", frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, elements_size_t, + TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, elements_bytes_t, ctx->is_input_dead())); // send data. @@ -196,8 +167,8 @@ Status SliceSendOp::SendString(OpKernelContext* ctx, data_t.scalar()() = elem; std::string tensor_name_suffix = \ strings::StrCat("_slice_transfer_data_", std::to_string(i)); - GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, @@ -218,7 +189,10 @@ Status SliceSendOp::SendStringSlice(OpKernelContext* ctx, args.alloc_attrs = ctx->input_alloc_attr(0); Rendezvous::ParsedKey parsed_key; - int64 slice_num = (elem.size() + slice_size_ - 1) / slice_size_; + int64 slice_num = elem.size() / slice_size_; + if (elem.size() % slice_size_ != 0) { + slice_num += 1; + } Tensor data_t; for (int64 i = 0; i < slice_num; i++) { TF_RETURN_IF_ERROR(ctx->allocate_temp(DT_STRING, 
TensorShape({}), &data_t)); @@ -231,8 +205,8 @@ Status SliceSendOp::SendStringSlice(OpKernelContext* ctx, std::string tensor_name_suffix = \ strings::StrCat("_slice_transfer_data_", std::to_string(index), "_", std::to_string(i)); - GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, @@ -252,12 +226,15 @@ Status SliceSendOp::SendBasicType(OpKernelContext* ctx, // send data. Tensor data_t; - int64 bytes_num = input_t.TotalBytes(); - int64 slice_num = (bytes_num + slice_size_ - 1) / slice_size_; + size_t bytes_num = input_t.TotalBytes(); + int64 slice_num = bytes_num / slice_size_; + if (bytes_num % slice_size_ != 0) { + slice_num += 1; + } unsigned char* input_base = reinterpret_cast(input_t.data()); for (int64 i = 0; i < slice_num; i++) { - int64 start = i * slice_size_; - int64 copy_size = slice_size_; + size_t start = i * slice_size_; + size_t copy_size = slice_size_; if (start > bytes_num - slice_size_) { copy_size = bytes_num - start; } @@ -267,8 +244,8 @@ Status SliceSendOp::SendBasicType(OpKernelContext* ctx, std::memcpy(data_base, input_base+start, copy_size); std::string tensor_name_suffix = \ strings::StrCat("_slice_transfer_data_", std::to_string(i)); - GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, @@ -296,8 +273,8 @@ SliceRecvOp::SliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) { string tensor_name; 
OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); key_prefix_ = \ - GetSliceRendezvousKeyPrefix(send_device, recv_device, - send_device_incarnation, tensor_name); + slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, + recv_device, send_device_incarnation, tensor_name); if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { hostmem_sendrecv_ = false; } @@ -311,11 +288,12 @@ void SliceRecvOp::Compute(OpKernelContext* ctx) { ctx, ctx->rendezvous() != nullptr, errors::Internal("Op kernel context needs to provide a rendezvous.")); - FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); + FrameAndIter frame_iter = \ + slice_sendrecv::GetFrameAndIter(ctx, hostmem_sendrecv_); bool is_dead; // recv total_bytes. - int64 total_bytes; + uint64 total_bytes; OP_REQUIRES_OK(ctx, RecvTotalBytes(ctx, frame_iter, is_dead, total_bytes)); if (is_dead) { return; @@ -334,8 +312,8 @@ void SliceRecvOp::Compute(OpKernelContext* ctx) { } Rendezvous::ParsedKey parsed_key; - GetSliceRendezvousKey(key_prefix_, "_transfer_data", frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_transfer_data", + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); Tensor data_t; @@ -364,7 +342,7 @@ void SliceRecvOp::Compute(OpKernelContext* ctx) { Status SliceRecvOp::RecvTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, - bool& is_dead, int64& total_bytes) { + bool& is_dead, uint64& total_bytes) { Rendezvous::Args args; args.device_context = ctx->op_device_context(); args.alloc_attrs = AllocatorAttributes(); @@ -377,14 +355,14 @@ Status SliceRecvOp::RecvTotalBytes(OpKernelContext* ctx, Rendezvous::ParsedKey parsed_key; Tensor total_bytes_t; - GetSliceRendezvousKey(key_prefix_, "_slice_transfer_totalbytes", frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, + "_slice_transfer_totalbytes", 
frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &total_bytes_t, &is_dead, timeout_ms_)); if (!is_dead) { - total_bytes = total_bytes_t.scalar()(); + total_bytes = total_bytes_t.scalar()(); } return Status::OK(); @@ -404,8 +382,8 @@ Status SliceRecvOp::RecvShape(OpKernelContext* ctx, } Rendezvous::ParsedKey parsed_key; - GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, "_slice_transfer_shape", + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); @@ -439,27 +417,27 @@ Status SliceRecvOp::RecvString(OpKernelContext* ctx, Rendezvous::ParsedKey parsed_key; bool is_dead; - // recv elements size. - GetSliceRendezvousKey(key_prefix_, "_slice_transfer_elements_size", - frame_iter, &parsed_key.buf_); + // recv elements bytes. + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, + "_slice_transfer_elements_bytes", frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - Tensor elements_size_t; - TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &elements_size_t, + Tensor elements_bytes_t; + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &elements_bytes_t, &is_dead, timeout_ms_)); // This shouldn't be a dead tensor. 
CHECK_EQ(is_dead, false); - auto elements_size_flat = elements_size_t.flat(); + auto elements_bytes_flat = elements_bytes_t.flat(); int64 num_elements = shape.num_elements(); args.alloc_attrs = ctx->output_alloc_attr(0); Tensor data_t; auto output_flat = output_t->flat(); for (int64 i = 0; i < num_elements; i++) { - if (elements_size_flat(i) <= slice_size_) { + if (elements_bytes_flat(i) <= slice_size_) { std::string tensor_name_suffix = \ strings::StrCat("_slice_transfer_data_", std::to_string(i)); - GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, @@ -469,7 +447,7 @@ Status SliceRecvOp::RecvString(OpKernelContext* ctx, output_flat(i) = data_t.scalar()(); } else { TF_RETURN_IF_ERROR(RecvStringSlice(ctx, frame_iter, i, - elements_size_flat(i), output_flat)); + elements_bytes_flat(i), output_flat)); } } @@ -478,7 +456,8 @@ Status SliceRecvOp::RecvString(OpKernelContext* ctx, Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter, - const int64 index, const int64 element_size, + const int64 index, + const uint64 element_bytes, TTypes::Flat& output_flat) { Rendezvous::Args args; args.device_context = ctx->op_device_context(); @@ -491,15 +470,18 @@ Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx, } Rendezvous::ParsedKey parsed_key; - int64 slice_num = (element_size + slice_size_ - 1) / slice_size_; + int64 slice_num = element_bytes / slice_size_; + if (element_bytes % slice_size_ != 0) { + slice_num += 1; + } Tensor data_t; bool is_dead = false; for (int64 i = 0; i < slice_num; i++) { std::string tensor_name_suffix = \ strings::StrCat("_slice_transfer_data_", std::to_string(index), "_", 
std::to_string(i)); - GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, @@ -514,7 +496,7 @@ Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx, Status SliceRecvOp::RecvBasicType(OpKernelContext* ctx, const FrameAndIter& frame_iter, - const int64 total_bytes, + const uint64 total_bytes, Tensor*& output_t) { Rendezvous::Args args; args.device_context = ctx->op_device_context(); @@ -529,19 +511,22 @@ Status SliceRecvOp::RecvBasicType(OpKernelContext* ctx, Tensor data_t; bool is_dead = false; - int64 slice_num = (total_bytes + slice_size_ - 1) / slice_size_; + int64 slice_num = total_bytes / slice_size_; + if (total_bytes % slice_size_ != 0) { + slice_num += 1; + } unsigned char* output_base = \ reinterpret_cast(output_t->data()); for (int64 i = 0; i < slice_num; i++) { - int64 start = i * slice_size_; - int64 copy_size = slice_size_; + uint64 start = i * slice_size_; + uint64 copy_size = slice_size_; if (start > total_bytes - slice_size_) { copy_size = total_bytes - start; } std::string tensor_name_suffix = \ strings::StrCat("_slice_transfer_data_", std::to_string(i)); - GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, frame_iter, - &parsed_key.buf_); + slice_sendrecv::GetSliceRendezvousKey(key_prefix_, tensor_name_suffix, + frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.h b/tensorflow/core/kernels/slice_sendrecv_ops.h index df55c080aa1..43429bff32f 100644 --- a/tensorflow/core/kernels/slice_sendrecv_ops.h +++ 
b/tensorflow/core/kernels/slice_sendrecv_ops.h @@ -66,7 +66,7 @@ class SliceRecvOp : public OpKernel { // Fucntions. Status RecvTotalBytes(OpKernelContext* ctx, const FrameAndIter& frame_iter, - bool& is_dead, int64& total_bytes); + bool& is_dead, uint64& total_bytes); Status RecvShape(OpKernelContext* ctx, const FrameAndIter& frame_iter, TensorShape& shape); @@ -75,11 +75,11 @@ class SliceRecvOp : public OpKernel { const TensorShape& shape, Tensor*& output_t); Status RecvStringSlice(OpKernelContext* ctx, const FrameAndIter& frame_iter, - const int64 index, const int64 element_size, + const int64 index, const uint64 element_bytes, TTypes::Flat& output_flat); Status RecvBasicType(OpKernelContext* ctx, const FrameAndIter& frame_iter, - const int64 total_bytes, Tensor*& output_t); + const uint64 total_bytes, Tensor*& output_t); TF_DISALLOW_COPY_AND_ASSIGN(SliceRecvOp); }; diff --git a/tensorflow/core/kernels/slice_sendrecv_utils.cc b/tensorflow/core/kernels/slice_sendrecv_utils.cc new file mode 100644 index 00000000000..56c2166c650 --- /dev/null +++ b/tensorflow/core/kernels/slice_sendrecv_utils.cc @@ -0,0 +1,53 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/kernels/slice_sendrecv_utils.h" + +namespace tensorflow { + +namespace slice_sendrecv { + +string GetSliceRendezvousKeyPrefix(const string& send_device, + const string& recv_device, + const uint64 send_device_incarnation, + const string& tensor_name) { + return strings::StrCat(send_device, ";", + strings::FpToString(send_device_incarnation), ";", + recv_device, ";", tensor_name); +} + +void GetSliceRendezvousKey(const string& key_prefix, + const string& tensor_name_suffix, + const FrameAndIter& frame_iter, string* key) { + key->clear(); + strings::StrAppend(key, key_prefix, tensor_name_suffix, ";", + frame_iter.frame_id, ":", frame_iter.iter_id); +} + +FrameAndIter GetFrameAndIter(OpKernelContext* ctx, bool hostmem_sendrecv) { + if (hostmem_sendrecv && ctx->call_frame() != nullptr) { + // Host memory send/recv pairs are added by + // common_runtime/memory_types.cc. When the pair of nodes are + // added inside a function, we need to use the function call frame + // to formulate the unique rendezvous key. + return FrameAndIter(reinterpret_cast(ctx->call_frame()), 0); + } else { + return ctx->frame_iter(); + } +} + +}; // End of namespace slice_sendrecv + +}; // End of namespace tensorflow diff --git a/tensorflow/core/kernels/slice_sendrecv_utils.h b/tensorflow/core/kernels/slice_sendrecv_utils.h new file mode 100644 index 00000000000..3605eece2ca --- /dev/null +++ b/tensorflow/core/kernels/slice_sendrecv_utils.h @@ -0,0 +1,41 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_UTILS_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +namespace slice_sendrecv { + +extern string GetSliceRendezvousKeyPrefix(const string& send_device, + const string& recv_device, + const uint64 send_device_incarnation, + const string& tensor_name); + +extern void GetSliceRendezvousKey(const string& key_prefix, + const string& tensor_name_suffix, + const FrameAndIter& frame_iter, string* key); + +extern FrameAndIter GetFrameAndIter(OpKernelContext* ctx, + bool hostmem_sendrecv); + +}; // End of namespace slice_sendrecv + +}; // End of namespace tensorflow + +#endif // End of macro TENSORFLOW_CORE_KERNELS_SLICE_SENDRECV_UTILS_H_ diff --git a/tensorflow/core/ops/file_slice_sendrecv_ops.cc b/tensorflow/core/ops/file_slice_sendrecv_ops.cc new file mode 100644 index 00000000000..c7eb20d1358 --- /dev/null +++ b/tensorflow/core/ops/file_slice_sendrecv_ops.cc @@ -0,0 +1,77 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/common_shape_fns.h" + +namespace tensorflow { +REGISTER_OP("_FileSliceSend") + .Input("file_path: string") + .Attr("tensor_name: string") + .Attr("send_device: string") + .Attr("send_device_incarnation: int") + .Attr("recv_device: string") + .Attr("client_terminated: bool = false") + .Attr("slice_size: int >= 1") + .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +Sends the file from send_device to recv_device. +Supports sending the file of any size. + +file_path: The file to send. +tensor_name: The name of the tensor to send. +send_device: The name of the device sending the tensor. +send_device_incarnation: The current incarnation of send_device. +recv_device: The name of the device receiving the tensor. +client_terminated: If set to true, this indicates that the node was added + to the graph as a result of a client-side feed or fetch of Tensor data, + in which case the corresponding send or recv is expected to be managed + locally by the caller. +slice_size: The maximum number of bytes transferred at one time. 
+)doc"); + +REGISTER_OP("_FileSliceRecv") + .Output("file_path: string") + .Attr("tensor_name: string") + .Attr("send_device: string") + .Attr("send_device_incarnation: int") + .Attr("recv_device: string") + .Attr("client_terminated: bool = false") + .Attr("recv_dir: string") + .Attr("slice_size: int >= 1") + .Attr("timeout_ms: int >= 0 = 300000") + .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +Receives the file from send_device on recv_device. +Supports recving the file of any size. + +file_path: The file to receive. +tensor_name: The name of the tensor to receive. +send_device: The name of the device sending the tensor. +send_device_incarnation: The current incarnation of send_device. +recv_device: The name of the device receiving the tensor. +client_terminated: If set to true, this indicates that the node was added + to the graph as a result of a client-side feed or fetch of Tensor data, + in which case the corresponding send or recv is expected to be managed + locally by the caller. +recv_dir: the directory to store received file. +slice_size: The maximum number of bytes transferred at one time. +timeout_ms: The maximum wait time for receiving a tensor. +)doc"); + +}; // End of namespace tensorflow From 2f938dc2a18e57c9a302f5a8b988f6cd39f89e2f Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Tue, 9 Jan 2024 17:46:11 -0800 Subject: [PATCH 25/45] [TensorRT] Fix Graph contains EmbeddingVariable compiling issue. 
(#964) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 Co-authored-by: 泊霆 --- tensorflow/python/compiler/tensorrt/trt_convert.py | 12 +++++------- tensorflow/python/framework/graph_util_impl.py | 12 +++++------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py index 2c8d603ba01..064e32c6984 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert.py @@ -539,13 +539,10 @@ def _gather_names(tensor_info): # EmbeddingVariable can not be convert to constant, so we need to # load ev varibles at runtime always. if self._use_ev: - global_step_collection_ops = sess.graph.get_collection("global_step") - global_step_name = global_step_collection_ops[0].name.split(":")[0] output_node_names.add(filename_tensor_name) output_node_names.add(save_tensor_name) output_node_names.add(restore_op_name) - tf_logging.info("TensorRT - global_step_name: %s" % str(global_step_name)) tf_logging.info("TensorRT - filename_tensor_name: %s" % str(filename_tensor_name)) tf_logging.info("TensorRT - save_tensor_name: %s" % str(save_tensor_name)) tf_logging.info("TensorRT - restore_op_name: %s" % str(restore_op_name)) @@ -559,18 +556,19 @@ def _gather_names(tensor_info): # Freeze the variables in the SavedModel graph and copy the frozen # graph over. 
- variable_names_blacklist = [] if self._use_ev: - variable_names_blacklist.append(global_step_name) + global_step_collection_ops = sess.graph.get_collection("global_step") + if len(global_step_collection_ops) > 0: + sess.run([sess.graph.get_operation_by_name("global_step/Assign")]) frozen_graph_def = graph_util.convert_variables_to_constants( sess, sess.graph.as_graph_def(add_shapes=True), - list(output_node_names), variable_names_blacklist=variable_names_blacklist) + list(output_node_names)) if self._use_ev: # Keep KV Variable in saver_def, these kv-vars will be initialized at runtime. frozen_graph_def = graph_util.create_kv_variable_init_graph( - frozen_graph_def, global_step_name, restore_op_name) + frozen_graph_def, restore_op_name) self._grappler_meta_graph_def = meta_graph_pb2.MetaGraphDef() self._grappler_meta_graph_def.graph_def.CopyFrom(frozen_graph_def) diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py index 76d69e886e7..c3fa37529c3 100644 --- a/tensorflow/python/framework/graph_util_impl.py +++ b/tensorflow/python/framework/graph_util_impl.py @@ -169,7 +169,7 @@ def _bfs_for_reachable_nodes(target_nodes, name_to_input_name): return nodes_to_keep @tf_export(v1=["graph_util.create_kv_variable_init_graph"]) -def create_kv_variable_init_graph(graph, global_step_name, restore_all_op_name): +def create_kv_variable_init_graph(graph, restore_all_op_name): name_to_input_name, name_to_node, name_to_seq_num = \ _extract_graph_summary(graph) @@ -184,8 +184,10 @@ def create_kv_variable_init_graph(graph, global_step_name, restore_all_op_name): " {} in current graph.".format(restore_all_op_name)) for restore_shard_input_full_name in restore_all_op.input: - restore_shard_input_name = re.sub(r"^\^", "", restore_shard_input_full_name) - restore_shard_input_op = name_to_node[restore_shard_input_name] + restore_shard_input_no_op_name = re.sub(r"^\^", "", restore_shard_input_full_name) + restore_shard_input_no_op = 
name_to_node[restore_shard_input_no_op_name] + restore_shard_input_op_name = re.sub(r"^\^", "",restore_shard_input_no_op.input[0]) + restore_shard_input_op = name_to_node[restore_shard_input_op_name] # go through all restore_shard ops new_node = node_def_pb2.NodeDef() new_node.CopyFrom(restore_shard_input_op) @@ -198,10 +200,6 @@ def create_kv_variable_init_graph(graph, global_step_name, restore_all_op_name): n_node.op == "KvResourceImportV2" or \ n_node.op == "KvResourceImport": new_node.input.append(n_full_name) - else: - # Keep global_step assign op in new save/restore_all - if n_node.input[0] == global_step_name: - new_node.input.append(n_full_name) graph.node.remove(restore_shard_input_op) graph.node.extend([new_node]) From 5eabe5fba8b08707020868c899b7cd63784a70f6 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Fri, 12 Jan 2024 00:24:52 -0800 Subject: [PATCH 26/45] [Embedding] Make Embedding backward compatible with previous saved_model. 
(#963) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 Co-authored-by: 泊霆 --- tensorflow/python/ops/kv_variable_ops.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 1ef9550ef6d..840aadf2541 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -530,11 +530,16 @@ def _init_from_proto(self, variable_def, import_scope=None): cache_op = op elif self._initializer_op.type == "InitializeKvVariableOp": init_op = self._initializer_op - - self._init_op_for_restore = g.as_graph_element( + if variable_def.initialize_op_for_restore: + self._init_op_for_restore = g.as_graph_element( ops.prepend_name_scope( variable_def.initialize_op_for_restore, import_scope=import_scope)) + else: #Backward compatibility with 2306 + self._init_op_for_restore = g.as_graph_element( + ops.prepend_name_scope( + variable_def.initializer_name, + import_scope=import_scope)) self._trainable = getattr(variable_def, "trainable", True) if variable_def.snapshot_name: self._cached_value = g.as_graph_element( From d84837fc3c589ea32aad9a3e6b6a272cbd92a079 Mon Sep 17 00:00:00 2001 From: dashingwu Date: Thu, 1 Feb 2024 12:22:01 +0800 Subject: [PATCH 27/45] [Runtime] fix a scheduling issue (#970) The original code assumes the last 4 bits of the CPU cycle count is uniformly distributed, but that is not true; at least on Intel IceLake (Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz), the CPU cycle count is always an ODD number. This fact will result in expensive ops being frequently scheduled to a single thread, which will greatly increase the RT time (in a customer scenario, from ~30ms to ~45ms). 
Signed-off-by: Xiaoguang Wu Co-authored-by: Xiaoguang Wu --- tensorflow/core/common_runtime/executor.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index fd38329a1fa..3df0d2a15be 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -730,15 +730,16 @@ Status ExecutorState::ProcessSync( } else if (kernel_stats_->HasExpensiveMarker(item)) { KernelTimer timer; + static uint64 update_counter = 0; device->Compute(op_kernel, &ctx); - // For expensive kernels, always update the cost estimate. For inexpensive - // kernels, update the cost estimate with ~1/16 probability. This assumes - // that the last 4 bits of the CPU cycle count is uniformly distributed. + constexpr int kKernelExecutionTrackingInvocationSkipCount = 16; if (is_expensive || - timer.start_cycles % kKernelExecutionTrackingInvocationSkipCount == 0) { + update_counter % kKernelExecutionTrackingInvocationSkipCount == 0) { kernel_stats_->UpdateCostEstimate(item, timer.ElapsedCycles()); } + + update_counter++; } else { device->Compute(op_kernel, &ctx); } From 2b15e8a13a7d17736366bb9600267f94465b72e8 Mon Sep 17 00:00:00 2001 From: Junqi Hu <42396655+Mesilenceki@users.noreply.github.com> Date: Sun, 4 Feb 2024 01:58:50 -0800 Subject: [PATCH 28/45] [Embedding] Fix shared embedding frequency counting problem. 
(#962) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 泊霆 Co-authored-by: 泊霆 --- .../api_def_UniqueWithExtraCounts.pbtxt | 4 + .../api_def_UniqueWithExtraCounts.pbtxt | 3 + .../api_def_UniqueWithExtraCounts.pbtxt | 4 + tensorflow/core/kernels/unique_ali_op.cc | 121 ++++++++++++----- tensorflow/core/kernels/unique_ali_op_util.h | 122 +++++++++++++++--- tensorflow/core/ops/array_ops.cc | 20 +++ .../framework/python_op_gen_internal.cc | 1 + .../python/kernel_tests/unique_op_test.py | 68 ++++++++++ tensorflow/python/ops/array_ops.py | 1 - .../python/ops/embedding_variable_ops_test.py | 69 ++++++++++ .../python/training/gradient_descent.py | 23 +++- tensorflow/python/training/optimizer.py | 22 ++-- 12 files changed, 386 insertions(+), 72 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_UniqueWithExtraCounts.pbtxt create mode 100644 tensorflow/core/api_def/java_api/api_def_UniqueWithExtraCounts.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_UniqueWithExtraCounts.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_UniqueWithExtraCounts.pbtxt b/tensorflow/core/api_def/base_api/api_def_UniqueWithExtraCounts.pbtxt new file mode 100644 index 00000000000..b8fabfe75a9 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_UniqueWithExtraCounts.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "UniqueWithExtraCounts" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/java_api/api_def_UniqueWithExtraCounts.pbtxt b/tensorflow/core/api_def/java_api/api_def_UniqueWithExtraCounts.pbtxt new file mode 100644 index 00000000000..117b73ef185 --- /dev/null +++ b/tensorflow/core/api_def/java_api/api_def_UniqueWithExtraCounts.pbtxt @@ -0,0 +1,3 @@ +op { + graph_op_name: "UniqueWithExtraCounts" +} diff --git a/tensorflow/core/api_def/python_api/api_def_UniqueWithExtraCounts.pbtxt b/tensorflow/core/api_def/python_api/api_def_UniqueWithExtraCounts.pbtxt new file mode 100644 
index 00000000000..b8fabfe75a9 --- /dev/null +++ b/tensorflow/core/api_def/python_api/api_def_UniqueWithExtraCounts.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "UniqueWithExtraCounts" + visibility: HIDDEN +} diff --git a/tensorflow/core/kernels/unique_ali_op.cc b/tensorflow/core/kernels/unique_ali_op.cc index 28b5dad1990..efae935db12 100644 --- a/tensorflow/core/kernels/unique_ali_op.cc +++ b/tensorflow/core/kernels/unique_ali_op.cc @@ -25,8 +25,8 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/kernels/task_runner.h" #include "tensorflow/core/kernels/unique_ali_op_util.h" -#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/util/env_var.h" namespace tensorflow { @@ -41,40 +41,43 @@ const char* kStlHashMapString = "STL"; const char* kAbslHashMapString = "ABSL"; const char* kGoogleHashMapString = "GOOGLE"; const int64 kDefaultUniqueRatioHint = 4; -} +} // namespace template class UniqueAliOp : public OpKernel { public: explicit UniqueAliOp(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, ReadInt64FromEnvVar(kUniqueOpPartitionSizeEnv, - kPartitionSize, &partition_size_)); - OP_REQUIRES(context, partition_size_ > 0, - errors::InvalidArgument("Invaild PARTITION_SIZE=", - partition_size_)); + OP_REQUIRES_OK( + context, ReadInt64FromEnvVar(kUniqueOpPartitionSizeEnv, kPartitionSize, + &partition_size_)); + OP_REQUIRES( + context, partition_size_ > 0, + errors::InvalidArgument("Invaild PARTITION_SIZE=", partition_size_)); - OP_REQUIRES_OK(context, ReadBoolFromEnvVar(kUniqueOpSerialEnv, - false, &serial_)); + OP_REQUIRES_OK(context, + ReadBoolFromEnvVar(kUniqueOpSerialEnv, false, &serial_)); // NOTE(zycao>: Hash map insertion and lookup performance is dominating in // Unique Op. 
Based on benchmark results, 'google::dense_hash_map' will be // used as default for most key types except string. // - // By setting "DEEPREC_UNIQUE_OP_HASH_MAP" environment variable, a particular - // hash map could be seleteed to use. Possible choices are listed below: + // By setting "DEEPREC_UNIQUE_OP_HASH_MAP" environment variable, a + // particular hash map could be seleteed to use. Possible choices are listed + // below: // "MULTIMAP" for multimap parrallel process, // "STL" for std::unordred_map, // "ABSL" for absl::flat_hash_map, // "GOOGLE" for google::dense_hash_map. std::string hash_map_str; - OP_REQUIRES_OK(context, ReadStringFromEnvVar(kUniqueOpHashMapEnv, - kGoogleHashMapString, - &hash_map_str)); + OP_REQUIRES_OK( + context, ReadStringFromEnvVar(kUniqueOpHashMapEnv, kGoogleHashMapString, + &hash_map_str)); std::transform(hash_map_str.begin(), hash_map_str.end(), hash_map_str.begin(), ::toupper); OP_REQUIRES_OK(context, ReadInt64FromEnvVar(kUniqueOpUniqRatioHint, - kDefaultUniqueRatioHint, &unique_ratio_hint_)); + kDefaultUniqueRatioHint, + &unique_ratio_hint_)); OP_REQUIRES(context, unique_ratio_hint_ > 0, errors::InvalidArgument("Invaild ", kUniqueOpUniqRatioHint, "=", unique_ratio_hint_)); @@ -83,7 +86,8 @@ class UniqueAliOp : public OpKernel { map_flag_ = MULTIMAP; static char print_once = [] { LOG(INFO) << "MultiMapCompute preserved " - "dense hash map key: " << kPreseverdEmptyKey; + "dense hash map key: " + << kPreseverdEmptyKey; return '\0'; }(); } else if (!hash_map_str.compare(kStlHashMapString)) { @@ -95,7 +99,6 @@ class UniqueAliOp : public OpKernel { } else { map_flag_ = GOOGLE; } - } void Compute(OpKernelContext* context) override { @@ -110,16 +113,14 @@ class UniqueAliOp : public OpKernel { Tensor output; Tensor output_counter; if (context->num_inputs() == 1) { - UniqueWithoutAxis(context, input, - &idx, &output, &output_counter, num_outputs(), - partition_size_, serial_, unique_ratio_hint_, - map_flag_); + UniqueWithoutAxis( + context, 
input, &idx, &output, &output_counter, num_outputs(), + partition_size_, serial_, unique_ratio_hint_, map_flag_); } else { const Tensor& axis_tensor = context->input(1); - UniqueWithAxis(context, input, - axis_tensor, &idx, &output, &output_counter, - num_outputs(), partition_size_, serial_, - unique_ratio_hint_, map_flag_); + UniqueWithAxis(context, input, axis_tensor, &idx, &output, + &output_counter, num_outputs(), partition_size_, + serial_, unique_ratio_hint_, map_flag_); } context->set_output(0, output); context->set_output(1, idx); @@ -128,33 +129,65 @@ class UniqueAliOp : public OpKernel { } } + protected: bool serial_ = false; int64 partition_size_ = 0; int64 unique_ratio_hint_; UniqueMaps map_flag_ = GOOGLE; // "GOOGLE" dense hash map is default }; +template +class UniqueWithCountAliOp : public UniqueAliOp { + using UniqueAliOp::serial_; + using UniqueAliOp::partition_size_; + using UniqueAliOp::unique_ratio_hint_; + using UniqueAliOp::map_flag_; + using OpKernel::num_outputs; + + public: + explicit UniqueWithCountAliOp(OpKernelConstruction* context) + : UniqueAliOp(context) { + OP_REQUIRES_OK(context, context->GetAttr("N", &num_sparse_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + Tensor idx; + Tensor output; + Tensor output_counter; + UniqueWithExtraCounts( + context, input, &idx, &output, &output_counter, num_outputs(), + partition_size_, serial_, unique_ratio_hint_, num_sparse_, map_flag_); + context->set_output(0, output); + context->set_output(1, idx); + context->set_output(2, output_counter); + } + + private: + int num_sparse_; +}; + #define REGISTER_UNIQUE(type) \ REGISTER_KERNEL_BUILDER(Name("Unique") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("Unique") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ 
REGISTER_KERNEL_BUILDER(Name("UniqueV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("UniqueV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ @@ -164,7 +197,7 @@ class UniqueAliOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ @@ -174,7 +207,17 @@ class UniqueAliOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp) + UniqueAliOp) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithExtraCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithExtraCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); REGISTER_UNIQUE(string) #undef REGISTER_UNIQUE @@ -198,12 +241,22 @@ REGISTER_UNIQUE(string) .HostMemory("count") \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); + UniqueAliOp) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithExtraCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) \ + REGISTER_KERNEL_BUILDER(Name("UniqueWithExtraCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp); TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); REGISTER_UNIQUE(string) #undef REGISTER_UNIQUE -#endif //GOOGLE_CUDA - +#endif // GOOGLE_CUDA + #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("Unique") .Device(DEVICE_SYCL) diff --git 
a/tensorflow/core/kernels/unique_ali_op_util.h b/tensorflow/core/kernels/unique_ali_op_util.h index 6b59ba26e81..0a52d8864e9 100644 --- a/tensorflow/core/kernels/unique_ali_op_util.h +++ b/tensorflow/core/kernels/unique_ali_op_util.h @@ -191,7 +191,8 @@ void NewSizes(OpKernelContext* context, const Tensor& input, template void SerialComputeV1(OpKernelContext* context, const Tensor& input, - Tensor* idx, int64 axis, int64* uniq_size, Tensor* output) { + Tensor* idx, int64 axis, int64* uniq_size, int num_sparse, + google::dense_hash_map* counter_map, Tensor* output) { auto Tin = input.flat(); const int64 N = input.NumElements(); auto idx_vec = idx->template vec(); @@ -205,7 +206,23 @@ void SerialComputeV1(OpKernelContext* context, const Tensor& input, ++j; } } - + + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * N); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + auto idx_it = uniq.find(ids); + if (idx_it != uniq.end()) { + counter_map->emplace(idx_it->second, counter_vec(k)); + } + } + } + *uniq_size = static_cast(uniq.size()); TensorShape output_shape(input.shape()); output_shape.set_dim(axis, *uniq_size); @@ -223,7 +240,8 @@ void SerialComputeV1(OpKernelContext* context, const Tensor& input, template void ParallelComputeV1(OpKernelContext* context, const Tensor& input, - Tensor* idx, int64 axis, int64* uniq_size, Tensor* output) { + Tensor* idx, int64 axis, int64* uniq_size, int num_sparse, + google::dense_hash_map* counter_map, Tensor* output) { // Struct INode was used to store an inverse mapping for each node in the // hash map container. 
struct INode { @@ -415,6 +433,25 @@ void ParallelComputeV1(OpKernelContext* context, const Tensor& input, TaskRunner t3_runner(GlobalIndexTask, thread_pool, num_tasks_t1); t3_runner.Run(); + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * N); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + for (int j = 0; j < num_tasks_t1; ++j) { + const INode* inode = uniq_maps[j].GetINodeByKey(ids); + if (inode != nullptr) { + counter_map->emplace(inode->index_, counter_vec(k)); + continue; + } + } + } + } + // Parallel Step 4: Write output indicies Tensor. int32 max_tasks_t4 = (N + kPartitionSize - 1) / kPartitionSize; int32 num_tasks_t4 = std::max(std::min(max_threads, max_tasks_t4), 1); @@ -447,8 +484,8 @@ void ParallelComputeV1(OpKernelContext* context, const Tensor& input, template void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, int64 axis, int64* uniq_size_out, - int32 num_buckets, int64 unique_ratio_hint, - Tensor* output) { + int32 num_buckets, int64 unique_ratio_hint, int num_sparse, + google::dense_hash_map* counter_map, Tensor* output) { auto Tin = input.vec(); const int64 N = input.NumElements(); @@ -529,6 +566,24 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, } int64 uniq_size = global_offsets[num_buckets - 1] + uniq_maps[num_buckets - 1].size(); + + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * uniq_size); + + google::dense_hash_map extra_unique_id_map; + extra_unique_id_map.set_empty_key(std::numeric_limits::max()); + extra_unique_id_map.resize(2 * uniq_size); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = 
context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + auto counts = counter_vec(k); + extra_unique_id_map.emplace(ids, counts); + } + } *uniq_size_out = uniq_size; AllocatorAttributes attr; @@ -539,7 +594,7 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, auto key_output_vec = output->template vec(); auto OutputTask = [&key_output_vec, &uniq_maps, &global_offsets, - &Tin, &idx_vec, &map_parter] + &Tin, &idx_vec, &map_parter, &counter_map, extra_unique_id_map] (int32 task_id, int32 num_tasks) { TIndex offset = global_offsets[task_id]; for (auto iter = uniq_maps[task_id].begin(); iter != uniq_maps[task_id].end(); ++iter) { @@ -553,7 +608,10 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, next_idx = idx_vec(cur_idx); idx_vec(cur_idx) = offset; } - + auto it = extra_unique_id_map.find(iter->first); + if (it != extra_unique_id_map.end()) { + counter_map->emplace(offset, it->second); + } ++offset; } }; @@ -618,8 +676,9 @@ void MultipleElements(OpKernelContext* context, const Tensor& input, } template -void CheckCountOutput(OpKernelContext* context, Tensor* output_counter, - Tensor* idx, int num_outputs, int64 uniq_size) { +void CheckCountOutput(OpKernelContext* context, Tensor* output, Tensor* output_counter, + Tensor* idx, int num_outputs, int64 uniq_size, + int num_sparse, google::dense_hash_map counter_map) { if (num_outputs > 2) { auto idx_vec = idx->template vec(); AllocatorAttributes attr; @@ -633,12 +692,19 @@ void CheckCountOutput(OpKernelContext* context, Tensor* output_counter, for (int64 i = 0; i < N; ++i) { count_output_vec(idx_vec(i))++; } + if (num_sparse > 0) { + for (auto& it: counter_map) { + count_output_vec(it.first) += (it.second - 1); + } + } } + } template void 
ComputeInternalWithHashMap(OpKernelContext* context, const Tensor& input, - Tensor* idx, int64 axis, int64* uniq_size, int64 N, bool serial, Tensor* output) { + Tensor* idx, int64 axis, int64* uniq_size, int64 N, int num_sparse, bool serial, + google::dense_hash_map* counter_map, Tensor* output) { OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()), errors::InvalidArgument("unique expects a 1D vector.")); // TODO(dga): Make unique polymorphic for returning int32 and int64 @@ -651,10 +717,10 @@ void ComputeInternalWithHashMap(OpKernelContext* context, const Tensor& input, if (N >= kPartitionLimit && !serial) { ParallelComputeV1 - (context, input, idx, axis, uniq_size, output); + (context, input, idx, axis, uniq_size, num_sparse, counter_map, output); } else { SerialComputeV1 - (context, input, idx, axis, uniq_size, output); + (context, input, idx, axis, uniq_size, num_sparse, counter_map, output); } } @@ -662,7 +728,7 @@ template void UniqueInternal(OpKernelContext* context, const Tensor& input, Tensor* idx, Tensor* output, Tensor* output_counter, int num_outputs, int64 partition_size, bool serial, int64 axis, int64 unique_ratio_hint, - std::vector& new_sizes, UniqueMaps map_flag) { + std::vector& new_sizes, UniqueMaps map_flag, int num_sparse = 0) { typedef google::dense_hash_map DefaultHashMap; AllocatorAttributes attr; @@ -672,6 +738,7 @@ void UniqueInternal(OpKernelContext* context, const Tensor& input, TensorShape({new_sizes[1]}), idx, attr)); int64 uniq_size_out; + google::dense_hash_map counter_map; if (new_sizes[0] == 1 && new_sizes[2] == 1) { // Specialized and faster implementation when unique is run over single @@ -687,33 +754,34 @@ void UniqueInternal(OpKernelContext* context, const Tensor& input, case MULTIMAP: if (num_buckets > 1 && !serial) { MultiMapCompute> - (context, input, idx, axis, &uniq_size_out, num_buckets, unique_ratio_hint, output); + (context, input, idx, axis, &uniq_size_out, num_buckets, unique_ratio_hint, num_sparse, 
&counter_map, output); } else { SerialComputeV1 - (context, input, idx, axis, &uniq_size_out, output); + (context, input, idx, axis, &uniq_size_out, num_sparse, &counter_map, output); } break; case STL: ComputeInternalWithHashMap> - (context, input, idx, axis, &uniq_size_out, N, serial, output); + (context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, &counter_map, output); break; case ABSL: ComputeInternalWithHashMap> - (context, input, idx, axis, &uniq_size_out, N, serial, output); + (context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, &counter_map, output); break; case GOOGLE: ComputeInternalWithHashMap - (context, input, idx, axis, &uniq_size_out, N, serial, output); + (context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, &counter_map, output); break; default: ComputeInternalWithHashMap - (context, input, idx, axis, &uniq_size_out, N, serial, output); + (context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, &counter_map, output); } } else { MultipleElements(context, input, idx, output, &uniq_size_out, axis, new_sizes); } - CheckCountOutput(context, output_counter, idx, num_outputs, uniq_size_out); + CheckCountOutput(context, output, output_counter, idx, num_outputs, + uniq_size_out, num_sparse, counter_map); } template @@ -743,6 +811,20 @@ void UniqueWithAxis(OpKernelContext* context, const Tensor& input, axis, unique_ratio_hint, new_sizes, map_flag); } +template +void UniqueWithExtraCounts(OpKernelContext* context, const Tensor& input, + Tensor* idx, Tensor* output, Tensor* output_counter, int num_outputs, + int64 partition_size, bool serial, int64 unique_ratio_hint, + int num_sparse, UniqueMaps map_flag) { + int64 axis = 0; + std::vector new_sizes{1, input.NumElements(), 1}; + OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()), + errors::InvalidArgument("unique expects a 1D vector.")); + UniqueInternal(context, input, idx, output, + output_counter, num_outputs, partition_size, serial, + axis, 
unique_ratio_hint, new_sizes, map_flag, num_sparse); +} + } // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_UNIQUE_ALI_OP_UTIL_H_ diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index 27f6811fcff..306026977ef 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -1741,6 +1741,26 @@ REGISTER_OP("UniqueWithCountsV2") return Status::OK(); }); +// --------------------------------------------------- + +REGISTER_OP("UniqueWithExtraCounts") + .Input("x: T") + .Input("extra_indices: N * T") + .Input("extra_counts: N * out_idx") + .Output("y: T") + .Output("idx: out_idx") + .Output("count: out_idx") + .Attr("T: type") + .Attr("N: int >= 0") + .Attr("out_idx: {int32, int64} = DT_INT32") + .SetShapeFn([](InferenceContext* c) { + auto uniq = c->Vector(InferenceContext::kUnknownDim); + c->set_output(0, uniq); + c->set_output(1, c->input(0)); + c->set_output(2, uniq); + return Status::OK(); + }); + namespace { Status ShapeShapeFn(InferenceContext* c) { diff --git a/tensorflow/python/framework/python_op_gen_internal.cc b/tensorflow/python/framework/python_op_gen_internal.cc index 42ae4eacc77..d0370a09106 100644 --- a/tensorflow/python/framework/python_op_gen_internal.cc +++ b/tensorflow/python/framework/python_op_gen_internal.cc @@ -105,6 +105,7 @@ bool IsOpWithUnderscorePrefix(const string& s) { // TODO(annarev): reduce usage of '*' imports and remove these from the // list. 
"fused_batch_norm", "histogram_fixed_width", "stack", + "unique_with_extra_counts", "batch_norm_with_global_normalization", "clip_by_value"}); return kUnderscoreOps->count(s) > 0; } diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py index 9ec0ff74e3e..08ebcf0e8dd 100644 --- a/tensorflow/python/kernel_tests/unique_op_test.py +++ b/tensorflow/python/kernel_tests/unique_op_test.py @@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import constant_op from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.platform import test @@ -278,6 +279,73 @@ def testUniqueWithCountsAbslMap(self): def testUniqueWithCountsDenseHashMap(self): self.RunUniqueWithCountsWithDifferentMaps('GOOGLE') +class UniqueWithExtraCountsTest(test.TestCase): + + def testInt32(self): + x = np.random.randint(2, high=1000, size=700000) + extra_x = x[:5].tolist() + extra_x_tensor = [constant_op.constant(extra_x, dtypes.int64)] + extra_count = [500 for _ in range(5)] + extra_count_tensor = [constant_op.constant(extra_count, dtypes.int32)] + with self.cached_session() as sess: + y, idx, count = gen_array_ops._unique_with_extra_counts(x, extra_x_tensor, extra_count_tensor) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + if value in extra_x: + self.assertEqual(count, np.sum(x == value) + 499) + else: + self.assertEqual(count, np.sum(x == value)) + + def testInt32OutIdxInt64(self): + x = np.random.randint(2, high=1000, size=700000) + extra_x = x[:5].tolist() + extra_x_tensor = [constant_op.constant(extra_x, dtypes.int64)] + extra_count = [500 for _ in range(5)] + extra_count_tensor = 
[constant_op.constant(extra_count, dtypes.int64)] + with self.cached_session() as sess: + y, idx, count = gen_array_ops._unique_with_extra_counts(x, extra_x_tensor, extra_count_tensor) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + if value in extra_x: + self.assertEqual(count, np.sum(x == value) + 499) + else: + self.assertEqual(count, np.sum(x == value)) + + def RunUniqueWithCountsWithDifferentMaps(self, map_type): + recover_env = False + if 'DEEPREC_UNIQUE_OP_HASH_MAP' in os.environ: + recover_env = True + old_env = os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = map_type + self.testInt32() + self.testInt32OutIdxInt64() + + del os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + if recover_env: + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = old_env + + def testUniqueWithCountsMultiMap(self): + self.RunUniqueWithCountsWithDifferentMaps('MULTIMAP') + + def testUniqueWithCountsStlMap(self): + self.RunUniqueWithCountsWithDifferentMaps('STL') + + def testUniqueWithCountsAbslMap(self): + self.RunUniqueWithCountsWithDifferentMaps('ABSL') + + def testUniqueWithCountsDenseHashMap(self): + self.RunUniqueWithCountsWithDifferentMaps('GOOGLE') if __name__ == '__main__': test.main() diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index adadf3cc427..960dae9ac8c 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1627,7 +1627,6 @@ def unique_with_counts(x, out_idx=dtypes.int32, name=None): unique_with_counts.__doc__ = gen_array_ops.unique_with_counts.__doc__ - @tf_export("split") def split(value, num_or_size_splits, axis=0, num=None, name="split"): """Splits a tensor into sub tensors. 
diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index 81b315e2e43..dbf254d5f14 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -19,6 +19,7 @@ from tensorflow.core.framework import attr_value_pb2 from tensorflow.python.framework import ops from tensorflow.python.framework import test_util +from tensorflow.python.framework import constant_op from tensorflow.python.ops import string_ops from tensorflow.python.ops.check_ops import assert_equal from tensorflow.python.platform import googletest @@ -2871,6 +2872,39 @@ def testCountsTensor(self): value = checkpoint_utils.load_variable(ckpt_path, name) self.assertAllEqual(value, [3, 3, 1, 3, 2]) + def testCountsWithSparseAndDenseTensor(self): + os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + sp1 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + ids = constant_op.constant([3,3,3,4,4,1], dtype=dtypes.int64) + emb1 = embedding_ops.embedding_lookup_sparse(var, sp1, None) + emb2 = embedding_ops.embedding_lookup(var, ids) + emb = emb1 + emb2 + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + for name, shape in 
checkpoint_utils.list_variables(ckpt_path): + if name == "var_1-freqs": + value = checkpoint_utils.load_variable(ckpt_path, name) + self.assertAllEqual(value, [3, 3, 1, 3, 2]) + def testCountsTensorWithGradientDescent(self): os.environ["TF_RECORD_FREQ"] = "1" checkpoint_directory = self.get_temp_dir() @@ -2908,6 +2942,41 @@ def testCountsTensorWithGradientDescent(self): self.assertAllEqual(value, [3, 3, 1, 3, 2]) del os.environ["TF_RECORD_FREQ"] + + def testCountsDenseAndSparseTensorWithGradientDescent(self): + os.environ["TF_RECORD_FREQ"] = "1" + checkpoint_directory = self.get_temp_dir() + ckpt_path = os.path.join(checkpoint_directory, "model.ckpt") + with ops.Graph().as_default() as g, ops.device('/cpu:0'): + var = variable_scope.get_embedding_variable("var_1", + embedding_dim = 3) + sp1 = sparse_tensor.SparseTensor( + indices=[[0,0],[1,0],[2,0],[3,0],[4,0],[5,0]], + values=math_ops.cast([0,0,0,1,1,2], dtypes.int64), + dense_shape=[6, 1]) + ids = constant_op.constant([3,3,3,4,4,1], dtype=dtypes.int64) + emb1 = embedding_ops.embedding_lookup_sparse(var, sp1, None) + emb2 = embedding_ops.embedding_lookup(var, ids) + emb = emb1 + emb2 + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = gradient_descent.GradientDescentOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + saver = saver_module.Saver() + init = variables.global_variables_initializer() + with self.test_session(graph=g) as sess: + sess.run([init]) + sess.run(train_op) + saver.save(sess, ckpt_path) + + for name, shape in checkpoint_utils.list_variables(ckpt_path): + if name == "var_1-freqs": + value = checkpoint_utils.load_variable(ckpt_path, name) + self.assertAllEqual(value, [3, 3, 1, 3, 2]) + + del os.environ["TF_RECORD_FREQ"] if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/training/gradient_descent.py 
b/tensorflow/python/training/gradient_descent.py index 799e3c5f5bd..bd16892c1c8 100644 --- a/tensorflow/python/training/gradient_descent.py +++ b/tensorflow/python/training/gradient_descent.py @@ -19,9 +19,12 @@ from __future__ import print_function from tensorflow.python.framework import ops +from tensorflow.python.framework import dtypes from tensorflow.python.ops import gen_hash_training_ops from tensorflow.python.ops import kv_variable_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops @@ -72,22 +75,28 @@ def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): if isinstance(handle, kv_variable_ops.EmbeddingVariable): global_step = training_util.get_or_create_global_step() if handle.need_counts() and len(handle._counts_tensor.keys()) != 0: + extra_counts, extra_indices = [], [] if indices.op.type == "ConcatV2": - total_counts = [] for tensor in indices.op.inputs: if tensor.op.type == "Reshape": indices_tensor = tensor.op.inputs[0] - total_counts.append(handle._counts_tensor[indices_tensor]) - from tensorflow.python.ops import array_ops - counts_tensor = array_ops.concat(total_counts, 0) + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) elif indices.op.type == "Reshape": indices_tensor = indices.op.inputs[0] - counts_tensor = handle._counts_tensor[indices_tensor] + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) + unique_indices, new_index_positions, indices_counts = \ + gen_array_ops._unique_with_extra_counts(indices, extra_indices, extra_counts) + summed_grads = math_ops.unsorted_segment_sum( + grad, 
new_index_positions, array_ops.shape(unique_indices)[0]) return training_ops.kv_resource_sparse_apply_gradient_descent_with_counts( handle.handle, math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype), - grad, indices, global_step, - counts_tensor, use_locking=self._use_locking) + summed_grads, unique_indices, global_step, + indices_counts, use_locking=self._use_locking) else: return training_ops.kv_resource_sparse_apply_gradient_descent( handle.handle, math_ops.cast(self._learning_rate_tensor, diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 7523604ccf9..95383a9d962 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -34,6 +34,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import smart_cond from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradients from tensorflow.python.ops import gen_io_ops @@ -93,16 +94,14 @@ def _deduplicate_indexed_slices_with_counts(values, indices): array_ops.shape(unique_indices)[0]) return (summed_values, unique_indices, indices_counts) -def _deduplicate_indexed_slices_with_counts_reduction(values, indices, counts): +def _deduplicate_indexed_slices_with_counts_reduction(values, indices, extra_counts, extra_indices): """Sums `values` associated with any non-unique `indices` and return counts of each count in `values`.""" - unique_indices, new_index_positions = array_ops.unique(indices) + unique_indices, new_index_positions, summed_counts = \ + gen_array_ops._unique_with_extra_counts(indices, extra_indices, extra_counts) summed_values = math_ops.unsorted_segment_sum( values, new_index_positions, array_ops.shape(unique_indices)[0]) - summed_counts = math_ops.unsorted_segment_sum( - counts, new_index_positions, - array_ops.shape(unique_indices)[0]) return (summed_values, 
unique_indices, summed_counts) def _var_key(var): @@ -1105,19 +1104,22 @@ def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): _deduplicate_indexed_slices_with_counts( values=grad, indices=indices) else: + extra_counts, extra_indices = [], [] if indices.op.type == "ConcatV2": - total_counts = [] for tensor in indices.op.inputs: if tensor.op.type == "Reshape": indices_tensor = tensor.op.inputs[0] - total_counts.append(handle._counts_tensor[indices_tensor]) - counts_tensor = array_ops.concat(total_counts, 0) + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) elif indices.op.type == "Reshape": indices_tensor = indices.op.inputs[0] - counts_tensor = handle._counts_tensor[indices_tensor] + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) summed_grad, unique_indices, indices_counts = \ _deduplicate_indexed_slices_with_counts_reduction( - grad, indices, counts_tensor) + grad, indices, extra_counts, extra_indices) return self._resource_apply_sparse( summed_grad, handle, unique_indices, indices_counts) else: From 70b32df83f0e7928d8894773fe2d5cf247ccf3d4 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Tue, 20 Feb 2024 19:20:27 +0800 Subject: [PATCH 29/45] [BUILD] Add build SDK package. 
(#972) Signed-off-by: candy.dc --- tensorflow/tools/sdk_package/BUILD | 43 ++++++ tensorflow/tools/sdk_package/README.md | 41 ++++++ .../tools/sdk_package/build_sdk_package.sh | 136 ++++++++++++++++++ 3 files changed, 220 insertions(+) create mode 100644 tensorflow/tools/sdk_package/BUILD create mode 100644 tensorflow/tools/sdk_package/README.md create mode 100755 tensorflow/tools/sdk_package/build_sdk_package.sh diff --git a/tensorflow/tools/sdk_package/BUILD b/tensorflow/tools/sdk_package/BUILD new file mode 100644 index 00000000000..b3dca82b9e3 --- /dev/null +++ b/tensorflow/tools/sdk_package/BUILD @@ -0,0 +1,43 @@ +# Description: +# TensorFlow is a computational framework, primarily for use in machine +# learning applications. +# +# Public targets: +# +# ":sdk_package" - Package the tensorflow dynamic library and necessry +# headers for developing. The script should be executed manually +# after 'bazel build'. + +package(default_visibility = ["//visibility:public"]) + +load("//tensorflow:tensorflow.bzl", "transitive_hdrs", "tf_binary_additional_srcs") +load("//tensorflow/core/platform:default/build_config_root.bzl", + "tf_additional_plugin_deps") + +transitive_hdrs( + name = "sdk_headers", + deps = [ + # Need to check definition of //tensorflow:libtensorflow_cc.so + # for updates. 
+ "//tensorflow/c:c_api", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:client_session", + "//tensorflow/cc:scope", + "//tensorflow/cc/saved_model:loader", + "//tensorflow/cc/saved_model:signature_constants", + "//tensorflow/cc/saved_model:tag_constants", + "//tensorflow/contrib/session_bundle:bundle_shim", + ] + tf_additional_plugin_deps(), + tags = ["manual"], +) + +sh_binary( + name = "build_sdk_package", + srcs = ["build_sdk_package.sh"], + data = [ + ":sdk_headers", + "@com_google_protobuf//:protoc", + "//tensorflow:libtensorflow_cc.so", + ] + tf_binary_additional_srcs(), + tags = ["manual"], +) diff --git a/tensorflow/tools/sdk_package/README.md b/tensorflow/tools/sdk_package/README.md new file mode 100644 index 00000000000..8dbac7bed92 --- /dev/null +++ b/tensorflow/tools/sdk_package/README.md @@ -0,0 +1,41 @@ +Bazel rules and bash scripts to package the DeepRec C/C++ APIs and +runtime library into '\/tensorflow_sdk.tar.gz' archive. + +## SDK Build + +First of all, edit and run the configurating script **'./configure'** under +DeeRec root directory (supposed '\'). + +Then simply run the following commands under '\' to build +the DeepRec SDK package: + +```sh +./build sdk +``` +_This command will put the SDK package named 'tensorflow\_sdk.tar.gz' into +the directory below:_ +> /built/sdk/[gpu|cpu] + +## SDK usage: + +To make use of DeepRec runtime SDK for C++ codes writting with original APIs +defined in TensorFlow, just decompress the SDK package into another work +directory (supposed '\') with the command at first: + +```sh +tar xzvf -C tensorflow_sdk.tar.gz +``` + +Then a directory named 'sdk' will be placed into the \, which +contains necessary header files in the 'include' sub-directory, keeping the +original hierarchy in TensorFlow, and the 'libtensorflow_cc.so' dynamic +runtime library in the 'lib' sub-directoy to support TensorFlow running. 
+ +Just append **'-I\/sdk/include'** to compiling arguments and +**'-L\/sdk/lib'** -ltensorflow_cc to linking arguments, in the +cases of building a project, that contains codes using original TensorFlow +C++ APIs, together with DeepRec SDK. + +Finally, to successfully run the binary building with DeepRec SDK, do not +forget to append '\/sdk/lib' to **'LD_LIBRARY_PATH'** environment +variable. diff --git a/tensorflow/tools/sdk_package/build_sdk_package.sh b/tensorflow/tools/sdk_package/build_sdk_package.sh new file mode 100755 index 00000000000..89b7d8e9195 --- /dev/null +++ b/tensorflow/tools/sdk_package/build_sdk_package.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +# Copyright 2024 The DeepRec Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# This script is used for packaging TensorFlow SDK files into a tarball. +# The processing flow took 'tensorflow/tools/pip_package/build_pip_package.sh' +# as the reference. 
+ +set -e + +PLATFORM="$(uname -s | tr 'A-Z' 'a-z')" +function is_windows() { + # On windows, the shell script is actually running in msys + if [[ "${PLATFORM}" =~ msys_nt* ]]; then + true + else + false + fi +} + +function main() { + if [ $# -lt 1 ] ; then + echo "No destination dir provided" + exit 1 + fi + + DEST=$1 + TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX) + mkdir -p "${TMPDIR}/sdk/bin" + mkdir -p "${TMPDIR}/sdk/include" + mkdir -p "${TMPDIR}/sdk/lib" + + echo $(date) : "=== Using tmpdir: ${TMPDIR}" + + if [ ! -d bazel-bin/tensorflow ]; then + echo "Could not find bazel-bin. Did you run from the root of the build tree?" + exit 1 + fi + + if is_windows; then + echo "Windows version TensorFlow SDK not supported..." + elif [ ! -d bazel-bin/tensorflow/tools/sdk_package/build_sdk_package.runfiles/org_tensorflow ]; then + # Really old (0.2.1-) runfiles, without workspace name. + echo "TensorFlow SDK does not support such old verions..." + else + RUNFILES=bazel-bin/tensorflow/tools/sdk_package/build_sdk_package.runfiles/org_tensorflow + if [ -d ${RUNFILES}/external ]; then + # Old-style runfiles structure (--legacy_external_runfiles). + cp -RL ${RUNFILES}/tensorflow "${TMPDIR}/sdk/include" + # Check LLVM headers for XLA support. + if [ -d ${RUNFILES}/external/llvm_archive ]; then + # Old-style runfiles structure (--legacy_external_runfiles). + mkdir -p ${TMPDIR}/sdk/include/external/llvm/include + cp -RL ${RUNFILES}/external/llvm_archive/include/llvm \ + "${TMPDIR}/sdk/include/external/llvm/include" + pushd ${TMPDIR}/sdk/include + ln -s external/llvm/include/llvm llvm + popd + fi + # Copy MKL libs over so they can be loaded at runtime + so_lib_dir=$(ls $RUNFILES | grep solib) || true + if [ -n "${so_lib_dir}" ]; then + mkl_so_dir=$(ls ${RUNFILES}/${so_lib_dir} | grep mkl) || true + if [ -n "${mkl_so_dir}" ]; then + cp -L ${RUNFILES}/${so_lib_dir}/${mkl_so_dir}/*.so "${TMPDIR}/sdk/lib" + fi + fi + else + # New-style runfiles structure (--nolegacy_external_runfiles). 
+ cp -RL ${RUNFILES}/tensorflow "${TMPDIR}/sdk/include" + # Check LLVM headers for XLA support. + if [ -d bazel-bin/tensorflow/tools/sdk_package/build_sdk_package.runfiles/llvm_archive ]; then + cp -RL \ + bazel-bin/tensorflow/tools/sdk_package/build_sdk_package.runfiles/llvm_archive/include/llvm \ + "${TMPDIR}/sdk/include" + fi + # Copy MKL libs over so they can be loaded at runtime + so_lib_dir=$(ls $RUNFILES | grep solib) || true + if [ -n "${so_lib_dir}" ]; then + mkl_so_dir=$(ls ${RUNFILES}/${so_lib_dir} | grep mkl) || true + if [ -n "${mkl_so_dir}" ]; then + cp -L ${RUNFILES}/${so_lib_dir}/${mkl_so_dir}/*.so "${TMPDIR}/sdk/lib" + fi + fi + fi + fi + + # move and strip the dynamic library file for packaging. + # at default the .so file was not writable for the owner, + # so using a 'chmod +w' to perform the strip command. + chmod +w ${TMPDIR}/sdk/include/tensorflow/libtensorflow_cc.so + chmod +w ${TMPDIR}/sdk/include/tensorflow/libtensorflow_framework.so.1 + strip ${TMPDIR}/sdk/include/tensorflow/libtensorflow_cc.so + strip ${TMPDIR}/sdk/include/tensorflow/libtensorflow_framework.so.1 + mv ${TMPDIR}/sdk/include/tensorflow/libtensorflow_*.so* ${TMPDIR}/sdk/lib + + # third party packages doesn't ship with header files. Copy the headers + # over so user defined ops can be compiled. + mkdir -p ${TMPDIR}/sdk/include/google + mkdir -p ${TMPDIR}/sdk/include/third_party + pushd ${RUNFILES%org_tensorflow}/com_google_protobuf/src/google + for header in $(find protobuf -name \*.h); do + mkdir -p "${TMPDIR}/sdk/include/google/$(dirname ${header})" + cp -L "$header" "${TMPDIR}/sdk/include/google/$(dirname ${header})/" + done + popd + cp -RL $RUNFILES/third_party/eigen3 ${TMPDIR}/sdk/include/third_party + cp -RL ${RUNFILES%org_tensorflow}/eigen_archive/* ${TMPDIR}/sdk/include/ + cp -RL ${RUNFILES%org_tensorflow}/nsync/public/* ${TMPDIR}/sdk/include + cp -L ${RUNFILES%org_tensorflow}/com_google_protobuf/protoc ${TMPDIR}/sdk/bin + + # package all files into the target file. 
+ pushd ${TMPDIR} + rm -f MANIFEST + echo $(date) : "=== Building sdk package" + tar czvf tensorflow_sdk.tar.gz sdk/ 1> /dev/null + popd + mkdir -p ${DEST} + mv ${TMPDIR}/tensorflow_sdk.tar.gz ${DEST} + rm -rf ${TMPDIR} + echo $(date) : "=== Output sdk package file is: ${DEST}/tensorflow_sdk.tar.gz" +} + +main "$@" From eb5f30db53ee41179a61a83c6ec9b54111c0257a Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Thu, 22 Feb 2024 15:08:16 +0800 Subject: [PATCH 30/45] [Embedding] Log error when EV has been initialized in EV Import OP. (#971) Signed-off-by: chenbangduo.cbd --- tensorflow/core/kernels/kv_variable_restore_ops.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index 3b10c2521b9..2eccf485ef8 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -373,6 +373,12 @@ class KvResourceImportV3Op: public AsyncOpKernel { core::ScopedUnref unref_me(ev); + // EV should not be initialized at this time. + if (ev->IsInitialized()) { + LOG(ERROR) << "Import parameter for EV (" << name_string + << ") failed, this EV has already been initialized."; + } + auto do_compute = [this, context, file_name_string, ev, name_string, done] () { BundleReader reader(Env::Default(), file_name_string); From 9a54aae7d5062330f4055c73401183b57650c7d2 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 28 Feb 2024 10:54:34 +0800 Subject: [PATCH 31/45] [Release] Update DeepRec release version to 1.15.5+deeprec2402. (#974) Signed-off-by: candy.dc --- tensorflow/tools/pip_package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index e8635e1a298..10132cab678 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -47,7 +47,7 @@ # result for pip. 
# Also update tensorflow/tensorflow.bzl and # tensorflow/core/public/version.h -_VERSION = '1.15.5+deeprec2310' +_VERSION = '1.15.5+deeprec2402' REQUIRED_PACKAGES = [ 'absl-py >= 0.9.0', From 8d4024406210dbcb0a99cc036606efcfa3671c3a Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 28 Feb 2024 16:57:59 +0800 Subject: [PATCH 32/45] [Docs] Update deeprec2402 release images and notes in README.md & RELEASE.md. (#975) Signed-off-by: candy.dc --- README.md | 4 +- RELEASE.md | 44 +++++++++++++++++++ docs/docs_en/DeepRec-Compile-And-Install.md | 4 +- docs/docs_en/Estimator-Compile-And-Install.md | 2 +- docs/docs_en/TFServing-Compile-And-Install.md | 2 +- docs/docs_zh/DeepRec-Compile-And-Install.md | 4 +- docs/docs_zh/Estimator-Compile-And-Install.md | 2 +- docs/docs_zh/TFServing-Compile-And-Install.md | 2 +- 8 files changed, 54 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 8f491e14665..b7d7b578c24 100644 --- a/README.md +++ b/README.md @@ -95,13 +95,13 @@ $ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux #### Image for CPU ``` -alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-cpu-py38-ubuntu20.04 ``` #### Image for GPU CUDA11.6 ``` -alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-gpu-py38-cu116-ubuntu20.04 ``` *** diff --git a/RELEASE.md b/RELEASE.md index 6b7e4a7fd79..b095351d2a0 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,4 +1,48 @@ +# Release r1.15.5-deeprec2402 + +## **Major Features and Improvements** + +### **Embedding** + +- Refine KVInterface::GetShardedSnapshot API. +- Undefine EV GPU interface in CPU compile. +- Make Embedding backward compatible with previous saved_model. +- Log error when EV has been initialized in EV Import OP. + +### **Op Implement** + +- Implement of SliceSend/SliceRecv Op. +- Implement FileSliceSend/FileSliceRecvOp. + +### **SDK** + +- Add build SDK package. 
+ +### **BugFix** + +- Fix shared embedding frequency counting problem. +- Fix Graph contains EmbeddingVariable compiling issue. +- Fix a scheduling issue. +- Fix tensor shape meta-data bug for DataFrame Value. + +### **ModelZoo** + +- Set Saver parameter sharded=True in distributed training. + +More details of features: [https://deeprec.readthedocs.io/zh/latest/](url) + +## **Release Images** + +### **CPU Image** + +`alideeprec/deeprec-release:deeprec2402-cpu-py38-ubuntu20.04` + +### **GPU Image** + +`alideeprec/deeprec-release:deeprec2402-gpu-py38-cu116-ubuntu20.04` + # Release r1.15.5-deeprec2310 + ## **Major Features and Improvements** ### **Embedding** diff --git a/docs/docs_en/DeepRec-Compile-And-Install.md b/docs/docs_en/DeepRec-Compile-And-Install.md index fdf3e295fdd..379526e5b24 100644 --- a/docs/docs_en/DeepRec-Compile-And-Install.md +++ b/docs/docs_en/DeepRec-Compile-And-Install.md @@ -111,7 +111,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-cpu-py38-ubuntu20.04 ``` arm64: @@ -122,5 +122,5 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU Image with CUDA 11.6** ``` -alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-gpu-py38-cu116-ubuntu20.04 ``` diff --git a/docs/docs_en/Estimator-Compile-And-Install.md b/docs/docs_en/Estimator-Compile-And-Install.md index 55f759a3c2a..6305d739571 100644 --- a/docs/docs_en/Estimator-Compile-And-Install.md +++ b/docs/docs_en/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ DeepRec provide new distributed protocols such as grpc++ and star_server, which Source Code: [https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -Develop Branch:master, Latest Release Branch: deeprec2310 +Develop Branch:master, Latest Release Branch: deeprec2402 ## Estimator Build diff 
--git a/docs/docs_en/TFServing-Compile-And-Install.md b/docs/docs_en/TFServing-Compile-And-Install.md index 79a0944aa3e..ea70f397c98 100644 --- a/docs/docs_en/TFServing-Compile-And-Install.md +++ b/docs/docs_en/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ We provide optimized TFServing which could highly improve performance in inferen Source Code: [https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -Develop Branch: master, Latest Release Branch: deeprec2310 +Develop Branch: master, Latest Release Branch: deeprec2402 ## TFServing Build diff --git a/docs/docs_zh/DeepRec-Compile-And-Install.md b/docs/docs_zh/DeepRec-Compile-And-Install.md index ad8fd36dbf7..0c11dca394f 100644 --- a/docs/docs_zh/DeepRec-Compile-And-Install.md +++ b/docs/docs_zh/DeepRec-Compile-And-Install.md @@ -108,7 +108,7 @@ pip3 install /tmp/tensorflow_pkg/tensorflow-1.15.5+${version}-cp38-cp38m-linux_x x86_64: ``` -alideeprec/deeprec-release:deeprec2310-cpu-py38-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-cpu-py38-ubuntu20.04 ``` arm64: @@ -119,7 +119,7 @@ alideeprec/deeprec-release:deeprec2302-cpu-py38-ubuntu22.04-arm64 **GPU CUDA11.6镜像** ``` -alideeprec/deeprec-release:deeprec2310-gpu-py38-cu116-ubuntu20.04 +alideeprec/deeprec-release:deeprec2402-gpu-py38-cu116-ubuntu20.04 ``` ## DeepRec Processor编译打包 diff --git a/docs/docs_zh/Estimator-Compile-And-Install.md b/docs/docs_zh/Estimator-Compile-And-Install.md index e54c8ddbd2f..eeb4f66dc99 100644 --- a/docs/docs_zh/Estimator-Compile-And-Install.md +++ b/docs/docs_zh/Estimator-Compile-And-Install.md @@ -40,7 +40,7 @@ 代码库:[https://github.com/DeepRec-AI/estimator](https://github.com/DeepRec-AI/estimator) -开发分支:master,最新Release分支:deeprec2310 +开发分支:master,最新Release分支:deeprec2402 ## Estimator编译 diff --git a/docs/docs_zh/TFServing-Compile-And-Install.md b/docs/docs_zh/TFServing-Compile-And-Install.md index a43d2d517a6..b0460934165 100644 --- a/docs/docs_zh/TFServing-Compile-And-Install.md +++ 
b/docs/docs_zh/TFServing-Compile-And-Install.md @@ -39,7 +39,7 @@ 代码库:[https://github.com/DeepRec-AI/serving](https://github.com/DeepRec-AI/serving) -开发分支:master,最新Release分支:deeprec2310 +开发分支:master,最新Release分支:deeprec2402 ## TFServing编译&打包 From 8b58f9b93e144fa2d6517d5d370dc0df4fd3644b Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Wed, 28 Feb 2024 17:18:29 +0800 Subject: [PATCH 33/45] [Dockerfile] Add DeepRec release image dockerfile. (#976) Signed-off-by: candy.dc --- cibuild/dockerfiles/Dockerfile.release | 32 ++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 cibuild/dockerfiles/Dockerfile.release diff --git a/cibuild/dockerfiles/Dockerfile.release b/cibuild/dockerfiles/Dockerfile.release new file mode 100644 index 00000000000..77b013f840d --- /dev/null +++ b/cibuild/dockerfiles/Dockerfile.release @@ -0,0 +1,32 @@ +# build DeepRec & estimator wheel +FROM alideeprec/deeprec-base:deeprec-base-cpu-py38-ubuntu20.04 AS deeprec_build + +ARG TF_COMMIT=deeprec2402 + +RUN mkdir -p /src +RUN wget -nv -O /src/install_bazel.sh \ + http://pythonrun.oss-cn-zhangjiakou.aliyuncs.com/bazel-0.26.1-installer-linux-x86_64.sh && \ + bash /src/install_bazel.sh + +RUN git clone https://github.com/DeepRec-AI/DeepRec.git /src/DeepRec && \ + cd /src/DeepRec && \ + git checkout ${TF_COMMIT} +RUN cd /src/DeepRec && \ + yes "" | bash ./configure || true && \ + bazel build -c opt --config=opt //tensorflow/tools/pip_package:build_pip_package && \ + bazel-bin/tensorflow/tools/pip_package/build_pip_package /src/ + +RUN pip install /src/tensorflow-1.15.5+${TF_COMMIT}-cp38-cp38-linux_x86_64.whl + +RUN git clone https://github.com/DeepRec-AI/estimator.git /src/estimator && \ + cd /src/estimator && \ + git checkout ${TF_COMMIT} +RUN cd /src/estimator && \ + bazel build //tensorflow_estimator/tools/pip_package:build_pip_package && \ + bazel-bin/tensorflow_estimator/tools/pip_package/build_pip_package /src/ + +# build DeeepRec release image +FROM 
alideeprec/deeprec-base:deeprec-base-cpu-py38-ubuntu20.04 +COPY --from=deeprec_build /src/*.whl / +RUN pip install /tensorflow-1.15.5+${TF_COMMIT}-cp38-cp38-linux_x86_64.whl tensorflow_estimator-1.15.2+${TF_COMMIT}-py2.py3-none-any.whl +RUN rm -f /tensorflow-1.15.5+${TF_COMMIT}-cp38-cp38-linux_x86_64.whl /tensorflow_estimator-1.15.2+${TF_COMMIT}-py2.py3-none-any.whl From 186afd0479bb43c629cafa808be70b7f5ac33d83 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Thu, 29 Feb 2024 10:10:38 +0800 Subject: [PATCH 34/45] [Serving] Fix syntax error in generate timeline tool. (#977) Signed-off-by: candy.dc --- serving/tools/timeline/gen_timeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/serving/tools/timeline/gen_timeline.py b/serving/tools/timeline/gen_timeline.py index f055e473fa0..d56c1b39897 100644 --- a/serving/tools/timeline/gen_timeline.py +++ b/serving/tools/timeline/gen_timeline.py @@ -1,6 +1,6 @@ import sys -import config_pb2 -import timeline +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.client import timeline def gen_timeline(src_name, dest_name): run_metadata = config_pb2.RunMetadata() From 6dae552cb40e954cce59e125977f141c6a926ada Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Thu, 7 Mar 2024 14:35:36 +0800 Subject: [PATCH 35/45] [Embedding] Refine header file of embedding variable. (#978) Signed-off-by: chenbangduo.cbd --- tensorflow/core/framework/embedding/embedding_var.h | 1 - tensorflow/core/kernels/kv_variable_ops.cc | 1 + tensorflow/core/kernels/kv_variable_restore_ops.cc | 1 + tensorflow/core/kernels/training_ali_ops.cc | 8 ++++---- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h index c0d26a2f4d8..81941bc9ff9 100644 --- a/tensorflow/core/framework/embedding/embedding_var.h +++ b/tensorflow/core/framework/embedding/embedding_var.h @@ -34,7 +34,6 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/gpu_hash_map_kv.h" #include "tensorflow/core/framework/embedding/embedding_config.h" #include "tensorflow/core/framework/embedding/storage.h" -#include "tensorflow/core/framework/embedding/storage_factory.h" #include "tensorflow/core/framework/typed_allocator.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/kv_variable_ops.cc b/tensorflow/core/kernels/kv_variable_ops.cc index 5cd0ef140bd..b7567ffe924 100644 --- a/tensorflow/core/kernels/kv_variable_ops.cc +++ b/tensorflow/core/kernels/kv_variable_ops.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/config.pb.h" #include "tensorflow/core/framework/embedding/embedding_var.h" +#include "tensorflow/core/framework/embedding/storage_factory.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/resource_mgr.h" diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index 2eccf485ef8..e16db9b4cd6 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "tensorflow/core/framework/embedding/cache.h" #include "tensorflow/core/framework/embedding/config.pb.h" #include "tensorflow/core/framework/embedding/embedding_var.h" +#include "tensorflow/core/framework/embedding/storage_factory.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/resource_mgr.h" diff --git a/tensorflow/core/kernels/training_ali_ops.cc b/tensorflow/core/kernels/training_ali_ops.cc index 546b30e29dd..fc21ab610cf 100644 --- a/tensorflow/core/kernels/training_ali_ops.cc +++ b/tensorflow/core/kernels/training_ali_ops.cc @@ -236,7 +236,7 @@ class KvSparseApplyAdagradGPUOp : public OpKernel { T** dev_a = dev_v + task_size; CHECK(dev_a); CHECK(dev_v); - DeviceMemoryBase dev_v_ptr(dev_v, sizeof(T*) * task_size * 2); + se::DeviceMemoryBase dev_v_ptr(dev_v, sizeof(T*) * task_size * 2); stream->ThenMemcpy(&dev_v_ptr, v, sizeof(T*) * task_size * 2); int block_size = 128; @@ -1606,7 +1606,7 @@ class KvSparseApplyAdamGPUOp : public OpKernel { CHECK(dev_m_ptr); CHECK(dev_v_ptr); - DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 3); int block_size = 128; @@ -2579,7 +2579,7 @@ class KvSparseApplyAdamAsyncGPUOp : public OpKernel { CHECK(dev_m_ptr); CHECK(dev_v_ptr); - DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 3); int block_size = 128; @@ -3236,7 +3236,7 @@ class KvSparseApplyAdamWGPUOp : public OpKernel { CHECK(dev_m_ptr); CHECK(dev_v_ptr); - DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 3); int 
block_size = 128; From cf16856d01551c9d1cb005722d7f62a448df7095 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Tue, 26 Mar 2024 17:15:18 +0800 Subject: [PATCH 36/45] [Incremental Checkpoint] Fix import incremental embedding variable. (#983) Signed-off-by: chenbangduo.cbd --- .../embedding/embedding_var_restore.cc | 50 +++++++++-------- tensorflow/python/training/incr_ckpt_test.py | 54 +++++++++++++++++++ 2 files changed, 82 insertions(+), 22 deletions(-) diff --git a/tensorflow/core/framework/embedding/embedding_var_restore.cc b/tensorflow/core/framework/embedding/embedding_var_restore.cc index 11c13008995..6ff07bf7e43 100644 --- a/tensorflow/core/framework/embedding/embedding_var_restore.cc +++ b/tensorflow/core/framework/embedding/embedding_var_restore.cc @@ -102,45 +102,48 @@ void CheckpointLoader::RestoreInternal( Tensor part_filter_offset_tensor; if (!restore_args_.m_is_oldform) { /****** InitPartOffsetTensor ******/ - TensorShape part_offset_shape, part_filter_offset_shape; - DataType part_offset_type, part_filter_offset_type; + TensorShape part_offset_shape; + DataType part_offset_type; string offset_tensor_name; if (!restore_args_.m_is_incr) { offset_tensor_name = name_string + kPartOffsetTensorSuffsix; } else { offset_tensor_name = name_string + kIncrPartOffsetTensorSuffsix; } - - string offset_filter_tensor_name = - name_string + kPartFilterOffsetTensorSuffsix; + Status s = reader_->LookupDtypeAndShape( offset_tensor_name, &part_offset_type, &part_offset_shape); if (!s.ok()) { LOG(ERROR) << "EV restoring fail:" << s.error_message(); } - s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, - &part_filter_offset_type, - &part_filter_offset_shape); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail: " << s.error_message(); - } part_offset_tensor = Tensor(cpu_allocator(), part_offset_type, part_offset_shape); - part_filter_offset_tensor = Tensor( - cpu_allocator(), part_filter_offset_type, part_filter_offset_shape); s = 
reader_->Lookup(offset_tensor_name, &part_offset_tensor); if (!s.ok()) { LOG(ERROR) << "EV restoring fail:" << s.error_message(); } - s = reader_->Lookup(offset_filter_tensor_name, - &part_filter_offset_tensor); - if (!s.ok()) { - LOG(ERROR) << "EV restoring fail: " << s.error_message(); + if (restore_args_.m_has_filter) { + TensorShape part_filter_offset_shape; + DataType part_filter_offset_type; + string offset_filter_tensor_name = + name_string + kPartFilterOffsetTensorSuffsix; + s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, + &part_filter_offset_type, + &part_filter_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } + part_filter_offset_tensor = \ + Tensor(cpu_allocator(), part_filter_offset_type, + part_filter_offset_shape); + s = reader_->Lookup(offset_filter_tensor_name, + &part_filter_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.error_message(); + } } } - auto part_offset_flat = part_offset_tensor.flat(); - auto part_filter_offset_flat = part_filter_offset_tensor.flat(); if (restore_args_.m_is_oldform) { VLOG(1) << "old form, EV name:" << name_string @@ -164,6 +167,7 @@ void CheckpointLoader::RestoreInternal( VLOG(1) << "new form checkpoint... 
:" << name_string << " , partition_id:" << restore_args_.m_partition_id << " , partition_num:" << restore_args_.m_partition_num; + auto part_offset_flat = part_offset_tensor.flat(); for (size_t i = 0; i < restore_args_.m_loaded_parts.size(); i++) { int subpart_id = restore_args_.m_loaded_parts[i]; size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; @@ -183,6 +187,7 @@ void CheckpointLoader::RestoreInternal( new_dim, emb_config, device); if (restore_args_.m_has_filter) { + auto part_filter_offset_flat = part_filter_offset_tensor.flat(); Status s = EVRestoreFilteredFeatures( subpart_id, new_dim, restore_buff, part_filter_offset_flat, emb_config, device); @@ -444,7 +449,7 @@ Status CheckpointLoader::EVInitTensorNameAndShape( } st = reader_->LookupHeader(restore_args_.m_tensor_version + "_filtered", sizeof(K) * version_filter_shape.dim_size(0)); - if (!st.ok()) { + if (!st.ok() && st.code() != error::NOT_FOUND) { return st; } st = reader_->LookupTensorShape(restore_args_.m_tensor_freq + "_filtered", @@ -463,7 +468,8 @@ Status CheckpointLoader::EVInitTensorNameAndShape( return st; } } - return st; + + return Status::OK(); } #define REGISTER_KERNELS(ktype, vtype) \ template Status CheckpointLoader::EVInitTensorNameAndShape(\ @@ -644,4 +650,4 @@ TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) #undef REGISTER_KERNELS_ALL_INDEX #undef REGISTER_KERNELS -}// namespace tensorflow \ No newline at end of file +}// namespace tensorflow diff --git a/tensorflow/python/training/incr_ckpt_test.py b/tensorflow/python/training/incr_ckpt_test.py index b4f7ded3cea..55cf748a9d6 100644 --- a/tensorflow/python/training/incr_ckpt_test.py +++ b/tensorflow/python/training/incr_ckpt_test.py @@ -451,5 +451,59 @@ def testIncrementalSaverForResourceVariable(self): saver.build() incr_saver = incr_saver_module._get_incremental_saver(True, saver) + def testIncrementalSaverSaveAndRestore(self): + tmp_path = self.get_temp_dir() + full_ckpt_dir = os.path.join(tmp_path, "model.ckpt") + 
incr_ckpt_dir = os.path.join(tmp_path, "incr.ckpt") + full_ckpt_path = None + incr_ckpt_path = None + + # construct graph + emb_var = variable_scope.get_embedding_variable("emb", embedding_dim=3, + initializer = init_ops.ones_initializer(dtypes.float32)) + emb = embedding_ops.embedding_lookup(emb_var, + math_ops.cast([0, 1, 2, 3, 4], dtypes.int64)) + loss = math_ops.reduce_sum(emb, name = 'reduce_sum') + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + saver = saver_module.Saver(sharded=True, incremental_save_restore=True) + incr_saver = \ + incr_saver_module.IncrementalSaver(sharded=True, + saver_def=saver.saver_def, defer_build=True) + incr_saver.build(saver._builder.filename_tensor) + + # generate full ckpt and incr ckpt. + full_ckpt_value=None + incr_ckpt_value=None + with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + sess.run([train_op]) + full_ckpt_path = saver.save(sess, full_ckpt_dir, global_step = 10) + full_ckpt_value = sess.run([emb]) + print("full_ckpt: {}".format(full_ckpt_value)) + sess.run([train_op]) + incr_ckpt_path = \ + incr_saver.incremental_save(sess, incr_ckpt_dir, global_step=20) + incr_ckpt_value = sess.run([emb]) + print("incr_ckpt: {}".format(incr_ckpt_value)) + + # check the value after restoring parameter. 
+ with self.test_session() as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + saver.restore(sess, full_ckpt_path) + restore_full_ckpt_value = sess.run([emb]) + print("restore_full_ckpt: {}".format(restore_full_ckpt_value)) + incr_saver.incremental_restore(sess, full_ckpt_path, incr_ckpt_path) + restore_incr_ckpt_value = sess.run([emb]) + print("restore_incr_ckpt: {}".format(restore_incr_ckpt_value)) + self.assertAllClose(full_ckpt_value, restore_full_ckpt_value) + self.assertAllClose(incr_ckpt_value, restore_incr_ckpt_value) + if __name__ == "__main__": googletest.main() From d5f7f6ad77a59b70679835009dbe31add175dba3 Mon Sep 17 00:00:00 2001 From: "Secret.Sun" Date: Wed, 10 Apr 2024 14:41:50 +0800 Subject: [PATCH 37/45] [Runtime] Remove read limit of ReadBinaryProto. (#981) Signed-off-by: Secret.Sun --- tensorflow/core/platform/env.cc | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc index ac91b79a07f..b835677627a 100644 --- a/tensorflow/core/platform/env.cc +++ b/tensorflow/core/platform/env.cc @@ -508,14 +508,7 @@ Status ReadBinaryProto(Env* env, const string& fname, TF_RETURN_IF_ERROR(env->NewRandomAccessFile(fname, &file)); std::unique_ptr stream(new FileStream(file.get())); - // TODO(jiayq): the following coded stream is for debugging purposes to allow - // one to parse arbitrarily large messages for MessageLite. One most likely - // doesn't want to put protobufs larger than 64MB on Android, so we should - // eventually remove this and quit loud when a large protobuf is passed in. ::tensorflow::protobuf::io::CodedInputStream coded_stream(stream.get()); - // Total bytes hard limit / warning limit are set to 1GB and 512MB - // respectively. 
- coded_stream.SetTotalBytesLimit(1024LL << 20, 512LL << 20); if (!proto->ParseFromCodedStream(&coded_stream) || !coded_stream.ConsumedEntireMessage()) { From a4489e31a4b9bc8371198537a0a15af6011ef8ae Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Fri, 12 Apr 2024 14:22:32 +0800 Subject: [PATCH 38/45] [EVAllocator] Fix the bug in configuring ARENA_ARRAY_SIZE. (#986) Signed-off-by: chenbangduo.cbd --- tensorflow/core/framework/ev_allocator.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/framework/ev_allocator.h b/tensorflow/core/framework/ev_allocator.h index d3251b14782..5082ee04b72 100644 --- a/tensorflow/core/framework/ev_allocator.h +++ b/tensorflow/core/framework/ev_allocator.h @@ -546,15 +546,15 @@ class EVAllocatorImpl { page_map_ = new PageMap(); page_map_->Init(); - int64 arena_array_size = ARENA_ARRAY_SIZE; + arena_array_size_ = ARENA_ARRAY_SIZE; Status s = ReadInt64FromEnvVar("ARENA_ARRAY_SIZE", - ARENA_ARRAY_SIZE, &arena_array_size); + ARENA_ARRAY_SIZE, &arena_array_size_); if (!s.ok()) { LOG(ERROR) << "Read ARENA_ARRAY_SIZE env error: " << s.error_message(); } - LOG(INFO) << "EVAllocator set arena array size: " << arena_array_size; + LOG(INFO) << "EVAllocator set arena array size: " << arena_array_size_; - arenas_ = new std::vector>(arena_array_size, page_map_); + arenas_ = new std::vector>(arena_array_size_, page_map_); arena_cur_index = 0; } @@ -602,7 +602,7 @@ class EVAllocatorImpl { { mutex_lock l(mu_arena_index_); ret = &((*arenas_)[arena_cur_index]); - arena_cur_index = (arena_cur_index + 1) % ARENA_ARRAY_SIZE; + arena_cur_index = (arena_cur_index + 1) % arena_array_size_; } return ret; @@ -619,6 +619,7 @@ class EVAllocatorImpl { PageMap* page_map_ = nullptr; std::vector> *arenas_ = nullptr; int arena_cur_index GUARDED_BY(mu_arena_index_); + int64 arena_array_size_; }; template From 04413cf0ee6ca57f35446095c4e27bc1cfdf2b0d Mon Sep 17 00:00:00 2001 From: Chaofeng Guo Date: Thu, 18 Apr 2024 19:56:17 
+0800 Subject: [PATCH 39/45] [Embedding] Fix the issue of default_value type mismatch in the EV Gather op. (#989) Signed-off-by: Lyaction --- tensorflow/python/ops/kv_variable_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/kv_variable_ops.py b/tensorflow/python/ops/kv_variable_ops.py index 840aadf2541..55e01537c0d 100644 --- a/tensorflow/python/ops/kv_variable_ops.py +++ b/tensorflow/python/ops/kv_variable_ops.py @@ -858,10 +858,10 @@ def sparse_read(self, indices, name=None, ev_init_value=None, counts=None): if self._trainable: tape.variable_accessed(self) if ev_init_value is not None: - default_value = ev_init_value + default_value = math_ops.cast(ev_init_value, self.dtype) is_use_default_value_tensor = True else: - default_value = ops.convert_to_tensor(1.0) + default_value = ops.convert_to_tensor(1.0, dtype=self.dtype) is_use_default_value_tensor = False if counts != None: value = gen_kv_variable_ops.kv_resource_gather_v1(self._handle, From fc08e1b605490e818cdf80bc2389b68028c19049 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Fri, 26 Apr 2024 11:33:59 +0800 Subject: [PATCH 40/45] [Hook] Add 'before_create_session' interface to SessionRunHook. (#991) Signed-off-by: chenbangduo.cbd --- tensorflow/python/training/monitored_session.py | 3 +++ tensorflow/python/training/session_run_hook.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py index 6eb204785dd..9492028a200 100644 --- a/tensorflow/python/training/monitored_session.py +++ b/tensorflow/python/training/monitored_session.py @@ -957,6 +957,8 @@ def __init__(self, session_creator, hooks, stop_grace_period_secs): def create_session(self): """Creates a coordinated session.""" # Keep the tf_sess for unit testing. 
+ for hook in self._hooks: + hook.before_create_session() self.tf_sess = self._session_creator.create_session() # We don't want coordinator to suppress any exception. self.coord = coordinator.Coordinator(clean_stop_exception_types=[]) @@ -1027,6 +1029,7 @@ class MonitoredSession(_MonitoredSession): in given order: * calls `hook.begin()` for each given hook + * calls `hook.before_create_session()` * finalizes the graph via `scaffold.finalize()` * create session * initializes the model via initialization ops provided by `Scaffold` diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py index e598bc2d98c..9d05d04c139 100644 --- a/tensorflow/python/training/session_run_hook.py +++ b/tensorflow/python/training/session_run_hook.py @@ -109,6 +109,20 @@ def begin(self): """ pass + def before_create_session(self): + """Called before new TensorFlow session is created. + + This has two essential differences with the situation in which `begin` is + called: + + * Do not modify the graph in this method, ops should not be added to graph. + The modification of the graph should take place within the begin + interface. + * This method will also be called prior to the recovery of a wrapped + session, not just at the beginning of the overall session. + """ + pass + def after_create_session(self, session, coord): # pylint: disable=unused-argument """Called when new TensorFlow session is created. From e10d4411dfb93ca47f6e1908ac878d1417c7db58 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Mon, 29 Apr 2024 17:18:35 +0800 Subject: [PATCH 41/45] [Docs] Fix readthedoc build fail. 
(#993) - Add configure file: docs/docs_zh/.readthedocs.yaml docs/docs_en/.readthedocs.yaml Signed-off-by: Chen Ding --- docs/docs_en/.readthedocs.yaml | 35 ++++++++++++++++++++++++++++++++++ docs/docs_zh/.readthedocs.yaml | 35 ++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 docs/docs_en/.readthedocs.yaml create mode 100644 docs/docs_zh/.readthedocs.yaml diff --git a/docs/docs_en/.readthedocs.yaml b/docs/docs_en/.readthedocs.yaml new file mode 100644 index 00000000000..c69bbd13812 --- /dev/null +++ b/docs/docs_en/.readthedocs.yaml @@ -0,0 +1,35 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/docs_en/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/docs_en/requirements.txt diff --git a/docs/docs_zh/.readthedocs.yaml b/docs/docs_zh/.readthedocs.yaml new file mode 100644 index 00000000000..859db8adfa5 --- /dev/null +++ b/docs/docs_zh/.readthedocs.yaml @@ -0,0 +1,35 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + 
+# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/docs_zh/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/docs_zh/requirements.txt From b2aed9686182124fca72f8093e74136cc13dcd39 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Tue, 14 May 2024 10:43:13 +0800 Subject: [PATCH 42/45] [Embedding] Change the log level for EV restore. (#995) Signed-off-by: chenbangduo.cbd --- tensorflow/core/kernels/kv_variable_restore_ops.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/kv_variable_restore_ops.cc b/tensorflow/core/kernels/kv_variable_restore_ops.cc index e16db9b4cd6..0a0165595f0 100644 --- a/tensorflow/core/kernels/kv_variable_restore_ops.cc +++ b/tensorflow/core/kernels/kv_variable_restore_ops.cc @@ -376,8 +376,8 @@ class KvResourceImportV3Op: public AsyncOpKernel { // EV should not be initialized at this time. 
if (ev->IsInitialized()) { - LOG(ERROR) << "Import parameter for EV (" << name_string - << ") failed, this EV has already been initialized."; + LOG(WARNING) << "EV (" << name_string + << ") has already been initialized."; } auto do_compute = [this, context, file_name_string, ev, From 93c69ad9576d6ee0f7b9479bef9b091451e5b91a Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Tue, 21 May 2024 19:26:07 +0800 Subject: [PATCH 43/45] [Rendezvous] RemoteRendezvous supports FlowControl. (#994) Signed-off-by: chenbangduo.cbd --- .../base_rendezvous_mgr.cc | 213 ++++++++++++++- .../distributed_runtime/base_rendezvous_mgr.h | 45 ++++ .../rendezvous_mgr_interface.h | 11 +- .../rpc/grpc_remote_worker.cc | 10 + .../rpc/grpc_worker_interface.h | 6 + .../rpc/grpc_worker_service.cc | 162 ++++++++++++ .../rpc/grpc_worker_service.h | 4 + .../rpc/grpc_worker_service_impl.cc | 2 + .../rpc/grpc_worker_service_impl.h | 1 + .../rpc/rpc_rendezvous_mgr.cc | 245 ++++++++++++++++++ .../rpc/rpc_rendezvous_mgr_test.cc | 26 ++ tensorflow/core/framework/rendezvous.cc | 41 +++ tensorflow/core/framework/rendezvous.h | 26 ++ .../core/kernels/file_slice_sendrecv_ops.cc | 20 +- .../core/kernels/file_slice_sendrecv_ops.h | 2 + .../kernels/file_slice_sendrecv_ops_test.cc | 13 + tensorflow/core/kernels/slice_sendrecv_ops.cc | 40 +-- tensorflow/core/kernels/slice_sendrecv_ops.h | 2 + .../core/kernels/slice_sendrecv_ops_test.cc | 13 + tensorflow/core/protobuf/worker.proto | 46 ++++ tensorflow/core/protobuf/worker_service.proto | 5 + 21 files changed, 903 insertions(+), 30 deletions(-) diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc index 17935eb8982..ead121b30c8 100644 --- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc @@ -34,11 +34,13 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/env_var.h" namespace tensorflow { namespace { uint64 kGlobalStepId = 0x100000000000000uLL; + int64 kFlowControlMaxSize = 16; } // namespace anonymous static void StartAbortRendevous(Rendezvous* rendez, const Status& s) { @@ -127,6 +129,23 @@ void BaseRendezvousMgr::FuseRecvLocalAsync( rendez->FuseRecvLocalAsync(parsed_keys, std::move(done_cb)); } +void BaseRendezvousMgr::FlowControlRecvLocalAsync(int64 step_id, + const StringPiece& tag, const Rendezvous::ParsedKey& parsed, + Rendezvous::DoneCallback done) { + auto rendez = FindOrCreate(step_id); + using namespace std::placeholders; + Rendezvous::DoneCallback done_cb = std::bind( + [rendez](Rendezvous::DoneCallback done, + // Begin unbound arguments. + const Status& s, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& v, bool dead) { + rendez->Unref(); + done(s, send_args, recv_args, v, dead); + }, + std::move(done), _1, _2, _3, _4, _5); + rendez->FlowControlRecvLocalAsync(tag, parsed, std::move(done_cb)); +} + void BaseRendezvousMgr::Cleanup(int64 step_id) { Rendezvous* rendez = nullptr; { @@ -174,7 +193,17 @@ BaseRemoteRendezvous::BaseRemoteRendezvous(const WorkerEnv* env, int64 step_id) : env_(env), step_id_(step_id), local_(NewLocalRendezvous()), - session_(nullptr) {} + session_(nullptr), + flow_control_num_(0) { + Status s = ReadInt64FromEnvVar("REMOTE_RENDEZVOUS_FLOW_CONTROL_MAX_SIZE", + kFlowControlMaxSize, &flow_control_max_size_); + if (!s.ok()) { + LOG(ERROR) << "Read REMOTE_RENDEZVOUS_FLOW_CONTROL_MAX_SIZE env error: " + << s.error_message(); + } + VLOG(2) << "BaseRemoteRendezvous set flow control max size: " + << flow_control_max_size_; +} BaseRemoteRendezvous::~BaseRemoteRendezvous() { CHECK(active_.empty()); @@ -221,6 +250,16 @@ Status BaseRemoteRendezvous::Initialize(WorkerSession* session) { 
std::move(fuse_call.done)); } + std::vector deferred_flow_control_calls; + { + mutex_lock l(mu_); + std::swap(deferred_flow_control_calls, deferred_flow_control_calls_); + } + for (auto& fc_call : deferred_flow_control_calls) { + FlowControlRecvLocalAsyncInternal(fc_call.tag, fc_call.parsed, + std::move(fc_call.done)); + } + return Status::OK(); } @@ -271,6 +310,43 @@ Status BaseRemoteRendezvous::Send(const ParsedKey& parsed, return local_->Send(parsed, args, val, mu, is_dead); } +Status BaseRemoteRendezvous::FlowControlSend(const StringPiece& tag, + const ParsedKey& parsed, + const Args& args, + const Tensor& val, + const bool is_dead, + const int64 timeout_millis) { + VLOG(1) << "BaseRemoteRendezvous FlowControlSend " << this << " " + << parsed.FullKey(); + const std::string tag_string(tag.data(), tag.size()); + { + mutex_lock l(mu_); + while(status_.ok() && flow_control_num_ >= flow_control_max_size_) { + if (flow_control_cv_.wait_for( + l, std::chrono::milliseconds(timeout_millis)) == \ + std::cv_status::timeout) { + return errors::DeadlineExceeded("FlowControlSend has timed out."); + } + } + + if (!status_.ok()) return status_; + DCHECK(is_initialized_locked()); + if (!IsLocalDevice(session_->worker_name, parsed.src_device)) { + return errors::InvalidArgument( + "Invalid rendezvous key (src): ", parsed.FullKey(), " @ ", + session_->worker_name); + } + + flow_control_num_++; + if (flow_control_counters_.count(tag_string) == 0) { + flow_control_counters_[tag_string] = 0; + } + flow_control_counters_[tag_string]++; + } + // Buffers "val" and "device_context" in local_. 
+ return local_->Send(parsed, args, val, is_dead); +} + Status BaseRemoteRendezvous::ValidateDevices(const ParsedKey& parsed, bool is_src) { // Cache session pointer to avoid repeatedly taking & releasing the lock @@ -413,6 +489,63 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed, } } +void BaseRemoteRendezvous::FlowControlRecvAsync(const StringPiece& tag, + const ParsedKey& parsed, + const Args& recv_args, + DoneCallback done) { + VLOG(1) << "RemoteRendezvous FlowControlRecvAsync " << this + << " " << tag << " " << parsed.FullKey(); + + Status s = ValidateDevices(parsed, false /*!is_src*/); + if (s.ok() && !is_initialized()) { + s.Update(errors::Internal( + "FlowControlRecvAsync called when uninitialized (key:", + parsed.FullKey(), ").")); + } + if (!s.ok()) { + done(s, Args(), recv_args, Tensor(), false); + return; + } + + // Are src and dst in the same worker? + if (IsSameWorker(parsed.src, parsed.dst)) { + // Recv the tensor from local_. + local_->RecvAsync( + parsed, recv_args, + [this, tag, parsed, done]( + const Status& status, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& in, bool is_dead) { + VLOG(2) << "RemoteRendezvous Finished Recv " << this << " " + << parsed.FullKey(); + Tensor* out = new Tensor; + StatusCallback final_callback = [done, send_args, recv_args, out, + is_dead](const Status& s) { + done(s, send_args, recv_args, *out, is_dead); + delete out; + }; + + if (status.ok()) { + SameWorkerRecvDone(parsed, send_args, recv_args, in, out, + std::move(final_callback)); + const std::string tag_string(tag.data(), tag.size()); + { + mutex_lock l(mu_); + flow_control_num_--; + DCHECK(flow_control_counters_.count(tag_string) != 0); + flow_control_counters_[tag_string]--; + } + flow_control_cv_.notify_one(); + } else { + final_callback(status); + } + }); + return; + } else { + FlowControlRecvFromRemoteAsync(tag, parsed, recv_args, std::move(done)); + } + +} + void 
BaseRemoteRendezvous::RecvLocalAsync(const ParsedKey& parsed, DoneCallback done) { { @@ -600,6 +733,58 @@ void BaseRemoteRendezvous::FuseRecvLocalAsyncInternal( } } +void BaseRemoteRendezvous::FlowControlRecvLocalAsync(const StringPiece& tag, + const ParsedKey& parsed, + DoneCallback done) { + { + mutex_lock l(mu_); + if (!is_initialized_locked()) { + // FlowControlRecvLocalAsync can be called (due to an incoming RecvTensor + // RPC from a remote worker) before the RunStep (or PartialRunStep) RPC + // from the master arrives. RecvLocalAsync thus buffers the arguments + // until after the RemoteRendezvous is Initialize()'d, when it completes + // the rendezvous logic. At some point after Initialize() is called, a + // Tensor is produced locally that will then be sent in response to the + // incoming RPC. + DeferredFlowControlCall call(tag, parsed, std::move(done)); + deferred_flow_control_calls_.push_back(call); + return; + } + } + FlowControlRecvLocalAsyncInternal(tag, parsed, std::move(done)); +} + +void BaseRemoteRendezvous::FlowControlRecvLocalAsyncInternal( + const StringPiece& tag, const ParsedKey& parsed, DoneCallback done) { + Status s = ValidateDevices(parsed, true /* is_src */); + if (!s.ok()) { + done(s, Args(), Args(), Tensor(), false); + return; + } + + using namespace std::placeholders; + Rendezvous::DoneCallback done_cb = std::bind( + [this, tag](Rendezvous::DoneCallback done, + // Begin unbound arguments. 
+ const Status& s, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& v, bool dead) { + done(s, send_args, recv_args, v, dead); + if (s.ok()) { + const std::string tag_string(tag.data(), tag.size()); + { + mutex_lock l(mu_); + flow_control_num_--; + DCHECK(flow_control_counters_.count(tag_string) != 0); + flow_control_counters_[tag_string]--; + } + flow_control_cv_.notify_one(); + } + }, + std::move(done), _1, _2, _3, _4, _5); + + local_->RecvAsync(parsed, Args(), std::move(done_cb)); +} + void BaseRemoteRendezvous::FuseRecvFromRemoteAsync( const std::vector& parsed_keys, const Rendezvous::Args& args, @@ -607,6 +792,12 @@ void BaseRemoteRendezvous::FuseRecvFromRemoteAsync( CHECK(false) << "FuseRecvFromRemoteAsync Unimplemented"; } +void BaseRemoteRendezvous::FlowControlRecvFromRemoteAsync( + const StringPiece& tag, const Rendezvous::ParsedKey& parsed, + const Rendezvous::Args& args, DoneCallback done) { + CHECK(false) << "FlowControlRecvFromRemoteAsync Unimplemented."; +} + void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed, const Rendezvous::Args& recv_args, RefDoneCallback done) { @@ -636,6 +827,19 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed, } } +int64 BaseRemoteRendezvous::GetAllFlowControlItemNum() { + mutex_lock l(mu_); + return flow_control_num_; +} + +int64 BaseRemoteRendezvous::GetFlowControlItemNum(StringPiece tag) { + const std::string tag_string(tag.data(), tag.size()); + mutex_lock l(mu_); + if (flow_control_counters_.count(tag_string) == 0) + return 0; + return flow_control_counters_[tag_string]; +} + void BaseRemoteRendezvous::StartAbort(const Status& s) { CHECK(!s.ok()); // Use a "derived" status as the status for the rendezvous. 
Derived @@ -656,7 +860,10 @@ void BaseRemoteRendezvous::StartAbort(const Status& s) { } active_.clear(); } + flow_control_num_ = 0; + flow_control_counters_.clear(); } + flow_control_cv_.notify_all(); } void BaseRemoteRendezvous::RegisterCall(BaseRecvTensorCall* call, @@ -707,4 +914,8 @@ BaseRemoteRendezvous::DeferredFuseCall::DeferredFuseCall( const std::vector& parsed_keys, FuseDoneCallback done) : parsed_keys(parsed_keys), done(std::move(done)) {} +BaseRemoteRendezvous::DeferredFlowControlCall::DeferredFlowControlCall( + const StringPiece& tag, const ParsedKey& parsed, DoneCallback done) + : tag(tag), parsed(parsed), done(std::move(done)) {} + } // end namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h index b65e59436c0..fc72d9bedfc 100644 --- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h +++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_BASE_RENDEZVOUS_MGR_H_ #include +#include #include #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h" @@ -86,6 +87,10 @@ class BaseRendezvousMgr : public RendezvousMgrInterface { const std::vector& parsed_keys, Rendezvous::FuseDoneCallback done) override; + void FlowControlRecvLocalAsync(int64 step_id, const StringPiece& tag, + const Rendezvous::ParsedKey& parsed, + Rendezvous::DoneCallback done) override; + // Removes rendezvous for "step_id". 
// // TODO(zhifengc): Have a background thread in worker that @@ -140,6 +145,11 @@ class BaseRemoteRendezvous : public RemoteRendezvous { Status Send(const ParsedKey& key, const Rendezvous::Args& args, Tensor* val, mutex* mu, const bool is_dead) override; + Status FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead, + const int64 timeout_millis) override; + // This method is called only by the RecvOp. It tests to see // whether the value will be produced by a local or remote device // and handles accordingly. In the local case it forwards to @@ -147,6 +157,10 @@ class BaseRemoteRendezvous : public RemoteRendezvous { void RecvAsync(const ParsedKey& key, const Rendezvous::Args& args, DoneCallback done) override; + void FlowControlRecvAsync(const StringPiece& tag, + const ParsedKey& parsed_key, + const Args& args, DoneCallback done) override; + void StartAbort(const Status& status) override; // This method is called only by the local Worker, forwarded through @@ -171,10 +185,18 @@ class BaseRemoteRendezvous : public RemoteRendezvous { void FuseRecvLocalSync(const std::vector& parsed_keys, FuseDoneCallback done); + void FlowControlRecvLocalAsync(const StringPiece& tag, + const ParsedKey& parsed, DoneCallback done); + // For ref send/recv void RecvAsync(const ParsedKey& key, const Rendezvous::Args& args, RefDoneCallback done) override; + // Obtain statistical information + int64 GetAllFlowControlItemNum() override; + + int64 GetFlowControlItemNum(StringPiece tag) override; + protected: virtual void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& args, @@ -185,6 +207,10 @@ class BaseRemoteRendezvous : public RemoteRendezvous { const Rendezvous::Args& args, FuseDoneCallback done); + virtual void FlowControlRecvFromRemoteAsync(const StringPiece& tag, + const Rendezvous::ParsedKey& parsed, + const Rendezvous::Args& args, DoneCallback done); + // Returns true if "src" and 
"dst" are located in the same worker, // and hence may use a local rendezvous. virtual bool IsSameWorker(DeviceNameUtils::ParsedName src, @@ -210,6 +236,12 @@ class BaseRemoteRendezvous : public RemoteRendezvous { mutable mutex mu_; + // For Flow Control. + int64 flow_control_max_size_; + int64 flow_control_num_ GUARDED_BY(mu_); + std::unordered_map flow_control_counters_ GUARDED_BY(mu_); + tensorflow::condition_variable flow_control_cv_; + // Status given by StartAbort() if any. Status status_ GUARDED_BY(mu_); WorkerSession* session_ GUARDED_BY(mu_); // Not owned. @@ -233,6 +265,16 @@ class BaseRemoteRendezvous : public RemoteRendezvous { }; std::vector deferred_fuse_calls_ GUARDED_BY(mu_); + struct DeferredFlowControlCall { + const StringPiece tag; + const ParsedKey parsed; + DoneCallback done; + + DeferredFlowControlCall(const StringPiece& tag, const ParsedKey& parsed, + DoneCallback done); + }; + std::vector deferred_flow_control_calls_ GUARDED_BY(mu_); + typedef std::function InactiveCallback; // Active outstanding RecvTensor calls. @@ -262,6 +304,9 @@ class BaseRemoteRendezvous : public RemoteRendezvous { void FuseRecvLocalAsyncInternal(const std::vector& parsed_keys, FuseDoneCallback done); + void FlowControlRecvLocalAsyncInternal(const StringPiece& tag, + const ParsedKey& parsed, + DoneCallback done); TF_DISALLOW_COPY_AND_ASSIGN(BaseRemoteRendezvous); }; diff --git a/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h b/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h index caf4af97ac2..abc971c4552 100644 --- a/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h +++ b/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h @@ -40,6 +40,11 @@ class RemoteRendezvous : public Rendezvous { public: // Fully construct the RemoteRendezvous. 
virtual Status Initialize(WorkerSession* session) = 0; + + // Obtain statistical information + virtual int64 GetAllFlowControlItemNum() = 0; + + virtual int64 GetFlowControlItemNum(StringPiece tag) = 0; }; // RendezvousMgr keeps track of a set of local rendezvous instances. @@ -87,7 +92,11 @@ class RendezvousMgrInterface { virtual void FuseRecvLocalAsync( int64 step_id, const std::vector& parsed_keys, - Rendezvous::FuseDoneCallback done) = 0; + Rendezvous::FuseDoneCallback done) = 0; + + virtual void FlowControlRecvLocalAsync(int64 step_id, const StringPiece& tag, + const Rendezvous::ParsedKey& parsed, + Rendezvous::DoneCallback done) = 0; // Removes rendezvous for "step_id". // diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc index ba95e80b496..c3fb6a8ee6c 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc @@ -63,6 +63,7 @@ class GrpcRemoteWorker : cleanupall_(Method(GrpcWorkerMethod::kCleanupAll)), recvtensor_(Method(GrpcWorkerMethod::kRecvTensor)), fuserecvtensor_(Method(GrpcWorkerMethod::kFuseRecvTensor)), + flowcontrolrecvtensor_(Method(GrpcWorkerMethod::kFlowControlRecvTensor)), recvbuf_(Method(GrpcWorkerMethod::kRecvBuf)), logging_(Method(GrpcWorkerMethod::kLogging)), tracing_(Method(GrpcWorkerMethod::kTracing)), @@ -210,6 +211,14 @@ class GrpcRemoteWorker : IssueRequest(request, response, fuserecvtensor_, done, call_opts); } + void FlowControlRecvTensorAsync(CallOptions* call_opts, + const FlowControlRecvTensorRequest* request, + TensorResponse* response, + StatusCallback done) { + VLOG(1) << "FlowControlRecvTensorAsync req: " << request->DebugString(); + IssueRequest(request, response, flowcontrolrecvtensor_, done, call_opts); + } + void RecvTensorAsync(CallOptions* call_opts, const RecvTensorRequest* request, TensorResponse* response, StatusCallback done) override { VLOG(1) << 
"RecvTensorAsync req: " << request->DebugString(); @@ -341,6 +350,7 @@ class GrpcRemoteWorker : const ::grpc::string cleanupall_; const ::grpc::string recvtensor_; const ::grpc::string fuserecvtensor_; + const ::grpc::string flowcontrolrecvtensor_; const ::grpc::string recvbuf_; const ::grpc::string logging_; const ::grpc::string tracing_; diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_interface.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_interface.h index 20f1d2b5a62..2c885fec75d 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_interface.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_interface.h @@ -6,6 +6,8 @@ namespace tensorflow { class CallOptions; class FuseTensorResponse; class FuseRecvTensorRequest; +class FlowControlRecvTensorRequest; +class TensorResponse; class GrpcWorkerInterface { public: @@ -13,6 +15,10 @@ class GrpcWorkerInterface { const FuseRecvTensorRequest* request, FuseTensorResponse* response, StatusCallback done) = 0; + + virtual void FlowControlRecvTensorAsync(CallOptions* call_opts, + const FlowControlRecvTensorRequest* request, + TensorResponse* response, StatusCallback done) = 0; }; } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc index ef4fbeab438..3bdacc29a12 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc @@ -170,6 +170,15 @@ class GrpcWorkerServiceThread { EnqueueFuseRecvTensorRequestRaw(); } + // Support FlowControlRecv + for (int i = 0; + i < gtl::FindWithDefault( + queue_depth_, static_cast(GrpcWorkerMethod::kFlowControlRecvTensor), + 1000); + ++i) { + EnqueueFlowControlRecvTensorRequestRaw(); + } + void* tag; bool ok; @@ -312,6 +321,24 @@ class GrpcWorkerServiceThread { EnqueueFuseRecvTensorRequestRaw(); } + void FlowControlRecvTensorHandlerRaw( + WorkerCall* 
call) { + Schedule([this, call]() { + CallOptions* call_opts = new CallOptions; + call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); }); + + worker_->GrpcFlowControlRecvTensorAsync(call_opts, &call->request, + &call->response, + [call, call_opts + ](const Status& s) { + call->ClearCancelCallback(); + delete call_opts; + call->SendResponse(ToGrpcStatus(s)); + }); + }); + EnqueueFlowControlRecvTensorRequestRaw(); + } + void RecvBufHandler(WorkerCall* call) { Schedule([this, call]() { CallOptions* call_opts = new CallOptions; @@ -394,6 +421,19 @@ class GrpcWorkerServiceThread { } } + void EnqueueFlowControlRecvTensorRequestRaw() { + mutex_lock l(shutdown_mu_); + if (!is_shutdown_) { + Call:: + EnqueueRequestForMethod( + worker_service_, cq_.get(), + static_cast(GrpcWorkerMethod::kFlowControlRecvTensor), + &GrpcWorkerServiceThread::FlowControlRecvTensorHandlerRaw, + true /* supports cancel*/); + } + } + GrpcWorker* const worker_ = nullptr; // Not owned. std::unique_ptr<::grpc::ServerCompletionQueue> cq_; std::unique_ptr thread_; @@ -746,6 +786,128 @@ void GrpcWorker::GrpcFuseRecvTensorAsync(CallOptions* opts, }); } +// GrpcFlowControlRecvTensorAsync: unlike the other Worker methods, which use +// protocol buffers for a response object, to avoid extra protocol buffer +// serialization overhead we generate our response directly into a +// ::grpc::ByteBuffer object +void GrpcWorker::GrpcFlowControlRecvTensorAsync(CallOptions* opts, + const FlowControlRecvTensorRequest* request, + ::grpc::ByteBuffer* response, StatusCallback done) { + VLOG(1) << "GrpcFlowControlRecvTensorAsync req: " << request->DebugString(); + const int64 request_id = request->request_id(); + const int64 step_id = request->step_id(); + + bool cache_enabled = (response_cache_ != nullptr && request_id != 0); + + auto do_response = [response, done, cache_enabled](const Tensor& tensor, + bool is_dead, + const Status& status) { + if (status.ok()) { + grpc::EncodeTensorToByteBuffer(is_dead, 
tensor, cache_enabled, response); + } + done(status); + }; + + // If response cache is enabled and the response cache already contains the + // request, we delegate this retry request to the response cache. Otherwise, + // we add the request to the response cache and start the computation to + // retrieve the requested data. + if (cache_enabled && + response_cache_->QueueRequest(request_id, step_id, do_response)) { + return; + } + + auto rendezvous_done = [this, request_id, do_response, cache_enabled]( + const Tensor& tensor, bool is_dead, + const Status& status) { + if (cache_enabled) { + // Data is ready. Process all pending requests in the response cache. + response_cache_->OnRequestFinished(request_id, tensor, is_dead, status); + } else { + do_response(tensor, is_dead, status); + } + }; + + auto fail = [&rendezvous_done](const Status& status) { + rendezvous_done(Tensor(), false, status); + }; + + Status s = recent_request_ids_.TrackUnique( + request_id, "RecvTensor (GrpcWorker)", *request); + if (!s.ok()) { + fail(s); + return; + } + + const string& key = request->rendezvous_key(); + TRACEPRINTF("RecvTensor: %lld %s", step_id, key.c_str()); + Rendezvous::ParsedKey parsed; + s = Rendezvous::ParseKey(key, &parsed); + Device* src_dev = nullptr; + if (s.ok()) { + s = PrepareRecvTensor(parsed, &src_dev); + } + if (!s.ok()) { + fail(s); + return; + } + + // Request the tensor associated with the rendezvous key. + // Note that we log the cancellation here but do not abort the current step. + // gRPC can generate cancellations in response to transient network failures, + // and aborting the step eliminates the opportunity for client side retries. + // Repeated client failures will eventually cause the step to be aborted by + // the client. 
+ opts->SetCancelCallback( + [step_id]() { LOG(WARNING) << "RecvTensor cancelled for " << step_id; }); + StringPiece tag = request->tag(); + env_->rendezvous_mgr->FlowControlRecvLocalAsync( + step_id, tag, parsed, + [opts, rendezvous_done, src_dev, request]( + const Status& status, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& val, + const bool is_dead) { + opts->ClearCancelCallback(); + if (status.ok()) { + // DMA can only be used for Tensors that do not fall into + // the following three odd edge cases: 1) a zero-size + // buffer, 2) a dead tensor which has an uninit value, and + // 3) the tensor has the on_host allocation attribute, + // i.e. it's in CPU RAM *independent of its assigned + // device type*. + const bool on_host = send_args.alloc_attrs.on_host(); + { + // Non-DMA cases. + if (src_dev->tensorflow_gpu_device_info() && (!on_host)) { + DeviceContext* send_dev_context = send_args.device_context; + AllocatorAttributes alloc_attrs; + alloc_attrs.set_gpu_compatible(true); + alloc_attrs.set_on_host(true); + Allocator* alloc = src_dev->GetAllocator(alloc_attrs); + Tensor* copy = new Tensor(alloc, val.dtype(), val.shape()); + CHECK(send_dev_context) + << "send dev name: " << src_dev->name() + << " gpu_info: " << src_dev->tensorflow_gpu_device_info(); + // "val" is on an accelerator device. Uses the device_context to + // fill the copy on host. + StatusCallback copy_ready = [rendezvous_done, copy, + is_dead](const Status& s) { + // The value is now ready to be returned on the wire. 
+ rendezvous_done(*copy, is_dead, s); + delete copy; + }; + + CopyDeviceToHost(&val, alloc, alloc, request->rendezvous_key(), + src_dev, copy, send_dev_context, copy_ready); + return; + } + } + } + + rendezvous_done(val, is_dead, status); + }); +} + namespace { // If RecvBufRespExtra.tensor_content is a single large string, then gRPC // can stall on the recv side when the string buffer needs to be enlarged, diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h index 69759c420cc..48941d438c9 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h @@ -51,6 +51,10 @@ class GrpcWorker : public Worker { ::grpc::ByteBuffer* response, StatusCallback done); + virtual void GrpcFlowControlRecvTensorAsync(CallOptions* opts, + const FlowControlRecvTensorRequest* request, + ::grpc::ByteBuffer* response, StatusCallback done); + void LoggingAsync(const LoggingRequest* request, LoggingResponse* response, StatusCallback done) override; diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc index 515d6e90beb..2095540e36a 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc @@ -48,6 +48,8 @@ const char* GrpcWorkerMethodName(GrpcWorkerMethod id) { return "/tensorflow.WorkerService/RecvTensor"; case GrpcWorkerMethod::kFuseRecvTensor: return "/tensorflow.WorkerService/FuseRecvTensor"; + case GrpcWorkerMethod::kFlowControlRecvTensor: + return "/tensorflow.WorkerService/FlowControlRecvTensor"; case GrpcWorkerMethod::kRecvBuf: return "/tensorflow.WorkerService/RecvBuf"; case GrpcWorkerMethod::kLogging: diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h 
b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h index ff8e1c07cb4..ad77ee0fd80 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h @@ -80,6 +80,7 @@ enum class GrpcWorkerMethod { kCleanupAll, kRecvTensor, kFuseRecvTensor, + kFlowControlRecvTensor, kRecvBuf, kLogging, kTracing, diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc index 69f1481f59e..267bf09e66f 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc @@ -53,6 +53,10 @@ class RpcRemoteRendezvous : public BaseRemoteRendezvous { const Rendezvous::Args& args, FuseDoneCallback done) override; + void FlowControlRecvFromRemoteAsync(const StringPiece& tag, + const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& recv_args, + DoneCallback done) override; + private: ~RpcRemoteRendezvous() override {} @@ -529,6 +533,247 @@ void RpcRemoteRendezvous::FuseRecvFromRemoteAsync( }); } + + +class FlowControlRpcRecvTensorCall : public BaseRecvTensorCall { + public: + FlowControlRpcRecvTensorCall() + : wi_(nullptr), dst_device_(nullptr) {} + + void Init(WorkerInterface* wi, int64 step_id, const StringPiece& tag, + const StringPiece& key, AllocatorAttributes alloc_attrs, + Device* dst_device, const Rendezvous::Args& recv_args, + Rendezvous::DoneCallback done) { + wi_ = wi; + grpc_wi_ = dynamic_cast(wi_); + alloc_attrs_ = alloc_attrs; + dst_device_ = dst_device; + recv_args_ = recv_args; + done_ = std::move(done); + req_.set_step_id(step_id); + req_.set_tag(tag.data(), tag.size()); + req_.set_request_id(GetUniqueRequestId()); + req_.set_rendezvous_key(key.data(), key.size()); + } + + void Reset() { + // The FlowControlRpcRemoteRendezvous using this object is responsible for + // calling ReleaseWorker() before Reset(). 
+ DCHECK_EQ(static_cast(nullptr), wi_) + << "Leaking WorkerInterface in RpcRecvTensorCall::Reset()."; + + alloc_attrs_ = AllocatorAttributes(); + dst_device_ = nullptr; + // We don't clear opts_ and assume that Init will set up the state for + // opts_ appropriately. + req_.Clear(); + resp_.Clear(); + { + mutex_lock l(mu_); + status_ = Status::OK(); + } + done_ = nullptr; + } + + ~FlowControlRpcRecvTensorCall() override { + // Since only the FlowControlRpcRecvTensorFreeList will delete an + // FlowControlRpcRecvTensorCall, and it always sets this->wi_ to null when + // a call object is released to it, we can assert that this->wi_ is + // always null at the point of deletion. + CHECK_EQ(static_cast(nullptr), wi_) + << "Leaking WorkerInterface in FlowControlRpcRecvTensorCall destructor."; + } + + void Start(std::function recv_done) override { + StartRTCall(std::move(recv_done)); + } + + void StartAbort(const Status& s) override { + { + mutex_lock l(mu_); + status_.Update(s); + } + opts_.StartCancel(); + } + + Status status() const override { + mutex_lock l(mu_); + return status_; + } + + void ReleaseWorker(WorkerCacheInterface* worker_cache) { + DCHECK_NE(static_cast(nullptr), wi_) + << "FlowControlRpcRecvTensorCall::ReleaseWorker() called twice."; + worker_cache->ReleaseWorker(src_worker_, wi_); + wi_ = nullptr; + grpc_wi_ = nullptr; + } + + const Tensor& tensor() const { return resp_.tensor(); } + + bool is_dead() const { return resp_.metadata().is_dead(); } + + Device* dst_device() const { return dst_device_; } + const Rendezvous::Args recv_args() const { return recv_args_; } + const Rendezvous::DoneCallback& done() const { return done_; } + + private: + friend class RpcRemoteRendezvous; + + // Start the main RecvTensor call, checking for an async abort. 
+ void StartRTCall(std::function recv_done) { + resp_.InitAlloc(dst_device_, alloc_attrs_); + using namespace std::placeholders; + StatusCallback cb = std::bind( + [this](std::function recv_done, + // Begin unbound arguments. + const Status& s) { + if (!s.ok()) { + mutex_lock l(mu_); + status_.Update(s); + } + recv_done(); + }, + std::move(recv_done), _1); + grpc_wi_->FlowControlRecvTensorAsync(&opts_, &req_, &resp_, std::move(cb)); + } + + string src_worker_; + string src_rel_device_; + WorkerInterface* wi_; // Not owned. + GrpcWorkerInterface* grpc_wi_; + AllocatorAttributes alloc_attrs_; + Device* dst_device_; + CallOptions opts_; + FlowControlRecvTensorRequest req_; + TensorResponse resp_; + Rendezvous::Args recv_args_; + Rendezvous::DoneCallback done_; + + mutable mutex mu_; + Status status_ GUARDED_BY(mu_); + + TF_DISALLOW_COPY_AND_ASSIGN(FlowControlRpcRecvTensorCall); +}; + +class FlowControlRpcRecvTensorFreeList { + public: + FlowControlRpcRecvTensorFreeList() {} + ~FlowControlRpcRecvTensorFreeList() { + for (size_t i = 0; i < objects_.size(); i++) { + delete objects_[i]; + } + } + + FlowControlRpcRecvTensorCall* New() { + { + mutex_lock l(mu_); + if (!objects_.empty()) { + FlowControlRpcRecvTensorCall* result = objects_.back(); + objects_.pop_back(); + return result; + } + } + return new FlowControlRpcRecvTensorCall; + } + + void Release(FlowControlRpcRecvTensorCall* obj) { + obj->Reset(); + { + mutex_lock l(mu_); + if (objects_.size() < kMaxObjects) { + objects_.push_back(obj); + return; + } + } + delete obj; + } + + private: + static const int kMaxObjects = 1000; + + mutex mu_; + std::vector objects_ GUARDED_BY(mu_); +}; + +static FlowControlRpcRecvTensorFreeList* get_flow_control_call_freelist() { + static FlowControlRpcRecvTensorFreeList* call_freelist = \ + new FlowControlRpcRecvTensorFreeList(); + return call_freelist; +} + +void RpcRemoteRendezvous::FlowControlRecvFromRemoteAsync( + const StringPiece& tag, const Rendezvous::ParsedKey& parsed, + 
const Rendezvous::Args& recv_args, DoneCallback done) { + CHECK(is_initialized()); + Status s; + + // Prepare a FlowControlRecvTensor call that can handle being aborted. + FlowControlRpcRecvTensorCall* call = get_flow_control_call_freelist()->New(); + + // key.src_device identifies a remote device. + if (!DeviceNameUtils::SplitDeviceName(parsed.src_device, &call->src_worker_, + &call->src_rel_device_)) { + s = errors::Internal(parsed.src_device, + " is invalid remote source device."); + } + + WorkerSession* sess = session(); + WorkerInterface* rwi = + sess->worker_cache->GetOrCreateWorker(call->src_worker_); + if (s.ok() && rwi == nullptr) { + s = errors::Internal("No worker known as ", call->src_worker_); + } + + Device* dst_device; + if (s.ok()) { + s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device); + } + if (!s.ok()) { + if (rwi != nullptr) { + sess->worker_cache->ReleaseWorker(call->src_worker_, rwi); + } + get_flow_control_call_freelist()->Release(call); + done(s, Args(), recv_args, Tensor{}, false); + return; + } + + call->Init(rwi, step_id_, tag, parsed.FullKey(), recv_args.alloc_attrs, + dst_device, recv_args, std::move(done)); + + // Record "call" in active_ so that it can be aborted cleanly. + RegisterCall(call, recv_args); + + // RendezvousMgr already aborted, shouldn't send RPC call any more + if (!call->status().ok()) { + // NOTE: `*sess` can potentially be deleted before we return from + // `call->done()(...)`, so we must release the worker before calling the + // callback. + call->ReleaseWorker(sess->worker_cache.get()); + call->done()(call->status(), Args(), Args(), Tensor(), false); + get_flow_control_call_freelist()->Release(call); + return; + } + + // Start "call". + Ref(); + call->Start([this, call]() { + // Removes "call" from active_. Prevent StartAbort(). + DeregisterCall(call); + // If StartAbort was called prior to DeregisterCall, then the + // current status should be bad. 
+ Status s = call->status(); + // NOTE: `*session()` can potentially be deleted before we return from + // `call->done()(...)`, so we must release the worker before calling the + // callback. + call->ReleaseWorker(session()->worker_cache.get()); + call->done()(s, Args(), call->recv_args(), call->tensor(), call->is_dead()); + get_flow_control_call_freelist()->Release(call); + Unref(); + }); + +} + } // namespace RpcRendezvousMgr::RpcRendezvousMgr(const WorkerEnv* env) diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc index 5021853ce23..75f41ab3057 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc @@ -211,6 +211,32 @@ TEST_F(RpcRendezvousMgrTest, CleanupAll) { } } +TEST_F(RpcRendezvousMgrTest, FlowControlSend) { + setenv("REMOTE_RENDEZVOUS_FLOW_CONTROL_MAX_SIZE", "2", 1); + const int64 step_id = 123; + const Rendezvous::ParsedKey key = MakeKey(Rendezvous::CreateKey( + "/job:mnist/replica:1/task:2/cpu:0", 7890, + "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0))); + { + RemoteRendezvous* rendez = rmgr_.Find(step_id); + TF_ASSERT_OK(rendez->Initialize(&worker_session_)); + core::ScopedUnref unref(rendez); + Rendezvous::Args args; + TF_ASSERT_OK( + rendez->FlowControlSend("TEST", key, args, V("peach_0"), false)); + TF_ASSERT_OK( + rendez->FlowControlSend("TEST", key, args, V("peach_1"), false)); + + EXPECT_NE( + rendez->FlowControlSend("TEST", key, args, V("peach_2"), false, 100), + Status::OK()); + EXPECT_EQ(rendez->GetAllFlowControlItemNum(), 2); + EXPECT_EQ(rendez->GetFlowControlItemNum("TEST"), 2); + } + + unsetenv("REMOTE_RENDEZVOUS_FLOW_CONTROL_MAX_SIZE"); +} + class DummyDeviceContext : public DeviceContext { public: explicit DummyDeviceContext(int stream_id) : stream_id_(stream_id) {} diff --git a/tensorflow/core/framework/rendezvous.cc 
b/tensorflow/core/framework/rendezvous.cc index e4db066a562..4d1adf1a070 100644 --- a/tensorflow/core/framework/rendezvous.cc +++ b/tensorflow/core/framework/rendezvous.cc @@ -146,6 +146,47 @@ Status Rendezvous::Recv(const ParsedKey& key, const Args& args, Tensor* val, return Recv(key, args, val, is_dead, no_timeout); } +Status Rendezvous::FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead) { + int64 no_timeout = 300000; + return FlowControlSend(tag, key, args, val, is_dead, no_timeout); +} + +Status Rendezvous::FlowControlRecv(const StringPiece& tag, const ParsedKey& key, + const Args& args, Tensor* val, bool* is_dead, + int64 timeout_ms) { + Status ret; + Notification n; + FlowControlRecvAsync(tag, key, args, [&ret, &n, val, is_dead]( + const Status& s, const Args& send_args, + const Args& recv_args, const Tensor& v, + const bool dead) { + ret = s; + *val = v; + *is_dead = dead; + n.Notify(); + }); + if (timeout_ms > 0) { + int64 timeout_us = timeout_ms * 1000; + bool notified = WaitForNotificationWithTimeout(&n, timeout_us); + if (!notified) { + return Status(error::DEADLINE_EXCEEDED, + "Timed out waiting for notification"); + } + } else { + n.WaitForNotification(); + } + return ret; +} + +Status Rendezvous::FlowControlRecv(const StringPiece& tag, const ParsedKey& key, + const Args& args, Tensor* val, + bool* is_dead) { + const int64 no_timeout = 0; + return FlowControlRecv(tag, key, args, val, is_dead, no_timeout); +} + class LocalRendezvousImpl : public Rendezvous { public: explicit LocalRendezvousImpl() {} diff --git a/tensorflow/core/framework/rendezvous.h b/tensorflow/core/framework/rendezvous.h index 3aa65534272..106c0f26b32 100644 --- a/tensorflow/core/framework/rendezvous.h +++ b/tensorflow/core/framework/rendezvous.h @@ -108,6 +108,17 @@ class Rendezvous : public core::RefCounted { virtual Status Send(const ParsedKey& key, const Args& args, Tensor* ref_val, mutex* ref_mu, const bool 
is_dead) { return Status::OK(); } + virtual Status FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead, + const int64 timeout_millis) { + return errors::Unimplemented("[Rendezvous] unimplement FlowControlSend."); + } + + virtual Status FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead); + // Callback provided by a tensor consumer waiting on the rendezvous. // It will be invoked when the tensor is available, or when a non-OK // status arises in the production of that tensor. It also gets @@ -139,12 +150,27 @@ class Rendezvous : public core::RefCounted { virtual void FuseRecvAsync(const std::vector& parsed_keys, const Args& args, FuseDoneCallback done) {} + // Local rendezvous does not need this. + virtual void FlowControlRecvAsync(const StringPiece& tag, + const ParsedKey& parsed_key, const Args& args, + DoneCallback done) { + CHECK(false) << "[Rendezvous] unimplement FlowControlRecvAsync."; + } + // Synchronous wrapper for RecvAsync. Status Recv(const ParsedKey& key, const Args& args, Tensor* val, bool* is_dead, int64 timeout_ms); Status Recv(const ParsedKey& key, const Args& args, Tensor* val, bool* is_dead); + // Synchronous wrapper for FlowControlRecvAsync. + Status FlowControlRecv(const StringPiece& tag, const ParsedKey& key, + const Args& args, Tensor* val, bool* is_dead, + int64 timeout_ms); + + Status FlowControlRecv(const StringPiece& tag, const ParsedKey& key, + const Args& args, Tensor* val, bool* is_dead); + // Aborts all pending and future Send/Recv with the given "status". // // StartAbort() does not wait for ongoing calls to finish. 
diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops.cc b/tensorflow/core/kernels/file_slice_sendrecv_ops.cc index 6bfe54363f9..a919238a5ee 100644 --- a/tensorflow/core/kernels/file_slice_sendrecv_ops.cc +++ b/tensorflow/core/kernels/file_slice_sendrecv_ops.cc @@ -33,11 +33,10 @@ FileSliceSendOp::FileSliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK( ctx, ctx->GetAttr("send_device_incarnation", reinterpret_cast(&send_device_incarnation))); - string tensor_name; - OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_)); key_prefix_ = \ slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, - recv_device, send_device_incarnation, tensor_name); + recv_device, send_device_incarnation, tensor_name_); if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { hostmem_sendrecv_ = false; @@ -212,8 +211,9 @@ Status FileSliceSendOp::SendFileSlice(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "FileSliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, - ctx->is_input_dead())); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlSend(tensor_name_, parsed_key, args, data_t, + ctx->is_input_dead())); } @@ -253,11 +253,10 @@ FileSliceRecvOp::FileSliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK( ctx, ctx->GetAttr("send_device_incarnation", reinterpret_cast(&send_device_incarnation))); - string tensor_name; - OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_)); key_prefix_ = \ slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, - recv_device, send_device_incarnation, tensor_name); + recv_device, send_device_incarnation, tensor_name_); if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { hostmem_sendrecv_ = false; } 
@@ -464,8 +463,9 @@ Status FileSliceRecvOp::RecvFileSlice(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "FileSliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, - &is_dead, timeout_ms_)); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlRecv(tensor_name_, parsed_key, args, + &data_t, &is_dead, timeout_ms_)); // This shouldn't be a dead tensor. CHECK_EQ(is_dead, false); file_ptr->Append(data_t.scalar()()); diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops.h b/tensorflow/core/kernels/file_slice_sendrecv_ops.h index 6701196d481..df7e6c646f8 100644 --- a/tensorflow/core/kernels/file_slice_sendrecv_ops.h +++ b/tensorflow/core/kernels/file_slice_sendrecv_ops.h @@ -28,6 +28,7 @@ class FileSliceSendOp : public OpKernel { private: // Variables. + string tensor_name_; string key_prefix_; bool hostmem_sendrecv_; int32 slice_size_; @@ -63,6 +64,7 @@ class FileSliceRecvOp: public OpKernel { private: // Variables. 
+ string tensor_name_; string key_prefix_; bool hostmem_sendrecv_; string recv_dir_; diff --git a/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc b/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc index 931cd152253..62f5596bb62 100644 --- a/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc +++ b/tensorflow/core/kernels/file_slice_sendrecv_ops_test.cc @@ -50,6 +50,13 @@ class DummyRendezvous : public Rendezvous { kv_.erase(key_str); return Status::OK(); } + + Status FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead) { + return Send(key, args, val, is_dead); + } + void RecvAsync(const ParsedKey& key, const Args& args, DoneCallback done) override { std::string key_str = { key.FullKey().data(), key.FullKey().size() }; @@ -72,6 +79,12 @@ class DummyRendezvous : public Rendezvous { done(Status::OK(), var.args, args, var.data, var.is_dead); kv_.erase(key_str); } + + void FlowControlRecvAsync(const StringPiece& tag, const ParsedKey& parsed_key, + const Args& args, DoneCallback done) { + RecvAsync(parsed_key, args, done); + } + void StartAbort(const Status& status) override {} private: diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.cc b/tensorflow/core/kernels/slice_sendrecv_ops.cc index 25f1a4e8738..ee0e5426cbc 100644 --- a/tensorflow/core/kernels/slice_sendrecv_ops.cc +++ b/tensorflow/core/kernels/slice_sendrecv_ops.cc @@ -30,11 +30,10 @@ SliceSendOp::SliceSendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK( ctx, ctx->GetAttr("send_device_incarnation", reinterpret_cast(&send_device_incarnation))); - string tensor_name; - OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_)); key_prefix_ = \ slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, - recv_device, send_device_incarnation, tensor_name); + recv_device, send_device_incarnation, tensor_name_); if 
(!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { hostmem_sendrecv_ = false; @@ -171,8 +170,9 @@ Status SliceSendOp::SendString(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, - ctx->is_input_dead())); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlSend(tensor_name_, parsed_key, args, + data_t, ctx->is_input_dead())); } else { TF_RETURN_IF_ERROR(SendStringSlice(ctx, frame_iter, elem, i)); } @@ -209,8 +209,9 @@ Status SliceSendOp::SendStringSlice(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, - ctx->is_input_dead())); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlSend(tensor_name_, parsed_key, args, data_t, + ctx->is_input_dead())); } return Status::OK(); @@ -248,8 +249,9 @@ Status SliceSendOp::SendBasicType(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Send(parsed_key, args, data_t, - ctx->is_input_dead())); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlSend(tensor_name_, parsed_key, args, data_t, + ctx->is_input_dead())); } return Status::OK(); @@ -270,11 +272,10 @@ SliceRecvOp::SliceRecvOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK( ctx, ctx->GetAttr("send_device_incarnation", reinterpret_cast(&send_device_incarnation))); - string tensor_name; - OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name_)); key_prefix_ = \ slice_sendrecv::GetSliceRendezvousKeyPrefix(send_device, - recv_device, 
send_device_incarnation, tensor_name); + recv_device, send_device_incarnation, tensor_name_); if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { hostmem_sendrecv_ = false; } @@ -440,8 +441,9 @@ Status SliceRecvOp::RecvString(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, - &is_dead, timeout_ms_)); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlRecv(tensor_name_, parsed_key, args, + &data_t, &is_dead, timeout_ms_)); // This shouldn't be a dead tensor. CHECK_EQ(is_dead, false); output_flat(i) = data_t.scalar()(); @@ -484,8 +486,9 @@ Status SliceRecvOp::RecvStringSlice(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "SliceRecv " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, - &is_dead, timeout_ms_)); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlRecv(tensor_name_, parsed_key, args, + &data_t, &is_dead, timeout_ms_)); // This shouldn't be a dead tensor. CHECK_EQ(is_dead, false); output_flat(index) += data_t.scalar()(); @@ -529,8 +532,9 @@ Status SliceRecvOp::RecvBasicType(OpKernelContext* ctx, frame_iter, &parsed_key.buf_); VLOG(2) << "SliceSend " << parsed_key.buf_; TF_RETURN_IF_ERROR(Rendezvous::ParseKey(parsed_key.buf_, &parsed_key)); - TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv(parsed_key, args, &data_t, - &is_dead, timeout_ms_)); + TF_RETURN_IF_ERROR( + ctx->rendezvous()->FlowControlRecv(tensor_name_, parsed_key, args, + &data_t, &is_dead, timeout_ms_)); // This shouldn't be a dead tensor. 
CHECK_EQ(is_dead, false); auto data_base = data_t.data(); diff --git a/tensorflow/core/kernels/slice_sendrecv_ops.h b/tensorflow/core/kernels/slice_sendrecv_ops.h index 43429bff32f..12e583e5551 100644 --- a/tensorflow/core/kernels/slice_sendrecv_ops.h +++ b/tensorflow/core/kernels/slice_sendrecv_ops.h @@ -28,6 +28,7 @@ class SliceSendOp : public OpKernel { private: // Variables. + string tensor_name_; string key_prefix_; bool hostmem_sendrecv_; int32 slice_size_; @@ -58,6 +59,7 @@ class SliceRecvOp : public OpKernel { private: // Variable. + string tensor_name_; string key_prefix_; bool hostmem_sendrecv_; int32 slice_size_; diff --git a/tensorflow/core/kernels/slice_sendrecv_ops_test.cc b/tensorflow/core/kernels/slice_sendrecv_ops_test.cc index 5693ed57918..0eeb6d98c36 100644 --- a/tensorflow/core/kernels/slice_sendrecv_ops_test.cc +++ b/tensorflow/core/kernels/slice_sendrecv_ops_test.cc @@ -50,6 +50,13 @@ class DummyRendezvous : public Rendezvous { kv_.erase(key_str); return Status::OK(); } + + Status FlowControlSend(const StringPiece& tag, const ParsedKey& key, + const Args& args, const Tensor& val, + const bool is_dead) { + return Send(key, args, val, is_dead); + } + void RecvAsync(const ParsedKey& key, const Args& args, DoneCallback done) override { std::string key_str = { key.FullKey().data(), key.FullKey().size() }; @@ -72,6 +79,12 @@ class DummyRendezvous : public Rendezvous { done(Status::OK(), var.args, args, var.data, var.is_dead); kv_.erase(key_str); } + + void FlowControlRecvAsync(const StringPiece& tag, const ParsedKey& parsed_key, + const Args& args, DoneCallback done) { + RecvAsync(parsed_key, args, done); + } + void StartAbort(const Status& status) override {} private: diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto index 65ec7ffe4bc..fa18fec180c 100644 --- a/tensorflow/core/protobuf/worker.proto +++ b/tensorflow/core/protobuf/worker.proto @@ -441,6 +441,52 @@ message MarkRecvFinishedRequest { message 
MarkRecvFinishedResponse {} +//////////////////////////////////////////////////////////////////////////////// +// +// FlowControlRecvTensor method request messages +// +//////////////////////////////////////////////////////////////////////////////// + +message FlowControlRecvTensorRequest { + // The step in which the tensor will be produced. + // + // REQUIRED: This must eventually correspond to the `step_id` passed + // into a RunGraph call on the same WorkerService. + int64 step_id = 1; + + string tag = 2; + + // A key identifying the channel to receive tensors from. A RecvTensor request + // retrieves one tensor from the channel, but multiple tensors can be sent and + // received over the same channel with multiple RecvTensor requests. See + // rendezvous.h for details. + string rendezvous_key = 3; + + // If true, use an out-of-band DMA mechanism to transfer the + // received tensor. + bool dma_ok = 4; + + // Optional information on client-side device locality. + DeviceLocality client_locality = 5; + + // Optional information on server-side device locality. + DeviceLocality server_locality = 6; + + // Optional information needed by the RPC subsystem. + google.protobuf.Any transport_options = 7; + + // Unique identifier for this request. Every RecvTensorRequest must have a + // unique request_id, and retried RecvTensorRequests must have the same + // request_id. If request_id is zero, retry detection and response cache + // are disabled. + // + // Retried RecvTensorRequests are problematic because a RecvTensor with no + // corresponding sender will wait forever, and the tensor may have been + // delivered to a previous retry. Workers use request_ids to reject retried + // RecvTensor requests instead of waiting forever. 
+ int64 request_id = 8; +} + //////////////////////////////////////////////////////////////////////////////// // // Logging method request/response messages diff --git a/tensorflow/core/protobuf/worker_service.proto b/tensorflow/core/protobuf/worker_service.proto index 07a64c55ad8..8591f2fe6ab 100644 --- a/tensorflow/core/protobuf/worker_service.proto +++ b/tensorflow/core/protobuf/worker_service.proto @@ -72,6 +72,11 @@ service WorkerService { // FuseRecvTensor Method } + // See worker.proto for details. + rpc FlowControlRecvTensor(FlowControlRecvTensorRequest) returns (RecvTensorResponse) { + // FlowControlRecvTensor Method + } + // See worker.proto for details. rpc Logging(LoggingRequest) returns (LoggingResponse); From 9e30ab604aa316359f249bc061b5fe87a5773604 Mon Sep 17 00:00:00 2001 From: Chen Bangduo Date: Thu, 23 May 2024 12:00:02 +0800 Subject: [PATCH 44/45] [Embedding] Check the sharded property of tf.train.Saver. (#996) Signed-off-by: chenbangduo.cbd --- modelzoo/bst/train.py | 3 +- modelzoo/dbmtl/train.py | 3 +- modelzoo/dcn/train.py | 3 +- modelzoo/dcnv2/train.py | 3 +- modelzoo/deepfm/train.py | 3 +- modelzoo/dien/train.py | 3 +- modelzoo/din/train.py | 3 +- modelzoo/dlrm/train.py | 3 +- modelzoo/dssm/train.py | 3 +- modelzoo/esmm/train.py | 3 +- modelzoo/masknet/train.py | 3 +- modelzoo/mlperf/train.py | 3 +- modelzoo/mmoe/train.py | 3 +- modelzoo/ple/train.py | 3 +- modelzoo/simple_multitask/train.py | 3 +- modelzoo/wide_and_deep/train.py | 3 +- .../feature_column/feature_column_v2_test.py | 6 +- .../ops/embedding_variable_ops_gpu_test.py | 7 +- .../python/ops/embedding_variable_ops_test.py | 64 ++++++++++--------- tensorflow/python/training/incr_ckpt_test.py | 5 +- tensorflow/python/training/saver.py | 11 ++++ tensorflow/python/training/saver_test.py | 6 ++ 22 files changed, 76 insertions(+), 71 deletions(-) diff --git a/modelzoo/bst/train.py b/modelzoo/bst/train.py index eeeb136678b..536ddbc6905 100644 --- a/modelzoo/bst/train.py +++ 
b/modelzoo/bst/train.py @@ -612,10 +612,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dbmtl/train.py b/modelzoo/dbmtl/train.py index c848cbc76b2..36f2685a175 100644 --- a/modelzoo/dbmtl/train.py +++ b/modelzoo/dbmtl/train.py @@ -527,10 +527,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dcn/train.py b/modelzoo/dcn/train.py index 44701e22d9f..5094a18bd85 100644 --- a/modelzoo/dcn/train.py +++ b/modelzoo/dcn/train.py @@ -594,10 +594,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dcnv2/train.py b/modelzoo/dcnv2/train.py index 5b572af0425..c1346ad6d7d 100644 --- a/modelzoo/dcnv2/train.py +++ b/modelzoo/dcnv2/train.py @@ -610,10 +610,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - 
sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/deepfm/train.py b/modelzoo/deepfm/train.py index 166bedec0d0..89b2b823a46 100644 --- a/modelzoo/deepfm/train.py +++ b/modelzoo/deepfm/train.py @@ -472,10 +472,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dien/train.py b/modelzoo/dien/train.py index 190695f6ce0..f43fd2f1e73 100644 --- a/modelzoo/dien/train.py +++ b/modelzoo/dien/train.py @@ -776,10 +776,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/din/train.py b/modelzoo/din/train.py index 058583ce6fd..34621dee45e 100644 --- a/modelzoo/din/train.py +++ b/modelzoo/din/train.py @@ -594,10 +594,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( 
local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dlrm/train.py b/modelzoo/dlrm/train.py index cc4c045c349..9dff32aca52 100644 --- a/modelzoo/dlrm/train.py +++ b/modelzoo/dlrm/train.py @@ -507,10 +507,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/dssm/train.py b/modelzoo/dssm/train.py index db949aac5e8..9d2264d9ce9 100644 --- a/modelzoo/dssm/train.py +++ b/modelzoo/dssm/train.py @@ -478,10 +478,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/esmm/train.py b/modelzoo/esmm/train.py index 073b08814d4..1916ed76c27 100755 --- a/modelzoo/esmm/train.py +++ b/modelzoo/esmm/train.py @@ -534,10 +534,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - 
saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=train_steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/masknet/train.py b/modelzoo/masknet/train.py index bb96a467701..bb9eee0ec3f 100644 --- a/modelzoo/masknet/train.py +++ b/modelzoo/masknet/train.py @@ -529,10 +529,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/mlperf/train.py b/modelzoo/mlperf/train.py index ce34fe5e55c..559e4fb6efc 100644 --- a/modelzoo/mlperf/train.py +++ b/modelzoo/mlperf/train.py @@ -522,10 +522,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/mmoe/train.py b/modelzoo/mmoe/train.py index 694eb45da80..a3a6c9146d8 100644 --- a/modelzoo/mmoe/train.py +++ b/modelzoo/mmoe/train.py @@ -523,10 +523,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + 
saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/ple/train.py b/modelzoo/ple/train.py index b2d2f2057ec..33aa9a15e8e 100644 --- a/modelzoo/ple/train.py +++ b/modelzoo/ple/train.py @@ -592,10 +592,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/simple_multitask/train.py b/modelzoo/simple_multitask/train.py index 4ef1874a521..6eb51f7d4e9 100644 --- a/modelzoo/simple_multitask/train.py +++ b/modelzoo/simple_multitask/train.py @@ -427,10 +427,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=train_steps) log_hook = tf.train.LoggingTensorHook( diff --git a/modelzoo/wide_and_deep/train.py b/modelzoo/wide_and_deep/train.py index 3024f58024e..2d1c964e593 100644 --- a/modelzoo/wide_and_deep/train.py +++ b/modelzoo/wide_and_deep/train.py @@ -543,10 +543,9 @@ def train(sess_config, hooks = [] hooks.extend(input_hooks) - sharded_saver = tf_config != None scaffold = tf.train.Scaffold( local_init_op=tf.group(tf.local_variables_initializer(), data_init_op), - saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=sharded_saver)) + 
saver=tf.train.Saver(max_to_keep=args.keep_checkpoint_max, sharded=True)) stop_hook = tf.train.StopAtStepHook(last_step=steps) log_hook = tf.train.LoggingTensorHook( diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index 7946aee1e1a..24f8a36daa4 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -7527,7 +7527,7 @@ def testEmbeddingVariableForL2FeatureEviction(self): opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables_lib.global_variables_initializer() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) @@ -7758,7 +7758,7 @@ def testEmbeddingVariableForSharedEmbeddingColumnsWithPartitionNum(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) init = variables_lib.global_variables_initializer() - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) @test_util.run_deprecated_v1 def testEmbeddingVariableForInt32ID(self): @@ -7783,7 +7783,7 @@ def testEmbeddingVariableForInt32ID(self): opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables_lib.global_variables_initializer() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) diff --git a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py index d47d94d0d99..3c69153ab1b 100644 --- a/tensorflow/python/ops/embedding_variable_ops_gpu_test.py +++ 
b/tensorflow/python/ops/embedding_variable_ops_gpu_test.py @@ -63,7 +63,8 @@ def testEmbeddingVariableForInitFromProto(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) graph = ops.get_default_graph() - meta_graph_def = saver_module.export_meta_graph() + saver = saver_module.Saver(sharded=True) + meta_graph_def = saver_module.export_meta_graph(saver_def=saver.as_saver_def()) ops.reset_default_graph() with self.test_session() as sess: res = saver_module.import_meta_graph(meta_graph_def) @@ -748,7 +749,7 @@ def testSaveV3(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) init = variables.global_variables_initializer() - saver = saver = saver_module.Saver() + saver = saver = saver_module.Saver(sharded=True) checkpoint_directory = self.get_temp_dir() model_path = os.path.join(checkpoint_directory, "model.ckpt") with self.test_session() as sess: @@ -816,7 +817,7 @@ def testEmbeddingVariableSaveAndRestoreOptimzierStatesForMultiTierWithHbm(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) graph = ops.get_default_graph() with self.test_session(graph = graph) as sess: saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt-12345")) diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py index dbf254d5f14..1119fd1c194 100644 --- a/tensorflow/python/ops/embedding_variable_ops_test.py +++ b/tensorflow/python/ops/embedding_variable_ops_test.py @@ -162,7 +162,7 @@ def _RecordFreqTestTemplate(self, optimizer): opt = self._CreateOptimizer(optimizer) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, 
"model1.ckpt") @@ -194,7 +194,7 @@ def _RecordVersionTemplate(self, optimizer): opt = self._CreateOptimizer(optimizer) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -232,7 +232,7 @@ def testSaveVersionWithGlobalStepEviction(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) init = variables.global_variables_initializer() - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) model_path = os.path.join(checkpoint_directory, "model.ckpt") with self.test_session() as sess: sess.run([init]) @@ -269,7 +269,7 @@ def testFeatureColumnRecordFreqWithPartition(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -313,7 +313,7 @@ def testFeatureColumnRecordFreqSGDWithPartition(self): opt = gradient_descent.GradientDescentOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -387,7 +387,8 @@ def testDynamicEmbeddingVariableForInitFromProto(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) graph = ops.get_default_graph() - meta_graph_def = saver_module.export_meta_graph() + saver = saver_module.Saver(sharded=True) + meta_graph_def = saver_module.export_meta_graph(saver_def=saver.as_saver_def()) ops.reset_default_graph() with self.test_session() as sess: res = saver_module.import_meta_graph(meta_graph_def) @@ -406,7 +407,8 
@@ def testEmbeddingVariableForInitFromProto(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) graph = ops.get_default_graph() - meta_graph_def = saver_module.export_meta_graph() + saver = saver_module.Saver(sharded=True) + meta_graph_def = saver_module.export_meta_graph(saver_def=saver.as_saver_def()) ops.reset_default_graph() with self.test_session() as sess: res = saver_module.import_meta_graph(meta_graph_def) @@ -450,7 +452,7 @@ def testEmbeddingVariableForLookupInt32(self): opt = adam.AdamOptimizer(0.01) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) @@ -643,7 +645,7 @@ def testEmbeddingVariableForL2FeatureEvictionFromContribFeatureColumn(self): opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) @@ -682,7 +684,7 @@ def testEmbeddingVariableForGlobalStepEviction(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session() as sess: sess.run([init]) @@ -720,7 +722,7 @@ def testEmbeddingVariableForL2FeatureEviction(self): opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = 
variables.global_variables_initializer() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) @@ -1534,7 +1536,7 @@ def testEmbeddingVariableForSaveFreq(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) model_path = os.path.join(checkpoint_directory, "model.ckpt") with self.test_session() as sess: sess.run([init]) @@ -1567,7 +1569,7 @@ def testEmbeddingVariableForL2FeatureEvictionDRAM(self): opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session() as sess: sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) @@ -1724,7 +1726,7 @@ def runTestAdagrad(self, var, g): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -1778,7 +1780,7 @@ def runTestAdagrad(self, var, g): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -1849,7 +1851,7 @@ def runTestAdagrad(self, var, g): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() 
model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -1923,7 +1925,7 @@ def testEmbeddingVariableForRecordFreq(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -1963,7 +1965,7 @@ def testEmbeddingVariableForRecordFreqWithCounterFilter(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -2278,7 +2280,7 @@ def testEmbeddingVariableForContirbFeatureColumnWithPartitionNum(self): opt = ftrl.FtrlOptimizer(0.1, l1_regularization_strength=2.0, l2_regularization_strength=0.00001) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) def testSaveV3(self): print("testSaveV3") @@ -2295,7 +2297,7 @@ def testSaveV3(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, global_step=gs) init = variables.global_variables_initializer() - saver = saver = saver_module.Saver() + saver = saver = saver_module.Saver(sharded=True) checkpoint_directory = self.get_temp_dir() model_path = os.path.join(checkpoint_directory, "model.ckpt") with self.test_session() as sess: @@ -2326,7 +2328,7 @@ def testEmbeddingVariableForNotSaveUnfilterFeature(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -2359,7 +2361,7 @@ def 
testEmbeddingVariableForSaveUnfilterFeature(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() model_path = os.path.join(checkpoint_directory, "model1.ckpt") @@ -2390,7 +2392,7 @@ def testEmbeddingVariableForMultiTierInference(self): opt = adagrad.AdagradOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v, gs) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session() as sess: sess.run([init]) @@ -2412,7 +2414,7 @@ def testEmbeddingVariableForMultiTierInference(self): emb = embedding_ops.embedding_lookup(emb_var, ids) tires = kv_variable_ops.lookup_tier(emb_var, math_ops.cast([1,2,3,4], dtypes.int64)) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) graph = ops.get_default_graph() with self.test_session(graph = graph) as sess: saver.restore(sess, os.path.join(checkpoint_directory, "model.ckpt")) @@ -2784,7 +2786,7 @@ def testSetInitializedWithoutRestore(self): g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) init = variables.global_variables_initializer() - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) with self.test_session() as sess: result = sess.run(var._is_initialized_op) self.assertEqual(False, result) @@ -2806,7 +2808,7 @@ def testSetInitializedWithRestore(self): opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: sess.run([init]) @@ -2823,7 +2825,7 @@ def testSetInitializedWithRestore(self): opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) g_v = 
opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: result = sess.run(var._is_initialized_op) @@ -2860,7 +2862,7 @@ def testCountsTensor(self): opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: sess.run([init]) @@ -2893,7 +2895,7 @@ def testCountsWithSparseAndDenseTensor(self): opt = adagrad_decay.AdagradDecayOptimizer(0.1, gs) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: sess.run([init]) @@ -2929,7 +2931,7 @@ def testCountsTensorWithGradientDescent(self): opt = gradient_descent.GradientDescentOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: sess.run([init]) @@ -2964,7 +2966,7 @@ def testCountsDenseAndSparseTensorWithGradientDescent(self): opt = gradient_descent.GradientDescentOptimizer(0.1) g_v = opt.compute_gradients(loss) train_op = opt.apply_gradients(g_v) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() with self.test_session(graph=g) as sess: sess.run([init]) diff --git a/tensorflow/python/training/incr_ckpt_test.py b/tensorflow/python/training/incr_ckpt_test.py index 55cf748a9d6..849c73a44dc 100644 --- a/tensorflow/python/training/incr_ckpt_test.py +++ b/tensorflow/python/training/incr_ckpt_test.py @@ 
-75,7 +75,7 @@ def testSparseEvIncrSaveRestore(self): emb = embedding_ops.embedding_lookup(var, math_ops.cast([0,1,2,5,6,7], dtypes.int64)) with ops.device("/device:CPU:0"): apply_incr = gen_io_ops.record_sparse_indices(math_ops.cast([0,1,2,5,6,7], dtypes.int64), "var_ev1") - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() ev_var_name = "var_ev1" incr_save_op = gen_io_ops.incr_save(incr_ckpt_path, [ev_var_name], [], [True],[var.handle]) @@ -178,7 +178,7 @@ def testMixIncrSaveRestore(self): activate_op = gen_io_ops. activate_sparse_recorder(["var_ev1","var_norm1"]) - saver = saver_module.Saver() + saver = saver_module.Saver(sharded=True) init = variables.global_variables_initializer() incr_save_op = gen_io_ops.incr_save(incr_ckpt_path, ["var_norm1", "var_ev1"], [], [True, True], [var_norm, var_ev.handle]) @@ -445,6 +445,7 @@ def testIncrementalSaverForResourceVariable(self): variable_scope.get_variable('var', shape=[100], use_resource=False) variable_scope.get_embedding_variable('ev', embedding_dim=100) saver = saver_module.Saver( + sharded=True, save_relative_paths=True, incremental_save_restore=True, ) diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py index acc9723c183..e70226f2968 100644 --- a/tensorflow/python/training/saver.py +++ b/tensorflow/python/training/saver.py @@ -1071,10 +1071,14 @@ def _build(self, checkpoint_path, build_save, build_restore): # pylint: disable=protected-access self._var_list = variables._all_saveable_objects() from tensorflow.python.ops import hash_table + from tensorflow.python.ops import kv_variable_ops if isinstance(self._var_list, dict): + ev = {} ht = {} lst = {} for name, x in self._var_list.items(): + if isinstance(x, kv_variable_ops.EmbeddingVariable): + ev[name] = x if isinstance(x, hash_table.HashTable): if x.hash_table not in ht: ht[x.hash_table] = [x] @@ -1084,15 +1088,20 @@ def _build(self, checkpoint_path, 
build_save, build_restore): lst[name] = BloomFilterSaveable(x) else: lst[name] = x + if len(ev) != 0 and not self._sharded: + raise ValueError("EmbeddingVariable can only use sharded saver") if len(ht) != 0 and not self._sharded: raise ValueError("HashTable can only use sharded saver") for x, y in ht.items(): lst[x.name] = HashTableSaveable(y) self._var_list = lst else: + ev = [] ht = {} lst = [] for x in self._var_list: + if isinstance(x, kv_variable_ops.EmbeddingVariable): + ev.append(x) if isinstance(x, hash_table.HashTable): if x.hash_table not in ht: ht[x.hash_table] = [x] @@ -1102,6 +1111,8 @@ def _build(self, checkpoint_path, build_save, build_restore): lst.append(BloomFilterSaveable(x)) else: lst.append(x) + if len(ev) != 0 and not self._sharded: + raise ValueError("EmbeddingVariable can only use sharded saver") if len(ht) != 0 and not self._sharded: raise ValueError("HashTable can only use sharded saver") for x, y in ht.items(): diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py index b48f00d0c14..365ef85af1d 100644 --- a/tensorflow/python/training/saver_test.py +++ b/tensorflow/python/training/saver_test.py @@ -852,6 +852,12 @@ def _model(): for orig, restored in zip(orig_vals, restored_vals): self.assertAllEqual(orig, restored) + def testEnableSaverShardedWhenUseEmbeddingVariable(self): + with ops_lib.Graph().as_default(): + emb_var = \ + variable_scope.get_embedding_variable(name="emb_var", embedding_dim=64) + with self.assertRaisesRegexp(ValueError, "EmbeddingVariable"): + saver_module.Saver([emb_var], sharded=False) class SaveRestoreShardedTest(test.TestCase): From d1c5a6e9aa2ec62da93f6719c6755293cf6406a5 Mon Sep 17 00:00:00 2001 From: LightWang4 <303176469@qq.com> Date: Tue, 21 Jan 2025 17:54:28 +0800 Subject: [PATCH 45/45] [Embedding] Fix op dependency in init_from_checkpoint API. 
(#1012) Signed-off-by: lightwang --- tensorflow/python/training/checkpoint_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py index db887fa12f1..d87a9f1b39b 100644 --- a/tensorflow/python/training/checkpoint_utils.py +++ b/tensorflow/python/training/checkpoint_utils.py @@ -443,7 +443,8 @@ def _set_checkpoint_initializer(variable, is_partitioned_ev = variable._save_slice_info is not None partition_id = variable._save_slice_info.var_offset[0] if is_partitioned_ev else 0 partition_num = variable._save_slice_info.full_shape[0] if is_partitioned_ev else 1 - with ops.control_dependencies([variable._initializer_op]): + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY)[0] + with ops.control_dependencies(restore_dependency[variable._primary_handle]): rank = variable.initial_value.get_shape().rank - 1 restore_op = gen_kv_variable_ops.kv_resource_import_v3( ckpt_file,