tensorflow
diff --git a/‎tensorflow/core/distributed_runtime/BUILD‎
Lines changed: 12 additions & 0 deletions b/‎tensorflow/core/distributed_runtime/BUILD‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎tensorflow/core/distributed_runtime/rpc/BUILD‎
Lines changed: 6 additions & 1 deletion b/‎tensorflow/core/distributed_runtime/rpc/BUILD‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc‎
Lines changed: 97 additions & 25 deletions b/‎tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc‎
Lines changed: 97 additions & 25 deletions
diff --git a/‎tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h‎
Lines changed: 0 additions & 65 deletions b/‎tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h‎
Lines changed: 0 additions & 65 deletions
diff --git a/‎tensorflow/core/distributed_runtime/rpc/grpc_server_lib_test.cc‎
Lines changed: 3 additions & 1 deletion b/‎tensorflow/core/distributed_runtime/rpc/grpc_server_lib_test.cc‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎tensorflow/core/distributed_runtime/rpc/grpc_tensorflow_server.cc‎
Lines changed: 4 additions & 1 deletion b/‎tensorflow/core/distributed_runtime/rpc/grpc_tensorflow_server.cc‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎tensorflow/core/distributed_runtime/rpc/grpc_testlib_server.cc‎
Lines changed: 2 additions & 1 deletion b/‎tensorflow/core/distributed_runtime/rpc/grpc_testlib_server.cc‎
Lines changed: 2 additions & 1 deletion
@@ -269,6 +269,18 @@ cc_library(
     ],
 )
 
+# Server library built from server_lib.{h,cc}; its deps list contains no gRPC
+# target. The gRPC server library, the server binaries and the tests under
+# rpc/ depend on this target and include "server_lib.h".
+# NOTE(review): presumably server_lib.h declares ServerInterface and
+# ServerFactory (both used by grpc_server_lib.cc) -- confirm against the header.
+cc_library(
+    name = "server_lib",
+    srcs = ["server_lib.cc"],
+    hdrs = ["server_lib.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 # TODO(mrry): Move executor_test.cc to ../common_runtime once it no longer depends
 # on grpc_testlib.
 tf_cc_tests(
 
@@ -211,7 +211,7 @@ cc_library(
     srcs = [
         "grpc_server_lib.cc",
     ],
-    hdrs = ["grpc_server_lib.h"],
+    linkstatic = 1,  # Seems to be needed since alwayslink is broken in bazel
     deps = [
         "@grpc//:grpc++_unsecure",
         ":async_service_interface",
@@ -230,8 +230,10 @@ cc_library(
         "//tensorflow/core/distributed_runtime:master_env",
         "//tensorflow/core/distributed_runtime:master_session",
         "//tensorflow/core/distributed_runtime:process_util",
+        "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/distributed_runtime:worker_env",
     ],
+    alwayslink = 1,
 )
 
 cc_binary(
@@ -247,6 +249,7 @@ cc_binary(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/distributed_runtime:server_lib",
     ],
 )
 
@@ -276,6 +279,7 @@ cc_binary(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime:server_lib",
     ],
 )
 
@@ -344,5 +348,6 @@ tf_cc_tests(
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/distributed_runtime:process_util",
+        "//tensorflow/core/distributed_runtime:server_lib",
     ],
 )
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
-
 #include <memory>
 
 #include "grpc++/grpc++.h"
@@ -33,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
 #include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -41,14 +40,14 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
-
 namespace {
-class TensorFlowServer : public ServerInterface {
+
+class GrpcServer : public ServerInterface {
  public:
-  TensorFlowServer(const ServerDef& server_def, Env* env)
+  GrpcServer(const ServerDef& server_def, Env* env)
       : server_def_(server_def), env_(env), state_(NEW) {}
 
-  ~TensorFlowServer() {
+  ~GrpcServer() {
     Stop();
     Join();
 
@@ -59,8 +58,14 @@ class TensorFlowServer : public ServerInterface {
     // to destroy them.
     delete master_env_.worker_cache;  // Shared with worker_env.worker_cache.
 
-    delete worker_env_.device_mgr;
+    // We must delete graph_mgr before device_mgr, due to shared
+    // ownership of OpKernels in the executors. (The graph_mgr will
+    // free all stateless OpKernels, and pass over borrowed stateful
+    // OpKernels, which are also held in their respective devices'
+    // OpSegments.)
     delete worker_env_.graph_mgr;
+    delete worker_env_.device_mgr;
+
     delete worker_env_.rendezvous_mgr;
 
     // Do not delete (as these are not owned by the server):
@@ -91,6 +96,56 @@ class TensorFlowServer : public ServerInterface {
       return errors::Internal("Could not parse worker name.");
     }
 
+    // Look up the port that has been requested for this task in `server_def_`.
+    requested_port_ = -1;
+    for (const auto& job : server_def_.cluster().job()) {
+      if (job.name() == server_def_.job_name()) {
+        auto iter = job.tasks().find(server_def_.task_index());
+        if (iter == job.tasks().end()) {
+          return errors::InvalidArgument("Task ", server_def_.task_index(),
+                                         " was not defined in job \"",
+                                         server_def_.job_name(), "\"");
+        } else if (!str_util::NumericParse32(
+                       str_util::Split(iter->second, ':')[1],
+                       &requested_port_)) {
+          return errors::Internal(
+              "Could not parse port for local server from \"", iter->second,
+              "\"");
+        } else {
+          break;
+        }
+      }
+    }
+    if (requested_port_ == -1) {
+      return errors::Internal("Job \"", server_def_.job_name(),
+                              "\" was not defined in cluster");
+    }
+
+    // N.B. The order of initialization here is intricate, because we
+    // wish to allow `requested_port_ == 0` (for choosing any port,
+    // mostly for testing). Therefore, the construction of the channel
+    // and worker caches depends on `bound_port_`, which is not set
+    // until we call `builder.BuildAndStart()`. We must create the
+    // service objects before calling `builder.BuildAndStart()`, but
+    // `master_env_` and `worker_env_` are only partially
+    // configured. However, this is not dangerous, because we do not
+    // start serving requests until `this->Start()` is called, which
+    // happens after this method returns.
+    //
+    // TODO(mrry): Provide a general mechanism for dynamically setting
+    // the identities of tasks in the worker pool after the service is
+    // running.
+    ::grpc::ServerBuilder builder;
+    builder.AddListeningPort(strings::StrCat("0.0.0.0:", requested_port_),
+                             ::grpc::InsecureServerCredentials(), &bound_port_);
+    master_service_ = NewGrpcMasterService(&master_env_, &builder);
+    worker_service_ = NewGrpcWorkerService(&worker_env_, &builder);
+    server_ = builder.BuildAndStart();
+
+    if (!server_) {
+      return errors::Internal("Could not start gRPC server");
+    }
+
     GrpcChannelSpec channel_spec;
     for (const auto& job : server_def_.cluster().job()) {
       int max_task_id = -1;
@@ -99,7 +154,12 @@ class TensorFlowServer : public ServerInterface {
       }
       std::vector<string> host_ports(max_task_id + 1);
       for (const auto& task : job.tasks()) {
-        host_ports[task.first] = task.second;
+        if (job.name() == server_def_.job_name() &&
+            task.first == server_def_.task_index()) {
+          host_ports[task.first] = strings::StrCat("localhost:", bound_port_);
+        } else {
+          host_ports[task.first] = task.second;
+        }
       }
       channel_spec.AddHostPortsJob(job.name(), host_ports, host_ports.size());
     }
@@ -133,12 +193,6 @@ class TensorFlowServer : public ServerInterface {
     mutex_lock l(mu_);
     switch (state_) {
       case NEW: {
-        ::grpc::ServerBuilder builder;
-        builder.AddListeningPort(strings::StrCat("0.0.0.0:", requested_port_),
-                                 ::grpc::InsecureServerCredentials());
-        master_service_ = NewGrpcMasterService(&master_env_, &builder);
-        worker_service_ = NewGrpcWorkerService(&worker_env_, &builder);
-        server_ = builder.BuildAndStart();
         master_thread_.reset(
             env_->StartThread(ThreadOptions(), "TF_master_service",
                               [this] { master_service_->HandleRPCsLoop(); }));
@@ -196,16 +250,19 @@ class TensorFlowServer : public ServerInterface {
     }
   }
 
-  const string& target() const override { return target_; }
+  // Returns the target string for connecting to this server, formatted as
+  // "grpc://localhost:<bound_port_>". The bound port is used rather than
+  // `requested_port_` because a requested port of 0 means "choose any port",
+  // so only `bound_port_` identifies where the server is actually listening.
+  // The string is built on each call, hence the return by value.
+  const string target() const override {
+    return strings::StrCat("grpc://localhost:", bound_port_);
+  }
 
  private:
   // The overall server configuration.
   const ServerDef server_def_;
   Env* env_;
 
   // The port requested for this server.
-  // TODO(mrry): Support requested_port_ == 0 to bind to any available port.
   int requested_port_;
+  // The port to which this server is bound.
+  int bound_port_ = 0;
 
   // The `SessionOptions.target` to be used when connecting to this
   // server (as a master).
@@ -238,15 +295,30 @@ class TensorFlowServer : public ServerInterface {
 
   std::unique_ptr<::grpc::Server> server_ GUARDED_BY(mu_);
 };
-}  // namespace
 
-Status NewServer(const ServerDef& server_def,
-                 std::unique_ptr<ServerInterface>* out_server) {
-  std::unique_ptr<TensorFlowServer> ret(
-      new TensorFlowServer(server_def, Env::Default()));
-  TF_RETURN_IF_ERROR(ret->Init());
-  *out_server = std::move(ret);
-  return Status::OK();
-}
+// A `ServerFactory` that creates `GrpcServer` instances for server
+// definitions whose protocol is "grpc".
+class GrpcServerFactory : public ServerFactory {
+ public:
+  // Returns true iff `server_def.protocol()` is exactly "grpc".
+  bool AcceptsOptions(const ServerDef& server_def) override {
+    return server_def.protocol() == "grpc";
+  }
 
+  // Constructs a `GrpcServer` for `server_def` using `Env::Default()` and
+  // calls `Init()` on it. On success, ownership of the new server is moved
+  // into `*out_server` and `Status::OK()` is returned. A failure from
+  // `Init()` is propagated via TF_RETURN_IF_ERROR before `*out_server` is
+  // assigned; the partially-initialized server is destroyed by `ret`.
+  Status NewServer(const ServerDef& server_def,
+                   std::unique_ptr<ServerInterface>* out_server) override {
+    std::unique_ptr<GrpcServer> ret(new GrpcServer(server_def, Env::Default()));
+    TF_RETURN_IF_ERROR(ret->Init());
+    *out_server = std::move(ret);
+    return Status::OK();
+  }
+};
+
+// Registers a `ServerFactory` for `GrpcServer` instances.
+//
+// The registration happens in the constructor, which runs when the
+// file-local `registrar` object below is initialized; nothing else in this
+// file refers to that object.
+// NOTE(review): the BUILD rule for this library sets `alwayslink = 1`,
+// presumably so that this otherwise-unreferenced object is not dropped at
+// link time -- confirm.
+class GrpcServerRegistrar {
+ public:
+  GrpcServerRegistrar() {
+    // The factory is heap-allocated and never deleted in this file.
+    // NOTE(review): presumably `ServerFactory::Register` takes ownership of
+    // the pointer -- confirm against server_lib.h.
+    ServerFactory::Register("GRPC_SERVER", new GrpcServerFactory());
+  }
+};
+static GrpcServerRegistrar registrar;
+
+}  // namespace
 }  // namespace tensorflow
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_session.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -25,6 +25,7 @@ namespace tensorflow {
 // when no calls are made against the server.
 TEST(Server, StopAfterNoop) {
   ServerDef def;
+  def.set_protocol("grpc");
   def.set_job_name("localhost");
   def.set_task_index(0);
   JobDef* job_def = def.mutable_cluster()->add_job();
@@ -42,6 +43,7 @@ TEST(Server, StopAfterNoop) {
 // when a simple call is made against the server.
 TEST(Server, StopAfterCall) {
   ServerDef def;
+  def.set_protocol("grpc");
   def.set_job_name("localhost");
   def.set_task_index(0);
   JobDef* job_def = def.mutable_cluster()->add_job();
 
@@ -19,7 +19,7 @@ limitations under the License.
 #include "grpc++/security/credentials.h"
 #include "grpc++/server_builder.h"
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -31,10 +31,13 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 
 // This binary starts a TensorFlow server (master and worker).
+//
+// TODO(mrry): Replace with a py_binary that uses `tf.GrpcServer()`.
 namespace tensorflow {
 namespace {
 
 Status ParseFlagsForTask(int argc, char* argv[], ServerDef* options) {
+  options->set_protocol("grpc");
   string cluster_spec;
   int task_index = 0;
   const bool parse_result = ParseFlags(
 
@@ -17,7 +17,7 @@ limitations under the License.
 #include "grpc++/security/credentials.h"
 #include "grpc++/server_builder.h"
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -33,6 +33,7 @@ namespace tensorflow {
 namespace {
 
 Status ParseFlagsForTask(int argc, char* argv[], ServerDef* options) {
+  options->set_protocol("grpc");
   string job_spec;
   int num_cpus = 1;
   int num_gpus = 0;