GraphicsCoder
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
Lines changed: 1 addition & 0 deletions
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
Lines changed: 1 addition & 0 deletions
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
Lines changed: 3 additions & 0 deletions
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
Lines changed: 3 additions & 1 deletion
diff --git a/tensorflow/core/kernels/fft_ops.cc b/tensorflow/core/kernels/fft_ops.cc
Lines changed: 145 additions & 49 deletions
@@ -21,6 +21,7 @@ set(tf_op_lib_names
     "set_ops"  
     "sendrecv_ops"
     "sparse_ops"
+    "spectral_ops"
     "state_ops"
     "string_ops"
     "training_ops"
 
@@ -520,6 +520,7 @@ GENERATE_PYTHON_OP_LIB("sdca_ops")
 GENERATE_PYTHON_OP_LIB("set_ops")
 GENERATE_PYTHON_OP_LIB("state_ops")
 GENERATE_PYTHON_OP_LIB("sparse_ops")
+GENERATE_PYTHON_OP_LIB("spectral_ops")
 GENERATE_PYTHON_OP_LIB("string_ops")
 GENERATE_PYTHON_OP_LIB("user_ops")
 GENERATE_PYTHON_OP_LIB("training_ops")
 
@@ -496,6 +496,7 @@ tf_gen_op_libs(
         "script_ops",
         "sendrecv_ops",
         "sparse_ops",
+        "spectral_ops",
         "state_ops",
         "string_ops",
         "training_ops",
@@ -557,6 +558,7 @@ cc_library(
         ":sendrecv_ops_op_lib",
         ":set_ops_op_lib",
         ":sparse_ops_op_lib",
+        ":spectral_ops_op_lib",
         ":state_ops_op_lib",
         ":string_ops_op_lib",
         ":training_ops_op_lib",
@@ -2498,6 +2500,7 @@ tf_cc_tests(
         "ops/random_ops_test.cc",
         "ops/set_ops_test.cc",
         "ops/sparse_ops_test.cc",
+        "ops/spectral_ops_test.cc",
         "ops/state_ops_test.cc",
         "ops/string_ops_test.cc",
         "ops/training_ops_test.cc",
 
@@ -2128,7 +2128,9 @@ tf_kernel_library(
 tf_kernel_library(
     name = "fft_ops",
     prefix = "fft_ops",
-    deps = MATH_DEPS,
+    deps = MATH_DEPS + [
+        "//tensorflow/core:spectral_ops_op_lib",
+    ],
 )
 
 tf_kernel_library(
 
@@ -49,72 +49,139 @@ class FFTGPUBase : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor& in = ctx->input(0);
     const TensorShape& shape = in.shape();
+    const int fft_rank = Rank();
     OP_REQUIRES(
-        ctx, shape.dims() >= Rank(),
-        errors::InvalidArgument("Input must have rank of at least ", Rank(),
+        ctx, shape.dims() >= fft_rank,
+        errors::InvalidArgument("Input must have rank of at least ", fft_rank,
                                 " but got: ", shape.DebugString()));
+
     Tensor* out;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, shape, &out));
+    TensorShape output_shape = shape;
+    uint64 fft_shape[3] = {0, 0, 0};
+
+    // In R2C or C2R mode, we use a second input to specify the FFT length
+    // instead of inferring it from the input shape.
+    if (IsReal()) {
+      const Tensor& fft_length = ctx->input(1);
+      OP_REQUIRES(ctx,
+                  fft_length.shape().dims() == 1 &&
+                      fft_length.shape().dim_size(0) == fft_rank,
+                  errors::InvalidArgument("fft_length must  have shape [",
+                                          fft_rank, "]"));
+
+      auto fft_length_as_vec = fft_length.vec<int32>();
+      for (int i = 0; i < fft_rank; ++i) {
+        fft_shape[i] = fft_length_as_vec(i);
+        uint64 dim = IsForward() && i == fft_rank - 1 && fft_shape[i] != 0
+                         ? fft_shape[i] / 2 + 1
+                         : fft_shape[i];
+        output_shape.set_dim(output_shape.dims() - fft_rank + i, dim);
+      }
+    } else {
+      for (int i = 0; i < fft_rank; ++i) {
+        fft_shape[i] =
+            output_shape.dim_size(output_shape.dims() - fft_rank + i);
+      }
+    }
+
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &out));
     if (shape.num_elements() == 0) {
       return;
     }
-    DoFFT(ctx, in, out);
+
+    DoFFT(ctx, in, fft_shape, out);
   }
 
  protected:
   virtual int Rank() const = 0;
   virtual bool IsForward() const = 0;
+  virtual bool IsReal() const = 0;
 
  private:
-  void DoFFT(OpKernelContext* ctx, const Tensor& in, Tensor* out) {
+  void DoFFT(OpKernelContext* ctx, const Tensor& in, uint64* fft_shape,
+             Tensor* out) {
     auto* stream = ctx->op_device_context()->stream();
     OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
 
-    const TensorShape& shape = in.shape();
-    auto src = AsDeviceMemory<complex64>(in.flat<complex64>().data());
-    auto dst = AsDeviceMemory<complex64>(out->flat<complex64>().data());
+    const TensorShape& input_shape = in.shape();
+    const TensorShape& output_shape = out->shape();
 
-    const int rank = Rank();
+    const int fft_rank = Rank();
     int batch_size = 1;
-    for (int i = 0; i < shape.dims() - rank; ++i) {
-      batch_size *= shape.dim_size(i);
+    for (int i = 0; i < input_shape.dims() - fft_rank; ++i) {
+      batch_size *= input_shape.dim_size(i);
     }
-    uint64 data_length = 1;
-    uint64 data_dims[3];
-    for (int i = 0; i < rank; ++i) {
-      auto dim = shape.dim_size(shape.dims() - rank + i);
-      data_length *= dim;
-      data_dims[i] = dim;
+    uint64 input_embed[3];
+    uint64 input_stride = 1;
+    uint64 input_distance = 1;
+    uint64 output_embed[3];
+    uint64 output_stride = 1;
+    uint64 output_distance = 1;
+
+    for (int i = 0; i < fft_rank; ++i) {
+      auto dim_offset = input_shape.dims() - fft_rank + i;
+      input_embed[i] = input_shape.dim_size(dim_offset);
+      input_distance *= input_shape.dim_size(dim_offset);
+      output_embed[i] = output_shape.dim_size(dim_offset);
+      output_distance *= output_shape.dim_size(dim_offset);
     }
 
-    constexpr uint64* kInputEmbed = nullptr;
-    constexpr uint64 kInputStride = 1;
-    constexpr uint64 kInputDistance = 1;
-    constexpr uint64* kOutputEmbed = nullptr;
-    constexpr uint64 kOutputStride = 1;
-    constexpr uint64 kOutputDistance = 1;
     constexpr bool kInPlaceFft = false;
+    const auto kFftType =
+        IsReal() ? (IsForward() ? perftools::gputools::fft::Type::kR2C
+                                : perftools::gputools::fft::Type::kC2R)
+                 : (IsForward() ? perftools::gputools::fft::Type::kC2CForward
+                                : perftools::gputools::fft::Type::kC2CInverse);
 
     auto plan = stream->parent()->AsFft()->CreateBatchedPlan(
-        stream, rank, data_dims, kInputEmbed, kInputStride, kInputDistance,
-        kOutputEmbed, kOutputStride, kOutputDistance,
-        IsForward() ? perftools::gputools::fft::Type::kC2CForward
-                    : perftools::gputools::fft::Type::kC2CInverse,
-        kInPlaceFft, batch_size);
-
-    OP_REQUIRES(
-        ctx, stream->ThenFft(plan.get(), src, &dst).ok(),
-        errors::Internal("c2c fft failed : in.shape=", shape.DebugString()));
-    if (!IsForward()) {
-      auto alpha = complex64(1.f / data_length);
+        stream, fft_rank, fft_shape, input_embed, input_stride, input_distance,
+        output_embed, output_stride, output_distance, kFftType, kInPlaceFft,
+        batch_size);
+
+    if (IsReal()) {
+      if (IsForward()) {
+        auto src = AsDeviceMemory<float>(in.flat<float>().data());
+        auto dst = AsDeviceMemory<complex64>(out->flat<complex64>().data());
+        OP_REQUIRES(
+            ctx, stream->ThenFft(plan.get(), src, &dst).ok(),
+            errors::Internal("fft failed : type=", static_cast<int>(kFftType),
+                             " in.shape=", input_shape.DebugString()));
+      } else {
+        auto src = AsDeviceMemory<complex64>(in.flat<complex64>().data());
+        auto dst = AsDeviceMemory<float>(out->flat<float>().data());
+        OP_REQUIRES(
+            ctx, stream->ThenFft(plan.get(), src, &dst).ok(),
+            errors::Internal("fft failed : type=", static_cast<int>(kFftType),
+                             " in.shape=", input_shape.DebugString()));
+        auto alpha = 1.f / output_distance;
+        OP_REQUIRES(
+            ctx,
+            stream->ThenBlasScal(output_shape.num_elements(), alpha, &dst, 1)
+                .ok(),
+            errors::Internal("BlasScal failed : in.shape=",
+                             input_shape.DebugString()));
+      }
+    } else {
+      auto src = AsDeviceMemory<complex64>(in.flat<complex64>().data());
+      auto dst = AsDeviceMemory<complex64>(out->flat<complex64>().data());
       OP_REQUIRES(
-          ctx, stream->ThenBlasScal(shape.num_elements(), alpha, &dst, 1).ok(),
-          errors::Internal("BlasScal failed : in.shape=", shape.DebugString()));
+          ctx, stream->ThenFft(plan.get(), src, &dst).ok(),
+          errors::Internal("fft failed : type=", static_cast<int>(kFftType),
+                           " in.shape=", input_shape.DebugString()));
+      if (!IsForward()) {
+        auto alpha = complex64(1.f / output_distance);
+        OP_REQUIRES(
+            ctx,
+            stream->ThenBlasScal(output_shape.num_elements(), alpha, &dst, 1)
+                .ok(),
+            errors::Internal("BlasScal failed : in.shape=",
+                             input_shape.DebugString()));
+      }
     }
   }
 };
 
-template <bool Forward, int FFTRank>
+template <bool Forward, bool _Real, int FFTRank>
 class FFTGPU : public FFTGPUBase {
  public:
   static_assert(FFTRank >= 1 && FFTRank <= 3,
@@ -124,24 +191,53 @@ class FFTGPU : public FFTGPUBase {
  protected:
   int Rank() const override { return FFTRank; }
   bool IsForward() const override { return Forward; }
+  bool IsReal() const override { return _Real; }
 };
 
-REGISTER_KERNEL_BUILDER(Name("FFT").Device(DEVICE_GPU), FFTGPU<true, 1>);
-REGISTER_KERNEL_BUILDER(Name("IFFT").Device(DEVICE_GPU), FFTGPU<false, 1>);
-REGISTER_KERNEL_BUILDER(Name("FFT2D").Device(DEVICE_GPU), FFTGPU<true, 2>);
-REGISTER_KERNEL_BUILDER(Name("IFFT2D").Device(DEVICE_GPU), FFTGPU<false, 2>);
-REGISTER_KERNEL_BUILDER(Name("FFT3D").Device(DEVICE_GPU), FFTGPU<true, 3>);
-REGISTER_KERNEL_BUILDER(Name("IFFT3D").Device(DEVICE_GPU), FFTGPU<false, 3>);
+REGISTER_KERNEL_BUILDER(Name("FFT").Device(DEVICE_GPU), FFTGPU<true, false, 1>);
+REGISTER_KERNEL_BUILDER(Name("IFFT").Device(DEVICE_GPU),
+                        FFTGPU<false, false, 1>);
+REGISTER_KERNEL_BUILDER(Name("FFT2D").Device(DEVICE_GPU),
+                        FFTGPU<true, false, 2>);
+REGISTER_KERNEL_BUILDER(Name("IFFT2D").Device(DEVICE_GPU),
+                        FFTGPU<false, false, 2>);
+REGISTER_KERNEL_BUILDER(Name("FFT3D").Device(DEVICE_GPU),
+                        FFTGPU<true, false, 3>);
+REGISTER_KERNEL_BUILDER(Name("IFFT3D").Device(DEVICE_GPU),
+                        FFTGPU<false, false, 3>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("RFFT").Device(DEVICE_GPU).HostMemory("fft_length"),
+    FFTGPU<true, true, 1>);
+REGISTER_KERNEL_BUILDER(
+    Name("IRFFT").Device(DEVICE_GPU).HostMemory("fft_length"),
+    FFTGPU<false, true, 1>);
+REGISTER_KERNEL_BUILDER(
+    Name("RFFT2D").Device(DEVICE_GPU).HostMemory("fft_length"),
+    FFTGPU<true, true, 2>);
+REGISTER_KERNEL_BUILDER(
+    Name("IRFFT2D").Device(DEVICE_GPU).HostMemory("fft_length"),
+    FFTGPU<false, true, 2>);
+REGISTER_KERNEL_BUILDER(
+    Name("RFFT3D").Device(DEVICE_GPU).HostMemory("fft_length"),
+    FFTGPU<true, true, 3>);
+REGISTER_KERNEL_BUILDER(
+    Name("IRFFT3D").Device(DEVICE_GPU).HostMemory("fft_length"),
+    FFTGPU<false, true, 3>);
 
 // Deprecated kernels.
-REGISTER_KERNEL_BUILDER(Name("BatchFFT").Device(DEVICE_GPU), FFTGPU<true, 1>);
-REGISTER_KERNEL_BUILDER(Name("BatchIFFT").Device(DEVICE_GPU), FFTGPU<false, 1>);
-REGISTER_KERNEL_BUILDER(Name("BatchFFT2D").Device(DEVICE_GPU), FFTGPU<true, 2>);
+REGISTER_KERNEL_BUILDER(Name("BatchFFT").Device(DEVICE_GPU),
+                        FFTGPU<true, false, 1>);
+REGISTER_KERNEL_BUILDER(Name("BatchIFFT").Device(DEVICE_GPU),
+                        FFTGPU<false, false, 1>);
+REGISTER_KERNEL_BUILDER(Name("BatchFFT2D").Device(DEVICE_GPU),
+                        FFTGPU<true, false, 2>);
 REGISTER_KERNEL_BUILDER(Name("BatchIFFT2D").Device(DEVICE_GPU),
-                        FFTGPU<false, 2>);
-REGISTER_KERNEL_BUILDER(Name("BatchFFT3D").Device(DEVICE_GPU), FFTGPU<true, 3>);
+                        FFTGPU<false, false, 2>);
+REGISTER_KERNEL_BUILDER(Name("BatchFFT3D").Device(DEVICE_GPU),
+                        FFTGPU<true, false, 3>);
 REGISTER_KERNEL_BUILDER(Name("BatchIFFT3D").Device(DEVICE_GPU),
-                        FFTGPU<false, 3>);
+                        FFTGPU<false, false, 3>);
 
 }  // end namespace tensorflow