Skip to content

Commit 30e6daf

Browse files
Add (make public) method ComputeLogicalBufferUnpaddedSizes.
Reverts 584d207 PiperOrigin-RevId: 896637498
1 parent 5bd1b3d commit 30e6daf

9 files changed

Lines changed: 151 additions & 243 deletions

tensorflow/core/kernels/depthwise_conv_op_gpu.h

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,6 @@ limitations under the License.
3737
#endif
3838

3939
namespace tensorflow {
40-
using Eigen::numext::div_ceil;
4140

4241
namespace detail {
4342
template <typename T>
@@ -641,7 +640,7 @@ Status LaunchDepthwiseConv2dGPUSmall(OpKernelContext* ctx,
641640
case FORMAT_NHWC:
642641
block_dim = dim3(kBlockDepth, args.in_cols, block_height);
643642
block_count =
644-
args.batch * div_ceil(args.out_depth, kBlockDepth) * kBlockDepth;
643+
args.batch * DivUp(args.out_depth, kBlockDepth) * kBlockDepth;
645644
kernel =
646645
DepthwiseConv2dGPUKernelNHWCSmall<T, kDirection, kKnownFilterWidth,
647646
kKnownFilterHeight, kBlockDepth,
@@ -650,7 +649,7 @@ Status LaunchDepthwiseConv2dGPUSmall(OpKernelContext* ctx,
650649
case FORMAT_NCHW:
651650
block_dim = dim3(args.in_cols, block_height, kBlockDepth);
652651
block_count =
653-
div_ceil(args.batch * args.out_depth, kBlockDepth) * kBlockDepth;
652+
DivUp(args.batch * args.out_depth, kBlockDepth) * kBlockDepth;
654653
kernel =
655654
DepthwiseConv2dGPUKernelNCHWSmall<T, kDirection, kKnownFilterWidth,
656655
kKnownFilterHeight, kBlockDepth,
@@ -1568,14 +1567,14 @@ Status TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
15681567
case FORMAT_NHWC:
15691568
block_dim = dim3(kBlockDepth, args.in_cols, block_height);
15701569
block_count =
1571-
args.batch * div_ceil(args.out_depth, kBlockDepth) * kBlockDepth;
1570+
args.batch * DivUp(args.out_depth, kBlockDepth) * kBlockDepth;
15721571
kernel = DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall<
15731572
T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, kAccumPixels>;
15741573
break;
15751574
case FORMAT_NCHW:
15761575
block_dim = dim3(args.in_cols, block_height, kBlockDepth);
15771576
block_count =
1578-
div_ceil(args.batch * args.out_depth, kBlockDepth) * kBlockDepth;
1577+
DivUp(args.batch * args.out_depth, kBlockDepth) * kBlockDepth;
15791578
kernel = DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall<
15801579
T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, kAccumPixels>;
15811580
break;

tensorflow/core/kernels/fill_empty_rows_functor_gpu.cu.cc

Lines changed: 14 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@ limitations under the License.
1818
#define EIGEN_USE_GPU
1919

2020
#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive
21-
#include "xla/tsl/platform/statusor.h"
2221
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
2322
#include "tensorflow/core/framework/register_types.h"
2423
#include "tensorflow/core/framework/tensor_types.h"
@@ -55,8 +54,7 @@ typename T::ConstPointerType to_pointers(const T& x) {
5554
template <typename Tindex, typename... CallerArgs, typename... KernelArgs>
5655
Status wrap_kernel_call(void (*func)(KernelArgs...), const GPUDevice& device,
5756
Tindex size, CallerArgs... args) {
58-
TF_ASSIGN_OR_RETURN(GpuLaunchConfig64 config,
59-
GetGpuLaunchConfig64(size, device));
57+
auto config = GetGpuLaunchConfig(size, device);
6058
return GpuLaunchKernel(func, config.block_count, config.thread_per_block, 0,
6159
device.stream(), config, to_pointers(args)...);
6260
}
@@ -79,10 +77,10 @@ struct CastFunctor {
7977
// true if the indices are not ordered by row.
8078
template <typename Tindex>
8179
__global__ __launch_bounds__(1024) void CountElementsPerRowKernel(
82-
GpuLaunchConfig64 cfg, Tindex dense_rows, int rank, const Tindex* indices,
80+
GpuLaunchConfig cfg, Tindex dense_rows, int rank, const Tindex* indices,
8381
Tindex* elements_per_row, int* rows_are_not_ordered,
8482
int* first_invalid_index) {
85-
for (int64_t i : GpuGridRangeX(cfg.virtual_thread_count)) {
83+
GPU_1D_KERNEL_LOOP(i, cfg.virtual_thread_count) {
8684
Tindex row = indices[i * rank];
8785
if (row < 0 || row >= dense_rows) {
8886
GpuAtomicMin(first_invalid_index, i);
@@ -100,19 +98,18 @@ __global__ __launch_bounds__(1024) void CountElementsPerRowKernel(
10098

10199
template <typename Tindex>
102100
__global__ __launch_bounds__(1024) void CopyRowIndicesKernel(
103-
GpuLaunchConfig64 cfg, int rank, const Tindex* indices,
104-
Tindex* row_indices) {
105-
for (int64_t i : GpuGridRangeX(cfg.virtual_thread_count)) {
101+
GpuLaunchConfig cfg, int rank, const Tindex* indices, Tindex* row_indices) {
102+
GPU_1D_KERNEL_LOOP(i, cfg.virtual_thread_count) {
106103
row_indices[i] = indices[i * rank];
107104
}
108105
}
109106

110107
// Sets empty_row_indicator[row] to whether the row is empty.
111108
template <typename Tindex>
112109
__global__ __launch_bounds__(1024) void ComputeEmptyRowIndicatorKernel(
113-
GpuLaunchConfig64 cfg, const Tindex* elements_per_row,
110+
GpuLaunchConfig cfg, const Tindex* elements_per_row,
114111
bool* empty_row_indicator) {
115-
for (int64_t row : GpuGridRangeX(cfg.virtual_thread_count)) {
112+
GPU_1D_KERNEL_LOOP(row, cfg.virtual_thread_count) {
116113
empty_row_indicator[row] = elements_per_row[row] == 0;
117114
}
118115
}
@@ -122,11 +119,11 @@ __global__ __launch_bounds__(1024) void ComputeEmptyRowIndicatorKernel(
122119
// empty row.
123120
template <typename T, typename Tindex>
124121
__global__ __launch_bounds__(1024) void ScatterInputElementsKernel(
125-
GpuLaunchConfig64 cfg, Tindex dense_rows, int rank,
122+
GpuLaunchConfig cfg, Tindex dense_rows, int rank,
126123
const Tindex* input_index_map, const Tindex* indices, const T* values,
127124
const Tindex* num_new_rows_before, Tindex* output_indices, T* output_values,
128125
Tindex* reverse_index_map) {
129-
for (int64_t i : ::tensorflow::GpuGridRangeX(cfg.virtual_thread_count)) {
126+
GPU_1D_KERNEL_LOOP(i, cfg.virtual_thread_count) {
130127
Tindex input_i = input_index_map ? input_index_map[i] : i;
131128
Tindex row = indices[input_i * rank];
132129
Tindex output_i = i + num_new_rows_before[row];
@@ -144,10 +141,10 @@ __global__ __launch_bounds__(1024) void ScatterInputElementsKernel(
144141
// input) in output_indices and output_values.
145142
template <typename T, typename Tindex>
146143
__global__ __launch_bounds__(1024) void ScatterNewElementsKernel(
147-
GpuLaunchConfig64 cfg, int rank, const T* default_value,
144+
GpuLaunchConfig cfg, int rank, const T* default_value,
148145
const Tindex* num_new_rows_through, const Tindex* input_row_ends,
149146
const bool* empty_row_indicator, Tindex* output_indices, T* output_values) {
150-
for (int64_t row : ::tensorflow::GpuGridRangeX(cfg.virtual_thread_count)) {
147+
GPU_1D_KERNEL_LOOP(row, cfg.virtual_thread_count) {
151148
if (!empty_row_indicator[row]) continue; // Only process empty rows
152149
Tindex input_i = (row == 0 ? 0 : input_row_ends[row - 1]);
153150
Tindex output_i = input_i + (row == 0 ? 0 : num_new_rows_through[row - 1]);
@@ -492,9 +489,9 @@ namespace {
492489

493490
template <typename T, typename Tindex>
494491
__global__ __launch_bounds__(1024) void GatherOriginalGradValuesKernel(
495-
GpuLaunchConfig64 cfg, const Tindex* reverse_index_map,
496-
const T* grad_values, T* d_values, bool* visited, Tindex N_full) {
497-
for (int64_t input_i : GpuGridRangeX(cfg.virtual_thread_count)) {
492+
GpuLaunchConfig cfg, const Tindex* reverse_index_map, const T* grad_values,
493+
T* d_values, bool* visited, Tindex N_full) {
494+
GPU_1D_KERNEL_LOOP(input_i, cfg.virtual_thread_count) {
498495
Tindex output_i = reverse_index_map[input_i];
499496
if (output_i >= 0 && output_i < N_full) {
500497
d_values[input_i] = grad_values[output_i];

tensorflow/core/kernels/gather_functor_gpu.cu.h

Lines changed: 9 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,6 @@ limitations under the License.
2020

2121
#define EIGEN_USE_GPU
2222

23-
#include "xla/tsl/platform/statusor.h"
2423
#include "tensorflow/core/framework/register_types.h"
2524
#include "tensorflow/core/kernels/gather_functor.h"
2625
#include "tensorflow/core/platform/types.h"
@@ -36,7 +35,7 @@ __global__ void GatherOpKernel(const ValueOrVec* __restrict__ params,
3635
ValueOrVec* __restrict__ out,
3736
int64 gather_dim_size, int64 indices_size,
3837
int64 slice_size, int64 out_size) {
39-
for (int64_t i : GpuGridRangeX(out_size)) {
38+
GPU_1D_KERNEL_LOOP(i, out_size) {
4039
Index batch_i = 0;
4140
Index indices_i = 0;
4241
Index slice_i = 0;
@@ -92,12 +91,9 @@ struct LaunchGatherKernelVectorized {
9291
const Tvec* params_vec = reinterpret_cast<const Tvec*>(params);
9392
Tvec* out_vec = reinterpret_cast<Tvec*>(out);
9493

95-
TF_ASSIGN_OR_RETURN(
96-
GpuLaunchConfig64 config,
97-
GetGpuLaunchConfig64(out_size_vec, d,
98-
&GatherOpKernel<Tvec, Index, is_axis_zero>,
99-
/*dynamic_shared_memory_size=*/0,
100-
/*block_size_limit=*/0));
94+
GpuLaunchConfig config = GetGpuLaunchConfig(
95+
out_size_vec, d, &GatherOpKernel<Tvec, Index, is_axis_zero>,
96+
/*dynamic_shared_memory_size=*/0, /*block_size_limit=*/0);
10197
return GpuLaunchKernel(
10298
GatherOpKernel<Tvec, Index, is_axis_zero>, config.block_count,
10399
config.thread_per_block, 0, d.stream(), params_vec, indices, out_vec,
@@ -146,21 +142,13 @@ struct GatherFunctor<GPUDevice, T, Index> {
146142
const int64 slice_size = params.dimension(2);
147143

148144
if (is_axis_zero) {
149-
Status status = LaunchGatherKernel<true>(
150-
d, params.data(), indices.data(), out.data(), gather_dim_size,
151-
indices_size, slice_size, out_size);
152-
if (!status.ok()) {
153-
ctx->CtxFailure(__FILE__, __LINE__, status);
154-
return -1;
155-
}
145+
TF_CHECK_OK(LaunchGatherKernel<true>(d, params.data(), indices.data(),
146+
out.data(), gather_dim_size,
147+
indices_size, slice_size, out_size));
156148
} else {
157-
Status status = LaunchGatherKernel<false>(
149+
TF_CHECK_OK(LaunchGatherKernel<false>(
158150
d, params.data(), indices.data(), out.data(), gather_dim_size,
159-
indices_size, slice_size, out_size);
160-
if (!status.ok()) {
161-
ctx->CtxFailure(__FILE__, __LINE__, status);
162-
return -1;
163-
}
151+
indices_size, slice_size, out_size));
164152
}
165153
// TODO(fpmc): enable indices validation on GPU.
166154
// Right now checking for indices out of bound in the kernel would

0 commit comments

Comments (0)