Skip to content

Commit 584d207

Browse files
alkantt authored and tensorflower-gardener committed
Allow Gather, SegmentReduction, and FillEmptyRows GPU Ops to handle int64 work element counts safely; fix unsafe DivUp arithmetic.
* Introduces `GpuLaunchConfig64` and updates `GetGpuLaunchConfig64` to return `absl::StatusOr<GpuLaunchConfig64>` to safely handle 64-bit work element counts and propagate errors without crashing. The `gather`, `segment_reduction`, and `fill_empty_rows` ops have been updated to use these new 64-bit utilities and to handle the returned statuses.
* Replaces deprecated 1D grid iterators with new ones that support `int64`-typed loop variables, preventing truncation issues when iterating over large grids.
* Fixes unsafe DivUp arithmetic in GPU kernel size computations.

PiperOrigin-RevId: 896629242
1 parent d1f33e5 commit 584d207

6 files changed

Lines changed: 241 additions & 118 deletions

File tree

tensorflow/core/kernels/depthwise_conv_op_gpu.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ limitations under the License.
3737
#endif
3838

3939
namespace tensorflow {
40+
using Eigen::numext::div_ceil;
4041

4142
namespace detail {
4243
template <typename T>
@@ -640,7 +641,7 @@ Status LaunchDepthwiseConv2dGPUSmall(OpKernelContext* ctx,
640641
case FORMAT_NHWC:
641642
block_dim = dim3(kBlockDepth, args.in_cols, block_height);
642643
block_count =
643-
args.batch * DivUp(args.out_depth, kBlockDepth) * kBlockDepth;
644+
args.batch * div_ceil(args.out_depth, kBlockDepth) * kBlockDepth;
644645
kernel =
645646
DepthwiseConv2dGPUKernelNHWCSmall<T, kDirection, kKnownFilterWidth,
646647
kKnownFilterHeight, kBlockDepth,
@@ -649,7 +650,7 @@ Status LaunchDepthwiseConv2dGPUSmall(OpKernelContext* ctx,
649650
case FORMAT_NCHW:
650651
block_dim = dim3(args.in_cols, block_height, kBlockDepth);
651652
block_count =
652-
DivUp(args.batch * args.out_depth, kBlockDepth) * kBlockDepth;
653+
div_ceil(args.batch * args.out_depth, kBlockDepth) * kBlockDepth;
653654
kernel =
654655
DepthwiseConv2dGPUKernelNCHWSmall<T, kDirection, kKnownFilterWidth,
655656
kKnownFilterHeight, kBlockDepth,
@@ -1567,14 +1568,14 @@ Status TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
15671568
case FORMAT_NHWC:
15681569
block_dim = dim3(kBlockDepth, args.in_cols, block_height);
15691570
block_count =
1570-
args.batch * DivUp(args.out_depth, kBlockDepth) * kBlockDepth;
1571+
args.batch * div_ceil(args.out_depth, kBlockDepth) * kBlockDepth;
15711572
kernel = DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall<
15721573
T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, kAccumPixels>;
15731574
break;
15741575
case FORMAT_NCHW:
15751576
block_dim = dim3(args.in_cols, block_height, kBlockDepth);
15761577
block_count =
1577-
DivUp(args.batch * args.out_depth, kBlockDepth) * kBlockDepth;
1578+
div_ceil(args.batch * args.out_depth, kBlockDepth) * kBlockDepth;
15781579
kernel = DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall<
15791580
T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, kAccumPixels>;
15801581
break;

tensorflow/core/kernels/fill_empty_rows_functor_gpu.cu.cc

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License.
1818
#define EIGEN_USE_GPU
1919

2020
#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive
21+
#include "xla/tsl/platform/statusor.h"
2122
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
2223
#include "tensorflow/core/framework/register_types.h"
2324
#include "tensorflow/core/framework/tensor_types.h"
@@ -54,7 +55,8 @@ typename T::ConstPointerType to_pointers(const T& x) {
5455
template <typename Tindex, typename... CallerArgs, typename... KernelArgs>
5556
Status wrap_kernel_call(void (*func)(KernelArgs...), const GPUDevice& device,
5657
Tindex size, CallerArgs... args) {
57-
auto config = GetGpuLaunchConfig(size, device);
58+
TF_ASSIGN_OR_RETURN(GpuLaunchConfig64 config,
59+
GetGpuLaunchConfig64(size, device));
5860
return GpuLaunchKernel(func, config.block_count, config.thread_per_block, 0,
5961
device.stream(), config, to_pointers(args)...);
6062
}
@@ -77,10 +79,10 @@ struct CastFunctor {
7779
// true if the indices are not ordered by row.
7880
template <typename Tindex>
7981
__global__ __launch_bounds__(1024) void CountElementsPerRowKernel(
80-
GpuLaunchConfig cfg, Tindex dense_rows, int rank, const Tindex* indices,
82+
GpuLaunchConfig64 cfg, Tindex dense_rows, int rank, const Tindex* indices,
8183
Tindex* elements_per_row, int* rows_are_not_ordered,
8284
int* first_invalid_index) {
83-
GPU_1D_KERNEL_LOOP(i, cfg.virtual_thread_count) {
85+
for (int64_t i : GpuGridRangeX(cfg.virtual_thread_count)) {
8486
Tindex row = indices[i * rank];
8587
if (row < 0 || row >= dense_rows) {
8688
GpuAtomicMin(first_invalid_index, i);
@@ -98,18 +100,19 @@ __global__ __launch_bounds__(1024) void CountElementsPerRowKernel(
98100

99101
template <typename Tindex>
100102
__global__ __launch_bounds__(1024) void CopyRowIndicesKernel(
101-
GpuLaunchConfig cfg, int rank, const Tindex* indices, Tindex* row_indices) {
102-
GPU_1D_KERNEL_LOOP(i, cfg.virtual_thread_count) {
103+
GpuLaunchConfig64 cfg, int rank, const Tindex* indices,
104+
Tindex* row_indices) {
105+
for (int64_t i : GpuGridRangeX(cfg.virtual_thread_count)) {
103106
row_indices[i] = indices[i * rank];
104107
}
105108
}
106109

107110
// Sets empty_row_indicator[row] to whether the row is empty.
108111
template <typename Tindex>
109112
__global__ __launch_bounds__(1024) void ComputeEmptyRowIndicatorKernel(
110-
GpuLaunchConfig cfg, const Tindex* elements_per_row,
113+
GpuLaunchConfig64 cfg, const Tindex* elements_per_row,
111114
bool* empty_row_indicator) {
112-
GPU_1D_KERNEL_LOOP(row, cfg.virtual_thread_count) {
115+
for (int64_t row : GpuGridRangeX(cfg.virtual_thread_count)) {
113116
empty_row_indicator[row] = elements_per_row[row] == 0;
114117
}
115118
}
@@ -119,11 +122,11 @@ __global__ __launch_bounds__(1024) void ComputeEmptyRowIndicatorKernel(
119122
// empty row.
120123
template <typename T, typename Tindex>
121124
__global__ __launch_bounds__(1024) void ScatterInputElementsKernel(
122-
GpuLaunchConfig cfg, Tindex dense_rows, int rank,
125+
GpuLaunchConfig64 cfg, Tindex dense_rows, int rank,
123126
const Tindex* input_index_map, const Tindex* indices, const T* values,
124127
const Tindex* num_new_rows_before, Tindex* output_indices, T* output_values,
125128
Tindex* reverse_index_map) {
126-
GPU_1D_KERNEL_LOOP(i, cfg.virtual_thread_count) {
129+
for (int64_t i : ::tensorflow::GpuGridRangeX(cfg.virtual_thread_count)) {
127130
Tindex input_i = input_index_map ? input_index_map[i] : i;
128131
Tindex row = indices[input_i * rank];
129132
Tindex output_i = i + num_new_rows_before[row];
@@ -141,10 +144,10 @@ __global__ __launch_bounds__(1024) void ScatterInputElementsKernel(
141144
// input) in output_indices and output_values.
142145
template <typename T, typename Tindex>
143146
__global__ __launch_bounds__(1024) void ScatterNewElementsKernel(
144-
GpuLaunchConfig cfg, int rank, const T* default_value,
147+
GpuLaunchConfig64 cfg, int rank, const T* default_value,
145148
const Tindex* num_new_rows_through, const Tindex* input_row_ends,
146149
const bool* empty_row_indicator, Tindex* output_indices, T* output_values) {
147-
GPU_1D_KERNEL_LOOP(row, cfg.virtual_thread_count) {
150+
for (int64_t row : ::tensorflow::GpuGridRangeX(cfg.virtual_thread_count)) {
148151
if (!empty_row_indicator[row]) continue; // Only process empty rows
149152
Tindex input_i = (row == 0 ? 0 : input_row_ends[row - 1]);
150153
Tindex output_i = input_i + (row == 0 ? 0 : num_new_rows_through[row - 1]);
@@ -489,9 +492,9 @@ namespace {
489492

490493
template <typename T, typename Tindex>
491494
__global__ __launch_bounds__(1024) void GatherOriginalGradValuesKernel(
492-
GpuLaunchConfig cfg, const Tindex* reverse_index_map, const T* grad_values,
493-
T* d_values, bool* visited, Tindex N_full) {
494-
GPU_1D_KERNEL_LOOP(input_i, cfg.virtual_thread_count) {
495+
GpuLaunchConfig64 cfg, const Tindex* reverse_index_map,
496+
const T* grad_values, T* d_values, bool* visited, Tindex N_full) {
497+
for (int64_t input_i : GpuGridRangeX(cfg.virtual_thread_count)) {
495498
Tindex output_i = reverse_index_map[input_i];
496499
if (output_i >= 0 && output_i < N_full) {
497500
d_values[input_i] = grad_values[output_i];

tensorflow/core/kernels/gather_functor_gpu.cu.h

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ limitations under the License.
2020

2121
#define EIGEN_USE_GPU
2222

23+
#include "xla/tsl/platform/statusor.h"
2324
#include "tensorflow/core/framework/register_types.h"
2425
#include "tensorflow/core/kernels/gather_functor.h"
2526
#include "tensorflow/core/platform/types.h"
@@ -35,7 +36,7 @@ __global__ void GatherOpKernel(const ValueOrVec* __restrict__ params,
3536
ValueOrVec* __restrict__ out,
3637
int64 gather_dim_size, int64 indices_size,
3738
int64 slice_size, int64 out_size) {
38-
GPU_1D_KERNEL_LOOP(i, out_size) {
39+
for (int64_t i : GpuGridRangeX(out_size)) {
3940
Index batch_i = 0;
4041
Index indices_i = 0;
4142
Index slice_i = 0;
@@ -91,9 +92,12 @@ struct LaunchGatherKernelVectorized {
9192
const Tvec* params_vec = reinterpret_cast<const Tvec*>(params);
9293
Tvec* out_vec = reinterpret_cast<Tvec*>(out);
9394

94-
GpuLaunchConfig config = GetGpuLaunchConfig(
95-
out_size_vec, d, &GatherOpKernel<Tvec, Index, is_axis_zero>,
96-
/*dynamic_shared_memory_size=*/0, /*block_size_limit=*/0);
95+
TF_ASSIGN_OR_RETURN(
96+
GpuLaunchConfig64 config,
97+
GetGpuLaunchConfig64(out_size_vec, d,
98+
&GatherOpKernel<Tvec, Index, is_axis_zero>,
99+
/*dynamic_shared_memory_size=*/0,
100+
/*block_size_limit=*/0));
97101
return GpuLaunchKernel(
98102
GatherOpKernel<Tvec, Index, is_axis_zero>, config.block_count,
99103
config.thread_per_block, 0, d.stream(), params_vec, indices, out_vec,
@@ -142,13 +146,21 @@ struct GatherFunctor<GPUDevice, T, Index> {
142146
const int64 slice_size = params.dimension(2);
143147

144148
if (is_axis_zero) {
145-
TF_CHECK_OK(LaunchGatherKernel<true>(d, params.data(), indices.data(),
146-
out.data(), gather_dim_size,
147-
indices_size, slice_size, out_size));
149+
Status status = LaunchGatherKernel<true>(
150+
d, params.data(), indices.data(), out.data(), gather_dim_size,
151+
indices_size, slice_size, out_size);
152+
if (!status.ok()) {
153+
ctx->CtxFailure(__FILE__, __LINE__, status);
154+
return -1;
155+
}
148156
} else {
149-
TF_CHECK_OK(LaunchGatherKernel<false>(
157+
Status status = LaunchGatherKernel<false>(
150158
d, params.data(), indices.data(), out.data(), gather_dim_size,
151-
indices_size, slice_size, out_size));
159+
indices_size, slice_size, out_size);
160+
if (!status.ok()) {
161+
ctx->CtxFailure(__FILE__, __LINE__, status);
162+
return -1;
163+
}
152164
}
153165
// TODO(fpmc): enable indices validation on GPU.
154166
// Right now checking for indices out of bound in the kernel would

0 commit comments

Comments (0)