[Effdetd0/PyT] Replace THC usage to fix build failures

Victor49152 · shakandrew · commit bd4cd216192e · 2022-02-03T08:16:47.000-08:00
diff --git a/PyTorch/Detection/Efficientdet/Dockerfile b/PyTorch/Detection/Efficientdet/Dockerfile
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.06-py3
+ARG FROM_IMAGE_NAME=gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel
 
 FROM ${FROM_IMAGE_NAME}
 
diff --git a/PyTorch/Detection/Efficientdet/effdet/csrc/focal_loss/focal_loss_cuda_kernel.cu b/PyTorch/Detection/Efficientdet/effdet/csrc/focal_loss/focal_loss_cuda_kernel.cu
@@ -15,7 +15,6 @@
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>
-#include <THC/THC.h>
 
 thread_local int multiProcessorCount=0;
 
@@ -256,7 +255,7 @@ std::vector<at::Tensor> focal_loss_forward_cuda(
         });
   }
 
-  THCudaCheck(cudaGetLastError());
+  C10_CUDA_CHECK(cudaGetLastError());
   return {loss, partial_grad};
 }
 
@@ -281,6 +280,6 @@ at::Tensor focal_loss_backward_cuda(const at::Tensor &grad_output,
                                          partial_grad.numel());
       });
 
-  THCudaCheck(cudaGetLastError());
+  C10_CUDA_CHECK(cudaGetLastError());
   return partial_grad;
 }
diff --git a/PyTorch/Detection/Efficientdet/effdet/csrc/nms/cuda/nms.cu b/PyTorch/Detection/Efficientdet/effdet/csrc/nms/cuda/nms.cu
@@ -14,9 +14,8 @@
 
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
-
-#include <THC/THC.h>
-#include <THC/THCDeviceUtils.cuh>
+#include <ATen/ceil_div.h>
+#include <c10/cuda/CUDACachingAllocator.h>
 
 #include <vector>
 #include <iostream>
@@ -74,7 +73,7 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
         t |= 1ULL << i;
       }
     }
-    const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
+    const int col_blocks = at::ceil_div(n_boxes, threadsPerBlock);
     dev_mask[cur_box_idx * col_blocks + col_start] = t;
   }
 }
@@ -89,28 +88,28 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
 
   int boxes_num = boxes.size(0);
 
-  const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
+  const int col_blocks = at::ceil_div(boxes_num, threadsPerBlock);
 
   scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();
 
-  THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
+  at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
 
   unsigned long long* mask_dev = NULL;
   //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
   //                      boxes_num * col_blocks * sizeof(unsigned long long)));
 
-  mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
+  mask_dev = (unsigned long long*) c10::cuda::CUDACachingAllocator::raw_alloc(boxes_num * col_blocks * sizeof(unsigned long long));
 
-  dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
-              THCCeilDiv(boxes_num, threadsPerBlock));
+  dim3 blocks(at::ceil_div(boxes_num, threadsPerBlock),
+              at::ceil_div(boxes_num, threadsPerBlock));
   dim3 threads(threadsPerBlock);
   nms_kernel<<<blocks, threads>>>(boxes_num,
                                   nms_overlap_thresh,
                                   boxes_dev,
                                   mask_dev);
 
   std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
-  THCudaCheck(cudaMemcpy(&mask_host[0],
+  C10_CUDA_CHECK(cudaMemcpy(&mask_host[0],
                         mask_dev,
                         sizeof(unsigned long long) * boxes_num * col_blocks,
                         cudaMemcpyDeviceToHost));
@@ -135,7 +134,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
     }
   }
 
-  THCudaFree(state, mask_dev);
+  c10::cuda::CUDACachingAllocator::raw_delete(mask_dev);
   // TODO improve this part
   return std::get<0>(order_t.index({
                        keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(

Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,6 @@`
`15`	`15`	`#include <ATen/ATen.h>`
`16`	`16`	`#include <ATen/AccumulateType.h>`
`17`	`17`	`#include <ATen/cuda/CUDAContext.h>`
`18`		`-#include <THC/THC.h>`
`19`	`18`
`20`	`19`	`thread_local int multiProcessorCount=0;`
`21`	`20`
`@@ -256,7 +255,7 @@ std::vector<at::Tensor> focal_loss_forward_cuda(`
`256`	`255`	`});`
`257`	`256`	`}`
`258`	`257`
`259`		`- THCudaCheck(cudaGetLastError());`
	`258`	`+ C10_CUDA_CHECK(cudaGetLastError());`
`260`	`259`	`return {loss, partial_grad};`
`261`	`260`	`}`
`262`	`261`
`@@ -281,6 +280,6 @@ at::Tensor focal_loss_backward_cuda(const at::Tensor &grad_output,`
`281`	`280`	`partial_grad.numel());`
`282`	`281`	`});`
`283`	`282`
`284`		`- THCudaCheck(cudaGetLastError());`
	`283`	`+ C10_CUDA_CHECK(cudaGetLastError());`
`285`	`284`	`return partial_grad;`
`286`	`285`	`}`