From a12cf2d09dd9279aa6ec0e137d840239745cdd06 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 20 Feb 2020 16:58:47 +0100 Subject: [PATCH 1/7] GPU: Workaround for bug in clang Bug reported here: https://bugs.llvm.org/show_bug.cgi?id=44974, but since the workaround is trivial and makes the code even more readable, we can just as well keep it. --- .../TPCClusterFinder/GPUTPCCFPeakFinder.cxx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.cxx index 16d0914f7d0d4..d8df91ab75fa3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFPeakFinder.cxx @@ -99,11 +99,11 @@ GPUd() bool GPUTPCCFPeakFinder::isPeak( bool peak = true; -#define CMP_NEIGHBOR(dp, dt, cmpOp) \ - do { \ - PackedCharge p = chargeMap[pos.delta({dp, dt})]; \ - const Charge otherCharge = p.unpack(); \ - peak &= (otherCharge cmpOp myCharge); \ +#define CMP_NEIGHBOR(dp, dt, cmpOp) \ + do { \ + PackedCharge p = chargeMap[pos.delta(Delta2{dp, dt})]; \ + const Charge otherCharge = p.unpack(); \ + peak &= (otherCharge cmpOp myCharge); \ } while (false) #define CMP_LT CMP_NEIGHBOR(-1, -1, <=) From 70a564f74875fdd5352c610e73c49488c6716d27 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 20 Feb 2020 17:06:25 +0100 Subject: [PATCH 2/7] GPU: Temporary workaround for bug in llvm IL to SPIR-V converter needed for OpenCL2 --- GPU/Common/GPUCommonMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/Common/GPUCommonMath.h b/GPU/Common/GPUCommonMath.h index a25b09c916fd4..80999013c5491 100644 --- a/GPU/Common/GPUCommonMath.h +++ b/GPU/Common/GPUCommonMath.h @@ -138,7 +138,7 @@ GPUhdi() unsigned int GPUCommonMath::Clz(unsigned int x) GPUhdi() unsigned int GPUCommonMath::Popcount(unsigned int x) { -#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && (!defined(__OPENCL__) || defined(__OPENCLCPP__)) +#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && (!defined(__OPENCL__) /*|| defined(__OPENCLCPP__)*/) // TODO: remove OPENCLCPP workaround when reported SPIR-V bug is fixed return CHOICE(__builtin_popcount(x), __popc(x), __builtin_popcount(x)); // use builtin if available #else unsigned int retVal = 0; From db2e4c755e16d64589664e426870f8cb520b21e0 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Tue, 18 Feb 2020 22:37:35 +0100 Subject: [PATCH 3/7] GPU: Add option to register all temporary input memory used by the standalone benchmark --- GPU/GPUTracking/Base/GPUReconstruction.h | 3 +++ .../Base/GPUReconstructionConvert.cxx | 2 +- .../Base/GPUReconstructionConvert.h | 2 +- GPU/GPUTracking/Base/GPUSettings.cxx | 1 + GPU/GPUTracking/Base/GPUSettings.h | 1 + GPU/GPUTracking/Global/GPUChainTracking.cxx | 19 ++++++++++++++++++- GPU/GPUTracking/Standalone/qconfigoptions.h | 1 + GPU/GPUTracking/Standalone/standalone.cxx | 1 + 8 files changed, 27 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index 7f5339aeae66a..382ceda7fd475 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -342,6 +342,9 @@ inline void GPUReconstruction::AllocateIOMemoryHelper(unsigned int n, const T*& } u.reset(new T[n]); ptr = u.get(); + if (mDeviceProcessingSettings.registerStandaloneInputMemory) { + registerMemoryForGPU(u.get(), n * sizeof(T)); + } } template diff --git a/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx b/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx index 9e027fd9f949c..d2b56290d98f4 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx @@ -180,7 +180,7 @@ void GPUReconstructionConvert::ZSstreamOut(unsigned short* bufIn, unsigned int& lenIn = 0; } -void GPUReconstructionConvert::RunZSEncoder(const GPUTrackingInOutDigits* in, const GPUTrackingInOutZS*& out, const GPUParam& param, bool zs12bit) +void GPUReconstructionConvert::RunZSEncoder(const GPUTrackingInOutDigits* in, GPUTrackingInOutZS*& out, const GPUParam& param, bool zs12bit) { #ifdef GPUCA_TPC_GEOMETRY_O2 static std::vector> buffer[NSLICES][GPUTrackingInOutZS::NENDPOINTS]; diff --git a/GPU/GPUTracking/Base/GPUReconstructionConvert.h b/GPU/GPUTracking/Base/GPUReconstructionConvert.h index 17fa7c1a6d0bd..bf371c616e8e9 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionConvert.h +++ b/GPU/GPUTracking/Base/GPUReconstructionConvert.h @@ -48,7 +48,7 @@ class GPUReconstructionConvert constexpr static unsigned int NSLICES = GPUCA_NSLICES; static void ConvertNativeToClusterData(o2::tpc::ClusterNativeAccess* native, std::unique_ptr* clusters, unsigned int* nClusters, const TPCFastTransform* transform, int continuousMaxTimeBin = 0); static void ConvertRun2RawToNative(o2::tpc::ClusterNativeAccess& native, std::unique_ptr& nativeBuffer, const AliHLTTPCRawCluster** rawClusters, unsigned int* nRawClusters); - static void RunZSEncoder(const GPUTrackingInOutDigits* in, const GPUTrackingInOutZS*& out, const GPUParam& param, bool zs12bit); + static void RunZSEncoder(const GPUTrackingInOutDigits* in, GPUTrackingInOutZS*& out, const GPUParam& param, bool zs12bit); static void RunZSFilter(std::unique_ptr* buffers, const deprecated::PackedDigit* const* ptrs, size_t* nsb, const size_t* ns, const GPUParam& param, bool zs12bit); static int GetMaxTimeBin(const o2::tpc::ClusterNativeAccess& native); static int GetMaxTimeBin(const GPUTrackingInOutDigits& digits); diff --git a/GPU/GPUTracking/Base/GPUSettings.cxx b/GPU/GPUTracking/Base/GPUSettings.cxx index 1cdd6ce384f9c..532a6366555e3 100644 --- a/GPU/GPUTracking/Base/GPUSettings.cxx +++ b/GPU/GPUTracking/Base/GPUSettings.cxx @@ -87,4 +87,5 @@ void GPUSettingsDeviceProcessing::SetDefaults() trackletSelectorInPipeline = false; forceMemoryPoolSize = 0; nTPCClustererLanes = 3; + registerStandaloneInputMemory = false; } diff --git a/GPU/GPUTracking/Base/GPUSettings.h b/GPU/GPUTracking/Base/GPUSettings.h index 0e2fde408d7f6..e326f29f85966 100644 --- a/GPU/GPUTracking/Base/GPUSettings.h +++ b/GPU/GPUTracking/Base/GPUSettings.h @@ -143,6 +143,7 @@ struct GPUSettingsDeviceProcessing { size_t forceMemoryPoolSize; // Override size of memory pool to be allocated on GPU / Host (set =1 to force allocating all device memory, if supported) int nTPCClustererLanes; // Number of TPC clusterers that can run in parallel bool deviceTimers; // Use device timers instead of host-based timers + bool registerStandaloneInputMemory; // Automatically register memory for the GPU which is used as input for the standalone benchmark }; } // namespace gpu } // namespace GPUCA_NAMESPACE diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index a1a8523d97806..f123f26a660d7 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -691,6 +691,9 @@ void GPUChainTracking::ConvertNativeToClusterDataLegacy() GPUReconstructionConvert::ConvertNativeToClusterData(mClusterNativeAccess.get(), mIOMem.clusterData, mIOPtrs.nClusterData, processors()->calibObjects.fastTransform, param().continuousMaxTimeBin); for (unsigned int i = 0; i < NSLICES; i++) { mIOPtrs.clusterData[i] = mIOMem.clusterData[i].get(); + if (GetDeviceProcessingSettings().registerStandaloneInputMemory) { + mRec->registerMemoryForGPU(mIOMem.clusterData[i].get(), mIOPtrs.nClusterData[i] * sizeof(*mIOPtrs.clusterData[i])); + } } mIOPtrs.clustersNative = nullptr; mIOMem.clustersNative.reset(nullptr); @@ -709,11 +712,25 @@ void GPUChainTracking::ConvertRun2RawToNative() mIOMem.clusterData[i].reset(nullptr); } mIOPtrs.clustersNative = mClusterNativeAccess.get(); + if (GetDeviceProcessingSettings().registerStandaloneInputMemory) { + mRec->registerMemoryForGPU(mIOMem.clustersNative.get(), mClusterNativeAccess->nClustersTotal * sizeof(*mClusterNativeAccess->clustersLinear)); + } } void GPUChainTracking::ConvertZSEncoder(bool zs12bit) { - GPUReconstructionConvert::RunZSEncoder(mIOPtrs.tpcPackedDigits, mIOPtrs.tpcZS, param(), zs12bit); + GPUTrackingInOutZS* tmp; + GPUReconstructionConvert::RunZSEncoder(mIOPtrs.tpcPackedDigits, tmp, param(), zs12bit); + mIOPtrs.tpcZS = tmp; + if (GetDeviceProcessingSettings().registerStandaloneInputMemory) { + for (unsigned int i = 0; i < NSLICES; i++) { + for (unsigned int j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { + for (unsigned int k = 0; k < tmp->slice[i].count[j]; k++) { + mRec->registerMemoryForGPU(tmp->slice[i].zsPtr[j][k], tmp->slice[i].nZSPtr[j][k] * TPCZSHDR::TPC_ZS_PAGE_SIZE); + } + } + } + } } void GPUChainTracking::ConvertZSFilter(bool zs12bit) diff --git a/GPU/GPUTracking/Standalone/qconfigoptions.h b/GPU/GPUTracking/Standalone/qconfigoptions.h index b9a2f1185c306..0490901b6a32d 100644 --- a/GPU/GPUTracking/Standalone/qconfigoptions.h +++ b/GPU/GPUTracking/Standalone/qconfigoptions.h @@ -138,6 +138,7 @@ AddOption(dzdr, float, 2.5f, "DzDr", 0, "Use dZ/dR search window instead of vert AddOption(cont, bool, false, "continuous", 0, "Process continuous timeframe data") AddOption(forceMemorySize, unsigned long long int, 1, "memSize", 0, "Force size of allocated GPU / page locked host memory", min(0ull)) AddOption(outputcontrolmem, unsigned long long int, 0, "outputMemory", 0, "Use predefined output buffer of this size", min(0ull), message("Using %lld bytes as output memory")) +AddOption(registerInputMemory, bool, false, "registerInputMemory", 0, "Automatically register input memory buffers for the GPU") AddOption(affinity, int, -1, "cpuAffinity", 0, "Pin CPU affinity to this CPU core", min(-1), message("Setting affinity to restrict on CPU %d")) AddOption(fifo, bool, false, "fifoScheduler", 0, "Use FIFO realtime scheduler", message("Setting FIFO scheduler: %s")) AddOption(fpe, bool, true, "fpe", 0, "Trap on floating point exceptions") diff --git a/GPU/GPUTracking/Standalone/standalone.cxx b/GPU/GPUTracking/Standalone/standalone.cxx index b09f90960dda1..d55e2d774fb13 100644 --- a/GPU/GPUTracking/Standalone/standalone.cxx +++ b/GPU/GPUTracking/Standalone/standalone.cxx @@ -329,6 +329,7 @@ int SetupReconstruction() devProc.globalInitMutex = configStandalone.gpuInitMutex; devProc.gpuDeviceOnly = configStandalone.oclGPUonly; devProc.memoryAllocationStrategy = configStandalone.allocationStrategy; + devProc.registerStandaloneInputMemory = configStandalone.registerInputMemory; recSet.tpcRejectionMode = configStandalone.configRec.tpcReject; if (configStandalone.configRec.tpcRejectThreshold != 0.f) { recSet.tpcRejectQPt = 1.f / configStandalone.configRec.tpcRejectThreshold; From 14e2443ca16624b75687a689b302b5dcdb11d669 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Wed, 19 Feb 2020 12:03:48 +0100 Subject: [PATCH 4/7] Fix CMake warning --- DataFormats/MemoryResources/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DataFormats/MemoryResources/CMakeLists.txt b/DataFormats/MemoryResources/CMakeLists.txt index 1d85128149ccd..8979f242cd4dd 100644 --- a/DataFormats/MemoryResources/CMakeLists.txt +++ b/DataFormats/MemoryResources/CMakeLists.txt @@ -16,7 +16,7 @@ if(NOT APPLE) SOURCES test/testMemoryResources.cxx PUBLIC_LINK_LIBRARIES O2::MemoryResources COMPONENT_NAME MemoryResources) -endif(APPLE) +endif(NOT APPLE) o2_add_test(observer_ptr SOURCES test/test_observer_ptr.cxx From 2e239bf81be35a4136ee8d78d7ccce661265de8c Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 20 Feb 2020 23:48:59 +0100 Subject: [PATCH 5/7] GPU: Override incorrectly reported HIP constant memory size --- GPU/Common/GPUCommonDefSettings.h | 10 +++++----- GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/GPU/Common/GPUCommonDefSettings.h b/GPU/Common/GPUCommonDefSettings.h index c4dccd143e881..58d366e60bdb6 100644 --- a/GPU/Common/GPUCommonDefSettings.h +++ b/GPU/Common/GPUCommonDefSettings.h @@ -21,12 +21,12 @@ #error Please include GPUCommonDef.h! #endif -//#define GPUCA_OPENCL_CPP_CLANG_C11_ATOMICS //Use C11 atomic instead of old style atomics for OpenCL C++ in clang (OpenCL 2.2 C++ will use C++11 atomics irrespectively) +//#define GPUCA_OPENCL_CPP_CLANG_C11_ATOMICS // Use C11 atomic instead of old style atomics for OpenCL C++ in clang (OpenCL 2.2 C++ will use C++11 atomics irrespectively) -//#define GPUCA_CUDA_NO_CONSTANT_MEMORY //Do not use constant memory for CUDA -#define GPUCA_HIP_NO_CONSTANT_MEMORY //Do not use constant memory for HIP - MANDATORY for now since all AMD GPUs have insufficient constant memory with HIP -//#define GPUCA_OPENCL_NO_CONSTANT_MEMORY //Do not use constant memory for OpenCL 1.2 -#define GPUCA_OPENCLCPP_NO_CONSTANT_MEMORY //Do not use constant memory for OpenCL C++ - MANDATORY as OpenCL cannot cast between __constant and __generic yet! +//#define GPUCA_CUDA_NO_CONSTANT_MEMORY // Do not use constant memory for CUDA +//#define GPUCA_HIP_NO_CONSTANT_MEMORY // Do not use constant memory for HIP - MANDATORY for now since all AMD GPUs have insufficient constant memory with HIP +//#define GPUCA_OPENCL_NO_CONSTANT_MEMORY // Do not use constant memory for OpenCL 1.2 +#define GPUCA_OPENCLCPP_NO_CONSTANT_MEMORY // Do not use constant memory for OpenCL C++ - MANDATORY as OpenCL cannot cast between __constant and __generic yet! // clang-format on diff --git a/GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx b/GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx index adc02675879cd..39acf0d58ca94 100644 --- a/GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx +++ b/GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx @@ -218,6 +218,7 @@ int GPUReconstructionHIPBackend::InitDevice_Runtime() mDeviceId = bestDevice; GPUFailedMsgI(hipGetDeviceProperties(&hipDeviceProp_t, mDeviceId)); + hipDeviceProp_t.totalConstMem = 65536; // TODO: Remove workaround, fixes incorrectly reported HIP constant memory if (mDeviceProcessingSettings.debugLevel >= 2) { GPUInfo("Using HIP Device %s with Properties:", hipDeviceProp_t.name); From b85567c070919b41faf12ac9ed4629dca0675412 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Fri, 21 Feb 2020 01:00:20 +0100 Subject: [PATCH 6/7] GPU: Use launch_bounds also for HIP instead of attribute(num_vgpr) --- GPU/Common/GPUDefGPUParameters.h | 2 +- GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/Common/GPUDefGPUParameters.h b/GPU/Common/GPUDefGPUParameters.h index 5b76ddb8985ed..870862dd55781 100644 --- a/GPU/Common/GPUDefGPUParameters.h +++ b/GPU/Common/GPUDefGPUParameters.h @@ -27,7 +27,7 @@ #define GPUCA_THREAD_COUNT_CONSTRUCTOR 128 #define GPUCA_THREAD_COUNT_SELECTOR 128 #define GPUCA_THREAD_COUNT_FINDER 128 - #define GPUCA_NEIGHBORSFINDER_REGS REG, 64 + #define GPUCA_NEIGHBORSFINDER_REGS REG, (GPUCA_THREAD_COUNT_FINDER, 1) #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 0 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 12 #elif defined(GPUCA_GPUTYPE_TURING) diff --git a/GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx b/GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx index 39acf0d58ca94..36c198fdc0f19 100644 --- a/GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx +++ b/GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx @@ -72,7 +72,7 @@ GPUg() void runKernelHIP(GPUCA_CONSMEM_PTR int iSlice, Args... args) */ #undef GPUCA_KRNL_REG -#define GPUCA_KRNL_REG(num) __attribute__((amdgpu_num_vgpr(num))) +#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_STRIP(args)) #define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward) GPUCA_KRNL_WRAP(GPUCA_KRNL_, x_class, x_attributes, x_arguments, x_forward) #define GPUCA_KRNL_BACKEND_CLASS GPUReconstructionHIPBackend #define GPUCA_KRNL_CALL_single(x_class, x_attributes, x_arguments, x_forward) \ From bf1ffb7b8be9657687c596bf906f71d5516ed325 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Fri, 21 Feb 2020 01:15:24 +0100 Subject: [PATCH 7/7] GPU: Fix HIP memcpyToSymbol, which requires additional HIP_SYMBOL macro --- GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx b/GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx index 36c198fdc0f19..75dae977443e7 100644 --- a/GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx +++ b/GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx @@ -412,9 +412,9 @@ size_t GPUReconstructionHIPBackend::WriteToConstantMemory(size_t offset, const v { #ifndef GPUCA_HIP_NO_CONSTANT_MEMORY if (stream == -1) { - GPUFailedMsg(hipMemcpyToSymbol(gGPUConstantMemBuffer, src, size, offset, hipMemcpyHostToDevice)); + GPUFailedMsg(hipMemcpyToSymbol(HIP_SYMBOL(gGPUConstantMemBuffer), src, size, offset, hipMemcpyHostToDevice)); } else { - GPUFailedMsg(hipMemcpyToSymbolAsync(gGPUConstantMemBuffer, src, size, offset, hipMemcpyHostToDevice, mInternals->HIPStreams[stream])); + GPUFailedMsg(hipMemcpyToSymbolAsync(HIP_SYMBOL(gGPUConstantMemBuffer), src, size, offset, hipMemcpyHostToDevice, mInternals->HIPStreams[stream])); } if (ev && stream != -1) { GPUFailedMsg(hipEventRecord(*(hipEvent_t*)ev, mInternals->HIPStreams[stream]));