GPU: In the standalone test, by default size the GPU allocation from the auto-detected GPU memory when possible

davidrohr · davidrohr · commit e82ac5f5a21c · 2020-02-04T10:44:01.000+01:00
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -141,7 +141,9 @@ int GPUReconstruction::Init()
     mDeviceMemorySize += memGpu;
     mHostMemorySize += memHost;
   }
-  if (mDeviceProcessingSettings.forceMemoryPoolSize) {
+  if (mDeviceProcessingSettings.forceMemoryPoolSize && mDeviceProcessingSettings.forceMemoryPoolSize <= 2 && CanQueryMaxMemory()) {
+    mDeviceMemorySize = mDeviceProcessingSettings.forceMemoryPoolSize;
+  } else if (mDeviceProcessingSettings.forceMemoryPoolSize > 2) {
     mDeviceMemorySize = mHostMemorySize = mDeviceProcessingSettings.forceMemoryPoolSize;
   }
 
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -257,6 +257,7 @@ class GPUReconstruction
 
   // Others
   virtual RecoStepField AvailableRecoSteps() { return RecoStep::AllRecoSteps; }
+  virtual bool CanQueryMaxMemory() { return false; } // Default: false. Init() only accepts the special forceMemoryPoolSize values 1 / 2 when this returns true
 
   // Pointers to tracker classes
   GPUConstantMem* processors() { return mHostConstantMem.get(); }
diff --git a/GPU/GPUTracking/Base/GPUSettings.h b/GPU/GPUTracking/Base/GPUSettings.h
@@ -140,7 +140,7 @@ struct GPUSettingsDeviceProcessing {
   int nStreams;                       // Number of parallel GPU streams
   bool trackletConstructorInPipeline; // Run tracklet constructor in pileline like the preceeding tasks instead of as one big block
   bool trackletSelectorInPipeline;    // Run tracklet selector in pipeline, requres also tracklet constructor in pipeline
-  size_t forceMemoryPoolSize;         // Override size of memory pool to be allocated on GPU / Host
+  size_t forceMemoryPoolSize;         // Override size of memory pool to be allocated on GPU / Host; values 1 and 2 are special and only affect the GPU pool, if supported (CUDA backend: 1 = all detected free device memory minus 512 MB, 2 = 2/3 of it, leaving room for the event display)
   int nTPCClustererLanes;             // Number of TPC clusterers that can run in parallel
   bool deviceTimers;                  // Use device timers instead of host-based timers
 };
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -156,6 +156,7 @@ int GPUReconstructionCUDABackend::InitDevice_Runtime()
   const int reqVerMaj = 2;
   const int reqVerMin = 0;
   std::vector<bool> devicesOK(count, false);
+  std::vector<size_t> devMemory(count, 0);
   bool contextCreated = false;
   for (int i = 0; i < count; i++) {
     if (mDeviceProcessingSettings.debugLevel >= 4) {
@@ -200,7 +201,7 @@ int GPUReconstructionCUDABackend::InitDevice_Runtime()
     } else if (cudaDeviceProp.major < reqVerMaj || (cudaDeviceProp.major == reqVerMaj && cudaDeviceProp.minor < reqVerMin)) {
       deviceOK = false;
       deviceFailure = "Too low device revision";
-    } else if (free < mDeviceMemorySize) {
+    } else if (free < std::max(mDeviceMemorySize, (size_t)512 * 1024 * 1024)) {
       deviceOK = false;
       deviceFailure = "Insufficient GPU memory";
     }
@@ -213,6 +214,7 @@ int GPUReconstructionCUDABackend::InitDevice_Runtime()
       continue;
     }
     devicesOK[i] = true;
+    devMemory[i] = free;
     if (deviceSpeed > bestDeviceSpeed) {
       bestDevice = i;
       bestDeviceSpeed = deviceSpeed;
@@ -226,7 +228,7 @@ int GPUReconstructionCUDABackend::InitDevice_Runtime()
   bool noDevice = false;
   if (bestDevice == -1) {
     GPUWarning("No %sCUDA Device available, aborting CUDA Initialisation", count ? "appropriate " : "");
-    GPUImportant("Requiring Revision %d.%d, Mem: %lld", reqVerMaj, reqVerMin, (long long int)mDeviceMemorySize);
+    GPUImportant("Requiring Revision %d.%d, Mem: %lld", reqVerMaj, reqVerMin, (long long int)std::max(mDeviceMemorySize, (size_t)512 * 1024 * 1024));
     noDevice = true;
   } else if (mDeviceProcessingSettings.deviceNum > -1) {
     if (mDeviceProcessingSettings.deviceNum >= (signed)count) {
@@ -303,6 +305,12 @@ int GPUReconstructionCUDABackend::InitDevice_Runtime()
     return (1);
   }
 
+  if (mDeviceMemorySize == 2) {
+    mDeviceMemorySize = devMemory[mDeviceId] * 2 / 3; // Leave 1/3 of GPU memory for event display
+  } else if (mDeviceMemorySize == 1) {
+    mDeviceMemorySize = devMemory[mDeviceId] - 512 * 1024 * 1024; // Take all GPU memory but 1/2 GB
+  }
+
   if (mDeviceMemorySize > cudaDeviceProp.totalGlobalMem || GPUFailedMsgI(cudaMalloc(&mDeviceMemoryBase, mDeviceMemorySize))) {
     GPUError("CUDA Memory Allocation Error");
     GPUFailedMsgI(cudaDeviceReset());
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
@@ -50,7 +50,8 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase
     GPUReconstructionCUDAInternals* mContext = nullptr;
   };
 
-  virtual std::unique_ptr<GPUThreadContext> GetThreadContext() override;
+  std::unique_ptr<GPUThreadContext> GetThreadContext() override;
+  bool CanQueryMaxMemory() override { return true; } // InitDevice_Runtime() resolves forceMemoryPoolSize 1 / 2 from the detected free device memory
   void SynchronizeGPU() override;
   int GPUDebug(const char* state = "UNKNOWN", int stream = -1) override;
   void SynchronizeStream(int stream) override;
diff --git a/GPU/GPUTracking/Standalone/qconfigoptions.h b/GPU/GPUTracking/Standalone/qconfigoptions.h
@@ -136,7 +136,7 @@ AddOptionSet(nways, int, 1, "1Way", 0, "Use 3-way track-fit")
 AddOption(nwaysouter, bool, false, "OuterParam", 0, "Create OuterParam")
 AddOption(dzdr, float, 2.5f, "DzDr", 0, "Use dZ/dR search window instead of vertex window")
 AddOption(cont, bool, false, "continuous", 0, "Process continuous timeframe data")
-AddOption(forceMemorySize, unsigned long long int, 0, "memSize", 0, "Force size of allocated GPU / page locked host memory", min(0ull))
+AddOption(forceMemorySize, unsigned long long int, 1, "memSize", 0, "Force size of allocated GPU / page locked host memory", min(0ull))
 AddOption(outputcontrolmem, unsigned long long int, 0, "outputMemory", 0, "Use predefined output buffer of this size", min(0ull), message("Using %lld bytes as output memory"))
 AddOption(affinity, int, -1, "cpuAffinity", 0, "Pin CPU affinity to this CPU core", min(-1), message("Setting affinity to restrict on CPU %d"))
 AddOption(fifo, bool, false, "fifoScheduler", 0, "Use FIFO realtime scheduler", message("Setting FIFO scheduler: %s"))
diff --git a/GPU/GPUTracking/Standalone/standalone.cxx b/GPU/GPUTracking/Standalone/standalone.cxx
@@ -293,7 +293,7 @@ int SetupReconstruction()
     devProc.nThreads = configStandalone.OMPThreads;
   }
   devProc.deviceNum = configStandalone.cudaDevice;
-  devProc.forceMemoryPoolSize = configStandalone.forceMemorySize;
+  devProc.forceMemoryPoolSize = (configStandalone.forceMemorySize == 1 && configStandalone.eventDisplay) ? 2 : configStandalone.forceMemorySize;
   devProc.debugLevel = configStandalone.DebugLevel;
   devProc.deviceTimers = configStandalone.DeviceTiming;
   devProc.runQA = configStandalone.qa;

Original file line number	Diff line number	Diff line change
`@@ -141,7 +141,9 @@ int GPUReconstruction::Init()`
`141`	`141`	`mDeviceMemorySize += memGpu;`
`142`	`142`	`mHostMemorySize += memHost;`
`143`	`143`	`}`
`144`		`- if (mDeviceProcessingSettings.forceMemoryPoolSize) {`
	`144`	`+ if (mDeviceProcessingSettings.forceMemoryPoolSize && mDeviceProcessingSettings.forceMemoryPoolSize <= 2 && CanQueryMaxMemory()) {`
	`145`	`+ mDeviceMemorySize = mDeviceProcessingSettings.forceMemoryPoolSize;`
	`146`	`+ } else if (mDeviceProcessingSettings.forceMemoryPoolSize > 2) {`
`145`	`147`	`mDeviceMemorySize = mHostMemorySize = mDeviceProcessingSettings.forceMemoryPoolSize;`
`146`	`148`	`}`
`147`	`149`
Original file line number	Diff line number	Diff line change
`@@ -293,7 +293,7 @@ int SetupReconstruction()`
`293`	`293`	`devProc.nThreads = configStandalone.OMPThreads;`
`294`	`294`	`}`
`295`	`295`	`devProc.deviceNum = configStandalone.cudaDevice;`
`296`		`- devProc.forceMemoryPoolSize = configStandalone.forceMemorySize;`
	`296`	`+ devProc.forceMemoryPoolSize = (configStandalone.forceMemorySize == 1 && configStandalone.eventDisplay) ? 2 : configStandalone.forceMemorySize;`
`297`	`297`	`devProc.debugLevel = configStandalone.DebugLevel;`
`298`	`298`	`devProc.deviceTimers = configStandalone.DeviceTiming;`
`299`	`299`	`devProc.runQA = configStandalone.qa;`