@@ -187,6 +187,8 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
187187 const int iSlice = iSliceRow / GPUCA_ROW_COUNT;
188188 const int iRow = iSliceRow % GPUCA_ROW_COUNT;
189189 const int idOffset = clusters->clusterOffset [iSlice][iRow];
190+ const int idOffsetOut = clusters->clusterOffset [iSlice][iRow] * compressor.mMaxClusterFactorBase1024 / 1024 ;
191+ const int idOffsetOutMax = clusters->clusterOffset [iSlice][iRow + 1 ] * compressor.mMaxClusterFactorBase1024 / 1024 ;
190192 if (iThread == nThreads - 1 ) {
191193 smem.nCount = 0 ;
192194 }
@@ -246,7 +248,7 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
246248 continue ;
247249 }
248250
249- const unsigned int count = CAMath::Min (smem.nCount , (unsigned int )GPUCA_TPC_COMP_CHUNK_SIZE);
251+ unsigned int count = CAMath::Min (smem.nCount , (unsigned int )GPUCA_TPC_COMP_CHUNK_SIZE);
250252 if (param.rec .tpcCompressionModes & GPUSettings::CompressionDifferences) {
251253 if (param.rec .tpcCompressionSortOrder == GPUSettings::SortZPadTime) {
252254 CAAlgo::sortInBlock (sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters [iSlice][iRow]));
@@ -261,7 +263,12 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
261263 }
262264
263265 for (unsigned int j = get_local_id (0 ); j < count; j += get_local_size (0 )) {
264- int outidx = idOffset + totalCount + j;
266+ int outidx = idOffsetOut + totalCount + j;
267+ if (outidx >= idOffsetOutMax) {
268+ compressor.raiseError (GPUErrors::ERROR_COMPRESSION_ROW_HIT_OVERFLOW, outidx, idOffsetOutMax);
269+ count = 0 ;
270+ break ;
271+ }
265272 const ClusterNative& GPUrestrict () orgCl = clusters->clusters [iSlice][iRow][sortBuffer[j]];
266273 unsigned int lastTime = 0 ;
267274 unsigned int lastPad = 0 ;
@@ -446,7 +453,7 @@ GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBasic(T* GPUrestric
446453}
447454
448455template <typename V, typename T, typename S>
449- GPUdi () void GPUTPCCompressionGatherKernels::compressorMemcpyBuffered(V* buf, T* GPUrestrict () dst, const T* GPUrestrict() src, const S* GPUrestrict() nums, const unsigned int* GPUrestrict() srcOffsets, unsigned int nTracks , int nLanes, int iLane, int diff)
456+ GPUdi () void GPUTPCCompressionGatherKernels::compressorMemcpyBuffered(V* buf, T* GPUrestrict () dst, const T* GPUrestrict() src, const S* GPUrestrict() nums, const unsigned int* GPUrestrict() srcOffsets, unsigned int nEntries , int nLanes, int iLane, int diff, size_t scaleBase1024 )
450457{
451458 int shmPos = 0 ;
452459 unsigned int dstOffset = 0 ;
@@ -456,9 +463,9 @@ GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBuffered(V* buf, T*
456463 CONSTEXPR int bufSize = GPUCA_WARP_SIZE;
457464 CONSTEXPR int bufTSize = bufSize * sizeof (V) / sizeof (T);
458465
459- for (unsigned int i = 0 ; i < nTracks ; i++) {
466+ for (unsigned int i = 0 ; i < nEntries ; i++) {
460467 unsigned int srcPos = 0 ;
461- unsigned int srcOffset = srcOffsets[i] + diff;
468+ unsigned int srcOffset = ( srcOffsets[i] * scaleBase1024 / 1024 ) + diff;
462469 unsigned int srcSize = nums[i] - diff;
463470
464471 if (dstAligned == nullptr ) {
@@ -565,13 +572,14 @@ GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKern
565572 for (unsigned int i = sliceStart; i <= sliceEnd && i < compressor.NSLICES ; i++) {
566573 for (unsigned int j = ((i == sliceStart) ? sliceRowStart : 0 ); j < ((i == sliceEnd) ? sliceRowEnd : GPUCA_ROW_COUNT); j++) {
567574 unsigned int nClusters = compressor.mPtrs .nSliceRowClusters [i * GPUCA_ROW_COUNT + j];
568- compressorMemcpy (compressor.mOutput ->qTotU + rowsOffset, compressor.mPtrs .qTotU + clusters->clusterOffset [i][j], nClusters, nLanes, iLane);
569- compressorMemcpy (compressor.mOutput ->qMaxU + rowsOffset, compressor.mPtrs .qMaxU + clusters->clusterOffset [i][j], nClusters, nLanes, iLane);
570- compressorMemcpy (compressor.mOutput ->flagsU + rowsOffset, compressor.mPtrs .flagsU + clusters->clusterOffset [i][j], nClusters, nLanes, iLane);
571- compressorMemcpy (compressor.mOutput ->padDiffU + rowsOffset, compressor.mPtrs .padDiffU + clusters->clusterOffset [i][j], nClusters, nLanes, iLane);
572- compressorMemcpy (compressor.mOutput ->timeDiffU + rowsOffset, compressor.mPtrs .timeDiffU + clusters->clusterOffset [i][j], nClusters, nLanes, iLane);
573- compressorMemcpy (compressor.mOutput ->sigmaPadU + rowsOffset, compressor.mPtrs .sigmaPadU + clusters->clusterOffset [i][j], nClusters, nLanes, iLane);
574- compressorMemcpy (compressor.mOutput ->sigmaTimeU + rowsOffset, compressor.mPtrs .sigmaTimeU + clusters->clusterOffset [i][j], nClusters, nLanes, iLane);
575+ unsigned int clusterOffsetInCache = clusters->clusterOffset [i][j] * compressor.mMaxClusterFactorBase1024 / 1024 ;
576+ compressorMemcpy (compressor.mOutput ->qTotU + rowsOffset, compressor.mPtrs .qTotU + clusterOffsetInCache, nClusters, nLanes, iLane);
577+ compressorMemcpy (compressor.mOutput ->qMaxU + rowsOffset, compressor.mPtrs .qMaxU + clusterOffsetInCache, nClusters, nLanes, iLane);
578+ compressorMemcpy (compressor.mOutput ->flagsU + rowsOffset, compressor.mPtrs .flagsU + clusterOffsetInCache, nClusters, nLanes, iLane);
579+ compressorMemcpy (compressor.mOutput ->padDiffU + rowsOffset, compressor.mPtrs .padDiffU + clusterOffsetInCache, nClusters, nLanes, iLane);
580+ compressorMemcpy (compressor.mOutput ->timeDiffU + rowsOffset, compressor.mPtrs .timeDiffU + clusterOffsetInCache, nClusters, nLanes, iLane);
581+ compressorMemcpy (compressor.mOutput ->sigmaPadU + rowsOffset, compressor.mPtrs .sigmaPadU + clusterOffsetInCache, nClusters, nLanes, iLane);
582+ compressorMemcpy (compressor.mOutput ->sigmaTimeU + rowsOffset, compressor.mPtrs .sigmaTimeU + clusterOffsetInCache, nClusters, nLanes, iLane);
575583 rowsOffset += nClusters;
576584 }
577585 }
@@ -676,18 +684,18 @@ GPUdii() void GPUTPCCompressionGatherKernels::gatherBuffered(int nBlocks, int nT
676684 compressorMemcpyBasic (output->padA , input.padA , compressor.mMemory ->nStoredTracks , nThreads, iThread);
677685 }
678686
679- const unsigned int * clusterOffsets = reinterpret_cast < const unsigned int *>( clusters->clusterOffset ) + rowStart;
687+ const unsigned int * clusterOffsets = & clusters->clusterOffset [ 0 ][ 0 ] + rowStart;
680688 const unsigned int * nSliceRowClusters = input.nSliceRowClusters + rowStart;
681689
682690 auto * buf = smem.getBuffer <V>(iWarp);
683691
684- compressorMemcpyBuffered (buf, output->qTotU + rowsOffset, input.qTotU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
685- compressorMemcpyBuffered (buf, output->qMaxU + rowsOffset, input.qMaxU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
686- compressorMemcpyBuffered (buf, output->flagsU + rowsOffset, input.flagsU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
687- compressorMemcpyBuffered (buf, output->padDiffU + rowsOffset, input.padDiffU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
688- compressorMemcpyBuffered (buf, output->timeDiffU + rowsOffset, input.timeDiffU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
689- compressorMemcpyBuffered (buf, output->sigmaPadU + rowsOffset, input.sigmaPadU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
690- compressorMemcpyBuffered (buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
692+ compressorMemcpyBuffered (buf, output->qTotU + rowsOffset, input.qTotU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
693+ compressorMemcpyBuffered (buf, output->qMaxU + rowsOffset, input.qMaxU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
694+ compressorMemcpyBuffered (buf, output->flagsU + rowsOffset, input.flagsU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
695+ compressorMemcpyBuffered (buf, output->padDiffU + rowsOffset, input.padDiffU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
696+ compressorMemcpyBuffered (buf, output->timeDiffU + rowsOffset, input.timeDiffU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
697+ compressorMemcpyBuffered (buf, output->sigmaPadU + rowsOffset, input.sigmaPadU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
698+ compressorMemcpyBuffered (buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
691699
692700 const unsigned short * nTrackClustersPtr = input.nTrackClusters + trackStart;
693701 const unsigned int * aClsFstIdx = compressor.mAttachedClusterFirstIndex + trackStart;
@@ -742,16 +750,16 @@ GPUdii() void GPUTPCCompressionGatherKernels::gatherMulti(int nBlocks, int nThre
742750 rowsPerWarp = rowEnd - rowStart;
743751
744752 const unsigned int rowsOffset = calculateWarpOffsets (smem, input.nSliceRowClusters , rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);
745- const unsigned int * clusterOffsets = reinterpret_cast < const unsigned int *>( clusters->clusterOffset ) + rowStart;
753+ const unsigned int * clusterOffsets = & clusters->clusterOffset [ 0 ][ 0 ] + rowStart;
746754 const unsigned int * nSliceRowClusters = input.nSliceRowClusters + rowStart;
747755
748- compressorMemcpyBuffered (buf, output->qTotU + rowsOffset, input.qTotU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
749- compressorMemcpyBuffered (buf, output->qMaxU + rowsOffset, input.qMaxU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
750- compressorMemcpyBuffered (buf, output->flagsU + rowsOffset, input.flagsU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
751- compressorMemcpyBuffered (buf, output->padDiffU + rowsOffset, input.padDiffU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
752- compressorMemcpyBuffered (buf, output->timeDiffU + rowsOffset, input.timeDiffU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
753- compressorMemcpyBuffered (buf, output->sigmaPadU + rowsOffset, input.sigmaPadU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
754- compressorMemcpyBuffered (buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 );
756+ compressorMemcpyBuffered (buf, output->qTotU + rowsOffset, input.qTotU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
757+ compressorMemcpyBuffered (buf, output->qMaxU + rowsOffset, input.qMaxU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
758+ compressorMemcpyBuffered (buf, output->flagsU + rowsOffset, input.flagsU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
759+ compressorMemcpyBuffered (buf, output->padDiffU + rowsOffset, input.padDiffU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
760+ compressorMemcpyBuffered (buf, output->timeDiffU + rowsOffset, input.timeDiffU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
761+ compressorMemcpyBuffered (buf, output->sigmaPadU + rowsOffset, input.sigmaPadU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
762+ compressorMemcpyBuffered (buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU , nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0 , compressor. mMaxClusterFactorBase1024 );
755763 } else {
756764 const unsigned int nGlobalWarps = nWarps * (nBlocks - 1 ) / 2 ;
757765 const unsigned int iGlobalWarp = nWarps * (iBlock / 2 - 1 ) + iWarp;
0 commit comments