Skip to content

Commit aa88226

Browse files
committed
GPU: Add possibility to user smaller unattached compressed hits buffer in TPC compression than total number of clusters
1 parent dcbceec commit aa88226

9 files changed

Lines changed: 60 additions & 48 deletions

File tree

DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ struct ClusterNativeAccess {
164164
unsigned int nClusters[constants::MAXSECTOR][constants::MAXGLOBALPADROW];
165165
unsigned int nClustersSector[constants::MAXSECTOR];
166166
unsigned int clusterOffset[constants::MAXSECTOR][constants::MAXGLOBALPADROW];
167-
unsigned int nClustersTotal;
167+
unsigned int nClustersTotal; // Must be directly after clusterOffsets, --> =clusterOffset[nRows * nSectors]!
168168

169169
void setOffsetPtrs();
170170

GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "GPUTPCCompression.h"
1515
#include "GPUReconstruction.h"
1616
#include "GPUO2DataTypes.h"
17+
#include "GPUMemorySizeScalers.h"
1718

1819
using namespace GPUCA_NAMESPACE::gpu;
1920

@@ -39,7 +40,7 @@ void* GPUTPCCompression::SetPointersScratch(void* mem)
3940
computePointerWithAlignment(mem, mAttachedClusterFirstIndex, mMaxTracks);
4041
}
4142
if (mRec->GetProcessingSettings().tpcCompressionGatherMode != 1) {
42-
SetPointersCompressedClusters(mem, mPtrs, mMaxTrackClusters, mMaxTracks, mMaxClusters, false);
43+
SetPointersCompressedClusters(mem, mPtrs, mMaxTrackClusters, mMaxTracks, mMaxClustersInCache, false);
4344
}
4445
return mem;
4546
}
@@ -48,7 +49,7 @@ void* GPUTPCCompression::SetPointersOutput(void* mem)
4849
{
4950
computePointerWithAlignment(mem, mAttachedClusterFirstIndex, mMaxTrackClusters);
5051
if (mRec->GetProcessingSettings().tpcCompressionGatherMode == 1) {
51-
SetPointersCompressedClusters(mem, mPtrs, mMaxTrackClusters, mMaxTracks, mMaxClusters, false);
52+
SetPointersCompressedClusters(mem, mPtrs, mMaxTrackClusters, mMaxTracks, mMaxClustersInCache, false);
5253
}
5354
return mem;
5455
}
@@ -115,6 +116,8 @@ void GPUTPCCompression::RegisterMemoryAllocation()
115116
void GPUTPCCompression::SetMaxData(const GPUTrackingInOutPointers& io)
116117
{
117118
mMaxClusters = io.clustersNative->nClustersTotal;
119+
mMaxClusterFactorBase1024 = mMaxClusters > 100000000 ? mRec->MemoryScalers()->tpcCompressedUnattachedHitsBase1024[mRec->GetParam().rec.tpcRejectionMode] : 1024;
120+
mMaxClustersInCache = mMaxClusters * mMaxClusterFactorBase1024 / 1024;
118121
mMaxTrackClusters = mRec->GetConstantMem().tpcMerger.NOutputTrackClusters();
119122
mMaxTracks = mRec->GetConstantMem().tpcMerger.NOutputTracks();
120123
if (mMaxClusters % 16) {

GPU/GPUTracking/DataCompression/GPUTPCCompression.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,8 @@ class GPUTPCCompression : public GPUProcessor
9090
unsigned int mMaxTracks = 0;
9191
unsigned int mMaxClusters = 0;
9292
unsigned int mMaxTrackClusters = 0;
93-
unsigned int mNMaxClusterSliceRow = 0;
93+
unsigned int mMaxClustersInCache = 0;
94+
size_t mMaxClusterFactorBase1024 = 0;
9495

9596
template <class T>
9697
void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA);

GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,8 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
187187
const int iSlice = iSliceRow / GPUCA_ROW_COUNT;
188188
const int iRow = iSliceRow % GPUCA_ROW_COUNT;
189189
const int idOffset = clusters->clusterOffset[iSlice][iRow];
190+
const int idOffsetOut = clusters->clusterOffset[iSlice][iRow] * compressor.mMaxClusterFactorBase1024 / 1024;
191+
const int idOffsetOutMax = clusters->clusterOffset[iSlice][iRow + 1] * compressor.mMaxClusterFactorBase1024 / 1024;
190192
if (iThread == nThreads - 1) {
191193
smem.nCount = 0;
192194
}
@@ -246,7 +248,7 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
246248
continue;
247249
}
248250

249-
const unsigned int count = CAMath::Min(smem.nCount, (unsigned int)GPUCA_TPC_COMP_CHUNK_SIZE);
251+
unsigned int count = CAMath::Min(smem.nCount, (unsigned int)GPUCA_TPC_COMP_CHUNK_SIZE);
250252
if (param.rec.tpcCompressionModes & GPUSettings::CompressionDifferences) {
251253
if (param.rec.tpcCompressionSortOrder == GPUSettings::SortZPadTime) {
252254
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSlice][iRow]));
@@ -261,7 +263,12 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
261263
}
262264

263265
for (unsigned int j = get_local_id(0); j < count; j += get_local_size(0)) {
264-
int outidx = idOffset + totalCount + j;
266+
int outidx = idOffsetOut + totalCount + j;
267+
if (outidx >= idOffsetOutMax) {
268+
compressor.raiseError(GPUErrors::ERROR_COMPRESSION_ROW_HIT_OVERFLOW, outidx, idOffsetOutMax);
269+
count = 0;
270+
break;
271+
}
265272
const ClusterNative& GPUrestrict() orgCl = clusters->clusters[iSlice][iRow][sortBuffer[j]];
266273
unsigned int lastTime = 0;
267274
unsigned int lastPad = 0;
@@ -446,7 +453,7 @@ GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBasic(T* GPUrestric
446453
}
447454

448455
template <typename V, typename T, typename S>
449-
GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBuffered(V* buf, T* GPUrestrict() dst, const T* GPUrestrict() src, const S* GPUrestrict() nums, const unsigned int* GPUrestrict() srcOffsets, unsigned int nTracks, int nLanes, int iLane, int diff)
456+
GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBuffered(V* buf, T* GPUrestrict() dst, const T* GPUrestrict() src, const S* GPUrestrict() nums, const unsigned int* GPUrestrict() srcOffsets, unsigned int nEntries, int nLanes, int iLane, int diff, size_t scaleBase1024)
450457
{
451458
int shmPos = 0;
452459
unsigned int dstOffset = 0;
@@ -456,9 +463,9 @@ GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBuffered(V* buf, T*
456463
CONSTEXPR int bufSize = GPUCA_WARP_SIZE;
457464
CONSTEXPR int bufTSize = bufSize * sizeof(V) / sizeof(T);
458465

459-
for (unsigned int i = 0; i < nTracks; i++) {
466+
for (unsigned int i = 0; i < nEntries; i++) {
460467
unsigned int srcPos = 0;
461-
unsigned int srcOffset = srcOffsets[i] + diff;
468+
unsigned int srcOffset = (srcOffsets[i] * scaleBase1024 / 1024) + diff;
462469
unsigned int srcSize = nums[i] - diff;
463470

464471
if (dstAligned == nullptr) {
@@ -565,13 +572,14 @@ GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKern
565572
for (unsigned int i = sliceStart; i <= sliceEnd && i < compressor.NSLICES; i++) {
566573
for (unsigned int j = ((i == sliceStart) ? sliceRowStart : 0); j < ((i == sliceEnd) ? sliceRowEnd : GPUCA_ROW_COUNT); j++) {
567574
unsigned int nClusters = compressor.mPtrs.nSliceRowClusters[i * GPUCA_ROW_COUNT + j];
568-
compressorMemcpy(compressor.mOutput->qTotU + rowsOffset, compressor.mPtrs.qTotU + clusters->clusterOffset[i][j], nClusters, nLanes, iLane);
569-
compressorMemcpy(compressor.mOutput->qMaxU + rowsOffset, compressor.mPtrs.qMaxU + clusters->clusterOffset[i][j], nClusters, nLanes, iLane);
570-
compressorMemcpy(compressor.mOutput->flagsU + rowsOffset, compressor.mPtrs.flagsU + clusters->clusterOffset[i][j], nClusters, nLanes, iLane);
571-
compressorMemcpy(compressor.mOutput->padDiffU + rowsOffset, compressor.mPtrs.padDiffU + clusters->clusterOffset[i][j], nClusters, nLanes, iLane);
572-
compressorMemcpy(compressor.mOutput->timeDiffU + rowsOffset, compressor.mPtrs.timeDiffU + clusters->clusterOffset[i][j], nClusters, nLanes, iLane);
573-
compressorMemcpy(compressor.mOutput->sigmaPadU + rowsOffset, compressor.mPtrs.sigmaPadU + clusters->clusterOffset[i][j], nClusters, nLanes, iLane);
574-
compressorMemcpy(compressor.mOutput->sigmaTimeU + rowsOffset, compressor.mPtrs.sigmaTimeU + clusters->clusterOffset[i][j], nClusters, nLanes, iLane);
575+
unsigned int clusterOffsetInCache = clusters->clusterOffset[i][j] * compressor.mMaxClusterFactorBase1024 / 1024;
576+
compressorMemcpy(compressor.mOutput->qTotU + rowsOffset, compressor.mPtrs.qTotU + clusterOffsetInCache, nClusters, nLanes, iLane);
577+
compressorMemcpy(compressor.mOutput->qMaxU + rowsOffset, compressor.mPtrs.qMaxU + clusterOffsetInCache, nClusters, nLanes, iLane);
578+
compressorMemcpy(compressor.mOutput->flagsU + rowsOffset, compressor.mPtrs.flagsU + clusterOffsetInCache, nClusters, nLanes, iLane);
579+
compressorMemcpy(compressor.mOutput->padDiffU + rowsOffset, compressor.mPtrs.padDiffU + clusterOffsetInCache, nClusters, nLanes, iLane);
580+
compressorMemcpy(compressor.mOutput->timeDiffU + rowsOffset, compressor.mPtrs.timeDiffU + clusterOffsetInCache, nClusters, nLanes, iLane);
581+
compressorMemcpy(compressor.mOutput->sigmaPadU + rowsOffset, compressor.mPtrs.sigmaPadU + clusterOffsetInCache, nClusters, nLanes, iLane);
582+
compressorMemcpy(compressor.mOutput->sigmaTimeU + rowsOffset, compressor.mPtrs.sigmaTimeU + clusterOffsetInCache, nClusters, nLanes, iLane);
575583
rowsOffset += nClusters;
576584
}
577585
}
@@ -676,18 +684,18 @@ GPUdii() void GPUTPCCompressionGatherKernels::gatherBuffered(int nBlocks, int nT
676684
compressorMemcpyBasic(output->padA, input.padA, compressor.mMemory->nStoredTracks, nThreads, iThread);
677685
}
678686

679-
const unsigned int* clusterOffsets = reinterpret_cast<const unsigned int*>(clusters->clusterOffset) + rowStart;
687+
const unsigned int* clusterOffsets = &clusters->clusterOffset[0][0] + rowStart;
680688
const unsigned int* nSliceRowClusters = input.nSliceRowClusters + rowStart;
681689

682690
auto* buf = smem.getBuffer<V>(iWarp);
683691

684-
compressorMemcpyBuffered(buf, output->qTotU + rowsOffset, input.qTotU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
685-
compressorMemcpyBuffered(buf, output->qMaxU + rowsOffset, input.qMaxU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
686-
compressorMemcpyBuffered(buf, output->flagsU + rowsOffset, input.flagsU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
687-
compressorMemcpyBuffered(buf, output->padDiffU + rowsOffset, input.padDiffU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
688-
compressorMemcpyBuffered(buf, output->timeDiffU + rowsOffset, input.timeDiffU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
689-
compressorMemcpyBuffered(buf, output->sigmaPadU + rowsOffset, input.sigmaPadU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
690-
compressorMemcpyBuffered(buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
692+
compressorMemcpyBuffered(buf, output->qTotU + rowsOffset, input.qTotU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
693+
compressorMemcpyBuffered(buf, output->qMaxU + rowsOffset, input.qMaxU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
694+
compressorMemcpyBuffered(buf, output->flagsU + rowsOffset, input.flagsU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
695+
compressorMemcpyBuffered(buf, output->padDiffU + rowsOffset, input.padDiffU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
696+
compressorMemcpyBuffered(buf, output->timeDiffU + rowsOffset, input.timeDiffU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
697+
compressorMemcpyBuffered(buf, output->sigmaPadU + rowsOffset, input.sigmaPadU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
698+
compressorMemcpyBuffered(buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
691699

692700
const unsigned short* nTrackClustersPtr = input.nTrackClusters + trackStart;
693701
const unsigned int* aClsFstIdx = compressor.mAttachedClusterFirstIndex + trackStart;
@@ -742,16 +750,16 @@ GPUdii() void GPUTPCCompressionGatherKernels::gatherMulti(int nBlocks, int nThre
742750
rowsPerWarp = rowEnd - rowStart;
743751

744752
const unsigned int rowsOffset = calculateWarpOffsets(smem, input.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);
745-
const unsigned int* clusterOffsets = reinterpret_cast<const unsigned int*>(clusters->clusterOffset) + rowStart;
753+
const unsigned int* clusterOffsets = &clusters->clusterOffset[0][0] + rowStart;
746754
const unsigned int* nSliceRowClusters = input.nSliceRowClusters + rowStart;
747755

748-
compressorMemcpyBuffered(buf, output->qTotU + rowsOffset, input.qTotU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
749-
compressorMemcpyBuffered(buf, output->qMaxU + rowsOffset, input.qMaxU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
750-
compressorMemcpyBuffered(buf, output->flagsU + rowsOffset, input.flagsU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
751-
compressorMemcpyBuffered(buf, output->padDiffU + rowsOffset, input.padDiffU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
752-
compressorMemcpyBuffered(buf, output->timeDiffU + rowsOffset, input.timeDiffU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
753-
compressorMemcpyBuffered(buf, output->sigmaPadU + rowsOffset, input.sigmaPadU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
754-
compressorMemcpyBuffered(buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0);
756+
compressorMemcpyBuffered(buf, output->qTotU + rowsOffset, input.qTotU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
757+
compressorMemcpyBuffered(buf, output->qMaxU + rowsOffset, input.qMaxU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
758+
compressorMemcpyBuffered(buf, output->flagsU + rowsOffset, input.flagsU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
759+
compressorMemcpyBuffered(buf, output->padDiffU + rowsOffset, input.padDiffU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
760+
compressorMemcpyBuffered(buf, output->timeDiffU + rowsOffset, input.timeDiffU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
761+
compressorMemcpyBuffered(buf, output->sigmaPadU + rowsOffset, input.sigmaPadU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
762+
compressorMemcpyBuffered(buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU, nSliceRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
755763
} else {
756764
const unsigned int nGlobalWarps = nWarps * (nBlocks - 1) / 2;
757765
const unsigned int iGlobalWarp = nWarps * (iBlock / 2 - 1) + iWarp;

GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ class GPUTPCCompressionGatherKernels : public GPUKernelTemplate
112112
GPUdi() static void compressorMemcpyBasic(T* dst, const T* src, unsigned int size, int nThreads, int iThread, int nBlocks = 1, int iBlock = 0);
113113

114114
template <typename V, typename T, typename S>
115-
GPUdi() static void compressorMemcpyBuffered(V* buf, T* dst, const T* src, const S* nums, const unsigned int* srcOffets, unsigned int nTracks, int nLanes, int iLane, int diff = 0);
115+
GPUdi() static void compressorMemcpyBuffered(V* buf, T* dst, const T* src, const S* nums, const unsigned int* srcOffets, unsigned int nEntries, int nLanes, int iLane, int diff = 0, size_t scaleBase1024 = 1024);
116116

117117
template <typename T>
118118
GPUdi() static unsigned int calculateWarpOffsets(GPUSharedMemory& smem, T* nums, unsigned int start, unsigned int end, int nWarps, int iWarp, int nLanes, int iLane);

GPU/GPUTracking/DataTypes/GPUMemorySizeScalers.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ struct GPUMemorySizeScalers {
4343
double tpcSectorTrackHitsPerHit = 0.8f;
4444
double tpcMergedTrackPerSliceTrack = 0.9;
4545
double tpcMergedTrackHitPerSliceHit = 1.1;
46+
size_t tpcCompressedUnattachedHitsBase1024[3] = {900, 900, 500}; // No ratio, but integer fraction of 1024 for exact computation
4647

4748
// Upper limits
4849
size_t tpcMaxPeaks = 1000000000;

GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,6 @@ int GPUChainTracking::RunTPCCompression()
3737
if (mPipelineFinalizationCtx && GetProcessingSettings().doublePipelineClusterizer) {
3838
RecordMarker(&mEvents->single, 0);
3939
}
40-
Compressor.mNMaxClusterSliceRow = 0;
41-
for (unsigned int i = 0; i < NSLICES; i++) {
42-
for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) {
43-
if (mIOPtrs.clustersNative->nClusters[i][j] > Compressor.mNMaxClusterSliceRow) {
44-
Compressor.mNMaxClusterSliceRow = mIOPtrs.clustersNative->nClusters[i][j];
45-
}
46-
}
47-
}
4840

4941
if (ProcessingSettings().tpcCompressionGatherMode == 3) {
5042
mRec->AllocateVolatileDeviceMemory(0); // make future device memory allocation volatile
@@ -160,13 +152,14 @@ int GPUChainTracking::RunTPCCompression()
160152
unsigned int offset = 0;
161153
for (unsigned int i = 0; i < NSLICES; i++) {
162154
for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) {
163-
GPUMemCpyAlways(myStep, O->qTotU + offset, P->qTotU + mIOPtrs.clustersNative->clusterOffset[i][j], O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->qTotU[0]), outputStream, direction);
164-
GPUMemCpyAlways(myStep, O->qMaxU + offset, P->qMaxU + mIOPtrs.clustersNative->clusterOffset[i][j], O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->qMaxU[0]), outputStream, direction);
165-
GPUMemCpyAlways(myStep, O->flagsU + offset, P->flagsU + mIOPtrs.clustersNative->clusterOffset[i][j], O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->flagsU[0]), outputStream, direction);
166-
GPUMemCpyAlways(myStep, O->padDiffU + offset, P->padDiffU + mIOPtrs.clustersNative->clusterOffset[i][j], O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->padDiffU[0]), outputStream, direction);
167-
GPUMemCpyAlways(myStep, O->timeDiffU + offset, P->timeDiffU + mIOPtrs.clustersNative->clusterOffset[i][j], O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->timeDiffU[0]), outputStream, direction);
168-
GPUMemCpyAlways(myStep, O->sigmaPadU + offset, P->sigmaPadU + mIOPtrs.clustersNative->clusterOffset[i][j], O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->sigmaPadU[0]), outputStream, direction);
169-
GPUMemCpyAlways(myStep, O->sigmaTimeU + offset, P->sigmaTimeU + mIOPtrs.clustersNative->clusterOffset[i][j], O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->sigmaTimeU[0]), outputStream, direction);
155+
unsigned int srcOffset = mIOPtrs.clustersNative->clusterOffset[i][j] * Compressor.mMaxClusterFactorBase1024 / 1024;
156+
GPUMemCpyAlways(myStep, O->qTotU + offset, P->qTotU + srcOffset, O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->qTotU[0]), outputStream, direction);
157+
GPUMemCpyAlways(myStep, O->qMaxU + offset, P->qMaxU + srcOffset, O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->qMaxU[0]), outputStream, direction);
158+
GPUMemCpyAlways(myStep, O->flagsU + offset, P->flagsU + srcOffset, O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->flagsU[0]), outputStream, direction);
159+
GPUMemCpyAlways(myStep, O->padDiffU + offset, P->padDiffU + srcOffset, O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->padDiffU[0]), outputStream, direction);
160+
GPUMemCpyAlways(myStep, O->timeDiffU + offset, P->timeDiffU + srcOffset, O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->timeDiffU[0]), outputStream, direction);
161+
GPUMemCpyAlways(myStep, O->sigmaPadU + offset, P->sigmaPadU + srcOffset, O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->sigmaPadU[0]), outputStream, direction);
162+
GPUMemCpyAlways(myStep, O->sigmaTimeU + offset, P->sigmaTimeU + srcOffset, O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(O->sigmaTimeU[0]), outputStream, direction);
170163
offset += O->nSliceRowClusters[i * GPUCA_ROW_COUNT + j];
171164
}
172165
}

0 commit comments

Comments
 (0)