Skip to content

Commit cd5ddd9

Browse files
committed
GPU: Simplify and unify alignment constants
1 parent 3942c9c commit cd5ddd9

File tree

5 files changed

+24
-27
lines changed

5 files changed

+24
-27
lines changed

GPU/Common/GPUDefGPUParameters.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,9 @@
111111

112112
#define GPUCA_EXTERN_ROW_HITS // Store row hits in separate array outside of tracklets
113113
#define GPUCA_SORT_STARTHITS_GPU // Sort the start hits when running on GPU
114-
#define GPUCA_ROWALIGNMENT uint4 // Align Row Hits and Grid
114+
#define GPUCA_ROWALIGNMENT 16 // Alignment of Row Hits and Grid
115+
#define GPUCA_BUFFER_ALIGNMENT 64 // Alignment of buffers obtained from SetPointers
116+
#define GPUCA_MEMALIGN (64 * 1024) // Alignment of allocated memory blocks
115117

116118
// #define GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE // Output Profiling Data for Tracklet Constructor Tracklet Scheduling
117119

@@ -122,8 +124,6 @@
122124
#define GPUCA_MAX_TRD_TRACKLETS ((size_t) 128 * 1024) // Maximum number of TRD tracklets
123125
#define GPUCA_MAX_ITS_FIT_TRACKS ((size_t) 96 * 1024) // Max number of tracks for ITS track fit
124126
#define GPUCA_TRACKER_CONSTANT_MEM ((size_t) 63 * 1024) // Amount of Constant Memory to reserve
125-
#define GPUCA_MEMALIGN ((size_t) 64 * 1024) // Alignment of memory blocks, all constants above must be multiple of this!!!
126-
#define GPUCA_MEMALIGN_SMALL ((size_t) 64 * 1024) // Alignment of small blocks, GPUCA_MEMALIGN must be multiple of this!!!
127127
#define GPUCA_MEMORY_SIZE ((size_t) 6 * 1024 * 1024 * 1024) // Size of memory allocated on Device
128128
#define GPUCA_HOST_MEMORY_SIZE ((size_t) 6 * 1024 * 1024 * 1024) // Size of memory allocated on Host
129129
#define GPUCA_GPU_STACK_SIZE ((size_t) 8 * 1024) // Stack size per GPU thread

GPU/GPUTracking/Base/GPUProcessor.h

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,7 @@ class GPUProcessor
6262
void InitGPUProcessor(GPUReconstruction* rec, ProcessorType type = PROCESSOR_TYPE_CPU, GPUProcessor* slaveProcessor = nullptr);
6363
void Clear();
6464

65-
// Helpers for memory allocation
66-
CONSTEXPR static size_t MIN_ALIGNMENT = 64;
67-
68-
template <size_t alignment = MIN_ALIGNMENT>
65+
template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>
6966
static inline size_t getAlignment(size_t addr)
7067
{
7168
static_assert((alignment & (alignment - 1)) == 0, "Invalid alignment, not power of 2");
@@ -78,22 +75,22 @@ class GPUProcessor
7875
}
7976
return (alignment - mod);
8077
}
81-
template <size_t alignment = MIN_ALIGNMENT>
78+
template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>
8279
static inline size_t nextMultipleOf(size_t size)
8380
{
8481
return size + getAlignment<alignment>(size);
8582
}
86-
template <size_t alignment = MIN_ALIGNMENT>
83+
template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>
8784
static inline void* alignPointer(void* ptr)
8885
{
8986
return (reinterpret_cast<void*>(nextMultipleOf<alignment>(reinterpret_cast<size_t>(ptr))));
9087
}
91-
template <size_t alignment = MIN_ALIGNMENT>
88+
template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>
9289
static inline size_t getAlignment(void* addr)
9390
{
9491
return (getAlignment<alignment>(reinterpret_cast<size_t>(addr)));
9592
}
96-
template <size_t alignment = MIN_ALIGNMENT, class S>
93+
template <size_t alignment = GPUCA_BUFFER_ALIGNMENT, class S>
9794
static inline S* getPointerWithAlignment(size_t& basePtr, size_t nEntries = 1)
9895
{
9996
if (basePtr == 0) {
@@ -105,13 +102,13 @@ class GPUProcessor
105102
basePtr += nEntries * sizeof(S);
106103
return retVal;
107104
}
108-
template <size_t alignment = MIN_ALIGNMENT, class S>
105+
template <size_t alignment = GPUCA_BUFFER_ALIGNMENT, class S>
109106
static inline S* getPointerWithAlignment(void*& basePtr, size_t nEntries = 1)
110107
{
111108
return getPointerWithAlignment<alignment, S>(reinterpret_cast<size_t&>(basePtr), nEntries);
112109
}
113110

114-
template <size_t alignment = MIN_ALIGNMENT, class T, class S>
111+
template <size_t alignment = GPUCA_BUFFER_ALIGNMENT, class T, class S>
115112
static inline void computePointerWithAlignment(T*& basePtr, S*& objPtr, size_t nEntries = 1, bool runConstructor = false)
116113
{
117114
objPtr = getPointerWithAlignment<alignment, S>(reinterpret_cast<size_t&>(basePtr), nEntries);

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res,
268268
memorypool = (char*)((res->*setPtr)(memorypool));
269269
size_t retVal = (char*)memorypool - (char*)ptr;
270270
if (IsGPU() && retVal == 0) { // Transferring 0 bytes might break some GPU backends, but we cannot simply skip the transfer, or we will break event dependencies
271-
GPUProcessor::getPointerWithAlignment<GPUProcessor::MIN_ALIGNMENT, char>(memorypool, retVal = GPUProcessor::MIN_ALIGNMENT);
271+
GPUProcessor::getPointerWithAlignment<GPUCA_BUFFER_ALIGNMENT, char>(memorypool, retVal = GPUCA_BUFFER_ALIGNMENT);
272272
}
273273
if ((size_t)((char*)memorypool - (char*)memorybase) > memorysize) {
274274
std::cout << "Memory pool size exceeded (" << res->mName << ": " << (char*)memorypool - (char*)memorybase << " < " << memorysize << "\n";
@@ -299,9 +299,9 @@ size_t GPUReconstruction::AllocateRegisteredMemory(short ires, GPUOutputControl*
299299
}
300300
res->mPtrDevice = mMemoryResources[res->mReuse].mPtrDevice;
301301
} else {
302-
res->mPtrDevice = operator new(res->mSize + GPUProcessor::MIN_ALIGNMENT);
302+
res->mPtrDevice = operator new(res->mSize + GPUCA_BUFFER_ALIGNMENT);
303303
}
304-
res->mPtr = GPUProcessor::alignPointer<GPUProcessor::MIN_ALIGNMENT>(res->mPtrDevice);
304+
res->mPtr = GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(res->mPtrDevice);
305305
res->SetPointers(res->mPtr);
306306
if (mDeviceProcessingSettings.debugLevel >= 5) {
307307
std::cout << (res->mReuse != -1 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << "\n";
@@ -347,8 +347,8 @@ void* GPUReconstruction::AllocateUnmanagedMemory(size_t size, int type)
347347
throw std::bad_alloc();
348348
}
349349
if (mDeviceProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
350-
mUnmanagedChunks.emplace_back(new char[size + GPUProcessor::MIN_ALIGNMENT]);
351-
return GPUProcessor::alignPointer<GPUProcessor::MIN_ALIGNMENT>(mUnmanagedChunks.back().get());
350+
mUnmanagedChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]);
351+
return GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(mUnmanagedChunks.back().get());
352352
} else {
353353
void* pool = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPool : mHostMemoryPool;
354354
void* base = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryBase : mHostMemoryBase;

GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -137,14 +137,14 @@ void GPUTPCSliceData::SetClusterData(const GPUTPCClusterData* data, int nCluster
137137

138138
void GPUTPCSliceData::SetMaxData()
139139
{
140-
int hitMemCount = GPUCA_ROW_COUNT * sizeof(GPUCA_ROWALIGNMENT) + mNumberOfHits;
140+
int hitMemCount = GPUCA_ROW_COUNT * GPUCA_ROWALIGNMENT + mNumberOfHits;
141141
const unsigned int kVectorAlignment = 256;
142-
mNumberOfHitsPlusAlign = GPUProcessor::nextMultipleOf<(kVectorAlignment > sizeof(GPUCA_ROWALIGNMENT) ? kVectorAlignment : sizeof(GPUCA_ROWALIGNMENT)) / sizeof(int)>(hitMemCount);
142+
mNumberOfHitsPlusAlign = GPUProcessor::nextMultipleOf<(kVectorAlignment > GPUCA_ROWALIGNMENT ? kVectorAlignment : GPUCA_ROWALIGNMENT) / sizeof(int)>(hitMemCount);
143143
}
144144

145145
void* GPUTPCSliceData::SetPointersInput(void* mem, bool idsOnGPU)
146146
{
147-
const int firstHitInBinSize = (23 + sizeof(GPUCA_ROWALIGNMENT) / sizeof(int)) * GPUCA_ROW_COUNT + 4 * mNumberOfHits + 3;
147+
const int firstHitInBinSize = (23 + GPUCA_ROWALIGNMENT / sizeof(int)) * GPUCA_ROW_COUNT + 4 * mNumberOfHits + 3;
148148
GPUProcessor::computePointerWithAlignment(mem, mHitData, mNumberOfHitsPlusAlign);
149149
GPUProcessor::computePointerWithAlignment(mem, mFirstHitInBin, firstHitInBinSize);
150150
if (idsOnGPU) {
@@ -248,7 +248,7 @@ int GPUTPCSliceData::InitFromClusterData(GPUconstantref() const MEM_CONSTANT(GPU
248248
// 2. fill HitData and FirstHitInBin
249249
////////////////////////////////////
250250

251-
vecpod<GPUTPCHit> binSortedHits(mNumberOfHits + sizeof(GPUCA_ROWALIGNMENT));
251+
vecpod<GPUTPCHit> binSortedHits(mNumberOfHits + GPUCA_ROWALIGNMENT);
252252

253253
int gridContentOffset = 0;
254254
int hitOffset = 0;
@@ -274,7 +274,7 @@ int GPUTPCSliceData::InitFromClusterData(GPUconstantref() const MEM_CONSTANT(GPU
274274
}
275275
row.mNHits = NumberOfClustersInRow[rowIndex];
276276
row.mHitNumberOffset = hitOffset;
277-
hitOffset += GPUProcessor::nextMultipleOf<sizeof(GPUCA_ROWALIGNMENT) / sizeof(unsigned short)>(NumberOfClustersInRow[rowIndex]);
277+
hitOffset += GPUProcessor::nextMultipleOf<GPUCA_ROWALIGNMENT / sizeof(unsigned short)>(NumberOfClustersInRow[rowIndex]);
278278

279279
row.mFirstHitInBinOffset = gridContentOffset;
280280

@@ -286,7 +286,7 @@ int GPUTPCSliceData::InitFromClusterData(GPUconstantref() const MEM_CONSTANT(GPU
286286
return (1);
287287
}
288288

289-
int binCreationMemorySizeNew = numberOfBins * 2 + 6 + row.mNHits + sizeof(GPUCA_ROWALIGNMENT) / sizeof(unsigned short) * (GPUCA_ROW_COUNT + 1) + 1;
289+
int binCreationMemorySizeNew = numberOfBins * 2 + 6 + row.mNHits + GPUCA_ROWALIGNMENT / sizeof(unsigned short) * (GPUCA_ROW_COUNT + 1) + 1;
290290
if (binCreationMemorySizeNew > binCreationMemorySize) {
291291
binCreationMemorySize = binCreationMemorySizeNew;
292292
binCreationMemory.resize(binCreationMemorySize);
@@ -344,7 +344,7 @@ int GPUTPCSliceData::InitFromClusterData(GPUconstantref() const MEM_CONSTANT(GPU
344344
gridContentOffset += nn;
345345

346346
// Make pointer aligned
347-
gridContentOffset = GPUProcessor::nextMultipleOf<sizeof(GPUCA_ROWALIGNMENT) / sizeof(calink)>(gridContentOffset);
347+
gridContentOffset = GPUProcessor::nextMultipleOf<GPUCA_ROWALIGNMENT / sizeof(calink)>(gridContentOffset);
348348
}
349349

350350
return (0);

GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,13 +70,13 @@ void GPUTPCClusterFinder::SetMaxData(const GPUTrackingInOutPointers& io)
7070
mNMaxPeaks = 0.5f * mNMaxDigits;
7171
mNMaxClusters = 0.2f * mNMaxPeaks;
7272
mNMaxClusterPerRow = 0.01f * mNMaxDigits;
73-
mBufSize = nextMultipleOf<std::max<int>(GPUCA_MEMALIGN_SMALL, mScanWorkGroupSize)>(mNMaxDigits);
73+
mBufSize = nextMultipleOf<std::max<int>(GPUCA_MEMALIGN, mScanWorkGroupSize)>(mNMaxDigits);
7474
mNBufs = getNSteps(mBufSize);
7575
}
7676

7777
void GPUTPCClusterFinder::SetNMaxDigits(size_t nDigits, size_t nPages)
7878
{
79-
mNMaxDigits = nextMultipleOf<std::max<int>(GPUCA_MEMALIGN_SMALL, mScanWorkGroupSize)>(nDigits);
79+
mNMaxDigits = nextMultipleOf<std::max<int>(GPUCA_MEMALIGN, mScanWorkGroupSize)>(nDigits);
8080
mNMaxPages = nPages;
8181
}
8282

0 commit comments

Comments
 (0)