GPU: Simplify and unify alignment constants

davidrohr · davidrohr · commit cd5ddd925fec · 2020-02-24T07:14:02.000+01:00
diff --git a/GPU/Common/GPUDefGPUParameters.h b/GPU/Common/GPUDefGPUParameters.h
@@ -111,7 +111,9 @@
 
 #define GPUCA_EXTERN_ROW_HITS                                          // Store row hits in separate array outside of tracklets
 #define GPUCA_SORT_STARTHITS_GPU                                       // Sort the start hits when running on GPU
-#define GPUCA_ROWALIGNMENT uint4                                       // Align Row Hits and Grid
+#define GPUCA_ROWALIGNMENT 16                                          // Alignment of Row Hits and Grid
+#define GPUCA_BUFFER_ALIGNMENT 64                                      // Alignment of buffers obtained from SetPointers
+#define GPUCA_MEMALIGN (64 * 1024)                                     // Alignment of allocated memory blocks
 
 // #define GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE                       // Output Profiling Data for Tracklet Constructor Tracklet Scheduling
 
@@ -122,8 +124,6 @@
 #define GPUCA_MAX_TRD_TRACKLETS      ((size_t)             128 * 1024) // Maximum number of TRD tracklets
 #define GPUCA_MAX_ITS_FIT_TRACKS     ((size_t)              96 * 1024) // Max number of tracks for ITS track fit
 #define GPUCA_TRACKER_CONSTANT_MEM   ((size_t)              63 * 1024) // Amount of Constant Memory to reserve
-#define GPUCA_MEMALIGN               ((size_t)              64 * 1024) // Alignment of memory blocks, all constants above must be multiple of this!!!
-#define GPUCA_MEMALIGN_SMALL         ((size_t)              64 * 1024) // Alignment of small blocks, GPUCA_MEMALIGN must be multiple of this!!!
 #define GPUCA_MEMORY_SIZE            ((size_t) 6 * 1024 * 1024 * 1024) // Size of memory allocated on Device
 #define GPUCA_HOST_MEMORY_SIZE       ((size_t) 6 * 1024 * 1024 * 1024) // Size of memory allocated on Host
 #define GPUCA_GPU_STACK_SIZE         ((size_t)               8 * 1024) // Stack size per GPU thread
diff --git a/GPU/GPUTracking/Base/GPUProcessor.h b/GPU/GPUTracking/Base/GPUProcessor.h
@@ -62,10 +62,7 @@ class GPUProcessor
   void InitGPUProcessor(GPUReconstruction* rec, ProcessorType type = PROCESSOR_TYPE_CPU, GPUProcessor* slaveProcessor = nullptr);
   void Clear();
 
-  // Helpers for memory allocation
-  CONSTEXPR static size_t MIN_ALIGNMENT = 64;
-
-  template <size_t alignment = MIN_ALIGNMENT>
+  template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>
   static inline size_t getAlignment(size_t addr)
   {
     static_assert((alignment & (alignment - 1)) == 0, "Invalid alignment, not power of 2");
@@ -78,22 +75,22 @@ class GPUProcessor
     }
     return (alignment - mod);
   }
-  template <size_t alignment = MIN_ALIGNMENT>
+  template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>
   static inline size_t nextMultipleOf(size_t size)
   {
     return size + getAlignment<alignment>(size);
   }
-  template <size_t alignment = MIN_ALIGNMENT>
+  template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>
   static inline void* alignPointer(void* ptr)
   {
     return (reinterpret_cast<void*>(nextMultipleOf<alignment>(reinterpret_cast<size_t>(ptr))));
   }
-  template <size_t alignment = MIN_ALIGNMENT>
+  template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>
   static inline size_t getAlignment(void* addr)
   {
     return (getAlignment<alignment>(reinterpret_cast<size_t>(addr)));
   }
-  template <size_t alignment = MIN_ALIGNMENT, class S>
+  template <size_t alignment = GPUCA_BUFFER_ALIGNMENT, class S>
   static inline S* getPointerWithAlignment(size_t& basePtr, size_t nEntries = 1)
   {
     if (basePtr == 0) {
@@ -105,13 +102,13 @@ class GPUProcessor
     basePtr += nEntries * sizeof(S);
     return retVal;
   }
-  template <size_t alignment = MIN_ALIGNMENT, class S>
+  template <size_t alignment = GPUCA_BUFFER_ALIGNMENT, class S>
   static inline S* getPointerWithAlignment(void*& basePtr, size_t nEntries = 1)
   {
     return getPointerWithAlignment<alignment, S>(reinterpret_cast<size_t&>(basePtr), nEntries);
   }
 
-  template <size_t alignment = MIN_ALIGNMENT, class T, class S>
+  template <size_t alignment = GPUCA_BUFFER_ALIGNMENT, class T, class S>
   static inline void computePointerWithAlignment(T*& basePtr, S*& objPtr, size_t nEntries = 1, bool runConstructor = false)
   {
     objPtr = getPointerWithAlignment<alignment, S>(reinterpret_cast<size_t&>(basePtr), nEntries);
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -268,7 +268,7 @@ size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res,
   memorypool = (char*)((res->*setPtr)(memorypool));
   size_t retVal = (char*)memorypool - (char*)ptr;
   if (IsGPU() && retVal == 0) { // Transferring 0 bytes might break some GPU backends, but we cannot simply skip the transfer, or we will break event dependencies
-    GPUProcessor::getPointerWithAlignment<GPUProcessor::MIN_ALIGNMENT, char>(memorypool, retVal = GPUProcessor::MIN_ALIGNMENT);
+    GPUProcessor::getPointerWithAlignment<GPUCA_BUFFER_ALIGNMENT, char>(memorypool, retVal = GPUCA_BUFFER_ALIGNMENT);
   }
   if ((size_t)((char*)memorypool - (char*)memorybase) > memorysize) {
     std::cout << "Memory pool size exceeded (" << res->mName << ": " << (char*)memorypool - (char*)memorybase << " < " << memorysize << "\n";
@@ -299,9 +299,9 @@ size_t GPUReconstruction::AllocateRegisteredMemory(short ires, GPUOutputControl*
         }
         res->mPtrDevice = mMemoryResources[res->mReuse].mPtrDevice;
       } else {
-        res->mPtrDevice = operator new(res->mSize + GPUProcessor::MIN_ALIGNMENT);
+        res->mPtrDevice = operator new(res->mSize + GPUCA_BUFFER_ALIGNMENT);
       }
-      res->mPtr = GPUProcessor::alignPointer<GPUProcessor::MIN_ALIGNMENT>(res->mPtrDevice);
+      res->mPtr = GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(res->mPtrDevice);
       res->SetPointers(res->mPtr);
       if (mDeviceProcessingSettings.debugLevel >= 5) {
         std::cout << (res->mReuse != -1 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << "\n";
@@ -347,8 +347,8 @@ void* GPUReconstruction::AllocateUnmanagedMemory(size_t size, int type)
     throw std::bad_alloc();
   }
   if (mDeviceProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
-    mUnmanagedChunks.emplace_back(new char[size + GPUProcessor::MIN_ALIGNMENT]);
-    return GPUProcessor::alignPointer<GPUProcessor::MIN_ALIGNMENT>(mUnmanagedChunks.back().get());
+    mUnmanagedChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]);
+    return GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(mUnmanagedChunks.back().get());
   } else {
     void* pool = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPool : mHostMemoryPool;
     void* base = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryBase : mHostMemoryBase;
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx
@@ -137,14 +137,14 @@ void GPUTPCSliceData::SetClusterData(const GPUTPCClusterData* data, int nCluster
 
 void GPUTPCSliceData::SetMaxData()
 {
-  int hitMemCount = GPUCA_ROW_COUNT * sizeof(GPUCA_ROWALIGNMENT) + mNumberOfHits;
+  int hitMemCount = GPUCA_ROW_COUNT * GPUCA_ROWALIGNMENT + mNumberOfHits;
   const unsigned int kVectorAlignment = 256;
-  mNumberOfHitsPlusAlign = GPUProcessor::nextMultipleOf<(kVectorAlignment > sizeof(GPUCA_ROWALIGNMENT) ? kVectorAlignment : sizeof(GPUCA_ROWALIGNMENT)) / sizeof(int)>(hitMemCount);
+  mNumberOfHitsPlusAlign = GPUProcessor::nextMultipleOf<(kVectorAlignment > GPUCA_ROWALIGNMENT ? kVectorAlignment : GPUCA_ROWALIGNMENT) / sizeof(int)>(hitMemCount);
 }
 
 void* GPUTPCSliceData::SetPointersInput(void* mem, bool idsOnGPU)
 {
-  const int firstHitInBinSize = (23 + sizeof(GPUCA_ROWALIGNMENT) / sizeof(int)) * GPUCA_ROW_COUNT + 4 * mNumberOfHits + 3;
+  const int firstHitInBinSize = (23 + GPUCA_ROWALIGNMENT / sizeof(int)) * GPUCA_ROW_COUNT + 4 * mNumberOfHits + 3;
   GPUProcessor::computePointerWithAlignment(mem, mHitData, mNumberOfHitsPlusAlign);
   GPUProcessor::computePointerWithAlignment(mem, mFirstHitInBin, firstHitInBinSize);
   if (idsOnGPU) {
@@ -248,7 +248,7 @@ int GPUTPCSliceData::InitFromClusterData(GPUconstantref() const MEM_CONSTANT(GPU
   // 2. fill HitData and FirstHitInBin
   ////////////////////////////////////
 
-  vecpod<GPUTPCHit> binSortedHits(mNumberOfHits + sizeof(GPUCA_ROWALIGNMENT));
+  vecpod<GPUTPCHit> binSortedHits(mNumberOfHits + GPUCA_ROWALIGNMENT);
 
   int gridContentOffset = 0;
   int hitOffset = 0;
@@ -274,7 +274,7 @@ int GPUTPCSliceData::InitFromClusterData(GPUconstantref() const MEM_CONSTANT(GPU
     }
     row.mNHits = NumberOfClustersInRow[rowIndex];
     row.mHitNumberOffset = hitOffset;
-    hitOffset += GPUProcessor::nextMultipleOf<sizeof(GPUCA_ROWALIGNMENT) / sizeof(unsigned short)>(NumberOfClustersInRow[rowIndex]);
+    hitOffset += GPUProcessor::nextMultipleOf<GPUCA_ROWALIGNMENT / sizeof(unsigned short)>(NumberOfClustersInRow[rowIndex]);
 
     row.mFirstHitInBinOffset = gridContentOffset;
 
@@ -286,7 +286,7 @@ int GPUTPCSliceData::InitFromClusterData(GPUconstantref() const MEM_CONSTANT(GPU
       return (1);
     }
 
-    int binCreationMemorySizeNew = numberOfBins * 2 + 6 + row.mNHits + sizeof(GPUCA_ROWALIGNMENT) / sizeof(unsigned short) * (GPUCA_ROW_COUNT + 1) + 1;
+    int binCreationMemorySizeNew = numberOfBins * 2 + 6 + row.mNHits + GPUCA_ROWALIGNMENT / sizeof(unsigned short) * (GPUCA_ROW_COUNT + 1) + 1;
     if (binCreationMemorySizeNew > binCreationMemorySize) {
       binCreationMemorySize = binCreationMemorySizeNew;
       binCreationMemory.resize(binCreationMemorySize);
@@ -344,7 +344,7 @@ int GPUTPCSliceData::InitFromClusterData(GPUconstantref() const MEM_CONSTANT(GPU
     gridContentOffset += nn;
 
     // Make pointer aligned
-    gridContentOffset = GPUProcessor::nextMultipleOf<sizeof(GPUCA_ROWALIGNMENT) / sizeof(calink)>(gridContentOffset);
+    gridContentOffset = GPUProcessor::nextMultipleOf<GPUCA_ROWALIGNMENT / sizeof(calink)>(gridContentOffset);
   }
 
   return (0);
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx
@@ -70,13 +70,13 @@ void GPUTPCClusterFinder::SetMaxData(const GPUTrackingInOutPointers& io)
   mNMaxPeaks = 0.5f * mNMaxDigits;
   mNMaxClusters = 0.2f * mNMaxPeaks;
   mNMaxClusterPerRow = 0.01f * mNMaxDigits;
-  mBufSize = nextMultipleOf<std::max<int>(GPUCA_MEMALIGN_SMALL, mScanWorkGroupSize)>(mNMaxDigits);
+  mBufSize = nextMultipleOf<std::max<int>(GPUCA_MEMALIGN, mScanWorkGroupSize)>(mNMaxDigits);
   mNBufs = getNSteps(mBufSize);
 }
 
 void GPUTPCClusterFinder::SetNMaxDigits(size_t nDigits, size_t nPages)
 {
-  mNMaxDigits = nextMultipleOf<std::max<int>(GPUCA_MEMALIGN_SMALL, mScanWorkGroupSize)>(nDigits);
+  mNMaxDigits = nextMultipleOf<std::max<int>(GPUCA_MEMALIGN, mScanWorkGroupSize)>(nDigits);
   mNMaxPages = nPages;
 }
 

Original file line number	Diff line number	Diff line change
`@@ -62,10 +62,7 @@ class GPUProcessor`
`62`	`62`	`void InitGPUProcessor(GPUReconstruction* rec, ProcessorType type = PROCESSOR_TYPE_CPU, GPUProcessor* slaveProcessor = nullptr);`
`63`	`63`	`void Clear();`
`64`	`64`
`65`		`- // Helpers for memory allocation`
`66`		`- CONSTEXPR static size_t MIN_ALIGNMENT = 64;`
`67`		`-`
`68`		`- template <size_t alignment = MIN_ALIGNMENT>`
	`65`	`+ template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>`
`69`	`66`	`static inline size_t getAlignment(size_t addr)`
`70`	`67`	`{`
`71`	`68`	`static_assert((alignment & (alignment - 1)) == 0, "Invalid alignment, not power of 2");`
`@@ -78,22 +75,22 @@ class GPUProcessor`
`78`	`75`	`}`
`79`	`76`	`return (alignment - mod);`
`80`	`77`	`}`
`81`		`- template <size_t alignment = MIN_ALIGNMENT>`
	`78`	`+ template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>`
`82`	`79`	`static inline size_t nextMultipleOf(size_t size)`
`83`	`80`	`{`
`84`	`81`	`return size + getAlignment<alignment>(size);`
`85`	`82`	`}`
`86`		`- template <size_t alignment = MIN_ALIGNMENT>`
	`83`	`+ template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>`
`87`	`84`	`static inline void* alignPointer(void* ptr)`
`88`	`85`	`{`
`89`	`86`	`return (reinterpret_cast<void*>(nextMultipleOf<alignment>(reinterpret_cast<size_t>(ptr))));`
`90`	`87`	`}`
`91`		`- template <size_t alignment = MIN_ALIGNMENT>`
	`88`	`+ template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>`
`92`	`89`	`static inline size_t getAlignment(void* addr)`
`93`	`90`	`{`
`94`	`91`	`return (getAlignment<alignment>(reinterpret_cast<size_t>(addr)));`
`95`	`92`	`}`
`96`		`- template <size_t alignment = MIN_ALIGNMENT, class S>`
	`93`	`+ template <size_t alignment = GPUCA_BUFFER_ALIGNMENT, class S>`
`97`	`94`	`static inline S* getPointerWithAlignment(size_t& basePtr, size_t nEntries = 1)`
`98`	`95`	`{`
`99`	`96`	`if (basePtr == 0) {`
`@@ -105,13 +102,13 @@ class GPUProcessor`
`105`	`102`	`basePtr += nEntries * sizeof(S);`
`106`	`103`	`return retVal;`
`107`	`104`	`}`
`108`		`- template <size_t alignment = MIN_ALIGNMENT, class S>`
	`105`	`+ template <size_t alignment = GPUCA_BUFFER_ALIGNMENT, class S>`
`109`	`106`	`static inline S* getPointerWithAlignment(void*& basePtr, size_t nEntries = 1)`
`110`	`107`	`{`
`111`	`108`	`return getPointerWithAlignment<alignment, S>(reinterpret_cast<size_t&>(basePtr), nEntries);`
`112`	`109`	`}`
`113`	`110`
`114`		`- template <size_t alignment = MIN_ALIGNMENT, class T, class S>`
	`111`	`+ template <size_t alignment = GPUCA_BUFFER_ALIGNMENT, class T, class S>`
`115`	`112`	`static inline void computePointerWithAlignment(T*& basePtr, S*& objPtr, size_t nEntries = 1, bool runConstructor = false)`
`116`	`113`	`{`
`117`	`114`	`objPtr = getPointerWithAlignment<alignment, S>(reinterpret_cast<size_t&>(basePtr), nEntries);`
Original file line number	Diff line number	Diff line change
`@@ -70,13 +70,13 @@ void GPUTPCClusterFinder::SetMaxData(const GPUTrackingInOutPointers& io)`
`70`	`70`	`mNMaxPeaks = 0.5f * mNMaxDigits;`
`71`	`71`	`mNMaxClusters = 0.2f * mNMaxPeaks;`
`72`	`72`	`mNMaxClusterPerRow = 0.01f * mNMaxDigits;`
`73`		`- mBufSize = nextMultipleOf<std::max<int>(GPUCA_MEMALIGN_SMALL, mScanWorkGroupSize)>(mNMaxDigits);`
	`73`	`+ mBufSize = nextMultipleOf<std::max<int>(GPUCA_MEMALIGN, mScanWorkGroupSize)>(mNMaxDigits);`
`74`	`74`	`mNBufs = getNSteps(mBufSize);`
`75`	`75`	`}`
`76`	`76`
`77`	`77`	`void GPUTPCClusterFinder::SetNMaxDigits(size_t nDigits, size_t nPages)`
`78`	`78`	`{`
`79`		`- mNMaxDigits = nextMultipleOf<std::max<int>(GPUCA_MEMALIGN_SMALL, mScanWorkGroupSize)>(nDigits);`
	`79`	`+ mNMaxDigits = nextMultipleOf<std::max<int>(GPUCA_MEMALIGN, mScanWorkGroupSize)>(nDigits);`
`80`	`80`	`mNMaxPages = nPages;`
`81`	`81`	`}`
`82`	`82`