Skip to content

Commit 351b273

Browse files
committed
GPU: Transpose shared memory caches to avoid bank conflicts
1 parent 5d33833 commit 351b273

File tree

4 files changed, with 30 additions and 23 deletions.

4 files changed, with 30 additions and 23 deletions.

GPU/GPUTracking/SliceTracker/GPUTPCNeighboursFinder.cxx

Lines changed: 24 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -124,16 +124,10 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int /*nBlocks*/, int nThreads, i
124124
const float z = z0 + tracker.HitDataZ(row, ih) * stepZ;
125125
#endif // GPUCA_TEXTURE_FETCH_NEIGHBORS
126126

127-
#if GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP > 0
128-
GPUsharedref() calink* neighUp = s.mB[iThread];
129-
GPUsharedref() float2* yzUp = s.mA[iThread];
130127
#if GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP < GPUCA_MAXN
131-
calink neighUp2[GPUCA_MAXN - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP];
132-
float2 yzUp2[GPUCA_MAXN - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP];
133-
#endif
134-
#else
135-
calink neighUp[GPUCA_MAXN];
136-
float2 yzUp[GPUCA_MAXN];
128+
calink neighUp[GPUCA_MAXN - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP];
129+
float yzUp[GPUCA_MAXN - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP];
130+
float yzUp2[GPUCA_MAXN - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP];
137131
#endif // GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP > 0
138132

139133
int nNeighUp = 0;
@@ -151,15 +145,23 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int /*nBlocks*/, int nThreads, i
151145
break;
152146
}
153147

154-
#if GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP > 0 && GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP < GPUCA_MAXN
155-
if (nNeighUp >= GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP) {
156-
neighUp2[nNeighUp - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP] = (calink)i;
157-
yzUp2[nNeighUp - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP] = CAMath::MakeFloat2(s.mDnDx * (h.Y() - y), s.mDnDx * (h.Z() - z));
148+
#if GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP < GPUCA_MAXN
149+
#if GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP == 0
150+
if (true) {
151+
#else
152+
if ((unsigned int)nNeighUp >= GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP) {
153+
#endif
154+
neighUp[nNeighUp - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP] = (calink)i;
155+
yzUp[nNeighUp - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP] = s.mDnDx * (h.Y() - y);
156+
yzUp2[nNeighUp - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP] = s.mDnDx * (h.Z() - z);
158157
} else
159158
#endif
160159
{
161-
neighUp[nNeighUp] = (calink)i;
162-
yzUp[nNeighUp] = CAMath::MakeFloat2(s.mDnDx * (h.Y() - y), s.mDnDx * (h.Z() - z));
160+
#if GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP > 0
161+
s.mB[nNeighUp][iThread] = (calink)i;
162+
s.mA1[nNeighUp][iThread] = s.mDnDx * (h.Y() - y);
163+
s.mA2[nNeighUp][iThread] = s.mDnDx * (h.Z() - z);
164+
#endif
163165
}
164166
if (++nNeighUp >= GPUCA_MAXN) {
165167
// GPUInfo("Neighbors buffer ran full...");
@@ -184,9 +186,11 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int /*nBlocks*/, int nThreads, i
184186

185187
for (int iUp = 0; iUp < nNeighUp; iUp++) {
186188
#if GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP > 0 && GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP < GPUCA_MAXN
187-
float2 yzup = iUp >= GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP ? yzUp2[iUp - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP] : yzUp[iUp];
189+
float2 yzup = iUp >= GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP ? CAMath::MakeFloat2(yzUp[iUp - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP], yzUp2[iUp - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP]) : CAMath::MakeFloat2(s.mA1[iUp][iThread], s.mA2[iUp][iThread]);
190+
#elif GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP == GPUCA_MAXN
191+
float2 yzup = CAMath::MakeFloat2(s.mA1[iUp][iThread], s.mA2[iUp][iThread]);
188192
#else
189-
float2 yzup = yzUp[iUp];
193+
float2 yzup = CAMath::MakeFloat2(yzUp[iUp], yzUp2[iUp]);
190194
#endif
191195
float dy = yzdn.x - yzup.x;
192196
float dz = yzdn.y - yzup.y;
@@ -201,7 +205,9 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int /*nBlocks*/, int nThreads, i
201205

202206
if (bestD <= chi2Cut) {
203207
#if GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP > 0 && GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP < GPUCA_MAXN
204-
linkUp = bestUp >= GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP ? neighUp2[bestUp - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP] : neighUp[bestUp];
208+
linkUp = bestUp >= GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP ? neighUp[bestUp - GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP] : s.mB[bestUp][iThread];
209+
#elif GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP == GPUCA_MAXN
210+
linkUp = s.mB[bestUp][iThread];
205211
#else
206212
linkUp = neighUp[bestUp];
207213
#endif

GPU/GPUTracking/SliceTracker/GPUTPCNeighboursFinder.h

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -46,8 +46,9 @@ class GPUTPCNeighboursFinder : public GPUKernelTemplate
4646
int mIRowUp; // next row number
4747
int mIRowDn; // previous row number
4848
#if GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP > 0
49-
float2 mA[GPUCA_THREAD_COUNT_FINDER][GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP]; // temp memory
50-
calink mB[GPUCA_THREAD_COUNT_FINDER][GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP];
49+
float mA1[GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_THREAD_COUNT_FINDER];
50+
float mA2[GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_THREAD_COUNT_FINDER];
51+
calink mB[GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_THREAD_COUNT_FINDER];
5152
#endif
5253
MEM_LG(GPUTPCRow)
5354
mRow, mRowUp, mRowDown;

GPU/GPUTracking/SliceTracker/GPUTPCTrackletSelector.cxx

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,7 @@ GPUdii() void GPUTPCTrackletSelector::Thread<0>(int nBlocks, int nThreads, int i
7373
gap = 0;
7474
#if GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
7575
if (nHits < GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE) {
76-
s.mHits[iThread][nHits].Set(irow, ih);
76+
s.mHits[nHits][iThread].Set(irow, ih);
7777
} else
7878
#endif // GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
7979
{
@@ -109,7 +109,7 @@ GPUdii() void GPUTPCTrackletSelector::Thread<0>(int nBlocks, int nThreads, int i
109109
for (int jh = 0; jh < nHits; jh++) {
110110
#if GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
111111
if (jh < GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE) {
112-
tracker.TrackHits()[nFirstTrackHit + jh] = s.mHits[iThread][jh];
112+
tracker.TrackHits()[nFirstTrackHit + jh] = s.mHits[jh][iThread];
113113
} else
114114
#endif // GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
115115
{

GPU/GPUTracking/SliceTracker/GPUTPCTrackletSelector.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ class GPUTPCTrackletSelector : public GPUKernelTemplate
3939
int mNThreadsTotal; // total n threads
4040
int mNTracklets; // n of tracklets
4141
#if GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
42-
GPUTPCHitId mHits[GPUCA_THREAD_COUNT_SELECTOR][GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE];
42+
GPUTPCHitId mHits[GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE][GPUCA_THREAD_COUNT_SELECTOR];
4343
#endif // GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
4444
};
4545

0 commit comments

Comments
 (0)