From 070aaaf1f76afb6b88132a173106d28d2845116c Mon Sep 17 00:00:00 2001
From: jodavies <jodavies@users.noreply.github.com>
Date: Thu, 26 Feb 2026 11:37:11 +0000
Subject: [PATCH] perf: allocate Normalize buffers dynamically, store in AT

Large stack allocations have a performance implication due to stack clash
protection. Dynamically allocate the buffers needed by Normalize in AT,
instead. This leads to a large performance improvement.
---
 sources/declare.h |  1 +
 sources/normal.c  | 80 +++++++++++++++++++++++++++++++++--------------
 sources/setfile.c | 26 +++++++++++++++
 sources/startup.c |  5 +++
 sources/structs.h | 23 ++++++++++++++
 sources/threads.c |  6 ++++
 6 files changed, 118 insertions(+), 23 deletions(-)

diff --git a/sources/declare.h b/sources/declare.h
index 802e171e..b8493ef4 100644
--- a/sources/declare.h
+++ b/sources/declare.h
@@ -1300,6 +1300,7 @@ extern void   SubsInAll(PHEAD0);
 extern void   TransferBuffer(int,int,int);
 extern int    TakeIDfunction(PHEAD WORD *);
 extern int    MakeSetupAllocs(void);
+extern NORMDATA *AllocNormData(void);
 extern int    TryFileSetups(void);
 extern void   ExchangeExpressions(int,int);
 extern void   ExchangeDollars(int,int);
diff --git a/sources/normal.c b/sources/normal.c
index 7e5254ff..93d6e4c0 100644
--- a/sources/normal.c
+++ b/sources/normal.c
@@ -1,11 +1,12 @@
 /** @file normal.c
  * 
- *  Mainly the routine Normalize. This routine brings terms to standard
- *	FORM. Currently it has one serious drawback. Its buffers are all
- *	in the stack. This means these buffers have a fixed size (NORMSIZE).
- *	In the past this has caused problems and NORMSIZE had to be increased.
+ *	Mainly the routine Normalize. This routine brings terms to standard
+ *	form. Its main buffers are in AT, but they have a fixed size controlled
+ *	by NORMSIZE, which limit the maximum complexity of terms which can be
+ *	normalized.
  *
- *	It is not clear whether Normalize can be called recursively.
+ *	Normalize is called recursively, currently via:
+ *		Normalize -> ExpandRat -> Normalize.
  */
 /* #[ License : */
 /*
@@ -185,8 +186,7 @@ int Commute(WORD *fleft, WORD *fright)
 
 	This is the big normalization routine. It has a great need
 	to be economical.
-	There is a fixed limit to the number of objects coming in.
-	Something should be done about it.
+	The limit on the number of objects coming in is given by NORMSIZE.
 
 */
 
@@ -199,20 +199,41 @@ int Normalize(PHEAD WORD *term)
 	WORD *t, *m, *r, i, j, k, l, nsym, *ss, *tt, *u;
 	WORD shortnum, stype;
 	WORD *stop, *to = 0, *from = 0;
-/*
-	The next variables could be better off in the AT.WorkSpace (?)
-	Now they make stackallocations rather bothersome.
-*/
-	WORD psym[7*NORMSIZE],*ppsym;
-	WORD pvec[NORMSIZE],*ppvec,nvec;
-	WORD pdot[3*NORMSIZE],*ppdot,ndot;
-	WORD pdel[2*NORMSIZE],*ppdel,ndel;
-	WORD pind[NORMSIZE],nind;
-	WORD *peps[NORMSIZE/3],neps;
-	WORD *pden[NORMSIZE/3],nden;
-	WORD *pcom[NORMSIZE],ncom;
-	WORD *pnco[NORMSIZE],nnco;
-	WORD *pcon[2*NORMSIZE],ncon;		/* Pointer to contractable indices */
+
+	WORD *ppsym, *ppvec, *ppdot, *ppdel;
+	WORD nvec, ndot, ndel, nind, neps, nden, ncom, nnco, ncon;
+
+	AT.NormDepth++;
+#ifdef DEBUGGING
+	if ( AT.NormDepth > 2 ) {
+		// We don't expect this to happen in the current codebase.
+		Terminate(-1);
+	}
+#endif
+	if ( AT.NormDepth > AT.NormDataSize ) {
+		NORMDATA **top = AT.NormData + AT.NormDataSize;
+		DoubleBuffer((void **)&(AT.NormData), (void **)&(top),
+			sizeof(*AT.NormData), "double NormData pointers");
+		AT.NormDataSize *= 2;
+		for ( LONG i = AT.NormDepth-1; i < AT.NormDataSize; i++ ) {
+			AT.NormData[i] = NULL;
+		}
+	}
+	if ( AT.NormData[AT.NormDepth-1] == NULL ) {
+		AT.NormData[AT.NormDepth-1] = AllocNormData();
+	}
+
+	WORD  *psym = AT.NormData[AT.NormDepth-1]->psym;
+	WORD  *pvec = AT.NormData[AT.NormDepth-1]->pvec;
+	WORD  *pdot = AT.NormData[AT.NormDepth-1]->pdot;
+	WORD  *pdel = AT.NormData[AT.NormDepth-1]->pdel;
+	WORD  *pind = AT.NormData[AT.NormDepth-1]->pind;
+	WORD **peps = AT.NormData[AT.NormDepth-1]->peps;
+	WORD **pden = AT.NormData[AT.NormDepth-1]->pden;
+	WORD **pcom = AT.NormData[AT.NormDepth-1]->pcom;
+	WORD **pnco = AT.NormData[AT.NormDepth-1]->pnco;
+	WORD **pcon = AT.NormData[AT.NormDepth-1]->pcon;
+
 	WORD *n_coef, ncoef;				/* Accumulator for the coefficient */
 	WORD *n_llnum, *lnum, nnum;
 	WORD *termout, oldtoprhs = 0, subtype;
@@ -243,7 +264,12 @@ PrintTerm(term,"Normalize");
 	didcontr = 0;
 	ReplaceType = -1;
 	t = term;
-	if ( !*t ) { TermFree(n_coef,"NormCoef"); TermFree(n_llnum,"n_llnum"); return(regval); }
+	if ( !*t ) {
+		AT.NormDepth--;
+		TermFree(n_coef,"NormCoef");
+		TermFree(n_llnum,"n_llnum");
+		return(regval);
+	}
 	r = t + *t;
 	ncoef = r[-1];
 	i = ABS(ncoef);
@@ -4134,6 +4160,7 @@ regularratfun:;
 		C->numrhs = oldtoprhs;
 		C->Pointer = C->Buffer + oldcpointer;
 */
+		AT.NormDepth--;
 		TermFree(n_llnum,"n_llnum");
 		TermFree(n_coef,"NormCoef");
 		return(1);
@@ -4161,6 +4188,7 @@ regularratfun:;
 		TermAssign(term);
 	}
 */
+	AT.NormDepth--;
 	TermFree(n_llnum,"n_llnum");
 	TermFree(n_coef,"NormCoef");
 	return(regval);
@@ -4186,11 +4214,13 @@ regularratfun:;
 NormZero:
 	*term = 0;
 	AT.WorkPointer = termout;
+	AT.NormDepth--;
 	TermFree(n_llnum,"n_llnum");
 	TermFree(n_coef,"NormCoef");
 	return(regval);
 
 NormMin:
+	AT.NormDepth--;
 	TermFree(n_llnum,"n_llnum");
 	TermFree(n_coef,"NormCoef");
 	return(-1);
@@ -4199,6 +4229,7 @@ regularratfun:;
 	MLOCK(ErrorMessageLock);
 	MesCall("Norm");
 	MUNLOCK(ErrorMessageLock);
+	AT.NormDepth--;
 	TermFree(n_llnum,"n_llnum");
 	TermFree(n_coef,"NormCoef");
 	return(-1);
@@ -5164,7 +5195,10 @@ void DropSymbols(PHEAD WORD *term)
 int SymbolNormalize(WORD *term)
 {
 	GETIDENTITY
-	WORD buffer[7*NORMSIZE], *t, *b, *bb, *tt, *m, *tstop;
+	WORD *t, *b, *bb, *tt, *m, *tstop;
+	// Here we use a stack-allocated array, since things are much smaller
+	// compared to the full Normalize routine.
+	WORD buffer[7*NORMSIZE];
 	int i;
 	b = buffer;
 	*b++ = SYMBOL; *b++ = 2;
diff --git a/sources/setfile.c b/sources/setfile.c
index a61cde24..17f028a3 100644
--- a/sources/setfile.c
+++ b/sources/setfile.c
@@ -1116,6 +1116,32 @@ int MakeSetupAllocs(void)
 
 /*
  		#] MakeSetupAllocs : 
+		#[ AllocNormData :
+
+	Allocate the arrays within the NORMDATA struct. These are dynamic,
+	such that valgrind can detect buffer overruns in these arrays.
+*/
+
+NORMDATA* AllocNormData(void)
+{
+	NORMDATA* tmp = Malloc1(sizeof(NORMDATA), "NormData struct");
+
+	tmp->psym = Malloc1(7*NORMSIZE*sizeof(*(tmp->psym)), "Normalize struct psym");
+	tmp->pvec = Malloc1(1*NORMSIZE*sizeof(*(tmp->pvec)), "Normalize struct pvec");
+	tmp->pdot = Malloc1(3*NORMSIZE*sizeof(*(tmp->pdot)), "Normalize struct pdot");
+	tmp->pdel = Malloc1(2*NORMSIZE*sizeof(*(tmp->pdel)), "Normalize struct pdel");
+	tmp->pind = Malloc1(1*NORMSIZE*sizeof(*(tmp->pind)), "Normalize struct pind");
+	tmp->peps = Malloc1(NORMSIZE/3*sizeof(*(tmp->peps)), "Normalize struct peps");
+	tmp->pden = Malloc1(NORMSIZE/3*sizeof(*(tmp->pden)), "Normalize struct pden");
+	tmp->pcom = Malloc1(1*NORMSIZE*sizeof(*(tmp->pcom)), "Normalize struct pcom");
+	tmp->pnco = Malloc1(1*NORMSIZE*sizeof(*(tmp->pnco)), "Normalize struct pnco");
+	tmp->pcon = Malloc1(2*NORMSIZE*sizeof(*(tmp->pcon)), "Normalize struct pcon");
+
+	return tmp;
+}
+
+/*
+		#] AllocNormData : 
  		#[ TryFileSetups :
 
 		Routine looks in the input file for a start of the type
diff --git a/sources/startup.c b/sources/startup.c
index e724906a..4f7adcde 100644
--- a/sources/startup.c
+++ b/sources/startup.c
@@ -1232,6 +1232,11 @@ void StartVariables(void)
 	AR.wranfnpair1 = NPAIR1;
 	AR.wranfnpair2 = NPAIR2;
 	AR.wranfseed = 0;
+
+	AT.NormData = Malloc1(sizeof(*(AT.NormData)), "NormData pointers");
+	AT.NormDataSize = 1;
+	AT.NormData[0] = AllocNormData();
+	AT.NormDepth = 0;
 #endif
 	AM.atstartup = 1;
 	AM.oldnumextrasymbols = strDup1((UBYTE *)"OLDNUMEXTRASYMBOLS_","oldnumextrasymbols");
diff --git a/sources/structs.h b/sources/structs.h
index 651a6ddf..c4c64897 100644
--- a/sources/structs.h
+++ b/sources/structs.h
@@ -1373,6 +1373,26 @@ typedef struct {
 	WORD flags;
 } TERMINFO;
 
+/*
+	Struct to hold temporary data in Normalize, so that it is not allocated
+	on the stack each function call. The sizes of these arrays introduces a
+	maximum complexity of terms which can be normalized. Each array is
+	allocated dynamically, by AllocNormData, such that valgrind can detect
+	errors if they are over-run.
+*/
+typedef struct NoRmDaTa {
+	WORD  *psym;
+	WORD  *pvec;
+	WORD  *pdot;
+	WORD  *pdel;
+	WORD  *pind;
+	WORD **peps;
+	WORD **pden;
+	WORD **pcom;
+	WORD **pnco;
+	WORD **pcon;
+} NORMDATA;
+
 /*
   	#] Varia : 
     #[ A :
@@ -2081,6 +2101,9 @@ struct T_const {
     void    *aux_;
     void    *auxr_;
 #endif
+    NORMDATA **NormData;
+    LONG    NormDataSize;
+    LONG    NormDepth;
     PARTI   partitions;
     LONG    sBer;                  /* (T) Size of the Bernoullis buffer */
     LONG    pWorkPointer;          /* (R) Offset-pointer in pWorkSpace */
diff --git a/sources/threads.c b/sources/threads.c
index 9176e4c6..73fa5de1 100644
--- a/sources/threads.c
+++ b/sources/threads.c
@@ -737,6 +737,12 @@ ALLPRIVATES *InitializeOneThread(int identity)
 	AR.wranfnpair1 = NPAIR1;
 	AR.wranfnpair2 = NPAIR2;
 	AR.wranfseed = 0;
+
+	AT.NormData = Malloc1(sizeof(*(AT.NormData)), "NormData thread pointers");
+	AT.NormDataSize = 1;
+	AT.NormData[0] = AllocNormData();
+	AT.NormDepth = 0;
+
 	AN.SplitScratch = 0;
 	AN.SplitScratchSize = AN.InScratch = 0;
 	AN.SplitScratch1 = 0;