Math: Optimizations to CORDIC trigonometric functions

singalsu · singalsu · commit e68223a7fec6 · 2026-01-23T19:43:19.000+02:00
This patch optimizes the cycles count performance for the
math functions. The computational accuracy is not impacted.

In function cordic_approx() in trig.c, the most performance
critical function, the if statements are simplified and macro
values used instead of global constants. The first if statement
is simplified with use of absolute value of the parameter
th_rad_fxp and using new direction variable from sign of the
parameter.

The b_idx iteration loop is simplified with direction sign
variable and new variable for common shift amount.

The local variables in the b_idx iteration speed up the algorithm.

In is_scalar_cordic_acos() and is_scalar_cordic_asin() functions
the numiters parameter is replaced by numiters_minus_one to
avoid the subtract by one. The cosvalue and sinvalue shifts are
avoided by comparing against a constant that is 2x larger.

The variable k is eliminated from the b_i iteration since it is
duplicate of variable b_i.

The constant variables have been changed to macros, and the macro
names have been adjusted to match the actual Q-format used in the
code. E.g. the PI/2 Q29 fraction is the same integer value as the
PI Q28 fraction.

In header file trig.h the PI related constants are adjusted
to Q-format used in the code. The equation to calculate the
value in Octave is shown in comment.

The new macros for table size -1 iteration counts are added
and the inline functions are changed to use it.

The int16_t parameters for functions are replaced with int
for better code speed.

These changes save 46 MCPS on the MTL platform in the stft_process
component with 1024 size FFT and 256 hop, when converting FFTs
to (magnitude, phase) format and back to complex FFT.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
diff --git a/src/include/sof/math/trig.h b/src/include/sof/math/trig.h
@@ -13,14 +13,17 @@
 
 #include <stdint.h>
 
-#define PI_DIV2_Q4_28 421657428
-#define PI_DIV2_Q3_29 843314856
-#define PI_Q4_28      843314857
-#define PI_MUL2_Q4_28     1686629713
-#define CORDIC_31B_TABLE_SIZE		31
-#define CORDIC_15B_TABLE_SIZE		15
-#define CORDIC_30B_ITABLE_SIZE		30
-#define CORDIC_16B_ITABLE_SIZE		16
+#define PI_Q4_28 843314857	 /* int32(pi * 2^28) */
+#define PI_MUL2_Q4_28 1686629713 /* int32(2 * pi * 2^28) */
+#define PI_DIV2_Q3_29 843314857	 /* int32(pi / 2 * 2^29) */
+#define PI_Q3_29 1686629713	 /* int32(pi * 2^29) */
+
+#define CORDIC_31B_TABLE_SIZE 31
+#define CORDIC_15B_TABLE_SIZE 15
+#define CORDIC_30B_ITABLE_SIZE 30
+#define CORDIC_16B_ITABLE_SIZE 16
+#define CORDIC_31B_ITERS_MINUS_ONE (CORDIC_31B_TABLE_SIZE - 1)
+#define CORDIC_16B_ITERS_MINUS_ONE (CORDIC_16B_ITABLE_SIZE - 1)
 
 typedef enum {
 	EN_32B_CORDIC_SINE,
@@ -38,8 +41,9 @@ struct cordic_cmpx {
 
 void cordic_approx(int32_t th_rad_fxp, int32_t a_idx, int32_t *sign, int32_t *b_yn, int32_t *xn,
 		   int32_t *th_cdc_fxp);
-int32_t is_scalar_cordic_acos(int32_t realvalue, int16_t numiters);
-int32_t is_scalar_cordic_asin(int32_t realvalue, int16_t numiters);
+int32_t is_scalar_cordic_acos(int32_t realvalue, int numiters_minus_one);
+int32_t is_scalar_cordic_asin(int32_t realvalue, int numiters_minus_one);
+
 void cmpx_cexp(int32_t sign, int32_t b_yn, int32_t xn, cordic_cfg type, struct cordic_cmpx *cexp);
 /* Input is Q4.28, output is Q1.31 */
 /**
@@ -238,11 +242,9 @@ static inline int32_t asin_fixed_32b(int32_t cdc_asin_th)
 	int32_t th_asin_fxp;
 
 	if (cdc_asin_th >= 0)
-		th_asin_fxp = is_scalar_cordic_asin(cdc_asin_th,
-						    CORDIC_31B_TABLE_SIZE);
+		th_asin_fxp = is_scalar_cordic_asin(cdc_asin_th, CORDIC_31B_ITERS_MINUS_ONE);
 	else
-		th_asin_fxp = -is_scalar_cordic_asin(-cdc_asin_th,
-						     CORDIC_31B_TABLE_SIZE);
+		th_asin_fxp = -is_scalar_cordic_asin(-cdc_asin_th, CORDIC_31B_ITERS_MINUS_ONE);
 
 	return th_asin_fxp; /* Q2.30 */
 }
@@ -262,12 +264,10 @@ static inline int32_t acos_fixed_32b(int32_t cdc_acos_th)
 	int32_t th_acos_fxp;
 
 	if (cdc_acos_th >= 0)
-		th_acos_fxp = is_scalar_cordic_acos(cdc_acos_th,
-						    CORDIC_31B_TABLE_SIZE);
+		th_acos_fxp = is_scalar_cordic_acos(cdc_acos_th, CORDIC_31B_ITERS_MINUS_ONE);
 	else
 		th_acos_fxp =
-		PI_MUL2_Q4_28 - is_scalar_cordic_acos(-cdc_acos_th,
-						      CORDIC_31B_TABLE_SIZE);
+		    PI_Q3_29 - is_scalar_cordic_acos(-cdc_acos_th, CORDIC_31B_ITERS_MINUS_ONE);
 
 	return th_acos_fxp; /* Q3.29 */
 }
@@ -289,11 +289,9 @@ static inline int16_t asin_fixed_16b(int32_t cdc_asin_th)
 	int32_t th_asin_fxp;
 
 	if (cdc_asin_th >= 0)
-		th_asin_fxp = is_scalar_cordic_asin(cdc_asin_th,
-						    CORDIC_16B_ITABLE_SIZE);
+		th_asin_fxp = is_scalar_cordic_asin(cdc_asin_th, CORDIC_16B_ITERS_MINUS_ONE);
 	else
-		th_asin_fxp = -is_scalar_cordic_asin(-cdc_asin_th,
-						     CORDIC_16B_ITABLE_SIZE);
+		th_asin_fxp = -is_scalar_cordic_asin(-cdc_asin_th, CORDIC_16B_ITERS_MINUS_ONE);
 	/*convert Q2.30 to Q2.14 format*/
 	return sat_int16(Q_SHIFT_RND(th_asin_fxp, 30, 14));
 }
@@ -314,12 +312,10 @@ static inline int16_t acos_fixed_16b(int32_t cdc_acos_th)
 	int32_t th_acos_fxp;
 
 	if (cdc_acos_th >= 0)
-		th_acos_fxp = is_scalar_cordic_acos(cdc_acos_th,
-						    CORDIC_16B_ITABLE_SIZE);
+		th_acos_fxp = is_scalar_cordic_acos(cdc_acos_th, CORDIC_16B_ITERS_MINUS_ONE);
 	else
 		th_acos_fxp =
-		PI_MUL2_Q4_28 - is_scalar_cordic_acos(-cdc_acos_th,
-						      CORDIC_16B_ITABLE_SIZE);
+		    PI_Q3_29 - is_scalar_cordic_acos(-cdc_acos_th, CORDIC_16B_ITERS_MINUS_ONE);
 
 	/*convert Q3.29 to Q3.13 format*/
 	return sat_int16(Q_SHIFT_RND(th_acos_fxp, 29, 13));
diff --git a/src/math/trig.c b/src/math/trig.c
@@ -13,18 +13,13 @@
 #include <sof/math/cordic.h>
 #include <stdint.h>
 
-/* Use a local definition to avoid adding a dependency on <math.h> */
-#define _M_PI		3.14159265358979323846	/* pi */
+#define CORDIC_SINE_COS_LUT_Q29 652032874 /* deg = 69.586061, int32(1.214505869895220 * 2^29) */
+
+#define CORDIC_SINCOS_PIOVERTWO_Q28 421657428	  /* int32(pi / 2 * 2^28) */
+#define CORDIC_SINCOS_PI_Q28 843314857		  /* int32(pi * 2^28) */
+#define CORDIC_SINCOS_TWOPI_Q28 1686629713	  /* int32(2 * pi * 2^28) */
+#define CORDIC_SINCOS_ONEANDHALFPI_Q28 1264972285 /* int32(1.5 * pi * 2^28) */
 
-/* 652032874 , deg = 69.586061*/
-const int32_t cordic_sine_cos_lut_q29fl	 =  Q_CONVERT_FLOAT(1.214505869895220, 29);
-/* 1686629713, deg = 90.000000	*/
-const int32_t cordic_sine_cos_piovertwo_q30fl  = Q_CONVERT_FLOAT(_M_PI / 2, 30);
-/* 421657428 , deg = 90.000000 */
-const int32_t cord_sincos_piovertwo_q28fl  = Q_CONVERT_FLOAT(_M_PI / 2, 28);
-/* 843314857,  deg = 90.000000	*/
-const int32_t cord_sincos_piovertwo_q29fl  = Q_CONVERT_FLOAT(_M_PI / 2, 29);
-/* arc trignometry constant*/
 /**
  * CORDIC-based approximation of sine and cosine
  * \+----------+----------------------------------------+--------------------+-------------------+
@@ -36,20 +31,25 @@ const int32_t cord_sincos_piovertwo_q29fl  = Q_CONVERT_FLOAT(_M_PI / 2, 29);
  * \|1686629713| Q_CONVERT_FLOAT(1.5707963267341256, 30)|    89.9999999965181| 1.57079632673413  |
  * \+----------+----------------------------------------+--------------------+-------------------+
  */
-/* 379625062,  deg = 81.0284683480568475 or round(1.4142135605216026*2^28) */
-const int32_t cord_arcsincos_q28fl  = Q_CONVERT_FLOAT(1.4142135605216026 / 2, 28);
-/* 1073741824, deg = 57.2957795130823229 or round(1*2^30)*/
-const int32_t cord_arcsincos_q30fl  = Q_CONVERT_FLOAT(1.0000000000000000, 30);
+
+#define CORDIC_ARCSINCOS_SQRT2_DIV4_Q30 379625062 /* int32(sqrt(2) / 4 * 2^30) */
+#define CORDIC_ARCSINCOS_ONE_Q30 1073741824	  /* int32(1 * 2^30) */
+
 /**
  * CORDIC-based approximation of sine, cosine and complex exponential
  */
 void cordic_approx(int32_t th_rad_fxp, int32_t a_idx, int32_t *sign, int32_t *b_yn, int32_t *xn,
 		   int32_t *th_cdc_fxp)
 {
+	int32_t direction;
+	int32_t abs_th;
 	int32_t b_idx;
-	int32_t xtmp;
-	int32_t ytmp;
-	*sign = 1;
+	int32_t xn_local = CORDIC_SINE_COS_LUT_Q29;
+	int32_t yn_local = 0;
+	int32_t xtmp = CORDIC_SINE_COS_LUT_Q29;
+	int32_t ytmp = 0;
+	int shift;
+
 	/* Addition or subtraction by a multiple of pi/2 is done in the data type
 	 * of the input. When the fraction length is 29, then the quantization error
 	 * introduced by the addition or subtraction of pi/2 is done with 29 bits of
@@ -58,46 +58,37 @@ void cordic_approx(int32_t th_rad_fxp, int32_t a_idx, int32_t *sign, int32_t *b_
 	 * without overflow.Increase of fractionLength makes the addition or
 	 * subtraction of a multiple of pi/2 more precise
 	 */
-	if (th_rad_fxp > cord_sincos_piovertwo_q28fl) {
-		if ((th_rad_fxp - cord_sincos_piovertwo_q29fl) <= cord_sincos_piovertwo_q28fl) {
-			th_rad_fxp -= cord_sincos_piovertwo_q29fl;
-			*sign  = -1;
-		} else {
-			th_rad_fxp -= cordic_sine_cos_piovertwo_q30fl;
-		}
-	} else if (th_rad_fxp < -cord_sincos_piovertwo_q28fl) {
-		if ((th_rad_fxp + cord_sincos_piovertwo_q29fl) >= -cord_sincos_piovertwo_q28fl) {
-			th_rad_fxp += cord_sincos_piovertwo_q29fl;
-			*sign  = -1;
+	abs_th = (th_rad_fxp >= 0) ? th_rad_fxp : -th_rad_fxp;
+	direction = (th_rad_fxp >= 0) ? 1 : -1;
+	*sign = 1;
+	if (abs_th > CORDIC_SINCOS_PIOVERTWO_Q28) {
+		if (abs_th <= CORDIC_SINCOS_ONEANDHALFPI_Q28) {
+			th_rad_fxp -= direction * CORDIC_SINCOS_PI_Q28;
+			*sign = -1;
 		} else {
-			th_rad_fxp += cordic_sine_cos_piovertwo_q30fl;
+			th_rad_fxp -= direction * CORDIC_SINCOS_TWOPI_Q28;
 		}
 	}
 
 	th_rad_fxp <<= 2;
-	*b_yn = 0;
-	*xn = cordic_sine_cos_lut_q29fl;
-	xtmp = cordic_sine_cos_lut_q29fl;
-	ytmp = 0;
 
 	/* Calculate the correct coefficient values from rotation angle.
 	 * Find difference between the coefficients from the lookup table
 	 * and those from the calculation
 	 */
 	for (b_idx = 0; b_idx < a_idx; b_idx++) {
-		if (th_rad_fxp < 0) {
-			th_rad_fxp += cordic_lookup[b_idx];
-			*xn += ytmp;
-			*b_yn -= xtmp;
-		} else {
-			th_rad_fxp -= cordic_lookup[b_idx];
-			*xn -= ytmp;
-			*b_yn += xtmp;
-		}
-		xtmp = *xn >> (b_idx + 1);
-		ytmp = *b_yn >> (b_idx + 1);
+		direction = (th_rad_fxp >= 0) ? 1 : -1;
+		shift = b_idx + 1;
+		th_rad_fxp -= direction * cordic_lookup[b_idx];
+		xn_local -= direction * ytmp;
+		yn_local += direction * xtmp;
+		xtmp = xn_local >> shift;
+		ytmp = yn_local >> shift;
 	}
-	/* Q2.30 format -sine, cosine*/
+
+	/* Write back results once */
+	*xn = xn_local;
+	*b_yn = yn_local;
 	*th_cdc_fxp = th_rad_fxp;
 }
 EXPORT_SYMBOL(cordic_approx);
@@ -108,7 +99,7 @@ EXPORT_SYMBOL(cordic_approx);
  *		  int16_t numiters
  * Return Type	: int32_t
  */
-int32_t is_scalar_cordic_acos(int32_t cosvalue, int16_t numiters)
+int32_t is_scalar_cordic_acos(int32_t cosvalue, int numiters_minus_one)
 {
 	int32_t xdshift;
 	int32_t ydshift;
@@ -118,46 +109,37 @@ int32_t is_scalar_cordic_acos(int32_t cosvalue, int16_t numiters)
 	int32_t y = 0;
 	int32_t z = 0;
 	int32_t sign;
-	int32_t b_i;
-	int i;
+	int b_i;
 	int j;
-	int k;
 
 	/* Initialize the variables for the cordic iteration
 	 * angles less than pi/4, we initialize (x,y) along the x-axis.
 	 * angles greater than or equal to pi/4, we initialize (x,y)
 	 * along the y-axis. This improves the accuracy of the algorithm
 	 * near the edge of the domain of convergence
+	 *
+	 * Note: not pi/4 but sqrt(2)/4 is used as the threshold
 	 */
-	if ((cosvalue >> 1) < cord_arcsincos_q28fl) {
-		x = 0;
-		y = cord_arcsincos_q30fl;
+	if (cosvalue < CORDIC_ARCSINCOS_SQRT2_DIV4_Q30) {
+		y = CORDIC_ARCSINCOS_ONE_Q30;
 		z = PI_DIV2_Q3_29;
 	} else {
-		x = cord_arcsincos_q30fl;
-		y = 0;
-		z = 0;
+		x = CORDIC_ARCSINCOS_ONE_Q30;
 	}
 
 	/* DCORDIC(Double CORDIC) algorithm */
 	/* Double iterations method consists in the fact that unlike the classical */
 	/* CORDIC method,where the iteration step value changes EVERY time, i.e. on */
 	/* each iteration, in the double iteration method, the iteration step value */
 	/* is repeated twice and changes only through one iteration */
-	i = numiters - 1;
-	for (b_i = 0; b_i < i; b_i++) {
+	for (b_i = 0; b_i < numiters_minus_one; b_i++) {
 		j = (b_i + 1) << 1;
 		if (j >= 31)
 			j = 31;
 
-		if (b_i < 31)
-			k = b_i;
-		else
-			k = 31;
-
-		xshift = x >> k;
+		xshift = x >> b_i;
+		yshift = y >> b_i;
 		xdshift = x >> j;
-		yshift = y >> k;
 		ydshift = y >> j;
 		/* Do nothing if x currently equals the target value. Allowed for
 		 * double rotations algorithms, as it is equivalent to rotating by
@@ -188,7 +170,7 @@ int32_t is_scalar_cordic_acos(int32_t cosvalue, int16_t numiters)
  *		  int16_t numiters
  * Return Type	: int32_t
  */
-int32_t is_scalar_cordic_asin(int32_t sinvalue, int16_t numiters)
+int32_t is_scalar_cordic_asin(int32_t sinvalue, int numiters_minus_one)
 {
 	int32_t xdshift;
 	int32_t ydshift;
@@ -198,47 +180,39 @@ int32_t is_scalar_cordic_asin(int32_t sinvalue, int16_t numiters)
 	int32_t y = 0;
 	int32_t z = 0;
 	int32_t sign;
-	int32_t b_i;
-	int i;
+	int b_i;
 	int j;
-	int k;
 
 	/* Initialize the variables for the cordic iteration
 	 * angles less than pi/4, we initialize (x,y) along the x-axis.
 	 * angles greater than or equal to pi/4, we initialize (x,y)
 	 * along the y-axis. This improves the accuracy of the algorithm
 	 * near the edge of the domain of convergence
+	 *
+	 * Note: Instead of pi/4, sqrt(2)/4 is used as the threshold
 	 */
-	if ((sinvalue >> 1) > cord_arcsincos_q28fl) {
-		x = 0;
-		y = cord_arcsincos_q30fl;
+	if (sinvalue > CORDIC_ARCSINCOS_SQRT2_DIV4_Q30) {
+		y = CORDIC_ARCSINCOS_ONE_Q30;
 		z = PI_DIV2_Q3_29;
 	} else {
-		x = cord_arcsincos_q30fl;
-		y = 0;
-		z = 0;
+		x = CORDIC_ARCSINCOS_ONE_Q30;
 	}
 
 	/* DCORDIC(Double CORDIC) algorithm */
 	/* Double iterations method consists in the fact that unlike the classical */
 	/* CORDIC method,where the iteration step value changes EVERY time, i.e. on */
 	/* each iteration, in the double iteration method, the iteration step value */
 	/* is repeated twice and changes only through one iteration */
-	i = numiters - 1;
-	for (b_i = 0; b_i < i; b_i++) {
+	// i = numiters - 1;
+	for (b_i = 0; b_i < numiters_minus_one; b_i++) {
 		j = (b_i + 1) << 1;
 		if (j >= 31)
 			j = 31;
 
-		if (b_i < 31)
-			k = b_i;
-		else
-			k = 31;
-
-		xshift = x >> k;
-		xdshift = x >> j;
-		yshift = y >> k;
+		xshift = x >> b_i;
+		yshift = y >> b_i;
 		ydshift = y >> j;
+		xdshift = x >> j;
 		/* Do nothing if x currently equals the target value. Allowed for
 		 * double rotations algorithms, as it is equivalent to rotating by
 		 * the same angle in opposite directions sequentially. Accounts for