Skip to content

Commit e286577

Browse files
committed
Math: FIR: Optimize filter core function for HiFi5
This patch optimizes the function fir_32x16_2x_hifi5(). - The (4x) quad-MAC with AE_MULAFD32X16X2_FIR_HH() and AE_MULAFD32X16X2_FIR_HL() is replaced with an 8x MAC intrinsic AE_MULA2Q32X16_FIR_H(). - Since the 8x MAC does not support fractional multiplication, a shift left by one is added to adjust the format to Q17.47. - The separate saturate-and-round of each output sample is replaced with a single instruction that rounds two 64-bit accumulators. WIP - Currently the MCPS saving with the FIR EQ and TDFB components seems much smaller, only 0.2 MCPS. Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
1 parent 23c0437 commit e286577

File tree

1 file changed

+28
-45
lines changed

1 file changed

+28
-45
lines changed

src/math/fir_hifi5.c

Lines changed: 28 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,6 @@ void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift,
8585
}
8686
EXPORT_SYMBOL(fir_get_lrshifts);
8787

88-
/* HiFi EP has the follow number of reqisters that should not be exceeded
89-
* 4x 56 bit registers in register file Q
90-
* 8x 48 bit registers in register file P
91-
*/
92-
9388
void fir_32x16(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, int shift)
9489
{
9590
/* This function uses
@@ -163,31 +158,26 @@ void fir_32x16(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, int shift)
163158
}
164159
EXPORT_SYMBOL(fir_32x16);
165160

166-
/* HiFi EP has the follow number of reqisters that should not be exceeded
167-
* 4x 56 bit registers in register file Q
168-
* 8x 48 bit registers in register file P
169-
*/
170-
171161
void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1,
172162
ae_int32 *y0, ae_int32 *y1, int shift)
173163
{
174164
/* This function uses
175-
* 2x 56 bit registers Q,
176-
* 4x 48 bit registers P
165+
* 7x 64 bit AE registers
177166
* 3x integers
178167
* 2x address pointers,
179168
*/
180-
ae_f64 a;
181-
ae_f64 b;
182169
ae_valign u;
170+
ae_f64 a = AE_ZERO64();
171+
ae_f64 b = AE_ZERO64();
183172
ae_f32x2 d0;
184173
ae_f32x2 d1;
174+
ae_f32x2 d2;
185175
ae_f16x4 coefs;
186-
int i;
187176
ae_f32x2 *dp;
188177
ae_f16x4 *coefp = fir->coef;
189178
const int taps_div_4 = fir->taps >> 2;
190179
const int inc = 2 * sizeof(int32_t);
180+
int i;
191181

192182
/* Bypass samples if taps count is zero. */
193183
if (!taps_div_4) {
@@ -201,18 +191,11 @@ void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1,
201191
dp = (ae_f32x2 *)fir->rwp;
202192
AE_S32_L_XC(x1, fir->rwp, -sizeof(int32_t));
203193

204-
/* Note: If the next function is converted to handle two samples
205-
* per call the data load can be done with single instruction
206-
* AE_LP24X2F_C(data2, dp, sizeof(ae_p24x2f));
207-
*/
208-
a = AE_ZERO64();
209-
b = AE_ZERO64();
210-
211194
/* Prime the coefficients stream */
212195
u = AE_LA64_PP(coefp);
213196

214-
/* Load two data samples and pack to d0 to data2_h and
215-
* d1 to data2_l.
197+
/* Load the two newest samples first, then proceed
198+
* to older input samples in the delay line.
216199
*/
217200
AE_L32X2_XC(d0, dp, inc);
218201
for (i = 0; i < taps_div_4; i++) {
@@ -222,34 +205,34 @@ void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1,
222205
*/
223206
AE_LA16X4_IP(coefs, u, coefp);
224207

225-
/* Load two data samples. Upper part d1_h is x[n+1] and
226-
* lower part d1_l is x[n].
208+
/* Load two data samples more.
209+
* d0.H is x[n] the newest sample
210+
* d0.L is x[n-1]
211+
* d1.H is x[n-2]
212+
* d1.L is x[n-3]
213+
* d2.H is x[n-4]
227214
*/
228215
AE_L32X2_XC(d1, dp, inc);
216+
AE_L32X2_XC(d2, dp, inc);
229217

230-
/* Quad MAC (HH)
231-
* b += d0_h * coefs_3 + d0_l * coefs_2
232-
* a += d0_l * coefs_3 + d1_h * coefs_2
218+
/* Calculate four FIR taps for current (x1 -> a) and previous input (x0 -> b)
219+
* b = b + d0.H * c.3 + d0.L * c.2 + d1.H * c.1 + d1.L * c.0
220+
* a = a + d0.L * c.3 + d1.H * c.2 + d1.L * c.1 + d2.H * c.0
233221
*/
234-
AE_MULAFD32X16X2_FIR_HH(b, a, d0, d1, coefs);
235-
d0 = d1;
236-
237-
/* Repeat the same for next two taps and increase coefp. */
238-
AE_L32X2_XC(d1, dp, inc);
222+
AE_MULA2Q32X16_FIR_H(b, a, d0, d1, d2, coefs);
239223

240-
/* Quad MAC (HL)
241-
* b += d0_h * coefs_1 + d0_l * coefs_0
242-
* a += d0_l * coefs_1 + d1_h * coefs_0
243-
*/
244-
AE_MULAFD32X16X2_FIR_HL(b, a, d0, d1, coefs);
245-
d0 = d1;
224+
/* Prepare for next four taps, d2 overlaps to next loop iteration as d0 */
225+
d0 = d2;
246226
}
247227

248-
/* Do scaling shifts and store sample. */
249-
b = AE_SLAA64S(b, shift);
250-
a = AE_SLAA64S(a, shift);
251-
AE_S32_L_I(AE_ROUND32F48SSYM(b), (ae_int32 *)y1, 0);
252-
AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y0, 0);
228+
/* Shift left by one to adjust the Q1.31 x Q1.15 -> Q2.46 format
229+
* for the Q2.47 round, then store the output samples.
230+
*/
231+
b = AE_SLAA64S(b, shift + 1);
232+
a = AE_SLAA64S(a, shift + 1);
233+
d0 = AE_ROUND32X2F48SASYM(b, a);
234+
AE_S32_H_I(d0, (ae_int32 *)y1, 0);
235+
AE_S32_L_I(d0, (ae_int32 *)y0, 0);
253236
}
254237
EXPORT_SYMBOL(fir_32x16_2x);
255238

0 commit comments

Comments
 (0)