Skip to content

Commit 5bd1b45

Browse files
committed
Audio: ASRC: Optimize for HiFi5 function asrc_fir_filter32()
This change improves the efficiency of the FIR filter computation. The FIR calculation is unrolled by four. The coefficient and delay line reads are changed to 128-bit reads of four int32_t values. The MAC instruction is changed from dual-MAC to quad-MAC. The FIR accuracy improves slightly because the accumulation uses the internal Q17.47 format instead of Q1.31. The saving is 1.7 MCPS, from 27.2 to 25.5 MCPS, with 32-bit 44.1 to 48.8 kHz stereo push mode ASRC.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
1 parent 3df6068 commit 5bd1b45

1 file changed

Lines changed: 28 additions & 37 deletions

File tree

src/audio/asrc/asrc_farrow_hifi5.c

Lines changed: 28 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,16 @@ void asrc_fir_filter16(struct asrc_farrow *src_obj, int16_t **output_buffers,
100100
void asrc_fir_filter32(struct asrc_farrow *src_obj, int32_t **output_buffers,
101101
int index_output_frame)
102102
{
103-
ae_f32x2 prod;
104-
ae_f32x2 buffer01 = AE_ZERO32(); /* Note: Init is not needed */
105-
ae_f32x2 filter01 = AE_ZERO32(); /* Note: Init is not needed */
106-
ae_f32x2 *filter_p;
107-
ae_f32x2 *buffer_p;
103+
ae_valignx2 align_filter;
104+
ae_valignx2 align_buffer;
105+
ae_f64 prod0;
106+
ae_f64 prod1;
107+
ae_f32x2 buffer23;
108+
ae_f32x2 buffer01;
109+
ae_f32x2 filter01;
110+
ae_f32x2 filter23;
111+
const ae_int32x4 *filter_p;
112+
const ae_int32x4 *buffer_p;
108113
int n_limit;
109114
int ch;
110115
int n;
@@ -115,7 +120,7 @@ void asrc_fir_filter32(struct asrc_farrow *src_obj, int32_t **output_buffers,
115120
* 'n_limit' is therefore stored to reduce redundant
116121
* calculations. Also handle possible interleaved output.
117122
*/
118-
n_limit = src_obj->filter_length >> 1;
123+
n_limit = src_obj->filter_length >> 2;
119124
if (src_obj->output_format == ASRC_IOF_INTERLEAVED)
120125
i = src_obj->num_channels * index_output_frame;
121126
else
@@ -124,55 +129,41 @@ void asrc_fir_filter32(struct asrc_farrow *src_obj, int32_t **output_buffers,
124129
/* Iterate over each channel */
125130
for (ch = 0; ch < src_obj->num_channels; ch++) {
126131
/* Pointer to the beginning of the impulse response */
127-
filter_p = (ae_f32x2 *)&src_obj->impulse_response[0];
132+
filter_p = (ae_int32x4 *)&src_obj->impulse_response[0];
128133

129134
/* Pointer to the buffered input data */
130135
buffer_p =
131-
(ae_f32x2 *)&src_obj->ring_buffers32[ch]
136+
(ae_int32x4 *)&src_obj->ring_buffers32[ch]
132137
[src_obj->buffer_write_position];
133138

134139
/* Allows unaligned load of 128 bit per cycle */
135-
ae_valign align_filter = AE_LA64_PP(filter_p);
136-
ae_valign align_buffer = AE_LA64_PP(buffer_p);
140+
align_filter = AE_LA128_PP(filter_p);
141+
align_buffer = AE_LA128_PP(buffer_p);
137142

138-
/* Initialise the accumulator */
139-
prod = AE_ZERO32();
143+
/* Initialise the accumulators */
144+
prod0 = AE_ZERO64();
145+
prod1 = AE_ZERO64();
140146

141147
/* Iterate over the filter bins */
142148
for (n = 0; n < n_limit; n++) {
143-
/* Read two buffered samples at once */
144-
AE_LA32X2_IP(buffer01, align_buffer, buffer_p);
149+
/* Read four buffered samples */
150+
AE_LA32X2X2_IP(buffer01, buffer23, align_buffer, buffer_p);
145151

146-
/* Store two bins of the impulse response */
147-
AE_LA32X2_IP(filter01, align_filter, filter_p);
152+
/* Load four coefficients of the impulse response */
153+
AE_LA32X2X2_IP(filter01, filter23, align_filter, filter_p);
148154

149155
/* Multiply and accumulate */
150-
AE_MULAFP32X2RS(prod, buffer01, filter01);
156+
AE_MULAAF2D32RA_HH_LL(prod0, prod1, buffer01, buffer23, filter01, filter23);
151157
}
152158

153-
/* Shift left after accumulation, because interim
154-
* results might saturate during filtering prod = prod
155-
* << 1; will shift after last addition
156-
*/
157-
158-
/* swap LL and HH reusing filter01 to perform
159-
* saturated addition of both halves
160-
*/
161-
filter01 = AE_SEL32_LH(prod, prod);
162-
163-
/* Add up the lower and upper 32 bit data of the
164-
* 'prod' prod = AE_ADD32_HL_LH(prod, prod); fix using
165-
* saturated addition
166-
*/
167-
prod = AE_ADD32S(prod, filter01);
159+
/* Add up the two accumulators */
160+
prod0 = AE_ADD64S(prod0, prod1);
168161

169162
/* Shift with saturation */
170-
prod = AE_SLAI32S(prod, 1);
163+
buffer01 = AE_SLAI32S(AE_ROUND32F48SASYM(prod0), 1);
171164

172-
/* Store 'prod' in (de-)interleaved format in the output
173-
* buffers
174-
*/
175-
AE_S32_L_X(prod, (ae_f32 *)&output_buffers[ch][i], 0);
165+
/* Store 'buffer01' in (de-)interleaved format in the output buffers */
166+
AE_S32_L_X(buffer01, (ae_f32 *)&output_buffers[ch][i], 0);
176167
}
177168
}
178169

0 commit comments

Comments
 (0)