@@ -100,11 +100,16 @@ void asrc_fir_filter16(struct asrc_farrow *src_obj, int16_t **output_buffers,
100100void asrc_fir_filter32 (struct asrc_farrow * src_obj , int32_t * * output_buffers ,
101101 int index_output_frame )
102102{
103- ae_f32x2 prod ;
104- ae_f32x2 buffer01 = AE_ZERO32 (); /* Note: Init is not needed */
105- ae_f32x2 filter01 = AE_ZERO32 (); /* Note: Init is not needed */
106- ae_f32x2 * filter_p ;
107- ae_f32x2 * buffer_p ;
103+ ae_valignx2 align_filter ;
104+ ae_valignx2 align_buffer ;
105+ ae_f64 prod0 ;
106+ ae_f64 prod1 ;
107+ ae_f32x2 buffer23 ;
108+ ae_f32x2 buffer01 ;
109+ ae_f32x2 filter01 ;
110+ ae_f32x2 filter23 ;
111+ const ae_int32x4 * filter_p ;
112+ const ae_int32x4 * buffer_p ;
108113 int n_limit ;
109114 int ch ;
110115 int n ;
@@ -115,7 +120,7 @@ void asrc_fir_filter32(struct asrc_farrow *src_obj, int32_t **output_buffers,
115120 * 'n_limit' is therefore stored to reduce redundant
116121 * calculations. Also handle possible interleaved output.
117122 */
118- n_limit = src_obj -> filter_length >> 1 ;
123+ n_limit = src_obj -> filter_length >> 2 ;
119124 if (src_obj -> output_format == ASRC_IOF_INTERLEAVED )
120125 i = src_obj -> num_channels * index_output_frame ;
121126 else
@@ -124,55 +129,41 @@ void asrc_fir_filter32(struct asrc_farrow *src_obj, int32_t **output_buffers,
124129 /* Iterate over each channel */
125130 for (ch = 0 ; ch < src_obj -> num_channels ; ch ++ ) {
126131 /* Pointer to the beginning of the impulse response */
127- filter_p = (ae_f32x2 * )& src_obj -> impulse_response [0 ];
132+ filter_p = (ae_int32x4 * )& src_obj -> impulse_response [0 ];
128133
129134 /* Pointer to the buffered input data */
130135 buffer_p =
131- (ae_f32x2 * )& src_obj -> ring_buffers32 [ch ]
136+ (ae_int32x4 * )& src_obj -> ring_buffers32 [ch ]
132137 [src_obj -> buffer_write_position ];
133138
134139 /* Allows unaligned load of 128 bit per cycle */
135- ae_valign align_filter = AE_LA64_PP (filter_p );
136- ae_valign align_buffer = AE_LA64_PP (buffer_p );
140+ align_filter = AE_LA128_PP (filter_p );
141+ align_buffer = AE_LA128_PP (buffer_p );
137142
138- /* Initialise the accumulator */
139- prod = AE_ZERO32 ();
143+ /* Initialise the accumulators */
144+ prod0 = AE_ZERO64 ();
145+ prod1 = AE_ZERO64 ();
140146
141147 /* Iterate over the filter bins */
142148 for (n = 0 ; n < n_limit ; n ++ ) {
143- /* Read two buffered samples at once */
144- AE_LA32X2_IP (buffer01 , align_buffer , buffer_p );
149+ /* Read four buffered samples */
150+ AE_LA32X2X2_IP (buffer01 , buffer23 , align_buffer , buffer_p );
145151
146- /* Store two bins of the impulse response */
147- AE_LA32X2_IP (filter01 , align_filter , filter_p );
152+ /* Load four coefficients of the impulse response */
153+ AE_LA32X2X2_IP (filter01 , filter23 , align_filter , filter_p );
148154
149155 /* Multiply and accumulate */
150- AE_MULAFP32X2RS ( prod , buffer01 , filter01 );
156+ AE_MULAAF2D32RA_HH_LL ( prod0 , prod1 , buffer01 , buffer23 , filter01 , filter23 );
151157 }
152158
153- /* Shift left after accumulation, because interim
154- * results might saturate during filtering prod = prod
155- * << 1; will shift after last addition
156- */
157-
158- /* swap LL and HH reusing filter01 to perform
159- * saturated addition of both halves
160- */
161- filter01 = AE_SEL32_LH (prod , prod );
162-
163- /* Add up the lower and upper 32 bit data of the
164- * 'prod' prod = AE_ADD32_HL_LH(prod, prod); fix using
165- * saturated addition
166- */
167- prod = AE_ADD32S (prod , filter01 );
159+ /* Add up the two accumulators */
160+ prod0 = AE_ADD64S (prod0 , prod1 );
168161
169162 /* Shift with saturation */
170- prod = AE_SLAI32S (prod , 1 );
163+ buffer01 = AE_SLAI32S (AE_ROUND32F48SASYM ( prod0 ) , 1 );
171164
172- /* Store 'prod' in (de-)interleaved format in the output
173- * buffers
174- */
175- AE_S32_L_X (prod , (ae_f32 * )& output_buffers [ch ][i ], 0 );
165+ /* Store 'buffer01' in (de-)interleaved format in the output buffers */
166+ AE_S32_L_X (buffer01 , (ae_f32 * )& output_buffers [ch ][i ], 0 );
176167 }
177168}
178169
0 commit comments