@@ -101,213 +101,60 @@ void audiomixer_mixer_reset_buffer(audiomixer_mixer_obj_t* self,
101101 }
102102}
103103
104- uint32_t add8signed (uint32_t a , uint32_t b ) {
105- #if (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1 )) //Cortex-M4 w/FPU
106- return __SHADD8 (a , b );
107- #else
108- uint32_t result = 0 ;
109- for (int8_t i = 0 ; i < 4 ; i ++ ) {
110- int8_t ai = a >> (sizeof (int8_t ) * 8 * i );
111- int8_t bi = b >> (sizeof (int8_t ) * 8 * i );
112- int32_t intermediate = (int32_t ) ai + bi / 2 ;
113- if (intermediate > CHAR_MAX ) {
114- intermediate = CHAR_MAX ;
115- } else if (intermediate < CHAR_MIN ) {
116- intermediate = CHAR_MIN ;
117- }
118- result |= ((uint32_t ) intermediate & 0xff ) << (sizeof (int8_t ) * 8 * i );
119- }
120- return result ;
121- #endif
// Combine two packed words with the __QADD16 intrinsic and return the result.
// NOTE(review): __QADD16 is presumably the CMSIS dual 16-bit signed saturating
// add (each 16-bit half added independently) -- confirm against the CMSIS docs.
// The mixing code in this file uses it to accumulate a scaled voice into the
// output buffer word.
104+ __attribute__((always_inline ))
105+ static inline uint32_t add16signed (uint32_t a , uint32_t b ) {
106+ return __QADD16 (a , b );
122107}
123108
124- uint32_t add8unsigned (uint32_t a , uint32_t b ) {
125- #if (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1 )) //Cortex-M4 w/FPU
126- return __UHADD8 (a , b );
127- #else
128- uint32_t result = 0 ;
129- for (int8_t i = 0 ; i < 4 ; i ++ ) {
130- uint8_t ai = (a >> (sizeof (uint8_t ) * 8 * i ));
131- uint8_t bi = (b >> (sizeof (uint8_t ) * 8 * i ));
132- int32_t intermediate = (int32_t ) (ai + bi ) / 2 ;
133- if (intermediate > UCHAR_MAX ) {
134- intermediate = UCHAR_MAX ;
135- }
136- result |= ((uint32_t ) intermediate & 0xff ) << (sizeof (uint8_t ) * 8 * i );
137- }
138- return result ;
139- #endif
// Scale the 16-bit halves of val by mul and return them repacked in one word.
//   val: packed word; smulwb/smulwt select its bottom/top halfword.
//   mul: multiplier; each product is shifted right by 15 bits and saturated
//        to 16 bits by the ssat instructions below.
// NOTE(review): callers in this file pass a uint16_t level; a value >= 0x8000
// would overflow into the sign bit in the shift below -- confirm the range.
109+ __attribute__((always_inline ))
110+ static inline uint32_t mult16signed (uint32_t val , int32_t mul ) {
// Move the multiplier into the upper 16 bits of the 32-bit operand.
111+ mul <<= 16 ;
112+ int32_t hi , lo ;
113+ enum { bits = 16 }; // saturate to 16 bits
114+ enum { shift = 15 }; // ssat shifts right by this many bits before saturating
// Multiply the 32-bit mul by the bottom (smulwb) and top (smulwt) halfword of val.
115+ asm volatile ("smulwb %0, %1, %2" : "=r" (lo ) : "r" (mul ), "r" (val ));
116+ asm volatile ("smulwt %0, %1, %2" : "=r" (hi ) : "r" (mul ), "r" (val ));
// Shift each product right by `shift` and saturate it to `bits` bits.
117+ asm volatile ("ssat %0, %1, %2, asr %3" : "=r" (lo ) : "I" (bits ), "r" (lo ), "I" (shift ));
118+ asm volatile ("ssat %0, %1, %2, asr %3" : "=r" (hi ) : "I" (bits ), "r" (hi ), "I" (shift ));
119+ asm volatile ("pkhbt %0, %1, %2, lsl #16" : "=r" (val ) : "r" (lo ), "r" (hi )); // pack lo into the bottom half, hi into the top half
120+ return val ;
140121}
141122
142- uint32_t add16signed (uint32_t a , uint32_t b ) {
143- #if (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1 )) //Cortex-M4 w/FPU
144- return __SHADD16 (a , b );
145- #else
146- uint32_t result = 0 ;
147- for (int8_t i = 0 ; i < 2 ; i ++ ) {
148- int16_t ai = a >> (sizeof (int16_t ) * 8 * i );
149- int16_t bi = b >> (sizeof (int16_t ) * 8 * i );
150- int32_t intermediate = (int32_t ) ai + bi / 2 ;
151- if (intermediate > SHRT_MAX ) {
152- intermediate = SHRT_MAX ;
153- } else if (intermediate < SHRT_MIN ) {
154- intermediate = SHRT_MIN ;
155- }
156- result |= (((uint32_t ) intermediate ) & 0xffff ) << (sizeof (int16_t ) * 8 * i );
157- }
158- return result ;
159- #endif
// Add 0x80 to every byte of val using the __UADD8 intrinsic.
// NOTE(review): presumably a per-byte wrapping add, which flips the top bit of
// each 8-bit sample (signed -> unsigned offset) -- confirm against CMSIS docs.
123+ static inline uint32_t tounsigned8 (uint32_t val ) {
124+ return __UADD8 (val , 0x80808080 );
160125}
161126
162- uint32_t add16unsigned (uint32_t a , uint32_t b ) {
163- #if (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1 )) //Cortex-M4 w/FPU
164- return __UHADD16 (a , b );
165- #else
166- uint32_t result = 0 ;
167- for (int8_t i = 0 ; i < 2 ; i ++ ) {
168- int16_t ai = (a >> (sizeof (uint16_t ) * 8 * i )) - 0x8000 ;
169- int16_t bi = (b >> (sizeof (uint16_t ) * 8 * i )) - 0x8000 ;
170- int32_t intermediate = (int32_t ) ai + bi / 2 ;
171- if (intermediate > USHRT_MAX ) {
172- intermediate = USHRT_MAX ;
173- }
174- result |= ((uint32_t ) intermediate & 0xffff ) << (sizeof (int16_t ) * 8 * i );
175- }
176- return result ;
177- #endif
// Add 0x8000 to both 16-bit halves of val using the __UADD16 intrinsic.
// NOTE(review): presumably a per-halfword wrapping add, which flips the top bit
// of each 16-bit sample (signed -> unsigned offset) -- confirm against CMSIS docs.
127+ static inline uint32_t tounsigned16 (uint32_t val ) {
128+ return __UADD16 (val , 0x80008000 );
178129}
179130
180- static inline uint32_t mult8unsigned (uint32_t val , int32_t mul ) {
181- // if mul == 0, no need in wasting cycles
182- if (mul == 0 ) {
183- return 0 ;
184- }
185- /* TODO: workout ARMv7 instructions
186- #if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
187- return val;
188- #else*/
189- uint32_t result = 0 ;
190- float mod_mul = (float ) mul / (float ) ((1 <<15 )- 1 );
191- for (int8_t i = 0 ; i < 4 ; i ++ ) {
192- uint8_t ai = val >> (sizeof (uint8_t ) * 8 * i );
193- int32_t intermediate = ai * mod_mul ;
194- if (intermediate > SHRT_MAX ) {
195- intermediate = SHRT_MAX ;
196- }
197- result |= ((uint32_t ) intermediate & 0xff ) << (sizeof (uint8_t ) * 8 * i );
198- }
199-
200- return result ;
201- //#endif
202- }
203-
204- static inline uint32_t mult8signed (uint32_t val , int32_t mul ) {
205- // if mul == 0, no need in wasting cycles
206- if (mul == 0 ) {
207- return 0 ;
208- }
209- /* TODO: workout ARMv7 instructions
210- #if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
211- return val;
212- #else
213- */
214- uint32_t result = 0 ;
215- float mod_mul = (float )mul / (float )((1 <<15 )- 1 );
216- for (int8_t i = 0 ; i < 4 ; i ++ ) {
217- int16_t ai = val >> (sizeof (int8_t ) * 8 * i );
218- int32_t intermediate = ai * mod_mul ;
219- if (intermediate > CHAR_MAX ) {
220- intermediate = CHAR_MAX ;
221- } else if (intermediate < CHAR_MIN ) {
222- intermediate = CHAR_MIN ;
223- }
224- result |= (((uint32_t ) intermediate ) & 0xff ) << (sizeof (int16_t ) * 8 * i );
225- }
226- return result ;
227- //#endif
// Add 0x8000 to both 16-bit halves of val using the __UADD16 intrinsic.
// The body is the same as tounsigned16: if the add wraps modulo 2^16, adding
// 0x8000 only toggles the top bit, so one operation converts in either
// direction. NOTE(review): confirm __UADD16 wraps rather than saturates.
131+ static inline uint32_t tosigned16 (uint32_t val ) {
132+ return __UADD16 (val , 0x80008000 );
228133}
229134
230- //TODO:
231- static inline uint32_t mult16unsigned (uint32_t val , int32_t mul ) {
232- // if mul == 0, no need in wasting cycles
233- if (mul == 0 ) {
234- return 0 ;
235- }
236- /* TODO: the below ARMv7m instructions "work", but the amplitude is much higher/louder
237- #if (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) //Cortex-M4 w/FPU
238- // there is no unsigned equivalent to the 'SMULWx' ARMv7 Thumb function,
239- // so we have to do it by hand.
240- uint32_t lo = val & 0xffff;
241- uint32_t hi = val >> 16;
242- //mp_printf(&mp_plat_print, "pre-asm: (mul: %d)\n\tval: %x\tlo: %x\thi: %x\n", mul, val, lo, hi);
243- uint32_t val_lo;
244- asm volatile("mul %0, %1, %2" : "=r" (val_lo) : "r" (mul), "r" (lo));
245- asm volatile("mla %0, %1, %2, %3" : "=r" (val) : "r" (mul), "r" (hi), "r" (val_lo));
246- //mp_printf(&mp_plat_print, "post-asm:\n\tval: %x\tlo: %x\n\n", val, val_lo);
247- return val;
248- #else
249- */
250- uint32_t result = 0 ;
251- float mod_mul = (float )mul / (float )((1 <<15 )- 1 );
252- for (int8_t i = 0 ; i < 2 ; i ++ ) {
253- int16_t ai = (val >> (sizeof (uint16_t ) * 8 * i )) - 0x8000 ;
254- int32_t intermediate = ai * mod_mul ;
255- if (intermediate > SHRT_MAX ) {
256- intermediate = SHRT_MAX ;
257- } else if (intermediate < SHRT_MIN ) {
258- intermediate = SHRT_MIN ;
259- }
260- result |= (((uint32_t ) intermediate ) + 0x8000 ) << (sizeof (int16_t ) * 8 * i );
261- }
262- return result ;
263- //#endif
// Spread the two bytes of val into the upper byte of each 16-bit half of a word:
// the high byte of val lands in bits 31..24 and the low byte in bits 15..8.
// The remaining bits of the result are zero.
135+ static inline uint32_t unpack8 (uint16_t val ) {
136+ return ((val & 0xff00 ) << 16 ) | ((val & 0x00ff ) << 8 );
264137}
265138
266- static inline uint32_t mult16signed (uint32_t val , int32_t mul ) {
267- // if mul == 0, no need in wasting cycles
268- if (mul == 0 ) {
269- return 0 ;
270- }
271- #if (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1 )) //Cortex-M4 w/FPU
272- int32_t hi , lo ;
273- enum { bits = 16 }; // saturate to 16 bits
274- enum { shift = 0 }; // shift is done automatically
275- asm volatile ("smulwb %0, %1, %2" : "=r" (lo ) : "r" (mul ), "r" (val ));
276- asm volatile ("smulwt %0, %1, %2" : "=r" (hi ) : "r" (mul ), "r" (val ));
277- asm volatile ("ssat %0, %1, %2, asr %3" : "=r" (lo ) : "I" (bits ), "r" (lo ), "I" (shift ));
278- asm volatile ("ssat %0, %1, %2, asr %3" : "=r" (hi ) : "I" (bits ), "r" (hi ), "I" (shift ));
279- asm volatile ("pkhbt %0, %1, %2, lsl #16" : "=r" (val ) : "r" (lo ), "r" (hi )); // pack
280- return val ;
281- #else
282- uint32_t result = 0 ;
283- float mod_mul = (float )mul / (float )((1 <<15 )- 1 );
284- for (int8_t i = 0 ; i < 2 ; i ++ ) {
285- int16_t ai = val >> (sizeof (int16_t ) * 8 * i );
286- int32_t intermediate = ai * mod_mul ;
287- if (intermediate > SHRT_MAX ) {
288- intermediate = SHRT_MAX ;
289- } else if (intermediate < SHRT_MIN ) {
290- intermediate = SHRT_MIN ;
291- }
292- result |= (((uint32_t ) intermediate ) & 0xffff ) << (sizeof (int16_t ) * 8 * i );
293- }
294- return result ;
295- #endif
// Inverse of the unpack8 layout: take the upper byte of each 16-bit half of val
// (bits 31..24 and bits 15..8) and place them in bits 15..8 and bits 7..0 of
// the result. All other bits of val are discarded.
139+ static inline uint32_t pack8 (uint32_t val ) {
140+ return ((val & 0xff000000 ) >> 16 ) | ((val & 0xff00 ) >> 8 );
296141}
297142
// Branch hints: wrap a condition in __builtin_expect, normalising it to 0/1
// with !! and marking it as expected true (LIKELY) or expected false (UNLIKELY).
143+ #define LIKELY (x ) (__builtin_expect(!!(x), 1))
144+ #define UNLIKELY (x ) (__builtin_expect(!!(x), 0))
298145static void mix_one_voice (audiomixer_mixer_obj_t * self ,
299146 audiomixer_mixervoice_obj_t * voice , bool voices_active ,
300147 uint32_t * word_buffer , uint32_t length ) {
301- uint32_t j = 0 ;
302148 bool voice_done = voice -> sample == NULL ;
303- for ( uint32_t i = 0 ; i < length ; i ++ ) {
304- if (! voice_done && j >= voice -> buffer_length ) {
149+ while (! voice_done && length ! = 0 ) {
150+ if (voice -> buffer_length == 0 ) {
305151 if (!voice -> more_data ) {
306152 if (voice -> loop ) {
307153 audiosample_reset_buffer (voice -> sample , false, 0 );
308154 } else {
309155 voice -> sample = NULL ;
310156 voice_done = true;
157+ break ;
311158 }
312159 }
313160 if (!voice_done ) {
@@ -316,64 +163,81 @@ static void mix_one_voice(audiomixer_mixer_obj_t* self,
316163 // Track length in terms of words.
317164 voice -> buffer_length /= sizeof (uint32_t );
318165 voice -> more_data = result == GET_BUFFER_MORE_DATA ;
319- j = 0 ;
320166 }
321167 }
168+
169+ uint32_t n = MIN (voice -> buffer_length , length );
170+ uint32_t * src = voice -> remaining_buffer ;
171+ uint16_t level = voice -> level ;
172+
322173 // First active voice gets copied over verbatim.
323- uint32_t sample_value ;
324- if (voice_done ) {
325- // Exit early if another voice already set all samples once.
326- if (voices_active ) {
327- continue ;
328- }
329- sample_value = 0 ;
330- if (!self -> samples_signed ) {
331- if (self -> bits_per_sample == 8 ) {
332- sample_value = 0x7f7f7f7f ;
174+ if (!voices_active ) {
175+ if (LIKELY (self -> bits_per_sample == 16 )) {
176+ if (LIKELY (self -> samples_signed )) {
177+ for (uint32_t i = 0 ; i < n ; i ++ ) {
178+ uint32_t v = src [i ];
179+ word_buffer [i ] = mult16signed (v , level );
180+ }
333181 } else {
334- sample_value = 0x7fff7fff ;
182+ for (uint32_t i = 0 ; i < n ; i ++ ) {
183+ uint32_t v = src [i ];
184+ v = tosigned16 (v );
185+ word_buffer [i ] = mult16signed (v , level );
186+ }
335187 }
336- }
337- } else {
338- sample_value = voice -> remaining_buffer [j ];
339- }
340-
341- // apply the mixer level
342- if (!self -> samples_signed ) {
343- if (self -> bits_per_sample == 8 ) {
344- sample_value = mult8unsigned (sample_value , voice -> level );
345- } else {
346- sample_value = mult16unsigned (sample_value , voice -> level );
347- }
348- } else {
349- if (self -> bits_per_sample == 8 ) {
350- sample_value = mult8signed (sample_value , voice -> level );
351188 } else {
352- sample_value = mult16signed (sample_value , voice -> level );
189+ uint16_t * hword_buffer = (uint16_t * )word_buffer ;
190+ uint16_t * hsrc = (uint16_t * )src ;
191+ for (uint32_t i = 0 ; i < n * 2 ; i ++ ) {
192+ uint32_t word = unpack8 (hsrc [i ]);
193+ if (LIKELY (!self -> samples_signed )) {
194+ word = tosigned16 (word );
195+ }
196+ word = mult16signed (word , level );
197+ hword_buffer [i ] = pack8 (word );
198+ }
353199 }
354- }
355-
356- if (!voices_active ) {
357- word_buffer [i ] = sample_value ;
358200 } else {
359- if (self -> bits_per_sample == 8 ) {
360- if (self -> samples_signed ) {
361- word_buffer [i ] = add8signed (word_buffer [i ], sample_value );
201+ if (LIKELY (self -> bits_per_sample == 16 )) {
202+ if (LIKELY (self -> samples_signed )) {
203+ for (uint32_t i = 0 ; i < n ; i ++ ) {
204+ uint32_t word = src [i ];
205+ word_buffer [i ] = add16signed (mult16signed (word , level ), word_buffer [i ]);
206+ }
362207 } else {
363- word_buffer [i ] = add8unsigned (word_buffer [i ], sample_value );
208+ for (uint32_t i = 0 ; i < n ; i ++ ) {
209+ uint32_t word = src [i ];
210+ word = tosigned16 (word );
211+ word_buffer [i ] = add16signed (mult16signed (word , level ), word_buffer [i ]);
212+ }
364213 }
365214 } else {
366- if (self -> samples_signed ) {
367- word_buffer [i ] = add16signed (word_buffer [i ], sample_value );
368- } else {
369- word_buffer [i ] = add16unsigned (word_buffer [i ], sample_value );
215+ uint16_t * hword_buffer = (uint16_t * )word_buffer ;
216+ uint16_t * hsrc = (uint16_t * )src ;
217+ for (uint32_t i = 0 ; i < n * 2 ; i ++ ) {
218+ uint32_t word = unpack8 (hsrc [i ]);
219+ if (LIKELY (!self -> samples_signed )) {
220+ word = tosigned16 (word );
221+ }
222+ word = mult16signed (word , level );
223+ word = add16signed (word , unpack8 (hword_buffer [i ]));
224+ hword_buffer [i ] = pack8 (word );
370225 }
371226 }
372227 }
373- j ++ ;
228+ length -= n ;
229+ word_buffer += n ;
230+ voice -> remaining_buffer += n ;
231+ voice -> buffer_length -= n ;
232+ }
233+
234+ if (length && !voices_active ) {
235+ uint32_t sample_value = self -> bits_per_sample == 8
236+ ? 0x80808080 : 0x80008000 ;
237+ for (uint32_t i = 0 ; i < length ; i ++ ) {
238+ word_buffer [i ] = sample_value ;
239+ }
374240 }
375- voice -> buffer_length -= j ;
376- voice -> remaining_buffer += j ;
377241}
378242
379243audioio_get_buffer_result_t audiomixer_mixer_get_buffer (audiomixer_mixer_obj_t * self ,
@@ -403,13 +267,27 @@ audioio_get_buffer_result_t audiomixer_mixer_get_buffer(audiomixer_mixer_obj_t*
403267 }
404268 self -> use_first_buffer = !self -> use_first_buffer ;
405269 bool voices_active = false;
270+ uint32_t length = self -> len / sizeof (uint32_t );
271+
406272 for (int32_t v = 0 ; v < self -> voice_count ; v ++ ) {
407273 audiomixer_mixervoice_obj_t * voice = MP_OBJ_TO_PTR (self -> voice [v ]);
408274
409- mix_one_voice (self , voice , voices_active , word_buffer , self -> len / sizeof ( uint32_t ) );
275+ mix_one_voice (self , voice , voices_active , word_buffer , length );
410276 voices_active = true;
411277 }
412278
279+ if (!self -> samples_signed ) {
280+ if (self -> bits_per_sample == 16 ) {
281+ for (uint32_t i = 0 ; i < length ; i ++ ) {
282+ word_buffer [i ] = tounsigned16 (word_buffer [i ]);
283+ }
284+ } else {
285+ for (uint32_t i = 0 ; i < length ; i ++ ) {
286+ word_buffer [i ] = tounsigned8 (word_buffer [i ]);
287+ }
288+ }
289+ }
290+
413291 self -> read_count += 1 ;
414292 } else if (!self -> use_first_buffer ) {
415293 * buffer = (uint8_t * ) self -> first_buffer ;
0 commit comments