// SPDX-License-Identifier: BSD-3-Clause // // Copyright(c) 2016 Intel Corporation. All rights reserved. // // Author: Seppo Ingalsuo /* HiFi3 optimized code parts for SRC */ #include "src_config.h" #if SRC_HIFI3 #include "src.h" #include #include #include #include /* HiFi3 has * 16x 64 bit registers in register file AE_DR */ #if SRC_SHORT /* 16 bit coefficients version */ static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, const int taps_div_4, const int shift, const int nch) { /* This function uses * 6x 64 bit registers * 3x integers * 5x address pointers, */ ae_f64 a0; ae_f64 a1; ae_valign u; ae_f16x4 coef4; ae_f32x2 d0; ae_f32x2 d1; ae_f32x2 data2; ae_f16x4 *coefp; ae_f32x2 *dp; ae_f32 *dp0; ae_f32 *dp1; int i; int j; ae_f32 *wp = wp0; const int inc = nch * sizeof(int32_t); if (nch == 2) { /* Move data pointer back by one sample to start from right * channel sample. Discard read value p0. */ dp = (ae_f32x2 *)rp; AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32)); /* Reset coefficient pointer and clear accumulator */ coefp = (ae_f16x4 *)cp; a0 = AE_ZERO64(); a1 = AE_ZERO64(); /* Compute FIR filter for current channel with four * taps per every loop iteration. Four coefficients * are loaded simultaneously. Data is read * from interleaved buffer with stride of channels * count. */ for (i = 0; i < taps_div_4; i++) { /* Load four coefficients */ AE_LA16X4_IP(coef4, u, coefp); /* Load two data samples from two channels */ AE_L32X2_XC(d0, dp, inc); /* r0, l0 */ AE_L32X2_XC(d1, dp, inc); /* r1, l1 */ /* Select to data2 sequential samples from a channel * and then accumulate to a0 and a1 * data2_h * coef4_3 + data2_l * coef4_2. * The data is 32 bits Q1.31 and coefficient 16 bits * Q1.15. The accumulators are Q17.47. */ data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */ AE_MULAAFD32X16_H3_L2(a0, data2, coef4); data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */ AE_MULAAFD32X16_H3_L2(a1, data2, coef4); /* Load two data samples from two channels */ AE_L32X2_XC(d0, dp, inc); /* r2, l2 */ AE_L32X2_XC(d1, dp, inc); /* r3, l3 */ /* Accumulate * data2_h * coef4_1 + data2_l * coef4_0. */ data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */ AE_MULAAFD32X16_H1_L0(a0, data2, coef4); data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */ AE_MULAAFD32X16_H1_L0(a1, data2, coef4); } /* Scale FIR output with right shifts, round/saturate * to Q1.31, and store 32 bit output. */ AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, sizeof(int32_t)); AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp, sizeof(int32_t)); return; } dp1 = (ae_f32 *)rp; for (j = 0; j < nch; j++) { /* Copy pointer and advance to next ch with dummy load */ dp0 = dp1; AE_L32_XC(d0, dp1, -sizeof(ae_f32)); /* Reset coefficient pointer and clear accumulator */ coefp = (ae_f16x4 *)cp; a0 = AE_ZERO64(); /* Compute FIR filter for current channel with four * taps per every loop iteration. Data is read from * interleaved buffer with stride of channels count. */ for (i = 0; i < taps_div_4; i++) { /* Load four coefficients */ AE_LA16X4_IP(coef4, u, coefp); /* Load two data samples, place to high and * low of data2. */ AE_L32_XC(d0, dp0, inc); AE_L32_XC(d1, dp0, inc); data2 = AE_SEL32_LL(d0, d1); /* Accumulate * data2_h * coef4_3 + data2_l* coef4_2. * The data is 32 bits Q1.31 and coefficient 16 bits * Q1.15. The accumulator is Q17.47. */ AE_MULAAFD32X16_H3_L2(a0, data2, coef4); /* Repeat with next two samples */ AE_L32_XC(d0, dp0, inc); AE_L32_XC(d1, dp0, inc); data2 = AE_SEL32_LL(d0, d1); /* Accumulate * data2_h * coef4_1 + data2_l * coef4_0. */ AE_MULAAFD32X16_H1_L0(a0, data2, coef4); } /* Scale FIR output with right shifts, round/saturate Q17.47 * to Q1.31, and store 32 bit output. Advance write * pointer to next sample. */ AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, sizeof(int32_t)); } } #else /* 32bit coefficients version */ static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, const int taps_div_4, const int shift, const int nch) { /* This function uses * 6x 64 bit registers * 3x integers * 5x address pointers, */ ae_f64 a0; ae_f64 a1; ae_f24x2 data2 = AE_ZERO24(); ae_f24x2 coef2 = AE_ZERO24(); ae_f24x2 d0 = AE_ZERO24(); ae_f24x2 d1 = AE_ZERO24(); ae_f24x2 *coefp; ae_f24x2 *dp; ae_f24 *dp1; ae_f24 *dp0; int i; int j; ae_f32 *wp = wp0; const int inc = nch * sizeof(int32_t); if (nch == 2) { /* Move data pointer back by one sample to start from right * channel sample. Discard read value p0. */ dp = (ae_f24x2 *)rp; AE_L32F24_XC(d0, (ae_f24 *)dp, -sizeof(ae_f24)); /* Reset coefficient pointer and clear accumulator */ coefp = (ae_f24x2 *)cp; a0 = AE_ZERO64(); a1 = AE_ZERO64(); /* Compute FIR filter for current channel with four * taps per every loop iteration. Two coefficients * are loaded simultaneously. Data is read * from interleaved buffer with stride of channels * count. */ for (i = 0; i < taps_div_4; i++) { /* Load two coefficients. Coef2_h contains tap *coefp * and coef2_l contains the next tap. */ /* TODO: Ensure coefficients are 64 bits aligned */ AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2)); /* Load two data samples from two channels */ AE_L32X2F24_XC(d0, dp, inc); /* r0, l0 */ AE_L32X2F24_XC(d1, dp, inc); /* r1, l1 */ /* Select to d0 successive left channel samples, to d1 * successive right channel samples. Then Accumulate * to a0 and a1 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31 * data and Q1.15 coefficients are used as 24 bits as * Q1.23 values. */ data2 = AE_SELP24_LL(d0, d1); AE_MULAAFP24S_HH_LL(a0, data2, coef2); data2 = AE_SELP24_HH(d0, d1); AE_MULAAFP24S_HH_LL(a1, data2, coef2); /* Repeat for next two taps */ AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2)); AE_L32X2F24_XC(d0, dp, inc); /* r2, l2 */ AE_L32X2F24_XC(d1, dp, inc); /* r3, l3 */ data2 = AE_SELP24_LL(d0, d1); AE_MULAAFP24S_HH_LL(a0, data2, coef2); data2 = AE_SELP24_HH(d0, d1); AE_MULAAFP24S_HH_LL(a1, data2, coef2); } /* Scale FIR output with right shifts, round/saturate * to Q1.31, and store 32 bit output. */ AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, sizeof(int32_t)); AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp, sizeof(int32_t)); return; } dp1 = (ae_f24 *)rp; for (j = 0; j < nch; j++) { /* Copy pointer and advance to next ch with dummy load */ dp0 = dp1; AE_L32F24_XC(data2, dp1, -sizeof(ae_f24)); /* Reset coefficient pointer and clear accumulator */ coefp = (ae_f24x2 *)cp; a0 = AE_ZERO64(); /* Compute FIR filter for current channel with four * taps per every loop iteration. Data is read from * interleaved buffer with stride of channels count. */ for (i = 0; i < taps_div_4; i++) { /* Load two coefficients */ coef2 = *coefp++; /* Load two data samples, place to high and * low of data2. */ AE_L32F24_XC(d0, dp0, inc); AE_L32F24_XC(d1, dp0, inc); data2 = AE_SELP24_LL(d0, d1); /* Accumulate to data2_h * coef2_h + * data2_l*coef2_l. The Q1.31 bit data is used * as Q1.23 from MSB side bits of the 32 bit * word. The accumulator m is Q17.47. */ AE_MULAAFD24_HH_LL(a0, data2, coef2); /* Repeat the same for next two filter taps */ coef2 = *coefp++; AE_L32F24_XC(d0, dp0, inc); AE_L32F24_XC(d1, dp0, inc); data2 = AE_SELP24_LL(d0, d1); AE_MULAAFD24_HH_LL(a0, data2, coef2); } /* Scale FIR output with right shifts, round/saturate Q17.47 * to Q1.31, and store 32 bit output. Advance write * pointer to next sample. */ AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, sizeof(int32_t)); } } #endif /* 32bit coefficients version */ #if CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE void src_polyphase_stage_cir(struct src_stage_prm *s) { /* This function uses * 1x 64 bit registers * 16x integers * 11x address pointers, */ ae_int32x2 q = AE_ZERO32(); ae_f32 *rp; ae_f32 *wp; int i; int n; int m; int n_wrap_buf; int n_min; struct src_state *fir = s->state; struct src_stage *cfg = s->stage; int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size]; int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size]; const void *cp; /* Can be int32_t or int16_t */ const size_t out_size = fir->out_delay_size * sizeof(int32_t); const int nch = s->nch; const int nch_x_odm = cfg->odm * nch; const int blk_in_words = nch * cfg->blk_in; const int blk_out_words = nch * cfg->num_of_subfilters; const int sz = sizeof(int32_t); const int n_sz = -sizeof(int32_t); const int rewind_sz = sz * nch * (cfg->blk_in + (cfg->num_of_subfilters - 1) * cfg->idm); const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t); const int taps_div_4 = cfg->subfilter_length >> 2; int32_t *x_rptr = (int32_t *)s->x_rptr; int32_t *y_wptr = (int32_t *)s->y_wptr; int32_t *x_end_addr = (int32_t *)s->x_end_addr; int32_t *y_end_addr = (int32_t *)s->y_end_addr; #if SRC_SHORT const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t); #else const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t); #endif for (n = 0; n < s->times; n++) { /* Input data to filter */ m = blk_in_words; /* Setup circular buffer for FIR input data delay */ AE_SETCBEGIN0(fir->fir_delay); AE_SETCEND0(fir_end); while (m > 0) { /* Number of words until circular wrap */ n_wrap_buf = x_end_addr - x_rptr; n_min = (m < n_wrap_buf) ? m : n_wrap_buf; m -= n_min; for (i = 0; i < n_min; i++) { /* Load 32 bits sample to accumulator, * advance pointer, shift left with saturate. */ AE_L32_XP(q, (ae_int32 *)x_rptr, sz); q = AE_SLAA32(q, s->shift); /* Store to circular buffer, advance pointer */ AE_S32_L_XC(q, (ae_int32 *)fir->fir_wp, n_sz); } /* Check for wrap */ src_inc_wrap(&x_rptr, x_end_addr, s->x_size); } /* Do filter */ cp = cfg->coefs; /* Reset to 1st coefficient */ rp = (ae_f32 *)fir->fir_wp; /* Do circular modification to pointer rp by amount of * rewind to to data start. Loaded value q is discarded. */ AE_L32_XC(q, rp, rewind_sz); /* Reset FIR write pointer and compute all polyphase * sub-filters. */ wp = (ae_f32 *)fir->out_rp; for (i = 0; i < cfg->num_of_subfilters; i++) { fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch); wp += nch_x_odm; cp = (char *)cp + subfilter_size; src_inc_wrap((int32_t **)&wp, out_delay_end, out_size); /* Circular advance pointer rp by number of * channels x input delay multiplier. Loaded value q * is discarded. */ AE_L32_XC(q, rp, nch_x_idm_sz); } /* Output */ /* Setup circular buffer for SRC out delay access */ AE_SETCBEGIN0(fir->out_delay); AE_SETCEND0(out_delay_end); m = blk_out_words; while (m > 0) { n_wrap_buf = y_end_addr - y_wptr; n_min = (m < n_wrap_buf) ? m : n_wrap_buf; m -= n_min; for (i = 0; i < n_min; i++) { /* Circular load, shift right, linear store, * and advance read and write pointers. */ AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz); q = AE_SRAA32(q, s->shift); AE_S32_L_XP(q, (ae_int32 *)y_wptr, sz); } /* Check wrap */ src_inc_wrap(&y_wptr, y_end_addr, s->y_size); } } s->x_rptr = x_rptr; s->y_wptr = y_wptr; } #endif /* CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE */ #if CONFIG_FORMAT_S16LE void src_polyphase_stage_cir_s16(struct src_stage_prm *s) { /* This function uses * 2x 64 bit registers * 16x integers * 11x address pointers, */ ae_int32x2 q = AE_ZERO32(); ae_int16x4 d = AE_ZERO16(); ae_f32 *rp; ae_f32 *wp; int i; int n; int m; int n_wrap_buf; int n_min; struct src_state *fir = s->state; struct src_stage *cfg = s->stage; int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size]; int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size]; const void *cp; /* Can be int32_t or int16_t */ const size_t out_size = fir->out_delay_size * sizeof(int32_t); const int nch = s->nch; const int nch_x_odm = cfg->odm * nch; const int blk_in_words = nch * cfg->blk_in; const int blk_out_words = nch * cfg->num_of_subfilters; const int sz = sizeof(int32_t); const int n_sz = -sizeof(int32_t); const int rewind_sz = sz * nch * (cfg->blk_in + (cfg->num_of_subfilters - 1) * cfg->idm); const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t); const int taps_div_4 = cfg->subfilter_length >> 2; int16_t *x_rptr = (int16_t *)s->x_rptr; int16_t *y_wptr = (int16_t *)s->y_wptr; int16_t *x_end_addr = (int16_t *)s->x_end_addr; int16_t *y_end_addr = (int16_t *)s->y_end_addr; #if SRC_SHORT const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t); #else const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t); #endif for (n = 0; n < s->times; n++) { /* Input data */ m = blk_in_words; /* Setup circular buffer for FIR input data delay */ AE_SETCBEGIN0(fir->fir_delay); AE_SETCEND0(fir_end); while (m > 0) { /* Number of words without circular wrap */ n_wrap_buf = x_end_addr - x_rptr; n_min = (m < n_wrap_buf) ? m : n_wrap_buf; m -= n_min; for (i = 0; i < n_min; i++) { /* Load a 16 bits sample into d and left shift * by 16 into q, advance read and write * pointers. */ AE_L16_XP(d, (ae_int16 *)x_rptr, sizeof(ae_int16)); q = AE_CVT32X2F16_32(d); AE_S32_L_XC(q, (ae_int32 *)fir->fir_wp, n_sz); } /* Check for wrap */ src_inc_wrap_s16(&x_rptr, x_end_addr, s->x_size); } /* Do filter */ cp = cfg->coefs; /* Reset to 1st coefficient */ rp = (ae_f32 *)fir->fir_wp; /* Do circular modification to pointer rp by amount of * rewind to to data start. Loaded value q is discarded. */ AE_L32_XC(q, rp, rewind_sz); /* Reset FIR output write pointer and compute all polyphase * sub-filters. */ wp = (ae_f32 *)fir->out_rp; for (i = 0; i < cfg->num_of_subfilters; i++) { fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch); wp += nch_x_odm; cp = (char *)cp + subfilter_size; src_inc_wrap((int32_t **)&wp, out_delay_end, out_size); /* Circular advance pointer rp by number of * channels x input delay multiplier. Loaded value q * is discarded. */ AE_L32_XC(q, rp, nch_x_idm_sz); } /* Output */ /* Setup circular buffer for SRC out delay access */ AE_SETCBEGIN0(fir->out_delay); AE_SETCEND0(out_delay_end); m = blk_out_words; while (m > 0) { n_wrap_buf = y_end_addr - y_wptr; n_min = (m < n_wrap_buf) ? m : n_wrap_buf; m -= n_min; for (i = 0; i < n_min; i++) { /* Circular load for 32 bit sample, * advance read pointer. */ AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz); /* Store Q1.31 value as Q1.15 and * advance write pointer. */ d = AE_ROUND16X4F32SSYM(q, q); AE_S16_0_XP(d, (ae_int16 *)y_wptr, sizeof(ae_int16)); } /* Check wrap */ src_inc_wrap_s16(&y_wptr, y_end_addr, s->y_size); } } s->x_rptr = x_rptr; s->y_wptr = y_wptr; } #endif /* CONFIG_FORMAT_S16LE */ #endif