DSP/Source/TransformFunctions/arm_cfft_f32.c - third_party/github/STMicroelectronics/cmsis_core.git - Git at Google

 /* ----------------------------------------------------------------------
  * Project:      CMSIS DSP Library
  * Title:        arm_cfft_f32.c
  * Description:  Combined Radix Decimation in Frequency CFFT Floating point processing function
  *
  * $Date:        18. March 2019
  * $Revision:    V1.6.0
  *
  * Target Processor: Cortex-M cores
  * -------------------------------------------------------------------- */
 /*
  * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the License); you may
  * not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  * www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "arm_math.h"
 #include "arm_common_tables.h"

 extern void arm_radix8_butterfly_f32(
         float32_t * pSrc,
         uint16_t fftLen,
   const float32_t * pCoef,
         uint16_t twidCoefModifier);

 extern void arm_bitreversal_32(
         uint32_t * pSrc,
   const uint16_t bitRevLen,
   const uint16_t * pBitRevTable);

 /**
   @ingroup groupTransforms
  */

 /**
   @defgroup ComplexFFT Complex FFT Functions

   @par
                    The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
                    Discrete Fourier Transform (DFT).  The FFT can be orders of magnitude faster
                    than the DFT, especially for long lengths.
                    The algorithms described in this section
                    operate on complex data.  A separate set of functions is devoted to handling
                    of real sequences.
   @par
                    There are separate algorithms for handling floating-point, Q15, and Q31 data
                    types.  The algorithms available for each data type are described next.
   @par
                    The FFT functions operate in-place.  That is, the array holding the input data
                    will also be used to hold the corresponding result.  The input data is complex
                    and contains <code>2*fftLen</code> interleaved values as shown below.
                    <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
                    The FFT result will be contained in the same array and the frequency domain
                    values will have the same interleaving.

   @par Floating-point
                    The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-8
                    stages are performed along with a single radix-2 or radix-4 stage, as needed.
                    The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
                    a different twiddle factor table.
   @par
                    The function uses the standard FFT definition and output values may grow by a
                    factor of <code>fftLen</code> when computing the forward transform.  The
                    inverse transform includes a scale of <code>1/fftLen</code> as part of the
                    calculation and this matches the textbook definition of the inverse FFT.
   @par
                    Pre-initialized data structures containing twiddle factors and bit reversal
                    tables are provided and defined in <code>arm_const_structs.h</code>.  Include
                    this header in your function and then pass one of the constant structures as
                    an argument to arm_cfft_f32.  For example:
   @par
                    <code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
   @par
                    computes a 64-point inverse complex FFT including bit reversal.
                    The data structures are treated as constant data and not modified during the
                    calculation.  The same data structure can be reused for multiple transforms
                    including mixing forward and inverse transforms.
   @par
                    Earlier releases of the library provided separate radix-2 and radix-4
                    algorithms that operated on floating-point data.  These functions are still
                    provided but are deprecated.  The older functions are slower and less general
                    than the new functions.
   @par
                    An example of initialization of the constants for the arm_cfft_f32 function follows:
   @code
                    const static arm_cfft_instance_f32 *S;
                    ...
                      switch (length) {
                        case 16:
                          S = &arm_cfft_sR_f32_len16;
                          break;
                        case 32:
                          S = &arm_cfft_sR_f32_len32;
                          break;
                        case 64:
                          S = &arm_cfft_sR_f32_len64;
                          break;
                        case 128:
                          S = &arm_cfft_sR_f32_len128;
                          break;
                        case 256:
                          S = &arm_cfft_sR_f32_len256;
                          break;
                        case 512:
                          S = &arm_cfft_sR_f32_len512;
                          break;
                        case 1024:
                          S = &arm_cfft_sR_f32_len1024;
                          break;
                        case 2048:
                          S = &arm_cfft_sR_f32_len2048;
                          break;
                        case 4096:
                          S = &arm_cfft_sR_f32_len4096;
                          break;
                      }
   @endcode
   @par Q15 and Q31
                    The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-4
                    stages are performed along with a single radix-2 stage, as needed.
                    The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
                    a different twiddle factor table.
   @par
                    The function uses the standard FFT definition and output values may grow by a
                    factor of <code>fftLen</code> when computing the forward transform.  The
                    inverse transform includes a scale of <code>1/fftLen</code> as part of the
                    calculation and this matches the textbook definition of the inverse FFT.
   @par
                    Pre-initialized data structures containing twiddle factors and bit reversal
                    tables are provided and defined in <code>arm_const_structs.h</code>.  Include
                    this header in your function and then pass one of the constant structures as
                    an argument to arm_cfft_q31. For example:
   @par
                    <code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
   @par
                    computes a 64-point inverse complex FFT including bit reversal.
                    The data structures are treated as constant data and not modified during the
                    calculation.  The same data structure can be reused for multiple transforms
                    including mixing forward and inverse transforms.
   @par
                    Earlier releases of the library provided separate radix-2 and radix-4
                    algorithms that operated on floating-point data.  These functions are still
                    provided but are deprecated.  The older functions are slower and less general
                    than the new functions.
   @par
                    An example of initialization of the constants for the arm_cfft_q31 function follows:
   @code
                    const static arm_cfft_instance_q31 *S;
                    ...
                      switch (length) {
                        case 16:
                          S = &arm_cfft_sR_q31_len16;
                          break;
                        case 32:
                          S = &arm_cfft_sR_q31_len32;
                          break;
                        case 64:
                          S = &arm_cfft_sR_q31_len64;
                          break;
                        case 128:
                          S = &arm_cfft_sR_q31_len128;
                          break;
                        case 256:
                          S = &arm_cfft_sR_q31_len256;
                          break;
                        case 512:
                          S = &arm_cfft_sR_q31_len512;
                          break;
                        case 1024:
                          S = &arm_cfft_sR_q31_len1024;
                          break;
                        case 2048:
                          S = &arm_cfft_sR_q31_len2048;
                          break;
                        case 4096:
                          S = &arm_cfft_sR_q31_len4096;
                          break;
                      }
   @endcode

  */

 void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
 {
   uint32_t    L  = S->fftLen;
   float32_t * pCol1, * pCol2, * pMid1, * pMid2;
   float32_t * p2 = p1 + L;
   const float32_t * tw = (float32_t *) S->pTwiddle;
   float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
   float32_t m0, m1, m2, m3;
   uint32_t l;

   pCol1 = p1;
   pCol2 = p2;

   /* Define new length */
   L >>= 1;

   /* Initialize mid pointers */
   pMid1 = p1 + L;
   pMid2 = p2 + L;

   /* do two dot Fourier transform */
   for (l = L >> 2; l > 0; l-- )
   {
     t1[0] = p1[0];
     t1[1] = p1[1];
     t1[2] = p1[2];
     t1[3] = p1[3];

     t2[0] = p2[0];
     t2[1] = p2[1];
     t2[2] = p2[2];
     t2[3] = p2[3];

     t3[0] = pMid1[0];
     t3[1] = pMid1[1];
     t3[2] = pMid1[2];
     t3[3] = pMid1[3];

     t4[0] = pMid2[0];
     t4[1] = pMid2[1];
     t4[2] = pMid2[2];
     t4[3] = pMid2[3];

     *p1++ = t1[0] + t2[0];
     *p1++ = t1[1] + t2[1];
     *p1++ = t1[2] + t2[2];
     *p1++ = t1[3] + t2[3];    /* col 1 */

     t2[0] = t1[0] - t2[0];
     t2[1] = t1[1] - t2[1];
     t2[2] = t1[2] - t2[2];
     t2[3] = t1[3] - t2[3];    /* for col 2 */

     *pMid1++ = t3[0] + t4[0];
     *pMid1++ = t3[1] + t4[1];
     *pMid1++ = t3[2] + t4[2];
     *pMid1++ = t3[3] + t4[3]; /* col 1 */

     t4[0] = t4[0] - t3[0];
     t4[1] = t4[1] - t3[1];
     t4[2] = t4[2] - t3[2];
     t4[3] = t4[3] - t3[3];    /* for col 2 */

     twR = *tw++;
     twI = *tw++;

     /* multiply by twiddle factors */
     m0 = t2[0] * twR;
     m1 = t2[1] * twI;
     m2 = t2[1] * twR;
     m3 = t2[0] * twI;

     /* R  =  R  *  Tr - I * Ti */
     *p2++ = m0 + m1;
     /* I  =  I  *  Tr + R * Ti */
     *p2++ = m2 - m3;

     /* use vertical symmetry */
     /*  0.9988 - 0.0491i <==> -0.0491 - 0.9988i */
     m0 = t4[0] * twI;
     m1 = t4[1] * twR;
     m2 = t4[1] * twI;
     m3 = t4[0] * twR;

     *pMid2++ = m0 - m1;
     *pMid2++ = m2 + m3;

     twR = *tw++;
     twI = *tw++;

     m0 = t2[2] * twR;
     m1 = t2[3] * twI;
     m2 = t2[3] * twR;
     m3 = t2[2] * twI;

     *p2++ = m0 + m1;
     *p2++ = m2 - m3;

     m0 = t4[2] * twI;
     m1 = t4[3] * twR;
     m2 = t4[3] * twI;
     m3 = t4[2] * twR;

     *pMid2++ = m0 - m1;
     *pMid2++ = m2 + m3;
   }

   /* first col */
   arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U);

   /* second col */
   arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U);
 }

 void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
 {
     uint32_t    L  = S->fftLen >> 1;
     float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
     const float32_t *tw2, *tw3, *tw4;
     float32_t * p2 = p1 + L;
     float32_t * p3 = p2 + L;
     float32_t * p4 = p3 + L;
     float32_t t2[4], t3[4], t4[4], twR, twI;
     float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
     float32_t m0, m1, m2, m3;
     uint32_t l, twMod2, twMod3, twMod4;

     pCol1 = p1;         /* points to real values by default */
     pCol2 = p2;
     pCol3 = p3;
     pCol4 = p4;
     pEnd1 = p2 - 1;     /* points to imaginary values by default */
     pEnd2 = p3 - 1;
     pEnd3 = p4 - 1;
     pEnd4 = pEnd3 + L;

     tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;

     L >>= 1;

     /* do four dot Fourier transform */

     twMod2 = 2;
     twMod3 = 4;
     twMod4 = 6;

     /* TOP */
     p1ap3_0 = p1[0] + p3[0];
     p1sp3_0 = p1[0] - p3[0];
     p1ap3_1 = p1[1] + p3[1];
     p1sp3_1 = p1[1] - p3[1];

     /* col 2 */
     t2[0] = p1sp3_0 + p2[1] - p4[1];
     t2[1] = p1sp3_1 - p2[0] + p4[0];
     /* col 3 */
     t3[0] = p1ap3_0 - p2[0] - p4[0];
     t3[1] = p1ap3_1 - p2[1] - p4[1];
     /* col 4 */
     t4[0] = p1sp3_0 - p2[1] + p4[1];
     t4[1] = p1sp3_1 + p2[0] - p4[0];
     /* col 1 */
     *p1++ = p1ap3_0 + p2[0] + p4[0];
     *p1++ = p1ap3_1 + p2[1] + p4[1];

     /* Twiddle factors are ones */
     *p2++ = t2[0];
     *p2++ = t2[1];
     *p3++ = t3[0];
     *p3++ = t3[1];
     *p4++ = t4[0];
     *p4++ = t4[1];

     tw2 += twMod2;
     tw3 += twMod3;
     tw4 += twMod4;

     for (l = (L - 2) >> 1; l > 0; l-- )
     {
       /* TOP */
       p1ap3_0 = p1[0] + p3[0];
       p1sp3_0 = p1[0] - p3[0];
       p1ap3_1 = p1[1] + p3[1];
       p1sp3_1 = p1[1] - p3[1];
       /* col 2 */
       t2[0] = p1sp3_0 + p2[1] - p4[1];
       t2[1] = p1sp3_1 - p2[0] + p4[0];
       /* col 3 */
       t3[0] = p1ap3_0 - p2[0] - p4[0];
       t3[1] = p1ap3_1 - p2[1] - p4[1];
       /* col 4 */
       t4[0] = p1sp3_0 - p2[1] + p4[1];
       t4[1] = p1sp3_1 + p2[0] - p4[0];
       /* col 1 - top */
       *p1++ = p1ap3_0 + p2[0] + p4[0];
       *p1++ = p1ap3_1 + p2[1] + p4[1];

       /* BOTTOM */
       p1ap3_1 = pEnd1[-1] + pEnd3[-1];
       p1sp3_1 = pEnd1[-1] - pEnd3[-1];
       p1ap3_0 = pEnd1[ 0] + pEnd3[0];
       p1sp3_0 = pEnd1[ 0] - pEnd3[0];
       /* col 2 */
       t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
       t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
       /* col 3 */
       t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
       t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0];
       /* col 4 */
       t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1;
       t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
       /* col 1 - Bottom */
       *pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0];
       *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];

       /* COL 2 */
       /* read twiddle factors */
       twR = *tw2++;
       twI = *tw2++;
       /* multiply by twiddle factors */
       /*  let    Z1 = a + i(b),   Z2 = c + i(d) */
       /*   =>  Z1 * Z2  =  (a*c - b*d) + i(b*c + a*d) */

       /* Top */
       m0 = t2[0] * twR;
       m1 = t2[1] * twI;
       m2 = t2[1] * twR;
       m3 = t2[0] * twI;

       *p2++ = m0 + m1;
       *p2++ = m2 - m3;
       /* use vertical symmetry col 2 */
       /* 0.9997 - 0.0245i  <==>  0.0245 - 0.9997i */
       /* Bottom */
       m0 = t2[3] * twI;
       m1 = t2[2] * twR;
       m2 = t2[2] * twI;
       m3 = t2[3] * twR;

       *pEnd2-- = m0 - m1;
       *pEnd2-- = m2 + m3;

       /* COL 3 */
       twR = tw3[0];
       twI = tw3[1];
       tw3 += twMod3;
       /* Top */
       m0 = t3[0] * twR;
       m1 = t3[1] * twI;
       m2 = t3[1] * twR;
       m3 = t3[0] * twI;

       *p3++ = m0 + m1;
       *p3++ = m2 - m3;
       /* use vertical symmetry col 3 */
       /* 0.9988 - 0.0491i  <==>  -0.9988 - 0.0491i */
       /* Bottom */
       m0 = -t3[3] * twR;
       m1 =  t3[2] * twI;
       m2 =  t3[2] * twR;
       m3 =  t3[3] * twI;

       *pEnd3-- = m0 - m1;
       *pEnd3-- = m3 - m2;

       /* COL 4 */
       twR = tw4[0];
       twI = tw4[1];
       tw4 += twMod4;
       /* Top */
       m0 = t4[0] * twR;
       m1 = t4[1] * twI;
       m2 = t4[1] * twR;
       m3 = t4[0] * twI;

       *p4++ = m0 + m1;
       *p4++ = m2 - m3;
       /* use vertical symmetry col 4 */
       /* 0.9973 - 0.0736i  <==>  -0.0736 + 0.9973i */
       /* Bottom */
       m0 = t4[3] * twI;
       m1 = t4[2] * twR;
       m2 = t4[2] * twI;
       m3 = t4[3] * twR;

       *pEnd4-- = m0 - m1;
       *pEnd4-- = m2 + m3;
     }

     /* MIDDLE */
     /* Twiddle factors are */
     /*  1.0000  0.7071-0.7071i  -1.0000i  -0.7071-0.7071i */
     p1ap3_0 = p1[0] + p3[0];
     p1sp3_0 = p1[0] - p3[0];
     p1ap3_1 = p1[1] + p3[1];
     p1sp3_1 = p1[1] - p3[1];

     /* col 2 */
     t2[0] = p1sp3_0 + p2[1] - p4[1];
     t2[1] = p1sp3_1 - p2[0] + p4[0];
     /* col 3 */
     t3[0] = p1ap3_0 - p2[0] - p4[0];
     t3[1] = p1ap3_1 - p2[1] - p4[1];
     /* col 4 */
     t4[0] = p1sp3_0 - p2[1] + p4[1];
     t4[1] = p1sp3_1 + p2[0] - p4[0];
     /* col 1 - Top */
     *p1++ = p1ap3_0 + p2[0] + p4[0];
     *p1++ = p1ap3_1 + p2[1] + p4[1];

     /* COL 2 */
     twR = tw2[0];
     twI = tw2[1];

     m0 = t2[0] * twR;
     m1 = t2[1] * twI;
     m2 = t2[1] * twR;
     m3 = t2[0] * twI;

     *p2++ = m0 + m1;
     *p2++ = m2 - m3;
     /* COL 3 */
     twR = tw3[0];
     twI = tw3[1];

     m0 = t3[0] * twR;
     m1 = t3[1] * twI;
     m2 = t3[1] * twR;
     m3 = t3[0] * twI;

     *p3++ = m0 + m1;
     *p3++ = m2 - m3;
     /* COL 4 */
     twR = tw4[0];
     twI = tw4[1];

     m0 = t4[0] * twR;
     m1 = t4[1] * twI;
     m2 = t4[1] * twR;
     m3 = t4[0] * twI;

     *p4++ = m0 + m1;
     *p4++ = m2 - m3;

     /* first col */
     arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U);

     /* second col */
     arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U);

     /* third col */
     arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U);

     /* fourth col */
     arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U);
 }

 /**
   @addtogroup ComplexFFT
   @{
  */

 /**
   @brief         Processing function for the floating-point complex FFT.
   @param[in]     S              points to an instance of the floating-point CFFT structure
   @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
   @param[in]     ifftFlag       flag that selects transform direction
                    - value = 0: forward transform
                    - value = 1: inverse transform
   @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
                    - value = 0: disables bit reversal of output
                    - value = 1: enables bit reversal of output
   @return        none
  */

 void arm_cfft_f32(
   const arm_cfft_instance_f32 * S,
         float32_t * p1,
         uint8_t ifftFlag,
         uint8_t bitReverseFlag)
 {
   uint32_t  L = S->fftLen, l;
   float32_t invL, * pSrc;

   if (ifftFlag == 1U)
   {
     /* Conjugate input data */
     pSrc = p1 + 1;
     for (l = 0; l < L; l++)
     {
       *pSrc = -*pSrc;
       pSrc += 2;
     }
   }

   switch (L)
   {
   case 16:
   case 128:
   case 1024:
     arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1);
     break;
   case 32:
   case 256:
   case 2048:
     arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1);
     break;
   case 64:
   case 512:
   case 4096:
     arm_radix8_butterfly_f32 ( p1, L, (float32_t *) S->pTwiddle, 1);
     break;
   }

   if ( bitReverseFlag )
     arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);

   if (ifftFlag == 1U)
   {
     invL = 1.0f / (float32_t)L;

     /* Conjugate and scale output data */
     pSrc = p1;
     for (l= 0; l < L; l++)
     {
       *pSrc++ *=   invL ;
       *pSrc    = -(*pSrc) * invL;
       pSrc++;
     }
   }
 }

 /**
   @} end of ComplexFFT group
  */
	/* ----------------------------------------------------------------------
	* Project: CMSIS DSP Library
	* Title: arm_cfft_f32.c
	* Description: Combined Radix Decimation in Frequency CFFT Floating point processing function
	*
	* $Date: 18. March 2019
	* $Revision: V1.6.0
	*
	* Target Processor: Cortex-M cores
	* -------------------------------------------------------------------- */
	/*
	* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
	*
	* SPDX-License-Identifier: Apache-2.0
	*
	* Licensed under the Apache License, Version 2.0 (the License); you may
	* not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "arm_math.h"
	#include "arm_common_tables.h"

	extern void arm_radix8_butterfly_f32(
	float32_t * pSrc,
	uint16_t fftLen,
	const float32_t * pCoef,
	uint16_t twidCoefModifier);

	extern void arm_bitreversal_32(
	uint32_t * pSrc,
	const uint16_t bitRevLen,
	const uint16_t * pBitRevTable);

	/**
	@ingroup groupTransforms
	*/

	/**
	@defgroup ComplexFFT Complex FFT Functions

	@par
	The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
	Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
	than the DFT, especially for long lengths.
	The algorithms described in this section
	operate on complex data. A separate set of functions is devoted to handling
	of real sequences.
	@par
	There are separate algorithms for handling floating-point, Q15, and Q31 data
	types. The algorithms available for each data type are described next.
	@par
	The FFT functions operate in-place. That is, the array holding the input data
	will also be used to hold the corresponding result. The input data is complex
	and contains <code>2*fftLen</code> interleaved values as shown below.
	<pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
	The FFT result will be contained in the same array and the frequency domain
	values will have the same interleaving.

	@par Floating-point
	The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
	stages are performed along with a single radix-2 or radix-4 stage, as needed.
	The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
	a different twiddle factor table.
	@par
	The function uses the standard FFT definition and output values may grow by a
	factor of <code>fftLen</code> when computing the forward transform. The
	inverse transform includes a scale of <code>1/fftLen</code> as part of the
	calculation and this matches the textbook definition of the inverse FFT.
	@par
	Pre-initialized data structures containing twiddle factors and bit reversal
	tables are provided and defined in <code>arm_const_structs.h</code>. Include
	this header in your function and then pass one of the constant structures as
	an argument to arm_cfft_f32. For example:
	@par
	<code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
	@par
	computes a 64-point inverse complex FFT including bit reversal.
	The data structures are treated as constant data and not modified during the
	calculation. The same data structure can be reused for multiple transforms
	including mixing forward and inverse transforms.
	@par
	Earlier releases of the library provided separate radix-2 and radix-4
	algorithms that operated on floating-point data. These functions are still
	provided but are deprecated. The older functions are slower and less general
	than the new functions.
	@par
	An example of initialization of the constants for the arm_cfft_f32 function follows:
	@code
	const static arm_cfft_instance_f32 *S;
	...
	switch (length) {
	case 16:
	S = &arm_cfft_sR_f32_len16;
	break;
	case 32:
	S = &arm_cfft_sR_f32_len32;
	break;
	case 64:
	S = &arm_cfft_sR_f32_len64;
	break;
	case 128:
	S = &arm_cfft_sR_f32_len128;
	break;
	case 256:
	S = &arm_cfft_sR_f32_len256;
	break;
	case 512:
	S = &arm_cfft_sR_f32_len512;
	break;
	case 1024:
	S = &arm_cfft_sR_f32_len1024;
	break;
	case 2048:
	S = &arm_cfft_sR_f32_len2048;
	break;
	case 4096:
	S = &arm_cfft_sR_f32_len4096;
	break;
	}
	@endcode
	@par Q15 and Q31
	The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-4
	stages are performed along with a single radix-2 stage, as needed.
	The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
	a different twiddle factor table.
	@par
	The function uses the standard FFT definition and output values may grow by a
	factor of <code>fftLen</code> when computing the forward transform. The
	inverse transform includes a scale of <code>1/fftLen</code> as part of the
	calculation and this matches the textbook definition of the inverse FFT.
	@par
	Pre-initialized data structures containing twiddle factors and bit reversal
	tables are provided and defined in <code>arm_const_structs.h</code>. Include
	this header in your function and then pass one of the constant structures as
	an argument to arm_cfft_q31. For example:
	@par
	<code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
	@par
	computes a 64-point inverse complex FFT including bit reversal.
	The data structures are treated as constant data and not modified during the
	calculation. The same data structure can be reused for multiple transforms
	including mixing forward and inverse transforms.
	@par
	Earlier releases of the library provided separate radix-2 and radix-4
	algorithms that operated on floating-point data. These functions are still
	provided but are deprecated. The older functions are slower and less general
	than the new functions.
	@par
	An example of initialization of the constants for the arm_cfft_q31 function follows:
	@code
	const static arm_cfft_instance_q31 *S;
	...
	switch (length) {
	case 16:
	S = &arm_cfft_sR_q31_len16;
	break;
	case 32:
	S = &arm_cfft_sR_q31_len32;
	break;
	case 64:
	S = &arm_cfft_sR_q31_len64;
	break;
	case 128:
	S = &arm_cfft_sR_q31_len128;
	break;
	case 256:
	S = &arm_cfft_sR_q31_len256;
	break;
	case 512:
	S = &arm_cfft_sR_q31_len512;
	break;
	case 1024:
	S = &arm_cfft_sR_q31_len1024;
	break;
	case 2048:
	S = &arm_cfft_sR_q31_len2048;
	break;
	case 4096:
	S = &arm_cfft_sR_q31_len4096;
	break;
	}
	@endcode

	*/

	void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
	{
	uint32_t L = S->fftLen;
	float32_t * pCol1, * pCol2, * pMid1, * pMid2;
	float32_t * p2 = p1 + L;
	const float32_t * tw = (float32_t *) S->pTwiddle;
	float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
	float32_t m0, m1, m2, m3;
	uint32_t l;

	pCol1 = p1;
	pCol2 = p2;

	/* Define new length */
	L >>= 1;

	/* Initialize mid pointers */
	pMid1 = p1 + L;
	pMid2 = p2 + L;

	/* do two dot Fourier transform */
	for (l = L >> 2; l > 0; l-- )
	{
	t1[0] = p1[0];
	t1[1] = p1[1];
	t1[2] = p1[2];
	t1[3] = p1[3];

	t2[0] = p2[0];
	t2[1] = p2[1];
	t2[2] = p2[2];
	t2[3] = p2[3];

	t3[0] = pMid1[0];
	t3[1] = pMid1[1];
	t3[2] = pMid1[2];
	t3[3] = pMid1[3];

	t4[0] = pMid2[0];
	t4[1] = pMid2[1];
	t4[2] = pMid2[2];
	t4[3] = pMid2[3];

	*p1++ = t1[0] + t2[0];
	*p1++ = t1[1] + t2[1];
	*p1++ = t1[2] + t2[2];
	p1++ = t1[3] + t2[3]; / col 1 */

	t2[0] = t1[0] - t2[0];
	t2[1] = t1[1] - t2[1];
	t2[2] = t1[2] - t2[2];
	t2[3] = t1[3] - t2[3]; /* for col 2 */

	*pMid1++ = t3[0] + t4[0];
	*pMid1++ = t3[1] + t4[1];
	*pMid1++ = t3[2] + t4[2];
	pMid1++ = t3[3] + t4[3]; / col 1 */

	t4[0] = t4[0] - t3[0];
	t4[1] = t4[1] - t3[1];
	t4[2] = t4[2] - t3[2];
	t4[3] = t4[3] - t3[3]; /* for col 2 */

	twR = *tw++;
	twI = *tw++;

	/* multiply by twiddle factors */
	m0 = t2[0] * twR;
	m1 = t2[1] * twI;
	m2 = t2[1] * twR;
	m3 = t2[0] * twI;

	/* R = R * Tr - I * Ti */
	*p2++ = m0 + m1;
	/* I = I * Tr + R * Ti */
	*p2++ = m2 - m3;

	/* use vertical symmetry */
	/* 0.9988 - 0.0491i <==> -0.0491 - 0.9988i */
	m0 = t4[0] * twI;
	m1 = t4[1] * twR;
	m2 = t4[1] * twI;
	m3 = t4[0] * twR;

	*pMid2++ = m0 - m1;
	*pMid2++ = m2 + m3;

	twR = *tw++;
	twI = *tw++;

	m0 = t2[2] * twR;
	m1 = t2[3] * twI;
	m2 = t2[3] * twR;
	m3 = t2[2] * twI;

	*p2++ = m0 + m1;
	*p2++ = m2 - m3;

	m0 = t4[2] * twI;
	m1 = t4[3] * twR;
	m2 = t4[3] * twI;
	m3 = t4[2] * twR;

	*pMid2++ = m0 - m1;
	*pMid2++ = m2 + m3;
	}

	/* first col */
	arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U);

	/* second col */
	arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U);
	}

	void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
	{
	uint32_t L = S->fftLen >> 1;
	float32_t * pCol1, pCol2, pCol3, pCol4, pEnd1, pEnd2, pEnd3, *pEnd4;
	const float32_t tw2, tw3, *tw4;
	float32_t * p2 = p1 + L;
	float32_t * p3 = p2 + L;
	float32_t * p4 = p3 + L;
	float32_t t2[4], t3[4], t4[4], twR, twI;
	float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
	float32_t m0, m1, m2, m3;
	uint32_t l, twMod2, twMod3, twMod4;

	pCol1 = p1; /* points to real values by default */
	pCol2 = p2;
	pCol3 = p3;
	pCol4 = p4;
	pEnd1 = p2 - 1; /* points to imaginary values by default */
	pEnd2 = p3 - 1;
	pEnd3 = p4 - 1;
	pEnd4 = pEnd3 + L;

	tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;

	L >>= 1;

	/* do four dot Fourier transform */

	twMod2 = 2;
	twMod3 = 4;
	twMod4 = 6;

	/* TOP */
	p1ap3_0 = p1[0] + p3[0];
	p1sp3_0 = p1[0] - p3[0];
	p1ap3_1 = p1[1] + p3[1];
	p1sp3_1 = p1[1] - p3[1];

	/* col 2 */
	t2[0] = p1sp3_0 + p2[1] - p4[1];
	t2[1] = p1sp3_1 - p2[0] + p4[0];
	/* col 3 */
	t3[0] = p1ap3_0 - p2[0] - p4[0];
	t3[1] = p1ap3_1 - p2[1] - p4[1];
	/* col 4 */
	t4[0] = p1sp3_0 - p2[1] + p4[1];
	t4[1] = p1sp3_1 + p2[0] - p4[0];
	/* col 1 */
	*p1++ = p1ap3_0 + p2[0] + p4[0];
	*p1++ = p1ap3_1 + p2[1] + p4[1];

	/* Twiddle factors are ones */
	*p2++ = t2[0];
	*p2++ = t2[1];
	*p3++ = t3[0];
	*p3++ = t3[1];
	*p4++ = t4[0];
	*p4++ = t4[1];

	tw2 += twMod2;
	tw3 += twMod3;
	tw4 += twMod4;

	for (l = (L - 2) >> 1; l > 0; l-- )
	{
	/* TOP */
	p1ap3_0 = p1[0] + p3[0];
	p1sp3_0 = p1[0] - p3[0];
	p1ap3_1 = p1[1] + p3[1];
	p1sp3_1 = p1[1] - p3[1];
	/* col 2 */
	t2[0] = p1sp3_0 + p2[1] - p4[1];
	t2[1] = p1sp3_1 - p2[0] + p4[0];
	/* col 3 */
	t3[0] = p1ap3_0 - p2[0] - p4[0];
	t3[1] = p1ap3_1 - p2[1] - p4[1];
	/* col 4 */
	t4[0] = p1sp3_0 - p2[1] + p4[1];
	t4[1] = p1sp3_1 + p2[0] - p4[0];
	/* col 1 - top */
	*p1++ = p1ap3_0 + p2[0] + p4[0];
	*p1++ = p1ap3_1 + p2[1] + p4[1];

	/* BOTTOM */
	p1ap3_1 = pEnd1[-1] + pEnd3[-1];
	p1sp3_1 = pEnd1[-1] - pEnd3[-1];
	p1ap3_0 = pEnd1[ 0] + pEnd3[0];
	p1sp3_0 = pEnd1[ 0] - pEnd3[0];
	/* col 2 */
	t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
	t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
	/* col 3 */
	t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
	t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0];
	/* col 4 */
	t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1;
	t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
	/* col 1 - Bottom */
	*pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0];
	*pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];

	/* COL 2 */
	/* read twiddle factors */
	twR = *tw2++;
	twI = *tw2++;
	/* multiply by twiddle factors */
	/* let Z1 = a + i(b), Z2 = c + i(d) */
	/* => Z1 * Z2 = (ac - bd) + i(bc + ad) */

	/* Top */
	m0 = t2[0] * twR;
	m1 = t2[1] * twI;
	m2 = t2[1] * twR;
	m3 = t2[0] * twI;

	*p2++ = m0 + m1;
	*p2++ = m2 - m3;
	/* use vertical symmetry col 2 */
	/* 0.9997 - 0.0245i <==> 0.0245 - 0.9997i */
	/* Bottom */
	m0 = t2[3] * twI;
	m1 = t2[2] * twR;
	m2 = t2[2] * twI;
	m3 = t2[3] * twR;

	*pEnd2-- = m0 - m1;
	*pEnd2-- = m2 + m3;

	/* COL 3 */
	twR = tw3[0];
	twI = tw3[1];
	tw3 += twMod3;
	/* Top */
	m0 = t3[0] * twR;
	m1 = t3[1] * twI;
	m2 = t3[1] * twR;
	m3 = t3[0] * twI;

	*p3++ = m0 + m1;
	*p3++ = m2 - m3;
	/* use vertical symmetry col 3 */
	/* 0.9988 - 0.0491i <==> -0.9988 - 0.0491i */
	/* Bottom */
	m0 = -t3[3] * twR;
	m1 = t3[2] * twI;
	m2 = t3[2] * twR;
	m3 = t3[3] * twI;

	*pEnd3-- = m0 - m1;
	*pEnd3-- = m3 - m2;

	/* COL 4 */
	twR = tw4[0];
	twI = tw4[1];
	tw4 += twMod4;
	/* Top */
	m0 = t4[0] * twR;
	m1 = t4[1] * twI;
	m2 = t4[1] * twR;
	m3 = t4[0] * twI;

	*p4++ = m0 + m1;
	*p4++ = m2 - m3;
	/* use vertical symmetry col 4 */
	/* 0.9973 - 0.0736i <==> -0.0736 + 0.9973i */
	/* Bottom */
	m0 = t4[3] * twI;
	m1 = t4[2] * twR;
	m2 = t4[2] * twI;
	m3 = t4[3] * twR;

	*pEnd4-- = m0 - m1;
	*pEnd4-- = m2 + m3;
	}

	/* MIDDLE */
	/* Twiddle factors are */
	/* 1.0000 0.7071-0.7071i -1.0000i -0.7071-0.7071i */
	p1ap3_0 = p1[0] + p3[0];
	p1sp3_0 = p1[0] - p3[0];
	p1ap3_1 = p1[1] + p3[1];
	p1sp3_1 = p1[1] - p3[1];

	/* col 2 */
	t2[0] = p1sp3_0 + p2[1] - p4[1];
	t2[1] = p1sp3_1 - p2[0] + p4[0];
	/* col 3 */
	t3[0] = p1ap3_0 - p2[0] - p4[0];
	t3[1] = p1ap3_1 - p2[1] - p4[1];
	/* col 4 */
	t4[0] = p1sp3_0 - p2[1] + p4[1];
	t4[1] = p1sp3_1 + p2[0] - p4[0];
	/* col 1 - Top */
	*p1++ = p1ap3_0 + p2[0] + p4[0];
	*p1++ = p1ap3_1 + p2[1] + p4[1];

	/* COL 2 */
	twR = tw2[0];
	twI = tw2[1];

	m0 = t2[0] * twR;
	m1 = t2[1] * twI;
	m2 = t2[1] * twR;
	m3 = t2[0] * twI;

	*p2++ = m0 + m1;
	*p2++ = m2 - m3;
	/* COL 3 */
	twR = tw3[0];
	twI = tw3[1];

	m0 = t3[0] * twR;
	m1 = t3[1] * twI;
	m2 = t3[1] * twR;
	m3 = t3[0] * twI;

	*p3++ = m0 + m1;
	*p3++ = m2 - m3;
	/* COL 4 */
	twR = tw4[0];
	twI = tw4[1];

	m0 = t4[0] * twR;
	m1 = t4[1] * twI;
	m2 = t4[1] * twR;
	m3 = t4[0] * twI;

	*p4++ = m0 + m1;
	*p4++ = m2 - m3;

	/* first col */
	arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U);

	/* second col */
	arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U);

	/* third col */
	arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U);

	/* fourth col */
	arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U);
	}

	/**
	@addtogroup ComplexFFT
	@{
	*/

	/**
	@brief Processing function for the floating-point complex FFT.
	@param[in] S points to an instance of the floating-point CFFT structure
	@param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
	@param[in] ifftFlag flag that selects transform direction
	- value = 0: forward transform
	- value = 1: inverse transform
	@param[in] bitReverseFlag flag that enables / disables bit reversal of output
	- value = 0: disables bit reversal of output
	- value = 1: enables bit reversal of output
	@return none
	*/

	void arm_cfft_f32(
	const arm_cfft_instance_f32 * S,
	float32_t * p1,
	uint8_t ifftFlag,
	uint8_t bitReverseFlag)
	{
	uint32_t L = S->fftLen, l;
	float32_t invL, * pSrc;

	if (ifftFlag == 1U)
	{
	/* Conjugate input data */
	pSrc = p1 + 1;
	for (l = 0; l < L; l++)
	{
	pSrc = -pSrc;
	pSrc += 2;
	}
	}

	switch (L)
	{
	case 16:
	case 128:
	case 1024:
	arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1);
	break;
	case 32:
	case 256:
	case 2048:
	arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1);
	break;
	case 64:
	case 512:
	case 4096:
	arm_radix8_butterfly_f32 ( p1, L, (float32_t *) S->pTwiddle, 1);
	break;
	}

	if ( bitReverseFlag )
	arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);

	if (ifftFlag == 1U)
	{
	invL = 1.0f / (float32_t)L;

	/* Conjugate and scale output data */
	pSrc = p1;
	for (l= 0; l < L; l++)
	{
	pSrc++ = invL ;
	pSrc = -(pSrc) * invL;
	pSrc++;
	}
	}
	}

	/**
	@} end of ComplexFFT group
	*/