DSP_Lib/Source/TransformFunctions/arm_cfft_f32.c - third_party/github/STMicroelectronics/cmsis_core - Git at Google

 /* ----------------------------------------------------------------------
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
 *
 * $Date:        19. March 2015
 * $Revision: 	V.1.4.5
 *
 * Project: 	    CMSIS DSP Library
 * Title:	    arm_cfft_f32.c
 *
 * Description:	Combined Radix Decimation in Frequency CFFT Floating point processing function
 *
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * -------------------------------------------------------------------- */

 #include "arm_math.h"
 #include "arm_common_tables.h"

 extern void arm_radix8_butterfly_f32(
     float32_t * pSrc,
     uint16_t fftLen,
     const float32_t * pCoef,
     uint16_t twidCoefModifier);

 extern void arm_bitreversal_32(
     uint32_t * pSrc,
     const uint16_t bitRevLen,
     const uint16_t * pBitRevTable);

 /**
 * @ingroup groupTransforms
 */

 /**
 * @defgroup ComplexFFT Complex FFT Functions
 *
 * \par
 * The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
 * Discrete Fourier Transform (DFT).  The FFT can be orders of magnitude faster
 * than the DFT, especially for long lengths.
 * The algorithms described in this section
 * operate on complex data.  A separate set of functions is devoted to handling
 * of real sequences.
 * \par
 * There are separate algorithms for handling floating-point, Q15, and Q31 data
 * types.  The algorithms available for each data type are described next.
 * \par
 * The FFT functions operate in-place.  That is, the array holding the input data
 * will also be used to hold the corresponding result.  The input data is complex
 * and contains <code>2*fftLen</code> interleaved values as shown below.
 * <pre> {real[0], imag[0], real[1], imag[1],..} </pre>
 * The FFT result will be contained in the same array and the frequency domain
 * values will have the same interleaving.
 *
 * \par Floating-point
 * The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-8
 * stages are performed along with a single radix-2 or radix-4 stage, as needed.
 * The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
 * a different twiddle factor table.
 * \par
 * The function uses the standard FFT definition and output values may grow by a
 * factor of <code>fftLen</code> when computing the forward transform.  The
 * inverse transform includes a scale of <code>1/fftLen</code> as part of the
 * calculation and this matches the textbook definition of the inverse FFT.
 * \par
 * Pre-initialized data structures containing twiddle factors and bit reversal
 * tables are provided and defined in <code>arm_const_structs.h</code>.  Include
 * this header in your function and then pass one of the constant structures as
 * an argument to arm_cfft_f32.  For example:
 * \par
 * <code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
 * \par
 * computes a 64-point inverse complex FFT including bit reversal.
 * The data structures are treated as constant data and not modified during the
 * calculation.  The same data structure can be reused for multiple transforms
 * including mixing forward and inverse transforms.
 * \par
 * Earlier releases of the library provided separate radix-2 and radix-4
 * algorithms that operated on floating-point data.  These functions are still
 * provided but are deprecated.  The older functions are slower and less general
 * than the new functions.
 * \par
 * An example of initialization of the constants for the arm_cfft_f32 function follows:
 * \code
 * const static arm_cfft_instance_f32 *S;
 * ...
 *   switch (length) {
 *     case 16:
 *       S = &arm_cfft_sR_f32_len16;
 *       break;
 *     case 32:
 *       S = &arm_cfft_sR_f32_len32;
 *       break;
 *     case 64:
 *       S = &arm_cfft_sR_f32_len64;
 *       break;
 *     case 128:
 *       S = &arm_cfft_sR_f32_len128;
 *       break;
 *     case 256:
 *       S = &arm_cfft_sR_f32_len256;
 *       break;
 *     case 512:
 *       S = &arm_cfft_sR_f32_len512;
 *       break;
 *     case 1024:
 *       S = &arm_cfft_sR_f32_len1024;
 *       break;
 *     case 2048:
 *       S = &arm_cfft_sR_f32_len2048;
 *       break;
 *     case 4096:
 *       S = &arm_cfft_sR_f32_len4096;
 *       break;
 *   }
 * \endcode
 * \par Q15 and Q31
 * The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-4
 * stages are performed along with a single radix-2 stage, as needed.
 * The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
 * a different twiddle factor table.
 * \par
 * The function uses the standard FFT definition and output values may grow by a
 * factor of <code>fftLen</code> when computing the forward transform.  The
 * inverse transform includes a scale of <code>1/fftLen</code> as part of the
 * calculation and this matches the textbook definition of the inverse FFT.
 * \par
 * Pre-initialized data structures containing twiddle factors and bit reversal
 * tables are provided and defined in <code>arm_const_structs.h</code>.  Include
 * this header in your function and then pass one of the constant structures as
 * an argument to arm_cfft_q31.  For example:
 * \par
 * <code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
 * \par
 * computes a 64-point inverse complex FFT including bit reversal.
 * The data structures are treated as constant data and not modified during the
 * calculation.  The same data structure can be reused for multiple transforms
 * including mixing forward and inverse transforms.
 * \par
 * Earlier releases of the library provided separate radix-2 and radix-4
 * algorithms that operated on floating-point data.  These functions are still
 * provided but are deprecated.  The older functions are slower and less general
 * than the new functions.
 * \par
 * An example of initialization of the constants for the arm_cfft_q31 function follows:
 * \code
 * const static arm_cfft_instance_q31 *S;
 * ...
 *   switch (length) {
 *     case 16:
 *       S = &arm_cfft_sR_q31_len16;
 *       break;
 *     case 32:
 *       S = &arm_cfft_sR_q31_len32;
 *       break;
 *     case 64:
 *       S = &arm_cfft_sR_q31_len64;
 *       break;
 *     case 128:
 *       S = &arm_cfft_sR_q31_len128;
 *       break;
 *     case 256:
 *       S = &arm_cfft_sR_q31_len256;
 *       break;
 *     case 512:
 *       S = &arm_cfft_sR_q31_len512;
 *       break;
 *     case 1024:
 *       S = &arm_cfft_sR_q31_len1024;
 *       break;
 *     case 2048:
 *       S = &arm_cfft_sR_q31_len2048;
 *       break;
 *     case 4096:
 *       S = &arm_cfft_sR_q31_len4096;
 *       break;
 *   }
 * \endcode
 *
 */

 void arm_cfft_radix8by2_f32( arm_cfft_instance_f32 * S, float32_t * p1)
 {
     uint32_t    L  = S->fftLen;
     float32_t * pCol1, * pCol2, * pMid1, * pMid2;
     float32_t * p2 = p1 + L;
     const float32_t * tw = (float32_t *) S->pTwiddle;
     float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
     float32_t m0, m1, m2, m3;
     uint32_t l;

     pCol1 = p1;
     pCol2 = p2;

     //    Define new length
     L >>= 1;
     //    Initialize mid pointers
     pMid1 = p1 + L;
     pMid2 = p2 + L;

     // do two dot Fourier transform
     for ( l = L >> 2; l > 0; l-- )
     {
         t1[0] = p1[0];
         t1[1] = p1[1];
         t1[2] = p1[2];
         t1[3] = p1[3];

         t2[0] = p2[0];
         t2[1] = p2[1];
         t2[2] = p2[2];
         t2[3] = p2[3];

         t3[0] = pMid1[0];
         t3[1] = pMid1[1];
         t3[2] = pMid1[2];
         t3[3] = pMid1[3];

         t4[0] = pMid2[0];
         t4[1] = pMid2[1];
         t4[2] = pMid2[2];
         t4[3] = pMid2[3];

         *p1++ = t1[0] + t2[0];
         *p1++ = t1[1] + t2[1];
         *p1++ = t1[2] + t2[2];
         *p1++ = t1[3] + t2[3];    // col 1

         t2[0] = t1[0] - t2[0];
         t2[1] = t1[1] - t2[1];
         t2[2] = t1[2] - t2[2];
         t2[3] = t1[3] - t2[3];    // for col 2

         *pMid1++ = t3[0] + t4[0];
         *pMid1++ = t3[1] + t4[1];
         *pMid1++ = t3[2] + t4[2];
         *pMid1++ = t3[3] + t4[3]; // col 1

         t4[0] = t4[0] - t3[0];
         t4[1] = t4[1] - t3[1];
         t4[2] = t4[2] - t3[2];
         t4[3] = t4[3] - t3[3];    // for col 2

         twR = *tw++;
         twI = *tw++;

         // multiply by twiddle factors
         m0 = t2[0] * twR;
         m1 = t2[1] * twI;
         m2 = t2[1] * twR;
         m3 = t2[0] * twI;

         // R  =  R  *  Tr - I * Ti
         *p2++ = m0 + m1;
         // I  =  I  *  Tr + R * Ti
         *p2++ = m2 - m3;

         // use vertical symmetry
         //  0.9988 - 0.0491i <==> -0.0491 - 0.9988i
         m0 = t4[0] * twI;
         m1 = t4[1] * twR;
         m2 = t4[1] * twI;
         m3 = t4[0] * twR;

         *pMid2++ = m0 - m1;
         *pMid2++ = m2 + m3;

         twR = *tw++;
         twI = *tw++;

         m0 = t2[2] * twR;
         m1 = t2[3] * twI;
         m2 = t2[3] * twR;
         m3 = t2[2] * twI;

         *p2++ = m0 + m1;
         *p2++ = m2 - m3;

         m0 = t4[2] * twI;
         m1 = t4[3] * twR;
         m2 = t4[3] * twI;
         m3 = t4[2] * twR;

         *pMid2++ = m0 - m1;
         *pMid2++ = m2 + m3;
     }

     // first col
     arm_radix8_butterfly_f32( pCol1, L, (float32_t *) S->pTwiddle, 2u);
     // second col
     arm_radix8_butterfly_f32( pCol2, L, (float32_t *) S->pTwiddle, 2u);
 }

 void arm_cfft_radix8by4_f32( arm_cfft_instance_f32 * S, float32_t * p1)
 {
     uint32_t    L  = S->fftLen >> 1;
     float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
     const float32_t *tw2, *tw3, *tw4;
     float32_t * p2 = p1 + L;
     float32_t * p3 = p2 + L;
     float32_t * p4 = p3 + L;
     float32_t t2[4], t3[4], t4[4], twR, twI;
     float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
     float32_t m0, m1, m2, m3;
     uint32_t l, twMod2, twMod3, twMod4;

     pCol1 = p1;         // points to real values by default
     pCol2 = p2;
     pCol3 = p3;
     pCol4 = p4;
     pEnd1 = p2 - 1;     // points to imaginary values by default
     pEnd2 = p3 - 1;
     pEnd3 = p4 - 1;
     pEnd4 = pEnd3 + L;

     tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;

     L >>= 1;

     // do four dot Fourier transform

     twMod2 = 2;
     twMod3 = 4;
     twMod4 = 6;

     // TOP
     p1ap3_0 = p1[0] + p3[0];
     p1sp3_0 = p1[0] - p3[0];
     p1ap3_1 = p1[1] + p3[1];
     p1sp3_1 = p1[1] - p3[1];

     // col 2
     t2[0] = p1sp3_0 + p2[1] - p4[1];
     t2[1] = p1sp3_1 - p2[0] + p4[0];
     // col 3
     t3[0] = p1ap3_0 - p2[0] - p4[0];
     t3[1] = p1ap3_1 - p2[1] - p4[1];
     // col 4
     t4[0] = p1sp3_0 - p2[1] + p4[1];
     t4[1] = p1sp3_1 + p2[0] - p4[0];
     // col 1
     *p1++ = p1ap3_0 + p2[0] + p4[0];
     *p1++ = p1ap3_1 + p2[1] + p4[1];

     // Twiddle factors are ones
     *p2++ = t2[0];
     *p2++ = t2[1];
     *p3++ = t3[0];
     *p3++ = t3[1];
     *p4++ = t4[0];
     *p4++ = t4[1];

     tw2 += twMod2;
     tw3 += twMod3;
     tw4 += twMod4;

     for (l = (L - 2) >> 1; l > 0; l-- )
     {
         // TOP
         p1ap3_0 = p1[0] + p3[0];
         p1sp3_0 = p1[0] - p3[0];
         p1ap3_1 = p1[1] + p3[1];
         p1sp3_1 = p1[1] - p3[1];
         // col 2
         t2[0] = p1sp3_0 + p2[1] - p4[1];
         t2[1] = p1sp3_1 - p2[0] + p4[0];
         // col 3
         t3[0] = p1ap3_0 - p2[0] - p4[0];
         t3[1] = p1ap3_1 - p2[1] - p4[1];
         // col 4
         t4[0] = p1sp3_0 - p2[1] + p4[1];
         t4[1] = p1sp3_1 + p2[0] - p4[0];
         // col 1 - top
         *p1++ = p1ap3_0 + p2[0] + p4[0];
         *p1++ = p1ap3_1 + p2[1] + p4[1];

         // BOTTOM
         p1ap3_1 = pEnd1[-1] + pEnd3[-1];
         p1sp3_1 = pEnd1[-1] - pEnd3[-1];
         p1ap3_0 = pEnd1[0] + pEnd3[0];
         p1sp3_0 = pEnd1[0] - pEnd3[0];
         // col 2
         t2[2] = pEnd2[0]  - pEnd4[0] + p1sp3_1;
         t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
         // col 3
         t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
         t3[3] = p1ap3_0 - pEnd2[0]  - pEnd4[0];
         // col 4
         t4[2] = pEnd2[0]  - pEnd4[0]  - p1sp3_1;
         t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
         // col 1 - Bottom
         *pEnd1-- = p1ap3_0 + pEnd2[0] + pEnd4[0];
         *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];

         // COL 2
         // read twiddle factors
         twR = *tw2++;
         twI = *tw2++;
         // multiply by twiddle factors
         //  let    Z1 = a + i(b),   Z2 = c + i(d)
         //   =>  Z1 * Z2  =  (a*c - b*d) + i(b*c + a*d)

         // Top
         m0 = t2[0] * twR;
         m1 = t2[1] * twI;
         m2 = t2[1] * twR;
         m3 = t2[0] * twI;

         *p2++ = m0 + m1;
         *p2++ = m2 - m3;
         // use vertical symmetry col 2
         // 0.9997 - 0.0245i  <==>  0.0245 - 0.9997i
         // Bottom
         m0 = t2[3] * twI;
         m1 = t2[2] * twR;
         m2 = t2[2] * twI;
         m3 = t2[3] * twR;

         *pEnd2-- = m0 - m1;
         *pEnd2-- = m2 + m3;

         // COL 3
         twR = tw3[0];
         twI = tw3[1];
         tw3 += twMod3;
         // Top
         m0 = t3[0] * twR;
         m1 = t3[1] * twI;
         m2 = t3[1] * twR;
         m3 = t3[0] * twI;

         *p3++ = m0 + m1;
         *p3++ = m2 - m3;
         // use vertical symmetry col 3
         // 0.9988 - 0.0491i  <==>  -0.9988 - 0.0491i
         // Bottom
         m0 = -t3[3] * twR;
         m1 = t3[2] * twI;
         m2 = t3[2] * twR;
         m3 = t3[3] * twI;

         *pEnd3-- = m0 - m1;
         *pEnd3-- = m3 - m2;

         // COL 4
         twR = tw4[0];
         twI = tw4[1];
         tw4 += twMod4;
         // Top
         m0 = t4[0] * twR;
         m1 = t4[1] * twI;
         m2 = t4[1] * twR;
         m3 = t4[0] * twI;

         *p4++ = m0 + m1;
         *p4++ = m2 - m3;
         // use vertical symmetry col 4
         // 0.9973 - 0.0736i  <==>  -0.0736 + 0.9973i
         // Bottom
         m0 = t4[3] * twI;
         m1 = t4[2] * twR;
         m2 = t4[2] * twI;
         m3 = t4[3] * twR;

         *pEnd4-- = m0 - m1;
         *pEnd4-- = m2 + m3;
     }

     //MIDDLE
     // Twiddle factors are
     //  1.0000  0.7071-0.7071i  -1.0000i  -0.7071-0.7071i
     p1ap3_0 = p1[0] + p3[0];
     p1sp3_0 = p1[0] - p3[0];
     p1ap3_1 = p1[1] + p3[1];
     p1sp3_1 = p1[1] - p3[1];

     // col 2
     t2[0] = p1sp3_0 + p2[1] - p4[1];
     t2[1] = p1sp3_1 - p2[0] + p4[0];
     // col 3
     t3[0] = p1ap3_0 - p2[0] - p4[0];
     t3[1] = p1ap3_1 - p2[1] - p4[1];
     // col 4
     t4[0] = p1sp3_0 - p2[1] + p4[1];
     t4[1] = p1sp3_1 + p2[0] - p4[0];
     // col 1 - Top
     *p1++ = p1ap3_0 + p2[0] + p4[0];
     *p1++ = p1ap3_1 + p2[1] + p4[1];

     // COL 2
     twR = tw2[0];
     twI = tw2[1];

     m0 = t2[0] * twR;
     m1 = t2[1] * twI;
     m2 = t2[1] * twR;
     m3 = t2[0] * twI;

     *p2++ = m0 + m1;
     *p2++ = m2 - m3;
     // COL 3
     twR = tw3[0];
     twI = tw3[1];

     m0 = t3[0] * twR;
     m1 = t3[1] * twI;
     m2 = t3[1] * twR;
     m3 = t3[0] * twI;

     *p3++ = m0 + m1;
     *p3++ = m2 - m3;
     // COL 4
     twR = tw4[0];
     twI = tw4[1];

     m0 = t4[0] * twR;
     m1 = t4[1] * twI;
     m2 = t4[1] * twR;
     m3 = t4[0] * twI;

     *p4++ = m0 + m1;
     *p4++ = m2 - m3;

     // first col
     arm_radix8_butterfly_f32( pCol1, L, (float32_t *) S->pTwiddle, 4u);
     // second col
     arm_radix8_butterfly_f32( pCol2, L, (float32_t *) S->pTwiddle, 4u);
     // third col
     arm_radix8_butterfly_f32( pCol3, L, (float32_t *) S->pTwiddle, 4u);
     // fourth col
     arm_radix8_butterfly_f32( pCol4, L, (float32_t *) S->pTwiddle, 4u);
 }

 /**
 * @addtogroup ComplexFFT
 * @{
 */

 /**
 * @details
 * @brief       Processing function for the floating-point complex FFT.
 * @param[in]      *S    points to an instance of the floating-point CFFT structure.
 * @param[in, out] *p1   points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place.
 * @param[in]     ifftFlag       flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
 * @param[in]     bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
 * @return none.
 */

 void arm_cfft_f32(
     const arm_cfft_instance_f32 * S,
     float32_t * p1,
     uint8_t ifftFlag,
     uint8_t bitReverseFlag)
 {
     uint32_t  L = S->fftLen, l;
     float32_t invL, * pSrc;

     if(ifftFlag == 1u)
     {
         /*  Conjugate input data  */
         pSrc = p1 + 1;
         for(l=0; l<L; l++)
         {
             *pSrc = -*pSrc;
             pSrc += 2;
         }
     }

     switch (L)
     {
     case 16:
     case 128:
     case 1024:
         arm_cfft_radix8by2_f32  ( (arm_cfft_instance_f32 *) S, p1);
         break;
     case 32:
     case 256:
     case 2048:
         arm_cfft_radix8by4_f32  ( (arm_cfft_instance_f32 *) S, p1);
         break;
     case 64:
     case 512:
     case 4096:
         arm_radix8_butterfly_f32( p1, L, (float32_t *) S->pTwiddle, 1);
         break;
     }

     if( bitReverseFlag )
         arm_bitreversal_32((uint32_t*)p1,S->bitRevLength,S->pBitRevTable);

     if(ifftFlag == 1u)
     {
         invL = 1.0f/(float32_t)L;
         /*  Conjugate and scale output data */
         pSrc = p1;
         for(l=0; l<L; l++)
         {
             *pSrc++ *=   invL ;
             *pSrc  = -(*pSrc) * invL;
             pSrc++;
         }
     }
 }

 /**
 * @} end of ComplexFFT group
 */
	/* ----------------------------------------------------------------------
	* Copyright (C) 2010-2014 ARM Limited. All rights reserved.
	*
	* $Date: 19. March 2015
	* $Revision: V.1.4.5
	*
	* Project: CMSIS DSP Library
	* Title: arm_cfft_f32.c
	*
	* Description: Combined Radix Decimation in Frequency CFFT Floating point processing function
	*
	* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* - Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* - Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	* - Neither the name of ARM LIMITED nor the names of its contributors
	* may be used to endorse or promote products derived from this
	* software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
	* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	* POSSIBILITY OF SUCH DAMAGE.
	* -------------------------------------------------------------------- */

	#include "arm_math.h"
	#include "arm_common_tables.h"

	extern void arm_radix8_butterfly_f32(
	float32_t * pSrc,
	uint16_t fftLen,
	const float32_t * pCoef,
	uint16_t twidCoefModifier);

	extern void arm_bitreversal_32(
	uint32_t * pSrc,
	const uint16_t bitRevLen,
	const uint16_t * pBitRevTable);

	/**
	* @ingroup groupTransforms
	*/

	/**
	* @defgroup ComplexFFT Complex FFT Functions
	*
	* \par
	* The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
	* Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
	* than the DFT, especially for long lengths.
	* The algorithms described in this section
	* operate on complex data. A separate set of functions is devoted to handling
	* of real sequences.
	* \par
	* There are separate algorithms for handling floating-point, Q15, and Q31 data
	* types. The algorithms available for each data type are described next.
	* \par
	* The FFT functions operate in-place. That is, the array holding the input data
	* will also be used to hold the corresponding result. The input data is complex
	* and contains <code>2*fftLen</code> interleaved values as shown below.
	* <pre> {real[0], imag[0], real[1], imag[1],..} </pre>
	* The FFT result will be contained in the same array and the frequency domain
	* values will have the same interleaving.
	*
	* \par Floating-point
	* The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
	* stages are performed along with a single radix-2 or radix-4 stage, as needed.
	* The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
	* a different twiddle factor table.
	* \par
	* The function uses the standard FFT definition and output values may grow by a
	* factor of <code>fftLen</code> when computing the forward transform. The
	* inverse transform includes a scale of <code>1/fftLen</code> as part of the
	* calculation and this matches the textbook definition of the inverse FFT.
	* \par
	* Pre-initialized data structures containing twiddle factors and bit reversal
	* tables are provided and defined in <code>arm_const_structs.h</code>. Include
	* this header in your function and then pass one of the constant structures as
	* an argument to arm_cfft_f32. For example:
	* \par
	* <code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
	* \par
	* computes a 64-point inverse complex FFT including bit reversal.
	* The data structures are treated as constant data and not modified during the
	* calculation. The same data structure can be reused for multiple transforms
	* including mixing forward and inverse transforms.
	* \par
	* Earlier releases of the library provided separate radix-2 and radix-4
	* algorithms that operated on floating-point data. These functions are still
	* provided but are deprecated. The older functions are slower and less general
	* than the new functions.
	* \par
	* An example of initialization of the constants for the arm_cfft_f32 function follows:
	* \code
	* const static arm_cfft_instance_f32 *S;
	* ...
	* switch (length) {
	* case 16:
	* S = &arm_cfft_sR_f32_len16;
	* break;
	* case 32:
	* S = &arm_cfft_sR_f32_len32;
	* break;
	* case 64:
	* S = &arm_cfft_sR_f32_len64;
	* break;
	* case 128:
	* S = &arm_cfft_sR_f32_len128;
	* break;
	* case 256:
	* S = &arm_cfft_sR_f32_len256;
	* break;
	* case 512:
	* S = &arm_cfft_sR_f32_len512;
	* break;
	* case 1024:
	* S = &arm_cfft_sR_f32_len1024;
	* break;
	* case 2048:
	* S = &arm_cfft_sR_f32_len2048;
	* break;
	* case 4096:
	* S = &arm_cfft_sR_f32_len4096;
	* break;
	* }
	* \endcode
	* \par Q15 and Q31
	* The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-4
	* stages are performed along with a single radix-2 stage, as needed.
	* The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
	* a different twiddle factor table.
	* \par
	* The function uses the standard FFT definition and output values may grow by a
	* factor of <code>fftLen</code> when computing the forward transform. The
	* inverse transform includes a scale of <code>1/fftLen</code> as part of the
	* calculation and this matches the textbook definition of the inverse FFT.
	* \par
	* Pre-initialized data structures containing twiddle factors and bit reversal
	* tables are provided and defined in <code>arm_const_structs.h</code>. Include
	* this header in your function and then pass one of the constant structures as
	* an argument to arm_cfft_q31. For example:
	* \par
	* <code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
	* \par
	* computes a 64-point inverse complex FFT including bit reversal.
	* The data structures are treated as constant data and not modified during the
	* calculation. The same data structure can be reused for multiple transforms
	* including mixing forward and inverse transforms.
	* \par
	* Earlier releases of the library provided separate radix-2 and radix-4
	* algorithms that operated on floating-point data. These functions are still
	* provided but are deprecated. The older functions are slower and less general
	* than the new functions.
	* \par
	* An example of initialization of the constants for the arm_cfft_q31 function follows:
	* \code
	* const static arm_cfft_instance_q31 *S;
	* ...
	* switch (length) {
	* case 16:
	* S = &arm_cfft_sR_q31_len16;
	* break;
	* case 32:
	* S = &arm_cfft_sR_q31_len32;
	* break;
	* case 64:
	* S = &arm_cfft_sR_q31_len64;
	* break;
	* case 128:
	* S = &arm_cfft_sR_q31_len128;
	* break;
	* case 256:
	* S = &arm_cfft_sR_q31_len256;
	* break;
	* case 512:
	* S = &arm_cfft_sR_q31_len512;
	* break;
	* case 1024:
	* S = &arm_cfft_sR_q31_len1024;
	* break;
	* case 2048:
	* S = &arm_cfft_sR_q31_len2048;
	* break;
	* case 4096:
	* S = &arm_cfft_sR_q31_len4096;
	* break;
	* }
	* \endcode
	*
	*/

	void arm_cfft_radix8by2_f32( arm_cfft_instance_f32 * S, float32_t * p1)
	{
	uint32_t L = S->fftLen;
	float32_t * pCol1, * pCol2, * pMid1, * pMid2;
	float32_t * p2 = p1 + L;
	const float32_t * tw = (float32_t *) S->pTwiddle;
	float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
	float32_t m0, m1, m2, m3;
	uint32_t l;

	pCol1 = p1;
	pCol2 = p2;

	// Define new length
	L >>= 1;
	// Initialize mid pointers
	pMid1 = p1 + L;
	pMid2 = p2 + L;

	// do two dot Fourier transform
	for ( l = L >> 2; l > 0; l-- )
	{
	t1[0] = p1[0];
	t1[1] = p1[1];
	t1[2] = p1[2];
	t1[3] = p1[3];

	t2[0] = p2[0];
	t2[1] = p2[1];
	t2[2] = p2[2];
	t2[3] = p2[3];

	t3[0] = pMid1[0];
	t3[1] = pMid1[1];
	t3[2] = pMid1[2];
	t3[3] = pMid1[3];

	t4[0] = pMid2[0];
	t4[1] = pMid2[1];
	t4[2] = pMid2[2];
	t4[3] = pMid2[3];

	*p1++ = t1[0] + t2[0];
	*p1++ = t1[1] + t2[1];
	*p1++ = t1[2] + t2[2];
	*p1++ = t1[3] + t2[3]; // col 1

	t2[0] = t1[0] - t2[0];
	t2[1] = t1[1] - t2[1];
	t2[2] = t1[2] - t2[2];
	t2[3] = t1[3] - t2[3]; // for col 2

	*pMid1++ = t3[0] + t4[0];
	*pMid1++ = t3[1] + t4[1];
	*pMid1++ = t3[2] + t4[2];
	*pMid1++ = t3[3] + t4[3]; // col 1

	t4[0] = t4[0] - t3[0];
	t4[1] = t4[1] - t3[1];
	t4[2] = t4[2] - t3[2];
	t4[3] = t4[3] - t3[3]; // for col 2

	twR = *tw++;
	twI = *tw++;

	// multiply by twiddle factors
	m0 = t2[0] * twR;
	m1 = t2[1] * twI;
	m2 = t2[1] * twR;
	m3 = t2[0] * twI;

	// R = R * Tr - I * Ti
	*p2++ = m0 + m1;
	// I = I * Tr + R * Ti
	*p2++ = m2 - m3;

	// use vertical symmetry
	// 0.9988 - 0.0491i <==> -0.0491 - 0.9988i
	m0 = t4[0] * twI;
	m1 = t4[1] * twR;
	m2 = t4[1] * twI;
	m3 = t4[0] * twR;

	*pMid2++ = m0 - m1;
	*pMid2++ = m2 + m3;

	twR = *tw++;
	twI = *tw++;

	m0 = t2[2] * twR;
	m1 = t2[3] * twI;
	m2 = t2[3] * twR;
	m3 = t2[2] * twI;

	*p2++ = m0 + m1;
	*p2++ = m2 - m3;

	m0 = t4[2] * twI;
	m1 = t4[3] * twR;
	m2 = t4[3] * twI;
	m3 = t4[2] * twR;

	*pMid2++ = m0 - m1;
	*pMid2++ = m2 + m3;
	}

	// first col
	arm_radix8_butterfly_f32( pCol1, L, (float32_t *) S->pTwiddle, 2u);
	// second col
	arm_radix8_butterfly_f32( pCol2, L, (float32_t *) S->pTwiddle, 2u);
	}

	void arm_cfft_radix8by4_f32( arm_cfft_instance_f32 * S, float32_t * p1)
	{
	uint32_t L = S->fftLen >> 1;
	float32_t * pCol1, pCol2, pCol3, pCol4, pEnd1, pEnd2, pEnd3, *pEnd4;
	const float32_t tw2, tw3, *tw4;
	float32_t * p2 = p1 + L;
	float32_t * p3 = p2 + L;
	float32_t * p4 = p3 + L;
	float32_t t2[4], t3[4], t4[4], twR, twI;
	float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
	float32_t m0, m1, m2, m3;
	uint32_t l, twMod2, twMod3, twMod4;

	pCol1 = p1; // points to real values by default
	pCol2 = p2;
	pCol3 = p3;
	pCol4 = p4;
	pEnd1 = p2 - 1; // points to imaginary values by default
	pEnd2 = p3 - 1;
	pEnd3 = p4 - 1;
	pEnd4 = pEnd3 + L;

	tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;

	L >>= 1;

	// do four dot Fourier transform

	twMod2 = 2;
	twMod3 = 4;
	twMod4 = 6;

	// TOP
	p1ap3_0 = p1[0] + p3[0];
	p1sp3_0 = p1[0] - p3[0];
	p1ap3_1 = p1[1] + p3[1];
	p1sp3_1 = p1[1] - p3[1];

	// col 2
	t2[0] = p1sp3_0 + p2[1] - p4[1];
	t2[1] = p1sp3_1 - p2[0] + p4[0];
	// col 3
	t3[0] = p1ap3_0 - p2[0] - p4[0];
	t3[1] = p1ap3_1 - p2[1] - p4[1];
	// col 4
	t4[0] = p1sp3_0 - p2[1] + p4[1];
	t4[1] = p1sp3_1 + p2[0] - p4[0];
	// col 1
	*p1++ = p1ap3_0 + p2[0] + p4[0];
	*p1++ = p1ap3_1 + p2[1] + p4[1];

	// Twiddle factors are ones
	*p2++ = t2[0];
	*p2++ = t2[1];
	*p3++ = t3[0];
	*p3++ = t3[1];
	*p4++ = t4[0];
	*p4++ = t4[1];

	tw2 += twMod2;
	tw3 += twMod3;
	tw4 += twMod4;

	for (l = (L - 2) >> 1; l > 0; l-- )
	{
	// TOP
	p1ap3_0 = p1[0] + p3[0];
	p1sp3_0 = p1[0] - p3[0];
	p1ap3_1 = p1[1] + p3[1];
	p1sp3_1 = p1[1] - p3[1];
	// col 2
	t2[0] = p1sp3_0 + p2[1] - p4[1];
	t2[1] = p1sp3_1 - p2[0] + p4[0];
	// col 3
	t3[0] = p1ap3_0 - p2[0] - p4[0];
	t3[1] = p1ap3_1 - p2[1] - p4[1];
	// col 4
	t4[0] = p1sp3_0 - p2[1] + p4[1];
	t4[1] = p1sp3_1 + p2[0] - p4[0];
	// col 1 - top
	*p1++ = p1ap3_0 + p2[0] + p4[0];
	*p1++ = p1ap3_1 + p2[1] + p4[1];

	// BOTTOM
	p1ap3_1 = pEnd1[-1] + pEnd3[-1];
	p1sp3_1 = pEnd1[-1] - pEnd3[-1];
	p1ap3_0 = pEnd1[0] + pEnd3[0];
	p1sp3_0 = pEnd1[0] - pEnd3[0];
	// col 2
	t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
	t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
	// col 3
	t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
	t3[3] = p1ap3_0 - pEnd2[0] - pEnd4[0];
	// col 4
	t4[2] = pEnd2[0] - pEnd4[0] - p1sp3_1;
	t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
	// col 1 - Bottom
	*pEnd1-- = p1ap3_0 + pEnd2[0] + pEnd4[0];
	*pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];

	// COL 2
	// read twiddle factors
	twR = *tw2++;
	twI = *tw2++;
	// multiply by twiddle factors
	// let Z1 = a + i(b), Z2 = c + i(d)
	// => Z1 * Z2 = (ac - bd) + i(bc + ad)

	// Top
	m0 = t2[0] * twR;
	m1 = t2[1] * twI;
	m2 = t2[1] * twR;
	m3 = t2[0] * twI;

	*p2++ = m0 + m1;
	*p2++ = m2 - m3;
	// use vertical symmetry col 2
	// 0.9997 - 0.0245i <==> 0.0245 - 0.9997i
	// Bottom
	m0 = t2[3] * twI;
	m1 = t2[2] * twR;
	m2 = t2[2] * twI;
	m3 = t2[3] * twR;

	*pEnd2-- = m0 - m1;
	*pEnd2-- = m2 + m3;

	// COL 3
	twR = tw3[0];
	twI = tw3[1];
	tw3 += twMod3;
	// Top
	m0 = t3[0] * twR;
	m1 = t3[1] * twI;
	m2 = t3[1] * twR;
	m3 = t3[0] * twI;

	*p3++ = m0 + m1;
	*p3++ = m2 - m3;
	// use vertical symmetry col 3
	// 0.9988 - 0.0491i <==> -0.9988 - 0.0491i
	// Bottom
	m0 = -t3[3] * twR;
	m1 = t3[2] * twI;
	m2 = t3[2] * twR;
	m3 = t3[3] * twI;

	*pEnd3-- = m0 - m1;
	*pEnd3-- = m3 - m2;

	// COL 4
	twR = tw4[0];
	twI = tw4[1];
	tw4 += twMod4;
	// Top
	m0 = t4[0] * twR;
	m1 = t4[1] * twI;
	m2 = t4[1] * twR;
	m3 = t4[0] * twI;

	*p4++ = m0 + m1;
	*p4++ = m2 - m3;
	// use vertical symmetry col 4
	// 0.9973 - 0.0736i <==> -0.0736 + 0.9973i
	// Bottom
	m0 = t4[3] * twI;
	m1 = t4[2] * twR;
	m2 = t4[2] * twI;
	m3 = t4[3] * twR;

	*pEnd4-- = m0 - m1;
	*pEnd4-- = m2 + m3;
	}

	//MIDDLE
	// Twiddle factors are
	// 1.0000 0.7071-0.7071i -1.0000i -0.7071-0.7071i
	p1ap3_0 = p1[0] + p3[0];
	p1sp3_0 = p1[0] - p3[0];
	p1ap3_1 = p1[1] + p3[1];
	p1sp3_1 = p1[1] - p3[1];

	// col 2
	t2[0] = p1sp3_0 + p2[1] - p4[1];
	t2[1] = p1sp3_1 - p2[0] + p4[0];
	// col 3
	t3[0] = p1ap3_0 - p2[0] - p4[0];
	t3[1] = p1ap3_1 - p2[1] - p4[1];
	// col 4
	t4[0] = p1sp3_0 - p2[1] + p4[1];
	t4[1] = p1sp3_1 + p2[0] - p4[0];
	// col 1 - Top
	*p1++ = p1ap3_0 + p2[0] + p4[0];
	*p1++ = p1ap3_1 + p2[1] + p4[1];

	// COL 2
	twR = tw2[0];
	twI = tw2[1];

	m0 = t2[0] * twR;
	m1 = t2[1] * twI;
	m2 = t2[1] * twR;
	m3 = t2[0] * twI;

	*p2++ = m0 + m1;
	*p2++ = m2 - m3;
	// COL 3
	twR = tw3[0];
	twI = tw3[1];

	m0 = t3[0] * twR;
	m1 = t3[1] * twI;
	m2 = t3[1] * twR;
	m3 = t3[0] * twI;

	*p3++ = m0 + m1;
	*p3++ = m2 - m3;
	// COL 4
	twR = tw4[0];
	twI = tw4[1];

	m0 = t4[0] * twR;
	m1 = t4[1] * twI;
	m2 = t4[1] * twR;
	m3 = t4[0] * twI;

	*p4++ = m0 + m1;
	*p4++ = m2 - m3;

	// first col
	arm_radix8_butterfly_f32( pCol1, L, (float32_t *) S->pTwiddle, 4u);
	// second col
	arm_radix8_butterfly_f32( pCol2, L, (float32_t *) S->pTwiddle, 4u);
	// third col
	arm_radix8_butterfly_f32( pCol3, L, (float32_t *) S->pTwiddle, 4u);
	// fourth col
	arm_radix8_butterfly_f32( pCol4, L, (float32_t *) S->pTwiddle, 4u);
	}

	/**
	* @addtogroup ComplexFFT
	* @{
	*/

	/**
	* @details
	* @brief Processing function for the floating-point complex FFT.
	* @param[in] *S points to an instance of the floating-point CFFT structure.
	* @param[in, out] p1 points to the complex data buffer of size <code>2fftLen</code>. Processing occurs in-place.
	* @param[in] ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
	* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
	* @return none.
	*/

	void arm_cfft_f32(
	const arm_cfft_instance_f32 * S,
	float32_t * p1,
	uint8_t ifftFlag,
	uint8_t bitReverseFlag)
	{
	uint32_t L = S->fftLen, l;
	float32_t invL, * pSrc;

	if(ifftFlag == 1u)
	{
	/* Conjugate input data */
	pSrc = p1 + 1;
	for(l=0; l<L; l++)
	{
	pSrc = -pSrc;
	pSrc += 2;
	}
	}

	switch (L)
	{
	case 16:
	case 128:
	case 1024:
	arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1);
	break;
	case 32:
	case 256:
	case 2048:
	arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1);
	break;
	case 64:
	case 512:
	case 4096:
	arm_radix8_butterfly_f32( p1, L, (float32_t *) S->pTwiddle, 1);
	break;
	}

	if( bitReverseFlag )
	arm_bitreversal_32((uint32_t*)p1,S->bitRevLength,S->pBitRevTable);

	if(ifftFlag == 1u)
	{
	invL = 1.0f/(float32_t)L;
	/* Conjugate and scale output data */
	pSrc = p1;
	for(l=0; l<L; l++)
	{
	pSrc++ = invL ;
	pSrc = -(pSrc) * invL;
	pSrc++;
	}
	}
	}

	/**
	* @} end of ComplexFFT group
	*/