NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c - third_party/github/STMicroelectronics/cmsis_core - Git at Google

 /*
  * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the License); you may
  * not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  * www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /* ----------------------------------------------------------------------
  * Project:      CMSIS NN Library
  * Title:        arm_fully_connected_q15_opt.c
  * Description:  Q15 opt fully-connected layer function
  *
  * $Date:        17. January 2018
  * $Revision:    V.1.0.0
  *
  * Target Processor:  Cortex-M cores
  *
  * -------------------------------------------------------------------- */

 #include "arm_math.h"
 #include "arm_nnfunctions.h"

 /**
  *  @ingroup groupNN
  */

 /**
  * @addtogroup FC
  * @{
  */

   /**
    * @brief Q15 opt fully-connected layer function
    * @param[in]       pV          pointer to input vector
    * @param[in]       pM          pointer to matrix weights
    * @param[in]       dim_vec     length of the vector
    * @param[in]       num_of_rows number of rows in weight matrix
    * @param[in]       bias_shift  amount of left-shift for bias
    * @param[in]       out_shift   amount of right-shift for output
    * @param[in]       bias        pointer to bias
    * @param[in,out]   pOut        pointer to output vector
    * @param[in,out]   vec_buffer  pointer to buffer space for input
    * @return     The function returns <code>ARM_MATH_SUCCESS</code>
    *
    *
    * @details
    *
    * <b>Buffer size:</b>
    *
    * vec_buffer size: 0
    *
    *  Here we use only one pointer to read 4 rows in the weight
    *  matrix. So if the original matrix looks like this:
    *
    *  | a11 | a12 | a13 |
    *
    *  | a21 | a22 | a23 |
    *
    *  | a31 | a32 | a33 |
    *
    *  | a41 | a42 | a43 |
    *
    *  | a51 | a52 | a53 |
    *
    *  | a61 | a62 | a63 |
    *
    *  We operate on multiple-of-4 rows, so the first four rows become
    *
    *  | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 |
    *
    *  | a13 | a23 | a33 | a43 |
    *
    *  Remaining rows are kept the same original order.
    *
    *  So the stored weight matrix looks like this:
    *
    *
    *  | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 |
    *
    *  | a13 | a23 | a33 | a43 | a51 | a52 | a53 | a61 |
    *
    *  | a62 | a63 |
    */

 arm_status
 arm_fully_connected_q15_opt(const q15_t * pV,
                             const q15_t * pM,
                             const uint16_t dim_vec,
                             const uint16_t num_of_rows,
                             const uint16_t bias_shift,
                             const uint16_t out_shift,
                             const q15_t * bias,
                             q15_t * pOut,
                             q15_t * vec_buffer)
 {
     /*
      * For every output row r this computes
      *
      *   pOut[r] = SSAT16(((bias[r] << bias_shift) + NN_ROUND(out_shift)
      *                     + sum over c of pV[c] * weight(r, c)) >> out_shift)
      *
      * pM is consumed strictly sequentially through a single pointer (pB):
      *  - rows are first handled in groups of four; for each pair of columns
      *    the stream holds two weights for each of the four rows (8 values),
      *    followed, when dim_vec is odd, by one weight per row (4 values);
      *  - the remaining (num_of_rows & 3) rows are stored one after another,
      *    dim_vec weights each.
      */

 #if defined (ARM_MATH_DSP)
     /* Run the following code for Cortex-M4 and Cortex-M7 */

     const q15_t *pB = pM;
     q15_t    *pO = pOut;
     const q15_t *pBias = bias;
     const q15_t *pA = pV;

     uint16_t  rowCnt = num_of_rows >> 2;

     while (rowCnt)
     {
         /* Each accumulator starts from its shifted bias plus the rounding term */
         q31_t     sum =  ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
         q31_t     sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
         q31_t     sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
         q31_t     sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);

         /* Number of column pairs; each pair is one 32-bit load of the input */
         uint16_t  colCnt = dim_vec >> 1;

         /* Every group of rows re-reads the input vector from its start */
         pA = pV;

 #ifdef USE_INTRINSIC

         while (colCnt)
         {
             q31_t     inM11, inM12, inM13, inM14;
             q31_t     inV;

             /* One packed pair of inputs is reused against four weight pairs */
             inV = *__SIMD32(pA)++;
             inM11 = *__SIMD32(pB)++;
             sum = __SMLAD(inV, inM11, sum);
             inM12 = *__SIMD32(pB)++;
             sum2 = __SMLAD(inV, inM12, sum2);
             inM13 = *__SIMD32(pB)++;
             sum3 = __SMLAD(inV, inM13, sum3);
             inM14 = *__SIMD32(pB)++;
             sum4 = __SMLAD(inV, inM14, sum4);
             colCnt--;
         }

 #else

         /*
          * register needed:
          * loop counter: pairCnt (copy of colCnt)
          * accumulators: sum, sum2, sum3, sum4
          * pointers: pB, pA
          * weight data: r0, r1, r2, r3
          * activation data: r4
          *
          * The assembly loop tests its counter only after the first
          * iteration, so it must not be entered with a zero count
          * (dim_vec < 2); otherwise the counter would wrap around and the
          * loop would read far beyond both buffers.
          */
         if (colCnt)
         {
             /*
              * The counter is decremented by the assembly, so it is passed
              * as a read-write operand using a full-width register copy.
              */
             uint32_t  pairCnt = colCnt;

             /*
              * "cc" is clobbered because subs updates the condition flags;
              * "memory" is listed because the code reads through pA and pB,
              * which the operand list alone does not describe.
              */
             asm volatile ("COL_LOOP_%=:\n"
                           "ldr.w r4, [%[pA]], #4\n"
                           "ldr.w r0, [%[pB]], #16\n"
                           "smlad %[sum], r4, r0, %[sum]\n"
                           "ldr.w r1, [%[pB] , #-12]\n"
                           "smlad %[sum2], r4, r1, %[sum2]\n"
                           "ldr.w r2, [%[pB] , #-8]\n"
                           "smlad %[sum3], r4, r2, %[sum3]\n"
                           "ldr.w r3, [%[pB] , #-4]\n"
                           "smlad %[sum4], r4, r3, %[sum4]\n"
                           "subs %[colCnt], #1\n"
                           "bne COL_LOOP_%=\n"
                           :[sum] "+r"(sum), [sum2] "+r"(sum2),
                            [sum3] "+r"(sum3), [sum4] "+r"(sum4),
                            [pB] "+r"(pB), [pA] "+r"(pA),
                            [colCnt] "+r"(pairCnt)
                           :
                           :"r0", "r1", "r2", "r3", "r4", "cc", "memory");
         }

 #endif                          /* USE_INTRINSIC */

         /* Odd dim_vec: one trailing column, one weight per row */
         colCnt = dim_vec & 0x1;
         while (colCnt)
         {

             q15_t     inV = *pA++;
             q15_t     inM = *pB++;
             q15_t     inM2 = *pB++;
             q15_t     inM3 = *pB++;
             q15_t     inM4 = *pB++;

             sum += inV * inM;
             sum2 += inV * inM2;
             sum3 += inV * inM3;
             sum4 += inV * inM4;
             colCnt--;
         }                       /* while over colCnt */

         /* Scale back and saturate to 16 bits */
         *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));
         *pO++ = (q15_t) (__SSAT((sum2 >> out_shift), 16));
         *pO++ = (q15_t) (__SSAT((sum3 >> out_shift), 16));
         *pO++ = (q15_t) (__SSAT((sum4 >> out_shift), 16));

         /* adjust the pointers and counters */
         rowCnt--;
     }

     /* left-over part of the rows */
     rowCnt = num_of_rows & 0x3;

     while (rowCnt)
     {
         q31_t     sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);

         /* Four columns per iteration: two packed loads from each stream */
         uint16_t  colCnt = dim_vec >> 2;

         pA = pV;

         while (colCnt)
         {
             q31_t     inV1, inV2, inM1, inM2;

             inM1 = *__SIMD32(pB)++;
             inV1 = *__SIMD32(pA)++;
             sum = __SMLAD(inV1, inM1, sum);

             inM2 = *__SIMD32(pB)++;
             inV2 = *__SIMD32(pA)++;
             sum = __SMLAD(inV2, inM2, sum);

             colCnt--;
         }

         /* left-over of the vector */
         colCnt = dim_vec & 0x3;
         while (colCnt)
         {
             q15_t     inV = *pA++;
             q15_t     inM = *pB++;
             sum += inV * inM;
             colCnt--;
         }

         *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));

         rowCnt--;
     }

 #else
     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
     uint16_t  rowCnt = num_of_rows >> 2;
     const q15_t *pB = pM;
     const q15_t *pA;
     q15_t    *pO = pOut;
     const q15_t *pBias = bias;

     while (rowCnt)
     {
         /* Each accumulator starts from its shifted bias plus the rounding term */
         q31_t     sum =  ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
         q31_t     sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
         q31_t     sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
         q31_t     sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);

         uint16_t  colCnt = dim_vec >> 1;

         pA = pV;
         while (colCnt)
         {
             /* Two inputs are combined with two weights for each of four rows */
             q15_t     inA1 = *pA++;
             q15_t     inA2 = *pA++;

             q15_t     inB1 = *pB++;
             q15_t     inB2 = *pB++;
             sum += inA1 * inB1 + inA2 * inB2;

             inB1 = *pB++;
             inB2 = *pB++;
             sum2 += inA1 * inB1 + inA2 * inB2;

             inB1 = *pB++;
             inB2 = *pB++;
             sum3 += inA1 * inB1 + inA2 * inB2;

             inB1 = *pB++;
             inB2 = *pB++;
             sum4 += inA1 * inB1 + inA2 * inB2;

             colCnt--;
         }

         /* Odd dim_vec: one trailing column, one weight per row */
         colCnt = dim_vec & 0x1;
         while (colCnt)
         {
             q15_t     inA = *pA++;
             q15_t     inB = *pB++;
             sum += inA * inB;
             inB = *pB++;
             sum2 += inA * inB;
             inB = *pB++;
             sum3 += inA * inB;
             inB = *pB++;
             sum4 += inA * inB;
             colCnt--;
         }

         /* Scale back and saturate to 16 bits */
         *pO++ = (q15_t) __SSAT((sum >> out_shift), 16);
         *pO++ = (q15_t) __SSAT((sum2 >> out_shift), 16);
         *pO++ = (q15_t) __SSAT((sum3 >> out_shift), 16);
         *pO++ = (q15_t) __SSAT((sum4 >> out_shift), 16);

         rowCnt--;
     }

     /* left-over part of the rows: plain dot product per row */
     rowCnt = num_of_rows & 0x3;

     while (rowCnt)
     {
         int       ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
         int       j;

         pA = pV;
         for (j = 0; j < dim_vec; j++)
         {
             q15_t     inA = *pA++;
             q15_t     inB = *pB++;
             ip_out += inA * inB;
         }
         *pO++ = (q15_t) __SSAT((ip_out >> out_shift), 16);

         rowCnt--;
     }

 #endif                          /* ARM_MATH_DSP */

     /* vec_buffer is never read or written by this function */
     (void) vec_buffer;

     /* This function has no failure path */
     return (ARM_MATH_SUCCESS);

 }

 /**
  * @} end of FC group
  */
	/*
	* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
	*
	* SPDX-License-Identifier: Apache-2.0
	*
	* Licensed under the Apache License, Version 2.0 (the License); you may
	* not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/* ----------------------------------------------------------------------
	* Project: CMSIS NN Library
	* Title: arm_fully_connected_q15_opt.c
	* Description: Q15 opt fully-connected layer function
	*
	* $Date: 17. January 2018
	* $Revision: V.1.0.0
	*
	* Target Processor: Cortex-M cores
	*
	* -------------------------------------------------------------------- */

	#include "arm_math.h"
	#include "arm_nnfunctions.h"

	/**
	* @ingroup groupNN
	*/

	/**
	* @addtogroup FC
	* @{
	*/

	/**
	* @brief Q15 opt fully-connected layer function
	* @param[in] pV pointer to input vector
	* @param[in] pM pointer to matrix weights
	* @param[in] dim_vec length of the vector
	* @param[in] num_of_rows number of rows in weight matrix
	* @param[in] bias_shift amount of left-shift for bias
	* @param[in] out_shift amount of right-shift for output
	* @param[in] bias pointer to bias
	* @param[in,out] pOut pointer to output vector
	* @param[in,out] vec_buffer pointer to buffer space for input
	* @return The function returns <code>ARM_MATH_SUCCESS</code>
	*
	*
	* @details
	*
	* <b>Buffer size:</b>
	*
	* vec_buffer size: 0
	*
	* Here we use only one pointer to read 4 rows in the weight
	* matrix. So if the original matrix looks like this:
	*
	* \| a11 \| a12 \| a13 \|
	*
	* \| a21 \| a22 \| a23 \|
	*
	* \| a31 \| a32 \| a33 \|
	*
	* \| a41 \| a42 \| a43 \|
	*
	* \| a51 \| a52 \| a53 \|
	*
	* \| a61 \| a62 \| a63 \|
	*
	* We operate on multiple-of-4 rows, so the first four rows become
	*
	* \| a11 \| a12 \| a21 \| a22 \| a31 \| a32 \| a41 \| a42 \|
	*
	* \| a13 \| a23 \| a33 \| a43 \|
	*
	* Remaining rows are kept the same original order.
	*
	* So the stored weight matrix looks like this:
	*
	*
	* \| a11 \| a12 \| a21 \| a22 \| a31 \| a32 \| a41 \| a42 \|
	*
	* \| a13 \| a23 \| a33 \| a43 \| a51 \| a52 \| a53 \| a61 \|
	*
	* \| a62 \| a63 \|
	*/

	arm_status
	arm_fully_connected_q15_opt(const q15_t * pV,
								const q15_t * pM,
								const uint16_t dim_vec,
								const uint16_t num_of_rows,
								const uint16_t bias_shift,
								const uint16_t out_shift,
								const q15_t * bias,
								q15_t * pOut,
								q15_t * vec_buffer)
	{
		/*
		 * For every output row r this computes
		 *
		 *   pOut[r] = SSAT16(((bias[r] << bias_shift) + NN_ROUND(out_shift)
		 *                     + sum over c of pV[c] * weight(r, c)) >> out_shift)
		 *
		 * pM is consumed strictly sequentially through a single pointer (pB):
		 *  - rows are first handled in groups of four; for each pair of columns
		 *    the stream holds two weights for each of the four rows (8 values),
		 *    followed, when dim_vec is odd, by one weight per row (4 values);
		 *  - the remaining (num_of_rows & 3) rows are stored one after another,
		 *    dim_vec weights each.
		 */

	#if defined (ARM_MATH_DSP)
		/* Run the following code for Cortex-M4 and Cortex-M7 */

		const q15_t *pB = pM;
		q15_t *pO = pOut;
		const q15_t *pBias = bias;
		const q15_t *pA = pV;

		uint16_t rowCnt = num_of_rows >> 2;

		while (rowCnt)
		{
			/* Each accumulator starts from its shifted bias plus the rounding term */
			q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
			q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
			q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
			q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);

			/* Number of column pairs; each pair is one 32-bit load of the input */
			uint16_t colCnt = dim_vec >> 1;

			/* Every group of rows re-reads the input vector from its start */
			pA = pV;

	#ifdef USE_INTRINSIC

			while (colCnt)
			{
				q31_t inM11, inM12, inM13, inM14;
				q31_t inV;

				/* One packed pair of inputs is reused against four weight pairs */
				inV = *__SIMD32(pA)++;
				inM11 = *__SIMD32(pB)++;
				sum = __SMLAD(inV, inM11, sum);
				inM12 = *__SIMD32(pB)++;
				sum2 = __SMLAD(inV, inM12, sum2);
				inM13 = *__SIMD32(pB)++;
				sum3 = __SMLAD(inV, inM13, sum3);
				inM14 = *__SIMD32(pB)++;
				sum4 = __SMLAD(inV, inM14, sum4);
				colCnt--;
			}

	#else

			/*
			 * register needed:
			 * loop counter: pairCnt (copy of colCnt)
			 * accumulators: sum, sum2, sum3, sum4
			 * pointers: pB, pA
			 * weight data: r0, r1, r2, r3
			 * activation data: r4
			 *
			 * The assembly loop tests its counter only after the first
			 * iteration, so it must not be entered with a zero count
			 * (dim_vec < 2); otherwise the counter would wrap around and the
			 * loop would read far beyond both buffers.
			 */
			if (colCnt)
			{
				/*
				 * The counter is decremented by the assembly, so it is passed
				 * as a read-write operand using a full-width register copy.
				 */
				uint32_t pairCnt = colCnt;

				/*
				 * "cc" is clobbered because subs updates the condition flags;
				 * "memory" is listed because the code reads through pA and pB,
				 * which the operand list alone does not describe.
				 */
				asm volatile ("COL_LOOP_%=:\n"
					"ldr.w r4, [%[pA]], #4\n"
					"ldr.w r0, [%[pB]], #16\n"
					"smlad %[sum], r4, r0, %[sum]\n"
					"ldr.w r1, [%[pB] , #-12]\n"
					"smlad %[sum2], r4, r1, %[sum2]\n"
					"ldr.w r2, [%[pB] , #-8]\n"
					"smlad %[sum3], r4, r2, %[sum3]\n"
					"ldr.w r3, [%[pB] , #-4]\n"
					"smlad %[sum4], r4, r3, %[sum4]\n"
					"subs %[colCnt], #1\n"
					"bne COL_LOOP_%=\n"
					:[sum] "+r"(sum), [sum2] "+r"(sum2),
					 [sum3] "+r"(sum3), [sum4] "+r"(sum4),
					 [pB] "+r"(pB), [pA] "+r"(pA),
					 [colCnt] "+r"(pairCnt)
					:
					:"r0", "r1", "r2", "r3", "r4", "cc", "memory");
			}

	#endif /* USE_INTRINSIC */

			/* Odd dim_vec: one trailing column, one weight per row */
			colCnt = dim_vec & 0x1;
			while (colCnt)
			{
				q15_t inV = *pA++;
				q15_t inM = *pB++;
				q15_t inM2 = *pB++;
				q15_t inM3 = *pB++;
				q15_t inM4 = *pB++;

				sum += inV * inM;
				sum2 += inV * inM2;
				sum3 += inV * inM3;
				sum4 += inV * inM4;
				colCnt--;
			} /* while over colCnt */

			/* Scale back and saturate to 16 bits */
			*pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));
			*pO++ = (q15_t) (__SSAT((sum2 >> out_shift), 16));
			*pO++ = (q15_t) (__SSAT((sum3 >> out_shift), 16));
			*pO++ = (q15_t) (__SSAT((sum4 >> out_shift), 16));

			/* adjust the pointers and counters */
			rowCnt--;
		}

		/* left-over part of the rows */
		rowCnt = num_of_rows & 0x3;

		while (rowCnt)
		{
			q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);

			/* Four columns per iteration: two packed loads from each stream */
			uint16_t colCnt = dim_vec >> 2;

			pA = pV;

			while (colCnt)
			{
				q31_t inV1, inV2, inM1, inM2;

				inM1 = *__SIMD32(pB)++;
				inV1 = *__SIMD32(pA)++;
				sum = __SMLAD(inV1, inM1, sum);

				inM2 = *__SIMD32(pB)++;
				inV2 = *__SIMD32(pA)++;
				sum = __SMLAD(inV2, inM2, sum);

				colCnt--;
			}

			/* left-over of the vector */
			colCnt = dim_vec & 0x3;
			while (colCnt)
			{
				q15_t inV = *pA++;
				q15_t inM = *pB++;
				sum += inV * inM;
				colCnt--;
			}

			*pO++ = (q15_t) (__SSAT((sum >> out_shift), 16));

			rowCnt--;
		}

	#else
		/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
		uint16_t rowCnt = num_of_rows >> 2;
		const q15_t *pB = pM;
		const q15_t *pA;
		q15_t *pO = pOut;
		const q15_t *pBias = bias;

		while (rowCnt)
		{
			/* Each accumulator starts from its shifted bias plus the rounding term */
			q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
			q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
			q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
			q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);

			uint16_t colCnt = dim_vec >> 1;

			pA = pV;
			while (colCnt)
			{
				/* Two inputs are combined with two weights for each of four rows */
				q15_t inA1 = *pA++;
				q15_t inA2 = *pA++;

				q15_t inB1 = *pB++;
				q15_t inB2 = *pB++;
				sum += inA1 * inB1 + inA2 * inB2;

				inB1 = *pB++;
				inB2 = *pB++;
				sum2 += inA1 * inB1 + inA2 * inB2;

				inB1 = *pB++;
				inB2 = *pB++;
				sum3 += inA1 * inB1 + inA2 * inB2;

				inB1 = *pB++;
				inB2 = *pB++;
				sum4 += inA1 * inB1 + inA2 * inB2;

				colCnt--;
			}

			/* Odd dim_vec: one trailing column, one weight per row */
			colCnt = dim_vec & 0x1;
			while (colCnt)
			{
				q15_t inA = *pA++;
				q15_t inB = *pB++;
				sum += inA * inB;
				inB = *pB++;
				sum2 += inA * inB;
				inB = *pB++;
				sum3 += inA * inB;
				inB = *pB++;
				sum4 += inA * inB;
				colCnt--;
			}

			/* Scale back and saturate to 16 bits */
			*pO++ = (q15_t) __SSAT((sum >> out_shift), 16);
			*pO++ = (q15_t) __SSAT((sum2 >> out_shift), 16);
			*pO++ = (q15_t) __SSAT((sum3 >> out_shift), 16);
			*pO++ = (q15_t) __SSAT((sum4 >> out_shift), 16);

			rowCnt--;
		}

		/* left-over part of the rows: plain dot product per row */
		rowCnt = num_of_rows & 0x3;

		while (rowCnt)
		{
			int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
			int j;

			pA = pV;
			for (j = 0; j < dim_vec; j++)
			{
				q15_t inA = *pA++;
				q15_t inB = *pB++;
				ip_out += inA * inB;
			}
			*pO++ = (q15_t) __SSAT((ip_out >> out_shift), 16);

			rowCnt--;
		}

	#endif /* ARM_MATH_DSP */

		/* vec_buffer is never read or written by this function */
		(void) vec_buffer;

		/* This function has no failure path */
		return (ARM_MATH_SUCCESS);

	}

	/**
	* @} end of FC group
	*/