/* NN/Source/PoolingFunctions/arm_pool_q7_HWC.c - third_party/github/STMicroelectronics/cmsis_core - Git at Google */

 /*
  * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the License); you may
  * not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  * www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /* ----------------------------------------------------------------------
  * Project:      CMSIS NN Library
  * Title:        arm_pool_q7_HWC.c
  * Description:  Pooling function implementations
  *
  * $Date:        20. July 2021
  * $Revision:    V.1.1.1
  *
  * Target Processor:  Cortex-M cores
  *
  * -------------------------------------------------------------------- */

 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"

 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)

 /**
  * @brief A few utility functions used by pooling functions
  *
  *
  */

 /**
  * @brief Divide q15 values by a common divisor and narrow the result to q7
  * @param[in]   buffer  q15 values to scale
  * @param[out]  target  destination for the scaled q7 values
  * @param[in]   length  number of elements to process
  * @param[in]   scale   divisor applied to every element; must be non-zero
  *
  * The integer quotient is cast to q7_t without any range check.
  */
 static void buffer_scale_back_q15_to_q7(q15_t *buffer, q7_t *target, uint16_t length, uint16_t scale)
 {
     const q15_t *src = buffer;
     q7_t *dst = target;
     uint16_t remaining = length;

     while (remaining > 0u)
     {
         *dst++ = (q7_t)(*src++ / scale);
         remaining--;
     }
 }

 /**
  * @brief Element-wise, in-place maximum of two q7 vectors
  * @param[in,out] base    vector that receives the larger value of each pair
  * @param[in]     target  vector compared against base
  * @param[in]     length  number of elements in both vectors
  *
  * Elements are handled in groups of four through the arm_nn_read_q7x4 /
  * arm_nn_write_q7x4_ia helpers (which are relied on to advance the
  * pointers they are given by address); the remaining (length % 4)
  * elements are handled one at a time.
  */
 static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, const uint16_t length)
 {
     q7_t *dst = base;
     const q7_t *src = target;
     union arm_nnword cur;
     union arm_nnword cand;
     uint16_t groups = length >> 2;
     uint16_t tail = length & 0x3;

     for (; groups > 0u; groups--)
     {
         /* dst is only advanced by the write below, src by this read */
         cur.word = arm_nn_read_q7x4((const q7_t *)dst);
         cand.word = arm_nn_read_q7x4_ia((const q7_t **)&src);

         /* keep the larger value in each of the four byte lanes */
         for (int lane = 0; lane < 4; lane++)
         {
             if (cand.bytes[lane] > cur.bytes[lane])
             {
                 cur.bytes[lane] = cand.bytes[lane];
             }
         }

         arm_nn_write_q7x4_ia(&dst, cur.word);
     }

     /* left-over elements that do not fill a group of four */
     for (; tail > 0u; tail--)
     {
         if (*src > *dst)
         {
             *dst = *src;
         }
         dst++;
         src++;
     }
 }

 /**
  * @brief Add a q7 vector element-wise into a vector of q15 accumulators
  * @param[in,out] base    q15 accumulators, updated in place
  * @param[in]     target  q7 values to add (read only)
  * @param[in]     length  number of elements in both vectors
  *
  * Four elements are processed per iteration with saturating 16-bit SIMD
  * additions. The remaining (length % 4) elements use a scalar addition
  * that saturates to the q15 range as well, so all elements behave alike.
  */
 static void accumulate_q7_to_q15(q15_t *base, const q7_t *target, const uint16_t length)
 {
     q15_t *pCnt = base;
     const q7_t *pV = target;
     uint16_t cnt = length >> 2;

     while (cnt > 0u)
     {
         q31_t v1, v2, vo1, vo2, in;
         q31_t value = arm_nn_read_q7x4_ia(&pV);

         /* sign-extend bytes 1 and 3 (v1) and bytes 0 and 2 (v2) to 16 bit */
         v1 = __SXTB16(__ROR(value, 8));
         v2 = __SXTB16(value);

         /* re-pair the halfwords so each word matches two consecutive accumulators */
 #ifndef ARM_MATH_BIG_ENDIAN

         vo2 = __PKHTB(v1, v2, 16);
         vo1 = __PKHBT(v2, v1, 16);

 #else

         vo1 = __PKHTB(v1, v2, 16);
         vo2 = __PKHBT(v2, v1, 16);

 #endif

         in = arm_nn_read_q15x2(pCnt);
         arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in));

         in = arm_nn_read_q15x2(pCnt);
         arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in));

         cnt--;
     }

     cnt = length & 0x3;
     while (cnt > 0u)
     {
         /* scalar tail: clamp like __QADD16 does instead of wrapping */
         q31_t sum = (q31_t)*pCnt + (q31_t)*pV++;

         if (sum > INT16_MAX)
         {
             sum = INT16_MAX;
         }
         else if (sum < INT16_MIN)
         {
             sum = INT16_MIN;
         }
         *pCnt++ = (q15_t)sum;
         cnt--;
     }
 }

 #endif // ARM_MATH_DSP

 /**
  *  @ingroup groupNN
  */

 /**
  * @addtogroup Pooling
  * @{
  */

 /**
  * @brief Q7 max pooling function
  * @param[in,out]   Im_in       pointer to input tensor
  * @param[in]       dim_im_in   input tensor dimension
  * @param[in]       ch_im_in    number of input tensor channels
  * @param[in]       dim_kernel  filter kernel size
  * @param[in]       padding     padding sizes
  * @param[in]       stride      convolution stride
  * @param[in]       dim_im_out  output tensor dimension
  * @param[in,out]   bufferA     Not used
  * @param[in,out]   Im_out      pointer to output tensor
  *
  * @details
  *
  * With the DSP extension (and without MVE) the pooling is done as two
  * separable passes: first along x, writing the intermediate result back
  * into Im_in, then along y, writing into Im_out. That variant is
  * input-destructive: input data is undefined after calling this function.
  *
  * Otherwise a plain reference loop is used that only reads Im_in.
  *
  * NOTE(review): in the DSP variant the x pass stores output pixel i_x at
  * pixel i_x of the row it is still reading. Whenever
  * i_x * stride - padding is smaller than i_x the window seems to include
  * pixels that already hold pooled results - confirm the expected
  * behaviour for padding > 0.
  */

 void arm_maxpool_q7_HWC(q7_t *Im_in,
                         const uint16_t dim_im_in,
                         const uint16_t ch_im_in,
                         const uint16_t dim_kernel,
                         const uint16_t padding,
                         const uint16_t stride,
                         const uint16_t dim_im_out,
                         q7_t *bufferA,
                         q7_t *Im_out)
 {
     (void)bufferA;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* DSP-extension build (Cortex-M4 / Cortex-M7): separable, in-place pooling */

     int16_t row, col;

     /* Pass 1: pool along x inside every input row */
     for (row = 0; row < dim_im_in; row++)
     {
         for (col = 0; col < dim_im_out; col++)
         {
             /* first column of the window and one past its last column */
             const int x_first = col * stride - padding;
             const int x_last = x_first + dim_kernel;

             /* the pooled pixel is stored at position `col` of the same row */
             q7_t *target = Im_in + (row * dim_im_in + col) * ch_im_in;
             q7_t *win_start;
             q7_t *win_stop;
             q7_t *pixel;

             /* window begins left of the image: start at the output position */
             win_start = (x_first < 0) ? target : Im_in + (row * dim_im_in + x_first) * ch_im_in;

             /* window reaches the right edge: stop at the end of the row */
             win_stop = (x_last >= dim_im_in) ? Im_in + (row * dim_im_in + dim_im_in) * ch_im_in
                                              : Im_in + (row * dim_im_in + x_last) * ch_im_in;

             /* seed the result with the first window pixel; source and
              * destination may coincide, hence memmove */
             memmove(target, win_start, ch_im_in);

             /* fold every following window pixel into the running maximum */
             for (pixel = win_start + ch_im_in; pixel < win_stop; pixel += ch_im_in)
             {
                 compare_and_replace_if_larger_q7(target, pixel, ch_im_in);
             }
         }
     }

     /* Pass 2: pool along y, combining the x-pooled rows into Im_out */
     for (row = 0; row < dim_im_out; row++)
     {
         /* first row of the window and one past its last row */
         const int y_first = row * stride - padding;
         const int y_last = y_first + dim_kernel;

         q7_t *target = Im_out + row * dim_im_out * ch_im_in;
         q7_t *row_start = (y_first < 0) ? Im_in : Im_in + y_first * dim_im_in * ch_im_in;
         q7_t *row_end = (y_last >= dim_im_in) ? Im_in + dim_im_in * dim_im_in * ch_im_in
                                               : Im_in + y_last * dim_im_in * ch_im_in;
         q7_t *line;

         /* seed the output row with the first window row */
         memmove(target, row_start, dim_im_out * ch_im_in);

         /* fold the remaining window rows into the output row */
         for (line = row_start + ch_im_in * dim_im_in; line < row_end; line += dim_im_in * ch_im_in)
         {
             compare_and_replace_if_larger_q7(target, line, dim_im_out * ch_im_in);
         }
     }

 #else
     /* Reference implementation (Cortex-M0 / Cortex-M3): direct window scan */
     int16_t ch, out_x, out_y;
     int16_t k_x, k_y;

     for (ch = 0; ch < ch_im_in; ch++)
     {
         for (out_y = 0; out_y < dim_im_out; out_y++)
         {
             const int y_begin = out_y * stride - padding;
             const int y_end = y_begin + dim_kernel;

             for (out_x = 0; out_x < dim_im_out; out_x++)
             {
                 const int x_begin = out_x * stride - padding;
                 const int x_end = x_begin + dim_kernel;

                 /* starts below every q7 value so the first sample always wins */
                 int max = -129;

                 for (k_y = y_begin; k_y < y_end; k_y++)
                 {
                     for (k_x = x_begin; k_x < x_end; k_x++)
                     {
                         /* positions in the padding contribute nothing */
                         if (k_y < 0 || k_x < 0 || k_y >= dim_im_in || k_x >= dim_im_in)
                         {
                             continue;
                         }

                         const q7_t sample = Im_in[ch + ch_im_in * (k_x + k_y * dim_im_in)];
                         if (sample > max)
                         {
                             max = sample;
                         }
                     }
                 }
                 Im_out[ch + ch_im_in * (out_x + out_y * dim_im_out)] = max;
             }
         }
     }

 #endif /* ARM_MATH_DSP */
 }

 /**
  * @brief Q7 average pooling function
  * @param[in,out]   Im_in       pointer to input tensor
  * @param[in]       dim_im_in   input tensor dimension
  * @param[in]       ch_im_in    number of input tensor channels
  * @param[in]       dim_kernel  filter kernel size
  * @param[in]       padding     padding sizes
  * @param[in]       stride      convolution stride
  * @param[in]       dim_im_out  output tensor dimension
  * @param[in,out]   bufferA     pointer to buffer space for input
  * @param[in,out]   Im_out      pointer to output tensor
  *
  * @details
  *
  * <b>Buffer size:</b>
  *
  * bufferA size:  2*dim_im_out*ch_im_in
  *
  * bufferA is only used by the DSP variant, where it is accessed as an
  * array of q15_t accumulators.
  * NOTE(review): presumably it must therefore be suitably aligned for
  * q15_t - confirm against callers.
  *
  * With the DSP extension (and without MVE) the pooling is done as two
  * separable passes: first along x, writing the intermediate averages
  * back into Im_in, then along y, writing into Im_out. That variant is
  * input-destructive: input data is undefined after calling this function.
  *
  * Otherwise a plain reference loop is used that only reads Im_in. In that
  * loop a window that contains no input sample at all produces 0.
  *
  * NOTE(review): in the DSP variant the x pass stores output pixel i_x at
  * pixel i_x of the row it is still reading. Whenever
  * i_x * stride - padding is smaller than i_x the window seems to include
  * pixels that already hold averaged results - confirm the expected
  * behaviour for padding > 0.
  */

 void arm_avepool_q7_HWC(q7_t *Im_in,
                         const uint16_t dim_im_in,
                         const uint16_t ch_im_in,
                         const uint16_t dim_kernel,
                         const uint16_t padding,
                         const uint16_t stride,
                         const uint16_t dim_im_out,
                         q7_t *bufferA,
                         q7_t *Im_out)
 {

 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
     /* Run the following code for Cortex-M4 and Cortex-M7 */

     q15_t *buffer = (q15_t *)bufferA;
     int16_t i_x, i_y;
     int16_t count = 0;

     /* first does the pooling along x axis */
     for (i_y = 0; i_y < dim_im_in; i_y++)
     {

         for (i_x = 0; i_x < dim_im_out; i_x++)
         {
             /* for each output pixel; the result is stored back into the input row */
             q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
             q7_t *win_start;
             q7_t *win_stop;

             /* window begins left of the image: start at the output position */
             if (i_x * stride - padding < 0)
             {
                 win_start = target;
             }
             else
             {
                 win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
             }

             /* window reaches the right edge: stop at the end of the row */
             if (i_x * stride - padding + dim_kernel >= dim_im_in)
             {
                 win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
             }
             else
             {
                 win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
             }

             /* first step is to load the first window pixel into the q15 accumulators */
             arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
             count = 1;

             /* accumulate the remaining window pixels, counting the samples */
             win_start += ch_im_in;
             for (; win_start < win_stop; win_start += ch_im_in)
             {
                 accumulate_q7_to_q15(buffer, win_start, ch_im_in);
                 count++;
             }

             /* divide by the number of accumulated pixels (count >= 1 here) */
             buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
         }
     }

     /* then does the pooling along y axis */
     for (i_y = 0; i_y < dim_im_out; i_y++)
     {
         /* for each output row */
         q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
         q7_t *row_start;
         q7_t *row_end;
         /* setting the starting row */
         if (i_y * stride - padding < 0)
         {
             row_start = Im_in;
         }
         else
         {
             row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
         }
         /* setting the stopping row */
         if (i_y * stride - padding + dim_kernel >= dim_im_in)
         {
             row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
         }
         else
         {
             row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
         }

         /* load the first window row into the q15 accumulators */
         arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
         count = 1;

         /* move over to next row */
         row_start += ch_im_in * dim_im_in;

         /* accumulate the remaining window rows, counting them */
         for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
         {
             accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
             count++;
         }

         /* divide by the number of accumulated rows (count >= 1 here) */
         buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
     }

 #else
     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

     (void)bufferA;
     int16_t i_ch_in, i_x, i_y;
     int16_t k_x, k_y;

     for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
     {
         for (i_y = 0; i_y < dim_im_out; i_y++)
         {
             for (i_x = 0; i_x < dim_im_out; i_x++)
             {
                 int sum = 0;
                 int count = 0;
                 for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                 {
                     for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                     {
                         /* only positions inside the input contribute to sum and count */
                         if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                         {
                             sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                             count++;
                         }
                     }
                 }
                 /* a window lying entirely in the padding has no samples:
                  * write 0 instead of dividing by zero */
                 Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = (count > 0) ? sum / count : 0;
             }
         }
     }

 #endif /* ARM_MATH_DSP */
 }

 /**
  * @} end of Pooling group
  */
	/*
	* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
	*
	* SPDX-License-Identifier: Apache-2.0
	*
	* Licensed under the Apache License, Version 2.0 (the License); you may
	* not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/* ----------------------------------------------------------------------
	* Project: CMSIS NN Library
	* Title: arm_pool_q7_HWC.c
	* Description: Pooling function implementations
	*
	* $Date: 20. July 2021
	* $Revision: V.1.1.1
	*
	* Target Processor: Cortex-M cores
	*
	* -------------------------------------------------------------------- */

	#include "arm_nnfunctions.h"
	#include "arm_nnsupportfunctions.h"

	#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)

	/**
	* @brief A few utility functions used by pooling functions
	*
	*
	*/

	/**
	 * @brief Divide q15 values by a common divisor and narrow the result to q7
	 * @param[in]   buffer  q15 values to scale
	 * @param[out]  target  destination for the scaled q7 values
	 * @param[in]   length  number of elements to process
	 * @param[in]   scale   divisor applied to every element; must be non-zero
	 *
	 * The integer quotient is cast to q7_t without any range check.
	 */
	static void buffer_scale_back_q15_to_q7(q15_t *buffer, q7_t *target, uint16_t length, uint16_t scale)
	{
	    int i;

	    for (i = 0; i < length; i++)
	    {
	        target[i] = (q7_t)(buffer[i] / scale);
	    }
	}

	/**
	 * @brief Element-wise, in-place maximum of two q7 vectors
	 * @param[in,out] base    vector that receives the larger value of each pair
	 * @param[in]     target  vector compared against base
	 * @param[in]     length  number of elements in both vectors
	 *
	 * Elements are handled in groups of four through the arm_nn_read_q7x4 /
	 * arm_nn_write_q7x4_ia helpers (which are relied on to advance the
	 * pointers they are given by address); the remaining (length % 4)
	 * elements are handled one at a time.
	 */
	static void compare_and_replace_if_larger_q7(q7_t *base,           // base data
	                                             const q7_t *target,   // compare target
	                                             const uint16_t length // data length
	)
	{
	    q7_t *pIn = base;
	    const q7_t *pCom = target;
	    union arm_nnword in;
	    union arm_nnword com;
	    uint16_t cnt = length >> 2;

	    while (cnt > 0u)
	    {
	        in.word = arm_nn_read_q7x4((const q7_t *)pIn);
	        com.word = arm_nn_read_q7x4_ia((const q7_t **)&pCom);

	        /* keep the larger value in each of the four byte lanes */
	        if (com.bytes[0] > in.bytes[0])
	        {
	            in.bytes[0] = com.bytes[0];
	        }
	        if (com.bytes[1] > in.bytes[1])
	        {
	            in.bytes[1] = com.bytes[1];
	        }
	        if (com.bytes[2] > in.bytes[2])
	        {
	            in.bytes[2] = com.bytes[2];
	        }
	        if (com.bytes[3] > in.bytes[3])
	        {
	            in.bytes[3] = com.bytes[3];
	        }

	        arm_nn_write_q7x4_ia(&pIn, in.word);

	        cnt--;
	    }

	    /* left-over elements: compare the values, not the pointers */
	    cnt = length & 0x3;
	    while (cnt > 0u)
	    {
	        if (*pCom > *pIn)
	        {
	            *pIn = *pCom;
	        }
	        pIn++;
	        pCom++;
	        cnt--;
	    }
	}

	/**
	 * @brief Add a q7 vector element-wise into a vector of q15 accumulators
	 * @param[in,out] base    q15 accumulators, updated in place
	 * @param[in]     target  q7 values to add (read only)
	 * @param[in]     length  number of elements in both vectors
	 *
	 * Four elements are processed per iteration with saturating 16-bit SIMD
	 * additions. The remaining (length % 4) elements use a scalar addition
	 * that saturates to the q15 range as well, so all elements behave alike.
	 */
	static void accumulate_q7_to_q15(q15_t *base, const q7_t *target, const uint16_t length)
	{
	    q15_t *pCnt = base;
	    const q7_t *pV = target;
	    uint16_t cnt = length >> 2;

	    while (cnt > 0u)
	    {
	        q31_t v1, v2, vo1, vo2, in;
	        q31_t value = arm_nn_read_q7x4_ia(&pV);

	        /* sign-extend bytes 1 and 3 (v1) and bytes 0 and 2 (v2) to 16 bit */
	        v1 = __SXTB16(__ROR(value, 8));
	        v2 = __SXTB16(value);

	        /* re-pair the halfwords so each word matches two consecutive accumulators */
	#ifndef ARM_MATH_BIG_ENDIAN

	        vo2 = __PKHTB(v1, v2, 16);
	        vo1 = __PKHBT(v2, v1, 16);

	#else

	        vo1 = __PKHTB(v1, v2, 16);
	        vo2 = __PKHBT(v2, v1, 16);

	#endif

	        in = arm_nn_read_q15x2(pCnt);
	        arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in));

	        in = arm_nn_read_q15x2(pCnt);
	        arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in));

	        cnt--;
	    }

	    cnt = length & 0x3;
	    while (cnt > 0u)
	    {
	        /* scalar tail: clamp like __QADD16 does instead of wrapping */
	        q31_t sum = (q31_t)*pCnt + (q31_t)*pV++;

	        if (sum > INT16_MAX)
	        {
	            sum = INT16_MAX;
	        }
	        else if (sum < INT16_MIN)
	        {
	            sum = INT16_MIN;
	        }
	        *pCnt++ = (q15_t)sum;
	        cnt--;
	    }
	}

	#endif // ARM_MATH_DSP

	/**
	* @ingroup groupNN
	*/

	/**
	* @addtogroup Pooling
	* @{
	*/

	/**
	 * @brief Q7 max pooling function
	 * @param[in,out]   Im_in       pointer to input tensor
	 * @param[in]       dim_im_in   input tensor dimension
	 * @param[in]       ch_im_in    number of input tensor channels
	 * @param[in]       dim_kernel  filter kernel size
	 * @param[in]       padding     padding sizes
	 * @param[in]       stride      convolution stride
	 * @param[in]       dim_im_out  output tensor dimension
	 * @param[in,out]   bufferA     Not used
	 * @param[in,out]   Im_out      pointer to output tensor
	 *
	 * @details
	 *
	 * With the DSP extension (and without MVE) the pooling is implemented
	 * as split x-pooling then y-pooling; the x pass writes its result back
	 * into Im_in, so that variant is input-destructive. Input data is
	 * undefined after calling this function.
	 *
	 * Otherwise a plain reference loop is used that only reads Im_in.
	 */

	void arm_maxpool_q7_HWC(q7_t *Im_in,
	                        const uint16_t dim_im_in,
	                        const uint16_t ch_im_in,
	                        const uint16_t dim_kernel,
	                        const uint16_t padding,
	                        const uint16_t stride,
	                        const uint16_t dim_im_out,
	                        q7_t *bufferA,
	                        q7_t *Im_out)
	{
	    (void)bufferA;
	#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
	    /* Run the following code for Cortex-M4 and Cortex-M7 */

	    int16_t i_x, i_y;

	    /* first does the pooling along x axis */
	    for (i_y = 0; i_y < dim_im_in; i_y++)
	    {

	        for (i_x = 0; i_x < dim_im_out; i_x++)
	        {
	            /* for each output pixel; the result is stored back into the input row */
	            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
	            q7_t *win_start;
	            q7_t *win_stop;

	            /* window begins left of the image: start at the output position */
	            if (i_x * stride - padding < 0)
	            {
	                win_start = target;
	            }
	            else
	            {
	                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
	            }

	            /* window reaches the right edge: stop at the end of the row */
	            if (i_x * stride - padding + dim_kernel >= dim_im_in)
	            {
	                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
	            }
	            else
	            {
	                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
	            }

	            /* first step is to copy over initial data; source and
	             * destination may coincide, hence memmove */
	            memmove(target, win_start, ch_im_in);

	            /* start the max operation from the second part */
	            win_start += ch_im_in;
	            for (; win_start < win_stop; win_start += ch_im_in)
	            {
	                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
	            }
	        }
	    }

	    /* then does the pooling along y axis */
	    for (i_y = 0; i_y < dim_im_out; i_y++)
	    {

	        /* for each output row */
	        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
	        q7_t *row_start;
	        q7_t *row_end;
	        /* setting the starting row */
	        if (i_y * stride - padding < 0)
	        {
	            row_start = Im_in;
	        }
	        else
	        {
	            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
	        }
	        /* setting the stopping row */
	        if (i_y * stride - padding + dim_kernel >= dim_im_in)
	        {
	            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
	        }
	        else
	        {
	            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
	        }

	        /* copy over the first row */
	        memmove(target, row_start, dim_im_out * ch_im_in);

	        /* move over to next row */
	        row_start += ch_im_in * dim_im_in;

	        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
	        {
	            compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
	        }
	    }

	#else
	    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
	    int16_t i_ch_in, i_x, i_y;
	    int16_t k_x, k_y;

	    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
	    {
	        for (i_y = 0; i_y < dim_im_out; i_y++)
	        {
	            for (i_x = 0; i_x < dim_im_out; i_x++)
	            {
	                /* starts below every q7 value so the first sample always wins */
	                int max = -129;
	                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
	                {
	                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
	                    {
	                        /* only positions inside the input are considered */
	                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
	                        {
	                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
	                            {
	                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
	                            }
	                        }
	                    }
	                }
	                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
	            }
	        }
	    }

	#endif /* ARM_MATH_DSP */
	}

	/**
	 * @brief Q7 average pooling function
	 * @param[in,out]   Im_in       pointer to input tensor
	 * @param[in]       dim_im_in   input tensor dimension
	 * @param[in]       ch_im_in    number of input tensor channels
	 * @param[in]       dim_kernel  filter kernel size
	 * @param[in]       padding     padding sizes
	 * @param[in]       stride      convolution stride
	 * @param[in]       dim_im_out  output tensor dimension
	 * @param[in,out]   bufferA     pointer to buffer space for input
	 * @param[in,out]   Im_out      pointer to output tensor
	 *
	 * @details
	 *
	 * <b>Buffer size:</b>
	 *
	 * bufferA size:  2*dim_im_out*ch_im_in
	 *
	 * bufferA is only used by the DSP variant, where it is accessed as an
	 * array of q15_t accumulators.
	 *
	 * With the DSP extension (and without MVE) the pooling is implemented
	 * as split x-pooling then y-pooling; the x pass writes its result back
	 * into Im_in, so that variant is input-destructive. Input data is
	 * undefined after calling this function.
	 *
	 * Otherwise a plain reference loop is used that only reads Im_in. In that
	 * loop a window that contains no input sample at all produces 0.
	 */

	void arm_avepool_q7_HWC(q7_t *Im_in,
	                        const uint16_t dim_im_in,
	                        const uint16_t ch_im_in,
	                        const uint16_t dim_kernel,
	                        const uint16_t padding,
	                        const uint16_t stride,
	                        const uint16_t dim_im_out,
	                        q7_t *bufferA,
	                        q7_t *Im_out)
	{

	#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
	    /* Run the following code for Cortex-M4 and Cortex-M7 */

	    q15_t *buffer = (q15_t *)bufferA;
	    int16_t i_x, i_y;
	    int16_t count = 0;

	    /* first does the pooling along x axis */
	    for (i_y = 0; i_y < dim_im_in; i_y++)
	    {

	        for (i_x = 0; i_x < dim_im_out; i_x++)
	        {
	            /* for each output pixel; the result is stored back into the input row */
	            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
	            q7_t *win_start;
	            q7_t *win_stop;

	            /* window begins left of the image: start at the output position */
	            if (i_x * stride - padding < 0)
	            {
	                win_start = target;
	            }
	            else
	            {
	                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
	            }

	            /* window reaches the right edge: stop at the end of the row */
	            if (i_x * stride - padding + dim_kernel >= dim_im_in)
	            {
	                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
	            }
	            else
	            {
	                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
	            }

	            /* first step is to load the first window pixel into the q15 accumulators */
	            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
	            count = 1;

	            /* accumulate the remaining window pixels, counting the samples */
	            win_start += ch_im_in;
	            for (; win_start < win_stop; win_start += ch_im_in)
	            {
	                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
	                count++;
	            }

	            /* divide by the number of accumulated pixels (count >= 1 here) */
	            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
	        }
	    }

	    /* then does the pooling along y axis */
	    for (i_y = 0; i_y < dim_im_out; i_y++)
	    {
	        /* for each output row */
	        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
	        q7_t *row_start;
	        q7_t *row_end;
	        /* setting the starting row */
	        if (i_y * stride - padding < 0)
	        {
	            row_start = Im_in;
	        }
	        else
	        {
	            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
	        }
	        /* setting the stopping row */
	        if (i_y * stride - padding + dim_kernel >= dim_im_in)
	        {
	            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
	        }
	        else
	        {
	            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
	        }

	        /* load the first window row into the q15 accumulators */
	        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
	        count = 1;

	        /* move over to next row */
	        row_start += ch_im_in * dim_im_in;

	        /* accumulate the remaining window rows, counting them */
	        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
	        {
	            accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
	            count++;
	        }

	        /* divide by the number of accumulated rows (count >= 1 here) */
	        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
	    }

	#else
	    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

	    (void)bufferA;
	    int16_t i_ch_in, i_x, i_y;
	    int16_t k_x, k_y;

	    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
	    {
	        for (i_y = 0; i_y < dim_im_out; i_y++)
	        {
	            for (i_x = 0; i_x < dim_im_out; i_x++)
	            {
	                int sum = 0;
	                int count = 0;
	                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
	                {
	                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
	                    {
	                        /* only positions inside the input contribute to sum and count */
	                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
	                        {
	                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
	                            count++;
	                        }
	                    }
	                }
	                /* a window lying entirely in the padding has no samples:
	                 * write 0 instead of dividing by zero */
	                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = (count > 0) ? sum / count : 0;
	            }
	        }
	    }

	#endif /* ARM_MATH_DSP */
	}

	/**
	* @} end of Pooling group
	*/