| /* |
| * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. |
| * |
| * SPDX-License-Identifier: Apache-2.0 |
| * |
| * Licensed under the Apache License, Version 2.0 (the License); you may |
| * not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /* ---------------------------------------------------------------------- |
| * Project: CMSIS NN Library |
| * Title: arm_pool_q7_HWC.c |
| * Description: Pooling function implementations |
| * |
| * $Date: 17. January 2018 |
| * $Revision: V.1.0.0 |
| * |
| * Target Processor: Cortex-M cores |
| * |
| * -------------------------------------------------------------------- */ |
| |
| #include "arm_math.h" |
| #include "arm_nnfunctions.h" |
| |
| #if defined (ARM_MATH_DSP) |
| |
| /** |
| * @brief A few utility functions used by pooling functions |
| * |
| * |
| */ |
| |
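/* Divide each q15 accumulator value by 'scale' (the number of accumulated
 * inputs) and narrow the result back to q7. */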
| static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale) |
| { |
| int i; |
| |
| for (i = 0; i < length; i++) |
| { |
| target[i] = (q7_t) (buffer[i] / scale); |
| } |
| } |
| |
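/* Element-wise, in-place maximum: base[i] = max(base[i], target[i]). */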
| static void compare_and_replace_if_larger_q7(q7_t * base, // base data |
| const q7_t * target, // compare target |
| const uint16_t length // data length |
| ) |
| { |
| q7_t *pIn = base; |
| const q7_t *pCom = target; |
| union arm_nnword in; |
| union arm_nnword com; |
| uint16_t cnt = length >> 2; |
| |
| while (cnt > 0u) |
| { |
| in.word = *__SIMD32(pIn); |
| com.word = *__SIMD32(pCom)++; |
| |
        // compare the four q7 lanes and keep the larger value in each
| if (com.bytes[0] > in.bytes[0]) |
| in.bytes[0] = com.bytes[0]; |
| if (com.bytes[1] > in.bytes[1]) |
| in.bytes[1] = com.bytes[1]; |
| if (com.bytes[2] > in.bytes[2]) |
| in.bytes[2] = com.bytes[2]; |
| if (com.bytes[3] > in.bytes[3]) |
| in.bytes[3] = com.bytes[3]; |
| |
| *__SIMD32(pIn)++ = in.word; |
| |
| cnt--; |
| } |
| |
| cnt = length & 0x3; |
| while (cnt > 0u) |
| { |
| if (*pCom > *pIn) |
| { |
| *pIn = *pCom; |
| } |
| pIn++; |
| pCom++; |
| cnt--; |
| } |
| } |
| |
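/* Element-wise accumulation with widening: base[i] += target[i],
 * where base holds q15 values and target holds q7 values. */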
| static void accumulate_q7_to_q15(q15_t * base, q7_t * target, const uint16_t length) |
| { |
| q15_t *pCnt = base; |
| q7_t *pV = target; |
| q31_t v1, v2, vo1, vo2; |
| uint16_t cnt = length >> 2; |
| q31_t in; |
| |
| while (cnt > 0u) |
| { |
| q31_t value = *__SIMD32(pV)++; |
| v1 = __SXTB16(__ROR(value, 8)); |
| v2 = __SXTB16(value); |
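        /* v1 holds input bytes 1 and 3, v2 holds bytes 0 and 2, each sign-extended
           to q15; the PKH packing below restores the original ordering as two q15 pairs */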
| #ifndef ARM_MATH_BIG_ENDIAN |
| |
| vo2 = __PKHTB(v1, v2, 16); |
| vo1 = __PKHBT(v2, v1, 16); |
| |
| #else |
| |
| vo1 = __PKHTB(v1, v2, 16); |
| vo2 = __PKHBT(v2, v1, 16); |
| |
| #endif |
| |
| in = *__SIMD32(pCnt); |
| *__SIMD32(pCnt)++ = __QADD16(vo1, in); |
| |
| in = *__SIMD32(pCnt); |
| *__SIMD32(pCnt)++ = __QADD16(vo2, in); |
| |
| cnt--; |
| } |
| cnt = length & 0x3; |
| while (cnt > 0u) |
| { |
| *pCnt++ += *pV++; |
| cnt--; |
| } |
| } |
| |
| #endif // ARM_MATH_DSP |
| |
| /** |
| * @ingroup groupNN |
| */ |
| |
| /** |
| * @addtogroup Pooling |
| * @{ |
| */ |
| |
| /** |
| * @brief Q7 max pooling function |
 * @param[in,out]  Im_in       pointer to input tensor
 * @param[in]      dim_im_in   input tensor dimension
 * @param[in]      ch_im_in    number of input tensor channels
 * @param[in]      dim_kernel  pooling kernel size
 * @param[in]      padding     padding size
 * @param[in]      stride      pooling stride
 * @param[in]      dim_im_out  output tensor dimension
 * @param[in,out]  bufferA     pointer to scratch buffer (not used by this function)
| * @param[in,out] Im_out pointer to output tensor |
| * @return none. |
| * |
| * @details |
| * |
| * <b>Buffer size:</b> |
| * |
| * bufferA size: 0 |
| * |
 * The pooling is performed in two passes: first along the x axis,
 * then along the y axis.
 *
 * This function is input-destructive: the contents of Im_in are
 * undefined after the call.
| * |
| */ |
| |
| void |
| arm_maxpool_q7_HWC(q7_t * Im_in, |
| const uint16_t dim_im_in, |
| const uint16_t ch_im_in, |
| const uint16_t dim_kernel, |
| const uint16_t padding, |
| const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out) |
| { |
| |
| #if defined (ARM_MATH_DSP) |
    /* Run the following code for Cortex-M cores with the DSP extension (e.g. Cortex-M4 and Cortex-M7) */
| |
| int16_t i_x, i_y; |
| |
    /* first pass: pooling along the x axis */
| for (i_y = 0; i_y < dim_im_in; i_y++) |
| { |
| |
| for (i_x = 0; i_x < dim_im_out; i_x++) |
| { |
| /* for each output pixel */ |
| q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; |
| q7_t *win_start; |
| q7_t *win_stop; |
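            /* clamp the pooling window to the valid part of the input row, handling padding */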
| if (i_x * stride - padding < 0) |
| { |
| win_start = target; |
| } else |
| { |
| win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; |
| } |
| |
| if (i_x * stride - padding + dim_kernel >= dim_im_in) |
| { |
| win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; |
| } else |
| { |
| win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; |
| } |
| |
            /* copy the first column of the window into place; memmove is used
               because target and win_start may overlap within Im_in */
            memmove(target, win_start, ch_im_in);
| |
            /* take the max over the remaining columns of the window */
| win_start += ch_im_in; |
| for (; win_start < win_stop; win_start += ch_im_in) |
| { |
| compare_and_replace_if_larger_q7(target, win_start, ch_im_in); |
| } |
| } |
| } |
| |
    /* second pass: pooling along the y axis */
| for (i_y = 0; i_y < dim_im_out; i_y++) |
| { |
| |
| /* for each output row */ |
| q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; |
| q7_t *row_start; |
| q7_t *row_end; |
| /* setting the starting row */ |
| if (i_y * stride - padding < 0) |
| { |
| row_start = Im_in; |
| } else |
| { |
| row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; |
| } |
| /* setting the stopping row */ |
| if (i_y * stride - padding + dim_kernel >= dim_im_in) |
| { |
| row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; |
| } else |
| { |
| row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; |
| } |
| |
        /* copy the first row of the pooling window to the output row */
        memmove(target, row_start, dim_im_out * ch_im_in);
| |
| /* move over to next row */ |
| row_start += ch_im_in * dim_im_in; |
| |
| for (; row_start < row_end; row_start += dim_im_in * ch_im_in) |
| { |
| compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in); |
| } |
| } |
| |
| #else |
    /* Reference implementation for cores without the DSP extension (e.g. Cortex-M0 and Cortex-M3) */
| |
| int16_t i_ch_in, i_x, i_y; |
| int16_t k_x, k_y; |
| |
| for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) |
| { |
| for (i_y = 0; i_y < dim_im_out; i_y++) |
| { |
| for (i_x = 0; i_x < dim_im_out; i_x++) |
| { |
                int max = -129;    /* below the q7 minimum of -128, so any input value replaces it */
| for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) |
| { |
| for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) |
| { |
| if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) |
| { |
| if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max) |
| { |
| max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; |
| } |
| } |
| } |
| } |
| Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max; |
| } |
| } |
| } |
| |
| #endif /* ARM_MATH_DSP */ |
| |
| } |
| |
| /** |
| * @brief Q7 average pooling function |
| * @param[in,out] Im_in pointer to input tensor |
 * @param[in]      dim_im_in   input tensor dimension
 * @param[in]      ch_im_in    number of input tensor channels
 * @param[in]      dim_kernel  pooling kernel size
 * @param[in]      padding     padding size
 * @param[in]      stride      pooling stride
 * @param[in]      dim_im_out  output tensor dimension
 * @param[in,out]  bufferA     pointer to scratch buffer used for the q15 accumulation
| * @param[in,out] Im_out pointer to output tensor |
| * @return none. |
| * |
| * @details |
| * |
| * <b>Buffer size:</b> |
| * |
| * bufferA size: 2*dim_im_out*ch_im_in |
| * |
 * The pooling is performed in two passes: first along the x axis,
 * then along the y axis.
 *
 * This function is input-destructive: the contents of Im_in are
 * undefined after the call.
| * |
| */ |
| |
| void |
| arm_avepool_q7_HWC(q7_t * Im_in, |
| const uint16_t dim_im_in, |
| const uint16_t ch_im_in, |
| const uint16_t dim_kernel, |
| const uint16_t padding, |
| const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out) |
| { |
| |
| #if defined (ARM_MATH_DSP) |
    /* Run the following code for Cortex-M cores with the DSP extension (e.g. Cortex-M4 and Cortex-M7) */
| |
| q15_t *buffer = (q15_t *) bufferA; |
| int16_t i_x, i_y; |
| int16_t count = 0; |
| |
    /* first pass: pooling along the x axis */
| for (i_y = 0; i_y < dim_im_in; i_y++) |
| { |
| |
| for (i_x = 0; i_x < dim_im_out; i_x++) |
| { |
| /* for each output pixel */ |
| q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; |
| q7_t *win_start; |
| q7_t *win_stop; |
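            /* clamp the pooling window to the valid part of the input row, handling padding */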
| if (i_x * stride - padding < 0) |
| { |
| win_start = target; |
| } else |
| { |
| win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; |
| } |
| |
| if (i_x * stride - padding + dim_kernel >= dim_im_in) |
| { |
| win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; |
| } else |
| { |
| win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; |
| } |
| |
            /* expand the first column of the window into the q15 accumulation buffer */
| arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in); |
| count = 1; |
| |
            /* accumulate the remaining columns of the window */
| win_start += ch_im_in; |
| for (; win_start < win_stop; win_start += ch_im_in) |
| { |
| accumulate_q7_to_q15(buffer, win_start, ch_im_in); |
| count++; |
| } |
| buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count); |
| } |
| } |
| |
    /* second pass: pooling along the y axis */
| for (i_y = 0; i_y < dim_im_out; i_y++) |
| { |
| /* for each output row */ |
| q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; |
| q7_t *row_start; |
| q7_t *row_end; |
| /* setting the starting row */ |
| if (i_y * stride - padding < 0) |
| { |
| row_start = Im_in; |
| } else |
| { |
| row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; |
| } |
| /* setting the stopping row */ |
| if (i_y * stride - padding + dim_kernel >= dim_im_in) |
| { |
| row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; |
| } else |
| { |
| row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; |
| } |
| |
        /* expand the first row of the window into the q15 accumulation buffer */
| arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in); |
| count = 1; |
| |
| /* move over to next row */ |
| row_start += ch_im_in * dim_im_in; |
| |
| for (; row_start < row_end; row_start += dim_im_in * ch_im_in) |
| { |
| accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in); |
| count++; |
| } |
| buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count); |
| } |
| |
| #else |
    /* Reference implementation for cores without the DSP extension (e.g. Cortex-M0 and Cortex-M3) */
| |
| int16_t i_ch_in, i_x, i_y; |
| int16_t k_x, k_y; |
| |
| for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) |
| { |
| for (i_y = 0; i_y < dim_im_out; i_y++) |
| { |
| for (i_x = 0; i_x < dim_im_out; i_x++) |
| { |
| int sum = 0; |
| int count = 0; |
| for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) |
| { |
| for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) |
| { |
| if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) |
| { |
| sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; |
| count++; |
| } |
| } |
| } |
| Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count; |
| } |
| } |
| } |
| |
| #endif /* ARM_MATH_DSP */ |
| |
| } |
| |
| /** |
| * @} end of Pooling group |
| */ |