/* NN/Source/SVDFunctions/arm_svdf_s8.c - third_party/github/STMicroelectronics/cmsis_core - Git at Google */

 /*
  * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the License); you may
  * not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  * www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /* ----------------------------------------------------------------------
  * Project:      CMSIS NN Library
  * Title:        arm_svdf_s8.c
  * Description:  S8 basic SVDF layer function
  *
  * $Date:        15. April 2021
  * $Revision:    V.1.5.0
  *
  * Target Processor:  Cortex-M processors
  *
  * -------------------------------------------------------------------- */

 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"

 /**
  * @ingroup groupNN
  */

 /**
  * @addtogroup SVDF
  * @{
  */

 /*
  * S8 SVDF layer function for TensorFlow Lite
  *
  * Refer to header file for details.
  *
  */

 /**
  * S8 SVDF layer function.
  *
  * Processing steps, as implemented below:
  *  1. Shift the whole state buffer left by one q15 element.
  *  2. For every batch, call arm_nn_vec_mat_mult_t_svdf_s8() with the batch's
  *     input vector, the feature weights and a destination pointer located at
  *     the last time slot of the batch's first feature row.
  *  3. For every (batch, feature) pair, accumulate the dot product of
  *     time_batches entries of weights_time_data and state_data into
  *     input_ctx->buf.
  *  4. Sum each group of `rank` consecutive accumulators (plus bias_data[i]
  *     when bias_data is non-NULL) into output_ctx->buf.
  *  5. Requantize with the output multiplier/shift, add the output offset,
  *     clamp to the output activation range and store as q7_t.
  *
  * @param input_ctx    Scratch context; buf is indexed as q31_t up to
  *                     input_dims->n * weights_feature_dims->n entries.
  * @param output_ctx   Scratch context; buf is indexed as q31_t up to
  *                     input_dims->n * (weights_feature_dims->n / rank) entries.
  * @param svdf_params  Supplies rank, input/output offsets and activation ranges.
  * @param input_quant_params   Multiplier/shift forwarded to the feature stage.
  * @param output_quant_params  Multiplier/shift used for the final requantization.
  * @param input_dims   n is used as batch count, h as the per-batch input length.
  * @param input_data   Input values, input_dims->h per batch.
  * @param state_dims   Unused.
  * @param state_data   State buffer, read and updated in place.
  * @param weights_feature_dims  n is used as the number of feature rows.
  * @param weights_feature_data  Feature weights, forwarded to the helper.
  * @param weights_time_dims     h is used as the number of time slots per feature.
  * @param weights_time_data     Time weights, time_batches entries per feature row.
  * @param bias_dims    Unused.
  * @param bias_data    Optional bias (may be NULL), indexed per output unit.
  * @param output_dims  Unused.
  * @param output_data  Destination for input_batches * unit_count results.
  *
  * @return ARM_MATH_SUCCESS, or the first non-success status returned by
  *         arm_nn_vec_mat_mult_t_svdf_s8().
  *
  * NOTE(review): svdf_params->rank is used as a divisor and is assumed to be
  * non-zero - confirm that callers guarantee this.
  */
 arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
                        const cmsis_nn_context *output_ctx,
                        const cmsis_nn_svdf_params *svdf_params,
                        const cmsis_nn_per_tensor_quant_params *input_quant_params,
                        const cmsis_nn_per_tensor_quant_params *output_quant_params,
                        const cmsis_nn_dims *input_dims,
                        const q7_t *input_data,
                        const cmsis_nn_dims *state_dims,
                        q15_t *state_data,
                        const cmsis_nn_dims *weights_feature_dims,
                        const q7_t *weights_feature_data,
                        const cmsis_nn_dims *weights_time_dims,
                        const q15_t *weights_time_data,
                        const cmsis_nn_dims *bias_dims,
                        const q31_t *bias_data,
                        const cmsis_nn_dims *output_dims,
                        q7_t *output_data)
 {
     (void)bias_dims;
     (void)state_dims;
     (void)output_dims;

     const q31_t multiplier_in = input_quant_params->multiplier;
     const q31_t shift_in = input_quant_params->shift;
     const q31_t multiplier_out = output_quant_params->multiplier;
     const q31_t shift_2 = output_quant_params->shift;
     const int32_t zp_in = svdf_params->input_offset;
     const int32_t zp_out = svdf_params->output_offset;
     const int32_t in_activation_min = svdf_params->input_activation.min;
     const int32_t in_activation_max = svdf_params->input_activation.max;
     const int32_t out_activation_min = svdf_params->output_activation.min;
     const int32_t out_activation_max = svdf_params->output_activation.max;
     const int16_t rank = svdf_params->rank;

     const int32_t input_batches = input_dims->n;
     const int32_t input_height = input_dims->h;
     const int32_t feature_batches = weights_feature_dims->n;
     const int32_t time_batches = weights_time_dims->h;
     const int32_t unit_count = feature_batches / rank;

     q31_t *buffer_a = (q31_t *)input_ctx->buf;
     q31_t *buffer_b = (q31_t *)output_ctx->buf;

     /* Shift the state left by one element. Only (count - 1) elements can be
      * moved: the source starts at state_data + 1, so moving `count` elements
      * would read one q15_t past the end of the state buffer. The final slot
      * keeps its old value here.
      * NOTE(review): that slot appears to be rewritten by the helper call for
      * the last batch below - confirm against the helper's implementation. */
     const int32_t state_elems = input_batches * feature_batches * time_batches;
     if (state_elems > 1)
     {
         memmove(state_data, state_data + 1, (size_t)(state_elems - 1) * sizeof(*state_data));
     }

     /* Feature stage: one helper call per batch. */
     for (int i_batch = 0; i_batch < input_batches; i_batch++)
     {
         q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1);
         const q7_t *weight = weights_feature_data;
         const q7_t *input = input_data + i_batch * input_height;

         arm_status res = arm_nn_vec_mat_mult_t_svdf_s8(input,
                                                        weight,
                                                        res_ptr,
                                                        -zp_in,
                                                        0,
                                                        time_batches,
                                                        multiplier_in,
                                                        shift_in,
                                                        input_height,
                                                        feature_batches,
                                                        in_activation_min,
                                                        in_activation_max);

         if (res != ARM_MATH_SUCCESS)
         {
             return res;
         }
     }

     /* Time stage: dot product of time weights and state for each feature row.
      * v2 walks the state buffer continuously across batches, while v1 restarts
      * at the beginning of the time weights for every batch. */
     {
         q31_t *ptr_a = buffer_a;
         const q15_t *v2 = state_data;
         for (int i_batch = 0; i_batch < input_batches; i_batch++)
         {
             const q15_t *v1 = weights_time_data;

             for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++)
             {
                 int32_t sum = 0;
 #if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
                 /* Consume two q15 values per operand per iteration. */
                 int j = 0;
                 int32_t block_count = time_batches >> 1;
                 for (int i = 0; i < block_count; i++)
                 {
                     j += 2;
                     q31_t r1 = arm_nn_read_q15x2_ia(&v1);
                     q31_t r2 = arm_nn_read_q15x2_ia(&v2);

                     sum = __SMLAD(r1, r2, sum);
                 }

                 // Process the remaining data
                 for (; j < time_batches; j++)
                 {
                     sum += *v1 * *v2;
                     v1++;
                     v2++;
                 }
 #else
                 for (int j = 0; j < time_batches; j++)
                 {
                     sum += *v1 * *v2;
                     v1++;
                     v2++;
                 }
 #endif

                 *ptr_a = sum;
                 ptr_a++;
             }
         }
     }

     /* Rank reduction: fold `rank` consecutive accumulators into one unit,
      * adding the bias when one is supplied. */
     if (bias_data)
     {
         if (unit_count == feature_batches)
         {
             /* One accumulator per unit: plain element-wise bias add. */
             for (int i = 0; i < input_batches; i++)
             {
                 q31_t *output_temp = buffer_b + i * feature_batches;
                 const q31_t *ptr_a = buffer_a + i * feature_batches;

                 const int32_t *bi = bias_data;
                 for (int j = 0; j < feature_batches; j++)
                 {
                     output_temp[j] = ptr_a[j] + bi[j];
                 }
             }
         }
         else
         {
             for (int i_batch = 0; i_batch < input_batches; i_batch++)
             {
                 q31_t *output_data_temp = buffer_b + i_batch * unit_count;
                 q31_t *ptr_a = buffer_a + i_batch * feature_batches;

                 for (int i = 0; i < unit_count; i++)
                 {
                     int32_t sum = bias_data[i];
                     for (int j = 0; j < rank; j++)
                     {
                         sum += *ptr_a;
                         ptr_a++;
                     }
                     output_data_temp[i] = sum;
                 }
             }
         }
     }
     else
     {
         for (int i_batch = 0; i_batch < input_batches; i_batch++)
         {
             q31_t *output_data_temp = buffer_b + i_batch * unit_count;
             q31_t *ptr_a = buffer_a + i_batch * feature_batches;

             for (int i = 0; i < unit_count; i++)
             {
                 int32_t sum = 0;
                 for (int j = 0; j < rank; j++)
                 {
                     sum += *ptr_a;
                     ptr_a++;
                 }
                 output_data_temp[i] = sum;
             }
         }
     }

     /* Output stage: requantize, add the output offset, clamp, narrow to q7_t. */
 #if defined(ARM_MATH_MVEI)
     int32_t num_elements = input_batches * unit_count;
     const int32_t loop_count = (num_elements + 3) / 4;
     for (int i_op = 0; i_op < loop_count; i_op++)
     {
         /* Predicate derived from the remaining element count; used for both
          * the load and the store of this group of four. */
         mve_pred16_t p = vctp32q((uint32_t)num_elements);
         int32x4_t op = vldrwq_z_s32(buffer_b, p);
         op = arm_requantize_mve(op, multiplier_out, shift_2);
         op = vaddq_n_s32(op, zp_out);
         const int32x4_t min_vec = vdupq_n_s32((int8_t)out_activation_min);
         const int32x4_t max_vec = vdupq_n_s32((int8_t)out_activation_max);
         op = vmaxq_s32(op, min_vec);
         op = vminq_s32(op, max_vec);
         vstrbq_p_s32(output_data, op, p);
         output_data += 4;
         buffer_b += 4;
         num_elements -= 4;
     }
 #else
     for (int i = 0; i < input_batches * unit_count; i++)
     {
         output_data[i] = (q7_t)CLAMP(
             arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min);
     }
 #endif

     return (ARM_MATH_SUCCESS);
 }

 /**
  * @} end of SVDF group
  */
	/*
	* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
	*
	* SPDX-License-Identifier: Apache-2.0
	*
	* Licensed under the Apache License, Version 2.0 (the License); you may
	* not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/* ----------------------------------------------------------------------
	* Project: CMSIS NN Library
	* Title: arm_svdf_s8.c
	* Description: S8 basic SVDF layer function
	*
	* $Date: 15. April 2021
	* $Revision: V.1.5.0
	*
	* Target Processor: Cortex-M processors
	*
	* -------------------------------------------------------------------- */

	#include "arm_nnfunctions.h"
	#include "arm_nnsupportfunctions.h"

	/**
	* @ingroup groupNN
	*/

	/**
	* @addtogroup SVDF
	* @{
	*/

	/*
	* S8 SVDF layer function for TensorFlow Lite
	*
	* Refer to header file for details.
	*
	*/

	/**
	 * S8 SVDF layer function.
	 *
	 * Processing steps, as implemented below:
	 *  1. Shift the whole state buffer left by one q15 element.
	 *  2. For every batch, call arm_nn_vec_mat_mult_t_svdf_s8() with the batch's
	 *     input vector, the feature weights and a destination pointer located at
	 *     the last time slot of the batch's first feature row.
	 *  3. For every (batch, feature) pair, accumulate the dot product of
	 *     time_batches entries of weights_time_data and state_data into
	 *     input_ctx->buf.
	 *  4. Sum each group of `rank` consecutive accumulators (plus bias_data[i]
	 *     when bias_data is non-NULL) into output_ctx->buf.
	 *  5. Requantize with the output multiplier/shift, add the output offset,
	 *     clamp to the output activation range and store as q7_t.
	 *
	 * state_dims, bias_dims and output_dims are unused. bias_data may be NULL.
	 *
	 * @return ARM_MATH_SUCCESS, or the first non-success status returned by
	 *         arm_nn_vec_mat_mult_t_svdf_s8().
	 *
	 * NOTE(review): svdf_params->rank is used as a divisor and is assumed to be
	 * non-zero - confirm that callers guarantee this.
	 */
	arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
	                       const cmsis_nn_context *output_ctx,
	                       const cmsis_nn_svdf_params *svdf_params,
	                       const cmsis_nn_per_tensor_quant_params *input_quant_params,
	                       const cmsis_nn_per_tensor_quant_params *output_quant_params,
	                       const cmsis_nn_dims *input_dims,
	                       const q7_t *input_data,
	                       const cmsis_nn_dims *state_dims,
	                       q15_t *state_data,
	                       const cmsis_nn_dims *weights_feature_dims,
	                       const q7_t *weights_feature_data,
	                       const cmsis_nn_dims *weights_time_dims,
	                       const q15_t *weights_time_data,
	                       const cmsis_nn_dims *bias_dims,
	                       const q31_t *bias_data,
	                       const cmsis_nn_dims *output_dims,
	                       q7_t *output_data)
	{
	    (void)bias_dims;
	    (void)state_dims;
	    (void)output_dims;

	    const q31_t multiplier_in = input_quant_params->multiplier;
	    const q31_t shift_in = input_quant_params->shift;
	    const q31_t multiplier_out = output_quant_params->multiplier;
	    const q31_t shift_2 = output_quant_params->shift;
	    const int32_t zp_in = svdf_params->input_offset;
	    const int32_t zp_out = svdf_params->output_offset;
	    const int32_t in_activation_min = svdf_params->input_activation.min;
	    const int32_t in_activation_max = svdf_params->input_activation.max;
	    const int32_t out_activation_min = svdf_params->output_activation.min;
	    const int32_t out_activation_max = svdf_params->output_activation.max;
	    const int16_t rank = svdf_params->rank;

	    const int32_t input_batches = input_dims->n;
	    const int32_t input_height = input_dims->h;
	    const int32_t feature_batches = weights_feature_dims->n;
	    const int32_t time_batches = weights_time_dims->h;
	    const int32_t unit_count = feature_batches / rank;

	    /* Scratch buffers are accessed as arrays of q31_t. */
	    q31_t *buffer_a = (q31_t *)input_ctx->buf;
	    q31_t *buffer_b = (q31_t *)output_ctx->buf;

	    /* Shift the state left by one element. Only (count - 1) elements can be
	     * moved: the source starts at state_data + 1, so moving `count` elements
	     * would read one q15_t past the end of the state buffer. The final slot
	     * keeps its old value here.
	     * NOTE(review): that slot appears to be rewritten by the helper call for
	     * the last batch below - confirm against the helper's implementation. */
	    const int32_t state_elems = input_batches * feature_batches * time_batches;
	    if (state_elems > 1)
	    {
	        memmove(state_data, state_data + 1, (size_t)(state_elems - 1) * sizeof(*state_data));
	    }

	    /* Feature stage: one helper call per batch. */
	    for (int i_batch = 0; i_batch < input_batches; i_batch++)
	    {
	        q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1);
	        const q7_t *weight = weights_feature_data;
	        const q7_t *input = input_data + i_batch * input_height;

	        arm_status res = arm_nn_vec_mat_mult_t_svdf_s8(input,
	                                                       weight,
	                                                       res_ptr,
	                                                       -zp_in,
	                                                       0,
	                                                       time_batches,
	                                                       multiplier_in,
	                                                       shift_in,
	                                                       input_height,
	                                                       feature_batches,
	                                                       in_activation_min,
	                                                       in_activation_max);

	        if (res != ARM_MATH_SUCCESS)
	        {
	            return res;
	        }
	    }

	    /* Time stage: dot product of time weights and state for each feature row.
	     * v2 walks the state buffer continuously across batches, while v1 restarts
	     * at the beginning of the time weights for every batch. */
	    {
	        q31_t *ptr_a = buffer_a;
	        const q15_t *v2 = state_data;
	        for (int i_batch = 0; i_batch < input_batches; i_batch++)
	        {
	            const q15_t *v1 = weights_time_data;

	            for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++)
	            {
	                int32_t sum = 0;
	#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
	                /* Consume two q15 values per operand per iteration. */
	                int j = 0;
	                int32_t block_count = time_batches >> 1;
	                for (int i = 0; i < block_count; i++)
	                {
	                    j += 2;
	                    q31_t r1 = arm_nn_read_q15x2_ia(&v1);
	                    q31_t r2 = arm_nn_read_q15x2_ia(&v2);

	                    sum = __SMLAD(r1, r2, sum);
	                }

	                // Process the remaining data
	                for (; j < time_batches; j++)
	                {
	                    sum += *v1 * *v2;
	                    v1++;
	                    v2++;
	                }
	#else
	                for (int j = 0; j < time_batches; j++)
	                {
	                    sum += *v1 * *v2;
	                    v1++;
	                    v2++;
	                }
	#endif

	                *ptr_a = sum;
	                ptr_a++;
	            }
	        }
	    }

	    /* Rank reduction: fold `rank` consecutive accumulators into one unit,
	     * adding the bias when one is supplied. */
	    if (bias_data)
	    {
	        if (unit_count == feature_batches)
	        {
	            /* One accumulator per unit: plain element-wise bias add. */
	            for (int i = 0; i < input_batches; i++)
	            {
	                q31_t *output_temp = buffer_b + i * feature_batches;
	                const q31_t *ptr_a = buffer_a + i * feature_batches;

	                const int32_t *bi = bias_data;
	                for (int j = 0; j < feature_batches; j++)
	                {
	                    output_temp[j] = ptr_a[j] + bi[j];
	                }
	            }
	        }
	        else
	        {
	            for (int i_batch = 0; i_batch < input_batches; i_batch++)
	            {
	                q31_t *output_data_temp = buffer_b + i_batch * unit_count;
	                q31_t *ptr_a = buffer_a + i_batch * feature_batches;

	                for (int i = 0; i < unit_count; i++)
	                {
	                    int32_t sum = bias_data[i];
	                    for (int j = 0; j < rank; j++)
	                    {
	                        sum += *ptr_a;
	                        ptr_a++;
	                    }
	                    output_data_temp[i] = sum;
	                }
	            }
	        }
	    }
	    else
	    {
	        for (int i_batch = 0; i_batch < input_batches; i_batch++)
	        {
	            q31_t *output_data_temp = buffer_b + i_batch * unit_count;
	            q31_t *ptr_a = buffer_a + i_batch * feature_batches;

	            for (int i = 0; i < unit_count; i++)
	            {
	                int32_t sum = 0;
	                for (int j = 0; j < rank; j++)
	                {
	                    sum += *ptr_a;
	                    ptr_a++;
	                }
	                output_data_temp[i] = sum;
	            }
	        }
	    }

	    /* Output stage: requantize, add the output offset, clamp, narrow to q7_t. */
	#if defined(ARM_MATH_MVEI)
	    int32_t num_elements = input_batches * unit_count;
	    const int32_t loop_count = (num_elements + 3) / 4;
	    for (int i_op = 0; i_op < loop_count; i_op++)
	    {
	        /* Predicate derived from the remaining element count; used for both
	         * the load and the store of this group of four. */
	        mve_pred16_t p = vctp32q((uint32_t)num_elements);
	        int32x4_t op = vldrwq_z_s32(buffer_b, p);
	        op = arm_requantize_mve(op, multiplier_out, shift_2);
	        op = vaddq_n_s32(op, zp_out);
	        const int32x4_t min_vec = vdupq_n_s32((int8_t)out_activation_min);
	        const int32x4_t max_vec = vdupq_n_s32((int8_t)out_activation_max);
	        op = vmaxq_s32(op, min_vec);
	        op = vminq_s32(op, max_vec);
	        vstrbq_p_s32(output_data, op, p);
	        output_data += 4;
	        buffer_b += 4;
	        num_elements -= 4;
	    }
	#else
	    for (int i = 0; i < input_batches * unit_count; i++)
	    {
	        output_data[i] = (q7_t)CLAMP(
	            arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min);
	    }
	#endif

	    return (ARM_MATH_SUCCESS);
	}

	/**
	* @} end of SVDF group
	*/