Unverified commit a3e7bdad, authored by Måns Nilsson, committed by GitHub

SoftMax Quantization specific registration for CMSIS-NN (#1018)

* Quantization specific registration for CMSIS-NN

Adds three type-specific registrations for the softmax kernel:
- Pure int8.
- Pure int16.
- One with int8 input and int16 output.

To avoid duplicating code, CalculateSoftmaxParams is made public.

Change-Id: I51de85e85f3bfb7a2d936593bd6512f263e6e5e4

* Add helper function InitializeLutForInt16

* Do not inline helper function
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
Parent 67a2b960
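For illustration only (not part of this commit): a minimal sketch of how one of the new type-specific registrations is selected when building the op resolver, mirroring the example and test updates in the diff below. The resolver size and the RegisterOps() wrapper are placeholders; only the AddSoftmax() overload and Register_SOFTMAX_INT8() come from this change.

// Sketch, assuming a fully int8-quantized model.
#include "tensorflow/lite/micro/kernels/softmax.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

// NOLINTNEXTLINE(runtime-global-variables)
tflite::MicroMutableOpResolver<1> micro_op_resolver;

void RegisterOps() {
  // Register only the latency-optimized CMSIS-NN int8 softmax kernel instead
  // of the generic Register_SOFTMAX(), whose Eval dispatches on tensor type
  // at every Invoke.
  micro_op_resolver.AddSoftmax(tflite::Register_SOFTMAX_INT8());
}

Register_SOFTMAX_INT8_INT16() and Register_SOFTMAX_INT16() are used the same way for int8-input/int16-output and pure-int16 models; on targets without CMSIS-NN, the inline stubs added to softmax.h fall back to Register_SOFTMAX().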
@@ -79,7 +79,7 @@ void setup() {
micro_op_resolver.AddDepthwiseConv2D(
tflite::Register_DEPTHWISE_CONV_2D_INT8());
micro_op_resolver.AddReshape();
micro_op_resolver.AddSoftmax();
micro_op_resolver.AddSoftmax(tflite::Register_SOFTMAX_INT8());
// Build an interpreter to run the model with.
// NOLINTNEXTLINE(runtime-global-variables)
...
@@ -59,7 +59,7 @@ TF_LITE_MICRO_TEST(TestInvoke) {
micro_op_resolver.AddDepthwiseConv2D(
tflite::Register_DEPTHWISE_CONV_2D_INT8());
micro_op_resolver.AddReshape();
micro_op_resolver.AddSoftmax();
micro_op_resolver.AddSoftmax(tflite::Register_SOFTMAX_INT8());
// Build an interpreter to run the model with.
tflite::MicroInterpreter interpreter(model, micro_op_resolver, tensor_arena,
...
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -28,36 +28,50 @@ limitations under the License.
namespace tflite {
namespace {
void SoftmaxQuantized(const TfLiteEvalTensor* input, TfLiteEvalTensor* output,
const SoftmaxParams& op_data) {
if (input->type == kTfLiteInt8) {
if (output->type == kTfLiteInt16) {
tflite::reference_ops::Softmax(
op_data, tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<int8_t>(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int16_t>(output));
} else {
const auto input_shape = tflite::micro::GetTensorShape(input);
const auto output_shape = tflite::micro::GetTensorShape(output);
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int outer_size =
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
const int depth =
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
arm_softmax_s8(tflite::micro::GetTensorData<int8_t>(input), outer_size,
depth, op_data.input_multiplier, op_data.input_left_shift,
op_data.diff_min,
tflite::micro::GetTensorData<int8_t>(output));
}
} else {
tflite::reference_ops::SoftmaxInt16(
op_data, tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<int16_t>(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int16_t>(output));
}
}
struct CMSISNNSoftmaxParams {
SoftmaxParams softmax_params;
int32_t num_rows;
int32_t row_size;
};
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
return context->AllocatePersistentBuffer(context,
sizeof(CMSISNNSoftmaxParams));
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
MicroContext* micro_context = GetMicroContext(context);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
TF_LITE_ENSURE(context, input != nullptr);
TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
TF_LITE_ENSURE(context, output != nullptr);
TF_LITE_ENSURE(context, node->user_data != nullptr);
CMSISNNSoftmaxParams* op_data =
static_cast<CMSISNNSoftmaxParams*>(node->user_data);
auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
auto ret_val = CalculateSoftmaxParams(context, input, output, params,
&op_data->softmax_params);
const auto input_shape = GetTensorShape(input);
const auto output_shape = GetTensorShape(output);
const int trailing_dim = input_shape.DimensionsCount() - 1;
const int outer_size =
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
const int depth =
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
op_data->num_rows = outer_size;
op_data->row_size = depth;
micro_context->DeallocateTempTfLiteTensor(input);
micro_context->DeallocateTempTfLiteTensor(output);
return ret_val;
}
TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
@@ -65,21 +79,48 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
TFLITE_DCHECK(node->user_data != nullptr);
const SoftmaxParams data =
*static_cast<const SoftmaxParams*>(node->user_data);
const CMSISNNSoftmaxParams op_data =
*static_cast<const CMSISNNSoftmaxParams*>(node->user_data);
switch (input->type) {
case kTfLiteFloat32: {
tflite::reference_ops::Softmax(
data, tflite::micro::GetTensorShape(input),
op_data.softmax_params, tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<float>(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<float>(output));
return kTfLiteOk;
}
case kTfLiteInt8:
case kTfLiteInt8: {
if (output->type == kTfLiteInt8) {
arm_softmax_s8(tflite::micro::GetTensorData<int8_t>(input),
op_data.num_rows, op_data.row_size,
op_data.softmax_params.input_multiplier,
op_data.softmax_params.input_left_shift,
op_data.softmax_params.diff_min,
tflite::micro::GetTensorData<int8_t>(output));
} else {
arm_softmax_s8_s16(tflite::micro::GetTensorData<int8_t>(input),
op_data.num_rows, op_data.row_size,
op_data.softmax_params.input_multiplier,
op_data.softmax_params.input_left_shift,
op_data.softmax_params.diff_min,
tflite::micro::GetTensorData<int16_t>(output));
}
return kTfLiteOk;
}
case kTfLiteInt16: {
SoftmaxQuantized(input, output, data);
const cmsis_nn_softmax_lut_s16 softmax_params = {
.exp_lut = op_data.softmax_params.exp_lut,
.one_by_one_lut = op_data.softmax_params.one_over_one_plus_x_lut};
TFLITE_DCHECK_EQ(
arm_softmax_s16(
tflite::micro::GetTensorData<int16_t>(input), op_data.num_rows,
op_data.row_size, op_data.softmax_params.input_multiplier,
op_data.softmax_params.input_left_shift, &softmax_params,
tflite::micro::GetTensorData<int16_t>(output)),
ARM_CMSIS_NN_SUCCESS);
return kTfLiteOk;
}
default:
@@ -89,10 +130,79 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
}
}
TfLiteStatus SoftmaxEvalInt8(TfLiteContext* context, TfLiteNode* node) {
const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
TFLITE_DCHECK(node->user_data != nullptr);
const CMSISNNSoftmaxParams op_data =
*static_cast<const CMSISNNSoftmaxParams*>(node->user_data);
arm_softmax_s8(tflite::micro::GetTensorData<int8_t>(input), op_data.num_rows,
op_data.row_size, op_data.softmax_params.input_multiplier,
op_data.softmax_params.input_left_shift,
op_data.softmax_params.diff_min,
tflite::micro::GetTensorData<int8_t>(output));
return kTfLiteOk;
}
TfLiteStatus SoftmaxEvalInt8_Int16(TfLiteContext* context, TfLiteNode* node) {
const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
TFLITE_DCHECK(node->user_data != nullptr);
const CMSISNNSoftmaxParams op_data =
*static_cast<const CMSISNNSoftmaxParams*>(node->user_data);
arm_softmax_s8_s16(
tflite::micro::GetTensorData<int8_t>(input), op_data.num_rows,
op_data.row_size, op_data.softmax_params.input_multiplier,
op_data.softmax_params.input_left_shift, op_data.softmax_params.diff_min,
tflite::micro::GetTensorData<int16_t>(output));
return kTfLiteOk;
}
TfLiteStatus SoftmaxEvalInt16(TfLiteContext* context, TfLiteNode* node) {
const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
TFLITE_DCHECK(node->user_data != nullptr);
const CMSISNNSoftmaxParams op_data =
*static_cast<const CMSISNNSoftmaxParams*>(node->user_data);
const cmsis_nn_softmax_lut_s16 softmax_params = {
.exp_lut = op_data.softmax_params.exp_lut,
.one_by_one_lut = op_data.softmax_params.one_over_one_plus_x_lut};
TFLITE_DCHECK_EQ(
arm_softmax_s16(tflite::micro::GetTensorData<int16_t>(input),
op_data.num_rows, op_data.row_size,
op_data.softmax_params.input_multiplier,
op_data.softmax_params.input_left_shift, &softmax_params,
tflite::micro::GetTensorData<int16_t>(output)),
ARM_CMSIS_NN_SUCCESS);
return kTfLiteOk;
}
} // namespace
TfLiteRegistration Register_SOFTMAX() {
return tflite::micro::RegisterOp(SoftmaxInit, SoftmaxPrepare, SoftmaxEval);
return tflite::micro::RegisterOp(Init, Prepare, SoftmaxEval);
}
TfLiteRegistration Register_SOFTMAX_INT8() {
return tflite::micro::RegisterOp(Init, Prepare, SoftmaxEvalInt8);
}
TfLiteRegistration Register_SOFTMAX_INT8_INT16() {
return tflite::micro::RegisterOp(Init, Prepare, SoftmaxEvalInt8_Int16);
}
TfLiteRegistration Register_SOFTMAX_INT16() {
return tflite::micro::RegisterOp(Init, Prepare, SoftmaxEvalInt16);
}
} // namespace tflite
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -23,6 +23,13 @@ namespace tflite {
void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length);
// Common helper function to SoftmaxPrepare.
TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context,
const TfLiteTensor* input,
TfLiteTensor* output,
const TfLiteSoftmaxParams* params,
SoftmaxParams* op_data);
TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node);
// This is the most generic TfLiteRegistration. The actual supported types may
@@ -30,7 +37,7 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node);
// (reference or optimized) must define this function.
TfLiteRegistration Register_SOFTMAX();
#if defined(XTENSA)
#if defined(XTENSA) || defined(CMSIS_NN)
// Returns a TfLiteRegistration struct for kernel variant that only supports
// int8 input and int16 output.
TfLiteRegistration Register_SOFTMAX_INT8_INT16();
@@ -40,6 +47,23 @@ inline TfLiteRegistration Register_SOFTMAX_INT8_INT16() {
}
#endif
#if defined(CMSIS_NN)
// Returns a TfLiteRegistration struct for kernel variant that only supports
// int8 input/output and uses the latency optimized implementations.
TfLiteRegistration Register_SOFTMAX_INT8();
// Returns a TfLiteRegistration struct for kernel variant that only supports
// int16 input/output and uses the latency optimized implementations.
TfLiteRegistration Register_SOFTMAX_INT16();
#else
inline TfLiteRegistration Register_SOFTMAX_INT8() { return Register_SOFTMAX(); }
inline TfLiteRegistration Register_SOFTMAX_INT16() {
return Register_SOFTMAX();
}
#endif
} // namespace tflite
#endif // TENSORFLOW_LITE_MICRO_KERNELS_SOFTMAX_H_
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -28,11 +28,59 @@ namespace {
// Softmax parameter data that persists in user_data
const int kInt16LUTArraySize = 513;
TfLiteStatus InitializeLutForInt16(TfLiteContext* context,
const TfLiteTensor* input,
TfLiteTensor* output,
SoftmaxParams* op_data) {
// Only allocate LUTs for KTfLiteInt16 data type
if (input->type == kTfLiteInt16) {
void* raw_exp_lut = context->AllocatePersistentBuffer(
context, sizeof(int16_t) * kInt16LUTArraySize);
TF_LITE_ENSURE(context, raw_exp_lut != nullptr);
op_data->exp_lut = reinterpret_cast<int16_t*>(raw_exp_lut);
void* one_over_one_plus_x_lut = context->AllocatePersistentBuffer(
context, sizeof(int16_t) * kInt16LUTArraySize);
TF_LITE_ENSURE(context, one_over_one_plus_x_lut != nullptr);
op_data->one_over_one_plus_x_lut =
reinterpret_cast<int16_t*>(one_over_one_plus_x_lut);
}
if (output->type == kTfLiteInt16) {
TF_LITE_ENSURE(context,
input->type == kTfLiteInt8 || input->type == kTfLiteInt16);
} else {
TF_LITE_ENSURE_EQ(context, input->type, output->type);
}
// Populate LUT if required
if (input->type == kTfLiteInt16) {
TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
// exp LUT only used on negative values
// we consider exp(-10.0) is insignificant to accumulation
gen_lut<float, int16_t, int16_t>(
[](float value) { return std::exp(value); }, -10.0f, 0.0f, -1.0f, 1.0f,
op_data->exp_lut);
gen_lut<float, int16_t, int16_t>(
[](float value) { return 1.0f / (1.0f + value); }, 0.0f, 1.0f, -1.0f,
1.0f, op_data->one_over_one_plus_x_lut);
op_data->zero_point = output->params.zero_point;
op_data->scale = output->params.scale;
}
return kTfLiteOk;
}
} // namespace
TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context,
const TfLiteTensor* input,
TfLiteTensor* output,
const TfLiteSoftmaxParams* params,
SoftmaxParams* op_data) {
if (InitializeLutForInt16(context, input, output, op_data) != kTfLiteOk) {
return kTfLiteError;
}
if (input->type == kTfLiteInt8 || input->type == kTfLiteInt16) {
if (input->type == kTfLiteInt16) {
TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
@@ -83,8 +131,6 @@ TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context,
return kTfLiteOk;
}
} // namespace
void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
return context->AllocatePersistentBuffer(context, sizeof(SoftmaxParams));
@@ -103,40 +149,6 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE(context, node->user_data != nullptr);
SoftmaxParams* op_data = static_cast<SoftmaxParams*>(node->user_data);
// Only allocate LUTs for KTfLiteInt16 data type
if (input->type == kTfLiteInt16) {
void* raw_exp_lut = context->AllocatePersistentBuffer(
context, sizeof(int16_t) * kInt16LUTArraySize);
TF_LITE_ENSURE(context, raw_exp_lut != nullptr);
op_data->exp_lut = reinterpret_cast<int16_t*>(raw_exp_lut);
void* one_over_one_plus_x_lut = context->AllocatePersistentBuffer(
context, sizeof(int16_t) * kInt16LUTArraySize);
TF_LITE_ENSURE(context, one_over_one_plus_x_lut != nullptr);
op_data->one_over_one_plus_x_lut =
reinterpret_cast<int16_t*>(one_over_one_plus_x_lut);
}
if (output->type == kTfLiteInt16) {
TF_LITE_ENSURE(context,
input->type == kTfLiteInt8 || input->type == kTfLiteInt16);
} else {
TF_LITE_ENSURE_EQ(context, input->type, output->type);
}
// Populate LUT if required
if (input->type == kTfLiteInt16) {
TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
// exp LUT only used on negative values
// we consider exp(-10.0) is insignificant to accumulation
gen_lut<float, int16_t, int16_t>(
[](float value) { return std::exp(value); }, -10.0f, 0.0f, -1.0f, 1.0f,
op_data->exp_lut);
gen_lut<float, int16_t, int16_t>(
[](float value) { return 1.0f / (1.0f + value); }, 0.0f, 1.0f, -1.0f,
1.0f, op_data->one_over_one_plus_x_lut);
op_data->zero_point = output->params.zero_point;
op_data->scale = output->params.scale;
}
auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
auto ret_val =
...