Unverified commit e5b0712b authored by lukmanr-cadence, committed by GitHub

Cadence: LUT-based implementation for sigmoid and tanh

The LUT-based reference_integer_ops implementations of sigmoid and tanh are
now called instead of the gemmlowp-based implementations.
BUG=#1148
Parent 4c5b014c
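For context, a minimal standalone sketch of the two int16 LUT kernels this change switches to. The driver below is hypothetical; the kernel signatures and the zero multiplier/shift arguments mirror the calls in the diff, which assume gate data already in Q3.12 format.

```cpp
#include <cstdint>

#include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h"
#include "tensorflow/lite/kernels/internal/types.h"

int main() {
  // Q3.12 samples: 4096 represents 1.0, -8192 represents -2.0.
  int16_t gate[8] = {-8192, -4096, -2048, 0, 1024, 2048, 4096, 8192};

  // int16 LUT sigmoid: flat array in, flat array out (in place, as in the diff).
  tflite::reference_integer_ops::Logistic(
      /*input_multiplier=*/0, /*input_left_shift=*/0, /*input_size=*/8, gate,
      gate);

  // int16 LUT tanh: the same buffer viewed through a rank-1 RuntimeShape.
  int32_t dims_data = 8;
  const tflite::RuntimeShape shape(1, &dims_data);
  tflite::reference_integer_ops::Tanh(/*input_multiplier=*/0,
                                      /*input_left_shift=*/0, shape, gate,
                                      shape, gate);
  return 0;
}
```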
@@ -22,6 +22,8 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/op_macros.h"
 #include "tensorflow/lite/micro/kernels/kernel_util.h"
@@ -530,11 +532,20 @@ void CalculateLstmGateInteger8x8_16(
   // Apply activation
   switch (activation) {
     case kTfLiteActSigmoid:
-      micro_tensor_utils::ApplySigmoid(gate, n_batch, n_cell, gate);
-      break;
-    case kTfLiteActTanh:
-      micro_tensor_utils::ApplyTanh(3, gate, n_batch, n_cell, gate);
+      reference_integer_ops::Logistic(
+          0 /*data->input_multiplier*/, 0 /*data->input_left_shift */,
+          n_batch * n_cell /*NumElements(input->dims)*/,
+          gate /* tflite::micro::GetTensorData<int16_t>(input) */,
+          gate /*tflite::micro::GetTensorData<int16_t>(output) */);
       break;
+    case kTfLiteActTanh: {
+      int32_t dims_data = n_batch * n_cell;
+      RuntimeShape tanh_inp_shape = RuntimeShape(1, &dims_data);
+      reference_integer_ops::Tanh(0, 0, tanh_inp_shape, gate, tanh_inp_shape,
+                                  gate);
+    } break;
     default:
       // Only Sigmoid or Tanh is used.
       TFLITE_ASSERT_FALSE;
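One equivalence worth noting (my reading, not stated in the commit message): the removed ApplyTanh(3, gate, ...) call treated the gate as Q3.12, i.e. 3 integer bits and 12 fractional bits, and with multiplier/shift arguments of 0 the LUT kernels consume the same Q3.12 input, so the gate's fixed-point format is unchanged by this switch. Hypothetical converters to make the format concrete:

```cpp
#include <algorithm>
#include <cstdint>

// Q3.12: 3 integer bits, 12 fractional bits, so 1.0f maps to 1 << 12 = 4096.
// These helpers are illustrative only; they are not part of the commit.
inline int16_t FloatToQ3_12(float x) {
  const int32_t q = static_cast<int32_t>(x * 4096.0f);
  return static_cast<int16_t>(std::clamp(q, -32768, 32767));
}

inline float Q3_12ToFloat(int16_t q) { return q / 4096.0f; }
```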
@@ -599,7 +610,7 @@ void UpdateLstmCellInteger(int n_batch, int n_cell, int16_t* cell_state,
 // - scratch1: scratch area of size n_batch*n_cell
 // - scratch2: scratch area used by MatrixBatchVectorMultiplyAccumulate
 void CalculateLstmOutputInteger8x8_16(
-    int n_batch, int n_cell, int n_output, const int16_t* cell_state,
+    int n_batch, int n_cell, int n_output, int16_t* cell_state,
     int32_t cell_state_scale, const int16_t* output_gate,
     int32_t hidden_scale_a, int32_t hidden_scale_b, int32_t hidden_zp,
     const int8_t* projection_weights, int32_t proj_scale_a,
@@ -607,8 +618,23 @@ void CalculateLstmOutputInteger8x8_16(
     int32_t output_state_zp, int8_t quantized_proj_clip, int8_t* output_state,
     int16_t* scratch0, int8_t* scratch1, int32_t* scratch2) {
   // Note: unlike float/hybrid, the activation is always Tanh.
-  micro_tensor_utils::ApplyTanh(15 + cell_state_scale, cell_state, n_batch,
-                                n_cell, scratch0);
+  {
+    int32_t tanh_input_left_shift = (15 + cell_state_scale) - 3;
+    int32_t dims_data = n_batch * n_cell;
+    if (tanh_input_left_shift < 0) /* handling negative shift value */
+    {
+      int32_t i;
+      tanh_input_left_shift = -tanh_input_left_shift;
+      for (i = 0; i < dims_data; i++) {
+        cell_state[i] = cell_state[i] >> tanh_input_left_shift;
+      }
+      tanh_input_left_shift = 0;
+    }
+    RuntimeShape tanh_inp_shape = RuntimeShape(1, &dims_data);
+    reference_integer_ops::Tanh(0, tanh_input_left_shift, tanh_inp_shape,
+                                cell_state, tanh_inp_shape, scratch0);
+  }
   micro_tensor_utils::CwiseMul(output_gate, scratch0, hidden_scale_a,
                                hidden_scale_b, n_batch, n_cell, hidden_zp,
                                scratch1);
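The arithmetic behind this block, reconstructed from the code: cell_state_scale is the power-of-two exponent of the cell state's quantization scale, so the state carries -cell_state_scale fractional bits, while the LUT Tanh expects Q3.12 input (12 fractional bits). The required left shift is therefore 12 + cell_state_scale, written above as (15 + cell_state_scale) - 3; a negative value means the state has more than 12 fractional bits and is pre-shifted right in place, which is why cell_state loses its const qualifier in the signature hunk. A condensed sketch (the free-standing function is hypothetical):

```cpp
#include <cstdint>

#include "tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h"
#include "tensorflow/lite/kernels/internal/types.h"

// Example: cell_state_scale = -11 (a Q4.11 cell state) gives a left shift of
// (15 - 11) - 3 = 1, i.e. one bit left to reach Q3.12.
void TanhOnCellState(int16_t* cell_state, int32_t n, int32_t cell_state_scale,
                     int16_t* out) {
  int32_t left_shift = (15 + cell_state_scale) - 3;
  if (left_shift < 0) {
    // More than 12 fractional bits: drop the extra precision in place first.
    for (int32_t i = 0; i < n; ++i) {
      cell_state[i] = static_cast<int16_t>(cell_state[i] >> -left_shift);
    }
    left_shift = 0;
  }
  const tflite::RuntimeShape shape(1, &n);
  tflite::reference_integer_ops::Tanh(/*input_multiplier=*/0, left_shift,
                                      shape, cell_state, shape, out);
}
```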
@@ -778,7 +778,7 @@ TF_LITE_MICRO_TEST(TestOneFullyQuantizedLSTM) {
   // Verify results.
   const int16_t expected_cell[20] = {
-      7, 1, 3, 2, 0, 1, 0, 2, -2, 4, 1, 6, 4, 3, 0, 1, 0, 2, -2, 4,
+      7, 2, 3, 3, 0, 1, 0, 2, -2, 3, 1, 6, 3, 3, 0, 1, 0, 2, -2, 3,
   };
   const int8_t expected_activation[12] = {
       50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
@@ -237,7 +237,7 @@ LstmIntegerTestConfig lstm_integer_no_peephole_config = {
                           kLstmSeqLengthIntegerNoPeephole *
                           kLstmNumOutputIntegerNoPeephole]){
         127, 127, -108, -67, 127, 127, -128, 127, 127, -128, 127, 127, 127,
-        127, 127, -128, 127, 127},
+        127, 13, -128, 127, 127},
     .asymmetric_quantize_inputs = false,
     .ranges = {{-1.0, 127.0 / 128},
@@ -484,8 +484,8 @@ LstmIntegerTestConfig lstm_integer_peephole_config = {
     .expected_output =
         (int8_t[kLstmNumBatchIntegerPeephole * kLstmSeqLengthIntegerPeephole *
                 kLstmNumOutputIntegerPeephole]){127, 127, -16, -21, 127, 127,
-                                                23, 127, 127, -128, 127, 127,
-                                                127, 127, 127, -128, 127, 127},
+                                                -128, 127, 127, -128, 127, 127,
+                                                127, 127, 105, -128, 127, 127},
     .asymmetric_quantize_inputs = false,
     .ranges = {{-1.0, 127.0 / 128},