Unverified commit 46ce0bd3, authored by Artem Tsvetkov, committed by GitHub

Updated fully_connected kernel to work with embARC MLI Library 2.0 for ARC (#229)

* Updated fully_connected kernel to work with embARC MLI Library 2.0

* Update fully_connected.cc

* Minor fix for fully_connected activation functions.
Parent: 56d10b04
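For context, the sketch below is editorial and not part of the commit; it condenses the activation handling this change introduces. With embARC MLI 2.0 the fully-connected kernel takes a mli_fully_connected_cfg whose relu field carries the fused activation, so Prepare() now maps the TfLite activation onto the MLI ReLU types. The helper name is hypothetical; the types and enum values are taken from the diff below and are assumed to come from the embARC MLI 2.0 and TFLM headers.

// Hypothetical helper mirroring the Prepare() mapping in the diff below.
// mli_fully_connected_cfg and the MLI_RELU_* values come from embARC MLI 2.0;
// TfLiteFusedActivation and the kTfLiteAct* values come from TFLM.
void SetMliReluFromTfLiteActivation(TfLiteFusedActivation activation,
                                    int32_t output_activation_min,
                                    int32_t output_activation_max,
                                    mli_fully_connected_cfg* cfg) {
  if (output_activation_min == -128 && output_activation_max == 127) {
    // A clamp to the full int8 range is a no-op, so no MLI ReLU is needed.
    cfg->relu.type = MLI_RELU_NONE;
  } else if (activation == kTfLiteActRelu) {
    cfg->relu.type = MLI_RELU_GEN;
  } else if (activation == kTfLiteActRelu6) {
    cfg->relu.type = MLI_RELU_6;
  } else if (activation == kTfLiteActReluN1To1) {
    cfg->relu.type = MLI_RELU_1;
  } else {
    cfg->relu.type = MLI_RELU_NONE;
  }
}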
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -52,10 +52,14 @@ struct OpData {
bool is_mli_applicable;
// Tensors in MLI format.
mli_tensor* mli_in;
mli_tensor* mli_weights;
mli_tensor* mli_bias;
mli_tensor* mli_out;
mutable ops::micro::MliTensorInterface mli_in;
mutable ops::micro::MliTensorInterface mli_weights;
mutable ops::micro::MliTensorInterface mli_bias;
mutable ops::micro::MliTensorInterface mli_out;
#ifdef MLI_2_0
mli_fully_connected_cfg* cfg;
#endif
};
constexpr int kInputTensor = 0;
@@ -65,13 +69,19 @@ constexpr int kOutputTensor = 0;
bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
const TfLiteFullyConnectedParams* params) {
const TfLiteFullyConnectedParams* params,
int32_t output_activation_min,
int32_t output_activation_max) {
// MLI optimized version only supports int8_t datatype, no fused Relu, and
// symmetric per-tensor quantization of weights (not per-axis)
bool ret_val = (filter->type == kTfLiteInt8) &&
(input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) &&
(params->activation == kTfLiteActNone) &&
(filter->params.zero_point == 0);
bool ret_val =
(filter->type == kTfLiteInt8) && (input->type == kTfLiteInt8) &&
(bias->type == kTfLiteInt32) &&
#ifndef MLI_2_0
(params->activation == kTfLiteActNone ||
(output_activation_min == -128 && output_activation_max == 127)) &&
#endif
(filter->params.zero_point == 0);
return ret_val;
}
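The activation-range arguments added above rely on the fact that, for int8 outputs, a fused activation whose clamp covers the whole representable range is effectively no activation at all. A minimal standalone sketch of that reasoning (the helper name is hypothetical):

#include <cstdint>
#include <limits>

// For int8 data, clamping to [-128, 127] changes nothing, so the MLI path can
// treat such a fused activation exactly like kTfLiteActNone.
bool ClampIsNoOpForInt8(int32_t output_activation_min,
                        int32_t output_activation_max) {
  return output_activation_min == std::numeric_limits<int8_t>::min() &&  // -128
         output_activation_max == std::numeric_limits<int8_t>::max();    // 127
}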
@@ -126,37 +136,59 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
data->filter_zero_point = filter->params.zero_point;
data->output_zero_point = output->params.zero_point;
TfLiteStatus status = CalculateOpData(context, params, input->type, input,
filter, bias, output, data);
data->is_mli_applicable =
IsMliApplicable(context, input, filter, bias, params);
IsMliApplicable(context, input, filter, bias, params,
data->output_activation_min, data->output_activation_max);
if (input->type == kTfLiteInt8 && data->is_mli_applicable) {
data->mli_in = static_cast<mli_tensor*>(
context->AllocatePersistentBuffer(context, sizeof(mli_tensor)));
data->mli_weights = static_cast<mli_tensor*>(
context->AllocatePersistentBuffer(context, sizeof(mli_tensor)));
data->mli_bias = static_cast<mli_tensor*>(
context->AllocatePersistentBuffer(context, sizeof(mli_tensor)));
data->mli_out = static_cast<mli_tensor*>(
context->AllocatePersistentBuffer(context, sizeof(mli_tensor)));
ops::micro::ConvertToMliTensor(input, data->mli_in);
ops::micro::ConvertToMliTensor(filter, data->mli_weights);
ops::micro::ConvertToMliTensor(bias, data->mli_bias);
ops::micro::ConvertToMliTensor(output, data->mli_out);
data->mli_in = ops::micro::MliTensorInterface(static_cast<mli_tensor*>(
context->AllocatePersistentBuffer(context, sizeof(mli_tensor))));
data->mli_weights = ops::micro::MliTensorInterface(static_cast<mli_tensor*>(
context->AllocatePersistentBuffer(context, sizeof(mli_tensor))));
data->mli_bias = ops::micro::MliTensorInterface(static_cast<mli_tensor*>(
context->AllocatePersistentBuffer(context, sizeof(mli_tensor))));
data->mli_out = ops::micro::MliTensorInterface(static_cast<mli_tensor*>(
context->AllocatePersistentBuffer(context, sizeof(mli_tensor))));
ops::micro::ConvertToMliTensor(input, &data->mli_in);
ops::micro::ConvertToMliTensor(filter, &data->mli_weights);
ops::micro::ConvertToMliTensor(bias, &data->mli_bias);
ops::micro::ConvertToMliTensor(output, &data->mli_out);
#ifdef MLI_2_0
if (data->output_activation_min == -128 &&
data->output_activation_max == 127) {
data->cfg->relu.type = MLI_RELU_NONE;
} else if (params->activation == kTfLiteActRelu) {
data->cfg->relu.type = MLI_RELU_GEN;
} else if (params->activation == kTfLiteActRelu6) {
data->cfg->relu.type = MLI_RELU_6;
} else if (params->activation == kTfLiteActReluN1To1) {
data->cfg->relu.type = MLI_RELU_1;
} else {
data->cfg->relu.type = MLI_RELU_NONE;
}
#endif
/* The input tensor can have more than 2 dimensions. For the compute this
doesn't make any difference, because all the inputs of a batch entry will
be used anyway. Because the MLI kernel doesn't recognize the multiple
dimensions, the tensor shape is cast to a {batchnum, inputsize} shape. */
data->mli_in->shape[0] = data->mli_out->shape[0];
data->mli_in->shape[1] = data->mli_weights->shape[1];
data->mli_in->shape[2] = 0;
data->mli_in->shape[3] = 0;
data->mli_in->rank = 2;
data->mli_in.Shape()[0] = data->mli_out.Shape()[0];
#if defined(MLI_2_0) && !defined(MLI_2_0_KRNL_TEST)
data->mli_in.Shape()[1] = data->mli_weights.Shape()[0];
#else
data->mli_in.Shape()[1] = data->mli_weights.Shape()[1];
#endif
data->mli_in.Shape()[2] = 0;
data->mli_in.Shape()[3] = 0;
*data->mli_in.Rank() = 2;
}
return (CalculateOpData(context, params, input->type, input, filter, bias,
output, data));
return status;
}
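One way to picture the {batchnum, inputsize} cast performed in Prepare() above, with hypothetical shapes (the product of the non-batch input dimensions is assumed to equal the weights' input dimension, as the kernel requires):

#include <cstdint>

// Illustration only: an input of shape {2, 5, 4} feeding a layer with 20
// input features is presented to the MLI kernel as a rank-2 tensor {2, 20}.
struct Shape2D {
  uint32_t batchnum;
  uint32_t inputsize;
};

Shape2D CastToMli2D(const uint32_t* dims, int rank) {
  Shape2D shape{dims[0], 1};
  for (int i = 1; i < rank; ++i) {
    shape.inputsize *= dims[i];  // flatten everything after the batch dim
  }
  return shape;  // e.g. {2, 5, 4} -> {2, 20}
}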
TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
@@ -166,62 +198,103 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
const TfLiteEvalTensor* filter,
const TfLiteEvalTensor* bias,
TfLiteEvalTensor* output) {
ops::micro::MliTensorAttachBuffer<int8_t>(input, data.mli_in);
ops::micro::MliTensorAttachBuffer<int8_t>(filter, data.mli_weights);
ops::micro::MliTensorAttachBuffer<int32_t>(bias, data.mli_bias);
ops::micro::MliTensorAttachBuffer<int8_t>(output, data.mli_out);
ops::micro::MliTensorAttachBuffer<int8_t>(input, &data.mli_in);
ops::micro::MliTensorAttachBuffer<int8_t>(filter, &data.mli_weights);
ops::micro::MliTensorAttachBuffer<int32_t>(bias, &data.mli_bias);
ops::micro::MliTensorAttachBuffer<int8_t>(output, &data.mli_out);
// Tensors for data in fast (local) memory and config to copy data from
// external to local memory
mli_tensor weights_local = *data.mli_weights;
mli_tensor bias_local = *data.mli_bias;
mli_tensor in_local = *data.mli_in;
mli_tensor out_local = *data.mli_out;
mli_tensor weights_local = *data.mli_weights.MliTensor();
mli_tensor bias_local = *data.mli_bias.MliTensor();
mli_tensor in_local = *data.mli_in.MliTensor();
mli_tensor out_local = *data.mli_out.MliTensor();
ops::micro::MliTensorInterface weights_local_interface(&weights_local);
ops::micro::MliTensorInterface bias_local_interface(&bias_local);
ops::micro::MliTensorInterface in_local_interface(&in_local);
ops::micro::MliTensorInterface out_local_interface(&out_local);
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
#if defined(MLI_2_0) && !defined(MLI_2_0_KRNL_TEST)
const int weight_out_dimension = 1;
#else
const int weight_out_dimension = 0;
#endif
// bias has only 1 dimension
const int bias_out_ch_dimension = 0;
const int out_tensor_dimension = 1;
const int input_size_dimension = 1;
int slice_size = data.mli_weights->shape[weight_out_dimension];
int slice_size = data.mli_weights.Shape()[weight_out_dimension];
/* allocate the local buffers, and compute the slice size */
TF_LITE_ENSURE_STATUS(
ops::micro::get_arc_scratch_buffer_for_fully_connect_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
context, &in_local_interface, &weights_local_interface,
&bias_local_interface, &out_local_interface));
TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_dimension, &slice_size));
int max_out_slice_size =
out_local.capacity / mli_hlp_tensor_element_size(&out_local);
&weights_local_interface, &bias_local_interface, weight_out_dimension,
&slice_size));
int max_out_slice_size = *out_local_interface.DataCapacity() /
mli_hlp_tensor_element_size(&out_local);
if (slice_size > max_out_slice_size) slice_size = max_out_slice_size;
/* is_local indicates that the tensor is already in local memory,
so in that case the original tensor can be used,
and there is no need to copy it to the local tensor. */
const bool in_is_local = in_local.data == data.mli_in->data;
const bool out_is_local = out_local.data == data.mli_out->data;
const bool w_is_local = weights_local.data == data.mli_weights->data;
const bool b_is_local = bias_local.data == data.mli_bias->data;
ops::micro::TensorSlicer w_slice(data.mli_weights, weight_out_dimension,
slice_size);
ops::micro::TensorSlicer b_slice(data.mli_bias, weight_out_dimension,
slice_size);
ops::micro::TensorSlicer out_ch_slice(data.mli_out, out_tensor_dimension,
slice_size, 0, 0, 0, true);
const bool in_is_local =
in_local_interface.Data<int8_t>() == data.mli_in.Data<int8_t>();
const bool out_is_local =
out_local_interface.Data<int8_t>() == data.mli_out.Data<int8_t>();
const bool b_is_local =
bias_local_interface.Data<int32_t>() == data.mli_bias.Data<int32_t>();
#ifndef MLI_2_0_KRNL_TEST
const bool w_is_local =
weights_local_interface.Data<int8_t>() == data.mli_weights.Data<int8_t>();
#endif
#if defined(MLI_2_0) && !defined(MLI_2_0_KRNL_TEST)
ops::micro::TensorSlicer w_slice(data.mli_weights.MliTensor(),
weight_out_dimension, slice_size, 0, 0, 0,
true);
#else
ops::micro::TensorSlicer w_slice(data.mli_weights.MliTensor(),
weight_out_dimension, slice_size);
#endif
ops::micro::TensorSlicer b_slice(data.mli_bias.MliTensor(),
bias_out_ch_dimension, slice_size);
ops::micro::TensorSlicer out_ch_slice(data.mli_out.MliTensor(),
out_tensor_dimension, slice_size, 0, 0,
0, true);
#ifdef MLI_2_0_KRNL_TEST
mli_tensor* w_ptr = &weights_local;
#else
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
#endif
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
void* input_buffer_ptr = NULL;
while (!w_slice.Done()) {
#if defined(MLI_2_0) && !defined(MLI_2_0_KRNL_TEST)
w_ptr->el_params.sa.scale.mem.pi16 = NULL;
b_ptr->el_params.sa.scale.mem.pi16 = NULL;
#endif
#ifndef MLI_2_0_KRNL_TEST
mli_mov_tensor_sync(w_slice.Sub(), &copy_config, w_ptr);
#endif
mli_mov_tensor_sync(b_slice.Sub(), &copy_config, b_ptr);
// Slice the input over the batches (one at a time with the size of a
// complete input)
ops::micro::TensorSlicer in_slice(data.mli_in, input_size_dimension,
data.mli_in->shape[input_size_dimension]);
ops::micro::TensorSlicer in_slice(
data.mli_in.MliTensor(), input_size_dimension,
data.mli_in.Shape()[input_size_dimension]);
/* output tensor is already sliced in the output size dimension.
out_ch_slice.Sub() is the tensor for the amount of output size of this
@@ -235,13 +308,38 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local;
mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local;
#ifdef MLI_2_0_KRNL_TEST
/* Permute weights tensor to the HWCN layout */
// Assertion here to prevent usage of non-contiguous buffer memory.
if (data.mli_out.Shape()[out_tensor_dimension] !=
out_slice.Sub()->shape[0]) {
TF_LITE_KERNEL_LOG(
context, "Slicing is not supported with real-time permutation.");
return kTfLiteError;
}
mli_permute_cfg permute_cfg = {{1, 0, 2, 3}};
ops::micro::permute_weights(data.mli_weights.MliTensor(), &permute_cfg,
w_ptr, &out_ptr->data);
#endif
while (!out_slice.Done()) {
// If the input is the same as in the previous iteration, skip the copy.
#ifdef MLI_2_0
if (in_slice.Sub()->data.mem.pi8 != input_buffer_ptr) {
mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
input_buffer_ptr = in_slice.Sub()->data.mem.pi8;
}
mli_fully_connected_cfg cfg;
cfg.relu.type = MLI_RELU_NONE;
mli_krn_fully_connected_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr);
#else
if (in_slice.Sub()->data != input_buffer_ptr) {
mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
input_buffer_ptr = in_slice.Sub()->data;
}
mli_krn_fully_connected_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, out_ptr);
#endif
mli_mov_tensor_sync(out_ptr, &copy_config, out_slice.Sub());
in_slice.Next();
......
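The MLI_2_0_KRNL_TEST branch above permutes the weights at run time with mli_permute_cfg {{1, 0, 2, 3}}. A short editorial illustration of what such a permutation does to the shapes, with hypothetical sizes and using the common convention that output dimension d takes input dimension perm[d] (the MLI kernel's exact convention is assumed, not confirmed here):

#include <cstdint>

// Illustration only: {1, 0, 2, 3} swaps the first two dimensions, so a 2-D
// weights tensor of (hypothetical) shape {20, 3} is re-laid out as {3, 20}.
void ApplyPermutation(const uint32_t in_shape[4], const int perm[4],
                      uint32_t out_shape[4]) {
  for (int d = 0; d < 4; ++d) {
    out_shape[d] = in_shape[perm[d]];
  }
}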
@@ -167,7 +167,11 @@ TF_LITE_MICRO_TEST(LocalSimpleTestQuantized1) {
const int output_dims_count = 6;
#ifdef __Xvdsp
#pragma Bss(".vecmem_data")
#else
#pragma Bss(".Zdata")
#endif
const int8_t input_data_local[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int8_t weights_data_local[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -239,7 +243,11 @@ TF_LITE_MICRO_TEST(LocalSimpleTestQuantized2) {
const int output_dims_count_local_2 = 60;
#ifdef __Xvdsp
#pragma Bss(".vecmem_data")
#else
#pragma Bss(".Zdata")
#endif
const int8_t input_data_local_2[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
@@ -309,7 +317,11 @@ TF_LITE_MICRO_TEST(LocalSimpleTestQuantized3) {
const int output_dims_count_local_3 = 20;
#ifdef __Xvdsp
#pragma Bss(".vecmem_data")
#else
#pragma Bss(".Zdata")
#endif
static int8_t input_data_local_3[10];
static int8_t weights_data_local_3[50];
static int32_t bias_data_local_3[10];
@@ -397,7 +409,11 @@ TF_LITE_MICRO_TEST(LocalSimpleTestQuantized4) {
const int output_dims_count_local_4 = 25;
#ifdef __Xvdsp
#pragma Bss(".vecmem_data")
#else
#pragma Bss(".Zdata")
#endif
const int8_t input_data_local_4[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -422,4 +438,4 @@ TF_LITE_MICRO_TEST(LocalSimpleTestQuantized4) {
output_max, kTfLiteActNone, output_data_local_4);
}
TF_LITE_MICRO_TESTS_END
TF_LITE_MICRO_TESTS_END
\ No newline at end of file