Unverified · Commit e9591074 authored by Artem Tsvetkov, committed by GitHub

Updated utility functions to work with embARC MLI Library 2.0 for ARC (#231)

* Updated utility functions to work with embARC MLI Library 2.0

* Updated copyrights in several files.

* Minor fix for mli_tf_utils.h

* Update mli_tf_utils.h
Parent d47d48e3
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "mli_interface.h" // NOLINT
#include <math.h>
namespace tflite {
namespace ops {
namespace micro {
template <>
int8_t* MliTensorInterface::Data<int8_t>(void) {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_ASYM_I8);
return static_cast<int8_t*>(tensor_->data);
}
template <>
int32_t* MliTensorInterface::Data<int32_t>(void) {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_ASYM_I32);
return static_cast<int32_t*>(tensor_->data);
}
template <>
int32_t* MliTensorInterface::Scale(void) {
return &tensor_->el_params.asym.scale.i32;
}
template <>
int32_t** MliTensorInterface::Scale(void) {
return &tensor_->el_params.asym.scale.pi32;
}
template <>
void MliTensorInterface::SetData(int8_t* data, uint32_t capacity) const {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_ASYM_I8);
tensor_->data = data;
tensor_->capacity = capacity;
}
template <>
void MliTensorInterface::SetData(int32_t* data, uint32_t capacity) const {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_ASYM_I32);
tensor_->data = data;
tensor_->capacity = capacity;
}
mli_tensor* MliTensorInterface::MliTensor(void) { return tensor_; }
const mli_tensor* MliTensorInterface::MliTensor(void) const {
return static_cast<const mli_tensor*>(
const_cast<MliTensorInterface*>(this)->MliTensor());
}
uint32_t* MliTensorInterface::Rank(void) { return &tensor_->rank; }
const uint32_t* MliTensorInterface::DataCapacity(void) const {
return &tensor_->capacity;
}
mli_element_type* MliTensorInterface::ElType(void) { return &tensor_->el_type; }
template <>
int16_t* MliTensorInterface::ZeroPoint(void) {
return &tensor_->el_params.asym.zero_point.i16;
}
template <>
int16_t** MliTensorInterface::ZeroPoint(void) {
return &tensor_->el_params.asym.zero_point.pi16;
}
uint32_t* MliTensorInterface::ZeroPointCapacity(void) { return nullptr; }
int32_t* MliTensorInterface::Dim(void) { return &tensor_->el_params.asym.dim; }
uint32_t* MliTensorInterface::ScaleCapacity(void) { return nullptr; }
template <>
int8_t* MliTensorInterface::ScaleFracBits(void) {
return &tensor_->el_params.asym.scale_frac_bits;
}
uint32_t* MliTensorInterface::ScaleFracBitsCapacity(void) { return nullptr; }
int32_t* MliTensorInterface::MemStride(void) { return tensor_->mem_stride; }
uint32_t* MliTensorInterface::Shape(void) { return tensor_->shape; }
const uint32_t* MliTensorInterface::Shape(void) const {
return static_cast<const uint32_t*>(
const_cast<MliTensorInterface*>(this)->Shape());
}
void MliTensorInterface::SetScale(float fscale) {
int exp;
frexpf(fscale, &exp);
int frac_bits = 31 - exp;
int32_t iscale = (int32_t)((1ll << frac_bits) * fscale + 0.5f);
*(this->ScaleFracBits<int8_t*>()) = frac_bits;
*(this->Scale<int32_t*>()) = (int32_t)iscale;
}
void MliTensorInterface::SetScalePerChannel(float* fscale,
const int num_channels) {
int min_frac_bits;
for (int i = 0; i < num_channels; i++) {
int exp;
frexpf(fscale[i], &exp);
int cur_frac_bits = 31 - exp;
if (i == 0) {
min_frac_bits = cur_frac_bits;
} else {
min_frac_bits =
min_frac_bits < cur_frac_bits ? min_frac_bits : cur_frac_bits;
}
}
*this->ScaleFracBits<int8_t*>() = min_frac_bits;
for (int i = 0; i < num_channels; i++) {
int32_t iscale = (int32_t)((1ll << min_frac_bits) * fscale[i] + 0.5f);
(*this->Scale<int32_t**>())[i] = iscale;
}
}
void MliTensorInterface::SetElType(TfLiteType type) {
if (type == kTfLiteInt8) {
*this->ElType() = MLI_EL_ASYM_I8;
} else if (type == kTfLiteInt32) {
*this->ElType() = MLI_EL_ASYM_I32;
} else {
TF_LITE_FATAL("Wrong data type. Expected int8_t or int32_t.");
}
}
} // namespace micro
} // namespace ops
} // namespace tflite
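As a side note on the Q31 arithmetic in SetScale() above: frexpf() yields the binary exponent of the float scale, frac_bits is chosen as 31 minus that exponent, and the scale is rounded to a 32-bit fixed-point value. A minimal standalone sketch of that conversion (illustrative only, not part of this change; QuantizeScaleQ31 is a hypothetical name):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Hypothetical helper mirroring the MLI 1.x SetScale() arithmetic above.
static void QuantizeScaleQ31(float fscale, int32_t* iscale, int* frac_bits) {
  int exp;
  frexpf(fscale, &exp);   // fscale = m * 2^exp, with m in [0.5, 1)
  *frac_bits = 31 - exp;  // keeps m * 2^31 inside the int32 range
  *iscale = (int32_t)((1ll << *frac_bits) * fscale + 0.5f);
}

int main() {
  int32_t iscale;
  int frac_bits;
  QuantizeScaleQ31(0.05f, &iscale, &frac_bits);
  // 0.05 = 0.8 * 2^-4, so frac_bits = 35 and iscale is about 0.05 * 2^35
  // (roughly 1.7e9); dividing back by 2^35 recovers approximately 0.05.
  printf("iscale=%ld frac_bits=%d approx=%f\n", (long)iscale, frac_bits,
         (double)iscale / (double)(1ll << frac_bits));
  return 0;
}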
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_INTERFACE_H_
#define TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_INTERFACE_H_
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
namespace tflite {
namespace ops {
namespace micro {
// Abstracts access to mli_tensor fields so that different versions of the MLI
// Library (1.x and 2.x) can be used.
// Example:
// ops::micro::MliTensorInterface mli_in =
// ops::micro::MliTensorInterface(static_cast<mli_tensor*>(
// context->AllocatePersistentBuffer(context, sizeof(mli_tensor))));
class MliTensorInterface {
public:
// Make sure the lifetime of an MliTensorInterface instance does not exceed
// that of the related mli_tensor.
MliTensorInterface(mli_tensor* tensor) : tensor_(tensor){};
MliTensorInterface() = default;
~MliTensorInterface() = default;
template <typename T>
T* Data();
template <typename T>
T Scale();
template <typename T>
T ZeroPoint();
template <typename T>
T ScaleFracBits();
mli_tensor* MliTensor();
const mli_tensor* MliTensor() const;
int32_t* Dim();
uint32_t* Rank();
uint32_t* Shape();
const uint32_t* Shape() const;
const uint32_t* DataCapacity() const;
uint32_t* ScaleCapacity();
mli_element_type* ElType();
uint32_t* ScaleFracBitsCapacity();
int32_t* MemStride();
uint32_t* ZeroPointCapacity();
template <typename T>
void SetData(T* data, uint32_t capacity) const;
void SetScale(float fscale);
void SetScalePerChannel(float* fscale, const int num_channels);
void SetElType(TfLiteType type);
private:
mli_tensor* tensor_;
};
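// A minimal usage sketch (not part of this change; WrapInt8Example and its
// parameters are placeholders): wrap a persistently allocated mli_tensor and
// fill it through the interface accessors.
inline MliTensorInterface WrapInt8Example(TfLiteContext* context, int8_t* data,
                                          uint32_t data_bytes) {
  MliTensorInterface mli_if(static_cast<mli_tensor*>(
      context->AllocatePersistentBuffer(context, sizeof(mli_tensor))));
  mli_if.SetElType(kTfLiteInt8);  // must precede SetData (DCHECK on el_type)
  mli_if.SetData<int8_t>(data, data_bytes);
  *mli_if.Rank() = 1;
  mli_if.Shape()[0] = data_bytes;  // rank-1 tensor of int8 elements
  return mli_if;
}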
} // namespace micro
} // namespace ops
} // namespace tflite
#endif  // TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_INTERFACE_H_
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <math.h>
#include "mli_interface.h" // NOLINT
namespace tflite {
namespace ops {
namespace micro {
#ifdef MLI_2_0
template <>
int8_t* MliTensorInterface::Data(void) {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_SA_8);
return tensor_->data.mem.pi8;
}
template <>
int32_t* MliTensorInterface::Data(void) {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_SA_32);
return tensor_->data.mem.pi32;
}
template <>
int16_t** MliTensorInterface::Scale(void) {
return &tensor_->el_params.sa.scale.mem.pi16;
}
template <>
int16_t* MliTensorInterface::Scale(void) {
return &tensor_->el_params.sa.scale.mem.i16;
}
template <>
void MliTensorInterface::SetData(int8_t* data, uint32_t capacity) const {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_SA_8);
tensor_->data.mem.pi8 = data;
tensor_->data.capacity = capacity;
}
template <>
void MliTensorInterface::SetData(int32_t* data, uint32_t capacity) const {
TFLITE_DCHECK(tensor_->el_type == MLI_EL_SA_32);
tensor_->data.mem.pi32 = data;
tensor_->data.capacity = capacity;
}
mli_tensor* MliTensorInterface::MliTensor(void) { return tensor_; }
const mli_tensor* MliTensorInterface::MliTensor(void) const {
return static_cast<const mli_tensor*>(
const_cast<MliTensorInterface*>(this)->MliTensor());
}
uint32_t* MliTensorInterface::Rank(void) { return &tensor_->rank; }
const uint32_t* MliTensorInterface::DataCapacity(void) const {
return &tensor_->data.capacity;
}
mli_element_type* MliTensorInterface::ElType(void) { return &tensor_->el_type; }
template <>
int16_t* MliTensorInterface::ZeroPoint(void) {
return &tensor_->el_params.sa.zero_point.mem.i16;
}
template <>
int16_t** MliTensorInterface::ZeroPoint(void) {
return &tensor_->el_params.sa.zero_point.mem.pi16;
}
uint32_t* MliTensorInterface::ZeroPointCapacity(void) {
return &tensor_->el_params.sa.zero_point.capacity;
}
int32_t* MliTensorInterface::Dim(void) { return &tensor_->el_params.sa.dim; }
uint32_t* MliTensorInterface::ScaleCapacity(void) {
return &tensor_->el_params.sa.scale.capacity;
}
template <>
int8_t** MliTensorInterface::ScaleFracBits(void) {
return &tensor_->el_params.sa.scale_frac_bits.mem.pi8;
}
template <>
int8_t* MliTensorInterface::ScaleFracBits(void) {
return &tensor_->el_params.sa.scale_frac_bits.mem.i8;
}
uint32_t* MliTensorInterface::ScaleFracBitsCapacity(void) {
return &tensor_->el_params.sa.scale_frac_bits.capacity;
}
int32_t* MliTensorInterface::MemStride(void) { return tensor_->mem_stride; }
uint32_t* MliTensorInterface::Shape(void) { return tensor_->shape; }
const uint32_t* MliTensorInterface::Shape(void) const {
return static_cast<const uint32_t*>(
const_cast<MliTensorInterface*>(this)->Shape());
}
void MliTensorInterface::SetScale(float fscale) {
int exp;
frexpf(fscale, &exp);
int frac_bits = 15 - exp;
int16_t iscale = (int16_t)((1ll << frac_bits) * fscale + 0.5f);
*(this->Scale<int16_t*>()) = (int16_t)iscale;
*(this->ScaleFracBits<int8_t*>()) = frac_bits;
*this->ScaleCapacity() = 1 * sizeof(int16_t);
*this->ScaleFracBitsCapacity() = 1 * sizeof(int8_t);
}
void MliTensorInterface::SetScalePerChannel(float* fscale,
const int num_channels) {
for (int i = 0; i < num_channels; i++) {
int exp;
frexpf(fscale[i], &exp);
int cur_frac_bits = 15 - exp;
(*this->ScaleFracBits<int8_t**>())[i] = cur_frac_bits;
}
for (int i = 0; i < num_channels; i++) {
int16_t iscale =
(int16_t)((1ll << (*this->ScaleFracBits<int8_t**>())[i]) * fscale[i] +
0.5f);
(*this->Scale<int16_t**>())[i] = iscale;
}
}
void MliTensorInterface::SetElType(TfLiteType type) {
if (type == kTfLiteInt8) {
*this->ElType() = MLI_EL_SA_8;
} else if (type == kTfLiteInt32) {
*this->ElType() = MLI_EL_SA_32;
} else {
TF_LITE_FATAL("Wrong data type. Expected int8_t or int32_t.");
}
}
#endif
} // namespace micro
} // namespace ops
} // namespace tflite
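For contrast with the MLI 1.x path above (one shared Q31 scale using the minimum frac_bits across channels), the MLI 2.0 path keeps a per-channel Q15 scale together with a per-channel frac_bits. A small standalone sketch of that arithmetic (illustrative values only, not part of this change):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main() {
  const float fscale[] = {0.05f, 0.4f};
  for (int i = 0; i < 2; i++) {
    int exp;
    frexpf(fscale[i], &exp);
    int frac_bits = 15 - exp;  // per-channel, not the minimum over channels
    int16_t iscale = (int16_t)((1ll << frac_bits) * fscale[i] + 0.5f);
    printf("ch%d: frac_bits=%d iscale=%d\n", i, frac_bits, (int)iscale);
    // ch0: 0.05 = 0.8 * 2^-4 -> frac_bits 19, iscale ~ 0.05 * 2^19 ~ 26214
    // ch1: 0.4  = 0.8 * 2^-1 -> frac_bits 16, iscale ~ 0.4  * 2^16 ~ 26214
  }
  return 0;
}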
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -17,104 +17,261 @@ limitations under the License.
#define TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_TF_UTILS_H_
#include "mli_api.h" // NOLINT
#include "mli_interface.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
constexpr int kFracBitsQ15 = 15;
constexpr int kFracBitsQ31 = 31;
#define KRNL_C_DIM_NHWC 0 // output channels
namespace tflite {
namespace ops {
namespace micro {
inline void ConvertToMliTensorData(const TfLiteTensor* tfT, mli_tensor* mliT) {
inline void ConvertToMliTensorData(const TfLiteTensor* tfT,
MliTensorInterface* mliT,
bool is_bias_tensor) {
// Data is NULL until MliTensorAttachBuffer is called.
mliT->data = NULL;
mliT->SetElType(tfT->type);
if (tfT->type == kTfLiteInt8) {
mliT->el_type = MLI_EL_ASYM_I8;
mliT->SetData<int8_t>(nullptr, tfT->bytes);
} else if (tfT->type == kTfLiteInt32) {
mliT->el_type = MLI_EL_ASYM_I32;
mliT->SetData<int32_t>(nullptr, tfT->bytes);
} else {
MicroPrintf("Wrong data type. Expected int8_t or int32_t.");
TFLITE_ABORT;
}
const int32_t dims_count = GetTensorShape(tfT).DimensionsCount();
*mliT->Rank() = is_bias_tensor ? 1 : dims_count;
mliT->capacity = tfT->bytes;
mliT->rank = GetTensorShape(tfT).DimensionsCount();
for (int i = 0; i < GetTensorShape(tfT).DimensionsCount(); i++) {
mliT->shape[i] = GetTensorShape(tfT).Dims(i);
if (is_bias_tensor) {
mliT->Shape()[0] = GetTensorShape(tfT).Dims(dims_count - 1);
mliT->MemStride()[0] = 0;
} else {
for (int i = 0; i < dims_count; i++) {
mliT->Shape()[i] = GetTensorShape(tfT).Dims(i);
mliT->MemStride()[i] = 0;
}
}
}
inline void ConvertToMliQuantParams(const TfLiteTensor* tfT, mli_tensor* mliT) {
mliT->el_params.asym.dim = -1;
mliT->el_params.asym.zero_point.i16 = tfT->params.zero_point;
inline void ConvertToMliQuantParams(const TfLiteTensor* tfT,
MliTensorInterface* mliT) {
*mliT->Dim() = -1;
#ifdef MLI_2_0
*mliT->ZeroPointCapacity() = 1 * sizeof(int16_t);
#endif
*mliT->ZeroPoint<int16_t*>() = tfT->params.zero_point;
float fscale = tfT->params.scale;
int exp;
frexpf(fscale, &exp);
int frac_bits = kFracBitsQ31 - exp;
int32_t iscale = (int32_t)((1ll << frac_bits) * fscale + 0.5f);
mliT->el_params.asym.scale_frac_bits = frac_bits;
mliT->el_params.asym.scale.i32 = (int32_t)iscale;
mliT->SetScale(fscale);
}
inline void ConvertToMliQuantParamsPerChannel(const TfLiteTensor* tfT,
mli_tensor* mliT) {
MliTensorInterface* mliT,
bool is_bias_tensor) {
// mli tensor scale and zero_point arrays should be allocated at this point
TFLITE_DCHECK_NE(mliT->el_params.asym.scale.pi16, 0);
TFLITE_DCHECK_NE(mliT->el_params.asym.zero_point.pi16, 0);
#ifdef MLI_2_0
TFLITE_DCHECK_NE(*mliT->Scale<int16_t**>(), 0);
TFLITE_DCHECK_NE(*mliT->ZeroPoint<int16_t**>(), 0);
#else
TFLITE_DCHECK_NE(*mliT->Scale<int32_t**>(), 0);
TFLITE_DCHECK_NE(*mliT->ZeroPoint<int16_t**>(), 0);
#endif
// get per channel quantization parameters
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(tfT->quantization.params);
mliT->el_params.asym.dim = affine_quantization->quantized_dimension;
int32_t quantized_dimension =
is_bias_tensor ? 0 : affine_quantization->quantized_dimension;
const int num_channels = mliT->Shape()[quantized_dimension];
*mliT->Dim() = quantized_dimension;
// find frac_bits
const int num_channels =
mliT->shape[affine_quantization->quantized_dimension];
int min_frac_bits;
// set capacities
#ifdef MLI_2_0
*mliT->ScaleFracBitsCapacity() = num_channels * sizeof(int8_t);
*mliT->ScaleCapacity() = num_channels * sizeof(int16_t);
*mliT->ZeroPointCapacity() = num_channels * sizeof(int16_t);
#endif
float* fscale = affine_quantization->scale->data;
for (int i = 0; i < num_channels; i++) {
int exp;
frexpf(fscale[i], &exp);
int cur_frac_bits = kFracBitsQ31 - exp;
if (i == 0) {
min_frac_bits = cur_frac_bits;
} else {
min_frac_bits =
min_frac_bits < cur_frac_bits ? min_frac_bits : cur_frac_bits;
}
}
mliT->el_params.asym.scale_frac_bits = min_frac_bits;
mliT->SetScalePerChannel(fscale, num_channels);
#ifdef MLI_2_0
int16_t* zero_point = *mliT->ZeroPoint<int16_t**>();
for (int i = 0; i < num_channels; i++) {
int32_t iscale = (int32_t)((1ll << min_frac_bits) * fscale[i] + 0.5f);
mliT->el_params.asym.scale.pi32[i] = iscale;
zero_point[i] = tfT->params.zero_point;
}
#endif
}
template <typename datatype>
inline void MliTensorAttachBuffer(const TfLiteEvalTensor* tfT,
mli_tensor* mliT) {
inline void MliTensorAttachBuffer(const TfLiteEvalTensor*,
const MliTensorInterface*);
template <>
inline void MliTensorAttachBuffer<int8_t>(const TfLiteEvalTensor* tfT,
const MliTensorInterface* mliT) {
// "const_cast" here used to attach const data buffer to the initially
// non-const mli_tensor. This is required by current implementation of MLI
// backend and planned for redesign due to this and some other aspects.
mliT->data = const_cast<void*>(
static_cast<const void*>(tflite::micro::GetTensorData<datatype>(tfT)));
mliT->SetData<int8_t>(
const_cast<int8_t*>(tflite::micro::GetTensorData<int8_t>(tfT)),
*mliT->DataCapacity());
}
inline void ConvertToMliTensor(const TfLiteTensor* tfT, mli_tensor* mliT) {
ConvertToMliTensorData(tfT, mliT);
template <>
inline void MliTensorAttachBuffer<int32_t>(const TfLiteEvalTensor* tfT,
const MliTensorInterface* mliT) {
// "const_cast" here used to attach const data buffer to the initially
// non-const mli_tensor. This is required by current implementation of MLI
// backend and planned for redesign due to this and some other aspects.
mliT->SetData<int32_t>(
const_cast<int32_t*>(tflite::micro::GetTensorData<int32_t>(tfT)),
*mliT->DataCapacity());
}
inline void ConvertToMliTensor(const TfLiteTensor* tfT,
MliTensorInterface* mliT) {
ConvertToMliTensorData(tfT, mliT, false);
ConvertToMliQuantParams(tfT, mliT);
}
inline void ConvertToMliTensorPerChannel(const TfLiteTensor* tfT,
mli_tensor* mliT) {
ConvertToMliTensorData(tfT, mliT);
ConvertToMliQuantParamsPerChannel(tfT, mliT);
MliTensorInterface* mliT,
bool is_bias_tensor) {
ConvertToMliTensorData(tfT, mliT, is_bias_tensor);
ConvertToMliQuantParamsPerChannel(tfT, mliT, is_bias_tensor);
}
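// Illustrative call pattern only (not part of this change; the function and
// tensor names are placeholders): a conv-style kernel converts activations
// with ConvertToMliTensor() and weights/bias with the per-channel variant,
// passing is_bias_tensor accordingly. The per-channel scale / zero-point
// arrays inside mli_weights and mli_bias must already be allocated (see the
// DCHECKs in ConvertToMliQuantParamsPerChannel above).
inline void ConvertConvTensorsExample(const TfLiteTensor* input,
                                      const TfLiteTensor* weights,
                                      const TfLiteTensor* bias,
                                      MliTensorInterface* mli_in,
                                      MliTensorInterface* mli_weights,
                                      MliTensorInterface* mli_bias) {
  ConvertToMliTensor(input, mli_in);
  ConvertToMliTensorPerChannel(weights, mli_weights,
                               /*is_bias_tensor=*/false);
  ConvertToMliTensorPerChannel(bias, mli_bias,
                               /*is_bias_tensor=*/true);
}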
#ifdef MLI_2_0_KRNL_TEST
// Reorder an array according to the given indices. If backward is true, the
// order of the index array must be reversed.
inline static void reorder(uint32_t* arr, const uint8_t index[],
bool backward) {
uint32_t temp[MLI_MAX_RANK];
for (int8_t i = 0; i < MLI_MAX_RANK; i++) {
if (backward)
temp[index[i]] = arr[i];
else
temp[i] = arr[index[i]];
}
for (int8_t i = 0; i < MLI_MAX_RANK; i++) {
arr[i] = temp[i];
}
}
// Change shape of mli tensor and recalculate mem strides.
inline void change_shape(mli_tensor* mliT, const uint8_t dim_order[]) {
reorder(mliT->shape, dim_order, false);
// Calculate strides for new layout
int mli_tensor_memstride = 1;
for (int shape_idx = mliT->rank - 1; shape_idx >= 0; --shape_idx) {
mliT->mem_stride[shape_idx] = mli_tensor_memstride;
mli_tensor_memstride *= mliT->shape[shape_idx];
}
}
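// Illustrative only (not part of this change; ReorderExample is a placeholder
// and MLI_MAX_RANK == 4 is assumed): what reorder() above does for an NHWC
// shape {8, 3, 3, 4} and the dimension order {1, 2, 3, 0}, i.e. HWCN.
inline void ReorderExample() {
  uint32_t shape[] = {8, 3, 3, 4};  // N, H, W, C
  const uint8_t hwcn_order[] = {1, 2, 3, 0};
  reorder(shape, hwcn_order, /*backward=*/false);  // shape is now {3, 3, 4, 8}
  reorder(shape, hwcn_order, /*backward=*/true);   // inverse: {8, 3, 3, 4}
}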
inline void permute_weights(const mli_tensor* weights_src,
const mli_permute_cfg* permute_cfg,
mli_tensor* weights_dst,
mli_data_container* buffer_data) {
mli_tensor buffer = {};
buffer.el_params = weights_dst->el_params;
buffer.data = *buffer_data;
// Compare weights tensor size and available buffer capacity.
int buffer_size = buffer_data->capacity;
int weights_size = mli_hlp_count_elem_num(weights_src, 0) *
mli_hlp_tensor_element_size(weights_src);
if (buffer_size >= weights_size) {
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
mli_mov_tensor_sync(weights_src, &copy_config, &buffer);
mli_krn_permute_sa8(&buffer, permute_cfg, weights_dst);
} else {
// Weights shape is NHWC and output (buffer) shape is HWC, where N_w = C_o.
// The buffer size (H_o * W_o) must be greater than or equal to the weights
// size (H_w * W_w * C_w). This is why the buffer size (output tensor) is
// divided by the channel dimension.
uint32_t slice_size = buffer_size / weights_src->shape[KRNL_C_DIM_NHWC];
mli_mov_cfg_t copy_config = {};
uint32_t src_offsets[] = {0, 0, 0, 0};
uint32_t src_sizes[] = {0, 0, 0, 0};
int dst_mem_stride[] = {0, 0, 0, 0};
// Need to change the shape of the destination weights buffer according to the
// permute dimension order in order to calculate the slice sizes.
change_shape(weights_dst, permute_cfg->perm_dim);
mli_tensor weights_dst_sub_tensor;
mli_sub_tensor_cfg sub_tensor_cfg = {};
sub_tensor_cfg.sub_tensor_rank = weights_src->rank;
// Calculate dimensions for the slice according to the buffer capacity.
// Now, after calling the change_shape() function, the dst weights buffer has
// the MLI layout (HWCN). This means the innermost dimension (N) of the dst
// weights tensor is equal to the innermost dimension (N) of the output tensor.
sub_tensor_cfg.size[weights_dst->rank - 1] =
src_sizes[weights_dst->rank - 1] = weights_src->shape[KRNL_C_DIM_NHWC];
// Now we need to calculate the remaining shapes for the weights slice. The
// total slice size is H*W*C*N, so to calculate the size for each axis, the
// available slice size is divided by the shape of each axis.
uint32_t slice_size_left = slice_size;
for (uint32_t i = 0; i < weights_dst->rank - 1; i++) {
sub_tensor_cfg.size[i] = src_sizes[i] =
slice_size_left / weights_dst->shape[i] > 0 ? weights_dst->shape[i]
: slice_size_left;
slice_size_left /= weights_dst->shape[i];
slice_size_left = slice_size_left > 0 ? slice_size_left : 1;
}
// Reorder the src tensor sizes because the src tensor is still in the TFLM
// format (NHWC) while the src_sizes array was calculated as (HWCN).
reorder(src_sizes, permute_cfg->perm_dim, true);
sub_tensor_cfg.offset[KRNL_C_DIM_HWCN] = src_offsets[KRNL_H_DIM_HWCN] = 0;
sub_tensor_cfg.offset[KRNL_H_DIM_HWCN] = src_offsets[KRNL_W_DIM_HWCN] = 0;
sub_tensor_cfg.offset[KRNL_W_DIM_HWCN] = src_offsets[KRNL_D_DIM_HWCN] = 0;
sub_tensor_cfg.offset[KRNL_D_DIM_HWCN] = src_offsets[KRNL_C_DIM_HWCN] = 0;
do {
do {
do {
do {
mli_mov_cfg_for_slice(&copy_config, (int*)src_offsets,
(int*)src_sizes, dst_mem_stride);
mli_mov_tensor_sync(weights_src, &copy_config, &buffer);
mli_hlp_create_subtensor(weights_dst, &sub_tensor_cfg,
&weights_dst_sub_tensor);
mli_krn_permute_sa8(&buffer, permute_cfg, &weights_dst_sub_tensor);
// For each axis, it is necessary to recalculate the offsets and
// slice sizes.
sub_tensor_cfg.offset[2] = src_offsets[3] += src_sizes[3];
src_sizes[3] =
std::min(src_sizes[3], weights_src->shape[3] - src_offsets[3]);
} while (src_offsets[3] < weights_src->shape[3]);
sub_tensor_cfg.offset[1] = src_offsets[2] += src_sizes[2];
src_sizes[2] =
std::min(src_sizes[2], weights_src->shape[2] - src_offsets[2]);
} while (src_offsets[2] < weights_src->shape[2]);
sub_tensor_cfg.offset[0] = src_offsets[1] += src_sizes[1];
src_sizes[1] =
std::min(src_sizes[1], weights_src->shape[1] - src_offsets[1]);
} while (src_offsets[1] < weights_src->shape[1]);
sub_tensor_cfg.offset[3] = src_offsets[0] += src_sizes[0];
src_sizes[0] =
std::min(src_sizes[0], weights_src->shape[0] - src_offsets[0]);
} while (src_offsets[0] < weights_src->shape[0]);
}
}
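// Hypothetical setup for permute_weights() above (not from this change; the
// function name and scratch parameters are placeholders, and the
// mli_permute_cfg / mli_data_container field names mirror the usage in
// permute_weights() itself): an NHWC -> HWCN permutation with a scratch
// buffer wrapped in an mli_data_container.
inline void PermuteWeightsExample(const mli_tensor* weights_src,
                                  mli_tensor* weights_dst, void* scratch_ptr,
                                  uint32_t scratch_bytes) {
  mli_permute_cfg permute_cfg = {};
  permute_cfg.perm_dim[0] = 1;  // H
  permute_cfg.perm_dim[1] = 2;  // W
  permute_cfg.perm_dim[2] = 3;  // C
  permute_cfg.perm_dim[3] = 0;  // N  (NHWC -> HWCN)
  mli_data_container buffer_data = {};
  buffer_data.capacity = scratch_bytes;
  buffer_data.mem.pi8 = static_cast<int8_t*>(scratch_ptr);
  permute_weights(weights_src, &permute_cfg, weights_dst, &buffer_data);
}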
#endif
} // namespace micro
} // namespace ops
} // namespace tflite
......
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -25,7 +25,7 @@ namespace tflite {
namespace ops {
namespace micro {
#ifdef __Xxy
#if (defined(__Xxy)) || (defined(__Xvdsp))
static void get_arc_two_buffer_sizes(int request_size_1, int request_size_2,
int* grant_size_1, int* grant_size_2) {
int maxrequest = 0;
......@@ -66,202 +66,215 @@ static void get_arc_two_buffer_sizes(int request_size_1, int request_size_2,
}
static TfLiteStatus get_arc_scratch_buffer_for_io_tensors(
TfLiteContext* context, mli_tensor* in, mli_tensor* out) {
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* out) {
int request_size_in = 0;
int request_size_out = 0;
int grant_size_in = 0;
int grant_size_out = 0;
if (!inside_arc_ccm(in->data)) {
if (!inside_arc_ccm(in->Data<int8_t>())) {
// In case the input tensor contains multiple batches, it has rank 4. Because
// the mli kernel cannot operate on batches, we need the size of a single HWC
// tensor; that is why the start_rank is 1 in case of an input of rank 4.
int start_rank = in->rank - 3;
request_size_in = mli_hlp_count_elem_num(in, start_rank) *
mli_hlp_tensor_element_size(in);
int start_rank = *in->Rank() - 3;
request_size_in = mli_hlp_count_elem_num(in->MliTensor(), start_rank) *
mli_hlp_tensor_element_size(in->MliTensor());
}
if (!inside_arc_ccm(out->data)) {
if (!inside_arc_ccm(out->Data<int8_t>())) {
// In case the output tensor contains multiple batches, it has rank 4. Because
// the mli kernel cannot operate on batches, we need the size of a single
// batch; that is why the start_rank is 1 in case of an output of rank 4.
int start_rank = out->rank - 3;
request_size_out = mli_hlp_count_elem_num(out, start_rank) *
mli_hlp_tensor_element_size(out);
int start_rank = *out->Rank() - 3;
request_size_out = mli_hlp_count_elem_num(out->MliTensor(), start_rank) *
mli_hlp_tensor_element_size(out->MliTensor());
}
get_arc_two_buffer_sizes(request_size_in, request_size_out, &grant_size_in,
&grant_size_out);
if (!inside_arc_ccm(in->data)) {
in->data = get_arc_scratch_buffer(grant_size_in);
in->capacity = grant_size_in;
if (in->data == NULL) return kTfLiteError;
if (!inside_arc_ccm(in->Data<int8_t>())) {
in->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(grant_size_in)),
grant_size_in);
if (in->Data<int8_t>() == NULL) return kTfLiteError;
}
if (!inside_arc_ccm(out->data)) {
out->data = get_arc_scratch_buffer(grant_size_out);
out->capacity = grant_size_out;
if (out->data == NULL) return kTfLiteError;
if (!inside_arc_ccm(out->Data<int8_t>())) {
out->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(grant_size_out)),
grant_size_out);
if (out->Data<int8_t>() == NULL) return kTfLiteError;
}
return kTfLiteOk;
}
#endif
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* weights,
mli_tensor* bias,
mli_tensor* out) {
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* weights,
MliTensorInterface* bias, MliTensorInterface* out) {
TfLiteStatus ret_val = kTfLiteOk;
#ifdef __Xxy
#if (defined(__Xxy)) || (defined(__Xvdsp))
init_arc_scratch_buffers();
if (!inside_arc_ccm(weights->data)) {
int weights_size = mli_hlp_count_elem_num(weights, 0) *
mli_hlp_tensor_element_size(weights);
int max_weights_size = 0;
weights->data = get_arc_scratch_buffer(weights_size);
weights->capacity = weights_size;
if (weights->data == NULL) {
get_arc_scratch_buffer_max_size(&max_weights_size);
weights->data = get_arc_scratch_buffer(max_weights_size);
weights->capacity = max_weights_size;
if (max_weights_size == 0) ret_val = kTfLiteError;
}
if (weights->data == NULL) ret_val = kTfLiteError;
}
if (!inside_arc_ccm(bias->data)) {
if (!inside_arc_ccm(bias->Data<int32_t>())) {
uint32_t bias_mem_requirements =
mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias);
bias->data = get_arc_scratch_buffer(bias_mem_requirements);
bias->capacity = bias_mem_requirements;
}
if (ret_val == kTfLiteOk) {
ret_val = get_arc_scratch_buffer_for_io_tensors(context, in, out);
mli_hlp_count_elem_num(bias->MliTensor(), 0) *
mli_hlp_tensor_element_size(bias->MliTensor());
bias->SetData<int32_t>(
static_cast<int32_t*>(get_arc_scratch_buffer(bias_mem_requirements)),
bias_mem_requirements);
}
if (bias->data == NULL) {
if (bias->Data<int32_t>() == NULL) {
int max_bias_size = 0;
get_arc_scratch_buffer_max_size(&max_bias_size);
bias->data = get_arc_scratch_buffer(max_bias_size);
bias->capacity = max_bias_size;
bias->SetData<int32_t>(
static_cast<int32_t*>(get_arc_scratch_buffer(max_bias_size)),
max_bias_size);
if (max_bias_size == 0) ret_val = kTfLiteError;
}
if (bias->data == NULL) ret_val = kTfLiteError;
if (bias->Data<int32_t>() == NULL) ret_val = kTfLiteError;
if (!inside_arc_ccm(weights->Data<int8_t>())) {
int weights_size = mli_hlp_count_elem_num(weights->MliTensor(), 0) *
mli_hlp_tensor_element_size(weights->MliTensor());
int max_weights_size = 0;
weights->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(weights_size)),
weights_size);
if (weights->Data<int8_t>() == NULL) {
get_arc_scratch_buffer_max_size(&max_weights_size);
weights->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(max_weights_size)),
max_weights_size);
if (max_weights_size == 0) ret_val = kTfLiteError;
}
if (weights->Data<int8_t>() == NULL) ret_val = kTfLiteError;
}
if (ret_val == kTfLiteOk) {
ret_val = get_arc_scratch_buffer_for_io_tensors(context, in, out);
}
#endif
return ret_val;
}
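// Illustrative call order only (not part of this change; PlanConvSlicing and
// the conv parameters are placeholders): a conv-style kernel first tries to
// place all tensors in fast CCM memory, then asks how large the I/O and
// weight slices may be before entering its mli_mov-based slicing loops.
static TfLiteStatus PlanConvSlicing(TfLiteContext* context,
                                    MliTensorInterface* in,
                                    MliTensorInterface* weights,
                                    MliTensorInterface* bias,
                                    MliTensorInterface* out, int kernel_height,
                                    int stride_height, int padding_top,
                                    int padding_bot) {
  if (get_arc_scratch_buffer_for_conv_tensors(context, in, weights, bias,
                                              out) != kTfLiteOk) {
    return kTfLiteError;
  }
  int in_slice_height = 0;
  int out_slice_height = 0;
  int slice_channels = 0;
  if (arc_scratch_buffer_calc_slice_size_io(
          in, out, kernel_height, stride_height, padding_top, padding_bot,
          &in_slice_height, &out_slice_height) != kTfLiteOk) {
    return kTfLiteError;
  }
  // 0 is the output-channel dimension of NHWC weights (KRNL_C_DIM_NHWC).
  if (arc_scratch_buffer_calc_slice_size_weights(
          weights, bias, /*weight_out_ch_dimension=*/0, &slice_channels) !=
      kTfLiteOk) {
    return kTfLiteError;
  }
  return kTfLiteOk;
}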
TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(
TfLiteContext* context, mli_tensor* in, mli_tensor* weights,
mli_tensor* bias, mli_tensor* out) {
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* weights,
MliTensorInterface* bias, MliTensorInterface* out) {
TfLiteStatus ret_val = kTfLiteOk;
#ifdef __Xxy
#if (defined(__Xxy)) || (defined(__Xvdsp))
init_arc_scratch_buffers();
/* Strategy for FC kernels:
First allocate the input, because it cannot be sliced (in case of batch
processing, only a single input needs to be allocated). Then weights & bias,
because if fully loaded they can be reused over batches. Then the output.
The number of output channels (for weights slicing) depends on the size of
the output and the size of weights & bias. */
if (!inside_arc_ccm(in->data)) {
/* In case the input tensor contains multiple batches,
only count the size of the innermost dimension */
int size_in = mli_hlp_count_elem_num(in, in->rank - 1) *
mli_hlp_tensor_element_size(in);
in->data = get_arc_scratch_buffer(size_in);
in->capacity = size_in;
if (in->data == NULL) {
in->capacity = 0;
ret_val = kTfLiteError;
}
if (!inside_arc_ccm(bias->Data<int32_t>())) {
int bias_mem_requirements = mli_hlp_count_elem_num(bias->MliTensor(), 0) *
mli_hlp_tensor_element_size(bias->MliTensor());
bias->SetData<int32_t>(
static_cast<int32_t*>(get_arc_scratch_buffer(bias_mem_requirements)),
bias_mem_requirements);
}
if (bias->Data<int32_t>() == NULL) {
int max_bias_size = 0;
get_arc_scratch_buffer_max_size(&max_bias_size);
bias->SetData<int32_t>(
static_cast<int32_t*>(get_arc_scratch_buffer(max_bias_size)),
max_bias_size);
if (max_bias_size == 0) ret_val = kTfLiteError;
}
if (bias->Data<int32_t>() == NULL) ret_val = kTfLiteError;
if (!inside_arc_ccm(weights->data)) {
int weights_size = mli_hlp_count_elem_num(weights, 0) *
mli_hlp_tensor_element_size(weights);
if (!inside_arc_ccm(weights->Data<int8_t>())) {
int weights_size = mli_hlp_count_elem_num(weights->MliTensor(), 0) *
mli_hlp_tensor_element_size(weights->MliTensor());
int max_weights_size = 0;
weights->data = get_arc_scratch_buffer(weights_size);
weights->capacity = weights_size;
if (weights->data == NULL) {
weights->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(weights_size)),
weights_size);
if (weights->Data<int8_t>() == NULL) {
get_arc_scratch_buffer_max_size(&max_weights_size);
weights->data = get_arc_scratch_buffer(max_weights_size);
weights->capacity = max_weights_size;
weights->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(max_weights_size)),
max_weights_size);
if (max_weights_size == 0) ret_val = kTfLiteError;
}
if (weights->data == NULL) ret_val = kTfLiteError;
if (weights->Data<int8_t>() == NULL) ret_val = kTfLiteError;
}
if (!inside_arc_ccm(bias->data)) {
int bias_mem_requirements =
mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias);
bias->data = get_arc_scratch_buffer(bias_mem_requirements);
bias->capacity = bias_mem_requirements;
}
/* Strategy for FC kernels:
First allocate the input, because it cannot be sliced (in case of batch
processing, only a single input needs to be allocated). Then weights &
bias, because if fully loaded they can be reused over batches. Then the
output. The number of output channels (for weights slicing) depends on
the size of the output and the size of weights & bias. */
if (!inside_arc_ccm(out->data)) {
if (!inside_arc_ccm(in->Data<int8_t>())) {
/* In case the input tensor contains multiple batches,
only count the size of the innermost dimension */
int out_size = mli_hlp_count_elem_num(out, out->rank - 1) *
mli_hlp_tensor_element_size(out);
int size_in = mli_hlp_count_elem_num(in->MliTensor(), *in->Rank() - 1) *
mli_hlp_tensor_element_size(in->MliTensor());
in->SetData<int8_t>(static_cast<int8_t*>(get_arc_scratch_buffer(size_in)),
size_in);
if (in->Data<int8_t>() == NULL) {
in->SetData<int8_t>(nullptr, 0);
ret_val = kTfLiteError;
}
}
if (!inside_arc_ccm(out->Data<int8_t>())) {
/* In case the output tensor contains multiple batches,
only count the size of the innermost dimension */
int out_size = mli_hlp_count_elem_num(out->MliTensor(), *out->Rank() - 1) *
mli_hlp_tensor_element_size(out->MliTensor());
int max_out_size = 0;
out->data = get_arc_scratch_buffer(out_size);
out->capacity = out_size;
if (out->data == NULL) {
out->SetData<int8_t>(static_cast<int8_t*>(get_arc_scratch_buffer(out_size)),
out_size);
if (out->Data<int8_t>() == NULL) {
get_arc_scratch_buffer_max_size(&max_out_size);
out->data = get_arc_scratch_buffer(max_out_size);
out->capacity = max_out_size;
out->SetData<int8_t>(
static_cast<int8_t*>(get_arc_scratch_buffer(max_out_size)),
max_out_size);
if (max_out_size == 0) ret_val = kTfLiteError;
}
if (out->data == NULL) ret_val = kTfLiteError;
if (out->Data<int8_t>() == NULL) ret_val = kTfLiteError;
}
if (bias->data == NULL) {
int max_bias_size = 0;
get_arc_scratch_buffer_max_size(&max_bias_size);
bias->data = get_arc_scratch_buffer(max_bias_size);
bias->capacity = max_bias_size;
if (max_bias_size == 0) ret_val = kTfLiteError;
}
if (bias->data == NULL) ret_val = kTfLiteError;
#endif
return ret_val;
}
TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
const mli_tensor* in, const mli_tensor* out, const int kernel_height,
const int stride_height, const int padding_top, const int padding_bot,
int* in_slice_height, int* out_slice_height) {
const MliTensorInterface* in, const MliTensorInterface* out,
const int kernel_height, const int stride_height, const int padding_top,
const int padding_bot, int* in_slice_height, int* out_slice_height) {
const int height_dimension = 1;
const int in_height = in->shape[height_dimension];
const int out_height = out->shape[height_dimension];
const int line_size_in = mli_hlp_count_elem_num(in, height_dimension + 1) *
mli_hlp_tensor_element_size(in);
const int line_size_out = mli_hlp_count_elem_num(out, height_dimension + 1) *
mli_hlp_tensor_element_size(out);
const int in_height = in->Shape()[height_dimension];
const int out_height = out->Shape()[height_dimension];
const int line_size_in =
mli_hlp_count_elem_num(in->MliTensor(), height_dimension + 1) *
mli_hlp_tensor_element_size(in->MliTensor());
const int line_size_out =
mli_hlp_count_elem_num(out->MliTensor(), height_dimension + 1) *
mli_hlp_tensor_element_size(out->MliTensor());
int max_lines_in = 0;
int max_lines_out = 0;
int max_out_lines_for_input = 0;
bool fit = (static_cast<int>(in->capacity) >= in_height * line_size_in) &&
(static_cast<int>(out->capacity) >= out_height * line_size_out);
bool fit =
(static_cast<int>(*in->DataCapacity()) >= in_height * line_size_in) &&
(static_cast<int>(*out->DataCapacity()) >= out_height * line_size_out);
if (fit) {
// In case both tensors completely fit in the capacity, there is no need for
// slicing. As padding can affect the effective input region, we also derive it
// from the output height and rely on clipping logic that intends to reduce the
// last, smaller slice. I.e. the only slice is a kind of
// "smaller last slice that needs to be corrected".
// In case both tensors completely fit in the capacity, there is no need
// for slicing. As padding can affect the effective input region, we also
// derive it from the output height and rely on clipping logic that intends
// to reduce the last, smaller slice. I.e. the only slice is a kind of "smaller
// last slice that needs to be corrected".
*in_slice_height = std::max(in_height, out_height * stride_height);
*out_slice_height = out_height;
} else {
// First compute how many lines fit into the input tensor, and compute how
// many output lines can be computed with that.
max_lines_in =
std::min(in_height, static_cast<int>(in->capacity) / line_size_in);
max_lines_in = std::min(
in_height, static_cast<int>(*in->DataCapacity()) / line_size_in);
if (max_lines_in >= in_height) {
max_out_lines_for_input = out_height;
} else if (2 * max_lines_in >= in_height) {
......@@ -276,8 +289,8 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
(max_lines_in - kernel_height + 1) / stride_height;
}
// Then compute how many output lines fit into the output tensor.
max_lines_out =
std::min(out_height, static_cast<int>(out->capacity) / line_size_out);
max_lines_out = std::min(
out_height, static_cast<int>(*out->DataCapacity()) / line_size_out);
// The smaller of the two determines the slice height for the output, and
// the derived slice height for the input.
*out_slice_height = std::min(max_out_lines_for_input, max_lines_out);
......@@ -292,29 +305,32 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
}
TfLiteStatus arc_scratch_buffer_calc_slice_size_weights(
const mli_tensor* weights, const mli_tensor* bias,
const MliTensorInterface* weights, const MliTensorInterface* bias,
const int weight_out_ch_dimension, int* slice_channels) {
const int channels = weights->shape[weight_out_ch_dimension];
const int ch_size_w = (mli_hlp_count_elem_num(weights, 0) / channels) *
mli_hlp_tensor_element_size(weights);
const int ch_size_b = (mli_hlp_count_elem_num(bias, 0) / channels) *
mli_hlp_tensor_element_size(bias);
const int channels = weights->Shape()[weight_out_ch_dimension];
const int ch_size_w =
(mli_hlp_count_elem_num(weights->MliTensor(), 0) / channels) *
mli_hlp_tensor_element_size(weights->MliTensor());
const int ch_size_b =
(mli_hlp_count_elem_num(bias->MliTensor(), 0) / channels) *
mli_hlp_tensor_element_size(bias->MliTensor());
int max_ch_weigths = 0;
int max_ch_bias = 0;
bool fit = (static_cast<int>(weights->capacity) >= channels * ch_size_w) &&
(static_cast<int>(bias->capacity) >= channels * ch_size_b);
bool fit =
(static_cast<int>(*weights->DataCapacity()) >= channels * ch_size_w) &&
(static_cast<int>(*bias->DataCapacity()) >= channels * ch_size_b);
if (fit) {
// in case both tensors completely fit in the capacity, there is no need for
// slicing
// in case both tensors completely fit in the capacity, there is no need
// for slicing
*slice_channels = channels;
} else {
// First compute how many channels fit into the weights tensor
max_ch_weigths =
std::min(channels, static_cast<int>(weights->capacity) / ch_size_w);
max_ch_weigths = std::min(
channels, static_cast<int>(*weights->DataCapacity()) / ch_size_w);
// Then compute how many channels fit into the bias tensor.
max_ch_bias =
std::min(channels, static_cast<int>(bias->capacity) / ch_size_b);
std::min(channels, static_cast<int>(*bias->DataCapacity()) / ch_size_b);
// The smaller of the two determines the slice size.
*slice_channels = std::min(max_ch_weigths, max_ch_bias);
}
......@@ -326,10 +342,9 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_weights(
}
}
TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* out) {
#ifdef __Xxy
TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* out) {
#if (defined(__Xxy)) || (defined(__Xvdsp))
init_arc_scratch_buffers();
return get_arc_scratch_buffer_for_io_tensors(context, in, out);
#else
......
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -17,6 +17,7 @@ limitations under the License.
#define TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_
#include "mli_api.h" // NOLINT
#include "mli_interface.h"
#include "tensorflow/lite/c/common.h"
namespace tflite {
......@@ -37,11 +38,9 @@ namespace micro {
*
* @return Tf Lite status code
*/
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* weights,
mli_tensor* bias,
mli_tensor* out);
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* weights,
MliTensorInterface* bias, MliTensorInterface* out);
/**
* @brief Function to allocate scratch buffers for pooling kernels with only
......@@ -56,9 +55,8 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context,
*
* @return Tf Lite status code
*/
TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* out);
TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* out);
/**
* @brief Function to allocate scratch buffers for the fully connect tensors
......@@ -75,8 +73,8 @@ TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context,
* @return Tf Lite status code
*/
TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(
TfLiteContext* context, mli_tensor* in, mli_tensor* weights,
mli_tensor* bias, mli_tensor* out);
TfLiteContext* context, MliTensorInterface* in, MliTensorInterface* weights,
MliTensorInterface* bias, MliTensorInterface* out);
/**
* @brief Function to calculate slice size for io tensors
......@@ -99,9 +97,9 @@ TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(
* @return Tf Lite status code
*/
TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
const mli_tensor* in, const mli_tensor* out, const int kernelHeight,
const int strideHeight, const int padding_top, const int padding_bot,
int* in_slice_height, int* out_slice_height);
const MliTensorInterface* in, const MliTensorInterface* out,
const int kernelHeight, const int strideHeight, const int padding_top,
const int padding_bot, int* in_slice_height, int* out_slice_height);
/**
* @brief Function to calculate slice size for weight slicing
......@@ -119,7 +117,7 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
* @return Tf Lite status code
*/
TfLiteStatus arc_scratch_buffer_calc_slice_size_weights(
const mli_tensor* weights, const mli_tensor* bias,
const MliTensorInterface* weights, const MliTensorInterface* bias,
const int weight_out_ch_dimension, int* slice_channels);
} // namespace micro
......
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -25,31 +25,45 @@ namespace micro {
* used for the data section and the stack. The values can be overridden by
* adding a -D option to the makefile of the application.
*/
#ifdef __Xxy
#ifndef SCRATCH_MEM_X_SIZE
#ifdef core_config_xy_size
#define SCRATCH_MEM_X_SIZE (core_config_xy_size)
#else
#define SCRATCH_MEM_X_SIZE (0)
#endif
#endif
#ifndef SCRATCH_MEM_Y_SIZE
#ifdef core_config_xy_size
#define SCRATCH_MEM_Y_SIZE (core_config_xy_size)
#else
#define SCRATCH_MEM_Y_SIZE (0)
#endif
#endif
#ifndef SCRATCH_MEM_Z_SIZE
#ifdef core_config_dccm_size
#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2)
#else
#define SCRATCH_MEM_Z_SIZE (0)
#endif
#endif
#elif defined(__Xvdsp)
#ifndef SCRATCH_MEM_VEC_SIZE
#ifdef core_config_vec_mem_size
#define SCRATCH_MEM_VEC_SIZE ((core_config_vec_mem_size * 3) / 4)
#endif
#endif
#else
#define SCRATCH_MEM_SIZE (65536)
#endif
namespace {
#ifdef __Xxy
#pragma Bss(".Xdata")
static int8_t scratch_mem_x[SCRATCH_MEM_X_SIZE];
#pragma Bss()
......@@ -61,12 +75,43 @@ static int8_t scratch_mem_y[SCRATCH_MEM_Y_SIZE];
#pragma Bss(".Zdata")
static int8_t scratch_mem_z[SCRATCH_MEM_Z_SIZE];
#pragma Bss()
#elif defined(__Xvdsp)
#pragma Bss(".vecmem_data")
static int8_t scratch_mem_vec_1[SCRATCH_MEM_VEC_SIZE / 4];
static int8_t scratch_mem_vec_2[SCRATCH_MEM_VEC_SIZE / 4];
static int8_t scratch_mem_vec_3[SCRATCH_MEM_VEC_SIZE / 2];
#pragma Bss()
#else
static int8_t scratch_mem_stack[SCRATCH_MEM_SIZE];
#endif
} // namespace
#ifdef __Xxy
static int8_t* scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z};
static uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE,
SCRATCH_MEM_Z_SIZE};
#elif defined(__Xvdsp)
static int8_t* scratch_mem[] = {scratch_mem_vec_1, scratch_mem_vec_2,
scratch_mem_vec_3};
static uint32_t scratch_sizes[] = {SCRATCH_MEM_VEC_SIZE / 4,
SCRATCH_MEM_VEC_SIZE / 4,
SCRATCH_MEM_VEC_SIZE / 2};
#else
static int8_t* scratch_mem[] = {scratch_mem_stack};
static uint32_t scratch_sizes[] = {SCRATCH_MEM_SIZE};
#endif
void* get_arc_scratch_buffer(int size) {
// Function to assign fast memory from one of 3 scratch buffers.
// Best Fit strategy - memory is allocated from that memory bank that leaves
......@@ -85,7 +130,7 @@ void* get_arc_scratch_buffer(int size) {
}
}
if (best_mem_idx >= 0) {
buf = static_cast<void*>(scratch_mem[best_mem_idx]);
buf = scratch_mem[best_mem_idx];
scratch_mem[best_mem_idx] += size;
scratch_sizes[best_mem_idx] -= size;
}
......@@ -122,12 +167,24 @@ void get_arc_scratch_buffer_two_max_sizes(int* size1, int* size2) {
}
void init_arc_scratch_buffers(void) {
#ifdef __Xxy
scratch_mem[0] = scratch_mem_x;
scratch_mem[1] = scratch_mem_y;
scratch_mem[2] = scratch_mem_z;
scratch_sizes[0] = SCRATCH_MEM_X_SIZE;
scratch_sizes[1] = SCRATCH_MEM_Y_SIZE;
scratch_sizes[2] = SCRATCH_MEM_Z_SIZE;
#elif defined(__Xvdsp)
scratch_mem[0] = scratch_mem_vec_1;
scratch_mem[1] = scratch_mem_vec_2;
scratch_mem[2] = scratch_mem_vec_3;
scratch_sizes[0] = SCRATCH_MEM_VEC_SIZE / 4;
scratch_sizes[1] = SCRATCH_MEM_VEC_SIZE / 4;
scratch_sizes[2] = SCRATCH_MEM_VEC_SIZE / 2;
#else
scratch_mem[0] = scratch_mem_stack;
scratch_sizes[0] = SCRATCH_MEM_SIZE;
#endif
}
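// Illustrative only (not part of this change; AllocateExample and the request
// size are placeholders): how the helpers above are typically used together.
// init_arc_scratch_buffers() resets the per-bank pointers and sizes, and each
// get_arc_scratch_buffer() call carves a chunk out of the bank that leaves the
// least unused space (best fit), returning NULL when no bank can satisfy the
// request.
void* AllocateExample() {
  init_arc_scratch_buffers();
  void* buf = get_arc_scratch_buffer(/*size=*/1024);
  if (buf == NULL) {
    // Fall back to the largest chunk that is still available, mirroring the
    // weights/bias fallback logic in scratch_buf_mgr.cc above.
    int max_size = 0;
    get_arc_scratch_buffer_max_size(&max_size);
    buf = get_arc_scratch_buffer(max_size);
  }
  return buf;
}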
} // namespace micro
......
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -49,7 +49,7 @@ static inline bool inside_arc_xccm(void* p) {
}
static inline bool inside_arc_yccm(void* p) {
#if core_config_xy
#if core_config_xy_size
return ((unsigned)p >= core_config_xy_y_base) &&
((unsigned)p < core_config_xy_y_base + core_config_xy_size);
#else
......@@ -57,8 +57,18 @@ static inline bool inside_arc_yccm(void* p) {
#endif
}
static inline bool inside_arc_vccm(void* p) {
#if core_config_vec_mem_size
return ((unsigned)p >= core_config_vec_mem_base) &&
((unsigned)p < core_config_vec_mem_base + core_config_vec_mem_size);
#else
return false;
#endif
}
static inline bool inside_arc_ccm(void* p) {
return inside_arc_dccm(p) || inside_arc_xccm(p) || inside_arc_yccm(p);
return inside_arc_dccm(p) || inside_arc_xccm(p) || inside_arc_yccm(p) ||
inside_arc_vccm(p);
}
} // namespace micro
......