Automated sync from github.com/tensorflow/tensorflow (#67)

3bc6526a · TFLM-bot · GitHub · 4a60d9a9 · 3bc6526a · 3bc6526a
28 changed files
--- a/tensorflow/lite/micro/benchmarks/keyword_scrambled_model_data.cc
+++ b/tensorflow/lite/micro/benchmarks/keyword_scrambled_model_data.cc
--- a/tensorflow/lite/micro/cortex_m_corstone_300/README.md
+++ b/tensorflow/lite/micro/cortex_m_corstone_300/README.md
 <!-- mdformat off(b/169948621#comment2) -->

-# Running a fixed virtual platform based on Corstone-300 software
+# Running a fixed virtual platform based on Arm(R) Corstone(TM)-300 software

-This target makes use of a fixed virtual platform (FVP) based on Arm Cortex-300
-based software. More info about Arm Corstone-300 software:
+This target makes use of a fixed virtual platform (FVP) based on Arm
+Corstone-300 software. More info about Arm Corstone-300 software:
 https://developer.arm.com/ip-products/subsystem/corstone/corstone-300. More info
 about FVPs:
 https://developer.arm.com/tools-and-software/simulation-models/fixed-virtual-platforms.
@@ -40,7 +40,10 @@ not matter when running unit tests or for debugging.
 Some examples:

 ```
-make -j -f tensorflow/lite/micro/tools/make/Makefile OPTIMIZED_KERNEL_DIR=cmsis_nn TARGET=cortex_m_corstone_300 TARGET_ARCH=cortex-m55 test_kernel_fully_connected_test
+make -j -f tensorflow/lite/micro/tools/make/Makefile CO_PROCESSOR=ethos_u TARGET=cortex_m_corstone_300 TARGET_ARCH=cortex-m55 test_network_tester_test
+make -j -f tensorflow/lite/micro/tools/make/Makefile OPTIMIZED_KERNEL_DIR=cmsis_nn TARGET=cortex_m_corstone_300 TARGET_ARCH=cortex-m55 test_network_tester_test
+make -j -f tensorflow/lite/micro/tools/make/Makefile CO_PROCESSOR=ethos_u OPTIMIZED_KERNEL_DIR=cmsis_nn TARGET=cortex_m_corstone_300 TARGET_ARCH=cortex-m55 test_network_tester_test
+make -j -f tensorflow/lite/micro/tools/make/Makefile TARGET=cortex_m_corstone_300 TARGET_ARCH=cortex-m55 test_network_tester_test
 make -j -f tensorflow/lite/micro/tools/make/Makefile TARGET=cortex_m_corstone_300 TARGET_ARCH=cortex-m55 test_kernel_fully_connected_test
 make -j -f tensorflow/lite/micro/tools/make/Makefile OPTIMIZED_KERNEL_DIR=cmsis_nn TARGET=cortex_m_corstone_300 TARGET_ARCH=cortex-m7+fp test_kernel_fully_connected_test
 make -j -f tensorflow/lite/micro/tools/make/Makefile TARGET=cortex_m_corstone_300 TARGET_ARCH=cortex-m3 test_kernel_fully_connected_test

--- a/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc
+++ b/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc
@@ -13,14 +13,41 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

+#ifdef ETHOS_U
+#include "ethosu_driver.h"
+
+// This is set in micro/tools/make/targets/cortex_m_corstone_300_makefile.inc.
+// It is needed for the calls to NVIC_SetVector()/NVIC_EnableIRQ().
+#include CMSIS_DEVICE_ARM_CORTEX_M_XX_HEADER_FILE
+#endif
+#include "tensorflow/lite/micro/micro_error_reporter.h"
 #include "tensorflow/lite/micro/system_setup.h"

 namespace tflite {

+#ifdef ETHOS_U
+void ethosuIrqHandler0() { ethosu_irq_handler(); }
+#endif
+
 extern "C" {
 void uart_init(void);
 }

-void InitializeTarget() { uart_init(); }
+void InitializeTarget() {
+  uart_init();
+
+#ifdef ETHOS_U
+  constexpr int ethosu_base_address = 0x48102000;
+  constexpr int ethosu_irq = 56;
+
+  // Initialize Ethos-U NPU driver.
+  if (ethosu_init(reinterpret_cast<void*>(ethosu_base_address))) {
+    MicroPrintf("Failed to initialize Ethos-U driver");
+  }
+  NVIC_SetVector(static_cast<IRQn_Type>(ethosu_irq),
+                 (uint32_t)&ethosuIrqHandler0);
+  NVIC_EnableIRQ(static_cast<IRQn_Type>(ethosu_irq));
+#endif
+}

 }  // namespace tflite
--- a/tensorflow/lite/micro/examples/network_tester/Makefile.inc
+++ b/tensorflow/lite/micro/examples/network_tester/Makefile.inc
 NETWORK_TESTER_TEST_SRCS := \
 tensorflow/lite/micro/examples/network_tester/network_tester_test.cc

+ifeq ($(CO_PROCESSOR),ethos_u)
+  NETWORK_TESTER_TEST_SRCS += \
+    $(MAKEFILE_DIR)/downloads/person_model_int8/person_detect_model_data_vela.cc \
+    $(MAKEFILE_DIR)/downloads/person_model_int8/person_image_data.cc
+endif
+
 NETWORK_TESTER_TEST_HDRS := \
 tensorflow/lite/micro/examples/network_tester/network_model.h \
 tensorflow/lite/micro/examples/network_tester/input_data.h \

--- a/tensorflow/lite/micro/examples/network_tester/README.md
+++ b/tensorflow/lite/micro/examples/network_tester/README.md
@@ -8,6 +8,19 @@ input data (input_data.h) and default expected output data
 The default model is a single MaxPool2D operator, with an input shape of {1, 4,
 4, 1} and an output shape of {1, 2, 2, 1}.

+When building the FVP target for Ethos-U (CO_PROCESSOR=ethos_u) the person
+detect int8 model is used instead. The downloaded model is optimized for Ethos-U
+with Ethos-U Vela. For more info see the following readmes:
+tensorflow/lite/micro/kernels/ethos_u/README.md,
+tensorflow/lite/micro/cortex_m_corstone_300/README.md and
+tensorflow/lite/micro/examples/person_detection/README.md. The following Vela
+configuration has been used, which is compatible with the FVP build target
+(TARGET=cortex_m_corstone_300).
+
+```
+vela --accelerator-config=ethos-u55-256
+```
+
 In order to use another model, input data, or expected output data, simply
 specify the path to the new header files when running make as seen below.


--- a/tensorflow/lite/micro/examples/network_tester/expected_output_data.h
+++ b/tensorflow/lite/micro/examples/network_tester/expected_output_data.h
@@ -16,6 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_MICRO_EXAMPLES_NETWORK_TESTER_EXPECTED_OUTPUT_DATA_H_
 #define TENSORFLOW_LITE_MICRO_EXAMPLES_NETWORK_TESTER_EXPECTED_OUTPUT_DATA_H_

+#ifdef ETHOS_U
+static unsigned char expected_output_data[1][2] = {{143, 113}};
+#else
 static unsigned char expected_output_data[1][4] = {{6, 8, 14, 16}};
+#endif

 #endif  // TENSORFLOW_LITE_MICRO_EXAMPLES_NETWORK_TESTER_EXPECTED_OUTPUT_DATA_H_
--- a/tensorflow/lite/micro/examples/network_tester/network_model.h
+++ b/tensorflow/lite/micro/examples/network_tester/network_model.h
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

--- a/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc
+++ b/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc
@@ -23,9 +23,18 @@ limitations under the License.
 #include "tensorflow/lite/micro/testing/micro_test.h"
 #include "tensorflow/lite/schema/schema_generated.h"

+#ifdef ETHOS_U
+#include "tensorflow/lite/micro/examples/person_detection/person_detect_model_data.h"
+#include "tensorflow/lite/micro/examples/person_detection/person_image_data.h"
+#endif
+
 #ifndef TENSOR_ARENA_SIZE
+#ifdef ETHOS_U
+#define TENSOR_ARENA_SIZE (136 * 1024)
+#else
 #define TENSOR_ARENA_SIZE (1024)
 #endif
+#endif

 #ifndef NUM_INFERENCES
 #define NUM_INFERENCES 1
@@ -73,7 +82,11 @@ TF_LITE_MICRO_TESTS_BEGIN
 TF_LITE_MICRO_TEST(TestInvoke) {
  tflite::MicroErrorReporter micro_error_reporter;

+#ifdef ETHOS_U
+  const tflite::Model* model = ::tflite::GetModel(g_person_detect_model_data);
+#else
  const tflite::Model* model = ::tflite::GetModel(network_model);
+#endif
  if (model->version() != TFLITE_SCHEMA_VERSION) {
    TF_LITE_REPORT_ERROR(&micro_error_reporter,
                         "Model provided is schema version %d not equal "
@@ -96,7 +109,11 @@ TF_LITE_MICRO_TEST(TestInvoke) {
  for (int n = 0; n < NUM_INFERENCES; n++) {
    for (size_t i = 0; i < interpreter.inputs_size(); ++i) {
      TfLiteTensor* input = interpreter.input(i);
+#ifdef ETHOS_U
+      memcpy(input->data.int8, g_person_data, input->bytes);
+#else
      memcpy(input->data.data, input_data[i], input->bytes);
+#endif
    }
    TfLiteStatus invoke_status = interpreter.Invoke();
    if (invoke_status != kTfLiteOk) {

--- a/tensorflow/lite/micro/kernels/ethos_u/README.md
+++ b/tensorflow/lite/micro/kernels/ethos_u/README.md
@@ -4,7 +4,7 @@
 Arm(R) Ethos(TM)-U is a new class of machine learning processors, called a
 microNPU, specifically designed to accelerate ML inference in area-constrained
 embedded and IoT devices. This readme briefly describes how to integrate Ethos-U
-related hardware and software into TFLM.
+related hardware and software into TFLM. See also [Ethos-U ML Evaluation kit examples](https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ml-embedded-evaluation-kit).

 To enable the Ethos-U software stack, add `CO_PROCESSOR=ethos_u` to the make
 command line. See example below.
@@ -16,7 +16,7 @@ command line. See example below.
 ## Ethos-U custom operator
 The TFLM runtime will dispatch workloads to Ethos-U when it encounters an
 Ethos-U custom op in the tflite file. The Ethos-U custom op is added by a tool
-called Vela and contains information the Ethos-U hardware need to execute
+called Ethos-U Vela and contains information the Ethos-U hardware needs to execute
 the workload. More info in the [Vela repo](https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ethos-u-vela).

 ```
@@ -47,14 +47,19 @@ startup, before calling the TFLM API. More info in the [Ethos-U driver repo](htt

 For even more info regarding Vela and Ethos-U, checkout [Ethos-U landing page](https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ethos-u/+/refs/heads/master).

-# Example 1
+# Some examples of compiling a binary and running a network with Ethos-U support.
+In order to run a test with Ethos-U55 enabled, a platform with corresponding hardware support is required. One such platform is the fixed virtual platform (FVP) based on Arm Corstone-300 software. See tensorflow/lite/micro/cortex_m_corstone_300/README.md for more info.

-Compile a binary with Ethos-U support using the following command:
+On top of that, the .tflite model needs to be modified according to the subchapter "Ethos-U custom operator" above.
+
+## Example using network tester
+See tensorflow/lite/micro/examples/network_tester/README.md for more info.

 ```
-make -f tensorflow/lite/micro/tools/make/Makefile network_tester_test CO_PROCESSOR=ethos_u \
-TARGET=<ethos_u_enabled_target> NETWORK_MODEL=<ethos_u_enabled_tflite>
-```
+make -f tensorflow/lite/micro/tools/make/Makefile network_tester_test CO_PROCESSOR=ethos_u TARGET=cortex_m_corstone_300 \
+TARGET_ARCH=cortex-m55 test_network_tester_test NETWORK_MODEL=path/to/network_model.h INPUT_DATA=path/to/input_data.h \
+OUTPUT_DATA=path/to/expected_output_data.h

-TODO: Replace `ethos_u_enabled_target` and `ethos_u_enabled_tflite` once the
-Arm Corstone(TM)-300 example is up and running.
+make -f tensorflow/lite/micro/tools/make/Makefile network_tester_test CO_PROCESSOR=ethos_u TARGET=cortex_m_corstone_300 \
+TARGET_ARCH=cortex-m55 test_network_tester_test
+```
--- a/tensorflow/lite/micro/kernels/xtensa/conv.cc
+++ b/tensorflow/lite/micro/kernels/xtensa/conv.cc
@@ -34,9 +34,9 @@ namespace {
 struct OpData {
  OpDataConv reference_op_data;

-#if defined(FUSION_F1)
+#if defined(FUSION_F1) || defined(HIFI5)
  int scratch_tensor_index;
-#endif  // defined(FUSION_F1)
+#endif  // defined(FUSION_F1) || defined(HIFI5)
 };

 #if defined(HIFIMINI)
@@ -250,7 +250,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  TF_LITE_ENSURE_OK(context, ConvPrepare(context, node));

-#if defined(FUSION_F1)
+#if defined(FUSION_F1) || defined(HIFI5)
  OpData* data = static_cast<OpData*>(node->user_data);
  const auto params = static_cast<const TfLiteConvParams*>(node->builtin_data);

@@ -288,17 +288,17 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  TF_LITE_ENSURE_OK(
      context, context->RequestScratchBufferInArena(
                   context, required_scratch, &data->scratch_tensor_index));
-#endif  // defined(FUSION_F1)
+#endif  // defined(FUSION_F1) || defined(HIFI5)
  return kTfLiteOk;
 }

-#if defined(FUSION_F1)
-TfLiteStatus EvalHifi4(TfLiteContext* context, TfLiteNode* node,
-                       const TfLiteConvParams& params, const OpData& data,
-                       const TfLiteEvalTensor* input,
-                       const TfLiteEvalTensor* filter,
-                       const TfLiteEvalTensor* bias, TfLiteEvalTensor* output,
-                       TfLiteEvalTensor* im2col) {
+#if defined(FUSION_F1) || defined(HIFI5)
+TfLiteStatus EvalHifi(TfLiteContext* context, TfLiteNode* node,
+                      const TfLiteConvParams& params, const OpData& data,
+                      const TfLiteEvalTensor* input,
+                      const TfLiteEvalTensor* filter,
+                      const TfLiteEvalTensor* bias, TfLiteEvalTensor* output,
+                      TfLiteEvalTensor* im2col) {
  const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input);
  const RuntimeShape& filter_shape = tflite::micro::GetTensorShape(filter);
  /* Dilation is currently not supported on HiFi 4 NN Library */
@@ -411,7 +411,7 @@ TfLiteStatus EvalHifi4(TfLiteContext* context, TfLiteNode* node,
      tflite::micro::GetTensorData<int8_t>(output));
  return kTfLiteOk;
 }
-#endif  // defined(FUSION_F1)
+#endif  // defined(FUSION_F1) || defined(HIFI5)

 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  TFLITE_DCHECK(node->user_data != nullptr);
@@ -470,9 +470,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                   tflite::micro::GetTensorData<int32_t>(bias),
                   tflite::micro::GetTensorShape(output),
                   tflite::micro::GetTensorData<int8_t>(output));
-#elif defined(FUSION_F1)
-      EvalHifi4(context, node, params, op_data, input, filter, bias, output,
-                nullptr);
+#elif defined(FUSION_F1) || defined(HIFI5)
+      EvalHifi(context, node, params, op_data, input, filter, bias, output,
+               nullptr);
 #else
      reference_integer_ops::ConvPerChannel(
          ConvParamsQuantized(params, op_data.reference_op_data),

--- a/tensorflow/lite/micro/kernels/xtensa/depthwise_conv.cc
+++ b/tensorflow/lite/micro/kernels/xtensa/depthwise_conv.cc
@@ -35,9 +35,9 @@ namespace {
 struct OpData {
  OpDataConv reference_op_data;

-#if defined(FUSION_F1)
+#if defined(FUSION_F1) || defined(HIFI5)
  int scratch_tensor_index;
-#endif  // defined(FUSION_F1)
+#endif  // defined(FUSION_F1) || defined(HIFI5)
 };

 #if defined(HIFIMINI)
@@ -294,7 +294,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  TF_LITE_ENSURE_OK(context, DepthwiseConvPrepare(context, node));

-#if defined(FUSION_F1)
+#if defined(FUSION_F1) || defined(HIFI5)
  OpData* data = static_cast<OpData*>(node->user_data);
  const auto& params =
      *(static_cast<const TfLiteDepthwiseConvParams*>(node->builtin_data));
@@ -340,16 +340,16 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  TF_LITE_ENSURE_OK(
      context, context->RequestScratchBufferInArena(
                   context, required_scratch, &data->scratch_tensor_index));
-#endif  // defined(FUISON_F1)
+#endif  // defined(FUSION_F1) || defined(HIFI5)
  return kTfLiteOk;
 }

-#if defined(FUSION_F1)
-TfLiteStatus EvalHifi4(TfLiteContext* context, TfLiteNode* node,
-                       const TfLiteDepthwiseConvParams& params,
-                       const OpData& data, const TfLiteEvalTensor* input,
-                       const TfLiteEvalTensor* filter,
-                       const TfLiteEvalTensor* bias, TfLiteEvalTensor* output) {
+#if defined(FUSION_F1) || defined(HIFI5)
+TfLiteStatus EvalHifi(TfLiteContext* context, TfLiteNode* node,
+                      const TfLiteDepthwiseConvParams& params,
+                      const OpData& data, const TfLiteEvalTensor* input,
+                      const TfLiteEvalTensor* filter,
+                      const TfLiteEvalTensor* bias, TfLiteEvalTensor* output) {
  // If dilation is not required use the optimized NN Library kernel.
  // Otherwise call the reference implementation.
  if ((params.dilation_width_factor == 1) &&
@@ -439,7 +439,7 @@ TfLiteStatus EvalHifi4(TfLiteContext* context, TfLiteNode* node,

  return kTfLiteOk;
 }
-#endif  // defined(FUSION_F1)
+#endif  // defined(FUSION_F1) || defined(HIFI5)

 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  TFLITE_DCHECK(node->user_data != nullptr);
@@ -499,8 +499,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
          tflite::micro::GetTensorData<int32_t>(bias),
          tflite::micro::GetTensorShape(output),
          tflite::micro::GetTensorData<int8_t>(output));
-#elif defined(FUSION_F1)
-      EvalHifi4(context, node, params, op_data, input, filter, bias, output);
+#elif defined(FUSION_F1) || defined(HIFI5)
+      EvalHifi(context, node, params, op_data, input, filter, bias, output);
 #else
      reference_integer_ops::DepthwiseConvPerChannel(
          DepthwiseConvParamsQuantized(params, op_data.reference_op_data),

--- a/tensorflow/lite/micro/kernels/xtensa/softmax.cc
+++ b/tensorflow/lite/micro/kernels/xtensa/softmax.cc
@@ -33,7 +33,7 @@ namespace {
 struct OpData {
  uint16_t* exp_lut;
 };
-#elif defined(FUSION_F1)
+#elif defined(FUSION_F1) || defined(HIFI5)
 struct OpData {
  SoftmaxParams params;
  int scratch_tensor_index;
@@ -181,8 +181,8 @@ TfLiteStatus PrepareHifimini(TfLiteContext* context, TfLiteNode* node) {
 }
 #endif  // defined(HIFIMINI)

-#if defined(FUSION_F1)
-TfLiteStatus PrepareHifi4(TfLiteContext* context, TfLiteNode* node) {
+#if defined(FUSION_F1) || defined(HIFI5)
+TfLiteStatus PrepareHifi(TfLiteContext* context, TfLiteNode* node) {
  TF_LITE_ENSURE_OK(context, SoftmaxPrepare(context, node));

  // Calculate scratch memory requirements and request scratch buffer
@@ -209,8 +209,8 @@ TfLiteStatus PrepareHifi4(TfLiteContext* context, TfLiteNode* node) {
  return kTfLiteOk;
 }

-TfLiteStatus EvalHifi4(const OpData* op_data, const TfLiteEvalTensor* input,
-                       TfLiteEvalTensor* output, TfLiteContext* context) {
+TfLiteStatus EvalHifi(const OpData* op_data, const TfLiteEvalTensor* input,
+                      TfLiteEvalTensor* output, TfLiteContext* context) {
  const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input);
  const int8_t* input_data = tflite::micro::GetTensorData<int8_t>(input);
  const RuntimeShape& output_shape = tflite::micro::GetTensorShape(output);
@@ -234,10 +234,10 @@ TfLiteStatus EvalHifi4(const OpData* op_data, const TfLiteEvalTensor* input,
  return kTfLiteOk;
 }

-#endif  // defined(FUSION_F1)
+#endif  // defined(FUSION_F1) || defined(HIFI5)

 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-#if defined(HIFIMINI) || defined(FUSION_F1)
+#if defined(HIFIMINI) || defined(FUSION_F1) || defined(HIFI5)
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
  return context->AllocatePersistentBuffer(context, sizeof(OpData));
 #else
@@ -248,8 +248,8 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 #if defined(HIFIMINI)
  return PrepareHifimini(context, node);
-#elif defined(FUSION_F1)
-  return PrepareHifi4(context, node);
+#elif defined(FUSION_F1) || defined(HIFI5)
+  return PrepareHifi(context, node);
 #else
  return SoftmaxPrepare(context, node);
 #endif
@@ -267,9 +267,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                           tflite::micro::GetTensorData<int8_t>(input),
                           tflite::micro::GetTensorShape(output),
                           tflite::micro::GetTensorData<int16_t>(output));
-#elif defined(FUSION_F1)
-    return EvalHifi4(static_cast<OpData*>(node->user_data), input, output,
-                     context);
+#elif defined(FUSION_F1) || defined(HIFI5)
+    return EvalHifi(static_cast<OpData*>(node->user_data), input, output,
+                    context);
 #else
    SoftmaxParams op_data = *static_cast<SoftmaxParams*>(node->user_data);
    tflite::reference_ops::Softmax(

--- a/tensorflow/lite/micro/micro_interpreter.h
+++ b/tensorflow/lite/micro/micro_interpreter.h
@@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
-// Adding a new comment that will be removed when we sync from upstream TF.
-
 #ifndef TENSORFLOW_LITE_MICRO_MICRO_INTERPRETER_H_
 #define TENSORFLOW_LITE_MICRO_MICRO_INTERPRETER_H_


--- a/tensorflow/lite/micro/testing/generate_test_models.py
+++ b/tensorflow/lite/micro/testing/generate_test_models.py
@@ -28,7 +28,7 @@ from __future__ import print_function

 from absl import app
 import numpy as np
-import tensorflow.compat.v2 as tf
+import tensorflow as tf


 def generate_conv_model():

--- a/tensorflow/lite/micro/testing/test_with_arm_corstone_300.sh
+++ b/tensorflow/lite/micro/testing/test_with_arm_corstone_300.sh
@@ -31,11 +31,12 @@ MICRO_LOG_FILENAME=${RESULTS_DIRECTORY}/logs.txt
 mkdir -p ${RESULTS_DIRECTORY}

 FVP="FVP_Corstone_SSE-300_Ethos-U55 "
-FVP+="--cpulimit 1 "
+FVP+="-C ethosu.num_macs=256 "
 FVP+="-C mps3_board.visualisation.disable-visualisation=1 "
 FVP+="-C mps3_board.telnetterminal0.start_telnet=0 "
 FVP+='-C mps3_board.uart0.out_file="-" '
-FVP+='-C mps3_board.uart0.unbuffered_output=1'
+FVP+='-C mps3_board.uart0.unbuffered_output=1 '
+FVP+='-C mps3_board.uart0.shutdown_on_eot=1'
 ${FVP} ${BINARY_TO_TEST} | tee ${MICRO_LOG_FILENAME}

 if grep -q "$PASS_STRING" ${MICRO_LOG_FILENAME}

--- a/tensorflow/lite/micro/tools/ci_build/test_cortex_m_corstone_300.sh
+++ b/tensorflow/lite/micro/tools/ci_build/test_cortex_m_corstone_300.sh
@@ -29,9 +29,9 @@ TARGET_ARCH=cortex-m55
 OPTIMIZED_KERNEL_DIR=cmsis_nn

 # TODO(b/143715361): downloading first to allow for parallel builds.
-readable_run make -f tensorflow/lite/micro/tools/make/Makefile OPTIMIZED_KERNEL_DIR=${OPTIMIZED_KERNEL_DIR} TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} third_party_downloads
+readable_run make -f tensorflow/lite/micro/tools/make/Makefile CO_PROCESSOR=ethos_u OPTIMIZED_KERNEL_DIR=${OPTIMIZED_KERNEL_DIR} TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} third_party_downloads

 # Avoid running tests in parallel.
 readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean
-readable_run make -j$(nproc) -f tensorflow/lite/micro/tools/make/Makefile OPTIMIZED_KERNEL_DIR=${OPTIMIZED_KERNEL_DIR} TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} build
-readable_run make -f tensorflow/lite/micro/tools/make/Makefile OPTIMIZED_KERNEL_DIR=${OPTIMIZED_KERNEL_DIR} TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} test
+readable_run make -j$(nproc) -f tensorflow/lite/micro/tools/make/Makefile CO_PROCESSOR=ethos_u OPTIMIZED_KERNEL_DIR=${OPTIMIZED_KERNEL_DIR} TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} build
+readable_run make -f tensorflow/lite/micro/tools/make/Makefile CO_PROCESSOR=ethos_u OPTIMIZED_KERNEL_DIR=${OPTIMIZED_KERNEL_DIR} TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} test
--- a/tensorflow/lite/micro/tools/make/Makefile
+++ b/tensorflow/lite/micro/tools/make/Makefile
@@ -584,7 +584,10 @@ ifneq ($(DISABLE_DOWNLOADS), true)
  $(eval $(call add_third_party_download,$(GEMMLOWP_URL),$(GEMMLOWP_MD5),gemmlowp,))
  $(eval $(call add_third_party_download,$(RUY_URL),$(RUY_MD5),ruy,))
  $(eval $(call add_third_party_download,$(PERSON_MODEL_URL),$(PERSON_MODEL_MD5),person_model_grayscale,))
-  $(eval $(call add_third_party_download,$(PERSON_MODEL_INT8_URL),$(PERSON_MODEL_INT8_MD5),person_model_int8,))
+  RESULT := $(shell $(MAKEFILE_DIR)/person_detection_int8_download.sh ${MAKEFILE_DIR}/downloads $(CO_PROCESSOR))
+  ifneq ($(RESULT), SUCCESS)
+    $(error Something went wrong with the person detection int8 model download: $(RESULT))
+  endif
 endif

 # The target-specific makefile must have a name that is exactly

--- a/tensorflow/lite/micro/tools/make/corstone_300_download.sh
+++ b/tensorflow/lite/micro/tools/make/corstone_300_download.sh
@@ -50,8 +50,8 @@ if [ -d ${DOWNLOADED_CORSTONE_PATH} ]; then
 else
  UNAME_S=`uname -s`
  if [ ${UNAME_S} == Linux ]; then
-    CORSTONE_URL=https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_Ethos-U55_11.12_57.tgz
-    EXPECTED_MD5=08cc89b02a41917c2224f390f3ac0b47
+    CORSTONE_URL=https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/MPS3/FVP_Corstone_SSE-300_Ethos-U55_11.14_24.tgz
+    EXPECTED_MD5=89ca3355452072f879c134d04b6f94e2
  else
    echo "OS type ${UNAME_S} not supported."
    exit 1

--- a/tensorflow/lite/micro/tools/make/ethos_u_core_platform_download.sh
+++ b/tensorflow/lite/micro/tools/make/ethos_u_core_platform_download.sh
@@ -50,8 +50,8 @@ if [ -d ${DOWNLOADED_ETHOS_U_CORE_PLATFORM_PATH} ]; then
 else
  UNAME_S=`uname -s`
  if [ ${UNAME_S} == Linux ]; then
-    ETHOS_U_CORE_PLATFORM_URL=https://git.mlplatform.org/ml/ethos-u/ethos-u-core-platform.git/snapshot/ethos-u-core-platform-6663630bb3feea222fd38278a962297c08d0b320.tar.gz
-    EXPECTED_MD5=11683ce5cbf4e4d1003ca93a85ad0b08
+    ETHOS_U_CORE_PLATFORM_URL=https://git.mlplatform.org/ml/ethos-u/ethos-u-core-platform.git/snapshot/ethos-u-core-platform-b5f7cfe253dfeadd83caf60fde34b5b66f356782.tar.gz
+    EXPECTED_MD5=9431cd98f9d42d3bca9742dd7cab7229
  else
    echo "OS type ${UNAME_S} not supported."
    exit 1
@@ -75,6 +75,20 @@ else
  fi
  LINKER_PATH=${DOWNLOADED_ETHOS_U_CORE_PLATFORM_PATH}/targets/corstone-300
  ${COMPILER} -E -x c -P -o ${LINKER_PATH}/platform_parsed.ld ${LINKER_PATH}/platform.ld
+
+  # Move rodata from ITCM to DDR in order to support a bigger model without a specified section.
+  sed -i '/rodata/d' ${LINKER_PATH}/platform_parsed.ld
+  sed -i 's/network_model_sec/\.rodata\*/' ${LINKER_PATH}/platform_parsed.ld
+
+  # Patch retarget.c so that g++ can find _exit symbol.
+  cat <<EOT >> ${DOWNLOADED_ETHOS_U_CORE_PLATFORM_PATH}/targets/corstone-300/retarget.c
+
+void RETARGET(exit)(int return_code) {
+  _exit(return_code);
+  while (1) {}
+}
+EOT
+
 fi

 echo "SUCCESS"
--- a/tensorflow/lite/micro/tools/make/ext_libs/ethos_u.inc
+++ b/tensorflow/lite/micro/tools/make/ext_libs/ethos_u.inc
@@ -41,3 +41,11 @@ THIRD_PARTY_CC_HDRS += $(CMSIS_PATH)/CMSIS/Core/Include/cmsis_compiler.h
 INCLUDES += -I$(ETHOSU_DRIVER_PATH)/include \
            -I$(CMSIS_PATH)/CMSIS/Core/Include
 GENERATED_PROJECT_INCLUDES += -I./$(ETHOSU_DRIVER_PATH)/include
+
+ETHOSU_LOG_SEVERITY := ETHOSU_LOG_INFO
+CCFLAGS += -DETHOSU_LOG_SEVERITY=$(ETHOSU_LOG_SEVERITY)
+
+# TODO(#47718): resolve warnings.
+CCFLAGS += \
+  -Wno-return-type \
+  -Wno-format
--- a/tensorflow/lite/micro/tools/make/ext_libs/person_detection_int8_vela_convert.sh
+++ b/tensorflow/lite/micro/tools/make/ext_libs/person_detection_int8_vela_convert.sh
+#!/bin/bash
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Called with following arguments:
+# 1 - Path to the downloads folder which is typically
+#     tensorflow/lite/micro/tools/make/downloads
+#
+# This script is called from the Makefile and uses the following convention to
+# enable determination of success/failure:
+#
+#   - If the script is successful, the only output on stdout should be SUCCESS.
+#     The makefile checks for this particular string.
+#
+#   - Any string on stdout that is not SUCCESS will be shown in the makefile as
+#     the cause for the script to have failed.
+#
+#   - Any other informational prints should be on stderr.
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR=${SCRIPT_DIR}/../../../../../..
+cd "${ROOT_DIR}"
+
+source tensorflow/lite/micro/tools/make/bash_helpers.sh
+
+DOWNLOADS_DIR=${1}
+if [ ! -d ${DOWNLOADS_DIR} ]; then
+  echo "The top-level downloads directory: ${DOWNLOADS_DIR} does not exist."
+  exit 1
+fi
+
+DOWNLOADED_PERSON_MODEL_INT8_PATH=${DOWNLOADS_DIR}/person_model_int8
+if [ ! -d ${DOWNLOADED_PERSON_MODEL_INT8_PATH} ]; then
+  echo "${DOWNLOADED_PERSON_MODEL_INT8_PATH} is not downloaded."
+  exit 1
+fi
+
+# Optimize downloaded model with Vela for Ethos-U.
+# See tensorflow/lite/micro/kernels/ethos_u/README.md for more info.
+CONVERTED_PERSON_MODEL_INT8=${DOWNLOADED_PERSON_MODEL_INT8_PATH}/person_detect_model_data_vela.cc
+if [ ! -f ${CONVERTED_PERSON_MODEL_INT8} ]; then
+  command xxd -v >&2 || (echo "xxd command is needed, please install.." && exit 1)
+  echo >&2 "Converting person detection int8 model to Ethos-U optimized model.."
+
+  # Convert original model to .tflite format.
+  grep -E "(0x[0-9a-f]{2}(,|))" ${DOWNLOADED_PERSON_MODEL_INT8_PATH}/person_detect_model_data.cc | xxd -r -p > \
+      ${DOWNLOADED_PERSON_MODEL_INT8_PATH}/person_detect.tflite
+
+  # Compile an optimized .tflite version for Ethos-U.
+  TEMPFILE=$(mktemp -d)/
+  python3 -m venv $TEMPFILE
+  source $TEMPFILE/bin/activate
+  pip install --upgrade setuptools >&2
+  pip install ethos-u-vela >&2
+  vela --accelerator-config=ethos-u55-256 ${DOWNLOADED_PERSON_MODEL_INT8_PATH}/person_detect.tflite \
+       --output-dir ${DOWNLOADED_PERSON_MODEL_INT8_PATH} >&2
+  deactivate
+
+  # Convert .tflite back to C array.
+  echo "// This file is generated by $0." > ${CONVERTED_PERSON_MODEL_INT8}
+  echo '#include "tensorflow/lite/micro/examples/person_detection/person_detect_model_data.h"' >> \
+       ${CONVERTED_PERSON_MODEL_INT8}
+  echo -n "const " >> ${CONVERTED_PERSON_MODEL_INT8}
+  xxd -i ${DOWNLOADED_PERSON_MODEL_INT8_PATH}/person_detect_vela.tflite >> \
+      ${CONVERTED_PERSON_MODEL_INT8}
+  sed -i 's/tensorflow_lite_micro_tools_make_downloads_person_model_int8_person_detect_vela_tflite/g_person_detect_model_data/' \
+      ${CONVERTED_PERSON_MODEL_INT8}
+  sed -i 's/^const unsigned char g_person_detect_model_data/alignas\(16\) &/'  ${CONVERTED_PERSON_MODEL_INT8}
+  sed -i 's/unsigned int/const int/' ${CONVERTED_PERSON_MODEL_INT8}
+fi
+
+echo "SUCCESS"
--- a/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc
+++ b/tensorflow/lite/micro/tools/make/ext_libs/xtensa.inc
@@ -26,7 +26,16 @@ ifeq ($(TARGET_ARCH), $(findstring $(TARGET_ARCH), "hifi5"))
    $(NNLIB_PATH)/algo/kernels/matXvec/hifi5/xa_nn_matXvec_8x16.c \
    $(NNLIB_PATH)/algo/kernels/matXvec/hifi5/xa_nn_matXvec_8x8.c \
    $(NNLIB_PATH)/algo/kernels/activations/hifi5/xa_nn_activations_8_8.c \
-    $(NNLIB_PATH)/algo/kernels/fc/hifi4/xa_nn_fully_connected.c
+    $(NNLIB_PATH)/algo/kernels/fc/hifi4/xa_nn_fully_connected.c \
+    $(NNLIB_PATH)/algo/kernels/activations/hifi5/xa_nn_softmax_asym8_asym8.c \
+    $(NNLIB_PATH)/algo/kernels/cnn/hifi5/xa_nn_matXvec_sym8sxasym8s_asym8s_circ.c \
+    $(NNLIB_PATH)/algo/kernels/cnn/hifi5/xa_nn_circ_buf.c \
+    $(NNLIB_PATH)/algo/kernels/cnn/hifi5/xa_nn_conv2d_std_circ_buf.c \
+    $(NNLIB_PATH)/algo/kernels/cnn/hifi5/xa_nn_conv2d_std_sym8sxasym8s.c \
+    $(NNLIB_PATH)/algo/kernels/cnn/hifi5/xa_nn_conv2d_depthwise.c \
+    $(NNLIB_PATH)/algo/kernels/cnn/hifi5/xa_nn_conv2d_depthwise_sym8sxasym8s.c \
+    $(NNLIB_PATH)/algo/kernels/cnn/hifi5/xa_nn_conv2d_pointwise_sym8sxasym8s.c \
+    $(NNLIB_PATH)/algo/kernels/matXvec/hifi5/xa_nn_matmul_sym8sxasym8s.c

  INCLUDES += \
    -I$(NNLIB_PATH)/ \

--- a/tensorflow/lite/micro/tools/make/ext_libs/xtensa_depthwise_patch_hifi5.patch
+++ b/tensorflow/lite/micro/tools/make/ext_libs/xtensa_depthwise_patch_hifi5.patch
+diff --git a/algo/kernels/cnn/hifi5/xa_nn_conv2d_depthwise.c b/algo/kernels/cnn/hifi5/xa_nn_conv2d_depthwise.c
+index 48fccdc..3c10040 100644
+--- a/algo/kernels/cnn/hifi5/xa_nn_conv2d_depthwise.c
+++ b/algo/kernels/cnn/hifi5/xa_nn_conv2d_depthwise.c
+@@ -348,8 +348,6 @@ WORD32 xa_nn_conv2d_depthwise_getsize
+   XA_NNLIB_CHK_COND((kernel_height <= 0), -1);
+   XA_NNLIB_CHK_COND((kernel_width <= 0), -1);
+   XA_NNLIB_CHK_COND((channels_multiplier <= 0), -1);
+-  XA_NNLIB_CHK_COND((x_stride <= 0 || x_stride > kernel_width), -1); //TODO: x_stride > kernel_width is supported ?
+-  XA_NNLIB_CHK_COND((y_stride <= 0 || y_stride > kernel_height), -1);
+   XA_NNLIB_CHK_COND((x_padding < 0), -1);
+   XA_NNLIB_CHK_COND((y_padding < 0), -1);
+   XA_NNLIB_CHK_COND((output_height <= 0), -1);
+diff --git a/algo/kernels/cnn/hifi5/xa_nn_conv2d_depthwise_sym8sxasym8s.c b/algo/kernels/cnn/hifi5/xa_nn_conv2d_depthwise_sym8sxasym8s.c
+index 71611f4..0fd19ae 100644
+--- a/algo/kernels/cnn/hifi5/xa_nn_conv2d_depthwise_sym8sxasym8s.c
+++ b/algo/kernels/cnn/hifi5/xa_nn_conv2d_depthwise_sym8sxasym8s.c
+@@ -2122,10 +2122,7 @@ WORD32 xa_nn_conv2d_depthwise_nhwc_per_chan_sym8sxasym8s_k3x3
+   /* Implementation dependent checks */
+   //TOOD: support y_stride 2
+   XA_NNLIB_ARG_CHK_COND((y_stride != 1) && (y_stride != 2), -1);
+-  XA_NNLIB_ARG_CHK_COND((y_stride > kernel_height), -1);
+-  XA_NNLIB_ARG_CHK_COND((x_stride > kernel_width), -1);
+   XA_NNLIB_ARG_CHK_COND((kernel_height > input_height), -1);
+-  XA_NNLIB_ARG_CHK_COND((kernel_width > input_width), -1);
+ 
+ #ifndef DISABLE_DEPTHWISE_CONV2D_K3X3_SPECIAL_CASE
+   WORD32 input_zero_bias_neg = -input_zero_bias;
+@@ -2271,7 +2268,6 @@ WORD32 xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s_generic
+   XA_NNLIB_ARG_CHK_COND((input_channels <= 0), -1);
+   XA_NNLIB_ARG_CHK_COND((kernel_height <= 0 || kernel_width <= 0), -1);
+   XA_NNLIB_ARG_CHK_COND((kernel_height > input_height), -1);
+-  XA_NNLIB_ARG_CHK_COND((kernel_width > input_width), -1);
+   XA_NNLIB_ARG_CHK_COND((channels_multiplier <= 0), -1);
+   XA_NNLIB_ARG_CHK_COND((y_stride <= 0 || x_stride <= 0), -1);
+   XA_NNLIB_ARG_CHK_COND((y_padding < 0 || x_padding < 0), -1);
+@@ -2282,8 +2278,6 @@ WORD32 xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s_generic
+   XA_NNLIB_ARG_CHK_COND((inp_data_format != 0 && inp_data_format != 1), -1);
+   XA_NNLIB_ARG_CHK_COND((out_data_format != 0), -1);
+   /* Implementation dependent checks */
+-  XA_NNLIB_ARG_CHK_COND((y_stride > kernel_height), -1);
+-  XA_NNLIB_ARG_CHK_COND((x_stride > kernel_width), -1);
+ 
+   if(inp_data_format == 0)
+   {
--- a/tensorflow/lite/micro/tools/make/ext_libs/xtensa_download.sh
+++ b/tensorflow/lite/micro/tools/make/ext_libs/xtensa_download.sh
@@ -81,6 +81,17 @@ else
    popd >&2
  fi

+  if [[ ${2} == "hifi5" ]]; then  # Second script argument selects the hifi5 variant.
+    pushd ${DOWNLOADS_DIR}/xa_nnlib_hifi5/ >&2
+    git init . >&2  # Turn the downloaded sources into a throwaway local repository.
+    git config user.email "tflm@google.com"  # Repository-local identity used for the commit below.
+    git config user.name "TensorflowLite Micro"
+    git add *
+    git commit -a -m "Commit for a temporary repository." > /dev/null  # Keep stdout clean; the script ends by printing SUCCESS.
+    git apply ../../ext_libs/xtensa_depthwise_patch_hifi5.patch  # Path is relative to the xa_nnlib_hifi5 directory entered above.
+    popd >&2
+  fi
+
 fi

 echo "SUCCESS"
--- a/tensorflow/lite/micro/tools/make/ext_libs/xtensa_patch.patch
+++ b/tensorflow/lite/micro/tools/make/ext_libs/xtensa_patch.patch
+diff --git a/algo/kernels/basic/hifi4/xa_nn_dot_prod_16x16.c b/algo/kernels/basic/hifi4/xa_nn_dot_prod_16x16.c
+index 0c58a3c..7439d6d 100644
+--- a/algo/kernels/basic/hifi4/xa_nn_dot_prod_16x16.c
+++ b/algo/kernels/basic/hifi4/xa_nn_dot_prod_16x16.c
+@@ -90,8 +90,8 @@ WORD32 xa_nn_dot_prod_16x16_asym8s(
+   ae_int32x2 d_bias;
+   int i;
+ 
+-  /* inp1 and inp2 8-byte aligned case */
+-  if(((vec_length & 3) == 0) && (((int)p_inp1_start & 7) == 0) && (((int)p_inp2_start & 7) == 0))
+ /* handle cases where vec_length is multiple of 8 */
+  if(vec_length == 8)
+   {
+     /* Assumption: 
+      * p_inp1_start - memory is continuous => vec_count1 end and vect_count2 start are continuous 
+@@ -100,19 +100,21 @@ WORD32 xa_nn_dot_prod_16x16_asym8s(
+     pt_inp1 = (const ae_int16x4 *)((WORD16 *)p_inp1_start);
+     pt_inp2 = (const ae_int16x4 *)((WORD16 *)p_inp2_start);
+ 
+    align_inp1 = AE_LA64_PP(pt_inp1);
+    align_inp2 = AE_LA64_PP(pt_inp2);
+     /* TBD: multiple vec_count processing in a single loop can be done */
+     for(loopcnt = 0; loopcnt < vec_count; loopcnt++)
+     {
+       AE_L32_XP(d_bias, (ae_int32 *)p_bias_load, bias_address_increment);
+ 
+       d_out64_0 = ZERO64;
+      AE_LA16X4_IP(d_inp1_0, align_inp1, pt_inp1);
+      AE_LA16X4_IP(d_inp2_0, align_inp2, pt_inp2);
+      AE_MULAAAAQ16(d_out64_0, d_inp1_0, d_inp2_0);
+      AE_LA16X4_IP(d_inp1_0, align_inp1, pt_inp1);
+      AE_LA16X4_IP(d_inp2_0, align_inp2, pt_inp2);
+      AE_MULAAAAQ16(d_out64_0, d_inp1_0, d_inp2_0);
+ 
+-      for(i = 0; i < (vec_length >> 2); i++)
+-      {
+-        AE_L16X4_IP(d_inp1_0, pt_inp1, 8);
+-        AE_L16X4_IP(d_inp2_0, pt_inp2, 8);
+-        AE_MULAAAAQ16(d_out64_0, d_inp1_0, d_inp2_0);
+-      }
+       AE_SAT32X2_HIFI4(d_out32, d_out64_0);
+       d_out32 = AE_ADD32S(d_out32, d_bias);
+ 
+@@ -122,8 +124,7 @@ WORD32 xa_nn_dot_prod_16x16_asym8s(
+       *p_out++ = (WORD8)AE_MOVAD32_L(d_out32);
+     }
+   }
+-  /* handle cases where vec_length is multiple of 8 */
+-  else if(vec_length == 8)
+  else if(vec_length == 32)
+   {
+     /* Assumption: 
+      * p_inp1_start - memory is continuous => vec_count1 end and vect_count2 start are continuous 
+@@ -140,13 +141,13 @@ WORD32 xa_nn_dot_prod_16x16_asym8s(
+       AE_L32_XP(d_bias, (ae_int32 *)p_bias_load, bias_address_increment);
+ 
+       d_out64_0 = ZERO64;
+-      AE_LA16X4_IP(d_inp1_0, align_inp1, pt_inp1);
+-      AE_LA16X4_IP(d_inp2_0, align_inp2, pt_inp2);
+-      AE_MULAAAAQ16(d_out64_0, d_inp1_0, d_inp2_0);
+-      AE_LA16X4_IP(d_inp1_0, align_inp1, pt_inp1);
+-      AE_LA16X4_IP(d_inp2_0, align_inp2, pt_inp2);
+-      AE_MULAAAAQ16(d_out64_0, d_inp1_0, d_inp2_0);
+-
+#pragma loop_count min=3
+      for(i = 0; i < (vec_length >> 2); i++)
+      {
+        AE_LA16X4_IP(d_inp1_0, align_inp1, pt_inp1);
+        AE_LA16X4_IP(d_inp2_0, align_inp2, pt_inp2);
+        AE_MULAAAAQ16(d_out64_0, d_inp1_0, d_inp2_0);
+      }
+       AE_SAT32X2_HIFI4(d_out32, d_out64_0);
+       d_out32 = AE_ADD32S(d_out32, d_bias);
+ 
+@@ -156,7 +157,8 @@ WORD32 xa_nn_dot_prod_16x16_asym8s(
+       *p_out++ = (WORD8)AE_MOVAD32_L(d_out32);
+     }
+   }
+-  else if(vec_length == 32)
+  /* inp1 and inp2 8-byte aligned case */
+  else if(((vec_length & 3) == 0) && (((int)p_inp1_start & 7) == 0) && (((int)p_inp2_start & 7) == 0))
+   {
+     /* Assumption: 
+      * p_inp1_start - memory is continuous => vec_count1 end and vect_count2 start are continuous 
+@@ -165,19 +167,17 @@ WORD32 xa_nn_dot_prod_16x16_asym8s(
+     pt_inp1 = (const ae_int16x4 *)((WORD16 *)p_inp1_start);
+     pt_inp2 = (const ae_int16x4 *)((WORD16 *)p_inp2_start);
+ 
+-    align_inp1 = AE_LA64_PP(pt_inp1);
+-    align_inp2 = AE_LA64_PP(pt_inp2);
+     /* TBD: multiple vec_count processing in a single loop can be done */
+     for(loopcnt = 0; loopcnt < vec_count; loopcnt++)
+     {
+       AE_L32_XP(d_bias, (ae_int32 *)p_bias_load, bias_address_increment);
+ 
+       d_out64_0 = ZERO64;
+-#pragma loop_count min=3
+
+       for(i = 0; i < (vec_length >> 2); i++)
+       {
+-        AE_LA16X4_IP(d_inp1_0, align_inp1, pt_inp1);
+-        AE_LA16X4_IP(d_inp2_0, align_inp2, pt_inp2);
+        AE_L16X4_IP(d_inp1_0, pt_inp1, 8);
+        AE_L16X4_IP(d_inp2_0, pt_inp2, 8);
+         AE_MULAAAAQ16(d_out64_0, d_inp1_0, d_inp2_0);
+       }
+       AE_SAT32X2_HIFI4(d_out32, d_out64_0);
+@@ -189,7 +189,7 @@ WORD32 xa_nn_dot_prod_16x16_asym8s(
+       *p_out++ = (WORD8)AE_MOVAD32_L(d_out32);
+     }
+   }
+-  else if(((vec_length & 3) == 0) && (((int)p_inp1_start & 7) == 0))
+   else if(((vec_length & 3) == 0) && (((int)p_inp1_start & 7) == 0))
+   {
+     /* Assumption: 
+      * p_inp1_start - memory is continuous => vec_count1 end and vect_count2 start are continuous 
 diff --git a/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise.c b/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise.c
 index 3e29856..320987b 100644
 --- a/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise.c

--- a/tensorflow/lite/micro/tools/make/person_detection_int8_download.sh
+++ b/tensorflow/lite/micro/tools/make/person_detection_int8_download.sh
+#!/bin/bash
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Called with following arguments:
+# 1 - Path to the downloads folder which is typically
+#     tensorflow/lite/micro/tools/make/downloads
+#
+# This script is called from the Makefile and uses the following convention to
+# enable determination of success/failure:
+#
+#   - If the script is successful, the only output on stdout should be SUCCESS.
+#     The makefile checks for this particular string.
+#
+#   - Any string on stdout that is not SUCCESS will be shown in the makefile as
+#     the cause for the script to have failed.
+#
+#   - Any other informational prints should be on stderr.
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR=${SCRIPT_DIR}/../../../../..
+cd "${ROOT_DIR}"
+
+source tensorflow/lite/micro/tools/make/bash_helpers.sh
+
+DOWNLOADS_DIR=${1}
+if [ ! -d ${DOWNLOADS_DIR} ]; then
+  echo "The top-level downloads directory: ${DOWNLOADS_DIR} does not exist."
+  exit 1
+fi
+
+DOWNLOADED_PERSON_MODEL_INT8_PATH=${DOWNLOADS_DIR}/person_model_int8
+if [ -d ${DOWNLOADED_PERSON_MODEL_INT8_PATH} ]; then
+  echo >&2 "${DOWNLOADED_PERSON_MODEL_INT8_PATH} already exists, skipping the download."
+else
+  PERSON_MODEL_INT8_URL=https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_12_1.zip
+  EXPECTED_MD5=e765cc76889db8640cfe876a37e4ec00
+
+  TEMPFILE=$(mktemp -d)/temp_file
+  wget ${PERSON_MODEL_INT8_URL} -O ${TEMPFILE} >&2
+  check_md5 ${TEMPFILE} ${EXPECTED_MD5}
+  unzip ${TEMPFILE} -d ${DOWNLOADS_DIR} >&2
+
+fi
+
+echo "SUCCESS"
--- a/tensorflow/lite/micro/tools/make/targets/cortex_m_corstone_300_makefile.inc
+++ b/tensorflow/lite/micro/tools/make/targets/cortex_m_corstone_300_makefile.inc
@@ -23,6 +23,12 @@ ifeq ($(CMSIS_PATH), $(CMSIS_DEFAULT_DOWNLOAD_PATH))
  endif
 endif

+# Convert the downloaded person detection int8 model at makefile parse time; any script output other than SUCCESS aborts the build.
+DOWNLOAD_RESULT := $(shell $(MAKEFILE_DIR)/ext_libs/person_detection_int8_vela_convert.sh ${MAKEFILE_DIR}/downloads)
+ifneq ($(DOWNLOAD_RESULT), SUCCESS)
+  $(error Something went wrong with the person detection int8 model conversion: $(DOWNLOAD_RESULT))
+endif
+
 FLOAT := soft
 GCC_TARGET_ARCH := $(TARGET_ARCH)

@@ -129,6 +135,9 @@ ARM_CPU := $(subst +nofp,,$(ARM_CPU))
 CXXFLAGS += -D$(ARM_CPU)$(CMSIS_ARM_FEATURES)
 CCFLAGS += -D$(ARM_CPU)$(CMSIS_ARM_FEATURES)

+# For the Ethos-U Core Driver. The header file name depends on the target architecture.
+CXXFLAGS += -DCMSIS_DEVICE_ARM_CORTEX_M_XX_HEADER_FILE=\"$(ARM_CPU).h\"
+
 THIRD_PARTY_CC_SRCS += \
  $(ETHOS_U_CORE_PLATFORM)/retarget.c \
  $(ETHOS_U_CORE_PLATFORM)/uart.c
@@ -159,7 +168,6 @@ MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS))
 EXCLUDED_EXAMPLE_TESTS := \
  tensorflow/lite/micro/examples/magic_wand/Makefile.inc \
  tensorflow/lite/micro/examples/micro_speech/Makefile.inc \
-  tensorflow/lite/micro/examples/person_detection/Makefile.inc \
  tensorflow/lite/micro/examples/hello_world/Makefile.inc \
  tensorflow/lite/micro/examples/image_recognition_experimental/Makefile.inc
 MICRO_LITE_EXAMPLE_TESTS := $(filter-out $(EXCLUDED_EXAMPLE_TESTS), $(MICRO_LITE_EXAMPLE_TESTS))

--- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc
+++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc
@@ -57,8 +57,8 @@ EMBARC_MLI_PRE_COMPILED_MD5 := "173990c2dde4efef6a2c95b92d1f0244"
 ZEPHYR_URL := "http://mirror.tensorflow.org/github.com/antmicro/zephyr/archive/55e36b9.zip"
 ZEPHYR_MD5 := "755622eb4812fde918a6382b65d50c3b"

-ETHOSU_URL := "https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git/snapshot/ethos-u-core-driver-2b201c340788ac582cec160b7217c2b5405b04f9.tar.gz"
-ETHOSU_MD5 := "0c148b90a1ee01de398892eb3a63e717"
+ETHOSU_URL := "https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git/snapshot/ethos-u-core-driver-dfed5fd24699246811300a05ebc421a978e52149.tar.gz"
+ETHOSU_MD5 := "b7ded51a4186daa711c2517c4ab23901"

 HIMAX_WE1_SDK_URL ="https://www.himax.com.tw/we-i/himax_we1_sdk_v04.zip"
 HIMAX_WE1_SDK_MD5 ="40b3ccb3c2e41210fe5c970d61e7e7d3"