diff --git a/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc b/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc index 61979bea388c32860f2b9c472dac751ae3b0c884..986b10570c32fd8b74c8f8ba04f197833a24c7bd 100644 --- a/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc +++ b/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/lite/micro/benchmarks/micro_benchmark.h" #include "tensorflow/lite/micro/kernels/fully_connected.h" #include "tensorflow/lite/micro/kernels/softmax.h" +#include "tensorflow/lite/micro/kernels/svdf.h" #include "tensorflow/lite/micro/micro_error_reporter.h" #include "tensorflow/lite/micro/micro_mutable_op_resolver.h" #include "tensorflow/lite/micro/micro_profiler.h" @@ -55,7 +56,7 @@ KeywordBenchmarkRunner* CreateBenchmarkRunner(MicroProfiler* profiler) { op_resolver->AddFullyConnected(tflite::Register_FULLY_CONNECTED_INT8()); op_resolver->AddQuantize(); op_resolver->AddSoftmax(tflite::Register_SOFTMAX_INT8_INT16()); - op_resolver->AddSvdf(); + op_resolver->AddSvdf(tflite::Register_SVDF_INT8()); return new (benchmark_runner_buffer) KeywordBenchmarkRunner(g_keyword_scrambled_model_data, op_resolver, diff --git a/tensorflow/lite/micro/kernels/fully_connected.h b/tensorflow/lite/micro/kernels/fully_connected.h index 0b672e43b009f2470f901c7de2b167f238c448cc..e1215da61ba1c352aaa11b0de9489811040fb9c4 100644 --- a/tensorflow/lite/micro/kernels/fully_connected.h +++ b/tensorflow/lite/micro/kernels/fully_connected.h @@ -65,14 +65,9 @@ TfLiteStatus CalculateOpDataFullyConnected( // (reference or optimized) must define this function. TfLiteRegistration Register_FULLY_CONNECTED(); -#if defined(CMSIS_NN) -// The Arduino is a special case where we use the CMSIS kernels, but because of -// the current approach to building for Arduino, we do not support -DCMSIS_NN as -// part of the build. As a result, we use defined(ARDUINO) as proxy for the -// CMSIS kernels for this one special case. - -// Returns a TfLiteRegistration struct for cmsis_nn kernel variant that only -// supports int8. +#if defined(CMSIS_NN) || defined(HEXAGON) +// Returns a TfLiteRegistration struct for kernel variant that only supports +// int8. 
TfLiteRegistration Register_FULLY_CONNECTED_INT8(); #else diff --git a/tensorflow/lite/micro/kernels/svdf.cc b/tensorflow/lite/micro/kernels/svdf.cc index cd22e31b1f07bd747b7389e01cd71dd11b1ed197..ae2998eaff318ce5deb61ed3032d7042c8ab97b9 100644 --- a/tensorflow/lite/micro/kernels/svdf.cc +++ b/tensorflow/lite/micro/kernels/svdf.cc @@ -33,13 +33,13 @@ namespace { void* Init(TfLiteContext* context, const char* buffer, size_t length) { TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); - return context->AllocatePersistentBuffer(context, sizeof(OpData)); + return context->AllocatePersistentBuffer(context, sizeof(OpDataSvdf)); } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data); TFLITE_DCHECK(node->user_data != nullptr); - const OpData& data = *(static_cast<const OpData*>(node->user_data)); + const OpDataSvdf& data = *(static_cast<const OpDataSvdf*>(node->user_data)); const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, kSvdfInputTensor); diff --git a/tensorflow/lite/micro/kernels/svdf.h b/tensorflow/lite/micro/kernels/svdf.h index d04787be9cf3bf04d3fb84e38ac5a9bcb9a47ec7..8a7269f33a3a27a25f4a69d636fa4ea5d69afbf3 100644 --- a/tensorflow/lite/micro/kernels/svdf.h +++ b/tensorflow/lite/micro/kernels/svdf.h @@ -20,7 +20,7 @@ limitations under the License. namespace tflite { -struct OpData { +struct OpDataSvdf { int32_t effective_scale_1_a; int32_t effective_scale_2_a; // b versions of each scale are kept at int since the numbers are just the @@ -55,7 +55,7 @@ void EvalIntegerSvdfReference(TfLiteContext* context, TfLiteNode* node, const TfLiteSVDFParams* params, TfLiteEvalTensor* activation_state_tensor, TfLiteEvalTensor* output_tensor, - const OpData& data); + const OpDataSvdf& data); void EvalFloatSvdfReference( TfLiteContext* context, TfLiteNode* node, const TfLiteEvalTensor* input, @@ -66,6 +66,23 @@ void EvalFloatSvdfReference( TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node); +// This is the most generic TfLiteRegistration. The actual supported types may +// still be target dependent. The only requirement is that every implementation +// (reference or optimized) must define this function. +TfLiteRegistration Register_SVDF(); + +#if defined(HEXAGON) +TfLiteRegistration Register_SVDF_INT8(); + +#else +// Note that while this block gets used for both reference and optimized kernels +// that do not have any specialized implementations, the only goal here is to +// define fallback implementations that allow reference kernels to still be used +// from applications that call a more specific kernel variant.
+ +inline TfLiteRegistration Register_SVDF_INT8() { return Register_SVDF(); } + +#endif } // namespace tflite #endif // TENSORFLOW_LITE_MICRO_KERNELS_SVDF_H_ diff --git a/tensorflow/lite/micro/kernels/svdf_common.cc b/tensorflow/lite/micro/kernels/svdf_common.cc index 12e697b1461aad3faf3a8e61563243561c49d0e6..4f62b519bcd3cb1acbe11da360737bef25b64fee 100644 --- a/tensorflow/lite/micro/kernels/svdf_common.cc +++ b/tensorflow/lite/micro/kernels/svdf_common.cc @@ -56,7 +56,7 @@ void EvalIntegerSvdfReference(TfLiteContext* context, TfLiteNode* node, const TfLiteSVDFParams* params, TfLiteEvalTensor* activation_state_tensor, TfLiteEvalTensor* output_tensor, - const OpData& data) { + const OpDataSvdf& data) { const int n_rank = params->rank; const int n_batch = input_tensor->dims->data[0]; const int n_input = input_tensor->dims->data[1]; @@ -401,7 +401,7 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, node->inputs->size, 5); TFLITE_DCHECK(node->user_data != nullptr); - OpData* data = static_cast<OpData*>(node->user_data); + OpDataSvdf* data = static_cast<OpDataSvdf*>(node->user_data); if (input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteInt8); diff --git a/tensorflow/lite/micro/kernels/xtensa/svdf.cc b/tensorflow/lite/micro/kernels/xtensa/svdf.cc index 274d8d4520c4c3ba08c066ee881e03527e1fb4e6..e26a1affc9f70e3263f9eeb0c93b8d6dce6a0744 100644 --- a/tensorflow/lite/micro/kernels/xtensa/svdf.cc +++ b/tensorflow/lite/micro/kernels/xtensa/svdf.cc @@ -48,7 +48,7 @@ constexpr int kOutputTensor = 0; * This version of SVDF is specific to TFLite Micro. It contains only a full * integer recipe with optimizations for the Xtensa HiFiMini platform. * - * Note: passing OpData by value might seem like an oversight but it helps + * Note: passing OpDataSvdf by value might seem like an oversight but it helps * reduce the latency. See b/155656675 for more details.
*/ void EvalIntegerSvdfHifimini(TfLiteContext* context, TfLiteNode* node, @@ -58,7 +58,7 @@ void EvalIntegerSvdfHifimini(TfLiteContext* context, TfLiteNode* node, const TfLiteEvalTensor* bias_tensor, const TfLiteSVDFParams* params, TfLiteEvalTensor* activation_state_tensor, - TfLiteEvalTensor* output_tensor, OpData data) { + TfLiteEvalTensor* output_tensor, OpDataSvdf data) { const int n_rank = params->rank; const int n_batch = input_tensor->dims->data[0]; const int n_input = input_tensor->dims->data[1]; @@ -254,7 +254,7 @@ TfLiteStatus EvalIntegerSvdfHifi(TfLiteContext* context, TfLiteNode* node, const TfLiteSVDFParams* params, TfLiteEvalTensor* activation_state_tensor, TfLiteEvalTensor* output_tensor, - const OpData& data) { + const OpDataSvdf& data) { const int n_rank = params->rank; const int n_batch = input_tensor->dims->data[0]; const int n_input = input_tensor->dims->data[1]; @@ -321,7 +321,7 @@ TfLiteStatus EvalIntegerSvdfHifi(TfLiteContext* context, TfLiteNode* node, void* Init(TfLiteContext* context, const char* buffer, size_t length) { TFLITE_DCHECK(context != nullptr); - return context->AllocatePersistentBuffer(context, sizeof(OpData)); + return context->AllocatePersistentBuffer(context, sizeof(OpDataSvdf)); } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { @@ -422,7 +422,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { 1e-5); TFLITE_DCHECK(node->user_data != nullptr); - OpData* data = static_cast(node->user_data); + OpDataSvdf* data = static_cast(node->user_data); #if defined(HIFIMINI) QuantizeMultiplierForInt24(effective_scale_1, &data->effective_scale_1_a, @@ -471,7 +471,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetEvalOutput(context, node, kOutputTensor); TFLITE_DCHECK(node->user_data != nullptr); - const OpData& data = *(static_cast(node->user_data)); + const OpDataSvdf& data = *(static_cast(node->user_data)); #if defined(HIFIMINI) EvalIntegerSvdfHifimini(context, node, input, weights_feature, weights_time, diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index fe0db48aee856dab556e4928fbf74237b6ab903e..982a930264154dfe91449b6353b0b88dc43ea47d 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -498,8 +498,9 @@ class MicroMutableOpResolver : public MicroOpResolver { ParseSub); } - TfLiteStatus AddSvdf() { - return AddBuiltin(BuiltinOperator_SVDF, Register_SVDF(), ParseSvdf); + TfLiteStatus AddSvdf( + const TfLiteRegistration& registration = Register_SVDF()) { + return AddBuiltin(BuiltinOperator_SVDF, registration, ParseSvdf); } TfLiteStatus AddTanh() { diff --git a/tensorflow/lite/micro/tools/make/ext_libs/hexagon.inc b/tensorflow/lite/micro/tools/make/ext_libs/hexagon.inc index f84da5d3b1cade2ba37d77f1a937df26d67ecf1d..7ab009ca1fb4a87a5c4b3fbb92ce7a9d7fa8f8b2 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/hexagon.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/hexagon.inc @@ -1,3 +1,7 @@ +MICROLITE_CC_KERNEL_SRCS += \ +tensorflow/lite/micro/kernels/hexagon/fully_connected_int8.cc \ +tensorflow/lite/micro/kernels/hexagon/svdf_int8.cc + # Full path to the hexagon_tflm static library. 
HEXAGON_TFLM_LIB := diff --git a/third_party/hexagon/fully_connected.cc b/third_party/hexagon/fully_connected.cc index 9e4a108f86391bb0e3dc5f92c55800880f3727b3..05ae1c347be1f8794daae2d3c696e759c7d1d5a4 100644 --- a/third_party/hexagon/fully_connected.cc +++ b/third_party/hexagon/fully_connected.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -52,181 +52,14 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/hexagon/hexagon_fully_connected.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" -#include "third_party/hexagon/hexagon_tflm_translation_fully_connected.h" -namespace tflite { -namespace { - -// Input tensors. -constexpr int kInputTensor = 0; -constexpr int kWeightsTensor = 1; -constexpr int kBiasTensor = 2; -// Output tensor. -constexpr int kOutputTensor = 0; - -struct OpData { - // The scaling factor from input to output (aka the 'real multiplier') can - // be represented as a fixed point multiplier plus a left shift. - int32_t output_multiplier; - int output_shift; - // The range of the fused activation layer. For example for kNone and - // uint8_t these would be 0 and 255. - int32_t output_activation_min; - int32_t output_activation_max; - // The index of the temporary tensor where the quantized inputs are cached. - int input_quantized_index; - // Cached zero point values of tensors. 
- int32_t input_zero_point; - int32_t filter_zero_point; - int32_t output_zero_point; - - void* hexagon_data; -}; - -TfLiteStatus CalculateOpData(TfLiteContext* context, - TfLiteFusedActivation activation, - TfLiteType data_type, const TfLiteTensor* input, - const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* output, - OpData* data) { - TfLiteStatus status = kTfLiteOk; - if (data_type != kTfLiteFloat32) { - double real_multiplier = 0.0; - TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( - context, input, filter, bias, output, &real_multiplier)); - int exponent; - QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent); - data->output_shift = -exponent; - TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized( - context, activation, output, &data->output_activation_min, - &data->output_activation_max)); - - data->input_zero_point = input->params.zero_point; - data->filter_zero_point = filter->params.zero_point; - data->output_zero_point = output->params.zero_point; - } - return status; -} - -void* Init(TfLiteContext* context, const char* buffer, size_t length) { - TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); - void* data = nullptr; - data = context->AllocatePersistentBuffer(context, sizeof(OpData)); - - if (data == nullptr) { - return nullptr; - } - OpData* opdata = static_cast(data); - opdata->hexagon_data = - tflite::hexagon_fully_connected::HexagonInit(context, buffer, length); - - return data; -} - -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TFLITE_DCHECK(node->user_data != nullptr); - TFLITE_DCHECK(node->builtin_data != nullptr); - - OpData* data = static_cast(node->user_data); - const auto params = - static_cast(node->builtin_data); - - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor); - TF_LITE_ENSURE(context, filter != nullptr); - const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE(context, output != nullptr); +#include "hexagon_tflm_translation_fully_connected.h" - TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); - TF_LITE_ENSURE_MSG(context, input->type == filter->type, - "Hybrid models are not supported on TFLite Micro."); - - tflite::hexagon_fully_connected::HexagonOptimizationEvaluation(context, node); - - if (tflite::hexagon_fully_connected::HexagonOptimizable(context, node)) { - return tflite::hexagon_fully_connected::HexagonPrepare(context, node); - } else { - return CalculateOpData(context, params->activation, input->type, input, - filter, bias, output, data); - } -} - -TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, - const OpData& data, - const TfLiteEvalTensor* input, - const TfLiteEvalTensor* filter, - const TfLiteEvalTensor* bias, - TfLiteEvalTensor* output) { - tflite::FullyConnectedParams op_params; - op_params.input_offset = -data.input_zero_point; - op_params.weights_offset = -data.filter_zero_point; - op_params.output_offset = data.output_zero_point; - op_params.output_multiplier = data.output_multiplier; - // TODO(b/138810107): Figure out whether output shift should be inverted - op_params.output_shift = -data.output_shift; - op_params.quantized_activation_min = data.output_activation_min; - op_params.quantized_activation_max = data.output_activation_max; - - reference_integer_ops::FullyConnected( - 
op_params, tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(filter), - tflite::micro::GetTensorData(filter), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetTensorData(bias), - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output)); - - return kTfLiteOk; -} - -TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, - const OpData& data, const TfLiteEvalTensor* input, - const TfLiteEvalTensor* filter, - const TfLiteEvalTensor* bias, - TfLiteEvalTensor* output) { - const int32_t input_offset = -data.input_zero_point; - const int32_t filter_offset = -data.filter_zero_point; - const int32_t output_offset = data.output_zero_point; - - tflite::FullyConnectedParams op_params; - op_params.input_offset = input_offset; - op_params.weights_offset = filter_offset; - op_params.output_offset = output_offset; - op_params.output_multiplier = data.output_multiplier; - // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. - op_params.output_shift = -data.output_shift; - op_params.quantized_activation_min = data.output_activation_min; - op_params.quantized_activation_max = data.output_activation_max; - -#define TF_LITE_FULLY_CONNECTED(output_data_type) \ - reference_ops::FullyConnected( \ - op_params, tflite::micro::GetTensorShape(input), \ - tflite::micro::GetTensorData(input), \ - tflite::micro::GetTensorShape(filter), \ - tflite::micro::GetTensorData(filter), \ - tflite::micro::GetTensorShape(bias), \ - tflite::micro::GetTensorData(bias), \ - tflite::micro::GetTensorShape(output), \ - tflite::micro::GetTensorData(output)) - switch (output->type) { - case kTfLiteUInt8: - TF_LITE_FULLY_CONNECTED(uint8_t); - break; - case kTfLiteInt16: - TF_LITE_FULLY_CONNECTED(int16_t); - break; - default: - TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", - TfLiteTypeGetName(output->type), output->type); - return kTfLiteError; - } +namespace tflite { - return kTfLiteOk; -} +namespace { TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, TfLiteFusedActivation activation, @@ -251,22 +84,23 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, return kTfLiteOk; } -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { +} // namespace + +TfLiteStatus HexagonFullyConnectedEval(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->builtin_data != nullptr); const auto* params = static_cast(node->builtin_data); const TfLiteEvalTensor* input = - tflite::micro::GetEvalInput(context, node, kInputTensor); + tflite::micro::GetEvalInput(context, node, kFullyConnectedInputTensor); const TfLiteEvalTensor* filter = - tflite::micro::GetEvalInput(context, node, kWeightsTensor); + tflite::micro::GetEvalInput(context, node, kFullyConnectedWeightsTensor); const TfLiteEvalTensor* bias = - tflite::micro::GetEvalInput(context, node, kBiasTensor); + tflite::micro::GetEvalInput(context, node, kFullyConnectedBiasTensor); TfLiteEvalTensor* output = - tflite::micro::GetEvalOutput(context, node, kOutputTensor); + tflite::micro::GetEvalOutput(context, node, kFullyConnectedOutputTensor); TFLITE_DCHECK(node->user_data != nullptr); - const OpData& data = *(static_cast(node->user_data)); // Checks in Prepare ensure input, output and filter types are all the same. 
switch (input->type) { @@ -275,16 +109,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { output); case kTfLiteInt8: - if (tflite::hexagon_fully_connected::HexagonOptimizable(context, node)) { - return tflite::hexagon_fully_connected::HexagonEvalQuantizedInt8( - context, node, node->user_data, input, filter, bias, output); - } else { - return EvalQuantizedInt8(context, node, data, input, filter, bias, - output); - } - - case kTfLiteUInt8: - return EvalQuantized(context, node, data, input, filter, bias, output); + return HexagonFullyConnectedEvalInt8(context, node); default: TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", @@ -294,13 +119,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -} // namespace - TfLiteRegistration Register_FULLY_CONNECTED() { - return {/*init=*/Init, + return {/*init=*/HexagonFullyConnectedInit, /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, + /*prepare=*/HexagonFullyConnectedPrepare, + /*invoke=*/HexagonFullyConnectedEval, /*profiling_string=*/nullptr, /*builtin_code=*/0, /*custom_name=*/nullptr, diff --git a/third_party/hexagon/fully_connected_int8.cc b/third_party/hexagon/fully_connected_int8.cc new file mode 100644 index 0000000000000000000000000000000000000000..85621560b6d5128115ea2705b260e8e874778a81 --- /dev/null +++ b/third_party/hexagon/fully_connected_int8.cc @@ -0,0 +1,188 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/* Copyright 2020 The Qualcomm Innovation Center, Inc. All Rights Reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted (subject to the limitations in the disclaimer +below) provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of Qualcomm Innovation Center, Inc. nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT +NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +==============================================================================*/ + +#include "hexagon_tflm_translation_fully_connected.h" +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/fully_connected.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/fully_connected.h" +#include "tensorflow/lite/micro/kernels/hexagon/hexagon_fully_connected.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" + +namespace tflite { +namespace { + +TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, + const HexagonOpDataFullyConnected& data, + const TfLiteEvalTensor* input, + const TfLiteEvalTensor* filter, + const TfLiteEvalTensor* bias, + TfLiteEvalTensor* output) { + tflite::FullyConnectedParams op_params; + op_params.input_offset = -data.reference_op_data.input_zero_point; + op_params.weights_offset = -data.reference_op_data.filter_zero_point; + op_params.output_offset = data.reference_op_data.output_zero_point; + op_params.output_multiplier = data.reference_op_data.output_multiplier; + // TODO(b/138810107): Figure out whether output shift should be inverted + op_params.output_shift = -data.reference_op_data.output_shift; + op_params.quantized_activation_min = + data.reference_op_data.output_activation_min; + op_params.quantized_activation_max = + data.reference_op_data.output_activation_max; + + reference_integer_ops::FullyConnected( + op_params, tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + + return kTfLiteOk; +} + +} // namespace + +void* HexagonFullyConnectedInit(TfLiteContext* context, const char* buffer, + size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + void* data = nullptr; + data = context->AllocatePersistentBuffer(context, + sizeof(HexagonOpDataFullyConnected)); + + if (data == nullptr) { + return nullptr; + } + HexagonOpDataFullyConnected* opdata = + static_cast(data); + opdata->hexagon_data = + tflite::hexagon_fully_connected::HexagonInit(context, buffer, length); + + return data; +} + +TfLiteStatus HexagonFullyConnectedPrepare(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + HexagonOpDataFullyConnected* data = + static_cast(node->user_data); + const auto params = + static_cast(node->builtin_data); + + const TfLiteTensor* input = + GetInput(context, node, 
kFullyConnectedInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + const TfLiteTensor* filter = + GetInput(context, node, kFullyConnectedWeightsTensor); + TF_LITE_ENSURE(context, filter != nullptr); + const TfLiteTensor* bias = + GetOptionalInputTensor(context, node, kFullyConnectedBiasTensor); + TfLiteTensor* output = GetOutput(context, node, kFullyConnectedOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); + TF_LITE_ENSURE_MSG(context, input->type == filter->type, + "Hybrid models are not supported on TFLite Micro."); + + tflite::hexagon_fully_connected::HexagonOptimizationEvaluation(context, node); + + if (tflite::hexagon_fully_connected::HexagonOptimizable(context, node)) { + return tflite::hexagon_fully_connected::HexagonPrepare(context, node); + } else { + return CalculateOpDataFullyConnected(context, params->activation, input->type, input, + filter, bias, output, &data->reference_op_data); + } +} + +TfLiteStatus HexagonFullyConnectedEvalInt8(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->builtin_data != nullptr); + + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kFullyConnectedInputTensor); + const TfLiteEvalTensor* filter = + tflite::micro::GetEvalInput(context, node, kFullyConnectedWeightsTensor); + const TfLiteEvalTensor* bias = + tflite::micro::GetEvalInput(context, node, kFullyConnectedBiasTensor); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kFullyConnectedOutputTensor); + + TFLITE_DCHECK(node->user_data != nullptr); + const HexagonOpDataFullyConnected& data = + *(static_cast(node->user_data)); + + // This kernel only implements the int8 version of the fully_connected kernel. + TFLITE_DCHECK(input->type == kTfLiteInt8); + TFLITE_DCHECK(filter->type == kTfLiteInt8); + TFLITE_DCHECK(bias->type == kTfLiteInt32); + TFLITE_DCHECK(output->type == kTfLiteInt8); + + if (tflite::hexagon_fully_connected::HexagonOptimizable(context, node)) { + return tflite::hexagon_fully_connected::HexagonEvalQuantizedInt8( + context, node, node->user_data, input, filter, bias, output); + } else { + return EvalQuantizedInt8(context, node, data, input, filter, bias, output); + } + return kTfLiteOk; +} + +TfLiteRegistration Register_FULLY_CONNECTED_INT8() { + return {/*init=*/HexagonFullyConnectedInit, + /*free=*/nullptr, + /*prepare=*/HexagonFullyConnectedPrepare, + /*invoke=*/HexagonFullyConnectedEvalInt8, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite diff --git a/third_party/hexagon/hexagon_fully_connected.h b/third_party/hexagon/hexagon_fully_connected.h new file mode 100644 index 0000000000000000000000000000000000000000..c7fa974167bce2b248d8eb0da7b9c304f1176782 --- /dev/null +++ b/third_party/hexagon/hexagon_fully_connected.h @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_HEXAGON_HEXAGON_FULLY_CONNECTED_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_HEXAGON_HEXAGON_FULLY_CONNECTED_H_ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/micro/kernels/fully_connected.h" + +namespace tflite { + +struct HexagonOpDataFullyConnected { + struct OpDataFullyConnected reference_op_data; + void* hexagon_data; +}; + +void* HexagonFullyConnectedInit(TfLiteContext* context, const char* buffer, + size_t length); +TfLiteStatus HexagonFullyConnectedPrepare(TfLiteContext* context, TfLiteNode* node); +TfLiteStatus HexagonFullyConnectedEvalInt8(TfLiteContext* context, TfLiteNode* node); + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_KERNELS_HEXAGON_HEXAGON_FULLY_CONNECTED_H_ diff --git a/third_party/hexagon/hexagon_svdf.h b/third_party/hexagon/hexagon_svdf.h new file mode 100644 index 0000000000000000000000000000000000000000..2bd6f93c5b1b329061d0ac54e0237e894db6969b --- /dev/null +++ b/third_party/hexagon/hexagon_svdf.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_HEXAGON_HEXAGON_SVDF_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_HEXAGON_HEXAGON_SVDF_H_ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/micro/kernels/svdf.h" + +namespace tflite { + +struct HexagonOpDataSvdf { + struct OpDataSvdf reference_op_data; + void* hexagon_data; +}; + +void* HexagonSvdfInit(TfLiteContext* context, const char* buffer, size_t length); +TfLiteStatus HexagonSvdfPrepare(TfLiteContext* context, TfLiteNode* node); +TfLiteStatus HexagonSvdfEvalInt8(TfLiteContext* context, TfLiteNode* node); + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_KERNELS_HEXAGON_HEXAGON_SVDF_H_ diff --git a/third_party/hexagon/svdf.cc b/third_party/hexagon/svdf.cc index e20811c7d5db95391a7caca7fe33510c4550c026..ac02de2eeaa7afa13f6bf38627f31d80dc55fe16 100644 --- a/third_party/hexagon/svdf.cc +++ b/third_party/hexagon/svdf.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -52,533 +52,47 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/kernels/activation_utils.h" +#include "tensorflow/lite/micro/kernels/hexagon/hexagon_svdf.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/micro_utils.h" -#include "third_party/hexagon/hexagon_tflm_translation_svdf.h" -namespace tflite { -namespace { - -// Input tensors. -constexpr int kInputTensor = 0; -constexpr int kWeightsFeatureTensor = 1; -constexpr int kWeightsTimeTensor = 2; -constexpr int kBiasTensor = 3; -constexpr int kInputActivationStateTensor = 4; -// Output tensor. -constexpr int kOutputTensor = 0; - -struct OpData { - int32_t effective_scale_1_a; - int32_t effective_scale_2_a; - // b versions of each scale are kept at int since the numbers are just the - // shift value - typically between [-32, 32]. - int effective_scale_1_b; - int effective_scale_2_b; - int scratch_tensor_index; - int scratch_output_tensor_index; - - // Cached tensor zero point values for quantized operations. - int input_zero_point; - int output_zero_point; - - void* hexagon_data; -}; - -/** - * This version of SVDF is specific to TFLite Micro. It contains the following - * differences between the TFLite version: - * - * 1.) Scratch tensor allocation - scratch tensors must be known ahead of time - * for the Micro interpreter. - * 2.) Output dimensions - the TFLite version determines output size and runtime - * and resizes the output tensor. Micro runtime does not support tensor - * resizing. - */ -static inline void ApplyTimeWeightsBiasAndActivation( - int batch_size, int memory_size, int num_filters, int num_units, int rank, - const float* const __restrict__ weights_time_ptr, - const float* const __restrict__ bias_ptr, TfLiteFusedActivation activation, - float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr, - float* const __restrict__ output_ptr) { - // Compute matmul(activation_state, weights_time). - for (int b = 0; b < batch_size; ++b) { - // Perform batched vector dot product: - float* scratch_ptr_batch = scratch_ptr + b * num_filters; - const float* vector1_ptr = weights_time_ptr; - const float* vector2_ptr = state_ptr + b * memory_size * num_filters; - for (int i = 0; i < num_filters; ++i) { - *scratch_ptr_batch = 0.f; - for (int j = 0; j < memory_size; ++j) { - *scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++; - } - scratch_ptr_batch++; - } - } - - // Initialize output with bias if provided. - if (bias_ptr) { - // VectorBatchVectorAssign - for (int i = 0; i < batch_size; ++i) { - float* output_data = output_ptr + i * num_units; - const float* bias_data = bias_ptr; - for (int j = 0; j < num_units; ++j) { - *output_data++ = *bias_data++; - } - } - } else { - float* output_data = output_ptr; - for (int i = 0; i < batch_size * num_units; ++i) { - *output_data++ = 0.0f; - } - } - - // Reduction sum. - for (int b = 0; b < batch_size; ++b) { - float* output_ptr_batch = output_ptr + b * num_units; - float* scratch_ptr_batch = scratch_ptr + b * num_filters; - - // Reduction sum vector - for (int i = 0; i < num_units; ++i) { - for (int j = 0; j < rank; j++) { - output_ptr_batch[i] += *scratch_ptr_batch++; - } - } - } - - // Apply activation. 
- for (int b = 0; b < batch_size; ++b) { - float* output_ptr_batch = output_ptr + b * num_units; - for (int i = 0; i < num_units; ++i) { - *output_ptr_batch = - tflite::ops::micro::ActivationValFloat(activation, *output_ptr_batch); - ++output_ptr_batch; - } - } -} - -inline void EvalFloatSVDF( - TfLiteContext* context, TfLiteNode* node, const TfLiteEvalTensor* input, - const TfLiteEvalTensor* weights_feature, - const TfLiteEvalTensor* weights_time, const TfLiteEvalTensor* bias, - const TfLiteSVDFParams* params, int scratch_tensor_index, - TfLiteEvalTensor* activation_state, TfLiteEvalTensor* output) { - const int rank = params->rank; - const int batch_size = input->dims->data[0]; - const int input_size = input->dims->data[1]; - const int num_filters = weights_feature->dims->data[0]; - const int num_units = num_filters / rank; - const int memory_size = weights_time->dims->data[1]; - - const float* weights_feature_ptr = - tflite::micro::GetTensorData(weights_feature); - const float* weights_time_ptr = - tflite::micro::GetTensorData(weights_time); - const float* bias_ptr = tflite::micro::GetTensorData(bias); - const float* input_ptr = tflite::micro::GetTensorData(input); - - float* state_ptr = tflite::micro::GetTensorData(activation_state); - - TFLITE_DCHECK(context != nullptr); - TFLITE_DCHECK(context->GetScratchBuffer != nullptr); - - float* scratch_ptr = static_cast( - context->GetScratchBuffer(context, scratch_tensor_index)); - - float* output_ptr = tflite::micro::GetTensorData(output); +#include "hexagon_tflm_translation_svdf.h" - // Left shift the activation_state. - { - float* new_state_start = state_ptr; - const float* old_state_start = state_ptr + 1; - const float* old_state_end = - state_ptr + batch_size * num_filters * memory_size; - while (old_state_start != old_state_end) { - *new_state_start++ = *old_state_start++; - } - } - - // Note: no need to clear the latest activation, matmul is not accumulative. - - // Compute conv1d(inputs, weights_feature). - // The activation_state's rightmost column is used to save current cycle - // activation. This is achieved by starting at state_ptr[memory_size - 1] and - // having the stride equal to memory_size. 
- - // Perform batched matrix vector multiply operation: - { - const float* matrix = weights_feature_ptr; - const float* vector = input_ptr; - float* result = &state_ptr[memory_size - 1]; - float* result_in_batch = result; - for (int i = 0; i < batch_size; ++i) { - const float* matrix_ptr = matrix; - for (int j = 0; j < num_filters; ++j) { - float dot_prod = 0.0f; - const float* vector_in_batch = vector + i * input_size; - for (int k = 0; k < input_size; ++k) { - dot_prod += *matrix_ptr++ * *vector_in_batch++; - } - *result_in_batch = dot_prod; - result_in_batch += memory_size; - } - } - } - - ApplyTimeWeightsBiasAndActivation( - batch_size, memory_size, num_filters, num_units, rank, weights_time_ptr, - bias_ptr, params->activation, state_ptr, scratch_ptr, output_ptr); -} - -void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node, - const TfLiteEvalTensor* input_tensor, - const TfLiteEvalTensor* weights_feature_tensor, - const TfLiteEvalTensor* weights_time_tensor, - const TfLiteEvalTensor* bias_tensor, - const TfLiteSVDFParams* params, - TfLiteEvalTensor* activation_state_tensor, - TfLiteEvalTensor* output_tensor, const OpData& data) { - const int n_rank = params->rank; - const int n_batch = input_tensor->dims->data[0]; - const int n_input = input_tensor->dims->data[1]; - const int n_filter = weights_feature_tensor->dims->data[0]; - const int n_unit = n_filter / n_rank; - const int n_memory = weights_time_tensor->dims->data[1]; - - TFLITE_DCHECK(context != nullptr); - TFLITE_DCHECK(context->GetScratchBuffer != nullptr); - - int32_t* scratch_tensor = static_cast( - context->GetScratchBuffer(context, data.scratch_tensor_index)); - int32_t* scratch_output_tensor = static_cast( - context->GetScratchBuffer(context, data.scratch_output_tensor_index)); - - // Shift states. - int16_t* const state_ptr = - tflite::micro::GetTensorData(activation_state_tensor); - - // Left shift the activation_state. - { - int16_t* new_state_start = state_ptr; - const int16_t* old_state_start = state_ptr + 1; - const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory; - while (old_state_start != old_state_end) { - *new_state_start++ = *old_state_start++; - } - } - - // Note: no need to clear the latest activation, matmul is not accumulative. - - // Feature matmul. - { - int16_t* state = - tflite::micro::GetTensorData(activation_state_tensor); - const int8_t* input = tflite::micro::GetTensorData(input_tensor); - const int8_t* weight_feature = - tflite::micro::GetTensorData(weights_feature_tensor); - const int32_t output_max = std::numeric_limits::max(); - const int32_t output_min = std::numeric_limits::min(); - int16_t* result_in_batch = state + (n_memory - 1); - for (int b = 0; b < n_batch; b++) { - const int8_t* matrix_ptr = weight_feature; - for (int r = 0; r < n_filter; r++) { - int32_t dot_prod = 0; - const int8_t* vector_in_batch = input + b * n_input; - for (int c = 0; c < n_input; c++) { - dot_prod += - *matrix_ptr++ * (*vector_in_batch++ - data.input_zero_point); - } - dot_prod = MultiplyByQuantizedMultiplier( - dot_prod, data.effective_scale_1_a, data.effective_scale_1_b); - dot_prod = std::min(std::max(output_min, dot_prod), output_max); - // This assumes state is symmetrically quantized. Otherwise last bit of - // state should be initialized to its zero point and accumulate the - // dot_prod. - // Equivalent as the following: - // result_in_batch = zero point, which happens to be zero. - // result_in_batch += dot_prod_56. 
- *result_in_batch = dot_prod; - result_in_batch += n_memory; - } - } - } - - // Time. - { - for (int b = 0; b < n_batch; ++b) { - int32_t* scratch_ptr_batch = scratch_tensor + b * n_filter; - - // Perform batched vector dot product: - const int16_t* vector1_ptr = - tflite::micro::GetTensorData(weights_time_tensor); - const int16_t* vector2_ptr = - tflite::micro::GetTensorData(activation_state_tensor) + - b * n_memory * n_filter; - - for (int i = 0; i < n_filter; i++) { - *scratch_ptr_batch = 0; - for (int j = 0; j < n_memory; j++) { - *scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++; - } - scratch_ptr_batch++; - } - } - } - - // Reduce, add bias, rescale, activation. - { - // Add bias. - { - if (bias_tensor) { - // Vector batch assign: - const int32_t* bias_data = - tflite::micro::GetTensorData(bias_tensor); - for (int i = 0; i < n_batch; ++i) { - int32_t* output_ptr = scratch_output_tensor + i * n_unit; - const int32_t* bias_ptr = bias_data; - for (int j = 0; j < n_unit; ++j) { - *output_ptr++ = *bias_ptr++; - } - } - } else { - int32_t* output_ptr = scratch_output_tensor; - for (int i = 0; i < n_batch * n_unit; ++i) { - *output_ptr++ = 0; - } - } - } - - // Reduce. - { - for (int b = 0; b < n_batch; ++b) { - int32_t* output_temp_ptr = scratch_output_tensor + b * n_unit; - int32_t* scratch_ptr_batch = scratch_tensor + b * n_filter; - - // Reduction sum vector - for (int i = 0; i < n_unit; ++i) { - for (int j = 0; j < n_rank; ++j) { - output_temp_ptr[i] += *scratch_ptr_batch++; - } - } - } - } - - // Rescale. - { - const int32_t output_max = std::numeric_limits::max(); - const int32_t output_min = std::numeric_limits::min(); - for (int i = 0; i < n_batch * n_unit; ++i) { - int32_t x1 = scratch_output_tensor[i]; - int32_t x2 = MultiplyByQuantizedMultiplier(x1, data.effective_scale_2_a, - data.effective_scale_2_b); - int32_t x3 = x2 + data.output_zero_point; - int32_t x4 = std::min(std::max(output_min, x3), output_max); - tflite::micro::GetTensorData(output_tensor)[i] = - static_cast(x4); - } - } - } -} - -void* Init(TfLiteContext* context, const char* buffer, size_t length) { - TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); - void* data = context->AllocatePersistentBuffer(context, sizeof(OpData)); - - if (data == nullptr) { - return nullptr; - } - OpData* opdata = static_cast(data); - opdata->hexagon_data = - tflite::hexagon_svdf::HexagonInit(context, buffer, length); - - return data; -} - -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TFLITE_DCHECK(node->builtin_data != nullptr); - - const auto* params = static_cast(node->builtin_data); - - // Validate Tensor Inputs (dtype depends on quantization): - // [0] = Input, {2, batch_size, input_size} - // [1] = Weights Feature, {2, num_filters, input_size} - // [2] = Weights Time, {2, num_filters, memory_size} - // [3] = Bias (optional), {1, num_units} - // [4] = Activation State (variable), - // {2, batch_size, memory_size * num_filters} - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* weights_feature = - GetInput(context, node, kWeightsFeatureTensor); - TF_LITE_ENSURE(context, weights_feature != nullptr); - const TfLiteTensor* weights_time = - GetInput(context, node, kWeightsTimeTensor); - TF_LITE_ENSURE(context, weights_time != nullptr); - const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); - const TfLiteTensor* activation_state = - GetInput(context, node, kInputActivationStateTensor); - 
TF_LITE_ENSURE(context, activation_state != nullptr); - - // Define input constants based on input tensor definition above: - const int rank = params->rank; - const int input_size = input->dims->data[1]; - const int batch_size = input->dims->data[0]; - const int num_filters = weights_feature->dims->data[0]; - TF_LITE_ENSURE_EQ(context, num_filters % rank, 0); - const int num_units = num_filters / rank; - const int memory_size = weights_time->dims->data[1]; - - // Validate Input Tensor: - TF_LITE_ENSURE(context, - input->type == kTfLiteFloat32 || input->type == kTfLiteInt8); - TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2); - - // Validate Tensor Output: - // [0] = float/int8_t, {2, batch_size, num_units} - TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE(context, output != nullptr); - TF_LITE_ENSURE_EQ(context, NumDimensions(output), 2); - TF_LITE_ENSURE_EQ(context, output->dims->data[0], batch_size); - TF_LITE_ENSURE_EQ(context, output->dims->data[1], num_units); - - // Validate Weights Feature Input Tensor: - TF_LITE_ENSURE_EQ(context, NumDimensions(weights_feature), 2); - TF_LITE_ENSURE_EQ(context, weights_feature->dims->data[1], input_size); - - // Validate Weights Time Input Tensor: - TF_LITE_ENSURE_EQ(context, NumDimensions(weights_time), 2); - TF_LITE_ENSURE_EQ(context, weights_time->dims->data[0], num_filters); - TF_LITE_ENSURE_EQ(context, weights_time->dims->data[1], memory_size); - - // Validate Optional Bias Input Tensor: - if (bias != nullptr) { - TF_LITE_ENSURE_EQ(context, bias->dims->data[0], num_units); - } - - // Validate Activation State Input Tensor: - TF_LITE_ENSURE_EQ(context, NumDimensions(activation_state), 2); - TF_LITE_ENSURE_EQ(context, activation_state->dims->data[0], batch_size); - TF_LITE_ENSURE_EQ(context, activation_state->dims->data[1], - memory_size * num_filters); - // Since is_variable is not part of TFLiteEvalTensor, check is_variable here. - TF_LITE_ENSURE_EQ(context, activation_state->is_variable, true); - - TF_LITE_ENSURE_EQ(context, node->inputs->size, 5); - - TFLITE_DCHECK(node->user_data != nullptr); - OpData* data = static_cast(node->user_data); - - if (input->type == kTfLiteInt8) { - TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteInt8); - TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteInt16); - TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteInt16); - if (bias != nullptr) { - TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32); - } - - TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8); - - const double effective_scale_1 = static_cast( - input->params.scale * weights_feature->params.scale / - activation_state->params.scale); - const double effective_scale_2 = - static_cast(activation_state->params.scale * - weights_time->params.scale / output->params.scale); - - // TODO(b/162018098): Use TF_LITE_ENSURE_NEAR when it is ready. 
- TF_LITE_ENSURE( - context, - std::abs(static_cast(bias->params.scale) - - static_cast(activation_state->params.scale * - weights_time->params.scale)) < 1e-5); - - QuantizeMultiplier(effective_scale_1, &(data->effective_scale_1_a), - &(data->effective_scale_1_b)); - QuantizeMultiplier(effective_scale_2, &(data->effective_scale_2_a), - &(data->effective_scale_2_b)); - - data->input_zero_point = input->params.zero_point; - data->output_zero_point = output->params.zero_point; - - TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr); - - tflite::hexagon_svdf::HexagonOptimizationEvaluation(context, node); - - if (tflite::hexagon_svdf::HexagonOptimizable(context, node)) { - TF_LITE_ENSURE_OK(context, - tflite::hexagon_svdf::HexagonPrepare(context, node)); - } else { - const TfLiteStatus scratch_status = context->RequestScratchBufferInArena( - context, batch_size * num_filters * sizeof(int32_t), - &(data->scratch_tensor_index)); - TF_LITE_ENSURE_OK(context, scratch_status); - - const TfLiteStatus scratch_output_status = - context->RequestScratchBufferInArena( - context, batch_size * num_units * sizeof(int32_t), - &(data->scratch_output_tensor_index)); - TF_LITE_ENSURE_OK(context, scratch_output_status); - } - } else { - TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteFloat32); - if (bias != nullptr) { - TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32); - } - TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32); - - TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr); - const TfLiteStatus scratch_status = context->RequestScratchBufferInArena( - context, batch_size * num_filters * sizeof(float), - &(data->scratch_tensor_index)); - TF_LITE_ENSURE_OK(context, scratch_status); - } - - return kTfLiteOk; -} +namespace tflite { -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { +TfLiteStatus SvdfEval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); TFLITE_DCHECK(node->user_data != nullptr); - const OpData& data = *(static_cast(node->user_data)); + const HexagonOpDataSvdf& data = + *(static_cast(node->user_data)); const TfLiteEvalTensor* input = - tflite::micro::GetEvalInput(context, node, kInputTensor); + tflite::micro::GetEvalInput(context, node, kSvdfInputTensor); const TfLiteEvalTensor* weights_feature = - tflite::micro::GetEvalInput(context, node, kWeightsFeatureTensor); + tflite::micro::GetEvalInput(context, node, kSvdfWeightsFeatureTensor); const TfLiteEvalTensor* weights_time = - tflite::micro::GetEvalInput(context, node, kWeightsTimeTensor); + tflite::micro::GetEvalInput(context, node, kSvdfWeightsTimeTensor); const TfLiteEvalTensor* bias = (NumInputs(node) == 5) - ? tflite::micro::GetEvalInput(context, node, kBiasTensor) + ? 
tflite::micro::GetEvalInput(context, node, kSvdfBiasTensor) : nullptr; TfLiteEvalTensor* activation_state = tflite::micro::GetMutableEvalInput( - context, node, kInputActivationStateTensor); + context, node, kSvdfInputActivationStateTensor); TfLiteEvalTensor* output = - tflite::micro::GetEvalOutput(context, node, kOutputTensor); + tflite::micro::GetEvalOutput(context, node, kSvdfOutputTensor); switch (weights_feature->type) { case kTfLiteFloat32: { - EvalFloatSVDF(context, node, input, weights_feature, weights_time, bias, - params, data.scratch_tensor_index, activation_state, - output); + EvalFloatSvdfReference(context, node, input, weights_feature, + weights_time, bias, params, + data.reference_op_data.scratch_tensor_index, + activation_state, output); return kTfLiteOk; break; } case kTfLiteInt8: { - if (tflite::hexagon_svdf::HexagonOptimizable(context, node)) { - tflite::hexagon_svdf::HexagonEvalIntegerSVDF( - context, node, input, weights_feature, weights_time, bias, params, - activation_state, output, node->user_data); - } else { - EvalIntegerSVDF(context, node, input, weights_feature, weights_time, - bias, params, activation_state, output, data); - } - return kTfLiteOk; - break; + return HexagonSvdfEvalInt8(context, node); } default: @@ -589,13 +103,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -} // namespace - TfLiteRegistration Register_SVDF() { - return {/*init=*/Init, + return {/*init=*/HexagonSvdfInit, /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, + /*prepare=*/HexagonSvdfPrepare, + /*invoke=*/SvdfEval, /*profiling_string=*/nullptr, /*builtin_code=*/0, /*custom_name=*/nullptr, diff --git a/third_party/hexagon/svdf_int8.cc b/third_party/hexagon/svdf_int8.cc new file mode 100644 index 0000000000000000000000000000000000000000..02599d197f91cfb0f430e957dfe9e5eaa8eabf2c --- /dev/null +++ b/third_party/hexagon/svdf_int8.cc @@ -0,0 +1,137 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/* Copyright 2020 The Qualcomm Innovation Center, Inc. All Rights Reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted (subject to the limitations in the disclaimer +below) provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of Qualcomm Innovation Center, Inc. nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT +NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +==============================================================================*/ + +#include + +#include "hexagon_tflm_translation_svdf.h" +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/op_macros.h" +#include "tensorflow/lite/micro/kernels/activation_utils.h" +#include "tensorflow/lite/micro/kernels/hexagon/hexagon_svdf.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_utils.h" + +namespace tflite { + +TfLiteStatus HexagonSvdfEvalInt8(TfLiteContext* context, TfLiteNode* node) { + auto* params = reinterpret_cast(node->builtin_data); + TFLITE_DCHECK(node->user_data != nullptr); + const HexagonOpDataSvdf& data = + *(static_cast(node->user_data)); + + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kSvdfInputTensor); + const TfLiteEvalTensor* weights_feature = + tflite::micro::GetEvalInput(context, node, kSvdfWeightsFeatureTensor); + const TfLiteEvalTensor* weights_time = + tflite::micro::GetEvalInput(context, node, kSvdfWeightsTimeTensor); + const TfLiteEvalTensor* bias = + (NumInputs(node) == 5) + ? 
tflite::micro::GetEvalInput(context, node, kSvdfBiasTensor) + : nullptr; + TfLiteEvalTensor* activation_state = tflite::micro::GetMutableEvalInput( + context, node, kSvdfInputActivationStateTensor); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kSvdfOutputTensor); + + if (tflite::hexagon_svdf::HexagonOptimizable(context, node)) { + tflite::hexagon_svdf::HexagonEvalIntegerSVDF( + context, node, input, weights_feature, weights_time, bias, params, + activation_state, output, node->user_data); + } else { + EvalIntegerSvdfReference(context, node, input, weights_feature, + weights_time, bias, params, activation_state, + output, data.reference_op_data); + } + return kTfLiteOk; +} + +void* HexagonSvdfInit(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + void* data = context->AllocatePersistentBuffer(context, sizeof(HexagonOpDataSvdf)); + + if (data == nullptr) { + return nullptr; + } + + HexagonOpDataSvdf* opdata = static_cast<HexagonOpDataSvdf*>(data); + opdata->hexagon_data = + tflite::hexagon_svdf::HexagonInit(context, buffer, length); + + return data; +} + +TfLiteStatus HexagonSvdfPrepare(TfLiteContext* context, TfLiteNode* node) { + TfLiteStatus prepare_status = PrepareSvdf(context, node); + if (prepare_status != kTfLiteOk) { + return prepare_status; + } + + tflite::hexagon_svdf::HexagonOptimizationEvaluation(context, node); + + if (tflite::hexagon_svdf::HexagonOptimizable(context, node)) { + TF_LITE_ENSURE_OK(context, + tflite::hexagon_svdf::HexagonPrepare(context, node)); + } + + return kTfLiteOk; +} + +TfLiteRegistration Register_SVDF_INT8() { + return {/*init=*/HexagonSvdfInit, + /*free=*/nullptr, + /*prepare=*/HexagonSvdfPrepare, + /*invoke=*/HexagonSvdfEvalInt8, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite
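Usage sketch (not part of the patch): with these changes, an application that only needs the int8 paths can register the specialized kernel variants directly, exactly as keyword_benchmark.cc now does, and on targets without a specialized implementation the inline fallbacks forward to the generic registrations, so the same code still builds. The helper name and the resolver size below are illustrative assumptions.

  #include "tensorflow/lite/micro/kernels/fully_connected.h"
  #include "tensorflow/lite/micro/kernels/softmax.h"
  #include "tensorflow/lite/micro/kernels/svdf.h"
  #include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

  // Registers only the int8-specialized variants; on reference targets the
  // Register_*_INT8() calls resolve to the inline fallbacks in the headers.
  void RegisterInt8Ops(tflite::MicroMutableOpResolver<4>* op_resolver) {
    op_resolver->AddFullyConnected(tflite::Register_FULLY_CONNECTED_INT8());
    op_resolver->AddQuantize();
    op_resolver->AddSoftmax(tflite::Register_SOFTMAX_INT8_INT16());
    op_resolver->AddSvdf(tflite::Register_SVDF_INT8());
  }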
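The guard-plus-fallback pattern in fully_connected.h and svdf.h is what keeps the Register_*_INT8() calls linkable everywhere. A port adding another specialized target would extend the same guard; MYTARGET below is a hypothetical name used only to illustrate the pattern, and such a target's makefile include would append its kernel sources to MICROLITE_CC_KERNEL_SRCS the same way hexagon.inc does above.

  // In the kernel header: declare the specialized variant only for targets
  // that provide one, and otherwise forward to the generic registration.
  #if defined(CMSIS_NN) || defined(HEXAGON) || defined(MYTARGET)
  TfLiteRegistration Register_FULLY_CONNECTED_INT8();
  #else
  inline TfLiteRegistration Register_FULLY_CONNECTED_INT8() {
    return Register_FULLY_CONNECTED();
  }
  #endif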