diff --git a/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc b/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc index 61979bea388c32860f2b9c472dac751ae3b0c884..986b10570c32fd8b74c8f8ba04f197833a24c7bd 100644 --- a/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc +++ b/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/lite/micro/benchmarks/micro_benchmark.h" #include "tensorflow/lite/micro/kernels/fully_connected.h" #include "tensorflow/lite/micro/kernels/softmax.h" +#include "tensorflow/lite/micro/kernels/svdf.h" #include "tensorflow/lite/micro/micro_error_reporter.h" #include "tensorflow/lite/micro/micro_mutable_op_resolver.h" #include "tensorflow/lite/micro/micro_profiler.h" @@ -55,7 +56,7 @@ KeywordBenchmarkRunner* CreateBenchmarkRunner(MicroProfiler* profiler) { op_resolver->AddFullyConnected(tflite::Register_FULLY_CONNECTED_INT8()); op_resolver->AddQuantize(); op_resolver->AddSoftmax(tflite::Register_SOFTMAX_INT8_INT16()); - op_resolver->AddSvdf(); + op_resolver->AddSvdf(tflite::Register_SVDF_INT8()); return new (benchmark_runner_buffer) KeywordBenchmarkRunner(g_keyword_scrambled_model_data, op_resolver, diff --git a/tensorflow/lite/micro/kernels/fully_connected.h b/tensorflow/lite/micro/kernels/fully_connected.h index 0b672e43b009f2470f901c7de2b167f238c448cc..e1215da61ba1c352aaa11b0de9489811040fb9c4 100644 --- a/tensorflow/lite/micro/kernels/fully_connected.h +++ b/tensorflow/lite/micro/kernels/fully_connected.h @@ -65,14 +65,9 @@ TfLiteStatus CalculateOpDataFullyConnected( // (reference or optimized) must define this function. TfLiteRegistration Register_FULLY_CONNECTED(); -#if defined(CMSIS_NN) -// The Arduino is a special case where we use the CMSIS kernels, but because of -// the current approach to building for Arduino, we do not support -DCMSIS_NN as -// part of the build. As a result, we use defined(ARDUINO) as proxy for the -// CMSIS kernels for this one special case. - -// Returns a TfLiteRegistration struct for cmsis_nn kernel variant that only -// supports int8. +#if defined(CMSIS_NN) || defined(HEXAGON) +// Returns a TfLiteRegistration struct for kernel variant that only supports +// int8. 
TfLiteRegistration Register_FULLY_CONNECTED_INT8(); #else diff --git a/tensorflow/lite/micro/kernels/svdf.cc b/tensorflow/lite/micro/kernels/svdf.cc index cd22e31b1f07bd747b7389e01cd71dd11b1ed197..ae2998eaff318ce5deb61ed3032d7042c8ab97b9 100644 --- a/tensorflow/lite/micro/kernels/svdf.cc +++ b/tensorflow/lite/micro/kernels/svdf.cc @@ -33,13 +33,13 @@ namespace { void* Init(TfLiteContext* context, const char* buffer, size_t length) { TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); - return context->AllocatePersistentBuffer(context, sizeof(OpData)); + return context->AllocatePersistentBuffer(context, sizeof(OpDataSvdf)); } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data); TFLITE_DCHECK(node->user_data != nullptr); - const OpData& data = *(static_cast<const OpData*>(node->user_data)); + const OpDataSvdf& data = *(static_cast<const OpDataSvdf*>(node->user_data)); const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, kSvdfInputTensor); diff --git a/tensorflow/lite/micro/kernels/svdf.h b/tensorflow/lite/micro/kernels/svdf.h index d04787be9cf3bf04d3fb84e38ac5a9bcb9a47ec7..8a7269f33a3a27a25f4a69d636fa4ea5d69afbf3 100644 --- a/tensorflow/lite/micro/kernels/svdf.h +++ b/tensorflow/lite/micro/kernels/svdf.h @@ -20,7 +20,7 @@ limitations under the License. namespace tflite { -struct OpData { +struct OpDataSvdf { int32_t effective_scale_1_a; int32_t effective_scale_2_a; // b versions of each scale are kept at int since the numbers are just the @@ -55,7 +55,7 @@ void EvalIntegerSvdfReference(TfLiteContext* context, TfLiteNode* node, const TfLiteSVDFParams* params, TfLiteEvalTensor* activation_state_tensor, TfLiteEvalTensor* output_tensor, - const OpData& data); + const OpDataSvdf& data); void EvalFloatSvdfReference( TfLiteContext* context, TfLiteNode* node, const TfLiteEvalTensor* input, @@ -66,6 +66,23 @@ void EvalFloatSvdfReference( TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node); +// This is the most generic TfLiteRegistration. The actual supported types may +// still be target dependent. The only requirement is that every implementation +// (reference or optimized) must define this function. +TfLiteRegistration Register_SVDF(); + +#if defined(HEXAGON) +TfLiteRegistration Register_SVDF_INT8(); + +#else +// Note that while this block gets used for both reference and optimized kernels +// that do not have any specialized implementations, the only goal here is to +// define fallback implementations that allow reference kernels to still be used +// from applications that call a more specific kernel variant.
+ +inline TfLiteRegistration Register_SVDF_INT8() { return Register_SVDF(); } + +#endif } // namespace tflite #endif // TENSORFLOW_LITE_MICRO_KERNELS_SVDF_H_ diff --git a/tensorflow/lite/micro/kernels/svdf_common.cc b/tensorflow/lite/micro/kernels/svdf_common.cc index 12e697b1461aad3faf3a8e61563243561c49d0e6..4f62b519bcd3cb1acbe11da360737bef25b64fee 100644 --- a/tensorflow/lite/micro/kernels/svdf_common.cc +++ b/tensorflow/lite/micro/kernels/svdf_common.cc @@ -56,7 +56,7 @@ void EvalIntegerSvdfReference(TfLiteContext* context, TfLiteNode* node, const TfLiteSVDFParams* params, TfLiteEvalTensor* activation_state_tensor, TfLiteEvalTensor* output_tensor, - const OpData& data) { + const OpDataSvdf& data) { const int n_rank = params->rank; const int n_batch = input_tensor->dims->data[0]; const int n_input = input_tensor->dims->data[1]; @@ -401,7 +401,7 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, node->inputs->size, 5); TFLITE_DCHECK(node->user_data != nullptr); - OpData* data = static_cast<OpData*>(node->user_data); + OpDataSvdf* data = static_cast<OpDataSvdf*>(node->user_data); if (input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteInt8); diff --git a/tensorflow/lite/micro/kernels/xtensa/svdf.cc b/tensorflow/lite/micro/kernels/xtensa/svdf.cc index 274d8d4520c4c3ba08c066ee881e03527e1fb4e6..e26a1affc9f70e3263f9eeb0c93b8d6dce6a0744 100644 --- a/tensorflow/lite/micro/kernels/xtensa/svdf.cc +++ b/tensorflow/lite/micro/kernels/xtensa/svdf.cc @@ -48,7 +48,7 @@ constexpr int kOutputTensor = 0; * This version of SVDF is specific to TFLite Micro. It contains only a full * integer recipe with optimizations for the Xtensa HiFiMini platform. * - * Note: passing OpData by value might seem like an oversight but it helps + * Note: passing OpDataSvdf by value might seem like an oversight but it helps * reduce the latency. See b/155656675 for more details.
*/ void EvalIntegerSvdfHifimini(TfLiteContext* context, TfLiteNode* node, @@ -58,7 +58,7 @@ void EvalIntegerSvdfHifimini(TfLiteContext* context, TfLiteNode* node, const TfLiteEvalTensor* bias_tensor, const TfLiteSVDFParams* params, TfLiteEvalTensor* activation_state_tensor, - TfLiteEvalTensor* output_tensor, OpData data) { + TfLiteEvalTensor* output_tensor, OpDataSvdf data) { const int n_rank = params->rank; const int n_batch = input_tensor->dims->data[0]; const int n_input = input_tensor->dims->data[1]; @@ -254,7 +254,7 @@ TfLiteStatus EvalIntegerSvdfHifi(TfLiteContext* context, TfLiteNode* node, const TfLiteSVDFParams* params, TfLiteEvalTensor* activation_state_tensor, TfLiteEvalTensor* output_tensor, - const OpData& data) { + const OpDataSvdf& data) { const int n_rank = params->rank; const int n_batch = input_tensor->dims->data[0]; const int n_input = input_tensor->dims->data[1]; @@ -321,7 +321,7 @@ TfLiteStatus EvalIntegerSvdfHifi(TfLiteContext* context, TfLiteNode* node, void* Init(TfLiteContext* context, const char* buffer, size_t length) { TFLITE_DCHECK(context != nullptr); - return context->AllocatePersistentBuffer(context, sizeof(OpData)); + return context->AllocatePersistentBuffer(context, sizeof(OpDataSvdf)); } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { @@ -422,7 +422,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { 1e-5); TFLITE_DCHECK(node->user_data != nullptr); - OpData* data = static_cast(node->user_data); + OpDataSvdf* data = static_cast(node->user_data); #if defined(HIFIMINI) QuantizeMultiplierForInt24(effective_scale_1, &data->effective_scale_1_a, @@ -471,7 +471,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetEvalOutput(context, node, kOutputTensor); TFLITE_DCHECK(node->user_data != nullptr); - const OpData& data = *(static_cast(node->user_data)); + const OpDataSvdf& data = *(static_cast(node->user_data)); #if defined(HIFIMINI) EvalIntegerSvdfHifimini(context, node, input, weights_feature, weights_time, diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index fe0db48aee856dab556e4928fbf74237b6ab903e..982a930264154dfe91449b6353b0b88dc43ea47d 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -498,8 +498,9 @@ class MicroMutableOpResolver : public MicroOpResolver { ParseSub); } - TfLiteStatus AddSvdf() { - return AddBuiltin(BuiltinOperator_SVDF, Register_SVDF(), ParseSvdf); + TfLiteStatus AddSvdf( + const TfLiteRegistration& registration = Register_SVDF()) { + return AddBuiltin(BuiltinOperator_SVDF, registration, ParseSvdf); } TfLiteStatus AddTanh() { diff --git a/tensorflow/lite/micro/tools/make/ext_libs/hexagon.inc b/tensorflow/lite/micro/tools/make/ext_libs/hexagon.inc index f84da5d3b1cade2ba37d77f1a937df26d67ecf1d..7ab009ca1fb4a87a5c4b3fbb92ce7a9d7fa8f8b2 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/hexagon.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/hexagon.inc @@ -1,3 +1,7 @@ +MICROLITE_CC_KERNEL_SRCS += \ +tensorflow/lite/micro/kernels/hexagon/fully_connected_int8.cc \ +tensorflow/lite/micro/kernels/hexagon/svdf_int8.cc + # Full path to the hexagon_tflm static library. 
HEXAGON_TFLM_LIB := diff --git a/third_party/hexagon/fully_connected.cc b/third_party/hexagon/fully_connected.cc index 9e4a108f86391bb0e3dc5f92c55800880f3727b3..05ae1c347be1f8794daae2d3c696e759c7d1d5a4 100644 --- a/third_party/hexagon/fully_connected.cc +++ b/third_party/hexagon/fully_connected.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -52,181 +52,14 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/hexagon/hexagon_fully_connected.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" -#include "third_party/hexagon/hexagon_tflm_translation_fully_connected.h" -namespace tflite { -namespace { - -// Input tensors. -constexpr int kInputTensor = 0; -constexpr int kWeightsTensor = 1; -constexpr int kBiasTensor = 2; -// Output tensor. -constexpr int kOutputTensor = 0; - -struct OpData { - // The scaling factor from input to output (aka the 'real multiplier') can - // be represented as a fixed point multiplier plus a left shift. - int32_t output_multiplier; - int output_shift; - // The range of the fused activation layer. For example for kNone and - // uint8_t these would be 0 and 255. - int32_t output_activation_min; - int32_t output_activation_max; - // The index of the temporary tensor where the quantized inputs are cached. - int input_quantized_index; - // Cached zero point values of tensors. 
- int32_t input_zero_point; - int32_t filter_zero_point; - int32_t output_zero_point; - - void* hexagon_data; -}; - -TfLiteStatus CalculateOpData(TfLiteContext* context, - TfLiteFusedActivation activation, - TfLiteType data_type, const TfLiteTensor* input, - const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* output, - OpData* data) { - TfLiteStatus status = kTfLiteOk; - if (data_type != kTfLiteFloat32) { - double real_multiplier = 0.0; - TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( - context, input, filter, bias, output, &real_multiplier)); - int exponent; - QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent); - data->output_shift = -exponent; - TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized( - context, activation, output, &data->output_activation_min, - &data->output_activation_max)); - - data->input_zero_point = input->params.zero_point; - data->filter_zero_point = filter->params.zero_point; - data->output_zero_point = output->params.zero_point; - } - return status; -} - -void* Init(TfLiteContext* context, const char* buffer, size_t length) { - TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); - void* data = nullptr; - data = context->AllocatePersistentBuffer(context, sizeof(OpData)); - - if (data == nullptr) { - return nullptr; - } - OpData* opdata = static_cast(data); - opdata->hexagon_data = - tflite::hexagon_fully_connected::HexagonInit(context, buffer, length); - - return data; -} - -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TFLITE_DCHECK(node->user_data != nullptr); - TFLITE_DCHECK(node->builtin_data != nullptr); - - OpData* data = static_cast(node->user_data); - const auto params = - static_cast(node->builtin_data); - - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor); - TF_LITE_ENSURE(context, filter != nullptr); - const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE(context, output != nullptr); +#include "hexagon_tflm_translation_fully_connected.h" - TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); - TF_LITE_ENSURE_MSG(context, input->type == filter->type, - "Hybrid models are not supported on TFLite Micro."); - - tflite::hexagon_fully_connected::HexagonOptimizationEvaluation(context, node); - - if (tflite::hexagon_fully_connected::HexagonOptimizable(context, node)) { - return tflite::hexagon_fully_connected::HexagonPrepare(context, node); - } else { - return CalculateOpData(context, params->activation, input->type, input, - filter, bias, output, data); - } -} - -TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, - const OpData& data, - const TfLiteEvalTensor* input, - const TfLiteEvalTensor* filter, - const TfLiteEvalTensor* bias, - TfLiteEvalTensor* output) { - tflite::FullyConnectedParams op_params; - op_params.input_offset = -data.input_zero_point; - op_params.weights_offset = -data.filter_zero_point; - op_params.output_offset = data.output_zero_point; - op_params.output_multiplier = data.output_multiplier; - // TODO(b/138810107): Figure out whether output shift should be inverted - op_params.output_shift = -data.output_shift; - op_params.quantized_activation_min = data.output_activation_min; - op_params.quantized_activation_max = data.output_activation_max; - - reference_integer_ops::FullyConnected( - 
op_params, tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(filter), - tflite::micro::GetTensorData(filter), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetTensorData(bias), - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output)); - - return kTfLiteOk; -} - -TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, - const OpData& data, const TfLiteEvalTensor* input, - const TfLiteEvalTensor* filter, - const TfLiteEvalTensor* bias, - TfLiteEvalTensor* output) { - const int32_t input_offset = -data.input_zero_point; - const int32_t filter_offset = -data.filter_zero_point; - const int32_t output_offset = data.output_zero_point; - - tflite::FullyConnectedParams op_params; - op_params.input_offset = input_offset; - op_params.weights_offset = filter_offset; - op_params.output_offset = output_offset; - op_params.output_multiplier = data.output_multiplier; - // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. - op_params.output_shift = -data.output_shift; - op_params.quantized_activation_min = data.output_activation_min; - op_params.quantized_activation_max = data.output_activation_max; - -#define TF_LITE_FULLY_CONNECTED(output_data_type) \ - reference_ops::FullyConnected( \ - op_params, tflite::micro::GetTensorShape(input), \ - tflite::micro::GetTensorData(input), \ - tflite::micro::GetTensorShape(filter), \ - tflite::micro::GetTensorData(filter), \ - tflite::micro::GetTensorShape(bias), \ - tflite::micro::GetTensorData(bias), \ - tflite::micro::GetTensorShape(output), \ - tflite::micro::GetTensorData(output)) - switch (output->type) { - case kTfLiteUInt8: - TF_LITE_FULLY_CONNECTED(uint8_t); - break; - case kTfLiteInt16: - TF_LITE_FULLY_CONNECTED(int16_t); - break; - default: - TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", - TfLiteTypeGetName(output->type), output->type); - return kTfLiteError; - } +namespace tflite { - return kTfLiteOk; -} +namespace { TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, TfLiteFusedActivation activation, @@ -251,22 +84,23 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, return kTfLiteOk; } -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { +} // namespace + +TfLiteStatus HexagonFullyConnectedEval(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->builtin_data != nullptr); const auto* params = static_cast(node->builtin_data); const TfLiteEvalTensor* input = - tflite::micro::GetEvalInput(context, node, kInputTensor); + tflite::micro::GetEvalInput(context, node, kFullyConnectedInputTensor); const TfLiteEvalTensor* filter = - tflite::micro::GetEvalInput(context, node, kWeightsTensor); + tflite::micro::GetEvalInput(context, node, kFullyConnectedWeightsTensor); const TfLiteEvalTensor* bias = - tflite::micro::GetEvalInput(context, node, kBiasTensor); + tflite::micro::GetEvalInput(context, node, kFullyConnectedBiasTensor); TfLiteEvalTensor* output = - tflite::micro::GetEvalOutput(context, node, kOutputTensor); + tflite::micro::GetEvalOutput(context, node, kFullyConnectedOutputTensor); TFLITE_DCHECK(node->user_data != nullptr); - const OpData& data = *(static_cast(node->user_data)); // Checks in Prepare ensure input, output and filter types are all the same. 
switch (input->type) { @@ -275,16 +109,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { output); case kTfLiteInt8: - if (tflite::hexagon_fully_connected::HexagonOptimizable(context, node)) { - return tflite::hexagon_fully_connected::HexagonEvalQuantizedInt8( - context, node, node->user_data, input, filter, bias, output); - } else { - return EvalQuantizedInt8(context, node, data, input, filter, bias, - output); - } - - case kTfLiteUInt8: - return EvalQuantized(context, node, data, input, filter, bias, output); + return HexagonFullyConnectedEvalInt8(context, node); default: TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", @@ -294,13 +119,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -} // namespace - TfLiteRegistration Register_FULLY_CONNECTED() { - return {/*init=*/Init, + return {/*init=*/HexagonFullyConnectedInit, /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, + /*prepare=*/HexagonFullyConnectedPrepare, + /*invoke=*/HexagonFullyConnectedEval, /*profiling_string=*/nullptr, /*builtin_code=*/0, /*custom_name=*/nullptr, diff --git a/third_party/hexagon/fully_connected_int8.cc b/third_party/hexagon/fully_connected_int8.cc new file mode 100644 index 0000000000000000000000000000000000000000..85621560b6d5128115ea2705b260e8e874778a81 --- /dev/null +++ b/third_party/hexagon/fully_connected_int8.cc @@ -0,0 +1,188 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/* Copyright 2020 The Qualcomm Innovation Center, Inc. All Rights Reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted (subject to the limitations in the disclaimer +below) provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of Qualcomm Innovation Center, Inc. nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT +NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +==============================================================================*/ + +#include "hexagon_tflm_translation_fully_connected.h" +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/fully_connected.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/fully_connected.h" +#include "tensorflow/lite/micro/kernels/hexagon/hexagon_fully_connected.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" + +namespace tflite { +namespace { + +TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, + const HexagonOpDataFullyConnected& data, + const TfLiteEvalTensor* input, + const TfLiteEvalTensor* filter, + const TfLiteEvalTensor* bias, + TfLiteEvalTensor* output) { + tflite::FullyConnectedParams op_params; + op_params.input_offset = -data.reference_op_data.input_zero_point; + op_params.weights_offset = -data.reference_op_data.filter_zero_point; + op_params.output_offset = data.reference_op_data.output_zero_point; + op_params.output_multiplier = data.reference_op_data.output_multiplier; + // TODO(b/138810107): Figure out whether output shift should be inverted + op_params.output_shift = -data.reference_op_data.output_shift; + op_params.quantized_activation_min = + data.reference_op_data.output_activation_min; + op_params.quantized_activation_max = + data.reference_op_data.output_activation_max; + + reference_integer_ops::FullyConnected( + op_params, tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + + return kTfLiteOk; +} + +} // namespace + +void* HexagonFullyConnectedInit(TfLiteContext* context, const char* buffer, + size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + void* data = nullptr; + data = context->AllocatePersistentBuffer(context, + sizeof(HexagonOpDataFullyConnected)); + + if (data == nullptr) { + return nullptr; + } + HexagonOpDataFullyConnected* opdata = + static_cast(data); + opdata->hexagon_data = + tflite::hexagon_fully_connected::HexagonInit(context, buffer, length); + + return data; +} + +TfLiteStatus HexagonFullyConnectedPrepare(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + HexagonOpDataFullyConnected* data = + static_cast(node->user_data); + const auto params = + static_cast(node->builtin_data); + + const TfLiteTensor* input = + GetInput(context, node, 
kFullyConnectedInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + const TfLiteTensor* filter = + GetInput(context, node, kFullyConnectedWeightsTensor); + TF_LITE_ENSURE(context, filter != nullptr); + const TfLiteTensor* bias = + GetOptionalInputTensor(context, node, kFullyConnectedBiasTensor); + TfLiteTensor* output = GetOutput(context, node, kFullyConnectedOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); + TF_LITE_ENSURE_MSG(context, input->type == filter->type, + "Hybrid models are not supported on TFLite Micro."); + + tflite::hexagon_fully_connected::HexagonOptimizationEvaluation(context, node); + + if (tflite::hexagon_fully_connected::HexagonOptimizable(context, node)) { + return tflite::hexagon_fully_connected::HexagonPrepare(context, node); + } else { + return CalculateOpDataFullyConnected(context, params->activation, input->type, input, + filter, bias, output, &data->reference_op_data); + } +} + +TfLiteStatus HexagonFullyConnectedEvalInt8(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->builtin_data != nullptr); + + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kFullyConnectedInputTensor); + const TfLiteEvalTensor* filter = + tflite::micro::GetEvalInput(context, node, kFullyConnectedWeightsTensor); + const TfLiteEvalTensor* bias = + tflite::micro::GetEvalInput(context, node, kFullyConnectedBiasTensor); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kFullyConnectedOutputTensor); + + TFLITE_DCHECK(node->user_data != nullptr); + const HexagonOpDataFullyConnected& data = + *(static_cast(node->user_data)); + + // This kernel only implements the int8 version of the fully_connected kernel. + TFLITE_DCHECK(input->type == kTfLiteInt8); + TFLITE_DCHECK(filter->type == kTfLiteInt8); + TFLITE_DCHECK(bias->type == kTfLiteInt32); + TFLITE_DCHECK(output->type == kTfLiteInt8); + + if (tflite::hexagon_fully_connected::HexagonOptimizable(context, node)) { + return tflite::hexagon_fully_connected::HexagonEvalQuantizedInt8( + context, node, node->user_data, input, filter, bias, output); + } else { + return EvalQuantizedInt8(context, node, data, input, filter, bias, output); + } + return kTfLiteOk; +} + +TfLiteRegistration Register_FULLY_CONNECTED_INT8() { + return {/*init=*/HexagonFullyConnectedInit, + /*free=*/nullptr, + /*prepare=*/HexagonFullyConnectedPrepare, + /*invoke=*/HexagonFullyConnectedEvalInt8, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite diff --git a/third_party/hexagon/hexagon_fully_connected.h b/third_party/hexagon/hexagon_fully_connected.h new file mode 100644 index 0000000000000000000000000000000000000000..c7fa974167bce2b248d8eb0da7b9c304f1176782 --- /dev/null +++ b/third_party/hexagon/hexagon_fully_connected.h @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_HEXAGON_HEXAGON_FULLY_CONNECTED_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_HEXAGON_HEXAGON_FULLY_CONNECTED_H_ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/micro/kernels/fully_connected.h" + +namespace tflite { + +struct HexagonOpDataFullyConnected { + struct OpDataFullyConnected reference_op_data; + void* hexagon_data; +}; + +void* HexagonFullyConnectedInit(TfLiteContext* context, const char* buffer, + size_t length); +TfLiteStatus HexagonFullyConnectedPrepare(TfLiteContext* context, TfLiteNode* node); +TfLiteStatus HexagonFullyConnectedEvalInt8(TfLiteContext* context, TfLiteNode* node); + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_KERNELS_HEXAGON_HEXAGON_FULLY_CONNECTED_H_ diff --git a/third_party/hexagon/hexagon_svdf.h b/third_party/hexagon/hexagon_svdf.h new file mode 100644 index 0000000000000000000000000000000000000000..2bd6f93c5b1b329061d0ac54e0237e894db6969b --- /dev/null +++ b/third_party/hexagon/hexagon_svdf.h @@ -0,0 +1,36 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_HEXAGON_HEXAGON_SVDF_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_HEXAGON_HEXAGON_SVDF_H_ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/micro/kernels/svdf.h" + +namespace tflite { + +struct HexagonOpDataSvdf { + struct OpDataSvdf reference_op_data; + void* hexagon_data; +}; + +void* HexagonSvdfInit(TfLiteContext* context, const char* buffer, size_t length); +TfLiteStatus HexagonSvdfPrepare(TfLiteContext* context, TfLiteNode* node); +TfLiteStatus HexagonSvdfEvalInt8(TfLiteContext* context, TfLiteNode* node); + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_KERNELS_HEXAGON_HEXAGON_SVDF_H_ diff --git a/third_party/hexagon/svdf.cc b/third_party/hexagon/svdf.cc index e20811c7d5db95391a7caca7fe33510c4550c026..ac02de2eeaa7afa13f6bf38627f31d80dc55fe16 100644 --- a/third_party/hexagon/svdf.cc +++ b/third_party/hexagon/svdf.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -52,533 +52,47 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/kernels/activation_utils.h" +#include "tensorflow/lite/micro/kernels/hexagon/hexagon_svdf.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/micro_utils.h" -#include "third_party/hexagon/hexagon_tflm_translation_svdf.h" -namespace tflite { -namespace { - -// Input tensors. -constexpr int kInputTensor = 0; -constexpr int kWeightsFeatureTensor = 1; -constexpr int kWeightsTimeTensor = 2; -constexpr int kBiasTensor = 3; -constexpr int kInputActivationStateTensor = 4; -// Output tensor. -constexpr int kOutputTensor = 0; - -struct OpData { - int32_t effective_scale_1_a; - int32_t effective_scale_2_a; - // b versions of each scale are kept at int since the numbers are just the - // shift value - typically between [-32, 32]. - int effective_scale_1_b; - int effective_scale_2_b; - int scratch_tensor_index; - int scratch_output_tensor_index; - - // Cached tensor zero point values for quantized operations. - int input_zero_point; - int output_zero_point; - - void* hexagon_data; -}; - -/** - * This version of SVDF is specific to TFLite Micro. It contains the following - * differences between the TFLite version: - * - * 1.) Scratch tensor allocation - scratch tensors must be known ahead of time - * for the Micro interpreter. - * 2.) Output dimensions - the TFLite version determines output size and runtime - * and resizes the output tensor. Micro runtime does not support tensor - * resizing. - */ -static inline void ApplyTimeWeightsBiasAndActivation( - int batch_size, int memory_size, int num_filters, int num_units, int rank, - const float* const __restrict__ weights_time_ptr, - const float* const __restrict__ bias_ptr, TfLiteFusedActivation activation, - float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr, - float* const __restrict__ output_ptr) { - // Compute matmul(activation_state, weights_time). - for (int b = 0; b < batch_size; ++b) { - // Perform batched vector dot product: - float* scratch_ptr_batch = scratch_ptr + b * num_filters; - const float* vector1_ptr = weights_time_ptr; - const float* vector2_ptr = state_ptr + b * memory_size * num_filters; - for (int i = 0; i < num_filters; ++i) { - *scratch_ptr_batch = 0.f; - for (int j = 0; j < memory_size; ++j) { - *scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++; - } - scratch_ptr_batch++; - } - } - - // Initialize output with bias if provided. - if (bias_ptr) { - // VectorBatchVectorAssign - for (int i = 0; i < batch_size; ++i) { - float* output_data = output_ptr + i * num_units; - const float* bias_data = bias_ptr; - for (int j = 0; j < num_units; ++j) { - *output_data++ = *bias_data++; - } - } - } else { - float* output_data = output_ptr; - for (int i = 0; i < batch_size * num_units; ++i) { - *output_data++ = 0.0f; - } - } - - // Reduction sum. - for (int b = 0; b < batch_size; ++b) { - float* output_ptr_batch = output_ptr + b * num_units; - float* scratch_ptr_batch = scratch_ptr + b * num_filters; - - // Reduction sum vector - for (int i = 0; i < num_units; ++i) { - for (int j = 0; j < rank; j++) { - output_ptr_batch[i] += *scratch_ptr_batch++; - } - } - } - - // Apply activation. 
- for (int b = 0; b < batch_size; ++b) { - float* output_ptr_batch = output_ptr + b * num_units; - for (int i = 0; i < num_units; ++i) { - *output_ptr_batch = - tflite::ops::micro::ActivationValFloat(activation, *output_ptr_batch); - ++output_ptr_batch; - } - } -} - -inline void EvalFloatSVDF( - TfLiteContext* context, TfLiteNode* node, const TfLiteEvalTensor* input, - const TfLiteEvalTensor* weights_feature, - const TfLiteEvalTensor* weights_time, const TfLiteEvalTensor* bias, - const TfLiteSVDFParams* params, int scratch_tensor_index, - TfLiteEvalTensor* activation_state, TfLiteEvalTensor* output) { - const int rank = params->rank; - const int batch_size = input->dims->data[0]; - const int input_size = input->dims->data[1]; - const int num_filters = weights_feature->dims->data[0]; - const int num_units = num_filters / rank; - const int memory_size = weights_time->dims->data[1]; - - const float* weights_feature_ptr = - tflite::micro::GetTensorData(weights_feature); - const float* weights_time_ptr = - tflite::micro::GetTensorData(weights_time); - const float* bias_ptr = tflite::micro::GetTensorData(bias); - const float* input_ptr = tflite::micro::GetTensorData(input); - - float* state_ptr = tflite::micro::GetTensorData(activation_state); - - TFLITE_DCHECK(context != nullptr); - TFLITE_DCHECK(context->GetScratchBuffer != nullptr); - - float* scratch_ptr = static_cast( - context->GetScratchBuffer(context, scratch_tensor_index)); - - float* output_ptr = tflite::micro::GetTensorData(output); +#include "hexagon_tflm_translation_svdf.h" - // Left shift the activation_state. - { - float* new_state_start = state_ptr; - const float* old_state_start = state_ptr + 1; - const float* old_state_end = - state_ptr + batch_size * num_filters * memory_size; - while (old_state_start != old_state_end) { - *new_state_start++ = *old_state_start++; - } - } - - // Note: no need to clear the latest activation, matmul is not accumulative. - - // Compute conv1d(inputs, weights_feature). - // The activation_state's rightmost column is used to save current cycle - // activation. This is achieved by starting at state_ptr[memory_size - 1] and - // having the stride equal to memory_size. 
- - // Perform batched matrix vector multiply operation: - { - const float* matrix = weights_feature_ptr; - const float* vector = input_ptr; - float* result = &state_ptr[memory_size - 1]; - float* result_in_batch = result; - for (int i = 0; i < batch_size; ++i) { - const float* matrix_ptr = matrix; - for (int j = 0; j < num_filters; ++j) { - float dot_prod = 0.0f; - const float* vector_in_batch = vector + i * input_size; - for (int k = 0; k < input_size; ++k) { - dot_prod += *matrix_ptr++ * *vector_in_batch++; - } - *result_in_batch = dot_prod; - result_in_batch += memory_size; - } - } - } - - ApplyTimeWeightsBiasAndActivation( - batch_size, memory_size, num_filters, num_units, rank, weights_time_ptr, - bias_ptr, params->activation, state_ptr, scratch_ptr, output_ptr); -} - -void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node, - const TfLiteEvalTensor* input_tensor, - const TfLiteEvalTensor* weights_feature_tensor, - const TfLiteEvalTensor* weights_time_tensor, - const TfLiteEvalTensor* bias_tensor, - const TfLiteSVDFParams* params, - TfLiteEvalTensor* activation_state_tensor, - TfLiteEvalTensor* output_tensor, const OpData& data) { - const int n_rank = params->rank; - const int n_batch = input_tensor->dims->data[0]; - const int n_input = input_tensor->dims->data[1]; - const int n_filter = weights_feature_tensor->dims->data[0]; - const int n_unit = n_filter / n_rank; - const int n_memory = weights_time_tensor->dims->data[1]; - - TFLITE_DCHECK(context != nullptr); - TFLITE_DCHECK(context->GetScratchBuffer != nullptr); - - int32_t* scratch_tensor = static_cast( - context->GetScratchBuffer(context, data.scratch_tensor_index)); - int32_t* scratch_output_tensor = static_cast( - context->GetScratchBuffer(context, data.scratch_output_tensor_index)); - - // Shift states. - int16_t* const state_ptr = - tflite::micro::GetTensorData(activation_state_tensor); - - // Left shift the activation_state. - { - int16_t* new_state_start = state_ptr; - const int16_t* old_state_start = state_ptr + 1; - const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory; - while (old_state_start != old_state_end) { - *new_state_start++ = *old_state_start++; - } - } - - // Note: no need to clear the latest activation, matmul is not accumulative. - - // Feature matmul. - { - int16_t* state = - tflite::micro::GetTensorData(activation_state_tensor); - const int8_t* input = tflite::micro::GetTensorData(input_tensor); - const int8_t* weight_feature = - tflite::micro::GetTensorData(weights_feature_tensor); - const int32_t output_max = std::numeric_limits::max(); - const int32_t output_min = std::numeric_limits::min(); - int16_t* result_in_batch = state + (n_memory - 1); - for (int b = 0; b < n_batch; b++) { - const int8_t* matrix_ptr = weight_feature; - for (int r = 0; r < n_filter; r++) { - int32_t dot_prod = 0; - const int8_t* vector_in_batch = input + b * n_input; - for (int c = 0; c < n_input; c++) { - dot_prod += - *matrix_ptr++ * (*vector_in_batch++ - data.input_zero_point); - } - dot_prod = MultiplyByQuantizedMultiplier( - dot_prod, data.effective_scale_1_a, data.effective_scale_1_b); - dot_prod = std::min(std::max(output_min, dot_prod), output_max); - // This assumes state is symmetrically quantized. Otherwise last bit of - // state should be initialized to its zero point and accumulate the - // dot_prod. - // Equivalent as the following: - // result_in_batch = zero point, which happens to be zero. - // result_in_batch += dot_prod_56. 
- *result_in_batch = dot_prod; - result_in_batch += n_memory; - } - } - } - - // Time. - { - for (int b = 0; b < n_batch; ++b) { - int32_t* scratch_ptr_batch = scratch_tensor + b * n_filter; - - // Perform batched vector dot product: - const int16_t* vector1_ptr = - tflite::micro::GetTensorData(weights_time_tensor); - const int16_t* vector2_ptr = - tflite::micro::GetTensorData(activation_state_tensor) + - b * n_memory * n_filter; - - for (int i = 0; i < n_filter; i++) { - *scratch_ptr_batch = 0; - for (int j = 0; j < n_memory; j++) { - *scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++; - } - scratch_ptr_batch++; - } - } - } - - // Reduce, add bias, rescale, activation. - { - // Add bias. - { - if (bias_tensor) { - // Vector batch assign: - const int32_t* bias_data = - tflite::micro::GetTensorData(bias_tensor); - for (int i = 0; i < n_batch; ++i) { - int32_t* output_ptr = scratch_output_tensor + i * n_unit; - const int32_t* bias_ptr = bias_data; - for (int j = 0; j < n_unit; ++j) { - *output_ptr++ = *bias_ptr++; - } - } - } else { - int32_t* output_ptr = scratch_output_tensor; - for (int i = 0; i < n_batch * n_unit; ++i) { - *output_ptr++ = 0; - } - } - } - - // Reduce. - { - for (int b = 0; b < n_batch; ++b) { - int32_t* output_temp_ptr = scratch_output_tensor + b * n_unit; - int32_t* scratch_ptr_batch = scratch_tensor + b * n_filter; - - // Reduction sum vector - for (int i = 0; i < n_unit; ++i) { - for (int j = 0; j < n_rank; ++j) { - output_temp_ptr[i] += *scratch_ptr_batch++; - } - } - } - } - - // Rescale. - { - const int32_t output_max = std::numeric_limits::max(); - const int32_t output_min = std::numeric_limits::min(); - for (int i = 0; i < n_batch * n_unit; ++i) { - int32_t x1 = scratch_output_tensor[i]; - int32_t x2 = MultiplyByQuantizedMultiplier(x1, data.effective_scale_2_a, - data.effective_scale_2_b); - int32_t x3 = x2 + data.output_zero_point; - int32_t x4 = std::min(std::max(output_min, x3), output_max); - tflite::micro::GetTensorData(output_tensor)[i] = - static_cast(x4); - } - } - } -} - -void* Init(TfLiteContext* context, const char* buffer, size_t length) { - TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); - void* data = context->AllocatePersistentBuffer(context, sizeof(OpData)); - - if (data == nullptr) { - return nullptr; - } - OpData* opdata = static_cast(data); - opdata->hexagon_data = - tflite::hexagon_svdf::HexagonInit(context, buffer, length); - - return data; -} - -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TFLITE_DCHECK(node->builtin_data != nullptr); - - const auto* params = static_cast(node->builtin_data); - - // Validate Tensor Inputs (dtype depends on quantization): - // [0] = Input, {2, batch_size, input_size} - // [1] = Weights Feature, {2, num_filters, input_size} - // [2] = Weights Time, {2, num_filters, memory_size} - // [3] = Bias (optional), {1, num_units} - // [4] = Activation State (variable), - // {2, batch_size, memory_size * num_filters} - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* weights_feature = - GetInput(context, node, kWeightsFeatureTensor); - TF_LITE_ENSURE(context, weights_feature != nullptr); - const TfLiteTensor* weights_time = - GetInput(context, node, kWeightsTimeTensor); - TF_LITE_ENSURE(context, weights_time != nullptr); - const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); - const TfLiteTensor* activation_state = - GetInput(context, node, kInputActivationStateTensor); - 
TF_LITE_ENSURE(context, activation_state != nullptr); - - // Define input constants based on input tensor definition above: - const int rank = params->rank; - const int input_size = input->dims->data[1]; - const int batch_size = input->dims->data[0]; - const int num_filters = weights_feature->dims->data[0]; - TF_LITE_ENSURE_EQ(context, num_filters % rank, 0); - const int num_units = num_filters / rank; - const int memory_size = weights_time->dims->data[1]; - - // Validate Input Tensor: - TF_LITE_ENSURE(context, - input->type == kTfLiteFloat32 || input->type == kTfLiteInt8); - TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2); - - // Validate Tensor Output: - // [0] = float/int8_t, {2, batch_size, num_units} - TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE(context, output != nullptr); - TF_LITE_ENSURE_EQ(context, NumDimensions(output), 2); - TF_LITE_ENSURE_EQ(context, output->dims->data[0], batch_size); - TF_LITE_ENSURE_EQ(context, output->dims->data[1], num_units); - - // Validate Weights Feature Input Tensor: - TF_LITE_ENSURE_EQ(context, NumDimensions(weights_feature), 2); - TF_LITE_ENSURE_EQ(context, weights_feature->dims->data[1], input_size); - - // Validate Weights Time Input Tensor: - TF_LITE_ENSURE_EQ(context, NumDimensions(weights_time), 2); - TF_LITE_ENSURE_EQ(context, weights_time->dims->data[0], num_filters); - TF_LITE_ENSURE_EQ(context, weights_time->dims->data[1], memory_size); - - // Validate Optional Bias Input Tensor: - if (bias != nullptr) { - TF_LITE_ENSURE_EQ(context, bias->dims->data[0], num_units); - } - - // Validate Activation State Input Tensor: - TF_LITE_ENSURE_EQ(context, NumDimensions(activation_state), 2); - TF_LITE_ENSURE_EQ(context, activation_state->dims->data[0], batch_size); - TF_LITE_ENSURE_EQ(context, activation_state->dims->data[1], - memory_size * num_filters); - // Since is_variable is not part of TFLiteEvalTensor, check is_variable here. - TF_LITE_ENSURE_EQ(context, activation_state->is_variable, true); - - TF_LITE_ENSURE_EQ(context, node->inputs->size, 5); - - TFLITE_DCHECK(node->user_data != nullptr); - OpData* data = static_cast(node->user_data); - - if (input->type == kTfLiteInt8) { - TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteInt8); - TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteInt16); - TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteInt16); - if (bias != nullptr) { - TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32); - } - - TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8); - - const double effective_scale_1 = static_cast( - input->params.scale * weights_feature->params.scale / - activation_state->params.scale); - const double effective_scale_2 = - static_cast(activation_state->params.scale * - weights_time->params.scale / output->params.scale); - - // TODO(b/162018098): Use TF_LITE_ENSURE_NEAR when it is ready. 
- TF_LITE_ENSURE( - context, - std::abs(static_cast(bias->params.scale) - - static_cast(activation_state->params.scale * - weights_time->params.scale)) < 1e-5); - - QuantizeMultiplier(effective_scale_1, &(data->effective_scale_1_a), - &(data->effective_scale_1_b)); - QuantizeMultiplier(effective_scale_2, &(data->effective_scale_2_a), - &(data->effective_scale_2_b)); - - data->input_zero_point = input->params.zero_point; - data->output_zero_point = output->params.zero_point; - - TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr); - - tflite::hexagon_svdf::HexagonOptimizationEvaluation(context, node); - - if (tflite::hexagon_svdf::HexagonOptimizable(context, node)) { - TF_LITE_ENSURE_OK(context, - tflite::hexagon_svdf::HexagonPrepare(context, node)); - } else { - const TfLiteStatus scratch_status = context->RequestScratchBufferInArena( - context, batch_size * num_filters * sizeof(int32_t), - &(data->scratch_tensor_index)); - TF_LITE_ENSURE_OK(context, scratch_status); - - const TfLiteStatus scratch_output_status = - context->RequestScratchBufferInArena( - context, batch_size * num_units * sizeof(int32_t), - &(data->scratch_output_tensor_index)); - TF_LITE_ENSURE_OK(context, scratch_output_status); - } - } else { - TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteFloat32); - if (bias != nullptr) { - TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32); - } - TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32); - - TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr); - const TfLiteStatus scratch_status = context->RequestScratchBufferInArena( - context, batch_size * num_filters * sizeof(float), - &(data->scratch_tensor_index)); - TF_LITE_ENSURE_OK(context, scratch_status); - } - - return kTfLiteOk; -} +namespace tflite { -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { +TfLiteStatus SvdfEval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); TFLITE_DCHECK(node->user_data != nullptr); - const OpData& data = *(static_cast(node->user_data)); + const HexagonOpDataSvdf& data = + *(static_cast(node->user_data)); const TfLiteEvalTensor* input = - tflite::micro::GetEvalInput(context, node, kInputTensor); + tflite::micro::GetEvalInput(context, node, kSvdfInputTensor); const TfLiteEvalTensor* weights_feature = - tflite::micro::GetEvalInput(context, node, kWeightsFeatureTensor); + tflite::micro::GetEvalInput(context, node, kSvdfWeightsFeatureTensor); const TfLiteEvalTensor* weights_time = - tflite::micro::GetEvalInput(context, node, kWeightsTimeTensor); + tflite::micro::GetEvalInput(context, node, kSvdfWeightsTimeTensor); const TfLiteEvalTensor* bias = (NumInputs(node) == 5) - ? tflite::micro::GetEvalInput(context, node, kBiasTensor) + ? 
tflite::micro::GetEvalInput(context, node, kSvdfBiasTensor) : nullptr; TfLiteEvalTensor* activation_state = tflite::micro::GetMutableEvalInput( - context, node, kInputActivationStateTensor); + context, node, kSvdfInputActivationStateTensor); TfLiteEvalTensor* output = - tflite::micro::GetEvalOutput(context, node, kOutputTensor); + tflite::micro::GetEvalOutput(context, node, kSvdfOutputTensor); switch (weights_feature->type) { case kTfLiteFloat32: { - EvalFloatSVDF(context, node, input, weights_feature, weights_time, bias, - params, data.scratch_tensor_index, activation_state, - output); + EvalFloatSvdfReference(context, node, input, weights_feature, + weights_time, bias, params, + data.reference_op_data.scratch_tensor_index, + activation_state, output); return kTfLiteOk; break; } case kTfLiteInt8: { - if (tflite::hexagon_svdf::HexagonOptimizable(context, node)) { - tflite::hexagon_svdf::HexagonEvalIntegerSVDF( - context, node, input, weights_feature, weights_time, bias, params, - activation_state, output, node->user_data); - } else { - EvalIntegerSVDF(context, node, input, weights_feature, weights_time, - bias, params, activation_state, output, data); - } - return kTfLiteOk; - break; + return HexagonSvdfEvalInt8(context, node); } default: @@ -589,13 +103,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -} // namespace - TfLiteRegistration Register_SVDF() { - return {/*init=*/Init, + return {/*init=*/HexagonSvdfInit, /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, + /*prepare=*/HexagonSvdfPrepare, + /*invoke=*/SvdfEval, /*profiling_string=*/nullptr, /*builtin_code=*/0, /*custom_name=*/nullptr, diff --git a/third_party/hexagon/svdf_int8.cc b/third_party/hexagon/svdf_int8.cc new file mode 100644 index 0000000000000000000000000000000000000000..02599d197f91cfb0f430e957dfe9e5eaa8eabf2c --- /dev/null +++ b/third_party/hexagon/svdf_int8.cc @@ -0,0 +1,137 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/* Copyright 2020 The Qualcomm Innovation Center, Inc. All Rights Reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted (subject to the limitations in the disclaimer +below) provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of Qualcomm Innovation Center, Inc. nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT +NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +==============================================================================*/ + +#include + +#include "hexagon_tflm_translation_svdf.h" +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/op_macros.h" +#include "tensorflow/lite/micro/kernels/activation_utils.h" +#include "tensorflow/lite/micro/kernels/hexagon/hexagon_svdf.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_utils.h" + +namespace tflite { + +TfLiteStatus HexagonSvdfEvalInt8(TfLiteContext* context, TfLiteNode* node) { + auto* params = reinterpret_cast(node->builtin_data); + TFLITE_DCHECK(node->user_data != nullptr); + const HexagonOpDataSvdf& data = + *(static_cast(node->user_data)); + + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kSvdfInputTensor); + const TfLiteEvalTensor* weights_feature = + tflite::micro::GetEvalInput(context, node, kSvdfWeightsFeatureTensor); + const TfLiteEvalTensor* weights_time = + tflite::micro::GetEvalInput(context, node, kSvdfWeightsTimeTensor); + const TfLiteEvalTensor* bias = + (NumInputs(node) == 5) + ? 
tflite::micro::GetEvalInput(context, node, kSvdfBiasTensor) + : nullptr; + TfLiteEvalTensor* activation_state = tflite::micro::GetMutableEvalInput( + context, node, kSvdfInputActivationStateTensor); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kSvdfOutputTensor); + + if (tflite::hexagon_svdf::HexagonOptimizable(context, node)) { + tflite::hexagon_svdf::HexagonEvalIntegerSVDF( + context, node, input, weights_feature, weights_time, bias, params, + activation_state, output, node->user_data); + } else { + EvalIntegerSvdfReference(context, node, input, weights_feature, + weights_time, bias, params, activation_state, + output, data.reference_op_data); + } + return kTfLiteOk; +} + +void* HexagonSvdfInit(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + void* data = context->AllocatePersistentBuffer(context, sizeof(HexagonOpDataSvdf)); + + if (data == nullptr) { + return nullptr; + } + + HexagonOpDataSvdf* opdata = static_cast<HexagonOpDataSvdf*>(data); + opdata->hexagon_data = + tflite::hexagon_svdf::HexagonInit(context, buffer, length); + + return data; +} + +TfLiteStatus HexagonSvdfPrepare(TfLiteContext* context, TfLiteNode* node) { + TfLiteStatus prepare_status = PrepareSvdf(context, node); + if (prepare_status != kTfLiteOk) { + return prepare_status; + } + + tflite::hexagon_svdf::HexagonOptimizationEvaluation(context, node); + + if (tflite::hexagon_svdf::HexagonOptimizable(context, node)) { + TF_LITE_ENSURE_OK(context, + tflite::hexagon_svdf::HexagonPrepare(context, node)); + } + + return kTfLiteOk; +} + +TfLiteRegistration Register_SVDF_INT8() { + return {/*init=*/HexagonSvdfInit, + /*free=*/nullptr, + /*prepare=*/HexagonSvdfPrepare, + /*invoke=*/HexagonSvdfEvalInt8, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite
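Usage sketch (not part of the patch): with these changes, an application that only needs the int8 paths can register the specialized kernel variants directly, exactly as keyword_benchmark.cc now does, and on targets without a specialized implementation the inline fallbacks forward to the generic registrations, so the same code still builds. The helper name and the resolver size below are illustrative assumptions.

  #include "tensorflow/lite/micro/kernels/fully_connected.h"
  #include "tensorflow/lite/micro/kernels/softmax.h"
  #include "tensorflow/lite/micro/kernels/svdf.h"
  #include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

  // Registers only the int8-specialized variants; on reference targets the
  // Register_*_INT8() calls resolve to the inline fallbacks in the headers.
  void RegisterInt8Ops(tflite::MicroMutableOpResolver<4>* op_resolver) {
    op_resolver->AddFullyConnected(tflite::Register_FULLY_CONNECTED_INT8());
    op_resolver->AddQuantize();
    op_resolver->AddSoftmax(tflite::Register_SOFTMAX_INT8_INT16());
    op_resolver->AddSvdf(tflite::Register_SVDF_INT8());
  }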
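The guard-plus-fallback pattern in fully_connected.h and svdf.h is what keeps the Register_*_INT8() calls linkable everywhere. A port adding another specialized target would extend the same guard; MYTARGET below is a hypothetical name used only to illustrate the pattern, and such a target's makefile include would append its kernel sources to MICROLITE_CC_KERNEL_SRCS the same way hexagon.inc does above.

  // In the kernel header: declare the specialized variant only for targets
  // that provide one, and otherwise forward to the generic registration.
  #if defined(CMSIS_NN) || defined(HEXAGON) || defined(MYTARGET)
  TfLiteRegistration Register_FULLY_CONNECTED_INT8();
  #else
  inline TfLiteRegistration Register_FULLY_CONNECTED_INT8() {
    return Register_FULLY_CONNECTED();
  }
  #endif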