diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index 6e8390ce5509c836a5dfd4859fc6495301e77a8e..723aa7abd2e814164ab8a24f632d2fa8ed4d5b01 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -76,7 +77,6 @@ using reference_ops::Less;
 using reference_ops::LessEqual;
 using reference_ops::LessEqualWithScaling;
 using reference_ops::LessWithScaling;
-using reference_ops::Mean;
 using reference_ops::ProcessBroadcastShapes;
 using reference_ops::RankOneSelect;
 using reference_ops::Relu1;
@@ -1743,6 +1743,221 @@ inline void ShuffledFullyConnected(
   gemm_context->workers_pool()->Execute(tasks);
 }
 
+inline void MeanImpl(const tflite::MeanParams& op_params,
+                     const RuntimeShape& input_shape, const uint8_t* input_data,
+                     int32 input_zero_point, float input_scale,
+                     const RuntimeShape& output_shape, uint8_t* output_data,
+                     int32 output_zero_point, float output_scale,
+                     int start_depth, int end_depth) {
+  gemmlowp::ScopedProfilingLabel label("Mean4D/Uint8/MeanImpl");
+
+  // The current implementation only supports 4D tensors and simultaneous
+  // reduction over width and height.
+  const int output_batch = output_shape.Dims(0);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const float num_elements_in_axis = input_width * input_height;
+
+  TFLITE_DCHECK_EQ(op_params.axis_count, 2);
+  TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+                (op_params.axis[0] == 2 && op_params.axis[1] == 1));
+  TFLITE_DCHECK_EQ(output_height, 1);
+  TFLITE_DCHECK_EQ(output_width, 1);
+
+  const bool ordinary_mean =
+      (input_zero_point == output_zero_point && input_scale == output_scale);
+  float scale, bias;
+  if (!ordinary_mean) {
+    scale = input_scale / output_scale;
+    bias = -input_zero_point * scale + 0.5;
+  }
+
+#ifdef USE_NEON
+  const float32x4_t num_elements_dup = vdupq_n_f32(num_elements_in_axis);
+  // This is only an approximation as NEON does not offer a division instruction.
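+  // vrecpeq_f32 only returns a low-precision reciprocal estimate (roughly 8
+  // bits); the scalar tail loop below still performs an exact division.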
+  const float32x4_t num_elements_reverse = vrecpeq_f32(num_elements_dup);
+  const float32x4_t kRounding = vdupq_n_f32(0.5);
+  float32x4_t bias_dup;
+  float32x4_t output_zero_point_dup;
+  if (!ordinary_mean) {
+    bias_dup = vdupq_n_f32(bias);
+    output_zero_point_dup = vdupq_n_f32(output_zero_point);
+  }
+#endif
+
+  for (int out_b = 0; out_b < output_batch; ++out_b) {
+    int out_d = start_depth;
+#ifdef USE_NEON
+
+    for (; out_d < end_depth - 8; out_d += 8) {
+      float32x4_t temp_sum_1 = vdupq_n_f32(0);
+      float32x4_t temp_sum_2 = vdupq_n_f32(0);
+      for (int in_h = 0; in_h < input_height; ++in_h) {
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          const uint8_t* input_data_ptr =
+              input_data + Offset(input_shape, out_b, in_h, in_w, out_d);
+          uint8x8_t input_data_val = vld1_u8(input_data_ptr);
+          int16x8_t input_data_val_shift =
+              vreinterpretq_s16_u16(vmovl_u8(input_data_val));
+          float32x4_t input_float_1 =
+              vcvtq_f32_s32(vmovl_s16(vget_high_s16(input_data_val_shift)));
+          float32x4_t input_float_2 =
+              vcvtq_f32_s32(vmovl_s16(vget_low_s16(input_data_val_shift)));
+          temp_sum_1 = vaddq_f32(temp_sum_1, input_float_1);
+          temp_sum_2 = vaddq_f32(temp_sum_2, input_float_2);
+        }
+      }
+
+      float32x4_t mean_1 = vmulq_f32(temp_sum_1, num_elements_reverse);
+      float32x4_t mean_2 = vmulq_f32(temp_sum_2, num_elements_reverse);
+
+      if (!ordinary_mean) {
+        // A fused multiply-add is not used here; apply the scale and the bias
+        // as two separate ops.
+        mean_1 = vmulq_n_f32(mean_1, scale);
+        mean_1 = vaddq_f32(mean_1, bias_dup);
+        mean_2 = vmulq_n_f32(mean_2, scale);
+        mean_2 = vaddq_f32(mean_2, bias_dup);
+      }
+
+      if (!ordinary_mean) {
+        mean_1 = vaddq_f32(mean_1, output_zero_point_dup);
+        mean_2 = vaddq_f32(mean_2, output_zero_point_dup);
+      }
+
+      // Rounding.
+      mean_1 = vaddq_f32(mean_1, kRounding);
+      mean_2 = vaddq_f32(mean_2, kRounding);
+      uint32x4_t casted_mean_1 = vcvtq_u32_f32(mean_1);
+      uint16x4_t narrow_range_mean_1 = vmovn_u32(casted_mean_1);
+      uint32x4_t casted_mean_2 = vcvtq_u32_f32(mean_2);
+      uint16x4_t narrow_range_mean_2 = vmovn_u32(casted_mean_2);
+      uint16x8_t combined_mean =
+          vcombine_u16(narrow_range_mean_2, narrow_range_mean_1);
+      uint8x8_t narrowed_combined_mean = vmovn_u16(combined_mean);
+      uint8_t* output_data_ptr =
+          output_data + Offset(output_shape, out_b, 0, 0, out_d);
+      vst1_u8(output_data_ptr, narrowed_combined_mean);
+    }
+#endif
+
+    for (; out_d < end_depth; ++out_d) {
+      float temp_value = 0;
+      for (int in_h = 0; in_h < input_height; ++in_h) {
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          temp_value +=
+              input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
+        }
+      }
+
+      temp_value = temp_value / num_elements_in_axis;
+      if (ordinary_mean) {
+        output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
+            static_cast<uint8_t>(round(temp_value));
+      } else {
+        output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
+            static_cast<uint8_t>(round(temp_value * scale + bias)) +
+            output_zero_point;
+      }
+    }
+  }
+}
+
+struct MeanWorkerTask : public gemmlowp::Task {
+  MeanWorkerTask(const tflite::MeanParams& op_params,
+                 const RuntimeShape& input_shape, const uint8_t* input_data,
+                 int32 input_zero_point, float input_scale,
+                 const RuntimeShape& output_shape, uint8_t* output_data,
+                 int32 output_zero_point, float output_scale, int start_height,
+                 int end_height)
+      : op_params_(op_params),
+        input_shape_(input_shape),
+        input_data_(input_data),
+        input_zero_point_(input_zero_point),
+        input_scale_(input_scale),
+        output_shape_(output_shape),
+        output_data_(output_data),
+        output_zero_point_(output_zero_point),
+        output_scale_(output_scale),
+        start_height_(start_height),
+        end_height_(end_height) {}
+
+  void Run() override {
+    MeanImpl(op_params_, input_shape_, input_data_, input_zero_point_,
+             input_scale_, output_shape_, output_data_, output_zero_point_,
+             output_scale_, start_height_, end_height_);
+  }
+
+ private:
+  const tflite::MeanParams& op_params_;
+  const RuntimeShape& input_shape_;
+  const uint8_t* input_data_;
+  int32 input_zero_point_;
+  float input_scale_;
+  const RuntimeShape& output_shape_;
+  uint8_t* output_data_;
+  int32 output_zero_point_;
+  float output_scale_;
+  int start_height_;
+  int end_height_;
+  gemmlowp::GemmContext* gemm_context_;
+};
+
+inline void Mean(const tflite::MeanParams& op_params,
+                 const RuntimeShape& unextended_input_shape,
+                 const uint8_t* input_data, int32 input_zero_point,
+                 float input_scale, const RuntimeShape& unextended_output_shape,
+                 uint8_t* output_data, int32 output_zero_point,
+                 float output_scale, gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label("Mean4D/Uint8");
+
+  // The current implementation only supports 4D tensors and simultaneous
+  // reduction over width and height.
+  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int output_depth = output_shape.Dims(3);
+
+  TFLITE_DCHECK_EQ(op_params.axis_count, 2);
+  TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+                (op_params.axis[0] == 2 && op_params.axis[1] == 1));
+  TFLITE_DCHECK_EQ(output_height, 1);
+  TFLITE_DCHECK_EQ(output_width, 1);
+
+  constexpr int kMinDepthPerThread = 8;
+  int thread_count = output_depth / kMinDepthPerThread;
+  thread_count = thread_count > 0 ? thread_count : 1;
+  const int capped_thread_count =
+      std::min(thread_count, gemm_context->max_num_threads());
+
+  if (capped_thread_count == 1) {
+    MeanImpl(op_params, input_shape, input_data, input_zero_point, input_scale,
+             output_shape, output_data, output_zero_point, output_scale, 0,
+             output_depth);
+  } else {
+    // Instead of parallelizing over the batch, split the work over
+    // output_depth, since the batch size is typically 1.
+    std::vector<gemmlowp::Task*> tasks(capped_thread_count);
+    int depth_start = 0;
+    for (int i = 0; i < capped_thread_count; ++i) {
+      // Try to distribute the work as evenly as possible.
+      int depth_end = depth_start +
+                      (output_depth - depth_start) / (capped_thread_count - i);
+      tasks[i] = new MeanWorkerTask(op_params, input_shape, input_data,
+                                    input_zero_point, input_scale, output_shape,
+                                    output_data, output_zero_point,
+                                    output_scale, depth_start, depth_end);
+      depth_start = depth_end;
+    }
+    gemm_context->workers_pool()->Execute(tasks);
+  }
+}
+
 template <typename T>
 inline void ExtractPatchIntoBufferColumn(const RuntimeShape& input_shape, int w,
                                          int h, int b, int kheight, int kwidth,
diff --git a/tensorflow/lite/kernels/reduce.cc b/tensorflow/lite/kernels/reduce.cc
index 3fb2715e48fabeee09b5d6670b6a45cf26d89bc8..95d2370aaa8da32b4fdf9b186a809e34c9ff9156 100644
--- a/tensorflow/lite/kernels/reduce.cc
+++ b/tensorflow/lite/kernels/reduce.cc
@@ -17,6 +17,8 @@ limitations under the License.
 #include
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/gemm_support.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
@@ -49,6 +51,7 @@ struct OpContext {
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  gemm_support::IncrementUsageCounter(context);
   // Creates two temp tensors to store index and axis for internal
   // implementation only.
   auto* scratch_tensor_index = new int;
@@ -57,6 +60,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
 }
 
 void Free(TfLiteContext* context, void* buffer) {
+  gemm_support::DecrementUsageCounter(context);
   delete reinterpret_cast<int*>(buffer);
 }
 
@@ -248,6 +252,7 @@ void ResolveAxis(const int* axis_data, int axis_count,
 template <KernelType kernel_type>
 TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
   OpContext op_context(context, node);
+
   int num_axis = static_cast<int>(NumElements(op_context.axis));
   TfLiteTensor* temp_index = GetTemporary(context, node, /*index=*/0);
   TfLiteTensor* resolved_axis = GetTemporary(context, node, /*index=*/1);
@@ -272,13 +277,15 @@ TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
       ((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
        (op_params.axis[0] == 2 && op_params.axis[1] == 1))) {
     if (op_context.input->type == kTfLiteUInt8) {
-      reference_ops::Mean(
+      gemmlowp::GemmContext* gemm_context =
+          gemm_support::GetFromContext(context);
+      optimized_ops::Mean(
           op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
           op_context.input->params.zero_point, op_context.input->params.scale,
          GetTensorShape(op_context.output),
          GetTensorData<uint8_t>(op_context.output),
          op_context.output->params.zero_point,
-          op_context.output->params.scale);
+          op_context.output->params.scale, gemm_context);
     } else {
       reference_ops::Mean(op_params, GetTensorShape(input),
                          GetTensorData<float>(input),
diff --git a/tensorflow/lite/kernels/reduce_test.cc b/tensorflow/lite/kernels/reduce_test.cc
index 373fa56fc1983baec55ed6421950012f4564e9c6..f9f3cdecbdf48f346ad77aff60f2f8ca2973928c 100644
--- a/tensorflow/lite/kernels/reduce_test.cc
+++ b/tensorflow/lite/kernels/reduce_test.cc
@@ -259,7 +259,7 @@ TEST(ConstFloatMeanOpTest, KeepDims) {
 
 // Uses a set of reduction conditions that trigger the specialized 4D version
 // of Mean.
-TEST(ConstFloatMeanOpTest, KeepDims_4DMean) {
+TEST(ConstFloatMeanOpTest, KeepDims4DMean) {
   std::vector<float> data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
                              9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
                              17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
@@ -272,7 +272,7 @@ TEST(ConstFloatMeanOpTest, KeepDims_4DMean) {
               ElementsAreArray(ArrayFloatNear({6, 7, 18, 19})));
 }
 
-TEST(ConstFloatMeanOpTest, KeepDims_4DMean_UInt8) {
+TEST(ConstFloatMeanOpTest, KeepDims4DMeanUInt8) {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<float> data = {0.1, 0.2, 0.3, 0.4, 0.1, 0.2,
                              0.3, 0.4, 0.1, 0.2, 0.3, 0.4};
@@ -286,7 +286,24 @@ TEST(ConstFloatMeanOpTest, KeepDims_4DMean_UInt8) {
                   kQuantizedTolerance)));
 }
 
-TEST(ConstFloatMeanOpTest, KeepDims_4DMean_Quantized) {
+TEST(ConstFloatMeanOpTest, KeepDims4DMeanLargeDepthUInt8) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 5.0);
+  std::vector<float> data = {0.1, 0.2, 0.3, 0.4, 0.2, 0.3, 0.4, 0.5, 0.1,
+                             0.1, 0.1, 0.1, 0.4, 0.2, 0.2, 0.2, 0.9, 0.9,
+                             0.9, 0.9, 0.2, 0.3, 0.7, 0.7, 0.1, 0.1, 0.3,
+                             0.3, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4};
+  MeanOpConstModel m({TensorType_UINT8, {1, 2, 2, 9}, -1.0, 1.0},
+                     {TensorType_UINT8, {2}, -1.0, 1.0}, {2}, {1, 2}, true);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 1, 9}));
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {0.35, 0.325, 0.2, 0.35, 0.375, 0.325, 0.225, 0.45, 0.425},
+                  kQuantizedTolerance)));
+}
+
+TEST(ConstFloatMeanOpTest, KeepDims4DMeanQuantized) {
   float kQuantizedTolerance = GetTolerance(-5.0, 5.0);
   std::vector<float> data = {0.1, 0.2, 0.3, 0.4, 0.1, 0.2,
                              0.3, 0.4, 0.1, 0.2, 0.3, 0.4};
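Note on the task split in the new optimized_ops::Mean above: task i receives (remaining depth) / (remaining tasks) channels, so any remainder is spread across the later tasks rather than piled onto the last one. The standalone sketch below only illustrates that partitioning scheme; the PartitionDepth helper and the example sizes are illustrative and are not part of the patch.

#include <iostream>
#include <utility>
#include <vector>

// Illustrative only: mirrors the depth-partitioning loop in optimized_ops::Mean.
// Task i gets (remaining depth) / (remaining tasks) channels, so any remainder
// is spread across the later tasks instead of all landing on the last one.
std::vector<std::pair<int, int>> PartitionDepth(int output_depth,
                                                int thread_count) {
  std::vector<std::pair<int, int>> ranges;
  int depth_start = 0;
  for (int i = 0; i < thread_count; ++i) {
    const int depth_end =
        depth_start + (output_depth - depth_start) / (thread_count - i);
    ranges.emplace_back(depth_start, depth_end);
    depth_start = depth_end;
  }
  return ranges;
}

int main() {
  // 22 output channels over 4 threads -> [0, 5) [5, 10) [10, 16) [16, 22).
  for (const auto& range : PartitionDepth(22, 4)) {
    std::cout << "[" << range.first << ", " << range.second << ") ";
  }
  std::cout << "\n";
  return 0;
}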