diff --git a/lite/backends/arm/math/scale.cc b/lite/backends/arm/math/scale.cc index 7f2169a6456bb04bda228cf62b89a125e4e2bb2f..5aad98c05c56f85931b7a0276d0a85b426573c4c 100644 --- a/lite/backends/arm/math/scale.cc +++ b/lite/backends/arm/math/scale.cc @@ -58,6 +58,43 @@ void scale( } } +template <> +void scale(const int* din, int* dout, int num, int scale, int bias) { + int cnt = num >> 4; + int remain = num % 16; + int32x4_t vscale = vdupq_n_s32(scale); + int32x4_t vbias = vdupq_n_s32(bias); +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const int* din_ptr = din + (i << 4); + int* dout_ptr = dout + (i << 4); + + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + int32x4_t din2 = vld1q_s32(din_ptr + 8); + int32x4_t din3 = vld1q_s32(din_ptr + 12); + + int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale); + int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale); + int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); + int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale); + + vst1q_s32(dout_ptr, vsum1); + vst1q_s32(dout_ptr + 4, vsum2); + vst1q_s32(dout_ptr + 8, vsum3); + vst1q_s32(dout_ptr + 12, vsum4); + } + if (remain > 0) { + const int* din_ptr = din + (cnt << 4); + int* dout_ptr = dout + (cnt << 4); + for (int i = 0; i < remain; i++) { + *dout_ptr = *din_ptr * scale + bias; + dout_ptr++; + din_ptr++; + } + } +} + template <> void scale(const float* din, float* dout, diff --git a/lite/backends/arm/math/scale.h b/lite/backends/arm/math/scale.h index a86528c9df18cd6ef807bc116686b766ad905d82..9a75f0a1e603327fdab36deb49e241d7483ab77a 100644 --- a/lite/backends/arm/math/scale.h +++ b/lite/backends/arm/math/scale.h @@ -20,7 +20,7 @@ namespace arm { namespace math { template -void scale(const T* din, T* dout, int num, float scale, float bias); +void scale(const T* din, T* dout, int num, T scale, T bias); template void scale(const T* din, diff --git a/lite/kernels/arm/scale_compute.cc b/lite/kernels/arm/scale_compute.cc index 2a46d2212e4f69630e012ae4a497f68db7a01985..71192d7b937116966a5b95a7620805065fdd152e 100644 --- a/lite/kernels/arm/scale_compute.cc +++ b/lite/kernels/arm/scale_compute.cc @@ -20,18 +20,18 @@ namespace lite { namespace kernels { namespace arm { -void ScaleCompute::Run() { - auto& param = Param(); - const float* x_data = param.x->data(); - float* output_data = param.output->mutable_data(); - DDim x_dims = param.x->dims(); - bool bias_after_scale = param.bias_after_scale; - float scale = param.scale; - float bias = param.bias; - if (!bias_after_scale) { +template +void ScaleCompute::Run() { + auto& param = this->template Param(); + int num = param.x->numel(); + const T* x_data = param.x->template data(); + T* output_data = param.output->template mutable_data(); + T scale = static_cast(param.scale); + T bias = static_cast(param.bias); + if (!param.bias_after_scale) { bias *= scale; } - lite::arm::math::scale(x_data, output_data, x_dims.production(), scale, bias); + lite::arm::math::scale(x_data, output_data, num, scale, bias); if (!param.x->lod().empty()) { param.output->set_lod(param.x->lod()); } @@ -42,8 +42,16 @@ void ScaleCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - scale, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ScaleCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) +using scale_float = + paddle::lite::kernels::arm::ScaleCompute; +REGISTER_LITE_KERNEL(scale, kARM, kFloat, kNCHW, scale_float, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .Finalize(); + +using scale_int32 = + paddle::lite::kernels::arm::ScaleCompute; +REGISTER_LITE_KERNEL(scale, kARM, kInt32, kNCHW, scale_int32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .Finalize(); diff --git a/lite/kernels/arm/scale_compute.h b/lite/kernels/arm/scale_compute.h index 4eacfaf8e1231c52f6235d744f62d106bc947212..b7b81c8f047fa92efad26d277040cdff4333521e 100644 --- a/lite/kernels/arm/scale_compute.h +++ b/lite/kernels/arm/scale_compute.h @@ -21,7 +21,8 @@ namespace lite { namespace kernels { namespace arm { -class ScaleCompute : public KernelLite { +template +class ScaleCompute : public KernelLite { public: void Run() override; diff --git a/lite/kernels/arm/scale_compute_test.cc b/lite/kernels/arm/scale_compute_test.cc index 2683f341a23bdf0bb0e534a5df413e91894a3f9f..0d327b9807d306770850b09ed1ed2a0337104c92 100644 --- a/lite/kernels/arm/scale_compute_test.cc +++ b/lite/kernels/arm/scale_compute_test.cc @@ -41,13 +41,13 @@ void scale_compute_ref(const operators::ScaleParam& param) { } TEST(scale_arm, init) { - ScaleCompute scale; + ScaleCompute scale; ASSERT_EQ(scale.precision(), PRECISION(kFloat)); ASSERT_EQ(scale.target(), TARGET(kARM)); } TEST(scale_arm, compute) { - ScaleCompute scale; + ScaleCompute scale; operators::ScaleParam param; lite::Tensor x; diff --git a/lite/tests/kernels/scale_compute_test.cc b/lite/tests/kernels/scale_compute_test.cc index 1ededcd52d3fb4c8881a391dce5e7f22e87cdb44..efd0497002ee402426a7198bf47ec60c7f41d2fd 100644 --- a/lite/tests/kernels/scale_compute_test.cc +++ b/lite/tests/kernels/scale_compute_test.cc @@ -29,7 +29,8 @@ class ScaleComputeTester : public arena::TestCase { DDim x_dims_{{100, 20}}; float scale_ = 0.; float bias_ = 0.; - bool bias_after_scale_; + bool bias_after_scale_ = true; + PrecisionType x_dtype_ = PRECISION(kFloat); public: ScaleComputeTester(const Place& place, @@ -37,30 +38,45 @@ class ScaleComputeTester : public arena::TestCase { const DDim& x_dims, float scale, float bias, - bool bias_after_scale) + bool bias_after_scale = true, + PrecisionType x_dtype = PRECISION(kFloat)) : TestCase(place, alias), x_dims_(x_dims), scale_(scale), bias_(bias), - bias_after_scale_(bias_after_scale) {} + bias_after_scale_(bias_after_scale), + x_dtype_(x_dtype) {} - void RunBaseline(Scope* scope) override { + template + void RunBaselineHelper(Scope* scope) { + auto* x = scope->FindTensor(x_); + auto* x_data = x->data(); auto* out = scope->NewTensor(out_); - CHECK(out); out->Resize(x_dims_); - auto* out_data = out->mutable_data(); - - auto* x = scope->FindTensor(x_); - const auto* x_data = x->data(); - - float bias = bias_; + T scale = static_cast(scale_); + T bias = static_cast(bias_); if (!bias_after_scale_) { - bias *= scale_; + bias *= scale; } + auto out_data = out->mutable_data(); for (int i = 0; i < x_dims_.production(); i++) { - out_data[i] = x_data[i] * scale_ + bias; + out_data[i] = x_data[i] * scale + bias; + } + } + + void RunBaseline(Scope* scope) override { + switch (x_dtype_) { + case PRECISION(kFloat): + RunBaselineHelper(scope); + break; + case PRECISION(kInt32): + RunBaselineHelper(scope); + break; + default: + LOG(FATAL) << "unsupported data type: " << PrecisionToStr(x_dtype_); + break; } } @@ -73,13 +89,74 @@ class ScaleComputeTester : public arena::TestCase { op_desc->SetAttr("bias_after_scale", bias_after_scale_); } + template + void PrepareDataHelper() { + std::vector dx(x_dims_.production()); + fill_data_rand(dx.data(), -10, 10, x_dims_.production()); + SetCommonTensor(x_, x_dims_, dx.data()); + } + void PrepareData() override { - std::vector x(x_dims_.production()); - fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production()); - SetCommonTensor(x_, x_dims_, x.data()); + switch (x_dtype_) { + case PRECISION(kFloat): + PrepareDataHelper(); + break; + case PRECISION(kInt32): + PrepareDataHelper(); + break; + default: + LOG(FATAL) << "unsupported data type: " << PrecisionToStr(x_dtype_); + break; + } } }; +void TestScaleShape(Place place, float abs_error) { + for (auto x_dims : + std::vector>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) { + std::unique_ptr tester( + new ScaleComputeTester(place, "def", DDim(x_dims), 1.5f, 0.2f)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } +} + +void TestScaleValue(Place place, float abs_error) { + for (float scale : {0.123, 0., -1.2}) { + for (float bias : {1., 0., -1.2331}) { + std::unique_ptr tester(new ScaleComputeTester( + place, "def", DDim({5, 2, 3, 4}), scale, bias)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } + } +} + +void TestScaleOrder(Place place, float abs_error) { + for (bool bias_after_scale : {true, false}) { + std::unique_ptr tester(new ScaleComputeTester( + place, "def", DDim({2, 3, 4, 5}), 1.5f, 0.2f, bias_after_scale)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } +} + +void TestScaleDtype(Place place, float abs_error) { + for (PrecisionType x_dtype : {PRECISION(kFloat), PRECISION(kInt32)}) { + if (x_dtype == PRECISION(kFloat)) { + place.precision = PRECISION(kFloat); + } else if (x_dtype == PRECISION(kInt32)) { + place.precision = PRECISION(kInt32); + } else { + LOG(FATAL) << "fatal"; + } + std::unique_ptr tester(new ScaleComputeTester( + place, "def", DDim({2, 3, 4, 5}), 2.f, 1.f, true, x_dtype)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } +} + TEST(Scale, precision) { Place place; float abs_error = 2e-5; @@ -97,19 +174,12 @@ TEST(Scale, precision) { return; #endif - for (auto x_dims : - std::vector>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) { - for (float scale : {0.123, 2., -1.2}) { - for (float bias : {1., 0., -1.2331}) { - for (bool bias_after_scale : {true, false}) { - std::unique_ptr tester(new ScaleComputeTester( - place, "def", DDim(x_dims), scale, bias, bias_after_scale)); - arena::Arena arena(std::move(tester), place, abs_error); - arena.TestPrecision(); - } - } - } - } + TestScaleShape(place, abs_error); + TestScaleValue(place, abs_error); + TestScaleOrder(place, abs_error); +#ifdef LITE_WITH_ARM + TestScaleDtype(place, abs_error); +#endif } TEST(Scale, performance) {