scale support int32 input (#3065) (#3078)

b7cd1204 · zhupengyang · GitHub · f0dba648 · b7cd1204 · b7cd1204
6 changed file
--- a/lite/backends/arm/math/scale.cc
+++ b/lite/backends/arm/math/scale.cc
@@ -58,6 +58,43 @@ void scale<float>(
  }
 }

+template <>
+void scale<int>(const int* din, int* dout, int num, int scale, int bias) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+  int32x4_t vscale = vdupq_n_s32(scale);
+  int32x4_t vbias = vdupq_n_s32(bias);
+#pragma omp parallel for
+  for (int i = 0; i < cnt; i++) {
+    const int* din_ptr = din + (i << 4);
+    int* dout_ptr = dout + (i << 4);
+
+    int32x4_t din0 = vld1q_s32(din_ptr);
+    int32x4_t din1 = vld1q_s32(din_ptr + 4);
+    int32x4_t din2 = vld1q_s32(din_ptr + 8);
+    int32x4_t din3 = vld1q_s32(din_ptr + 12);
+
+    int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale);
+    int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale);
+    int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale);
+    int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale);
+
+    vst1q_s32(dout_ptr, vsum1);
+    vst1q_s32(dout_ptr + 4, vsum2);
+    vst1q_s32(dout_ptr + 8, vsum3);
+    vst1q_s32(dout_ptr + 12, vsum4);
+  }
+  if (remain > 0) {
+    const int* din_ptr = din + (cnt << 4);
+    int* dout_ptr = dout + (cnt << 4);
+    for (int i = 0; i < remain; i++) {
+      *dout_ptr = *din_ptr * scale + bias;
+      dout_ptr++;
+      din_ptr++;
+    }
+  }
+}
+
 template <>
 void scale<float>(const float* din,
                  float* dout,

--- a/lite/backends/arm/math/scale.h
+++ b/lite/backends/arm/math/scale.h
@@ -20,7 +20,7 @@ namespace arm {
 namespace math {

 template <typename T>
-void scale(const T* din, T* dout, int num, float scale, float bias);
+void scale(const T* din, T* dout, int num, T scale, T bias);

 template <typename T>
 void scale(const T* din,

--- a/lite/kernels/arm/scale_compute.cc
+++ b/lite/kernels/arm/scale_compute.cc
@@ -20,18 +20,18 @@ namespace lite {
 namespace kernels {
 namespace arm {

-void ScaleCompute::Run() {
-  auto& param = Param<operators::ScaleParam>();
-  const float* x_data = param.x->data<float>();
-  float* output_data = param.output->mutable_data<float>();
-  DDim x_dims = param.x->dims();
-  bool bias_after_scale = param.bias_after_scale;
-  float scale = param.scale;
-  float bias = param.bias;
-  if (!bias_after_scale) {
+template <typename T, PrecisionType PType>
+void ScaleCompute<T, PType>::Run() {
+  auto& param = this->template Param<operators::ScaleParam>();
+  int num = param.x->numel();
+  const T* x_data = param.x->template data<T>();
+  T* output_data = param.output->template mutable_data<T>();
+  T scale = static_cast<T>(param.scale);
+  T bias = static_cast<T>(param.bias);
+  if (!param.bias_after_scale) {
    bias *= scale;
  }
-  lite::arm::math::scale(x_data, output_data, x_dims.production(), scale, bias);
+  lite::arm::math::scale<T>(x_data, output_data, num, scale, bias);
  if (!param.x->lod().empty()) {
    param.output->set_lod(param.x->lod());
  }
@@ -42,8 +42,16 @@ void ScaleCompute::Run() {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_LITE_KERNEL(
-    scale, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ScaleCompute, def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+using scale_float =
+    paddle::lite::kernels::arm::ScaleCompute<float, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(scale, kARM, kFloat, kNCHW, scale_float, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
+    .Finalize();
+
+using scale_int32 =
+    paddle::lite::kernels::arm::ScaleCompute<int, PRECISION(kInt32)>;
+REGISTER_LITE_KERNEL(scale, kARM, kInt32, kNCHW, scale_int32, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .Finalize();
--- a/lite/kernels/arm/scale_compute.h
+++ b/lite/kernels/arm/scale_compute.h
@@ -21,7 +21,8 @@ namespace lite {
 namespace kernels {
 namespace arm {

-class ScaleCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+template <typename T, PrecisionType PType>
+class ScaleCompute : public KernelLite<TARGET(kARM), PType> {
 public:
  void Run() override;


--- a/lite/kernels/arm/scale_compute_test.cc
+++ b/lite/kernels/arm/scale_compute_test.cc
@@ -41,13 +41,13 @@ void scale_compute_ref(const operators::ScaleParam& param) {
 }

 TEST(scale_arm, init) {
-  ScaleCompute scale;
+  ScaleCompute<float, PRECISION(kFloat)> scale;
  ASSERT_EQ(scale.precision(), PRECISION(kFloat));
  ASSERT_EQ(scale.target(), TARGET(kARM));
 }

 TEST(scale_arm, compute) {
-  ScaleCompute scale;
+  ScaleCompute<float, PRECISION(kFloat)> scale;
  operators::ScaleParam param;

  lite::Tensor x;

--- a/lite/tests/kernels/scale_compute_test.cc
+++ b/lite/tests/kernels/scale_compute_test.cc
@@ -29,7 +29,8 @@ class ScaleComputeTester : public arena::TestCase {
  DDim x_dims_{{100, 20}};
  float scale_ = 0.;
  float bias_ = 0.;
-  bool bias_after_scale_;
+  bool bias_after_scale_ = true;
+  PrecisionType x_dtype_ = PRECISION(kFloat);

 public:
  ScaleComputeTester(const Place& place,
@@ -37,30 +38,45 @@ class ScaleComputeTester : public arena::TestCase {
                     const DDim& x_dims,
                     float scale,
                     float bias,
-                     bool bias_after_scale)
+                     bool bias_after_scale = true,
+                     PrecisionType x_dtype = PRECISION(kFloat))
      : TestCase(place, alias),
        x_dims_(x_dims),
        scale_(scale),
        bias_(bias),
-        bias_after_scale_(bias_after_scale) {}
+        bias_after_scale_(bias_after_scale),
+        x_dtype_(x_dtype) {}

-  void RunBaseline(Scope* scope) override {
+  template <typename T>
+  void RunBaselineHelper(Scope* scope) {
+    auto* x = scope->FindTensor(x_);
+    auto* x_data = x->data<T>();
    auto* out = scope->NewTensor(out_);
-    CHECK(out);
    out->Resize(x_dims_);
-    auto* out_data = out->mutable_data<float>();
-
-    auto* x = scope->FindTensor(x_);
-    const auto* x_data = x->data<float>();
-
-    float bias = bias_;

+    T scale = static_cast<T>(scale_);
+    T bias = static_cast<T>(bias_);
    if (!bias_after_scale_) {
-      bias *= scale_;
+      bias *= scale;
    }

+    auto out_data = out->mutable_data<T>();
    for (int i = 0; i < x_dims_.production(); i++) {
-      out_data[i] = x_data[i] * scale_ + bias;
+      out_data[i] = x_data[i] * scale + bias;
+    }
+  }
+
+  void RunBaseline(Scope* scope) override {
+    switch (x_dtype_) {
+      case PRECISION(kFloat):
+        RunBaselineHelper<float>(scope);
+        break;
+      case PRECISION(kInt32):
+        RunBaselineHelper<int>(scope);
+        break;
+      default:
+        LOG(FATAL) << "unsupported data type: " << PrecisionToStr(x_dtype_);
+        break;
    }
  }

@@ -73,13 +89,74 @@ class ScaleComputeTester : public arena::TestCase {
    op_desc->SetAttr("bias_after_scale", bias_after_scale_);
  }

+  template <typename T>
+  void PrepareDataHelper() {
+    std::vector<T> dx(x_dims_.production());
+    fill_data_rand<T>(dx.data(), -10, 10, x_dims_.production());
+    SetCommonTensor(x_, x_dims_, dx.data());
+  }
+
  void PrepareData() override {
-    std::vector<float> x(x_dims_.production());
-    fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production());
-    SetCommonTensor(x_, x_dims_, x.data());
+    switch (x_dtype_) {
+      case PRECISION(kFloat):
+        PrepareDataHelper<float>();
+        break;
+      case PRECISION(kInt32):
+        PrepareDataHelper<int>();
+        break;
+      default:
+        LOG(FATAL) << "unsupported data type: " << PrecisionToStr(x_dtype_);
+        break;
+    }
  }
 };

+void TestScaleShape(Place place, float abs_error) {
+  for (auto x_dims :
+       std::vector<std::vector<int64_t>>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) {
+    std::unique_ptr<arena::TestCase> tester(
+        new ScaleComputeTester(place, "def", DDim(x_dims), 1.5f, 0.2f));
+    arena::Arena arena(std::move(tester), place, abs_error);
+    arena.TestPrecision();
+  }
+}
+
+void TestScaleValue(Place place, float abs_error) {
+  for (float scale : {0.123, 0., -1.2}) {
+    for (float bias : {1., 0., -1.2331}) {
+      std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester(
+          place, "def", DDim({5, 2, 3, 4}), scale, bias));
+      arena::Arena arena(std::move(tester), place, abs_error);
+      arena.TestPrecision();
+    }
+  }
+}
+
+void TestScaleOrder(Place place, float abs_error) {
+  for (bool bias_after_scale : {true, false}) {
+    std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester(
+        place, "def", DDim({2, 3, 4, 5}), 1.5f, 0.2f, bias_after_scale));
+    arena::Arena arena(std::move(tester), place, abs_error);
+    arena.TestPrecision();
+  }
+}
+
+void TestScaleDtype(Place place, float abs_error) {
+  for (PrecisionType x_dtype : {PRECISION(kFloat), PRECISION(kInt32)}) {
+    if (x_dtype == PRECISION(kFloat)) {
+      place.precision = PRECISION(kFloat);
+    } else if (x_dtype == PRECISION(kInt32)) {
+      place.precision = PRECISION(kInt32);
+    } else {
+      LOG(FATAL) << "fatal";
+    }
+    std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester(
+        place, "def", DDim({2, 3, 4, 5}), 2.f, 1.f, true, x_dtype));
+    arena::Arena arena(std::move(tester), place, abs_error);
+    arena.TestPrecision();
+  }
+}
+
 TEST(Scale, precision) {
  Place place;
  float abs_error = 2e-5;
@@ -97,19 +174,12 @@ TEST(Scale, precision) {
  return;
 #endif

-  for (auto x_dims :
-       std::vector<std::vector<int64_t>>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) {
-    for (float scale : {0.123, 2., -1.2}) {
-      for (float bias : {1., 0., -1.2331}) {
-        for (bool bias_after_scale : {true, false}) {
-          std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester(
-              place, "def", DDim(x_dims), scale, bias, bias_after_scale));
-          arena::Arena arena(std::move(tester), place, abs_error);
-          arena.TestPrecision();
-        }
-      }
-    }
-  }
+  TestScaleShape(place, abs_error);
+  TestScaleValue(place, abs_error);
+  TestScaleOrder(place, abs_error);
+#ifdef LITE_WITH_ARM
+  TestScaleDtype(place, abs_error);
+#endif
 }

 TEST(Scale, performance) {