未验证 提交 b7cd1204 编写于 作者: Z zhupengyang 提交者: GitHub

scale support int32 input (#3065) (#3078)

上级 f0dba648
......@@ -58,6 +58,43 @@ void scale<float>(
}
}
template <>
void scale<int>(const int* din, int* dout, int num, int scale, int bias) {
int cnt = num >> 4;
int remain = num % 16;
int32x4_t vscale = vdupq_n_s32(scale);
int32x4_t vbias = vdupq_n_s32(bias);
#pragma omp parallel for
for (int i = 0; i < cnt; i++) {
const int* din_ptr = din + (i << 4);
int* dout_ptr = dout + (i << 4);
int32x4_t din0 = vld1q_s32(din_ptr);
int32x4_t din1 = vld1q_s32(din_ptr + 4);
int32x4_t din2 = vld1q_s32(din_ptr + 8);
int32x4_t din3 = vld1q_s32(din_ptr + 12);
int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale);
int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale);
int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale);
int32x4_t vsum4 = vmlaq_s32(vbias, din3, vscale);
vst1q_s32(dout_ptr, vsum1);
vst1q_s32(dout_ptr + 4, vsum2);
vst1q_s32(dout_ptr + 8, vsum3);
vst1q_s32(dout_ptr + 12, vsum4);
}
if (remain > 0) {
const int* din_ptr = din + (cnt << 4);
int* dout_ptr = dout + (cnt << 4);
for (int i = 0; i < remain; i++) {
*dout_ptr = *din_ptr * scale + bias;
dout_ptr++;
din_ptr++;
}
}
}
template <>
void scale<float>(const float* din,
float* dout,
......
......@@ -20,7 +20,7 @@ namespace arm {
namespace math {
template <typename T>
void scale(const T* din, T* dout, int num, float scale, float bias);
void scale(const T* din, T* dout, int num, T scale, T bias);
template <typename T>
void scale(const T* din,
......
......@@ -20,18 +20,18 @@ namespace lite {
namespace kernels {
namespace arm {
void ScaleCompute::Run() {
auto& param = Param<operators::ScaleParam>();
const float* x_data = param.x->data<float>();
float* output_data = param.output->mutable_data<float>();
DDim x_dims = param.x->dims();
bool bias_after_scale = param.bias_after_scale;
float scale = param.scale;
float bias = param.bias;
if (!bias_after_scale) {
template <typename T, PrecisionType PType>
void ScaleCompute<T, PType>::Run() {
auto& param = this->template Param<operators::ScaleParam>();
int num = param.x->numel();
const T* x_data = param.x->template data<T>();
T* output_data = param.output->template mutable_data<T>();
T scale = static_cast<T>(param.scale);
T bias = static_cast<T>(param.bias);
if (!param.bias_after_scale) {
bias *= scale;
}
lite::arm::math::scale(x_data, output_data, x_dims.production(), scale, bias);
lite::arm::math::scale<T>(x_data, output_data, num, scale, bias);
if (!param.x->lod().empty()) {
param.output->set_lod(param.x->lod());
}
......@@ -42,8 +42,16 @@ void ScaleCompute::Run() {
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
scale, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ScaleCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
using scale_float =
paddle::lite::kernels::arm::ScaleCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(scale, kARM, kFloat, kNCHW, scale_float, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
.Finalize();
using scale_int32 =
paddle::lite::kernels::arm::ScaleCompute<int, PRECISION(kInt32)>;
REGISTER_LITE_KERNEL(scale, kARM, kInt32, kNCHW, scale_int32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.Finalize();
......@@ -21,7 +21,8 @@ namespace lite {
namespace kernels {
namespace arm {
class ScaleCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class ScaleCompute : public KernelLite<TARGET(kARM), PType> {
public:
void Run() override;
......
......@@ -41,13 +41,13 @@ void scale_compute_ref(const operators::ScaleParam& param) {
}
TEST(scale_arm, init) {
ScaleCompute scale;
ScaleCompute<float, PRECISION(kFloat)> scale;
ASSERT_EQ(scale.precision(), PRECISION(kFloat));
ASSERT_EQ(scale.target(), TARGET(kARM));
}
TEST(scale_arm, compute) {
ScaleCompute scale;
ScaleCompute<float, PRECISION(kFloat)> scale;
operators::ScaleParam param;
lite::Tensor x;
......
......@@ -29,7 +29,8 @@ class ScaleComputeTester : public arena::TestCase {
DDim x_dims_{{100, 20}};
float scale_ = 0.;
float bias_ = 0.;
bool bias_after_scale_;
bool bias_after_scale_ = true;
PrecisionType x_dtype_ = PRECISION(kFloat);
public:
ScaleComputeTester(const Place& place,
......@@ -37,30 +38,45 @@ class ScaleComputeTester : public arena::TestCase {
const DDim& x_dims,
float scale,
float bias,
bool bias_after_scale)
bool bias_after_scale = true,
PrecisionType x_dtype = PRECISION(kFloat))
: TestCase(place, alias),
x_dims_(x_dims),
scale_(scale),
bias_(bias),
bias_after_scale_(bias_after_scale) {}
bias_after_scale_(bias_after_scale),
x_dtype_(x_dtype) {}
void RunBaseline(Scope* scope) override {
template <typename T>
void RunBaselineHelper(Scope* scope) {
auto* x = scope->FindTensor(x_);
auto* x_data = x->data<T>();
auto* out = scope->NewTensor(out_);
CHECK(out);
out->Resize(x_dims_);
auto* out_data = out->mutable_data<float>();
auto* x = scope->FindTensor(x_);
const auto* x_data = x->data<float>();
float bias = bias_;
T scale = static_cast<T>(scale_);
T bias = static_cast<T>(bias_);
if (!bias_after_scale_) {
bias *= scale_;
bias *= scale;
}
auto out_data = out->mutable_data<T>();
for (int i = 0; i < x_dims_.production(); i++) {
out_data[i] = x_data[i] * scale_ + bias;
out_data[i] = x_data[i] * scale + bias;
}
}
void RunBaseline(Scope* scope) override {
switch (x_dtype_) {
case PRECISION(kFloat):
RunBaselineHelper<float>(scope);
break;
case PRECISION(kInt32):
RunBaselineHelper<int>(scope);
break;
default:
LOG(FATAL) << "unsupported data type: " << PrecisionToStr(x_dtype_);
break;
}
}
......@@ -73,13 +89,74 @@ class ScaleComputeTester : public arena::TestCase {
op_desc->SetAttr("bias_after_scale", bias_after_scale_);
}
template <typename T>
void PrepareDataHelper() {
std::vector<T> dx(x_dims_.production());
fill_data_rand<T>(dx.data(), -10, 10, x_dims_.production());
SetCommonTensor(x_, x_dims_, dx.data());
}
void PrepareData() override {
std::vector<float> x(x_dims_.production());
fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production());
SetCommonTensor(x_, x_dims_, x.data());
switch (x_dtype_) {
case PRECISION(kFloat):
PrepareDataHelper<float>();
break;
case PRECISION(kInt32):
PrepareDataHelper<int>();
break;
default:
LOG(FATAL) << "unsupported data type: " << PrecisionToStr(x_dtype_);
break;
}
}
};
void TestScaleShape(Place place, float abs_error) {
for (auto x_dims :
std::vector<std::vector<int64_t>>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) {
std::unique_ptr<arena::TestCase> tester(
new ScaleComputeTester(place, "def", DDim(x_dims), 1.5f, 0.2f));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
}
void TestScaleValue(Place place, float abs_error) {
for (float scale : {0.123, 0., -1.2}) {
for (float bias : {1., 0., -1.2331}) {
std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester(
place, "def", DDim({5, 2, 3, 4}), scale, bias));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
}
}
void TestScaleOrder(Place place, float abs_error) {
for (bool bias_after_scale : {true, false}) {
std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester(
place, "def", DDim({2, 3, 4, 5}), 1.5f, 0.2f, bias_after_scale));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
}
void TestScaleDtype(Place place, float abs_error) {
for (PrecisionType x_dtype : {PRECISION(kFloat), PRECISION(kInt32)}) {
if (x_dtype == PRECISION(kFloat)) {
place.precision = PRECISION(kFloat);
} else if (x_dtype == PRECISION(kInt32)) {
place.precision = PRECISION(kInt32);
} else {
LOG(FATAL) << "fatal";
}
std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester(
place, "def", DDim({2, 3, 4, 5}), 2.f, 1.f, true, x_dtype));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
}
TEST(Scale, precision) {
Place place;
float abs_error = 2e-5;
......@@ -97,19 +174,12 @@ TEST(Scale, precision) {
return;
#endif
for (auto x_dims :
std::vector<std::vector<int64_t>>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) {
for (float scale : {0.123, 2., -1.2}) {
for (float bias : {1., 0., -1.2331}) {
for (bool bias_after_scale : {true, false}) {
std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester(
place, "def", DDim(x_dims), scale, bias, bias_after_scale));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
}
}
}
TestScaleShape(place, abs_error);
TestScaleValue(place, abs_error);
TestScaleOrder(place, abs_error);
#ifdef LITE_WITH_ARM
TestScaleDtype(place, abs_error);
#endif
}
TEST(Scale, performance) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册