From 3c9a1a1d66a88a6f539e41ada3d6271fda83506f Mon Sep 17 00:00:00 2001
From: hong19860320 <9973393+hong19860320@users.noreply.github.com>
Date: Fri, 18 Sep 2020 09:54:34 +0800
Subject: [PATCH] [cherry-pick] Fix build error caused by flatbuffer if the
 target is tiny_publish, and fix the missing of the attr name of the output
 scale (#4356)

* [LITE][XPU] 1. Add sequence_unpad kernel for XPU; 2. Bugfix in
  sequence_unpad kernel for x86, as InferShapeImpl() is now empty in
  lite/operators/sequence_unpad_op.cc; 3. Refine TargetWrapperXPU; (#4237)

* [Core] Fix the missing of the attr name of the output scale (#4334)

* [NPU] Fix build error caused by flatbuffer if the target is tiny_publish
  (#4340)
  test=develop

Co-authored-by: Cwndmiao
---
 lite/backends/xpu/target_wrapper.cc           | 23 ++++-
 lite/backends/xpu/target_wrapper.h            | 14 +--
 lite/core/op_lite.cc                          |  1 +
 lite/kernels/apu/bridges/conv_op.cc           |  4 +-
 lite/kernels/apu/bridges/pool_op.cc           |  2 +-
 lite/kernels/npu/bridges/conv_op.cc           |  4 +-
 lite/kernels/npu/bridges/conv_transpose_op.cc |  4 +-
 lite/kernels/npu/bridges/pad2d_op.cc          |  2 +-
 lite/kernels/npu/bridges/pool_op.cc           |  2 +-
 lite/kernels/npu/bridges/reduce_mean_op.cc    |  2 +-
 lite/kernels/rknpu/bridges/conv_op.cc         |  4 +-
 lite/kernels/rknpu/bridges/pool_op.cc         |  2 +-
 lite/kernels/x86/sequence_unpad_compute.h     | 25 +++++
 lite/kernels/xpu/CMakeLists.txt               |  1 +
 lite/kernels/xpu/bridges/conv_op.cc           |  4 +-
 lite/kernels/xpu/bridges/pool_op.cc           |  2 +-
 lite/kernels/xpu/sequence_pool_compute.cc     |  2 +
 lite/kernels/xpu/sequence_unpad_compute.cc    | 98 +++++++++++++++++++
 lite/kernels/xpu/sequence_unpad_compute.h     | 44 +++++++++
 19 files changed, 216 insertions(+), 24 deletions(-)
 create mode 100644 lite/kernels/xpu/sequence_unpad_compute.cc
 create mode 100644 lite/kernels/xpu/sequence_unpad_compute.h

diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc
index a3d8729410..5f5eae4703 100644
--- a/lite/backends/xpu/target_wrapper.cc
+++ b/lite/backends/xpu/target_wrapper.cc
@@ -18,6 +18,27 @@
 namespace paddle {
 namespace lite {
 
+void XPUScratchPad::Reserve(size_t new_size) {
+  if (new_size <= size_) {
+    return;
+  }
+
+  if (!is_l3_) {
+    TargetWrapperXPU::Free(addr_);
+    addr_ = TargetWrapperXPU::Malloc(new_size);
+    size_ = new_size;
+  } else {
+    CHECK(false) << "Not supported if is_l3_ == true";
+  }
+}
+
+void XPUScratchPadDeleter::operator()(XPUScratchPad* sp) const {
+  if (!sp->is_l3_) {
+    TargetWrapperXPU::Free(sp->addr_);
+  }
+  delete sp;
+}
+
 void* TargetWrapperXPU::Malloc(size_t size) {
   void* ptr{nullptr};
   XPU_CALL(xpu_malloc(&ptr, size));
@@ -51,7 +72,7 @@ XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size,
     ptr = TargetWrapperXPU::Malloc(size);
   }
   CHECK(ptr != nullptr) << "size = " << size << ", use_l3 = " << use_l3;
-  return XPUScratchPadGuard(new XPUScratchPad(ptr, use_l3));
+  return XPUScratchPadGuard(new XPUScratchPad(ptr, size, use_l3));
 }
 
 std::string TargetWrapperXPU::multi_encoder_precision;  // NOLINT
diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h
index 1a888b126a..8151d733ba 100644
--- a/lite/backends/xpu/target_wrapper.h
+++ b/lite/backends/xpu/target_wrapper.h
@@ -37,19 +37,19 @@ const int XPU_MAX_LOD_SEQ_LEN = 512;
 using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
 
 struct XPUScratchPad {
-  XPUScratchPad(void* addr, bool is_l3) : addr_(addr), is_l3_(is_l3) {}
+  XPUScratchPad(void* addr, size_t size, bool is_l3)
+      : addr_(addr), size_(size), is_l3_(is_l3) {}
+
+  // XXX(miaotianxiang): |size_| increases monotonically
+  void Reserve(size_t new_size);
 
   void* addr_{nullptr};
+  size_t size_{0};
   bool is_l3_{false};
 };
 
 struct XPUScratchPadDeleter {
-  void operator()(XPUScratchPad* sp) const {
-    if (!sp->is_l3_) {
-      XPU_CALL(xpu_free(sp->addr_));
-    }
-    delete sp;
-  }
+  void operator()(XPUScratchPad* sp) const;
 };
 
 using XPUScratchPadGuard = std::unique_ptr<XPUScratchPad, XPUScratchPadDeleter>;
diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc
index c3c00d0fa0..dcab292be8 100644
--- a/lite/core/op_lite.cc
+++ b/lite/core/op_lite.cc
@@ -322,6 +322,7 @@ std::vector<float> OpInfo::GetOutputScale(const std::string &name,
     int index;
     CHECK(GetOutputArgname(name, &argname));
     CHECK(GetOutputIndex(name, &index));
+    scale_name = argname + to_string(index) + "_scale";
   }
   return GetAttr<std::vector<float>>(scale_name);
 }
diff --git a/lite/kernels/apu/bridges/conv_op.cc b/lite/kernels/apu/bridges/conv_op.cc
index bdac473b1b..1c3020065e 100644
--- a/lite/kernels/apu/bridges/conv_op.cc
+++ b/lite/kernels/apu/bridges/conv_op.cc
@@ -60,9 +60,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   CHECK_EQ(output_dims[0], bs);
   CHECK_EQ(output_dims[1], oc);
   auto strides = op_info->GetAttr<std::vector<int>>("strides");
-  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
   auto groups = op_info->GetAttr<int>("groups");
-  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
+  std::vector<int> dilations = op_info->GetAttr<std::vector<int>>("dilations");
   bool with_act =
       op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act");
   std::string act_type =
diff --git a/lite/kernels/apu/bridges/pool_op.cc b/lite/kernels/apu/bridges/pool_op.cc
index 594c7fabda..e255518044 100644
--- a/lite/kernels/apu/bridges/pool_op.cc
+++ b/lite/kernels/apu/bridges/pool_op.cc
@@ -45,7 +45,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
   auto global_pooling = op_info->GetAttr<bool>("global_pooling");
   auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
-  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
 
   // pool mode
   if ((pooling_type == "max") || (pooling_type == "avg")) {
diff --git a/lite/kernels/npu/bridges/conv_op.cc b/lite/kernels/npu/bridges/conv_op.cc
index 5cc79137b9..95632c7a05 100644
--- a/lite/kernels/npu/bridges/conv_op.cc
+++ b/lite/kernels/npu/bridges/conv_op.cc
@@ -53,9 +53,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   CHECK_EQ(output_dims[0], bs);
   CHECK_EQ(output_dims[1], oc);
   auto strides = op_info->GetAttr<std::vector<int>>("strides");
-  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
   auto groups = op_info->GetAttr<int>("groups");
-  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
+  std::vector<int> dilations = op_info->GetAttr<std::vector<int>>("dilations");
   bool with_act =
       op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act");
   std::string act_type =
diff --git a/lite/kernels/npu/bridges/conv_transpose_op.cc b/lite/kernels/npu/bridges/conv_transpose_op.cc
index 7e149ed243..52ae137d52 100644
--- a/lite/kernels/npu/bridges/conv_transpose_op.cc
+++ b/lite/kernels/npu/bridges/conv_transpose_op.cc
@@ -59,8 +59,8 @@ int ConvTransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     output_size = op_info->GetAttr<std::vector<int>>("output_size");
   }
 
-  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
-  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
+  std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  std::vector<int> dilations = op_info->GetAttr<std::vector<int>>("dilations");
   CHECK_EQ(dilations.size(), 2L);
   std::string padding_algorithm =
       op_info->HasAttr("padding_algorithm")
diff --git a/lite/kernels/npu/bridges/pad2d_op.cc b/lite/kernels/npu/bridges/pad2d_op.cc
index 70fa87e778..cb35b24752 100644
--- a/lite/kernels/npu/bridges/pad2d_op.cc
+++ b/lite/kernels/npu/bridges/pad2d_op.cc
@@ -35,7 +35,7 @@ int Pad2dConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto x = scope->FindMutableTensor(x_name);
   auto x_dims = x->dims();
   auto out_name = op_info->Output("Out").front();
-  auto padding = op_info->GetAttr<std::vector<int>>("paddings");
+  std::vector<int> padding = op_info->GetAttr<std::vector<int>>("paddings");
   CHECK_EQ(padding.size(), 4);
 
   // X node
diff --git a/lite/kernels/npu/bridges/pool_op.cc b/lite/kernels/npu/bridges/pool_op.cc
index fc2647f67e..921e1a2571 100644
--- a/lite/kernels/npu/bridges/pool_op.cc
+++ b/lite/kernels/npu/bridges/pool_op.cc
@@ -39,7 +39,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
   auto global_pooling = op_info->GetAttr<bool>("global_pooling");
   auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
-  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
 
   // X node
   std::shared_ptr<Node> x_node = nullptr;
diff --git a/lite/kernels/npu/bridges/reduce_mean_op.cc b/lite/kernels/npu/bridges/reduce_mean_op.cc
index 5987342672..a608082be0 100644
--- a/lite/kernels/npu/bridges/reduce_mean_op.cc
+++ b/lite/kernels/npu/bridges/reduce_mean_op.cc
@@ -36,7 +36,7 @@ int ReduceMeanConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto x_dims = x->dims();
   auto out_name = op_info->Input("Out").front();
   auto keep_dim = op_info->GetAttr<bool>("keep_dim");
-  auto dim = op_info->GetAttr<std::vector<int>>("dim");
+  std::vector<int> dim = op_info->GetAttr<std::vector<int>>("dim");
   CHECK(!dim.empty()) << "[NPU] \"dim\" of reduce_mean should not be empty.";
   for (size_t i = 0; i < dim.size(); i++) {
     if (dim[i] < 0) {
diff --git a/lite/kernels/rknpu/bridges/conv_op.cc b/lite/kernels/rknpu/bridges/conv_op.cc
index 134d9e0cde..a789f0bacc 100644
--- a/lite/kernels/rknpu/bridges/conv_op.cc
+++ b/lite/kernels/rknpu/bridges/conv_op.cc
@@ -51,9 +51,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   CHECK_EQ(output_dims[0], bs);
   CHECK_EQ(output_dims[1], oc);
   auto strides = op_info->GetAttr<std::vector<int>>("strides");
-  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
   auto groups = op_info->GetAttr<int>("groups");
-  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
+  std::vector<int> dilations = op_info->GetAttr<std::vector<int>>("dilations");
   auto fuse_relu = op_info->GetAttr<bool>("fuse_relu");
   CHECK_EQ(strides.size(), 2L);
   CHECK_EQ(dilations.size(), 2L);
diff --git a/lite/kernels/rknpu/bridges/pool_op.cc b/lite/kernels/rknpu/bridges/pool_op.cc
index 36832fc578..1a5a69b134 100644
--- a/lite/kernels/rknpu/bridges/pool_op.cc
+++ b/lite/kernels/rknpu/bridges/pool_op.cc
@@ -42,7 +42,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
   auto global_pooling = op_info->GetAttr<bool>("global_pooling");
   auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
-  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
 
   // for quantization
   bool enable_int8 = false;
diff --git a/lite/kernels/x86/sequence_unpad_compute.h b/lite/kernels/x86/sequence_unpad_compute.h
index 5b4e3f6c16..b8bdfe08e8 100644
--- a/lite/kernels/x86/sequence_unpad_compute.h
+++ b/lite/kernels/x86/sequence_unpad_compute.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+#include <vector>
 #include "lite/backends/x86/math/sequence_padding.h"
 #include "lite/core/kernel.h"
 #include "lite/core/op_registry.h"
@@ -34,6 +35,30 @@ class SequenceUnpadCompute
     auto& param = this->template Param<param_t>();
     auto& ctx = this->ctx_->template As<X86Context>();
 
+    auto x_dims = param.X->dims();
+    auto len_dims = param.Length->dims();
+
+    auto* seq_len_ptr = param.Length->template data<int64_t>();
+    int64_t batch_size = len_dims[0];
+    std::vector<uint64_t> out_lod0(batch_size + 1, 0);
+    for (int64_t i = 0; i < batch_size; ++i) {
+      out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i];
+    }
+    paddle::lite::LoD out_lod;
+    out_lod.push_back(out_lod0);
+
+    int64_t out_dim0 = out_lod0.back();
+    std::vector<int64_t> out_dims{out_dim0};
+    if (x_dims.size() == 2) {
+      out_dims.push_back(1);
+    } else {
+      for (size_t i = 2; i < x_dims.size(); ++i) {
+        out_dims.push_back(x_dims[i]);
+      }
+    }
+    param.Out->Resize(out_dims);
+    param.Out->set_lod(out_lod);
+    param.Out->template mutable_data<T>();
     int64_t padded_length = param.X->dims()[1];
     math::UnpaddingLoDTensorFunctor<lite::TargetType::kX86, T>()(
diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt
index 798d707dd7..cc69120557 100644
--- a/lite/kernels/xpu/CMakeLists.txt
+++ b/lite/kernels/xpu/CMakeLists.txt
@@ -38,6 +38,7 @@ else()
     add_kernel(match_matrix_tensor_compute_xpu XPU extra SRCS match_matrix_tensor_compute.cc DEPS ${lite_kernel_deps})
     add_kernel(var_conv_2d_compute_xpu XPU extra SRCS var_conv_2d_compute.cc DEPS ${lite_kernel_deps})
     add_kernel(search_grnn_compute_xpu XPU extra SRCS search_grnn_compute.cc DEPS ${lite_kernel_deps})
+    add_kernel(sequence_unpad_compute_xpu XPU extra SRCS sequence_unpad_compute.cc DEPS ${lite_kernel_deps})
 
     # extra(fused kernel)
     add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc DEPS ${lite_kernel_deps})
diff --git a/lite/kernels/xpu/bridges/conv_op.cc b/lite/kernels/xpu/bridges/conv_op.cc
index a4c0bc05cb..590d830ce4 100644
--- a/lite/kernels/xpu/bridges/conv_op.cc
+++ b/lite/kernels/xpu/bridges/conv_op.cc
@@ -44,9 +44,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   CHECK_EQ(input_dims.size(), 4);
   CHECK_EQ(filter_dims.size(), 4);
   auto strides = op_info->GetAttr<std::vector<int>>("strides");
-  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
   auto groups = op_info->GetAttr<int>("groups");
-  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
+  std::vector<int> dilations = op_info->GetAttr<std::vector<int>>("dilations");
   auto fuse_relu = op_info->GetAttr<bool>("fuse_relu");
   CHECK_EQ(strides.size(), 2L);
   CHECK_EQ(dilations.size(), 2L);
diff --git a/lite/kernels/xpu/bridges/pool_op.cc b/lite/kernels/xpu/bridges/pool_op.cc
index 862e1841e8..5c38cacddd 100644
--- a/lite/kernels/xpu/bridges/pool_op.cc
+++ b/lite/kernels/xpu/bridges/pool_op.cc
@@ -37,7 +37,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto out_name = op_info->Output("Out").front();
   auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
   auto ceil_mode = op_info->GetAttr<bool>("ceil_mode");
-  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
   auto global_pooling = op_info->GetAttr<bool>("global_pooling");
   auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
   auto strides = op_info->GetAttr<std::vector<int>>("strides");
diff --git a/lite/kernels/xpu/sequence_pool_compute.cc b/lite/kernels/xpu/sequence_pool_compute.cc
index f8e71639b7..35412cf49c 100644
--- a/lite/kernels/xpu/sequence_pool_compute.cc
+++ b/lite/kernels/xpu/sequence_pool_compute.cc
@@ -42,6 +42,8 @@ void XPUSequencePoolCompute::Run() {
 
   xdnn::Pooling_t pool_type = xdnn::Pooling_t::MAX_WITHOUT_INDEX;
   if (pool_type_str == "MAX") {
+  } else if (pool_type_str == "SUM") {
+    pool_type = xdnn::Pooling_t::SUM;
   } else if (pool_type_str == "LAST") {
     pool_type = xdnn::Pooling_t::LAST;
   } else {
diff --git a/lite/kernels/xpu/sequence_unpad_compute.cc b/lite/kernels/xpu/sequence_unpad_compute.cc
new file mode 100644
index 0000000000..2ce296ca21
--- /dev/null
+++ b/lite/kernels/xpu/sequence_unpad_compute.cc
@@ -0,0 +1,98 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/sequence_unpad_compute.h"
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void SequenceUnpadCompute::PrepareForRun() {
+  lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  lod_cpu_.reserve(XPU_MAX_LOD_SIZE);
+}
+
+void SequenceUnpadCompute::Run() {
+  auto& param = this->template Param<param_t>();
+  auto& ctx = this->ctx_->template As<XPUContext>();
+
+  auto x_dims = param.X->dims();
+  auto len_dims = param.Length->dims();
+
+  // XXX(miaotianxiang): Target of tensor |Length| is |kHost|.
+  auto* seq_len_ptr = param.Length->template data<int64_t>();
+  int64_t batch_size = len_dims[0];
+  std::vector<uint64_t> out_lod0(batch_size + 1, 0);
+  for (int64_t i = 0; i < batch_size; ++i) {
+    out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i];
+  }
+  paddle::lite::LoD out_lod;
+  out_lod.push_back(out_lod0);
+
+  int64_t out_dim0 = out_lod0.back();
+  std::vector<int64_t> out_dims{out_dim0};
+  if (x_dims.size() == 2) {
+    out_dims.push_back(1);
+  } else {
+    for (size_t i = 2; i < x_dims.size(); ++i) {
+      out_dims.push_back(x_dims[i]);
+    }
+  }
+  param.Out->Resize(out_dims);
+  param.Out->set_lod(out_lod);
+
+  lod_cpu_ = {0};
+  for (int64_t i = 0; i < batch_size; ++i) {
+    int offset =
+        lod_cpu_.back() + static_cast<int>(param.Length->data<int64_t>()[i]);
+    lod_cpu_.push_back(offset);
+  }
+  lod_xpu_guard_->Reserve((batch_size + 1) * sizeof(int));
+  TargetWrapperXPU::MemcpySync(lod_xpu_guard_->addr_,
+                               lod_cpu_.data(),
+                               (batch_size + 1) * sizeof(int),
+                               IoDirection::HtoD);
+
+  int dim = param.Out->numel() / out_dim0;
+  int r = xdnn::sequence_unpad(
+      ctx.GetRawContext(),                           /* ctx */
+      param.X->data<float>(),                        /* pad_data */
+      param.Out->mutable_data<float>(TARGET(kXPU)),  /* seq_data */
+      reinterpret_cast<int*>(lod_xpu_guard_->addr_), /* sequence */
+      param.X->dims()[1],                            /* pad_seq_len */
+      batch_size,                                    /* batch_size */
+      dim /* dim */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(sequence_unpad,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::SequenceUnpadCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Length",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/sequence_unpad_compute.h b/lite/kernels/xpu/sequence_unpad_compute.h
new file mode 100644
index 0000000000..8e038383e6
--- /dev/null
+++ b/lite/kernels/xpu/sequence_unpad_compute.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "lite/backends/xpu/target_wrapper.h"  // XPUScratchPadGuard
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class SequenceUnpadCompute
+    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SequenceUnpadParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+ private:
+  XPUScratchPadGuard lod_xpu_guard_;
+  std::vector<int> lod_cpu_;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
-- 
GitLab
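
Editor's note (placed after the patch trailer, where `git am` ignores it): the repeated `auto` -> `std::vector<int>` changes above are the tiny_publish/flatbuffer build fix. The likely mechanism, inferred from the commit title rather than stated in the patch, is that the flatbuffers-backed `OpInfo::GetAttr` returns a lightweight view type that is convertible to `std::vector<int>` but is not one, so `auto` deduces the view and later vector-only operations fail to compile. A minimal self-contained sketch of that pitfall, using a hypothetical `VectorView`/`GetAttr` stand-in rather than the real Paddle-Lite API:

#include <cstdio>
#include <vector>

// Hypothetical stand-in for a flatbuffers-backed attribute accessor that
// returns a lightweight view merely *convertible* to std::vector<int>.
struct VectorView {
  const int* data_;
  size_t size_;
  operator std::vector<int>() const {
    return std::vector<int>(data_, data_ + size_);
  }
};

VectorView GetAttr(const char* /*name*/) {
  static const int raw[4] = {0, 0, 1, 1};
  return VectorView{raw, 4};
}

int main() {
  // With `auto`, the deduced type is VectorView, so vector-only members
  // are missing and the build breaks:
  //   auto paddings = GetAttr("paddings");
  //   paddings.size();  // error: no member named 'size' in 'VectorView'
  //
  // Naming the type forces the user-defined conversion, which is what the
  // patch does at every GetAttr<std::vector<int>>(...) call site:
  std::vector<int> paddings = GetAttr("paddings");
  std::printf("paddings.size() = %zu\n", paddings.size());
  return 0;
}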
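Likewise, a short sketch of the naming convention restored by the lite/core/op_lite.cc hunk: when an output scale is looked up by variable name, the attribute key is rebuilt as "<argname><index>_scale" (e.g. "Out0_scale" for the first tensor of output argument "Out"). The helper below is illustrative only, not the actual OpInfo method:

#include <iostream>
#include <string>

// Illustrative helper mirroring the key rebuilt by the op_lite.cc hunk,
// "<argname><index>_scale", under which the quantization scale is stored.
std::string OutputScaleAttrName(const std::string& argname, int index) {
  return argname + std::to_string(index) + "_scale";
}

int main() {
  std::cout << OutputScaleAttrName("Out", 0) << std::endl;  // Out0_scale
  return 0;
}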
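Finally, the new XPUScratchPad::Reserve grows a device scratch buffer monotonically (free-then-malloc, so old contents are not preserved) and is rejected for L3-resident pads; the XPU sequence_unpad kernel uses it to fit the LoD array before each HtoD copy. A rough usage sketch against the patched API (XPU toolchain required; the byte counts are arbitrary examples):

#include "lite/backends/xpu/target_wrapper.h"

void ScratchPadExample() {
  using paddle::lite::TargetWrapperXPU;
  // Allocate a 1 KiB global-memory (non-L3) scratchpad.
  auto guard = TargetWrapperXPU::MallocScratchPad(1024, false /* use_l3 */);
  // Growing reallocates; data previously at guard->addr_ is NOT carried over.
  guard->Reserve(4096);
  // A smaller or equal request is a no-op: |size_| only ever increases.
  guard->Reserve(2048);
}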