From 92dc2ec64a47a55e4bfebba71357b4beaf5352c9 Mon Sep 17 00:00:00 2001 From: weihaoji Date: Fri, 14 Aug 2020 14:58:52 +0800 Subject: [PATCH] [XPU] Support OCR models * add exp and reciprocal for activation * add conv2d transpose op * add fill constant op * add im2sequence op * add interpolate op (including nearest and bilinear) * add lrn op * add split op * add sum op * add topk op * add gru(int31) op * fix bug in elementwise arithmetic op * fix bug in conv2d op * fix bug in dropout op * fix bug in cast op, test=develop test=xpu --- lite/kernels/xpu/CMakeLists.txt | 9 + lite/kernels/xpu/cast_compute.cc | 69 +++-- lite/kernels/xpu/cast_compute.h | 1 - lite/kernels/xpu/conv2d_transpose_compute.cc | 101 ++++++ lite/kernels/xpu/conv2d_transpose_compute.h | 44 +++ lite/kernels/xpu/conv_compute.cc | 82 +++-- lite/kernels/xpu/dropout_compute.cc | 27 +- lite/kernels/xpu/fill_constant_compute.cc | 90 ++++++ lite/kernels/xpu/fill_constant_compute.h | 35 +++ lite/kernels/xpu/gru_compute.cc | 307 +++++++++++++++++++ lite/kernels/xpu/gru_compute.h | 67 ++++ lite/kernels/xpu/im2sequence_compute.cc | 97 ++++++ lite/kernels/xpu/im2sequence_compute.h | 35 +++ lite/kernels/xpu/interpolate_compute.cc | 134 ++++++++ lite/kernels/xpu/interpolate_compute.h | 43 +++ lite/kernels/xpu/lrn_compute.cc | 64 ++++ lite/kernels/xpu/lrn_compute.h | 36 +++ lite/kernels/xpu/split_compute.cc | 71 +++++ lite/kernels/xpu/split_compute.h | 35 +++ lite/kernels/xpu/sum_compute.cc | 55 ++++ lite/kernels/xpu/sum_compute.h | 35 +++ lite/kernels/xpu/topk_compute.cc | 67 ++++ lite/kernels/xpu/topk_compute.h | 35 +++ lite/operators/CMakeLists.txt | 1 + lite/operators/op_params.h | 25 ++ lite/operators/sum_op.cc | 63 ++++ lite/operators/sum_op.h | 60 ++++ 27 files changed, 1632 insertions(+), 56 deletions(-) create mode 100644 lite/kernels/xpu/conv2d_transpose_compute.cc create mode 100644 lite/kernels/xpu/conv2d_transpose_compute.h create mode 100644 lite/kernels/xpu/fill_constant_compute.cc create mode 100644 lite/kernels/xpu/fill_constant_compute.h create mode 100644 lite/kernels/xpu/gru_compute.cc create mode 100644 lite/kernels/xpu/gru_compute.h create mode 100644 lite/kernels/xpu/im2sequence_compute.cc create mode 100644 lite/kernels/xpu/im2sequence_compute.h create mode 100644 lite/kernels/xpu/interpolate_compute.cc create mode 100644 lite/kernels/xpu/interpolate_compute.h create mode 100644 lite/kernels/xpu/lrn_compute.cc create mode 100644 lite/kernels/xpu/lrn_compute.h create mode 100644 lite/kernels/xpu/split_compute.cc create mode 100644 lite/kernels/xpu/split_compute.h create mode 100644 lite/kernels/xpu/sum_compute.cc create mode 100644 lite/kernels/xpu/sum_compute.h create mode 100644 lite/kernels/xpu/topk_compute.cc create mode 100644 lite/kernels/xpu/topk_compute.h create mode 100644 lite/operators/sum_op.cc create mode 100644 lite/operators/sum_op.h diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt index cc69120557..35a30a08f9 100644 --- a/lite/kernels/xpu/CMakeLists.txt +++ b/lite/kernels/xpu/CMakeLists.txt @@ -8,6 +8,7 @@ if(LITE_WITH_XTCL) else() # basic add_kernel(conv_compute_xpu XPU basic SRCS conv_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(conv2d_transpose_compute_xpu XPU basic SRCS conv2d_transpose_compute.cc DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_xpu XPU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} target_wrapper_xpu) add_kernel(batch_norm_compute_xpu XPU basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps}) add_kernel(activation_compute_xpu XPU 
basic SRCS activation_compute.cc DEPS ${lite_kernel_deps}) @@ -27,6 +28,10 @@ else() add_kernel(reshape_compute_xpu XPU basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps}) add_kernel(reduce_mean_compute_xpu XPU basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps}) add_kernel(reduce_sum_compute_xpu XPU basic SRCS reduce_sum_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(split_compute_xpu XPU basic SRCS split_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sum_compute_xpu XPU basic SRCS sum_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(interpolate_compute_xpu XPU basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(fill_constant_compute_xpu XPU basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps}) # extra add_kernel(lookup_table_compute_xpu XPU extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps}) @@ -39,6 +44,10 @@ else() add_kernel(var_conv_2d_compute_xpu XPU extra SRCS var_conv_2d_compute.cc DEPS ${lite_kernel_deps}) add_kernel(search_grnn_compute_xpu XPU extra SRCS search_grnn_compute.cc DEPS ${lite_kernel_deps}) add_kernel(sequence_unpad_compute_xpu XPU extra SRCS sequence_unpad_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(lrn_compute_xpu XPU extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(topk_compute_xpu XPU extra SRCS topk_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(im2sequence_compute_xpu XPU extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(gru_compute_xpu XPU extra SRCS gru_compute.cc DEPS ${lite_kernel_deps}) # extra(fused kernel) add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc DEPS ${lite_kernel_deps}) diff --git a/lite/kernels/xpu/cast_compute.cc b/lite/kernels/xpu/cast_compute.cc index c7eabd28a1..194630c3a5 100644 --- a/lite/kernels/xpu/cast_compute.cc +++ b/lite/kernels/xpu/cast_compute.cc @@ -13,6 +13,7 @@ // limitations under the License. 
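+// Dispatch note: in_dtype/out_dtype use Paddle's integer dtype codes listed
+// below (e.g. 2 = INT32, 3 = INT64, 5 = FP32); each supported pair maps to
+// one instantiation of xdnn::cast<InT, OutT>, e.g. (2, 5) selects
+// xdnn::cast<int, float>. Unsupported pairs fall through to the final CHECK.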
#include "lite/kernels/xpu/cast_compute.h" +#include #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -21,31 +22,61 @@ namespace lite { namespace kernels { namespace xpu { -template -void CastCompute::Run() { +void CastCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); - auto* x = param.X; auto* out = param.Out; int out_dtype = param.out_dtype; - auto* in_data = x->template data(); - int numel = x->numel(); - + int in_dtype = param.in_dtype; + int numel = param.X->numel(); int r = 0; // BOOL = 0;INT16 = 1;INT32 = 2;INT64 = 3;FP16 = 4;FP32 = 5;FP64 = 6; // SIZE_T = 19;UINT8 = 20;INT8 = 21; - if (out_dtype == 5) { - auto* out_data = out->template mutable_data(TARGET(kXPU)); - r = xdnn::cast( + + if (in_dtype == 5 && out_dtype == 5) { + // float -> float + auto* in_data = param.X->data(); + auto* out_data = out->mutable_data(TARGET(kXPU)); + r = xdnn::cast(ctx.GetRawContext(), in_data, out_data, numel); + } else if (in_dtype == 2 && out_dtype == 2) { + // int -> int + auto* in_data = param.X->data(); + auto* out_data = out->mutable_data(TARGET(kXPU)); + r = xdnn::cast(ctx.GetRawContext(), in_data, out_data, numel); + } else if (in_dtype == 3 && out_dtype == 3) { + // int64 -> int64 + auto* in_data = param.X->data(); + auto* out_data = out->mutable_data(TARGET(kXPU)); + r = xdnn::cast( + ctx.GetRawContext(), in_data, out_data, numel); + } else if (in_dtype == 2 && out_dtype == 3) { + // int -> int64 + auto* in_data = param.X->data(); + auto* out_data = out->mutable_data(TARGET(kXPU)); + r = xdnn::cast(ctx.GetRawContext(), in_data, out_data, numel); + } else if (in_dtype == 2 && out_dtype == 5) { + // int -> float + auto* in_data = param.X->data(); + auto* out_data = out->mutable_data(TARGET(kXPU)); + r = xdnn::cast(ctx.GetRawContext(), in_data, out_data, numel); + } else if (in_dtype == 3 && out_dtype == 5) { + // int64_t -> float + auto* in_data = param.X->data(); + auto* out_data = out->mutable_data(TARGET(kXPU)); + r = xdnn::cast( ctx.GetRawContext(), in_data, out_data, numel); - } else if (out_dtype == 2) { - auto* out_data = out->template mutable_data(TARGET(kXPU)); - r = xdnn::cast(ctx.GetRawContext(), in_data, out_data, numel); - } else if (out_dtype == 3) { - auto* out_data = out->template mutable_data(TARGET(kXPU)); - r = xdnn::cast( + } else if (in_dtype == 5 && out_dtype == 3) { + // float -> int64_t + auto* in_data = param.X->data(); + auto* out_data = out->mutable_data(TARGET(kXPU)); + r = xdnn::cast( ctx.GetRawContext(), in_data, out_data, numel); + } else if (in_dtype == 5 && out_dtype == 2) { + // float -> int + auto* in_data = param.X->data(); + auto* out_data = out->mutable_data(TARGET(kXPU)); + r = xdnn::cast(ctx.GetRawContext(), in_data, out_data, numel); } else { CHECK(false); } @@ -57,12 +88,8 @@ void CastCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(cast, - kXPU, - kAny, - kNCHW, - paddle::lite::kernels::xpu::CastCompute, - def) +REGISTER_LITE_KERNEL( + cast, kXPU, kAny, kNCHW, paddle::lite::kernels::xpu::CastCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/xpu/cast_compute.h b/lite/kernels/xpu/cast_compute.h index efd4cbae8d..de1a7affeb 100644 --- a/lite/kernels/xpu/cast_compute.h +++ b/lite/kernels/xpu/cast_compute.h @@ -21,7 +21,6 @@ namespace lite { namespace kernels { namespace xpu { -template class 
CastCompute : public KernelLite { public: using param_t = operators::CastParam; diff --git a/lite/kernels/xpu/conv2d_transpose_compute.cc b/lite/kernels/xpu/conv2d_transpose_compute.cc new file mode 100644 index 0000000000..a3e43276c5 --- /dev/null +++ b/lite/kernels/xpu/conv2d_transpose_compute.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/conv2d_transpose_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template <> +void Conv2dTransposeCompute::PrepareForRun() { + maxs_xpu_guard_ = + TargetWrapperXPU::MallocScratchPad(8 * sizeof(float), false /* use_l3 */); + + auto& ctx = this->ctx_->As(); + auto& param = this->Param(); + float* max_filter_ptr = reinterpret_cast(maxs_xpu_guard_->addr_); + int filter_size = param.filter->numel(); + int r = xdnn::findmax(ctx.GetRawContext(), + param.filter->data(), + filter_size, + max_filter_ptr); + CHECK_EQ(r, 0); +} + +template <> +void Conv2dTransposeCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto& out_dims = param.output->dims(); + auto& w_dims = param.filter->dims(); + auto& in_dims = param.x->dims(); + + int groups = param.groups; + auto& strides = param.strides; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + + float* max_filter_ptr = reinterpret_cast(maxs_xpu_guard_->addr_); + float* max_image_ptr = max_filter_ptr + 4; + int image_size = param.x->numel(); + + // find image max + int r = xdnn::findmax( + ctx.GetRawContext(), param.x->data(), image_size, max_image_ptr); + CHECK_EQ(r, 0); + + r = xdnn::conv2d_backward_int16( + ctx.GetRawContext(), + out_dims[0], + out_dims[1], + out_dims[2], + out_dims[3], + in_dims[1], + w_dims[2], + w_dims[3], + strides[0], + strides[1], + paddings[0], + paddings[1], + dilations[0], + dilations[1], + groups, + param.x->data(), + param.filter->data(), + param.output->mutable_data(TARGET(kXPU)), + max_image_ptr, + max_filter_ptr); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +namespace xpu = paddle::lite::kernels::xpu; +using Conv2dTransposeFp32 = xpu::Conv2dTransposeCompute; + +REGISTER_LITE_KERNEL( + conv2d_transpose, kXPU, kFloat, kNCHW, Conv2dTransposeFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/conv2d_transpose_compute.h b/lite/kernels/xpu/conv2d_transpose_compute.h new file mode 100644 index 0000000000..5ee2681604 --- /dev/null +++ b/lite/kernels/xpu/conv2d_transpose_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +class Conv2dTransposeCompute : public KernelLite { + public: + using param_t = operators::ConvParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~Conv2dTransposeCompute() = default; + + private: + XPUScratchPadGuard maxs_xpu_guard_; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/conv_compute.cc b/lite/kernels/xpu/conv_compute.cc index ed692fd0e2..0d382ebb4a 100644 --- a/lite/kernels/xpu/conv_compute.cc +++ b/lite/kernels/xpu/conv_compute.cc @@ -33,32 +33,55 @@ void Conv2dCompute::Run() { auto paddings = *param.paddings; auto dilations = *param.dilations; - int r = xdnn::conv2d_forward_int16( - ctx.GetRawContext(), /* context */ - x_dims[0], /* num */ - x_dims[1], /* input_c */ - x_dims[2], /* input_h */ - x_dims[3], /* input_w */ - w_dims[0], /* num_filter */ - w_dims[2], /* kernel_h */ - w_dims[3], /* kernel_w */ - strides[0], /* stride_h */ - strides[1], /* stride_w */ - paddings[0], /* pad_h */ - paddings[1], /* pad_w */ - dilations[0], /* dilation_h */ - dilations[1], /* dilation_w */ - groups, /* group */ - param.x->data(), /* bottom */ - param.filter->data(), /* weight */ - param.output->mutable_data(TARGET(kXPU)), /* top */ - nullptr, /* bias */ - nullptr, /* branch */ - xdnn::Activation_t::LINEAR, /* type */ - nullptr, /* max_image_ptr */ - nullptr, /* max_filter_ptr */ - nullptr /* max_result_ptr */); - CHECK_EQ(r, 0); + if (groups == 1) { + int r = xdnn::conv2d_forward_int16( + ctx.GetRawContext(), /* context */ + x_dims[0], /* num */ + x_dims[1], /* input_c */ + x_dims[2], /* input_h */ + x_dims[3], /* input_w */ + w_dims[0], /* num_filter */ + w_dims[2], /* kernel_h */ + w_dims[3], /* kernel_w */ + strides[0], /* stride_h */ + strides[1], /* stride_w */ + paddings[0], /* pad_h */ + paddings[1], /* pad_w */ + dilations[0], /* dilation_h */ + dilations[1], /* dilation_w */ + groups, /* group */ + param.x->data(), /* bottom */ + param.filter->data(), /* weight */ + param.output->mutable_data(TARGET(kXPU)), /* top */ + nullptr, /* bias */ + nullptr, /* branch */ + xdnn::Activation_t::LINEAR, /* type */ + nullptr, /* max_image_ptr */ + nullptr, /* max_filter_ptr */ + nullptr /* max_result_ptr */); + CHECK_EQ(r, 0); + } else { + int r = xdnn::conv2d_int16_with_group( + ctx.GetRawContext(), /* context */ + param.x->data(), /* bottom */ + param.filter->data(), /* weight */ + param.output->mutable_data(TARGET(kXPU)), /* top */ + x_dims[0], + x_dims[1], + x_dims[2], + x_dims[3], + w_dims[0], + w_dims[2], + w_dims[3], + groups, + strides[0], + strides[1], + paddings[0], + paddings[1], + nullptr, + nullptr); + CHECK_EQ(r, 0); + } } } // namespace xpu @@ -75,3 +98,10 @@ REGISTER_LITE_KERNEL(conv2d, kXPU, kFloat, 
kNCHW, Conv2dFp32, def) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); + +REGISTER_LITE_KERNEL(depthwise_conv2d, kXPU, kFloat, kNCHW, Conv2dFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/dropout_compute.cc b/lite/kernels/xpu/dropout_compute.cc index f42d3eeff5..9fb5b0bc0d 100644 --- a/lite/kernels/xpu/dropout_compute.cc +++ b/lite/kernels/xpu/dropout_compute.cc @@ -25,14 +25,25 @@ void DropoutCompute::Run() { auto& param = this->Param(); auto& ctx = this->ctx_->As(); - int size = param.x->numel() * sizeof(float); - - int r = xdnn::memcpy_device( - ctx.GetRawContext(), /* context */ - param.output->mutable_data(TARGET(kXPU)), /* dst */ - param.x->data(), /* src */ - size /* size */); - CHECK_EQ(r, 0); + if (param.is_test) { + float scale = 1.0f; + if (param.dropout_implementation == "upscale_in_train") { + scale = 1.0f; + } else { + scale = 1.0f - param.dropout_prob; + } + int r = + xdnn::scale(ctx.GetRawContext(), /* context */ + param.x->numel(), + scale, + 0.0f, + 0, + param.x->data(), /* src */ + param.output->mutable_data(TARGET(kXPU))); /* dst */ + CHECK_EQ(r, 0); + } else { + CHECK(false); + } } } // namespace xpu diff --git a/lite/kernels/xpu/fill_constant_compute.cc b/lite/kernels/xpu/fill_constant_compute.cc new file mode 100644 index 0000000000..3223659480 --- /dev/null +++ b/lite/kernels/xpu/fill_constant_compute.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
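+//
+// Implementation note: this kernel relies on xdnn::memset filling memory
+// with a repeating 4-byte pattern rather than a single byte, so every fill
+// value is first packed into an int32 word: FP32 reuses the float's bit
+// pattern via TypeUnion, INT32 is used as-is, and an INT8 value must be
+// replicated into all four byte lanes (e.g. 0x7f becomes 0x7f7f7f7f).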
+ +#include "lite/kernels/xpu/fill_constant_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +union TypeUnion { + float fp32; + int32_t int32; +}; + +void FillConstantCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + TypeUnion value; + int write_size = param.out->numel(); + + if (param.dtype == static_cast(lite::core::FluidType::FP32)) { + auto data = param.out->template mutable_data(TARGET(kXPU)); + value.fp32 = param.value; + write_size = write_size * sizeof(float); + int r = xdnn::memset(ctx.GetRawContext(), /* context */ + reinterpret_cast(data), + value.int32, + write_size); + CHECK_EQ(r, 0); + + } else if (param.dtype == + static_cast(lite::core::FluidType::INT32)) { + auto data = param.out->template mutable_data(TARGET(kXPU)); + value.int32 = param.value; + write_size = write_size * sizeof(int32_t); + int r = xdnn::memset(ctx.GetRawContext(), /* context */ + reinterpret_cast(data), + value.int32, + write_size); + CHECK_EQ(r, 0); + + } else if (param.dtype == static_cast(lite::core::FluidType::INT8)) { + auto data = param.out->template mutable_data(TARGET(kXPU)); + value.int32 = 0; + for (int i = 0; i < 4; i++) { + value.int32 += static_cast(param.value); + value.int32 = value.int32 << 8; + } + int r = xdnn::memset(ctx.GetRawContext(), /* context */ + reinterpret_cast(data), + value.int32, + write_size); + CHECK_EQ(r, 0); + } else { + LOG(FATAL) << "not supported dtype " << param.dtype; + } +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(fill_constant, + kXPU, + kAny, + kNCHW, + paddle::lite::kernels::xpu::FillConstantCompute, + def) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindInput("ShapeTensorList", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/xpu/fill_constant_compute.h b/lite/kernels/xpu/fill_constant_compute.h new file mode 100644 index 0000000000..0f561433c5 --- /dev/null +++ b/lite/kernels/xpu/fill_constant_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class FillConstantCompute : public KernelLite { + public: + using param_t = operators::FillConstantParam; + + virtual void Run(); + + virtual ~FillConstantCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/gru_compute.cc b/lite/kernels/xpu/gru_compute.cc new file mode 100644 index 0000000000..8e7f2c8feb --- /dev/null +++ b/lite/kernels/xpu/gru_compute.cc @@ -0,0 +1,307 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/gru_compute.h" +#include +#include +#include +#include +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +inline xdnn::Activation_t get_gru_act_type(const std::string& type) { + std::map act_type_map = { + {"sigmoid", xdnn::Activation_t::SIGMOID}, + {"tanh", xdnn::Activation_t::TANH}, + {"relu", xdnn::Activation_t::RELU}}; + auto it = act_type_map.find(type); + if (it != act_type_map.end()) { + return it->second; + } else { + LOG(FATAL) << "unsupported activation type: " << type; + } +} + +void GruCompute::PrepareForRun() { + offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + new_offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SEQ_LEN * sizeof(int), false /* use_l3 */); + idx_sorted_by_width_data_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + + idx_sorted_by_width_data_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + offset_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + new_offset_cpu.reset(new int[XPU_MAX_LOD_SEQ_LEN]); + + // find max + maxs_xpu_guard_ = + TargetWrapperXPU::MallocScratchPad(8 * sizeof(float), false /* use_l3 */); + auto& ctx = this->ctx_->As(); + auto& param = this->Param(); + int frame_size = param.input->dims()[1] / 3; + float* weight_ur_max_ptr_xpu = + reinterpret_cast(maxs_xpu_guard_->addr_); + float* weight_c_max_ptr_xpu = weight_ur_max_ptr_xpu + 4; + + // weight_ur_max + int ret = xdnn::findmax(ctx.GetRawContext(), + param.weight->data(), + frame_size * frame_size * 2, + weight_ur_max_ptr_xpu); + CHECK_EQ(ret, 0); + // weight_c_max + ret = xdnn::findmax(ctx.GetRawContext(), + param.weight->data() + frame_size * frame_size * 2, + frame_size * frame_size, + weight_c_max_ptr_xpu); + CHECK_EQ(ret, 0); + + float weight_ur_max_cpu[4]; + XPU_CALL(xpu_memcpy(weight_ur_max_cpu, + weight_ur_max_ptr_xpu, + sizeof(float) * 4, + XPUMemcpyKind::XPU_DEVICE_TO_HOST)); + weight_u_r_max_value = + std::max(std::max(weight_ur_max_cpu[0], weight_ur_max_cpu[1]), + std::max(weight_ur_max_cpu[2], weight_ur_max_cpu[3])); + + float weight_c_max_cpu[4]; + XPU_CALL(xpu_memcpy(weight_c_max_cpu, + 
weight_c_max_ptr_xpu,
+                      sizeof(float) * 4,
+                      XPUMemcpyKind::XPU_DEVICE_TO_HOST));
+  weight_c_max_value =
+      std::max(std::max(weight_c_max_cpu[0], weight_c_max_cpu[1]),
+               std::max(weight_c_max_cpu[2], weight_c_max_cpu[3]));
+}
+
+void GruCompute::prepare_layout(const paddle::lite::LoD& lods,
+                                int* offset_xpu,
+                                int* new_offset_xpu,
+                                int* idx_sorted_by_width_data_xpu) {
+  const auto& lod = lods[0];
+  for (auto i = 0; i < lod.size(); i++) {
+    offset_cpu[i] = lod[i];
+  }
+  for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
+    int length = lod[seq_id + 1] - lod[seq_id];
+    seq_info.push_back(SeqInfo(lod[seq_id], length, seq_id));
+  }
+  std::stable_sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) {
+    return a.length > b.length;
+  });
+  for (auto i = 0; i < seq_info.size(); i++) {
+    idx_sorted_by_width_data_cpu[i] = seq_info[i].seq_idx;
+  }
+  // max_width
+  int max_width = seq_info[0].length;
+  // new_offset_cpu[t] is the number of rows already packed before batch step t
+  new_offset_cpu[0] = 0;
+  int cur_offset_idx = 1;
+  for (auto i = 0; i < seq_info.size(); i++) {
+    int cur_length = seq_info.size() - i;
+    int repeat_times = (i == 0) ? seq_info[i].length
+                                : (seq_info[i].length - seq_info[i - 1].length);
+    for (int j = 0; j < repeat_times; j++) {
+      new_offset_cpu[cur_offset_idx] =
+          new_offset_cpu[cur_offset_idx - 1] + cur_length;
+      cur_offset_idx++;
+    }
+  }
+  XPU_CALL(xpu_memcpy(offset_xpu,
+                      offset_cpu.get(),
+                      sizeof(int) * lod.size(),
+                      XPU_HOST_TO_DEVICE));
+
+  XPU_CALL(xpu_memcpy(idx_sorted_by_width_data_xpu,
+                      idx_sorted_by_width_data_cpu.get(),
+                      sizeof(int) * seq_info.size(),
+                      XPU_HOST_TO_DEVICE));
+
+  XPU_CALL(xpu_memcpy(new_offset_xpu,
+                      new_offset_cpu.get(),
+                      sizeof(int) * (max_width + 1),
+                      XPU_HOST_TO_DEVICE));
+}
+
+void GruCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  auto input = param.input;
+  float* batch_gate = param.batch_gate->mutable_data<float>(TARGET(kXPU));
+  float* batch_reset_hidden_prev =
+      param.batch_reset_hidden_prev->mutable_data<float>(TARGET(kXPU));
+  float* batch_hidden = param.hidden->mutable_data<float>(TARGET(kXPU));
+  bool origin_mode = param.origin_mode;
+  int frame_size = input->dims()[1] / 3;
+
+  int* offset_xpu = reinterpret_cast<int*>(offset_xpu_guard_->addr_);
+  int* new_offset_xpu = reinterpret_cast<int*>(new_offset_xpu_guard_->addr_);
+  int* idx_sorted_by_width_data_xpu =
+      reinterpret_cast<int*>(idx_sorted_by_width_data_xpu_guard_->addr_);
+
+  // prepare seq_info
+  auto lods = input->lod();
+  const auto& lod = lods[0];
+  prepare_layout(
+      lods, offset_xpu, new_offset_xpu, idx_sorted_by_width_data_xpu);
+  int max_width = seq_info[0].length;
+
+  // sequence to batch
+  XPUScratchPadGuard xpu_batch_data_guard_ = TargetWrapperXPU::MallocScratchPad(
+      lod[lod.size() - 1] * frame_size * 3 * sizeof(float), false /*use_l3 */);
+  float* batch_data = reinterpret_cast<float*>(xpu_batch_data_guard_->addr_);
+
+  bool is_reverse = param.is_reverse;
+  if (is_reverse) {
+    int ret = xdnn::sequence_reverse(ctx.GetRawContext(), /* context */
+                                     lod.size() - 1,
+                                     offset_xpu,
+                                     frame_size * 3,
+                                     param.input->data<float>(),
+                                     batch_data);
+    CHECK_EQ(ret, 0);
+    ret = xdnn::search_seq2batch(ctx.GetRawContext(), /* context */
+                                 lod.size() - 1,
+                                 max_width,
+                                 frame_size * 3,
+                                 idx_sorted_by_width_data_xpu,
+                                 offset_xpu,
+                                 new_offset_xpu,
+                                 batch_data,
+                                 batch_data);
+    CHECK_EQ(ret, 0);
+  } else {
+    int ret = xdnn::search_seq2batch(ctx.GetRawContext(), /* context */
+                                 lod.size() - 1,
+                                 max_width,
+                                 frame_size * 3,
+                                 idx_sorted_by_width_data_xpu,
+                                 offset_xpu,
+                                 new_offset_xpu,
+
param.input->data<float>(),
+                                 batch_data);
+    CHECK_EQ(ret, 0);
+  }
+  // prepare the initial hidden state xpu_h0
+  auto* h0 = param.h0;
+  XPUScratchPadGuard xpu_h0_guard_ = TargetWrapperXPU::MallocScratchPad(
+      (lod.size() - 1) * frame_size * sizeof(float), false /*use_l3 */);
+  float* xpu_h0_start = reinterpret_cast<float*>(xpu_h0_guard_->addr_);
+  float* xpu_h0 = xpu_h0_start;
+  if (h0) {
+    for (auto i = 0; i < seq_info.size(); i++) {
+      int ret = xdnn::memcpy_device(
+          ctx.GetRawContext(),
+          xpu_h0 + i * frame_size,
+          h0->data<float>() + seq_info[i].seq_idx * frame_size,
+          sizeof(float) * frame_size);
+      CHECK_EQ(ret, 0);
+    }
+  } else {
+    // initialize with zeros
+    int ret = xdnn::scale(ctx.GetRawContext(),
+                          frame_size * seq_info.size(),
+                          0.0,
+                          0.0,
+                          false,
+                          xpu_h0,
+                          xpu_h0);
+    CHECK_EQ(ret, 0);
+  }
+  // gru
+  for (int batch_idx = 0; batch_idx < max_width; batch_idx++) {
+    float* x = batch_data + new_offset_cpu[batch_idx] * frame_size * 3;
+
+    int ret = xdnn::gru_unit_int31(
+        ctx.GetRawContext(),
+        new_offset_cpu[batch_idx + 1] - new_offset_cpu[batch_idx],
+        frame_size,
+        origin_mode,
+        get_gru_act_type(param.gate_activation),
+        get_gru_act_type(param.activation),
+        x,
+        xpu_h0,
+        param.weight->data<float>(),
+        weight_u_r_max_value,
+        weight_c_max_value,
+        param.bias->data<float>(),
+        batch_gate + new_offset_cpu[batch_idx] * frame_size * 3,
+        batch_reset_hidden_prev + new_offset_cpu[batch_idx] * frame_size,
+        batch_hidden + new_offset_cpu[batch_idx] * frame_size);
+
+    CHECK_EQ(ret, 0);
+    xpu_h0 = batch_hidden + new_offset_cpu[batch_idx] * frame_size;
+  }
+  // batch to sequence
+  if (is_reverse) {
+    int ret = xdnn::search_batch2seq(ctx.GetRawContext(),
+                                     seq_info.size(),
+                                     max_width,
+                                     frame_size,
+                                     idx_sorted_by_width_data_xpu,
+                                     offset_xpu,
+                                     new_offset_xpu,
+                                     batch_hidden,
+                                     batch_data);
+    CHECK_EQ(ret, 0);
+    ret =
+        xdnn::sequence_reverse(ctx.GetRawContext(),
+                               lod.size() - 1,
+                               offset_xpu,
+                               frame_size,
+                               batch_data,
+                               param.hidden->mutable_data<float>(TARGET(kXPU)));
+    CHECK_EQ(ret, 0);
+
+  } else {
+    int ret =
+        xdnn::search_batch2seq(ctx.GetRawContext(),
+                               seq_info.size(),
+                               max_width,
+                               frame_size,
+                               idx_sorted_by_width_data_xpu,
+                               offset_xpu,
+                               new_offset_xpu,
+                               batch_hidden,
+                               param.hidden->mutable_data<float>(TARGET(kXPU)));
+    CHECK_EQ(ret, 0);
+  }
+  seq_info.clear();
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(
+    gru, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::GruCompute, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("H0", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("BatchGate", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("BatchResetHiddenPrev", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("BatchHidden", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Hidden", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/gru_compute.h b/lite/kernels/xpu/gru_compute.h
new file mode 100644
index 0000000000..741aad6ca4
--- /dev/null
+++ b/lite/kernels/xpu/gru_compute.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class GruCompute : public KernelLite { + public: + using param_t = operators::GRUParam; + + void PrepareForRun() override; + + void prepare_layout(const paddle::lite::LoD& lods, + int* offset_xpu, + int* new_offset_xpu, + int* idx_sorted_by_width_data_xpu); + + void Run() override; + + virtual ~GruCompute() = default; + + private: + XPUScratchPadGuard offset_xpu_guard_; + XPUScratchPadGuard new_offset_xpu_guard_; + XPUScratchPadGuard maxs_xpu_guard_; + XPUScratchPadGuard idx_sorted_by_width_data_xpu_guard_; + + float weight_u_r_max_value; + float weight_c_max_value; + + std::unique_ptr idx_sorted_by_width_data_cpu; + std::unique_ptr offset_cpu; + std::unique_ptr new_offset_cpu; + struct SeqInfo { + SeqInfo() = default; + SeqInfo(int start, int length, int seq_idx) + : start(start), length(length), seq_idx(seq_idx) {} + int start; + int length; + int seq_idx; + }; + std::vector seq_info; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/im2sequence_compute.cc b/lite/kernels/xpu/im2sequence_compute.cc new file mode 100644 index 0000000000..a3fba58ce4 --- /dev/null +++ b/lite/kernels/xpu/im2sequence_compute.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
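+//
+// Layout note: xdnn::im2col_ocf emits one row per output position
+// (output-channel-first), so each image becomes a sequence of
+// output_height * output_width steps whose features are the flattened
+// channel * kernel_h * kernel_w patch. E.g. a 1x8x32 (CHW) image with an
+// 8x8 kernel and stride 8 yields a 4-step sequence, the usual CRNN-style
+// input for the OCR models this patch targets.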
+ +#include "lite/kernels/xpu/im2sequence_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +inline int Im2SeqOutputSize( + int input_size, int filter_size, int padding_0, int padding_1, int stride) { + const int output_size = + (input_size + padding_0 + padding_1 - filter_size) / stride + 1; + return output_size; +} + +void Im2SequenceCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + auto x_dims = param.X->dims(); + + int batch = x_dims[0]; + int channel = x_dims[1]; + int height = x_dims[2]; + int width = x_dims[3]; + int kernel_h = param.kernels[0]; + int kernel_w = param.kernels[1]; + int stride_h = param.strides[0]; + int stride_w = param.strides[1]; + int dilation_h = 1; + int dilation_w = 1; + int pad_h = param.paddings[0]; + int pad_w = param.paddings[1]; + + int output_height = + Im2SeqOutputSize(height, kernel_h, pad_h, pad_h, stride_h); + int output_width = Im2SeqOutputSize(width, kernel_w, pad_w, pad_w, stride_w); + + std::vector out_offset; + out_offset.push_back(0); + out_offset.push_back(output_height * output_width); + + for (int batch_idx = 0; batch_idx < batch; batch_idx++) { + int r = xdnn::im2col_ocf( + ctx.GetRawContext(), /* context */ + channel, + height, + width, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + param.X->data() + batch_idx * channel * height * width, + param.Out->mutable_data(TARGET(kXPU)) + + batch_idx * output_height * output_width * channel * kernel_h * + kernel_w); + CHECK_EQ(r, 0); + } + auto lod = param.Out->mutable_lod(); + lod->resize(1); + (*lod)[0] = out_offset; +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(im2sequence, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::Im2SequenceCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/im2sequence_compute.h b/lite/kernels/xpu/im2sequence_compute.h new file mode 100644 index 0000000000..b1b5a27d0e --- /dev/null +++ b/lite/kernels/xpu/im2sequence_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class Im2SequenceCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::Im2SequenceParam;
+
+  virtual void Run();
+
+  virtual ~Im2SequenceCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/interpolate_compute.cc b/lite/kernels/xpu/interpolate_compute.cc
new file mode 100644
index 0000000000..5fe15baa31
--- /dev/null
+++ b/lite/kernels/xpu/interpolate_compute.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/interpolate_compute.h"
+#include 
+#include 
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void BilinearInterpCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+  auto x_dims = param.X->dims();
+  CHECK_EQ(x_dims.size(), 4);
+  int n = x_dims[0];
+  int c = x_dims[1];
+  int in_h = x_dims[2];
+  int in_w = x_dims[3];
+
+  int out_w = param.out_w;
+  int out_h = param.out_h;
+  float scale = param.scale;
+  if (scale > 0) {
+    out_h = static_cast<int>(in_h * scale);
+    out_w = static_cast<int>(in_w * scale);
+  }
+  if (param.OutSize != nullptr) {
+    out_h = param.OutSize->data<int>()[0];
+    out_w = param.OutSize->data<int>()[1];
+  }
+  bool align_corners = param.align_corners;
+  CHECK_EQ(align_corners, 1) << "XPU only supports align_corners = true";
+
+  int r = xdnn::bilinear_interp(ctx.GetRawContext(), /* context */
+                                param.X->data<float>(),
+                                param.Out->mutable_data<float>(TARGET(kXPU)),
+                                n,
+                                c,
+                                in_h,
+                                in_w,
+                                out_h,
+                                out_w,
+                                align_corners,
+                                1);
+  CHECK_EQ(r, 0);
+}
+
+void NearestInterpCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+  auto x_dims = param.X->dims();
+  CHECK_EQ(x_dims.size(), 4);
+  int n = x_dims[0];
+  int c = x_dims[1];
+  int in_h = x_dims[2];
+  int in_w = x_dims[3];
+
+  int out_w = param.out_w;
+  int out_h = param.out_h;
+  float scale = param.scale;
+  if (scale > 0) {
+    out_h = static_cast<int>(in_h * scale);
+    out_w = static_cast<int>(in_w * scale);
+  }
+
+  if (param.OutSize != nullptr) {
+    out_h = param.OutSize->data<int>()[0];
+    out_w = param.OutSize->data<int>()[1];
+  }
+  bool align_corners = param.align_corners;
+
+  int r = xdnn::interpolate(ctx.GetRawContext(), /* context */
+                            param.X->data<float>(),
+                            param.Out->mutable_data<float>(TARGET(kXPU)),
+                            n,
+                            c,
+                            in_h,
+                            in_w,
+                            out_h,
+                            out_w,
+                            align_corners);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(bilinear_interp,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::BilinearInterpCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("OutSize",
+               {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))})
+    .BindInput("SizeTensor",
+
{LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(nearest_interp, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::NearestInterpCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("OutSize", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/interpolate_compute.h b/lite/kernels/xpu/interpolate_compute.h new file mode 100644 index 0000000000..d09c2f6801 --- /dev/null +++ b/lite/kernels/xpu/interpolate_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class BilinearInterpCompute + : public KernelLite { + public: + using param_t = operators::InterpolateParam; + void Run() override; + virtual ~BilinearInterpCompute() = default; +}; + +class NearestInterpCompute + : public KernelLite { + public: + using param_t = operators::InterpolateParam; + void Run() override; + + virtual ~NearestInterpCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/lrn_compute.cc b/lite/kernels/xpu/lrn_compute.cc new file mode 100644 index 0000000000..cfc41e9b99 --- /dev/null +++ b/lite/kernels/xpu/lrn_compute.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
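+//
+// The AcrossChannels path computes the usual local response normalization,
+//   out[i] = in[i] / (k + alpha * sum(in[j]^2))^beta,
+// where the sum runs over the n channels centred on i; n, k, alpha and beta
+// are forwarded unchanged to xdnn::lrn_fwd below.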
+ +#include "lite/kernels/xpu/lrn_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void LrnCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + auto x_dims = param.X->dims(); + int batch = x_dims[0]; + int channel = x_dims[1]; + int h = x_dims[2]; + int w = x_dims[3]; + int n = param.n; + float alpha = param.alpha; + float beta = param.beta; + float k = param.k; + if (param.norm_region == "AcrossChannels") { + int r = xdnn::lrn_fwd(ctx.GetRawContext(), + param.X->data(), + param.Out->mutable_data(TARGET(kXPU)), + batch, + channel, + h, + w, + n, + k, + alpha, + beta); + CHECK_EQ(r, 0); + } else { + LOG(FATAL) << "Unsupport Norm Region Type: " << param.norm_region; + } +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + lrn, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::LrnCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("MidOut", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/lrn_compute.h b/lite/kernels/xpu/lrn_compute.h new file mode 100644 index 0000000000..fdd0df19a4 --- /dev/null +++ b/lite/kernels/xpu/lrn_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class LrnCompute : public KernelLite { + public: + using param_t = operators::LrnParam; + + virtual void Run(); + + virtual ~LrnCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/split_compute.cc b/lite/kernels/xpu/split_compute.cc new file mode 100644 index 0000000000..ddb2b48bc3 --- /dev/null +++ b/lite/kernels/xpu/split_compute.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/split_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SplitCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + auto& dout = param.output; + auto in_dim = param.x->dims(); + auto axis = param.axis; + + int height = 1; + for (int i = 0; i < axis; i++) { + height = height * in_dim[i]; + } + + int n = 0; + std::vector out_ptrs; + std::vector width_out; + + for (auto out : dout) { + n++; + out->set_lod(param.x->lod()); + out_ptrs.push_back(out->mutable_data(TARGET(kXPU))); + int out_strides = out->numel(); + width_out.push_back(out_strides / height); + } + + int r = xdnn::concat_grad(ctx.GetRawContext(), + height, + width_out.data(), + n, + out_ptrs.data(), + param.x->data()); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + split, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SplitCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("SectionsTensorList", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/split_compute.h b/lite/kernels/xpu/split_compute.h new file mode 100644 index 0000000000..6320e7a7bd --- /dev/null +++ b/lite/kernels/xpu/split_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SplitCompute : public KernelLite { + public: + using param_t = operators::SplitParam; + + virtual void Run(); + + virtual ~SplitCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sum_compute.cc b/lite/kernels/xpu/sum_compute.cc new file mode 100644 index 0000000000..afba90b121 --- /dev/null +++ b/lite/kernels/xpu/sum_compute.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/sum_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SumCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + int N = param.x.size(); + if (N == 1) { + param.output->ShareDataWith(*param.x[0]); + return; + } + std::vector ptrs(N, nullptr); + for (int i = 0; i < N; i++) { + ptrs[i] = param.x[i]->data(); + } + int out_numel = param.output->numel(); + int r = xdnn::sum_batch(ctx.GetRawContext(), + ptrs.data(), + param.output->mutable_data(TARGET(kXPU)), + N, + out_numel); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + sum, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SumCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sum_compute.h b/lite/kernels/xpu/sum_compute.h new file mode 100644 index 0000000000..c141de2f58 --- /dev/null +++ b/lite/kernels/xpu/sum_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SumCompute : public KernelLite { + public: + using param_t = operators::SumParam; + + virtual void Run(); + + virtual ~SumCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/topk_compute.cc b/lite/kernels/xpu/topk_compute.cc new file mode 100644 index 0000000000..21eddee9b5 --- /dev/null +++ b/lite/kernels/xpu/topk_compute.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/topk_compute.h" +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void TopkCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + DDim x_dims = param.X->dims(); + int K = param.K; + int dim_size = x_dims.size(); + int m = x_dims.production() / x_dims[dim_size - 1]; + int n = x_dims[dim_size - 1]; + + XPUScratchPadGuard indices_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + m * K * sizeof(int), false /* use_l3 */); + + int* indices_int32_device = reinterpret_cast(indices_xpu_guard_->addr_); + int64_t* indices_int64_device = + param.Indices->mutable_data(TARGET(kXPU)); + + int r = xdnn::topk(ctx.GetRawContext(), + param.X->data(), + param.Out->mutable_data(TARGET(kXPU)), + indices_int32_device, + m, + n, + K); + CHECK_EQ(r, 0); + + r = xdnn::cast( + ctx.GetRawContext(), indices_int32_device, indices_int64_device, m * K); + + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + top_k, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::TopkCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Indices", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/xpu/topk_compute.h b/lite/kernels/xpu/topk_compute.h new file mode 100644 index 0000000000..670bacc675 --- /dev/null +++ b/lite/kernels/xpu/topk_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class TopkCompute : public KernelLite { + public: + using param_t = operators::TopkParam; + + virtual void Run(); + + virtual ~TopkCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 6cdf815a6f..e86784a1bd 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -26,6 +26,7 @@ add_operator(interpolate_op basic SRCS interpolate_op.cc DEPS ${op_DEPS}) add_operator(argmax_op basic SRCS argmax_op.cc DEPS ${op_DEPS}) add_operator(prior_box_op basic SRCS prior_box_op.cc DEPS ${op_DEPS}) add_operator(concat_op basic SRCS concat_op.cc DEPS ${op_DEPS}) +add_operator(sum_op basic SRCS sum_op.cc DEPS ${op_DEPS}) add_operator(pad2d_op basic SRCS pad2d_op.cc DEPS ${op_DEPS}) add_operator(calib_op basic SRCS calib_op.cc DEPS ${op_DEPS}) add_operator(split_op basic SRCS split_op.cc DEPS ${op_DEPS}) diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 33da913d2e..0eccd8361d 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -354,6 +354,31 @@ struct ReshapeParam : ParamBase { } }; +// For Sum op +struct SumParam : ParamBase { + std::vector x{}; + lite::Tensor* output{}; + bool use_mkldnn{false}; + // get a vector of input tensors + const std::vector* input_tensor_ptrs() override { + if (!input_tensor_ptrs_cache_) { + std::vector vec; + for (auto in : x) { + vec.push_back(in); + } + input_tensor_ptrs_cache_.reset(new std::vector(vec)); + } + return input_tensor_ptrs_cache_.get(); + } + // get a vector of output tensors + std::vector* output_tensor_ptrs() override { + if (!output_tensor_ptrs_cache_) { + output_tensor_ptrs_cache_.reset(new std::vector({output})); + } + return output_tensor_ptrs_cache_.get(); + } +}; + // For Concat op struct ConcatParam : ParamBase { std::vector x{}; diff --git a/lite/operators/sum_op.cc b/lite/operators/sum_op.cc new file mode 100644 index 0000000000..d09e97184f --- /dev/null +++ b/lite/operators/sum_op.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sum_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SumOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.output); + CHECK_GE_OR_FALSE(param_.x.size(), 1UL); + return true; +} + +bool SumOpLite::InferShapeImpl() const { + const std::vector &inputs = param_.x; + const size_t n = inputs.size(); + CHECK_GT_OR_FALSE(n, 0); + + auto out_dims = inputs[0]->dims(); + // Set output dims + param_.output->Resize(out_dims); + auto out_lod = param_.output->mutable_lod(); + *out_lod = param_.x[0]->lod(); + return true; +} + +// TODO(Superjomn) replace framework::OpDesc with a lite one. 
+bool SumOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + AttachParam(¶m_); + auto inputs = op_desc.Input("X"); + auto out = op_desc.Output("Out").front(); + + param_.x.clear(); + for (auto var : inputs) { + param_.x.push_back(scope->FindVar(var)->GetMutable()); + } + CHECK(scope->FindVar(out)); + param_.output = scope->FindVar(out)->GetMutable(); + param_.use_mkldnn = op_desc.GetAttr("use_mkldnn"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sum, paddle::lite::operators::SumOpLite); diff --git a/lite/operators/sum_op.h b/lite/operators/sum_op.h new file mode 100644 index 0000000000..d00452ccf0 --- /dev/null +++ b/lite/operators/sum_op.h @@ -0,0 +1,60 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SumOpLite : public OpLite { + public: + SumOpLite() {} + explicit SumOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sum"; } + +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) { + auto output_dims = param_.output->dims(); + std::string inputs_shape = ""; + for (size_t i = 0; i < param_.x.size(); ++i) { + inputs_shape += ch->DimToStr(param_.x[i]->dims()); + if (i != param_.x.size() - 1) inputs_shape += "/"; + } + ch->input_shape = inputs_shape; + ch->output_shape = ch->DimToStr(output_dims); + ch->macs = 0.f; // no calc. only io operation + } +#endif + + private: + mutable SumParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle -- GitLab