diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt
index 31996dab195c7d5d7e99d917b8d251b35f477f8f..c4b03b03072b36ff10d53f7da9a90b8ea5607818 100644
--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
@@ -41,7 +41,6 @@ add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_
 add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(unsqueeze_compute_arm ARM basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt
index a0085e6d6c5e65667e96393c42a1608c8dd24d0c..078fad7aa0221a0e60b1f4dd928136b38f198dcb 100644
--- a/lite/kernels/host/CMakeLists.txt
+++ b/lite/kernels/host/CMakeLists.txt
@@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_
 add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(expand_compute_host Host basic SRCS expand_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(shape_compute_host Host extra SRCS shape_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(is_empty_compute_host Host extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
diff --git a/lite/kernels/arm/expand_compute.cc b/lite/kernels/host/expand_compute.cc
similarity index 63%
rename from lite/kernels/arm/expand_compute.cc
rename to lite/kernels/host/expand_compute.cc
index 73bcae909e7016b6b3cf9d2b0091299b44cea3db..cb7241a47371b4793b1bcd24353c7f09669d6f8e 100644
--- a/lite/kernels/arm/expand_compute.cc
+++ b/lite/kernels/host/expand_compute.cc
@@ -12,24 +12,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/kernels/arm/expand_compute.h"
+#include "lite/kernels/host/expand_compute.h"
 #include <vector>
-#include "lite/core/op_registry.h"
-#include "lite/core/type_system.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
+namespace host {
 
-void ExpandCompute::Run() {
-  auto& param = Param<operators::ExpandParam>();
+template <typename T, PrecisionType PType>
+void ExpandCompute<T, PType>::Run() {
+  auto& param = this->template Param<operators::ExpandParam>();
   const auto* x = param.X;
   auto* out = param.Out;
   std::vector<int> expand_times = param.expand_times;
-  const float* src = x->data<float>();
-  float* dst = out->mutable_data<float>();
+  const T* src = x->template data<T>();
+  T* dst = out->template mutable_data<T>();
 
   int dims = expand_times.size();
   DDim in_shape = x->dims();
 
@@ -42,7 +41,7 @@
       for (int k = 0; k < expand_times[i]; ++k) {
         memcpy(dst + (j * expand_times[i] + k) * inner_num,
                src + j * inner_num,
-               sizeof(float) * inner_num);
+               sizeof(T) * inner_num);
       }
     }
     inner_num *= expand_times[i];
@@ -53,20 +52,27 @@
       for (int k = expand_times[i] - 1; k >= 0; --k) {
         memcpy(dst + (j * expand_times[i] + k) * inner_num,
                dst + j * inner_num,
-               sizeof(float) * inner_num);
+               sizeof(T) * inner_num);
       }
     }
     inner_num *= expand_times[i];
   }
 }
 
-}  // namespace arm
+}  // namespace host
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_LITE_KERNEL(
-    expand, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ExpandCompute, def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+using expand_float =
+    paddle::lite::kernels::host::ExpandCompute<float, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(expand, kHost, kFloat, kAny, expand_float, def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kAny))})
     .Finalize();
diff --git a/lite/kernels/arm/expand_compute.h b/lite/kernels/host/expand_compute.h
similarity index 84%
rename from lite/kernels/arm/expand_compute.h
rename to lite/kernels/host/expand_compute.h
index d872c2a60b613bb05ee36698cb31ceef0d5eed3e..8bb9422501fa4ffb77472a5c898a838d3b6cc7e1 100644
--- a/lite/kernels/arm/expand_compute.h
+++ b/lite/kernels/host/expand_compute.h
@@ -19,16 +19,18 @@
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
+namespace host {
 
-class ExpandCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+template <typename T, PrecisionType PType>
+class ExpandCompute
+    : public KernelLite<TARGET(kHost), PType, DATALAYOUT(kAny)> {
  public:
   void Run() override;
 
   virtual ~ExpandCompute() = default;
 };
 
-}  // namespace arm
+}  // namespace host
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt
index f2974bf6103da4e8470926b4cc1ef07e5530fd2c..5157f47867160cf4f705306ca37cfad962373386 100644
--- a/lite/kernels/npu/bridges/CMakeLists.txt
+++ b/lite/kernels/npu/bridges/CMakeLists.txt
@@ -49,6 +49,7 @@ lite_cc_library(subgraph_bridge_fill_constant_op_npu SRCS fill_constant_op.cc DE
 lite_cc_library(subgraph_bridge_fill_constant_batch_size_like_op_npu SRCS fill_constant_batch_size_like_op.cc DEPS ${npu_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_increment_op_npu SRCS increment_op.cc DEPS ${npu_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_compare_op_npu SRCS compare_op.cc DEPS ${npu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_expand_op_npu SRCS expand_op.cc DEPS ${npu_subgraph_bridge_deps})
 #lite_cc_library(subgraph_bridge_shape_op_npu SRCS shape_op.cc DEPS ${npu_subgraph_bridge_deps})
 
@@ -87,6 +88,7 @@ set(npu_subgraph_bridges
         subgraph_bridge_fill_constant_batch_size_like_op_npu
         subgraph_bridge_increment_op_npu
         subgraph_bridge_compare_op_npu
+        subgraph_bridge_expand_op_npu
         CACHE INTERNAL "npu_subgraph_bridges")
 
 message(STATUS "+++++ npu_subgraph_bridges: ${npu_subgraph_bridges}")
diff --git a/lite/kernels/npu/bridges/expand_op.cc b/lite/kernels/npu/bridges/expand_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62501ab76c46c714af6be95c68b85d22e1e044c9
--- /dev/null
+++ b/lite/kernels/npu/bridges/expand_op.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/npu/bridges/graph.h"
+#include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace npu {
+
+int ExpandConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";
+
+  // Get input, output and op attributes
+  auto x_name = op_info->Input("X").front();
+  auto x = scope->FindTensor(x_name);
+  auto x_dims = x->dims();
+
+  auto out_name = op_info->Output("Out").front();
+
+  auto expand_times = op_info->GetAttr<std::vector<int>>("expand_times");
+
+  // x node
+  std::shared_ptr<Node> x_node = nullptr;
+  if (graph->Has(x_name)) {
+    x_node = graph->Get(x_name);
+  } else {
+    x_node = graph->Add(x_name, *x);
+  }
+
+  // w node
+  std::shared_ptr<Node> w_node = graph->Add(out_name + "/w", expand_times);
+
+  // expand node
+  auto expand_node = graph->Add<ge::op::Expand>(out_name);
+  auto expand_op = expand_node->data<ge::op::Expand>();
+  expand_op->set_input_x(*x_node->data());
+  expand_op->set_input_w(*w_node->data());
+
+  return REBUILD_WHEN_SHAPE_CHANGED;
+}
+
+}  // namespace npu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(expand,
+                         kNPU,
+                         paddle::lite::subgraph::npu::ExpandConverter);
diff --git a/lite/kernels/npu/bridges/paddle_use_bridges.h b/lite/kernels/npu/bridges/paddle_use_bridges.h
index 3965bd64ff534eb1bc2506527e80cca8a19056b2..b6ce66fe34963d8c3bc9c2bccc0f3a294ab16290 100644
--- a/lite/kernels/npu/bridges/paddle_use_bridges.h
+++ b/lite/kernels/npu/bridges/paddle_use_bridges.h
@@ -38,6 +38,7 @@ USE_SUBGRAPH_BRIDGE(elementwise_add, kNPU);
 USE_SUBGRAPH_BRIDGE(elementwise_sub, kNPU);
 USE_SUBGRAPH_BRIDGE(elementwise_mul, kNPU);
 USE_SUBGRAPH_BRIDGE(elementwise_div, kNPU);
+USE_SUBGRAPH_BRIDGE(expand, kNPU);
 USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kNPU);
 USE_SUBGRAPH_BRIDGE(fusion_elementwise_sub_activation, kNPU);
 USE_SUBGRAPH_BRIDGE(fusion_elementwise_mul_activation, kNPU);
diff --git a/lite/tests/kernels/expand_compute_test.cc b/lite/tests/kernels/expand_compute_test.cc
index 4ab1c15a5e78f562bc4270cd57c5f0dd3600bbe2..75d5aa65f5a7eba179f5da23e2497434f9cdb1dc 100644
--- a/lite/tests/kernels/expand_compute_test.cc
+++ b/lite/tests/kernels/expand_compute_test.cc
@@ -84,7 +84,7 @@ class ExpandComputeTester : public arena::TestCase {
   }
 };
 
-void test_expand_3dim(Place place) {
+void test_expand_3dim(Place place, float abs_error) {
   for (std::vector<int> expand_times : {std::vector<int>({2, 3, 1}),
                                         std::vector<int>({2, 2, 2}),
                                         std::vector<int>({3, 1, 2})}) {
@@ -93,7 +93,7 @@
         for (int W : {4}) {
           std::unique_ptr<arena::TestCase> tester(new ExpandComputeTester(
               place, "def", expand_times, DDim({C, H, W})));
-          arena::Arena arena(std::move(tester), place, 2e-5);
+          arena::Arena arena(std::move(tester), place, abs_error);
           arena.TestPrecision();
         }
       }
@@ -101,7 +101,7 @@
   }
 }
 
-void test_expand_4dim(Place place) {
+void test_expand_4dim(Place place, float abs_error) {
   for (std::vector<int> expand_times : {std::vector<int>({2, 3, 1, 4}),
                                         std::vector<int>({2, 2, 2, 2}),
                                         std::vector<int>({3, 1, 2, 1})}) {
@@ -111,7 +111,7 @@
           for (int W : {4}) {
             std::unique_ptr<arena::TestCase> tester(new ExpandComputeTester(
                 place, "def", expand_times, DDim({N, C, H, W})));
-            arena::Arena arena(std::move(tester), place, 2e-5);
+            arena::Arena arena(std::move(tester), place, abs_error);
             arena.TestPrecision();
           }
         }
@@ -121,14 +121,19 @@
 }
 
 TEST(Expand, precision) {
-#ifdef LITE_WITH_X86
-  Place place(TARGET(kX86));
-#endif
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  test_expand_3dim(place);
-  test_expand_4dim(place);
+  float abs_error = 1e-5;
+  Place place;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kHost);
+#else
+  return;
 #endif
+
+  test_expand_3dim(place, abs_error);
+  test_expand_4dim(place, abs_error);
 }
 
 }  // namespace lite