未验证 提交 f99e28b4 编写于 作者: Z zhupengyang 提交者: GitHub

[NPU] support expand op (#3594)

上级 e9b94f04
...@@ -41,7 +41,6 @@ add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_ ...@@ -41,7 +41,6 @@ add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_
add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(unsqueeze_compute_arm ARM basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(unsqueeze_compute_arm ARM basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
......
...@@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_ ...@@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_
add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps}) add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps}) add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps})
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps}) add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(expand_compute_host Host basic SRCS expand_compute.cc DEPS ${lite_kernel_deps})
add_kernel(shape_compute_host Host extra SRCS shape_compute.cc DEPS ${lite_kernel_deps}) add_kernel(shape_compute_host Host extra SRCS shape_compute.cc DEPS ${lite_kernel_deps})
add_kernel(is_empty_compute_host Host extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps}) add_kernel(is_empty_compute_host Host extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps})
add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps}) add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
......
...@@ -12,24 +12,23 @@ ...@@ -12,24 +12,23 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/kernels/arm/expand_compute.h" #include "lite/kernels/host/expand_compute.h"
#include <vector> #include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace host {
void ExpandCompute::Run() { template <typename T, PrecisionType PType>
auto& param = Param<operators::ExpandParam>(); void ExpandCompute<T, PType>::Run() {
auto& param = this->template Param<operators::ExpandParam>();
const auto* x = param.X; const auto* x = param.X;
auto* out = param.Out; auto* out = param.Out;
std::vector<int> expand_times = param.expand_times; std::vector<int> expand_times = param.expand_times;
const float* src = x->data<float>(); const T* src = x->template data<T>();
float* dst = out->mutable_data<float>(); T* dst = out->template mutable_data<T>();
int dims = expand_times.size(); int dims = expand_times.size();
DDim in_shape = x->dims(); DDim in_shape = x->dims();
...@@ -42,7 +41,7 @@ void ExpandCompute::Run() { ...@@ -42,7 +41,7 @@ void ExpandCompute::Run() {
for (int k = 0; k < expand_times[i]; ++k) { for (int k = 0; k < expand_times[i]; ++k) {
memcpy(dst + (j * expand_times[i] + k) * inner_num, memcpy(dst + (j * expand_times[i] + k) * inner_num,
src + j * inner_num, src + j * inner_num,
sizeof(float) * inner_num); sizeof(T) * inner_num);
} }
} }
inner_num *= expand_times[i]; inner_num *= expand_times[i];
...@@ -53,20 +52,27 @@ void ExpandCompute::Run() { ...@@ -53,20 +52,27 @@ void ExpandCompute::Run() {
for (int k = expand_times[i] - 1; k >= 0; --k) { for (int k = expand_times[i] - 1; k >= 0; --k) {
memcpy(dst + (j * expand_times[i] + k) * inner_num, memcpy(dst + (j * expand_times[i] + k) * inner_num,
dst + j * inner_num, dst + j * inner_num,
sizeof(float) * inner_num); sizeof(T) * inner_num);
} }
} }
inner_num *= expand_times[i]; inner_num *= expand_times[i];
} }
} }
} // namespace arm } // namespace host
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_LITE_KERNEL( using expand_float =
expand, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ExpandCompute, def) paddle::lite::kernels::host::ExpandCompute<float, PRECISION(kFloat)>;
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) REGISTER_LITE_KERNEL(expand, kHost, kFloat, kAny, expand_float, def)
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.Finalize(); .Finalize();
...@@ -19,16 +19,18 @@ ...@@ -19,16 +19,18 @@
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace host {
class ExpandCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> { template <typename T, PrecisionType PType>
class ExpandCompute
: public KernelLite<TARGET(kHost), PType, DATALAYOUT(kAny)> {
public: public:
void Run() override; void Run() override;
virtual ~ExpandCompute() = default; virtual ~ExpandCompute() = default;
}; };
} // namespace arm } // namespace host
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -49,6 +49,7 @@ lite_cc_library(subgraph_bridge_fill_constant_op_npu SRCS fill_constant_op.cc DE ...@@ -49,6 +49,7 @@ lite_cc_library(subgraph_bridge_fill_constant_op_npu SRCS fill_constant_op.cc DE
lite_cc_library(subgraph_bridge_fill_constant_batch_size_like_op_npu SRCS fill_constant_batch_size_like_op.cc DEPS ${npu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_fill_constant_batch_size_like_op_npu SRCS fill_constant_batch_size_like_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_increment_op_npu SRCS increment_op.cc DEPS ${npu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_increment_op_npu SRCS increment_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_compare_op_npu SRCS compare_op.cc DEPS ${npu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_compare_op_npu SRCS compare_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_expand_op_npu SRCS expand_op.cc DEPS ${npu_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_shape_op_npu SRCS shape_op.cc DEPS ${npu_subgraph_bridge_deps}) #lite_cc_library(subgraph_bridge_shape_op_npu SRCS shape_op.cc DEPS ${npu_subgraph_bridge_deps})
...@@ -87,6 +88,7 @@ set(npu_subgraph_bridges ...@@ -87,6 +88,7 @@ set(npu_subgraph_bridges
subgraph_bridge_fill_constant_batch_size_like_op_npu subgraph_bridge_fill_constant_batch_size_like_op_npu
subgraph_bridge_increment_op_npu subgraph_bridge_increment_op_npu
subgraph_bridge_compare_op_npu subgraph_bridge_compare_op_npu
subgraph_bridge_expand_op_npu
CACHE INTERNAL "npu_subgraph_bridges") CACHE INTERNAL "npu_subgraph_bridges")
message(STATUS "+++++ npu_subgraph_bridges: ${npu_subgraph_bridges}") message(STATUS "+++++ npu_subgraph_bridges: ${npu_subgraph_bridges}")
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {
// Bridges the paddle "expand" op onto the NPU graph as an HiAI Tile node:
// output dim i is input dim i repeated expand_times[i] times.
// Returns REBUILD_WHEN_SHAPE_CHANGED so the subgraph is rebuilt whenever
// the input shape changes (the tiled shape depends on it).
int ExpandConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto* graph = static_cast<Graph*>(ctx);
  auto* op_info = op->op_info();
  auto op_type = op_info->Type();
  auto* scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  // Input/output names and the per-dimension repeat counts.
  auto x_name = op_info->Input("X").front();
  auto* x = scope->FindTensor(x_name);
  auto x_dims = x->dims();
  auto out_name = op_info->Output("Out").front();
  auto expand_times = op_info->GetAttr<std::vector<int>>("expand_times");

  // Reuse the graph node for X if an upstream op already produced it;
  // otherwise register the tensor as a new node.
  std::shared_ptr<Node> input_node =
      graph->Has(x_name) ? graph->Get(x_name) : graph->Add(x_name, *x);

  // Constant node carrying the repeat counts — the "w" input of Tile.
  auto times_node = graph->Add(out_name + "/w", expand_times);

  // Tile node that performs the expansion.
  auto tile_node = graph->Add<ge::op::Tile>(out_name);
  auto* tile_op = tile_node->data<ge::op::Tile>();
  tile_op->set_input_x(*input_node->data());
  tile_op->set_input_w(*times_node->data());
  return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(expand,
kNPU,
paddle::lite::subgraph::npu::ExpandConverter);
...@@ -38,6 +38,7 @@ USE_SUBGRAPH_BRIDGE(elementwise_add, kNPU); ...@@ -38,6 +38,7 @@ USE_SUBGRAPH_BRIDGE(elementwise_add, kNPU);
USE_SUBGRAPH_BRIDGE(elementwise_sub, kNPU); USE_SUBGRAPH_BRIDGE(elementwise_sub, kNPU);
USE_SUBGRAPH_BRIDGE(elementwise_mul, kNPU); USE_SUBGRAPH_BRIDGE(elementwise_mul, kNPU);
USE_SUBGRAPH_BRIDGE(elementwise_div, kNPU); USE_SUBGRAPH_BRIDGE(elementwise_div, kNPU);
USE_SUBGRAPH_BRIDGE(expand, kNPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kNPU); USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kNPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_sub_activation, kNPU); USE_SUBGRAPH_BRIDGE(fusion_elementwise_sub_activation, kNPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_mul_activation, kNPU); USE_SUBGRAPH_BRIDGE(fusion_elementwise_mul_activation, kNPU);
......
...@@ -84,7 +84,7 @@ class ExpandComputeTester : public arena::TestCase { ...@@ -84,7 +84,7 @@ class ExpandComputeTester : public arena::TestCase {
} }
}; };
void test_expand_3dim(Place place) { void test_expand_3dim(Place place, float abs_error) {
for (std::vector<int> expand_times : {std::vector<int>({2, 3, 1}), for (std::vector<int> expand_times : {std::vector<int>({2, 3, 1}),
std::vector<int>({2, 2, 2}), std::vector<int>({2, 2, 2}),
std::vector<int>({3, 1, 2})}) { std::vector<int>({3, 1, 2})}) {
...@@ -93,7 +93,7 @@ void test_expand_3dim(Place place) { ...@@ -93,7 +93,7 @@ void test_expand_3dim(Place place) {
for (int W : {4}) { for (int W : {4}) {
std::unique_ptr<arena::TestCase> tester(new ExpandComputeTester( std::unique_ptr<arena::TestCase> tester(new ExpandComputeTester(
place, "def", expand_times, DDim({C, H, W}))); place, "def", expand_times, DDim({C, H, W})));
arena::Arena arena(std::move(tester), place, 2e-5); arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision(); arena.TestPrecision();
} }
} }
...@@ -101,7 +101,7 @@ void test_expand_3dim(Place place) { ...@@ -101,7 +101,7 @@ void test_expand_3dim(Place place) {
} }
} }
void test_expand_4dim(Place place) { void test_expand_4dim(Place place, float abs_error) {
for (std::vector<int> expand_times : {std::vector<int>({2, 3, 1, 4}), for (std::vector<int> expand_times : {std::vector<int>({2, 3, 1, 4}),
std::vector<int>({2, 2, 2, 2}), std::vector<int>({2, 2, 2, 2}),
std::vector<int>({3, 1, 2, 1})}) { std::vector<int>({3, 1, 2, 1})}) {
...@@ -111,7 +111,7 @@ void test_expand_4dim(Place place) { ...@@ -111,7 +111,7 @@ void test_expand_4dim(Place place) {
for (int W : {4}) { for (int W : {4}) {
std::unique_ptr<arena::TestCase> tester(new ExpandComputeTester( std::unique_ptr<arena::TestCase> tester(new ExpandComputeTester(
place, "def", expand_times, DDim({N, C, H, W}))); place, "def", expand_times, DDim({N, C, H, W})));
arena::Arena arena(std::move(tester), place, 2e-5); arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision(); arena.TestPrecision();
} }
} }
...@@ -121,14 +121,19 @@ void test_expand_4dim(Place place) { ...@@ -121,14 +121,19 @@ void test_expand_4dim(Place place) {
} }
TEST(Expand, precision) { TEST(Expand, precision) {
#ifdef LITE_WITH_X86 float abs_error = 1e-5;
Place place(TARGET(kX86)); Place place;
#endif #if defined(LITE_WITH_NPU)
#ifdef LITE_WITH_ARM place = TARGET(kNPU);
Place place(TARGET(kARM)); abs_error = 1e-2; // Using fp16 in NPU
test_expand_3dim(place); #elif defined(LITE_WITH_ARM)
test_expand_4dim(place); place = TARGET(kHost);
#else
return;
#endif #endif
test_expand_3dim(place, abs_error);
test_expand_4dim(place, abs_error);
} }
} // namespace lite } // namespace lite
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册