diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index 4e104ef748ba884c9cbe4ba8f01260752d6a13af..63bdb4f57bb5d196daedc6d4c57737fdcb2ad3a1 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -42,6 +42,7 @@ lite_cc_library(subgraph_bridge_unsqueeze_op_npu SRCS unsqueeze_op.cc DEPS ${npu lite_cc_library(subgraph_bridge_argmax_op_npu SRCS argmax_op.cc DEPS ${npu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_instance_norm_op_npu SRCS instance_norm_op.cc DEPS ${npu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_dropout_op_npu SRCS dropout_op.cc DEPS ${npu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_layer_norm_op_npu SRCS layer_norm_op.cc DEPS ${npu_subgraph_bridge_deps}) set(npu_subgraph_bridges subgraph_bridge_registry @@ -71,6 +72,7 @@ set(npu_subgraph_bridges subgraph_bridge_argmax_op_npu subgraph_bridge_instance_norm_op_npu subgraph_bridge_dropout_op_npu + subgraph_bridge_layer_norm_op_npu CACHE INTERNAL "npu_subgraph_bridges") message(STATUS "+++++ npu_subgraph_bridges: ${npu_subgraph_bridges}") diff --git a/lite/kernels/npu/bridges/instance_norm_op.cc b/lite/kernels/npu/bridges/instance_norm_op.cc index f2b8db7eaf15eea0449d47bae5e156cf8d2a409a..d71d17d8f164edf9daefe19162991726f677ce74 100644 --- a/lite/kernels/npu/bridges/instance_norm_op.cc +++ b/lite/kernels/npu/bridges/instance_norm_op.cc @@ -82,7 +82,7 @@ int InstanceNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { } else { if (!bias->persistable()) { LOG(WARNING) << "[NPU] Only supporting persistable bias tensor."; - bias->set_persistable(true); + return FAILED; } bias_node = graph->Add(bias_name, *bias, scale_bias_dims); } @@ -108,7 +108,7 @@ int InstanceNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(channel_size, scale_dims.production()); if (!scale->persistable()) { LOG(WARNING) << "[NPU] Only supporting persistable scale tensor."; - 
scale->set_persistable(true); + return FAILED; } scale_node = graph->Add(scale_name, *scale, scale_bias_dims); } else { @@ -121,8 +121,7 @@ int InstanceNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { instance_norm_op->set_input_x(*x_node->data()); instance_norm_op->set_input_scale(*scale_node->data()); instance_norm_op->set_input_bias(*bias_node->data()); - instance_norm_op->set_attr_reduction_indices( - ge::AttrValue::LIST_INT({0, 1, 2})); + instance_norm_op->set_attr_reduction_indices(ge::AttrValue::LIST_INT({2})); instance_norm_op->set_attr_epsilon(epsilon); return SUCCESS; } diff --git a/lite/kernels/npu/bridges/layer_norm_op.cc b/lite/kernels/npu/bridges/layer_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad32d69d3c40df49ae155b397803cab65ec43dc9 --- /dev/null +++ b/lite/kernels/npu/bridges/layer_norm_op.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/npu/bridges/graph.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/npu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace npu { + +int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast<Graph*>(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[NPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto padded_x_shape = CvtShape(x_dims); + auto x_rank = static_cast<int>(x_dims.size()); + CHECK(x_rank >= 2 && x_rank <= 4); + + auto y_name = op_info->Output("Y").front(); + auto y_type = kernel->GetOutputDeclType("Y"); + CHECK(y_type->precision() == PRECISION(kFloat)); + CHECK(y_type->layout() == DATALAYOUT(kNCHW)); + auto y = scope->FindMutableTensor(y_name); + auto y_dims = y->dims(); + auto padded_y_shape = CvtShape(y_dims); + + auto epsilon = op_info->GetAttr<float>("epsilon"); + auto begin_norm_axis = op_info->GetAttr<int>("begin_norm_axis"); + if (begin_norm_axis < 0) { + begin_norm_axis += x_rank; + } + CHECK(begin_norm_axis >= 1 && begin_norm_axis < x_rank); + auto x_mat_dims = x_dims.Flatten2D(begin_norm_axis); + auto left = x_mat_dims[0]; + auto right = x_mat_dims[1]; + + // X node + std::shared_ptr<Node> x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + x_node = graph->Add(x_name, *x, padded_x_shape); + } + + // Reshaped X node if needs + bool reshape = false; + if (!(x_rank == 4 && begin_norm_axis == 1)) { + reshape = true; + // Only the input shape 4-D(n, c, h, w) and axis=1 is supported + // by HiAI DDK, So the
input shape need to be padded to 4-D if it is less + than 4 or axis!=1. For example: + // (1) (n, c, h, w), axis=1 -> no need + // (2) (n, c, h, w), axis=2 -> (n * c, h, w, 1) + // (3) (n, c, h, w), axis=3 -> (n * c * h, w, 1) + // (4) (n, h, w), axis=1 -> (n, h, w, 1) + // (5) (n, h, w), axis=2 -> (n * h, w, 1, 1) + // (6) (h, w), axis=1 -> (h, w, 1, 1) + padded_x_shape = {left}; + for (int i = begin_norm_axis; i < x_rank; i++) { + padded_x_shape.push_back(x_dims[i]); + } + auto remain = 4 - padded_x_shape.size(); + for (int i = 0; i < remain; i++) { + padded_x_shape.push_back(1); + } + auto reshaped_x_node = graph->Add<ge::op::Reshape>( + x_name + "/reshape", x_node->precision(), x_node->layout()); + auto reshaped_x_op = reshaped_x_node->data<ge::op::Reshape>(); + reshaped_x_op->set_input_tensor(*x_node->data()); + reshaped_x_op->set_attr_shape(padded_x_shape); + x_node = reshaped_x_node; + } + + // Bias node + auto scale_bias_dims = + DDim({1, padded_x_shape[1], padded_x_shape[2], padded_x_shape[3]}); + std::shared_ptr<Node> bias_node = nullptr; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + CHECK_EQ(bias_dims.size(), 1); + CHECK_EQ(bias_dims.production(), right); + if (!bias->persistable()) { + LOG(WARNING) << "[NPU] Only supporting persistable bias tensor."; + return FAILED; + } + bias_node = graph->Add(bias_name, *bias, scale_bias_dims); + } else { + bias_node = graph->Add(y_name + "/bias", 0.0f, scale_bias_dims); + } + + // Scale node + std::shared_ptr<Node> scale_node = nullptr; + if (HasInputArg(op_info, scope, "Scale")) { + auto scale_name = op_info->Input("Scale").front(); + auto scale_type = kernel->GetInputDeclType("Scale"); + CHECK(scale_type->precision() == PRECISION(kFloat)); + CHECK(scale_type->layout()
== DATALAYOUT(kNCHW)); + auto scale = scope->FindMutableTensor(scale_name); + auto scale_dims = scale->dims(); + CHECK_EQ(scale_dims.size(), 1); + CHECK_EQ(scale_dims.production(), right); + if (!scale->persistable()) { + LOG(WARNING) << "[NPU] Only supporting persistable scale tensor."; + return FAILED; + } + scale_node = graph->Add(scale_name, *scale, scale_bias_dims); + } else { + scale_node = graph->Add(y_name + "/scale", 1.0f, scale_bias_dims); + } + + // LayerNorm node + auto layer_norm_node = graph->Add<ge::op::InstanceNorm>(y_name); + auto layer_norm_op = layer_norm_node->data<ge::op::InstanceNorm>(); + layer_norm_op->set_input_x(*x_node->data()); + layer_norm_op->set_input_scale(*scale_node->data()); + layer_norm_op->set_input_bias(*bias_node->data()); + layer_norm_op->set_attr_reduction_indices(ge::AttrValue::LIST_INT({3})); + layer_norm_op->set_attr_epsilon(epsilon); + + // Reshaped Y node if needs + if (reshape) { + auto reshaped_y_node = graph->Add<ge::op::Reshape>( + y_name, layer_norm_node->precision(), layer_norm_node->layout()); + auto reshaped_y_op = reshaped_y_node->data<ge::op::Reshape>(); + reshaped_y_op->set_input_tensor(*layer_norm_node->data()); + reshaped_y_op->set_attr_shape(padded_y_shape); + } + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace npu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(layer_norm, + kNPU, + paddle::lite::subgraph::npu::LayerNormConverter); diff --git a/lite/kernels/npu/bridges/paddle_use_bridges.h b/lite/kernels/npu/bridges/paddle_use_bridges.h index fcb2693641b4663ba4769a792006c42dcbd53129..3a584115c173d75b139d68220701cc709a8c03d6 100644 --- a/lite/kernels/npu/bridges/paddle_use_bridges.h +++ b/lite/kernels/npu/bridges/paddle_use_bridges.h @@ -55,3 +55,4 @@ USE_SUBGRAPH_BRIDGE(transpose2, kNPU); USE_SUBGRAPH_BRIDGE(unsqueeze, kNPU); USE_SUBGRAPH_BRIDGE(unsqueeze2, kNPU); USE_SUBGRAPH_BRIDGE(instance_norm, kNPU); +USE_SUBGRAPH_BRIDGE(layer_norm, kNPU); diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt
index 762b20ec83ab7e517b05f0e0eaae7de89efc272a..113f6a8b337e27557f9675cb251b5006040224c4 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -26,7 +26,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_ lite_cc_test(test_kernel_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tests/kernels/instance_norm_compute_test.cc b/lite/tests/kernels/instance_norm_compute_test.cc index 
4ba8fb5aaa09190d2119670df506e8be4d48fab1..adfe922098623084a5c4e95dfe735f9348bcee0b 100644 --- a/lite/tests/kernels/instance_norm_compute_test.cc +++ b/lite/tests/kernels/instance_norm_compute_test.cc @@ -122,8 +122,8 @@ class InstanceNormComputeTest : public arena::TestCase { fill_data_rand(bias.data(), -1.f, 1.f, scale_bias_dims.production()); SetCommonTensor(x_, dims_, x.data()); - SetCommonTensor(scale_, scale_bias_dims, scale.data()); - SetCommonTensor(bias_, scale_bias_dims, bias.data()); + SetCommonTensor(scale_, scale_bias_dims, scale.data(), {}, true); + SetCommonTensor(bias_, scale_bias_dims, bias.data(), {}, true); } }; diff --git a/lite/tests/kernels/layer_norm_compute_test.cc b/lite/tests/kernels/layer_norm_compute_test.cc index a30ac55d427563ed34c198689d6d142a22d9abc1..5ea01a6cca504db230d62a63ef3a62d4f73470fa 100644 --- a/lite/tests/kernels/layer_norm_compute_test.cc +++ b/lite/tests/kernels/layer_norm_compute_test.cc @@ -132,13 +132,13 @@ class LayerNormComputeTest : public arena::TestCase { DDim scale_dims({scale_bias_size}); std::vector<float> scale(scale_bias_size); fill_data_rand(scale.data(), -1.f, 1.f, scale_bias_size); - SetCommonTensor(scale_, scale_dims, scale.data()); + SetCommonTensor(scale_, scale_dims, scale.data(), {}, true); } if (has_bias_) { DDim bias_dims({scale_bias_size}); std::vector<float> bias(scale_bias_size); fill_data_rand(bias.data(), -1.f, 1.f, scale_bias_size); - SetCommonTensor(bias_, bias_dims, bias.data()); + SetCommonTensor(bias_, bias_dims, bias.data(), {}, true); } } }; @@ -149,6 +149,9 @@ TEST(LayerNorm, precision) { Place place; #if defined(LITE_WITH_XPU) place = TARGET(kXPU); +#elif defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 1e-2; #elif defined(LITE_WITH_ARM) place = TARGET(kARM); abs_error = 6e-5; @@ -157,7 +160,7 @@ TEST(LayerNorm, precision) { #endif for (auto dims : - std::vector<std::vector<int64_t>>{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}) { + std::vector<std::vector<int64_t>>{{2, 3, 4, 5}, {3, 4, 5}, {4, 5}}) { for (auto epsilon : {1e-5f}) { for (auto
axis : {1, 2, 3}) { for (bool has_bias : {true, false}) {