diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt
index 2c516e47e494a445156898c6c2b017607c2de6ee..9a14d10311b73c4b6429d6acd3b8fb05856a2913 100644
--- a/lite/kernels/npu/bridges/CMakeLists.txt
+++ b/lite/kernels/npu/bridges/CMakeLists.txt
@@ -40,6 +40,7 @@ lite_cc_library(subgraph_bridge_sqrt_op_npu SRCS sqrt_op.cc DEPS ${npu_subgraph_
 lite_cc_library(subgraph_bridge_reduce_mean_op_npu SRCS reduce_mean_op.cc DEPS ${npu_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_unsqueeze_op_npu SRCS unsqueeze_op.cc DEPS ${npu_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_argmax_op_npu SRCS argmax_op.cc DEPS ${npu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_dropout_op_npu SRCS dropout_op.cc DEPS ${npu_subgraph_bridge_deps})
 
 set(npu_subgraph_bridges
     subgraph_bridge_registry
@@ -67,6 +68,7 @@ set(npu_subgraph_bridges
     subgraph_bridge_reduce_mean_op_npu
     subgraph_bridge_unsqueeze_op_npu
     subgraph_bridge_argmax_op_npu
+    subgraph_bridge_dropout_op_npu
     CACHE INTERNAL "npu_subgraph_bridges")
 
 message(STATUS "+++++ npu_subgraph_bridges: ${npu_subgraph_bridges}")
diff --git a/lite/kernels/npu/bridges/dropout_op.cc b/lite/kernels/npu/bridges/dropout_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0bb57673281bc3e9dd92fabd6ca5a8e76c76cb73
--- /dev/null
+++ b/lite/kernels/npu/bridges/dropout_op.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/npu/bridges/graph.h"
+#include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace npu {
+
+int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";
+
+  // Get input, output and op attributes
+  auto x_name = op_info->Input("X").front();
+  auto x_type = kernel->GetInputDeclType("X");
+  CHECK(x_type->precision() == PRECISION(kFloat));
+  auto x = scope->FindMutableTensor(x_name);
+  auto x_dims = x->dims();
+  auto x_rank = x_dims.size();
+  CHECK_GE(x_rank, 2);
+
+  auto out_name = op_info->Output("Out").front();
+  auto out_type = kernel->GetOutputDeclType("Out");
+  CHECK(out_type->precision() == PRECISION(kFloat));
+
+  auto dropout_implementation =
+      op_info->GetAttr<std::string>("dropout_implementation");
+  auto scale = 1 - op_info->GetAttr<float>("dropout_prob");
+  if (dropout_implementation == "upscale_in_train") {
+    scale = 1.f;
+  }
+  // HiAI only support [n, c, 1, 1] for the shape of scale
+  std::vector<int64_t> scale_shape = {
+      1, x_rank < 3 ? 1 : x_dims[x_rank - 3], 1, 1};
+
+  // X node
+  std::shared_ptr<Node> x_node = nullptr;
+  if (graph->Has(x_name)) {
+    x_node = graph->Get(x_name);
+  } else {
+    x_node = graph->Add(x_name, *x, CvtShape(x_dims));
+  }
+
+  // Scale node
+  auto scale_node = graph->Add<ge::op::Scale>(out_name);
+  auto scale_op = scale_node->data<ge::op::Scale>();
+  scale_op->set_input_x(*x_node->data());
+  scale_op->set_attr_axis(1);
+
+  // Add filter node(fill with scale)
+  auto filter_node = graph->Add(out_name + "/filter", scale, scale_shape);
+  scale_op->set_input_filter(*filter_node->data());
+
+  return REBUILD_WHEN_SHAPE_CHANGED;
+}
+
+}  // namespace npu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(dropout,
+                         kNPU,
+                         paddle::lite::subgraph::npu::DropoutConverter);
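[Note] Inference-time dropout is a pure elementwise rescale, which is why the converter above lowers it to HiAI's Scale op with a constant filter. A minimal standalone sketch of the semantics being reproduced (DropoutInference is a hypothetical helper for illustration, not a PaddleLite or HiAI API):

    #include <cstddef>
    #include <string>
    #include <vector>

    // "downgrade_in_infer" multiplies by the keep probability at inference;
    // "upscale_in_train" already rescaled activations during training, so
    // inference is an identity (scale = 1).
    std::vector<float> DropoutInference(const std::vector<float>& x,
                                        float dropout_prob,
                                        const std::string& dropout_implementation) {
      const float scale = dropout_implementation == "upscale_in_train"
                              ? 1.f
                              : 1.f - dropout_prob;
      std::vector<float> out(x.size());
      for (std::size_t i = 0; i < x.size(); ++i) {
        out[i] = x[i] * scale;  // e.g. {2, 4} with prob 0.5 -> {1, 2}
      }
      return out;
    }

Because HiAI expects the Scale filter in [n, c, 1, 1] layout, the converter broadcasts the scalar into a {1, c, 1, 1} filter with axis = 1 ({1, 1, 1, 1} when the input rank is 2); e.g. a [2, 3, 4, 5] input gets a {1, 3, 1, 1} filter.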
diff --git a/lite/kernels/npu/bridges/paddle_use_bridges.h b/lite/kernels/npu/bridges/paddle_use_bridges.h
index a63a0d889d4792bf95e9749df4f4772e3d667d5f..7d5f95237ac63e1d6505ec4900ea4f60f72d08ef 100644
--- a/lite/kernels/npu/bridges/paddle_use_bridges.h
+++ b/lite/kernels/npu/bridges/paddle_use_bridges.h
@@ -28,6 +28,7 @@
 USE_SUBGRAPH_BRIDGE(conv2d, kNPU);
 USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kNPU);
 USE_SUBGRAPH_BRIDGE(conv2d_transpose, kNPU);
+USE_SUBGRAPH_BRIDGE(dropout, kNPU);
 USE_SUBGRAPH_BRIDGE(elementwise_add, kNPU);
 USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kNPU);
 USE_SUBGRAPH_BRIDGE(elementwise_sub, kNPU);
diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt
index a7ae4145737a7ee6fbce61663ff068b44d6270b0..8ee0255f2b90cc4c93aa94ab9d28d408eaddad11 100644
--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
@@ -28,7 +28,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
   lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
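[Note] REGISTER_SUBGRAPH_BRIDGE(dropout, kNPU, ...) in dropout_op.cc registers the converter via a static initializer, so the USE_SUBGRAPH_BRIDGE(dropout, kNPU) declaration above is presumably what keeps that object file from being dead-stripped in binaries that include paddle_use_bridges.h; adding ${npu_kernels} to test_kernel_dropout_compute's DEPS likewise makes the NPU kernels and bridges available to the test executable.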
diff --git a/lite/tests/kernels/dropout_compute_test.cc b/lite/tests/kernels/dropout_compute_test.cc
index 1de0bda26eb5286a2ce290f64c7d457f222d6d57..025f02ce31505cee684fb9a21c7b26d96e1c3026 100644
--- a/lite/tests/kernels/dropout_compute_test.cc
+++ b/lite/tests/kernels/dropout_compute_test.cc
@@ -41,14 +41,10 @@ class DropoutComputeTester : public arena::TestCase {
                        const std::string& alias,
                        DDim dims,
                        float dropout_prob,
-                       bool fix_seed,
-                       int seed,
                        std::string dropout_implementation)
       : TestCase(place, alias),
         dims_(dims),
         dropout_prob_(dropout_prob),
-        fix_seed_(fix_seed),
-        seed_(seed),
         dropout_implementation_(dropout_implementation) {}
 
   void RunBaseline(Scope* scope) override {
@@ -95,7 +91,10 @@ TEST(Dropout, precision) {
   LOG(INFO) << "test dropout op";
   float abs_error = 2e-5;
   Place place;
-#if defined(LITE_WITH_XPU)
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_XPU)
   place = TARGET(kXPU);
 #else
   return;
@@ -106,14 +105,11 @@ TEST(Dropout, precision) {
   for (auto dropout_prob : {0., 0.5, 1.}) {
     for (auto dropout_implementation :
          {"downgrade_in_infer", "upscale_in_train"}) {
-      std::unique_ptr<arena::TestCase> tester(
-          new DropoutComputeTester(place,
-                                   "def",
-                                   DDim(dims),
-                                   dropout_prob,
-                                   true,
-                                   1,
-                                   dropout_implementation));
+#ifdef LITE_WITH_NPU
+      if (dims.size() < 2) continue;
+#endif
+      std::unique_ptr<arena::TestCase> tester(new DropoutComputeTester(
+          place, "def", DDim(dims), dropout_prob, dropout_implementation));
       arena::Arena arena(std::move(tester), place, abs_error);
       arena.TestPrecision({"mask"});
     }
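[Note] The fix_seed/seed parameters could be dropped because the precision test only checks the deterministic "Out" tensor; the random "mask" output is excluded from comparison by arena.TestPrecision({"mask"}), so no seed is needed for reproducibility. The looser abs_error = 1e-2 on NPU reflects the fp16 arithmetic of the HiAI path: with dropout_prob = 0.5 and "downgrade_in_infer", an input element of 1.0 must come back within 1e-2 of 0.5. Inputs of rank < 2 are skipped on NPU, matching the CHECK_GE(x_rank, 2) in the bridge.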