From 05da0c724100de037aa7afa5935ca70d560464f8 Mon Sep 17 00:00:00 2001 From: hong19860320 <9973393+hong19860320@users.noreply.github.com> Date: Tue, 24 Dec 2019 16:42:38 +0800 Subject: [PATCH] [LITE][NPU][XPU] Support multiple types for XPU and NPU op bridges (#2646) * Support multiple types for XPU and NPU op bridges * Add lookup_table, gather, slice, stack and scale op bridges for supporting BERT * Fix the definition of lookup_table kernel for X86 --- CMakeLists.txt | 8 +- cmake/xpu.cmake | 4 +- lite/backends/npu/device.cc | 1 + lite/backends/xpu/device.cc | 5 +- lite/core/mir/subgraph/subgraph_pass_test.cc | 114 ++++++++---- lite/kernels/npu/bridges/act_op.cc | 31 +++- lite/kernels/npu/bridges/argmax_op.cc | 33 +++- lite/kernels/npu/bridges/batch_norm_op.cc | 71 +++++--- lite/kernels/npu/bridges/concat_op.cc | 34 ++-- lite/kernels/npu/bridges/conv_op.cc | 73 +++++--- lite/kernels/npu/bridges/conv_transpose_op.cc | 86 ++++++---- lite/kernels/npu/bridges/elementwise_ops.cc | 62 ++++--- lite/kernels/npu/bridges/fc_op.cc | 83 +++++---- lite/kernels/npu/bridges/graph.cc | 24 +-- lite/kernels/npu/bridges/graph.h | 111 ++++++++++-- lite/kernels/npu/bridges/interpolate_op.cc | 47 +++-- lite/kernels/npu/bridges/mul_op.cc | 71 +++++--- lite/kernels/npu/bridges/pad2d_op.cc | 56 +++--- lite/kernels/npu/bridges/pool_op.cc | 62 ++++--- lite/kernels/npu/bridges/reduce_mean_op.cc | 52 ++++-- lite/kernels/npu/bridges/registry.h | 3 +- lite/kernels/npu/bridges/reshape_op.cc | 61 +++++-- lite/kernels/npu/bridges/scale_op.cc | 36 ++-- .../kernels/npu/bridges/shuffle_channel_op.cc | 32 +++- lite/kernels/npu/bridges/softmax_op.cc | 29 +++- lite/kernels/npu/bridges/split_op.cc | 36 +++- lite/kernels/npu/bridges/sqrt_op.cc | 30 +++- lite/kernels/npu/bridges/square_op.cc | 30 +++- lite/kernels/npu/bridges/transpose_op.cc | 27 ++- lite/kernels/npu/bridges/unsqueeze_op.cc | 29 +++- lite/kernels/npu/bridges/utility.cc | 43 ++--- lite/kernels/npu/bridges/utility.h | 8 +- 
lite/kernels/npu/subgraph_compute.cc | 156 +++++++++++------ lite/kernels/npu/subgraph_compute.h | 4 +- lite/kernels/x86/lookup_table_compute.cc | 4 +- lite/kernels/x86/lookup_table_compute.h | 2 +- lite/kernels/x86/lookup_table_compute_test.cc | 2 +- lite/kernels/x86/stack_compute.cc | 2 +- lite/kernels/xpu/bridges/CMakeLists.txt | 10 ++ lite/kernels/xpu/bridges/act_op.cc | 37 +++- lite/kernels/xpu/bridges/act_op_test.cc | 102 ----------- lite/kernels/xpu/bridges/batch_norm_op.cc | 81 ++++++--- lite/kernels/xpu/bridges/conv_op.cc | 74 +++++--- lite/kernels/xpu/bridges/elementwise_ops.cc | 46 +++-- lite/kernels/xpu/bridges/gather_op.cc | 100 +++++++++++ lite/kernels/xpu/bridges/graph.cc | 43 +++-- lite/kernels/xpu/bridges/graph.h | 106 +++++++++--- lite/kernels/xpu/bridges/layer_norm_op.cc | 105 +++++++++--- lite/kernels/xpu/bridges/lookup_table_op.cc | 95 ++++++++++ lite/kernels/xpu/bridges/mul_op.cc | 82 +++++---- lite/kernels/xpu/bridges/paddle_use_bridges.h | 8 + lite/kernels/xpu/bridges/pool_op.cc | 53 +++--- lite/kernels/xpu/bridges/reshape_op.cc | 71 +++++--- lite/kernels/xpu/bridges/scale_op.cc | 70 ++++++++ lite/kernels/xpu/bridges/slice_op.cc | 90 ++++++++++ lite/kernels/xpu/bridges/softmax_op.cc | 31 +++- lite/kernels/xpu/bridges/stack_op.cc | 72 ++++++++ lite/kernels/xpu/bridges/transpose_op.cc | 38 ++-- lite/kernels/xpu/bridges/utility.cc | 88 +++++----- lite/kernels/xpu/bridges/utility.h | 29 +++- lite/kernels/xpu/subgraph_compute.cc | 162 +++++++++++++----- lite/kernels/xpu/subgraph_compute.h | 4 + lite/operators/activation_ops.cc | 1 + lite/tests/kernels/CMakeLists.txt | 10 +- lite/tests/kernels/activation_compute_test.cc | 42 ++++- lite/tests/kernels/fc_compute_test.cc | 26 +-- lite/tests/kernels/gather_compute_test.cc | 116 +++++++++++++ lite/tests/kernels/layer_norm_compute_test.cc | 83 +++++---- .../kernels/lookup_table_compute_test.cc | 140 +++++++++++++++ lite/tests/kernels/scale_compute_test.cc | 30 ++-- 
lite/tests/kernels/slice_compute_test.cc | 6 +- lite/tests/kernels/stack_compute_test.cc | 12 +- 72 files changed, 2590 insertions(+), 1035 deletions(-) delete mode 100644 lite/kernels/xpu/bridges/act_op_test.cc create mode 100644 lite/kernels/xpu/bridges/gather_op.cc create mode 100644 lite/kernels/xpu/bridges/lookup_table_op.cc create mode 100644 lite/kernels/xpu/bridges/scale_op.cc create mode 100644 lite/kernels/xpu/bridges/slice_op.cc create mode 100644 lite/kernels/xpu/bridges/stack_op.cc create mode 100644 lite/tests/kernels/gather_compute_test.cc create mode 100644 lite/tests/kernels/lookup_table_compute_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c71a45ffc..f1034e0b95 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,6 +169,10 @@ endif() ######################################################################################## +if(LITE_WITH_XPU) + include(xpu) +endif() + include(external/mklml) # download mklml package include(external/xbyak) # download xbyak package include(external/libxsmm) # download, build, install libxsmm @@ -188,10 +192,6 @@ if(LITE_WITH_CUDA) include(cuda) endif() -if(LITE_WITH_XPU) - include(xpu) -endif() - include(generic) # simplify cmake module include(ccache) # set ccache for compilation include(util) # set unittest and link libs diff --git a/cmake/xpu.cmake b/cmake/xpu.cmake index ab34f409b8..2112f6b658 100644 --- a/cmake/xpu.cmake +++ b/cmake/xpu.cmake @@ -89,7 +89,7 @@ else() endif() find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 - PATHS ${XPU_SDK_ROOT}/XTDK/shlib/gcc482) + PATHS ${XPU_SDK_ROOT}/XTDK/shlib) if(NOT XPU_SDK_LLVM_FILE) message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") @@ -99,7 +99,7 @@ else() set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE}) endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1 -D_GLIBCXX_USE_CXX11_ABI=0") set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm 
xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc index e639392642..d62ac9cad3 100644 --- a/lite/backends/npu/device.cc +++ b/lite/backends/npu/device.cc @@ -61,6 +61,7 @@ std::unique_ptr Device::Build( return nullptr; } ir_build.ReleaseModelBuff(om_model_buf); + VLOG(3) << "[NPU] Build done"; return model_client; } diff --git a/lite/backends/xpu/device.cc b/lite/backends/xpu/device.cc index 74a5681aa9..dbf88ff833 100644 --- a/lite/backends/xpu/device.cc +++ b/lite/backends/xpu/device.cc @@ -28,8 +28,8 @@ std::unique_ptr Device::Build( CHECK(outputs != nullptr); CHECK_GT(outputs->size(), 0); - // The XPU compiler build the graph and fill all of the constant params, only - // one output is supported now. + // The XPU compiler build the graph and fill all of the constant params, and + // use TupleNode to support multiple outputs xtcl::Array all_outs; for (size_t i = 0; i < outputs->size(); i++) { all_outs.push_back(*outputs->at(i)); @@ -40,6 +40,7 @@ std::unique_ptr Device::Build( auto compiler = xtcl::network::xTensorCompiler(network, target); compiler.SetParams(*params); // Set the data of constant tensors compiler.Build(); + VLOG(3) << "[XPU] Build done"; return std::unique_ptr( new xtcl::network::xRuntimeInstance(compiler.CreateRuntimeInstance())); } diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 45c82a4262..0d5fc7bf5e 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -24,39 +24,56 @@ DEFINE_string(model_file, "", "model file path of combined protobuf model"); DEFINE_string(params_file, "", "params file path of combined protobuf model"); DEFINE_string(optimized_model_dir, "", "path of 
optimized naive buffer model"); -DEFINE_string(input_tensor_shape, "1,3,224,224", "shapes of input tensors"); -DEFINE_int32(output_tensor_num, 1, "number of output tensors"); +DEFINE_string(input_tensor_shape, "1,3,224,224", "shape of input tensors"); +DEFINE_string(input_tensor_type, "float32", "data type of input tensors"); +DEFINE_string(output_tensor_type, "float32", "data type of output tensors"); namespace paddle { namespace lite { // The helper functions for loading and running model from command line and // verifying output data -std::vector> ShapeParsing(std::string txt) { - std::vector> shape; - while (!txt.empty()) { - size_t idx = txt.find_first_of(":"); - std::string dims = txt.substr(0, idx); - std::vector s; - while (!dims.empty()) { - size_t idx = dims.find_first_of(","); - int d = atoi(dims.substr(0, idx).c_str()); +std::vector TypeParsing(std::string text) { + std::vector types; + while (!text.empty()) { + size_t index = text.find_first_of(":"); + std::string type = text.substr(0, index); + VLOG(3) << type; + types.push_back(type); + if (index == std::string::npos) { + break; + } else { + text = text.substr(index + 1); + } + } + return types; +} + +std::vector> ShapeParsing(std::string text) { + std::vector> shapes; + while (!text.empty()) { + size_t index = text.find_first_of(":"); + std::string slice = text.substr(0, index); + std::vector shape; + while (!slice.empty()) { + size_t index = slice.find_first_of(","); + int d = atoi(slice.substr(0, index).c_str()); VLOG(3) << d; - s.push_back(d); - if (idx == std::string::npos) { + shape.push_back(d); + if (index == std::string::npos) { break; } else { - dims = dims.substr(idx + 1); + slice = slice.substr(index + 1); } } - shape.push_back(s); - if (idx == std::string::npos) { + shapes.push_back(shape); + if (index == std::string::npos) { break; } else { - txt = txt.substr(idx + 1); + text = text.substr(index + 1); } } - return shape; + return shapes; } int64_t ShapeProduction(std::vector shape) { @@ 
-70,40 +87,55 @@ int64_t ShapeProduction(std::vector shape) { void FillInputTensors( const std::shared_ptr& predictor, const std::vector>& input_tensor_shape, + const std::vector& input_tensor_type, const float value) { +#define FILL_TENSOR_WITH_TYPE(type) \ + auto input_tensor_data = input_tensor->mutable_data(); \ + for (int j = 0; j < input_tensor_size; j++) { \ + input_tensor_data[i] = static_cast(value); \ + } for (int i = 0; i < input_tensor_shape.size(); i++) { auto input_tensor = predictor->GetInput(i); input_tensor->Resize(input_tensor_shape[i]); - auto input_tensor_data = input_tensor->mutable_data(); auto input_tensor_size = ShapeProduction(input_tensor->shape()); - for (int j = 0; j < input_tensor_size; j++) { - input_tensor_data[i] = value; + if (input_tensor_type[i] == "float32") { + FILL_TENSOR_WITH_TYPE(float) + } else if (input_tensor_type[i] == "int64") { + FILL_TENSOR_WITH_TYPE(int64_t) } } +#undef FILL_TENSOR_WITH_TYPE } void CheckOutputTensors( const std::shared_ptr& tar_predictor, const std::shared_ptr& ref_predictor, - const int output_tensor_num) { - for (int i = 0; i < output_tensor_num; i++) { + const std::vector& output_tensor_type) { +#define CHECK_TENSOR_WITH_TYPE(type) \ + auto tar_output_tensor_data = tar_output_tensor->data(); \ + auto ref_output_tensor_data = ref_output_tensor->data(); \ + for (size_t j = 0; j < ref_output_tensor_size; j++) { \ + auto abs_diff = \ + std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]); \ + auto rel_diff = abs_diff / (std::fabs(ref_output_tensor_data[j]) + 1e-6); \ + VLOG(5) << "val: " << tar_output_tensor_data[j] \ + << " ref: " << ref_output_tensor_data[j] \ + << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; \ + EXPECT_LT(rel_diff, 0.1); \ + } + for (int i = 0; i < output_tensor_type.size(); i++) { auto tar_output_tensor = tar_predictor->GetOutput(i); auto ref_output_tensor = ref_predictor->GetOutput(i); - auto tar_output_tensor_data = tar_output_tensor->data(); - auto 
ref_output_tensor_data = ref_output_tensor->data(); auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape()); auto ref_output_tensor_size = ShapeProduction(ref_output_tensor->shape()); EXPECT_EQ(tar_output_tensor_size, ref_output_tensor_size); - for (size_t j = 0; j < ref_output_tensor_size; j++) { - auto abs_diff = - std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]); - auto rel_diff = abs_diff / (std::fabs(ref_output_tensor_data[j]) + 1e-6); - VLOG(5) << "val: " << tar_output_tensor_data[j] - << " ref: " << ref_output_tensor_data[j] - << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; - EXPECT_LT(rel_diff, 0.1); + if (output_tensor_type[i] == "float32") { + CHECK_TENSOR_WITH_TYPE(float) + } else if (output_tensor_type[i] == "int64") { + CHECK_TENSOR_WITH_TYPE(int64_t) } } +#undef CHECK_TENSOR_WITH_TYPE } std::shared_ptr TestModel( @@ -112,6 +144,7 @@ std::shared_ptr TestModel( const std::string& params_file, const std::vector& valid_places, const std::vector>& input_tensor_shape, + const std::vector& input_tensor_type, const std::string& optimized_model_dir) { // Generate optimized model lite_api::CxxConfig cxx_config; @@ -128,7 +161,7 @@ std::shared_ptr TestModel( mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH); mobile_config.set_threads(1); predictor = lite_api::CreatePaddlePredictor(mobile_config); - FillInputTensors(predictor, input_tensor_shape, 1); + FillInputTensors(predictor, input_tensor_shape, input_tensor_type, 1); // Run optimized model for (int i = 0; i < FLAGS_warmup; i++) { predictor->Run(); @@ -148,10 +181,13 @@ TEST(Subgraph, generate_model_and_check_precision) { "the path of model files."; return; } - // Parsing the shapes of input tensors from strings, supported formats: + // Parsing the shape of input tensors from strings, supported formats: // "1,3,224,224" and "1,3,224,224:1,80" - std::vector> input_tensor_shape = - ShapeParsing(FLAGS_input_tensor_shape); + auto input_tensor_shape = 
ShapeParsing(FLAGS_input_tensor_shape); + // Parsing the data type of input and output tensors from strings, supported + // formats: "float32" and "float32:int64:int8" + auto input_tensor_type = TypeParsing(FLAGS_input_tensor_type); + auto output_tensor_type = TypeParsing(FLAGS_output_tensor_type); std::vector valid_places({ #ifdef LITE_WITH_ARM lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, @@ -166,6 +202,7 @@ TEST(Subgraph, generate_model_and_check_precision) { FLAGS_params_file, valid_places, input_tensor_shape, + input_tensor_type, FLAGS_optimized_model_dir + "/ref_opt_model"); // Generate and run optimized model on NPU/XPU as the target predictor #ifdef LITE_WITH_NPU @@ -179,10 +216,11 @@ TEST(Subgraph, generate_model_and_check_precision) { FLAGS_params_file, valid_places, input_tensor_shape, + input_tensor_type, FLAGS_optimized_model_dir + "/tar_opt_model"); // Check the difference of the output tensors between reference predictor and // target predictor - CheckOutputTensors(tar_predictor, ref_predictor, FLAGS_output_tensor_num); + CheckOutputTensors(tar_predictor, ref_predictor, output_tensor_type); } } // namespace lite diff --git a/lite/kernels/npu/bridges/act_op.cc b/lite/kernels/npu/bridges/act_op.cc index 0edab9e664..62eb649e0e 100644 --- a/lite/kernels/npu/bridges/act_op.cc +++ b/lite/kernels/npu/bridges/act_op.cc @@ -21,24 +21,41 @@ namespace lite { namespace subgraph { namespace npu { -int ActConverter(void* ctx, OpLite* op) { +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); auto op_info = op->op_info(); auto op_type = op_info->Type(); + auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - // Create act node and set input node which is obtained from the node map - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); - auto act_node = graph->AddNode(out_var_name); - 
act_node->set_input_x(*graph->GetNode(x_var_name)); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Act node + auto act_node = graph->AddNode(out_name); + act_node->set_input_x(*x_node); // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu, // clipped_relu etc. act_node->set_attr_mode(CvtActMode(op_type)); - if (op_type == "relu_clipped") { auto Relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); act_node->set_attr_coef(Relu_clipped_coef); diff --git a/lite/kernels/npu/bridges/argmax_op.cc b/lite/kernels/npu/bridges/argmax_op.cc index 66b906eee5..835d4dd1ed 100644 --- a/lite/kernels/npu/bridges/argmax_op.cc +++ b/lite/kernels/npu/bridges/argmax_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace npu { -int ArgmaxConverter(void* ctx, OpLite* op) { +int ArgmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,15 +30,34 @@ int ArgmaxConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + 
CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); int axis = op_info->GetAttr("axis"); - auto argmax_node = graph->AddNode(out_var_name); - argmax_node->set_input_x1(*graph->GetNode(x_var_name)); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } - auto x2 = graph->AddNode(out_var_name + "/axis", axis); - argmax_node->set_input_x2(*x2); + // Axis node + auto axis_const_node = graph->AddNode(out_name + "/axis", axis); + + // Argmax node + auto argmax_node = graph->AddNode(out_name); + argmax_node->set_input_x1(*x_node); + argmax_node->set_input_x2(*axis_const_node); return SUCCESS; } diff --git a/lite/kernels/npu/bridges/batch_norm_op.cc b/lite/kernels/npu/bridges/batch_norm_op.cc index f1bd1b43c1..57b52cf745 100644 --- a/lite/kernels/npu/bridges/batch_norm_op.cc +++ b/lite/kernels/npu/bridges/batch_norm_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace npu { -int BatchNormConverter(void* ctx, OpLite* op) { +int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,32 +30,59 @@ int BatchNormConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - auto x_var_name = op_info->Input("X").front(); - auto y_var_name = op_info->Output("Y").front(); - auto batch_norm_node = graph->AddNode(y_var_name); - batch_norm_node->set_input_x(*graph->GetNode(x_var_name)); - - auto scale_var_name = op_info->Input("Scale").front(); - auto scale = 
scope->FindVar(scale_var_name)->GetMutable(); - auto scale_const_node = graph->AddNode(scale_var_name, *scale); - - auto bias_var_name = op_info->Input("Bias").front(); - auto bias = scope->FindVar(bias_var_name)->GetMutable(); - auto bias_const_node = graph->AddNode(bias_var_name, *bias); - - auto mean_var_name = op_info->Input("Mean").front(); - auto mean = scope->FindVar(mean_var_name)->GetMutable(); - auto mean_const_node = graph->AddNode(mean_var_name, *mean); - - auto variance_var_name = op_info->Input("Variance").front(); - auto variance = scope->FindVar(variance_var_name)->GetMutable(); - auto variance_const_node = graph->AddNode(variance_var_name, *variance); - + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto scale_name = op_info->Input("Scale").front(); + auto scale_type = kernel->GetInputDeclType("Scale"); + CHECK(scale_type->precision() == PRECISION(kFloat)); + CHECK(scale_type->layout() == DATALAYOUT(kNCHW)); + auto scale = scope->FindMutableTensor(scale_name); + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto mean_name = op_info->Input("Mean").front(); + auto mean_type = kernel->GetInputDeclType("Mean"); + CHECK(mean_type->precision() == PRECISION(kFloat)); + CHECK(mean_type->layout() == DATALAYOUT(kNCHW)); + auto mean = scope->FindMutableTensor(mean_name); + auto variance_name = op_info->Input("Variance").front(); + auto variance_type = kernel->GetInputDeclType("Variance"); + CHECK(variance_type->precision() == PRECISION(kFloat)); + CHECK(variance_type->layout() == 
DATALAYOUT(kNCHW)); + auto variance = scope->FindMutableTensor(variance_name); + auto y_name = op_info->Output("Y").front(); + auto y_type = kernel->GetOutputDeclType("Y"); + CHECK(y_type->precision() == PRECISION(kFloat)); + CHECK(y_type->layout() == DATALAYOUT(kNCHW)); float momentum = op_info->GetAttr("momentum"); float epsilon = op_info->GetAttr("epsilon"); int mode = 1; // bnScale, bnBias tensor dims are 1xCx1x1 bool use_global_stats = op_info->GetAttr("use_global_stats"); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Scale, Bias, Mean, Variance node + auto scale_const_node = graph->AddNode(scale_name, *scale); + auto bias_const_node = graph->AddNode(bias_name, *bias); + auto mean_const_node = graph->AddNode(mean_name, *mean); + auto variance_const_node = graph->AddNode(variance_name, *variance); + + // Batch Norm node + auto batch_norm_node = graph->AddNode(y_name); + batch_norm_node->set_input_x(*x_node); batch_norm_node->set_input_scale(*scale_const_node); batch_norm_node->set_input_offset(*bias_const_node); batch_norm_node->set_input_mean(*mean_const_node); diff --git a/lite/kernels/npu/bridges/concat_op.cc b/lite/kernels/npu/bridges/concat_op.cc index 9f504213a6..44a2734c89 100644 --- a/lite/kernels/npu/bridges/concat_op.cc +++ b/lite/kernels/npu/bridges/concat_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace npu { -int ConcatConverter(void* ctx, OpLite* op) { +int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,23 +30,35 @@ int ConcatConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " << op_type << " ... 
"; - auto x_var_names = op_info->Input("X"); - auto out_var_name = op_info->Output("Out").front(); + // Get input and output vars and op attributes + auto x_names = op_info->Input("X"); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); auto axis = op_info->GetAttr("axis"); - auto num = x_var_names.size(); - auto concat_node = graph->AddNode(out_var_name); + auto num = x_names.size(); + + // Traverse all of input nodes which are added into the new created concat + // node + auto concat_node = graph->AddNode(out_name); concat_node->set_attr_axis(axis); concat_node->set_attr_N(num); concat_node->create_dynamic_input_x(num); int idx = 1; - for (auto& x_var_name : x_var_names) { - if (graph->HasNode(x_var_name)) { - concat_node->set_dynamic_input_x(idx, *graph->GetNode(x_var_name)); + for (auto& x_name : x_names) { + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); } else { - auto x = scope->FindVar(x_var_name)->GetMutable(); - auto x_const_node = graph->AddNode(x_var_name, *x); - concat_node->set_dynamic_input_x(idx, *x_const_node); + x_node = graph->AddNode(x_name, x_dims); } + concat_node->set_dynamic_input_x(idx, *x_node); idx++; } return SUCCESS; diff --git a/lite/kernels/npu/bridges/conv_op.cc b/lite/kernels/npu/bridges/conv_op.cc index 0cc22ef356..6b34e76880 100644 --- a/lite/kernels/npu/bridges/conv_op.cc +++ b/lite/kernels/npu/bridges/conv_op.cc @@ -22,7 +22,7 @@ namespace lite { namespace subgraph { namespace npu { -int ConvConverter(void* ctx, OpLite* op) { +int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); 
CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -31,16 +31,25 @@ int ConvConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " << op_type << "... "; - // Get input, filter and op attributes - auto input_var_name = op_info->Input("Input").front(); - auto input = scope->FindVar(input_var_name)->GetMutable(); + // Get input and output vars and op attributes + auto input_name = op_info->Input("Input").front(); + auto input_type = kernel->GetInputDeclType("Input"); + CHECK(input_type->precision() == PRECISION(kFloat)); + CHECK(input_type->layout() == DATALAYOUT(kNCHW)); + auto input = scope->FindMutableTensor(input_name); auto input_dims = input->dims(); - auto output_var_name = op_info->Output("Output").front(); - auto output = scope->FindVar(output_var_name)->GetMutable(); - auto output_dims = output->dims(); - auto filter_var_name = op_info->Input("Filter").front(); - auto filter = scope->FindVar(filter_var_name)->GetMutable(); + auto filter_name = op_info->Input("Filter").front(); + auto filter_type = kernel->GetInputDeclType("Filter"); + CHECK(filter_type->precision() == PRECISION(kFloat)); + CHECK(filter_type->layout() == DATALAYOUT(kNCHW)); + auto filter = scope->FindMutableTensor(filter_name); auto filter_dims = filter->dims(); + auto output_name = op_info->Output("Output").front(); + auto output_type = kernel->GetOutputDeclType("Output"); + CHECK(output_type->precision() == PRECISION(kFloat)); + CHECK(output_type->layout() == DATALAYOUT(kNCHW)); + auto output = scope->FindMutableTensor(output_name); + auto output_dims = output->dims(); auto bs = input_dims[0]; auto ic = input_dims[1]; auto oc = filter_dims[0]; @@ -57,6 +66,14 @@ int ConvConverter(void* ctx, OpLite* op) { CHECK_EQ(strides.size(), 2L); CHECK_EQ(dilations.size(), 2L); + // Input node + std::shared_ptr input_node = nullptr; + if (graph->HasNode(input_name)) { + input_node = graph->GetNode(input_name); + } else { + input_node = graph->AddNode(input_name, 
input_dims); + } + if (paddings.size() == 2L) { for (size_t i = 0; i < strides.size(); ++i) { int copy_pad = *(paddings.begin() + 2 * i); @@ -91,10 +108,10 @@ int ConvConverter(void* ctx, OpLite* op) { "performance."; } - // Create filter node - auto filter_const_node = graph->AddNode(filter_var_name, *filter); + // Filter node + auto filter_const_node = graph->AddNode(filter_name, *filter); - // Create bias node if exists bias + // Add bias node if exists bias // Supports the bias nodes with the following dimensions // 0: {oc} // 1: {1, oc, oh, ow} @@ -102,8 +119,11 @@ int ConvConverter(void* ctx, OpLite* op) { std::shared_ptr bias_node = nullptr; bool is_channel_bias = false; if (HasInputArg(op_info, scope, "Bias")) { - auto bias_var_name = op_info->Input("Bias").front(); - auto* bias = scope->FindVar(bias_var_name)->GetMutable(); + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); auto bias_dims = bias->dims(); auto bias_data_size = bias_dims.production(); auto output_data_size = output_dims.production(); @@ -124,21 +144,21 @@ int ConvConverter(void* ctx, OpLite* op) { << output_dims; return FAILED; } - if (graph->HasNode(bias_var_name)) { - // Bias node from input map - bias_node = graph->GetNode(bias_var_name); + if (graph->HasNode(bias_name)) { + // Bias node from input node + bias_node = graph->GetNode(bias_name); } else { // Bias node with const data - bias_node = graph->AddNode(bias_var_name, *bias, bias_shape); + bias_node = graph->AddNode(bias_name, *bias, bias_shape); } } - // Create conv node and set input, filter, bias nodes and attributes + // Conv node std::shared_ptr conv_node = nullptr; if (use_depthwise_conv && is_depthwise_mode) { auto depthwise_conv_node = - graph->AddNode(output_var_name); - 
depthwise_conv_node->set_input_x(*graph->GetNode(input_var_name)); + graph->AddNode(output_name); + depthwise_conv_node->set_input_x(*input_node); depthwise_conv_node->set_input_filter(*filter_const_node); depthwise_conv_node->set_attr_mode(1); depthwise_conv_node->set_attr_algo(0); @@ -157,15 +177,14 @@ int ConvConverter(void* ctx, OpLite* op) { // ConvolutionDepthwise Op doesn't support bias, so append Add node to // support bias if (bias_node != nullptr) { - auto add_node = graph->AddNode(output_var_name); + auto add_node = graph->AddNode(output_name); add_node->set_input_x1(*depthwise_conv_node); add_node->set_input_x2(*bias_node); conv_node = add_node; } } else { - auto common_conv_node = - graph->AddNode(output_var_name); - common_conv_node->set_input_x(*graph->GetNode(input_var_name)); + auto common_conv_node = graph->AddNode(output_name); + common_conv_node->set_input_x(*input_node); common_conv_node->set_input_w(*filter_const_node); common_conv_node->set_attr_mode(1); common_conv_node->set_attr_pad_mode(0); // NOTSET @@ -185,7 +204,7 @@ int ConvConverter(void* ctx, OpLite* op) { if (is_channel_bias) { common_conv_node->set_input_b(*bias_node); } else { - auto add_node = graph->AddNode(output_var_name); + auto add_node = graph->AddNode(output_name); add_node->set_input_x1(*common_conv_node); add_node->set_input_x2(*bias_node); conv_node = add_node; @@ -196,7 +215,7 @@ int ConvConverter(void* ctx, OpLite* op) { if (fuse_relu) { // Append relu node if fuse_relu is true - auto relu_node = graph->AddNode(output_var_name); + auto relu_node = graph->AddNode(output_name); relu_node->set_input_x(*conv_node); relu_node->set_attr_mode(CvtActMode("relu")); } diff --git a/lite/kernels/npu/bridges/conv_transpose_op.cc b/lite/kernels/npu/bridges/conv_transpose_op.cc index 6e689b56f6..5ac0723c78 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace 
npu { -int ConvTransposeConverter(void* ctx, OpLite* op) { +int ConvTransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -31,15 +31,24 @@ int ConvTransposeConverter(void* ctx, OpLite* op) { VLOG(3) << "[NPU] Converting " << op_type << "... "; // Get input, output and op attributes - auto input_var_name = op_info->Input("Input").front(); - auto input = scope->FindVar(input_var_name)->GetMutable(); - auto input_shape = input->dims().Vectorize(); - auto output_var_name = op_info->Output("Output").front(); - auto filter_var_name = op_info->Input("Filter").front(); - auto filter = scope->FindVar(filter_var_name)->GetMutable(); - auto filter_shape = filter->dims().Vectorize(); - CHECK_EQ(input_shape.size(), 4); - CHECK_EQ(filter_shape.size(), 4); + auto input_name = op_info->Input("Input").front(); + auto input_type = kernel->GetInputDeclType("Input"); + CHECK(input_type->precision() == PRECISION(kFloat)); + CHECK(input_type->layout() == DATALAYOUT(kNCHW)); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + CHECK_EQ(input_dims.size(), 4); + auto filter_name = op_info->Input("Filter").front(); + auto filter_type = kernel->GetInputDeclType("Filter"); + CHECK(filter_type->precision() == PRECISION(kFloat)); + CHECK(filter_type->layout() == DATALAYOUT(kNCHW)); + auto filter = scope->FindMutableTensor(filter_name); + auto filter_dims = filter->dims(); + CHECK_EQ(filter_dims.size(), 4); + auto output_name = op_info->Output("Output").front(); + auto output_type = kernel->GetOutputDeclType("Output"); + CHECK(output_type->precision() == PRECISION(kFloat)); + CHECK(output_type->layout() == DATALAYOUT(kNCHW)); auto strides = op_info->GetAttr>("strides"); auto paddings = op_info->GetAttr>("paddings"); auto groups = op_info->GetAttr("groups"); @@ -48,6 +57,15 @@ int ConvTransposeConverter(void* ctx, OpLite* op) { CHECK_EQ(strides.size(), 2L); 
CHECK_EQ(dilations.size(), 2L); + // Input node + std::shared_ptr input_node = nullptr; + if (graph->HasNode(input_name)) { + input_node = graph->GetNode(input_name); + } else { + input_node = graph->AddNode(input_name, input_dims); + } + + // Create input sizes node to describe the dimensions of input tensor if (paddings.size() == 2L) { for (size_t i = 0; i < 2L; ++i) { int copy_pad = *(paddings.begin() + 2 * i); @@ -56,32 +74,26 @@ int ConvTransposeConverter(void* ctx, OpLite* op) { } CHECK_EQ(paddings.size(), 4L) << "[NPU] Paddings size should be the same or twice as the input size."; - - // Create deconv node - auto conv_transpose_node = - graph->AddNode(output_var_name); - - // Create input sizes node to describe the dimensions of input tensor std::vector input_sizes; - input_sizes.push_back(input_shape[0]); - input_sizes.push_back(filter_shape[1] * groups); + input_sizes.push_back(input_dims[0]); + input_sizes.push_back(filter_dims[1] * groups); for (int i = 0; i < strides.size(); i++) { - int kernel_ext = dilations[i] * (filter_shape[i + 2] - 1) + 1; + int kernel_ext = dilations[i] * (filter_dims[i + 2] - 1) + 1; int output_size = - (input_shape[i + 2] - 1) * strides[i] + kernel_ext - 2 * paddings[i]; + (input_dims[i + 2] - 1) * strides[i] + kernel_ext - 2 * paddings[i]; input_sizes.push_back(output_size); } auto input_sizes_const_node = - graph->AddNode(output_var_name + "/input_sizes", input_sizes); - conv_transpose_node->set_input_input_sizes(*input_sizes_const_node); - - // Create filter node - auto filter_const_node = graph->AddNode(filter_var_name, *filter); - conv_transpose_node->set_input_filter(*filter_const_node); + graph->AddNode(output_name + "/input_sizes", input_sizes); - // Set input node - conv_transpose_node->set_input_x(*graph->GetNode(input_var_name)); + // Filter node + auto filter_const_node = graph->AddNode(filter_name, *filter); + // Deconv node + auto conv_transpose_node = graph->AddNode(output_name); + 
conv_transpose_node->set_input_input_sizes(*input_sizes_const_node); + conv_transpose_node->set_input_filter(*filter_const_node); + conv_transpose_node->set_input_x(*input_node); // Set attributes conv_transpose_node->set_attr_format(0); // NCHW conv_transpose_node->set_attr_pad_mode(0); // NOTSET @@ -93,21 +105,23 @@ int ConvTransposeConverter(void* ctx, OpLite* op) { conv_transpose_node->set_attr_stride( ge::AttrValue::LIST_INT({strides[0], strides[1]})); conv_transpose_node->set_attr_kernel( - ge::AttrValue::LIST_INT({filter_shape[2], filter_shape[3]})); + ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); // Append add node to add bias if exists bias std::shared_ptr output_node = conv_transpose_node; if (HasInputArg(op_info, scope, "Bias")) { // Create bias node - auto bias_var_name = op_info->Input("Bias").front(); - CHECK(!graph->HasNode(bias_var_name)); - auto* bias = scope->FindVar(bias_var_name)->GetMutable(); + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); auto channel_size = bias->dims().production(); - CHECK_EQ(channel_size, filter_shape[1] * groups); + CHECK_EQ(channel_size, filter_dims[1] * groups); auto bias_const_node = - graph->AddNode(bias_var_name, *bias, {1, channel_size, 1, 1}); + graph->AddNode(bias_name, *bias, {1, channel_size, 1, 1}); // Append add node to add bias node - auto add_node = graph->AddNode(output_var_name); + auto add_node = graph->AddNode(output_name); add_node->set_input_x1(*conv_transpose_node); add_node->set_input_x2(*bias_const_node); output_node = add_node; @@ -115,7 +129,7 @@ int ConvTransposeConverter(void* ctx, OpLite* op) { if (fuse_relu) { // Append relu node if fuse_relu is true - auto relu_node = graph->AddNode(output_var_name); + auto relu_node = graph->AddNode(output_name); 
relu_node->set_input_x(*output_node); relu_node->set_attr_mode(CvtActMode("relu")); } diff --git a/lite/kernels/npu/bridges/elementwise_ops.cc b/lite/kernels/npu/bridges/elementwise_ops.cc index 43ecae25e0..a31a1426dc 100644 --- a/lite/kernels/npu/bridges/elementwise_ops.cc +++ b/lite/kernels/npu/bridges/elementwise_ops.cc @@ -21,10 +21,10 @@ namespace lite { namespace subgraph { namespace npu { -std::vector CvtYShape(const Tensor& x, Tensor* y, int axis) { - auto x_dims = x.dims(); +std::vector CvtYShape(const DDim& x_dims, + const DDim& y_dims, + int axis) { CHECK_EQ(x_dims.size(), 4UL) << "[NPU] Only support 4-dimension x"; - auto y_dims = y->dims(); CHECK_GE(x_dims.size(), y_dims.size()); if (axis < 0) { @@ -45,7 +45,7 @@ std::vector CvtYShape(const Tensor& x, Tensor* y, int axis) { return y_new_shape; } -int ElementwiseConverter(void* ctx, OpLite* op) { +int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -54,41 +54,62 @@ int ElementwiseConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - auto x_var_name = op_info->Input("X").front(); - auto y_var_name = op_info->Input("Y").front(); - auto out_var_name = op_info->Output("Out").front(); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto y_name = op_info->Input("Y").front(); + auto y_type = kernel->GetInputDeclType("Y"); + CHECK(y_type->precision() == PRECISION(kFloat)); + CHECK(y_type->layout() == DATALAYOUT(kNCHW)); + auto y = scope->FindMutableTensor(y_name); + auto y_dims = y->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + 
CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); auto axis = op_info->GetAttr("axis"); - std::shared_ptr elementwise_node = nullptr; - std::shared_ptr x_node = graph->GetNode(x_var_name); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Y node std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_var_name)) { - y_node = graph->GetNode(y_var_name); + if (graph->HasNode(y_name)) { + y_node = graph->GetNode(y_name); } else { - auto x = scope->FindTensor(x_var_name); - auto y = scope->FindMutableTensor(y_var_name); - auto y_new_shape = CvtYShape(*x, y, axis); - y_node = graph->AddNode(y_var_name, y, y_new_shape); + auto y_new_shape = CvtYShape(x_dims, y_dims, axis); + y_node = graph->AddNode(y_name, y_new_shape); } + // Elementwise node + std::shared_ptr elementwise_node = nullptr; if (op_type == "elementwise_add" || op_type == "fusion_elementwise_add_activation") { - auto elt_node = graph->AddNode(out_var_name); + auto elt_node = graph->AddNode(out_name); elt_node->set_input_x1(*x_node); elt_node->set_input_x2(*y_node); elementwise_node = elt_node; } else if (op_type == "elementwise_sub") { - auto elt_node = graph->AddNode(out_var_name); + auto elt_node = graph->AddNode(out_name); elt_node->set_input_x1(*x_node); elt_node->set_input_x2(*y_node); elementwise_node = elt_node; } else if (op_type == "elementwise_mul") { - auto elt_node = graph->AddNode(out_var_name); + auto elt_node = graph->AddNode(out_name); elt_node->set_input_x(*x_node); elt_node->set_input_y(*y_node); elementwise_node = elt_node; } else if (op_type == "elementwise_div") { - auto elt_node = graph->AddNode(out_var_name); + auto elt_node = graph->AddNode(out_name); elt_node->set_input_x1(*x_node); elt_node->set_input_x2(*y_node); elementwise_node = elt_node; @@ -97,9 +118,10 @@ int ElementwiseConverter(void* ctx, OpLite* 
op) { return FAILED; } + // Act node if (op_type == "fusion_elementwise_add_activation") { auto act_type = op_info->GetAttr("act_type"); - auto act_node = graph->AddNode(out_var_name); + auto act_node = graph->AddNode(out_name); act_node->set_input_x(*elementwise_node); // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu, // clipped_relu etc. diff --git a/lite/kernels/npu/bridges/fc_op.cc b/lite/kernels/npu/bridges/fc_op.cc index 65b8ca657f..7b66d54565 100644 --- a/lite/kernels/npu/bridges/fc_op.cc +++ b/lite/kernels/npu/bridges/fc_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace npu { -int FCConverter(void* ctx, OpLite* op) { +int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,36 +30,44 @@ int FCConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - auto x_var_name = op_info->Input("Input").front(); - auto w_var_name = op_info->Input("W").front(); - auto out_var_name = op_info->Output("Out").front(); - - int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); - auto x = scope->FindVar(x_var_name)->GetMutable(); - auto w = scope->FindVar(w_var_name)->GetMutable(); - auto x_dims = x->dims(); + auto input_name = op_info->Input("Input").front(); + auto input_type = kernel->GetInputDeclType("Input"); + CHECK(input_type->precision() == PRECISION(kFloat)); + CHECK(input_type->layout() == DATALAYOUT(kNCHW)); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + CHECK_GE(input_dims.size(), 2UL); + auto w_name = op_info->Input("W").front(); + auto w_type = kernel->GetInputDeclType("W"); + CHECK(w_type->precision() == PRECISION(kFloat)); + CHECK(w_type->layout() == DATALAYOUT(kNCHW)); + auto w = scope->FindMutableTensor(w_name); auto w_dims = w->dims(); - - CHECK_GE(x_dims.size(), 2UL); CHECK_EQ(w_dims.size(), 2UL); - - int m = x_dims.Slice(0, 
in_num_col_dims).production(); - int k = x_dims.Slice(in_num_col_dims, x_dims.size()).production(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); + int m = input_dims.Slice(0, in_num_col_dims).production(); + int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production(); int n = w_dims[1]; CHECK_EQ(k * n, w_dims.production()); - VLOG(3) << "[NPU] x dims: " << x_dims << " w dims: " << w_dims << " m: " << m - << " k: " << k << " n: " << n; + VLOG(3) << "[NPU] input dims: " << input_dims << " w dims: " << w_dims + << " m: " << m << " k: " << k << " n: " << n; - auto fc_node = graph->AddNode(out_var_name + "/fc"); - CHECK(!graph->HasNode(w_var_name)); - - // Reshape x to (m, k, 1, 1) - auto reshaped_x_node = - graph->AddNode(x_var_name + "/reshape"); - reshaped_x_node->set_input_tensor(*graph->GetNode(x_var_name)); - reshaped_x_node->set_attr_shape({m, k, 1, 1}); - reshaped_x_node->set_attr_axis(0); - fc_node->set_input_x(*reshaped_x_node); + // Create input node and reshape it to (m, k, 1, 1) + std::shared_ptr input_node = nullptr; + if (graph->HasNode(input_name)) { + input_node = graph->GetNode(input_name); + } else { + input_node = graph->AddNode(input_name, input_dims); + } + auto reshaped_input_node = + graph->AddNode(input_name + "/reshape"); + reshaped_input_node->set_input_tensor(*input_node); + reshaped_input_node->set_attr_shape({m, k, 1, 1}); + reshaped_input_node->set_attr_axis(0); // Create w const node, set its shape to (n, k, 1, 1) and fill with // the transposed w tensor @@ -72,23 +80,26 @@ int FCConverter(void* ctx, OpLite* op) { transpose_w_data[j * k + i] = w_data[i * n + j]; } } - auto w_const_node = graph->AddNode(w_var_name, transpose_w); - fc_node->set_input_w(*w_const_node); + auto trans_w_const_node = 
graph->AddNode(w_name, transpose_w); + // FC node + auto fc_node = graph->AddNode(out_name + "/fc"); + fc_node->set_input_x(*reshaped_input_node); + fc_node->set_input_w(*trans_w_const_node); // Add bias node if bias tensor exists if (HasInputArg(op_info, scope, "Bias")) { - auto bias_var_name = op_info->Input("Bias").front(); - auto bias = scope->FindVar(bias_var_name)->GetMutable(); + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); auto bias_dims = bias->dims(); - CHECK(!graph->HasNode(bias_var_name)); CHECK_EQ(bias_dims.production(), n); - - auto bias_const_node = graph->AddNode(bias_var_name, *bias, {1, n, 1, 1}); + auto bias_const_node = graph->AddNode(bias_name, *bias, {1, n, 1, 1}); fc_node->set_input_b(*bias_const_node); } - - // Reshape output of fc_node from (m, n, 1, 1) to (m, n) - auto reshaped_fc_node = graph->AddNode(out_var_name); + // Reshape output of FC node from (m, n, 1, 1) to (m, n) + auto reshaped_fc_node = graph->AddNode(out_name); reshaped_fc_node->set_input_tensor(*fc_node); reshaped_fc_node->set_attr_shape({m, n}); reshaped_fc_node->set_attr_axis(0); diff --git a/lite/kernels/npu/bridges/graph.cc b/lite/kernels/npu/bridges/graph.cc index 2676eab14d..48ebfd5673 100644 --- a/lite/kernels/npu/bridges/graph.cc +++ b/lite/kernels/npu/bridges/graph.cc @@ -22,35 +22,25 @@ namespace subgraph { namespace npu { // Const node -std::shared_ptr Graph::AddNode(const std::string& name, - const Tensor& tensor, - PrecisionType ptype, - DataLayoutType ltype) { - return AddNode(name, tensor, tensor.dims().Vectorize(), ptype, ltype); -} - std::shared_ptr Graph::AddNode(const std::string& name, const Tensor& tensor, std::vector shape, - PrecisionType ptype, - DataLayoutType ltype) { - CHECK(!HasNode(name)) << "Node " << name << " redefined."; - auto node 
= AddNode(name); - node->set_attr_value(CvtTensor(tensor, shape, ptype, ltype)); + PrecisionType precision, + DataLayoutType layout) { + auto node = AddNode(name, precision, layout); + node->set_attr_value(CvtTensor(tensor, shape, precision, layout)); return node; } // Data node std::shared_ptr Graph::AddNode(const std::string& name, std::vector shape, - PrecisionType ptype, - DataLayoutType ltype) { - CHECK(!HasNode(name)) << "Node " << name << " redefined."; + PrecisionType precision, + DataLayoutType layout) { auto node = AddNode(name); ge::TensorDesc desc( - ge::Shape(shape), CvtDataLayoutType(ltype), CvtPrecisionType(ptype)); + ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision)); node->update_input_desc_x(desc); - nodes_.insert(std::make_pair(name, node)); return node; } diff --git a/lite/kernels/npu/bridges/graph.h b/lite/kernels/npu/bridges/graph.h index 153cc65409..9b6e49c5e9 100644 --- a/lite/kernels/npu/bridges/graph.h +++ b/lite/kernels/npu/bridges/graph.h @@ -28,11 +28,35 @@ namespace lite { namespace subgraph { namespace npu { -// Type and registers of converters for converting Paddle Ops to HiAI IR graph +// Type of graph nodes +class Type { + public: + Type(PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW), + bool persistable = false) + : precision_(precision), layout_(layout), persistable_(persistable) {} + + void set_precision(PrecisionType precision) { precision_ = precision; } + void set_layout(DataLayoutType layout) { layout_ = layout; } + bool set_persistable(bool persistable) { persistable_ = persistable; } + + PrecisionType precision() const { return precision_; } + DataLayoutType layout() const { return layout_; } + bool persistable() const { return persistable_; } + + private: + PrecisionType precision_{PRECISION(kFloat)}; + DataLayoutType layout_{DATALAYOUT(kNCHW)}; + bool persistable_{false}; +}; + +// Graph to collect all of converted HiAI IR nodes class Graph { public: 
template - std::shared_ptr AddNode(const std::string& name) { + std::shared_ptr AddNode(const std::string& name, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { auto unique_name = [&](const std::string& key) { int idx = 1; auto it = counts_.find(key); @@ -43,8 +67,12 @@ class Graph { } return key + "_" + std::to_string(idx); }; + bool persistable = typeid(T) == typeid(ge::op::Const); auto it = nodes_.find(name); if (it != nodes_.end()) { + // Only variable can rebind the name + CHECK(!it->second.second.persistable() && !persistable) + << "[NPU] Node " << name << " redefined."; // Generate a new unique name as the key to bind the origin node: // new_name->node nodes_.insert(std::make_pair(unique_name(name + "_var"), it->second)); @@ -52,7 +80,8 @@ class Graph { } // Create a new node and bind with the name: name->new_node auto node = std::make_shared(unique_name(name + "_op")); - nodes_.insert(std::make_pair(name, node)); + nodes_.insert(std::make_pair( + name, std::make_pair(node, Type(precision, layout, persistable)))); return node; } @@ -60,30 +89,41 @@ class Graph { std::shared_ptr AddNode( const std::string& name, const Tensor& tensor, - PrecisionType ptype = PRECISION(kFloat), - DataLayoutType ltype = DATALAYOUT(kNCHW)); + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return AddNode(name, tensor, tensor.dims().Vectorize(), precision, layout); + } std::shared_ptr AddNode( const std::string& name, const Tensor& tensor, std::vector shape, - PrecisionType ptype = PRECISION(kFloat), - DataLayoutType ltype = DATALAYOUT(kNCHW)); + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr AddNode( + const std::string& name, + const Tensor& tensor, + DDim dims, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return AddNode(name, tensor, dims.Vectorize(), precision, layout); + } 
template std::shared_ptr AddNode( const std::string& name, const std::vector& data, std::vector shape = {}, - DataLayoutType ltype = DATALAYOUT(kNCHW)) { + DataLayoutType layout = DATALAYOUT(kNCHW)) { const std::type_info& info = typeid(T); - PrecisionType ptype = PRECISION(kFloat); + PrecisionType precision = PRECISION(kFloat); if (info == typeid(float)) { - ptype = PRECISION(kFloat); + precision = PRECISION(kFloat); } else if (info == typeid(int8_t)) { - ptype = PRECISION(kFloat); + precision = PRECISION(kFloat); } else if (info == typeid(int32_t)) { - ptype = PRECISION(kInt32); + precision = PRECISION(kInt32); } else { LOG(FATAL) << "[NPU] Unknow data type " << info.name(); } @@ -101,7 +141,16 @@ class Graph { std::memcpy(reinterpret_cast(tensor.mutable_data()), reinterpret_cast(data.data()), data.size() * sizeof(T)); - return AddNode(name, tensor, ptype, ltype); + return AddNode(name, tensor, precision, layout); + } + + template + std::shared_ptr AddNode( + const std::string& name, + const std::vector& data, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return AddNode(name, data, dims.Vectorize(), layout); } template @@ -109,25 +158,47 @@ class Graph { const std::string& name, T value, std::vector shape = {1}, - DataLayoutType ltype = DATALAYOUT(kNCHW)) { + DataLayoutType layout = DATALAYOUT(kNCHW)) { int64_t size = 1; for (auto i : shape) { size *= i; } std::vector data(size, value); - return AddNode(name, data, shape, ltype); + return AddNode(name, data, shape, layout); + } + + template + std::shared_ptr AddNode( + const std::string& name, + T value, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return AddNode(name, value, dims.Vectorize(), layout); } // Data node std::shared_ptr AddNode( const std::string& name, std::vector shape, - PrecisionType ptype = PRECISION(kFloat), - DataLayoutType ltype = DATALAYOUT(kNCHW)); + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + 
std::shared_ptr AddNode( + const std::string& name, + DDim dims, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return AddNode(name, dims.Vectorize(), precision, layout); + } std::shared_ptr GetNode(std::string name) { CHECK(HasNode(name)) << "[NPU] Node " << name << " not found."; - return nodes_.at(name); + return nodes_.at(name).first; + } + + const Type& GetType(const std::string& name) { + CHECK(HasNode(name)) << "[NPU] Node " << name << " not found."; + return nodes_.at(name).second; } bool HasNode(const std::string& name) { @@ -135,7 +206,9 @@ class Graph { } private: - std::unordered_map> nodes_; + std::unordered_map, Type>> + nodes_; std::unordered_map counts_; }; diff --git a/lite/kernels/npu/bridges/interpolate_op.cc b/lite/kernels/npu/bridges/interpolate_op.cc index b54dcee849..f95ebc347a 100644 --- a/lite/kernels/npu/bridges/interpolate_op.cc +++ b/lite/kernels/npu/bridges/interpolate_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace npu { -int InterpolateConverter(void* ctx, OpLite* op) { +int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,14 +30,20 @@ int InterpolateConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - // Get input, output and attributes from lite op - auto x_var_name = op_info->Input("X").front(); - auto x = scope->FindVar(x_var_name)->GetMutable(); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); auto x_h = x_dims[2]; auto x_w = x_dims[3]; CHECK_EQ(x_dims.size(), 4); - auto out_var_name = op_info->Output("Out").front(); + auto out_name = 
op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); auto scale = op_info->GetAttr("scale"); auto out_w = op_info->GetAttr("out_w"); auto out_h = op_info->GetAttr("out_h"); @@ -48,6 +54,14 @@ int InterpolateConverter(void* ctx, OpLite* op) { "align_corners = false isn't " "supported in HiAI DDK"; + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + // Priority: OutSize > scale > out_h/out_w if (scale > 0) { out_h = static_cast(x_h * scale); @@ -56,14 +70,17 @@ int InterpolateConverter(void* ctx, OpLite* op) { out_w = out_w > 0 ? out_w : -1; } - // Update out_h and out_w if has OutSize + // Update out_h and out_w and create out_size node if has OutSize std::shared_ptr out_size_node = nullptr; if (HasInputArg(op_info, scope, "OutSize")) { - auto out_size_var_name = op_info->Input("OutSize").front(); - if (graph->HasNode(out_size_var_name)) { - out_size_node = graph->GetNode(out_size_var_name); + auto out_size_name = op_info->Input("OutSize").front(); + auto out_size_type = kernel->GetInputDeclType("OutSize"); + CHECK(out_size_type->precision() == PRECISION(kInt32)); + CHECK(out_size_type->layout() == DATALAYOUT(kNCHW)); + if (graph->HasNode(out_size_name)) { + out_size_node = graph->GetNode(out_size_name); } else { - auto out_size = scope->FindVar(out_size_var_name)->GetMutable(); + auto out_size = scope->FindMutableTensor(out_size_name); CHECK_EQ(out_size->numel(), 2); auto out_size_data = out_size->mutable_data(); // Update out_h and out_w if has OutSize @@ -80,20 +97,20 @@ int InterpolateConverter(void* ctx, OpLite* op) { << " is too large, should not exceed " << largest_multiple << " in HiAI DDK"; } - out_size_node = graph->AddNode(out_var_name + "/out_size", + out_size_node = graph->AddNode(out_name + 
"/out_size", std::vector({out_h, out_w})); } if (interp_method == "bilinear") { auto bilinear_interp_node = - graph->AddNode(out_var_name); - bilinear_interp_node->set_input_x(*graph->GetNode(x_var_name)); + graph->AddNode(out_name); + bilinear_interp_node->set_input_x(*x_node); bilinear_interp_node->set_input_size(*out_size_node); bilinear_interp_node->set_attr_align_corners(align_corners); } else if (interp_method == "nearest") { auto nearest_interp_node = - graph->AddNode(out_var_name); - nearest_interp_node->set_input_image(*graph->GetNode(x_var_name)); + graph->AddNode(out_name); + nearest_interp_node->set_input_image(*x_node); nearest_interp_node->set_input_size(*out_size_node); nearest_interp_node->set_attr_align_corners(align_corners); } else { diff --git a/lite/kernels/npu/bridges/mul_op.cc b/lite/kernels/npu/bridges/mul_op.cc index e5b24b4092..f63b6826b9 100644 --- a/lite/kernels/npu/bridges/mul_op.cc +++ b/lite/kernels/npu/bridges/mul_op.cc @@ -22,7 +22,7 @@ namespace subgraph { namespace npu { // Note: all of the input weight vars should be handled in this converter -int MulConverter(void* ctx, OpLite* op) { +int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -31,13 +31,23 @@ int MulConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - auto x_var_name = op_info->Input("X").front(); - auto y_var_name = op_info->Input("Y").front(); - auto x = scope->FindVar(x_var_name)->GetMutable(); - auto y = scope->FindVar(y_var_name)->GetMutable(); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto y_name = op_info->Input("Y").front(); + auto y_type = 
kernel->GetInputDeclType("Y"); + CHECK(y_type->precision() == PRECISION(kFloat)); + CHECK(y_type->layout() == DATALAYOUT(kNCHW)); + auto y = scope->FindMutableTensor(y_name); auto y_dims = y->dims(); - auto out_var_name = op_info->Output("Out").front(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); int x_num_col_dims = op_info->GetAttr("x_num_col_dims"); int y_num_col_dims = op_info->GetAttr("y_num_col_dims"); int m = x_dims.Slice(0, x_num_col_dims).production(); @@ -46,40 +56,45 @@ int MulConverter(void* ctx, OpLite* op) { << "[NPU] columns of X must be equal with rows of Y"; int n = y_dims.Slice(y_num_col_dims, y_dims.size()).production(); VLOG(3) << "m:" << m << ",n:" << n << ",k:" << k; - VLOG(3) << "x_var_name:" << x_var_name - << ", is data: " << graph->HasNode(x_var_name); - VLOG(3) << "y_var_name:" << y_var_name - << ", is data: " << graph->HasNode(y_var_name); - CHECK(graph->HasNode(x_var_name)) + VLOG(3) << "x_name:" << x_name << ", is data: " << graph->HasNode(x_name); + VLOG(3) << "y_name:" << y_name << ", is data: " << graph->HasNode(y_name); + CHECK(graph->HasNode(x_name)) << "[NPU] MatMul in HiAI DDK only support X is data, Y is const yet."; - auto mul_node = graph->AddNode(out_var_name); - // Add input x node which supports persistable and non-persistable tensor, and + // X node which supports persistable and non-persistable tensor, and // reshape to (m, k) - if (graph->HasNode(x_var_name)) { - auto reshaped_x_node = - graph->AddNode(x_var_name + "/reshape"); - reshaped_x_node->set_input_tensor(*graph->GetNode(x_var_name)); + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + auto reshaped_x_node = graph->AddNode(x_name + "/reshape"); + reshaped_x_node->set_input_tensor(*x_node); reshaped_x_node->set_attr_shape({m, k}); 
reshaped_x_node->set_attr_axis(0); - mul_node->set_input_x1(*reshaped_x_node); + x_node = reshaped_x_node; } else { - auto x_const_node = graph->AddNode(x_var_name, *x, {m, k}); - mul_node->set_input_x1(*x_const_node); + auto x_const_node = graph->AddNode(x_name, *x, {m, k}); + x_node = x_const_node; } - // Add input y node which only supports persistable tensor, and reshape to + + // Y node which only supports persistable tensor, and reshape to // (k,n) - if (graph->HasNode(y_var_name)) { - auto reshaped_y_node = - graph->AddNode(y_var_name + "/reshape"); - reshaped_y_node->set_input_tensor(*graph->GetNode(y_var_name)); + std::shared_ptr y_node = nullptr; + if (graph->HasNode(y_name)) { + y_node = graph->GetNode(y_name); + auto reshaped_y_node = graph->AddNode(y_name + "/reshape"); + reshaped_y_node->set_input_tensor(*y_node); reshaped_y_node->set_attr_shape({k, n}); reshaped_y_node->set_attr_axis(0); - mul_node->set_input_x2(*reshaped_y_node); + y_node = reshaped_y_node; } else { - auto y_const_node = graph->AddNode(y_var_name, *y, {k, n}); - mul_node->set_input_x2(*y_const_node); + auto y_const_node = graph->AddNode(y_name, *y, {k, n}); + y_node = y_const_node; } + + // Matmul node + auto mul_node = graph->AddNode(out_name); + mul_node->set_input_x1(*x_node); + mul_node->set_input_x2(*y_node); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/npu/bridges/pad2d_op.cc b/lite/kernels/npu/bridges/pad2d_op.cc index 8b3f17a861..451f48b1df 100644 --- a/lite/kernels/npu/bridges/pad2d_op.cc +++ b/lite/kernels/npu/bridges/pad2d_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace npu { -int Pad2dConverter(void* ctx, OpLite* op) { +int Pad2dConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,38 +30,54 @@ int Pad2dConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - auto x_var_name = 
op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); - auto pad2d_node = graph->AddNode(out_var_name); - pad2d_node->set_input_x(*graph->GetNode(x_var_name)); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("Input"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + auto padding = op_info->GetAttr>("paddings"); + CHECK_EQ(padding.size(), 4); - auto mode = op_info->GetAttr("mode"); - if (mode == "constant") { - pad2d_node->set_attr_mode(0); - } else if (mode == "reflect") { - LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK"; - pad2d_node->set_attr_mode(1); - return FAILED; + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); } else { - LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK"; - return FAILED; + x_node = graph->AddNode(x_name, x_dims); } - auto x_dims = scope->FindTensor(x_var_name)->dims(); - auto padding = op_info->GetAttr>("paddings"); - CHECK_EQ(padding.size(), 4); + // Padding node int xds = x_dims.size(); padding.insert(padding.begin(), xds * 2 - 4, 0); auto padding_const_node = - graph->AddNode(out_var_name + "/padding", padding, {xds, 2}); - pad2d_node->set_input_padding(*padding_const_node); + graph->AddNode(out_name + "/padding", padding, {xds, 2}); + // Pad node + auto pad2d_node = graph->AddNode(out_name); + pad2d_node->set_input_x(*x_node); + pad2d_node->set_input_padding(*padding_const_node); + auto mode = op_info->GetAttr("mode"); if (mode == "constant") { + // Pad value node auto pad_value = 
op_info->GetAttr("pad_value"); auto pad_value_const_node = - graph->AddNode(out_var_name + "/pad_value", pad_value); + graph->AddNode(out_name + "/pad_value", pad_value); pad2d_node->set_input_constant_values(*pad_value_const_node); pad2d_node->set_attr_T(0); // type of pad_value: 0:float 3:int32 + pad2d_node->set_attr_mode(0); + } else if (mode == "reflect") { + LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK"; + pad2d_node->set_attr_mode(1); + return FAILED; + } else { + LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK"; + return FAILED; } return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/npu/bridges/pool_op.cc b/lite/kernels/npu/bridges/pool_op.cc index 9122da2c16..8b108fc4ee 100644 --- a/lite/kernels/npu/bridges/pool_op.cc +++ b/lite/kernels/npu/bridges/pool_op.cc @@ -22,7 +22,7 @@ namespace lite { namespace subgraph { namespace npu { -int PoolConverter(void* ctx, OpLite* op) { +int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -31,14 +31,32 @@ int PoolConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - auto x_var_name = op_info->Input("X").front(); - auto x = scope->FindTensor(x_var_name); - auto out_var_name = op_info->Output("Out").front(); - auto pool_node = graph->AddNode(out_var_name); - pool_node->set_input_x(*graph->GetNode(x_var_name)); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == 
DATALAYOUT(kNCHW)); + auto pooling_type = op_info->GetAttr("pooling_type"); + auto global_pooling = op_info->GetAttr("global_pooling"); + auto ksize = op_info->GetAttr>("ksize"); + auto paddings = op_info->GetAttr>("paddings"); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + // pool mode int mode = 0; - auto pooling_type = op_info->GetAttr("pooling_type"); if (pooling_type == "max") { mode = 0; } else if (pooling_type == "avg") { @@ -49,8 +67,8 @@ int PoolConverter(void* ctx, OpLite* op) { LOG(WARNING) << "[NPU] Unsupported pooling type: " << pooling_type; return FAILED; } - pool_node->set_attr_mode(mode); + // pad mode int pad_mode = 0; std::string padding_algorithm(""); if (op_info->HasAttr("padding_algorithm")) { @@ -61,16 +79,8 @@ int PoolConverter(void* ctx, OpLite* op) { } else if (padding_algorithm == "VALID") { pad_mode = 5; } - pool_node->set_attr_pad_mode(pad_mode); - - bool global_pooling = op_info->GetAttr("global_pooling"); - pool_node->set_attr_global_pooling(global_pooling); - - auto ksize = op_info->GetAttr>("ksize"); - pool_node->set_attr_window( - ge::AttrValue::LIST_INT(ksize.begin(), ksize.end())); - auto paddings = op_info->GetAttr>("paddings"); + // paddings and strides if (paddings.size() == 2L) { for (size_t i = 0; i < 2L; ++i) { int copy_pad = *(paddings.begin() + 2 * i); @@ -91,15 +101,25 @@ int PoolConverter(void* ctx, OpLite* op) { x->dims(), strides, ksize); - pool_node->set_attr_pad(ge::AttrValue::LIST_INT{ - paddings[0], paddings[1], paddings[2], paddings[3]}); - pool_node->set_attr_stride( - ge::AttrValue::LIST_INT(strides.begin(), strides.end())); + // ceil mode int ceil_mode = 0; if (op_info->HasAttr("ceil_mode")) { ceil_mode = op_info->GetAttr("ceil_mode") ? 
1 : 0; } + + // Pooling node + auto pool_node = graph->AddNode(out_name); + pool_node->set_input_x(*x_node); + pool_node->set_attr_mode(mode); + pool_node->set_attr_pad_mode(pad_mode); + pool_node->set_attr_global_pooling(global_pooling); + pool_node->set_attr_window( + ge::AttrValue::LIST_INT(ksize.begin(), ksize.end())); + pool_node->set_attr_pad(ge::AttrValue::LIST_INT{ + paddings[0], paddings[1], paddings[2], paddings[3]}); + pool_node->set_attr_stride( + ge::AttrValue::LIST_INT(strides.begin(), strides.end())); pool_node->set_attr_ceil_mode(ceil_mode); // pool_node->set_attr_data_mode(data_mode); return REBUILD_WHEN_SHAPE_CHANGED; diff --git a/lite/kernels/npu/bridges/reduce_mean_op.cc b/lite/kernels/npu/bridges/reduce_mean_op.cc index 29f11193e3..6c7f29fb27 100644 --- a/lite/kernels/npu/bridges/reduce_mean_op.cc +++ b/lite/kernels/npu/bridges/reduce_mean_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace npu { -int ReduceMeanConverter(void* ctx, OpLite* op) { +int ReduceMeanConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,10 +30,17 @@ int ReduceMeanConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - // Get input and op attributes - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Input("Out").front(); - auto x_dims = scope->FindTensor(x_var_name)->dims(); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Input("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); 
auto keep_dim = op_info->GetAttr("keep_dim"); auto dim = op_info->GetAttr>("dim"); CHECK(!dim.empty()) << "[NPU] \"dim\" of reduce_mean should not be empty."; @@ -44,21 +51,36 @@ int ReduceMeanConverter(void* ctx, OpLite* op) { } std::sort(dim.begin(), dim.end()); - // Create reduce_mean(using reduce_sum + scale) node and set input node from - // node map - auto reduce_sum_node = - graph->AddNode(out_var_name + "/reducesum"); - reduce_sum_node->set_input_x(*graph->GetNode(x_var_name)); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Using ReduceSum + Scale to implement ReduceMean - auto dim_const_node = graph->AddNode(out_var_name + "/dim", dim); + // Dim node + auto dim_const_node = graph->AddNode(out_name + "/dim", dim); + + // Reduce Sum node + auto reduce_sum_node = + graph->AddNode(out_name + "/reducesum"); + reduce_sum_node->set_input_x(*x_node); reduce_sum_node->set_input_w(*dim_const_node); reduce_sum_node->set_attr_keep_dims(keep_dim); + // Scale node + auto scale_node = graph->AddNode(out_name); + scale_node->set_input_x(*reduce_sum_node); + scale_node->set_attr_axis(1); + + // Add filter node(fill with scale) float scale = 1; for (size_t i = 0; i < dim.size(); i++) { scale /= x_dims[dim[i]]; } - std::vector scale_bias_shape = x_dims.Vectorize(); if (keep_dim) { for (size_t i = 0; i < dim.size(); i++) { @@ -73,13 +95,9 @@ int ReduceMeanConverter(void* ctx, OpLite* op) { remove(scale_bias_shape.begin(), scale_bias_shape.end(), kDelFlag), scale_bias_shape.end()); } - auto filter_const_node = - graph->AddNode(out_var_name + "/filter", scale, scale_bias_shape); - auto scale_node = graph->AddNode(out_var_name); - scale_node->set_input_x(*reduce_sum_node); + graph->AddNode(out_name + "/filter", scale, scale_bias_shape); scale_node->set_input_filter(*filter_const_node); - scale_node->set_attr_axis(1); return 
REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/npu/bridges/registry.h b/lite/kernels/npu/bridges/registry.h index c4c88ae02b..5198a3f8f2 100644 --- a/lite/kernels/npu/bridges/registry.h +++ b/lite/kernels/npu/bridges/registry.h @@ -33,7 +33,8 @@ inline bool CHECK_REBUILD_WHEN_SHAPE_CHANGED(int status) { return status & REBUILD_WHEN_SHAPE_CHANGED; } -using cvt_func_type = std::function; +using cvt_func_type = + std::function; using cvt_map_type = std::unordered_map>; diff --git a/lite/kernels/npu/bridges/reshape_op.cc b/lite/kernels/npu/bridges/reshape_op.cc index 9bd77e8eb9..d5100dee4a 100644 --- a/lite/kernels/npu/bridges/reshape_op.cc +++ b/lite/kernels/npu/bridges/reshape_op.cc @@ -22,7 +22,7 @@ namespace lite { namespace subgraph { namespace npu { -int ReshapeConverter(void* ctx, OpLite* op) { +int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -31,25 +31,44 @@ int ReshapeConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - // Get input, output and op attributes - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); - auto x = scope->FindVar(x_var_name)->GetMutable(); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); - // Create reshape node and set input node from inputs_map - auto reshape_node = graph->AddNode(out_var_name); - reshape_node->set_input_tensor(*graph->GetNode(x_var_name)); + // X 
node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Reshape node + auto reshape_node = graph->AddNode(out_name); + reshape_node->set_input_tensor(*x_node); // Read shape from "ShapeTensor"(input), or "Shape"(input), or "shape"(attr) if (HasInputArg(op_info, scope, "ShapeTensor")) { LOG(WARNING) << "[NPU] not support \"Shape\" from more than one Tensor."; return FAILED; } else if (HasInputArg(op_info, scope, "Shape")) { - auto actual_shape_var_name = op_info->Input("Shape").front(); - if (!graph->HasNode(actual_shape_var_name)) { - auto actual_shape = - scope->FindVar(actual_shape_var_name)->GetMutable(); + auto actual_shape_name = op_info->Input("Shape").front(); + // auto actual_shape_type = kernel->GetInputDeclType("Shape"); + // CHECK(actual_shape_type->precision() == PRECISION(kInt32)); + // CHECK(actual_shape_type->layout() == DATALAYOUT(kNCHW)); + std::shared_ptr actual_shape_node = nullptr; + if (graph->HasNode(actual_shape_name)) { + actual_shape_node = graph->GetNode(actual_shape_name); + } else { + auto actual_shape = scope->FindMutableTensor(actual_shape_name); auto actual_shape_dims = actual_shape->dims(); auto actual_shape_data = actual_shape->mutable_data(); auto shape = @@ -63,12 +82,11 @@ int ReshapeConverter(void* ctx, OpLite* op) { << out_shape.size(); } auto actual_shape_const_node = - graph->AddNode(actual_shape_var_name, + graph->AddNode(actual_shape_name, std::vector(out_shape.begin(), out_shape.end())); - reshape_node->set_input_w(*actual_shape_const_node); - } else { - reshape_node->set_input_w(*graph->GetNode(actual_shape_var_name)); + actual_shape_node = actual_shape_const_node; } + reshape_node->set_input_w(*actual_shape_node); } else { auto shape = op_info->GetAttr>("shape"); auto out_dims = lite::operators::ValidateShape(shape, x_dims); @@ -82,6 +100,7 @@ int ReshapeConverter(void* ctx, OpLite* op) { 
ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); } + // XShape node if (op_type == "reshape2") { // Append an extra reshape node to calc XShape std::vector xshape_dims(x_dims.size() + 1, 1); @@ -92,10 +111,14 @@ int ReshapeConverter(void* ctx, OpLite* op) { LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, " "but XShape has " << xshape_dims.size(); + return FAILED; } - auto xshape_var_name = op_info->Output("XShape").front(); - auto xshape_node = graph->AddNode(xshape_var_name); - xshape_node->set_input_tensor(*graph->GetNode(x_var_name)); + auto xshape_name = op_info->Output("XShape").front(); + // auto xshape_type = kernel->GetOutputDeclType("XShape"); + // CHECK(xshape_type->precision() == PRECISION(kFloat)); + // CHECK(xshape_type->layout() == DATALAYOUT(kNCHW)); + auto xshape_node = graph->AddNode(xshape_name); + xshape_node->set_input_tensor(*x_node); xshape_node->set_attr_shape( ge::AttrValue::LIST_INT(xshape_dims.begin(), xshape_dims.end())); } diff --git a/lite/kernels/npu/bridges/scale_op.cc b/lite/kernels/npu/bridges/scale_op.cc index 72d0a7d300..ca04996faf 100644 --- a/lite/kernels/npu/bridges/scale_op.cc +++ b/lite/kernels/npu/bridges/scale_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace npu { -int ScaleConverter(void* ctx, OpLite* op) { +int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -31,11 +31,17 @@ int ScaleConverter(void* ctx, OpLite* op) { VLOG(3) << "[NPU] Converting " + op_type + "..."; // Get input, output and op attributes - auto x_var_name = op_info->Input("X").front(); - auto x = scope->FindVar(x_var_name)->GetMutable(); - auto x_dims = x->dims().Vectorize(); + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); 
+ auto x_dims = x->dims(); CHECK_GE(x_dims.size(), 2); - auto out_var_name = op_info->Output("Out").front(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); std::vector scale_bias_shape = {x_dims[1]}; float scale = op_info->GetAttr("scale"); float bias = op_info->GetAttr("bias"); @@ -44,23 +50,31 @@ int ScaleConverter(void* ctx, OpLite* op) { bias *= scale; } - // Create scale node and set input node from inputs_map - auto scale_node = graph->AddNode(out_var_name); - scale_node->set_input_x(*graph->GetNode(x_var_name)); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Scale node + auto scale_node = graph->AddNode(out_name); + scale_node->set_input_x(*x_node); + scale_node->set_attr_axis(1); // Add filter node(fill with scale) auto filter_const_node = - graph->AddNode(out_var_name + "/filter", scale, scale_bias_shape); + graph->AddNode(out_name + "/filter", scale, scale_bias_shape); scale_node->set_input_filter(*filter_const_node); // Add bias node(fill with bias) if (fabs(bias) > 1e-6f) { auto bias_const_node = - graph->AddNode(out_var_name + "/bias", bias, scale_bias_shape); + graph->AddNode(out_name + "/bias", bias, scale_bias_shape); scale_node->set_input_bias(*bias_const_node); scale_node->set_attr_has_bias_value(true); } - scale_node->set_attr_axis(1); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/npu/bridges/shuffle_channel_op.cc b/lite/kernels/npu/bridges/shuffle_channel_op.cc index 1d56b7d206..47469e1506 100644 --- a/lite/kernels/npu/bridges/shuffle_channel_op.cc +++ b/lite/kernels/npu/bridges/shuffle_channel_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace npu { -int ShuffleChannelConverter(void* ctx, OpLite* op) { +int 
ShuffleChannelConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,13 +30,31 @@ int ShuffleChannelConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); - auto shuffle_channel_node = - graph->AddNode(out_var_name); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + auto group = op_info->GetAttr("group"); - shuffle_channel_node->set_input_x(*graph->GetNode(x_var_name)); - shuffle_channel_node->set_attr_group(op_info->GetAttr("group")); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Shuffle Channel node + auto shuffle_channel_node = graph->AddNode(out_name); + shuffle_channel_node->set_input_x(*x_node); + shuffle_channel_node->set_attr_group(group); return SUCCESS; } diff --git a/lite/kernels/npu/bridges/softmax_op.cc b/lite/kernels/npu/bridges/softmax_op.cc index e8d97194a8..01d8b0a944 100644 --- a/lite/kernels/npu/bridges/softmax_op.cc +++ b/lite/kernels/npu/bridges/softmax_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace npu { -int SoftmaxConverter(void* ctx, OpLite* op) { +int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); 
@@ -30,9 +30,17 @@ int SoftmaxConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); - auto x_dims = scope->FindVar(x_var_name)->GetMutable()->dims(); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); auto axis = op_info->GetAttr("axis"); if (x_dims.size() > 3) { CHECK(!(axis == 2 && x_dims[3] > 1)) @@ -40,8 +48,17 @@ int SoftmaxConverter(void* ctx, OpLite* op) { << " :x_w = " << x_dims[3]; } - auto softmax_node = graph->AddNode(out_var_name); - softmax_node->set_input_x(*graph->GetNode(x_var_name)); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Softmax node + auto softmax_node = graph->AddNode(out_name); + softmax_node->set_input_x(*x_node); softmax_node->set_attr_axis(axis); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/npu/bridges/split_op.cc b/lite/kernels/npu/bridges/split_op.cc index 29ba88f8a9..597de04d5b 100644 --- a/lite/kernels/npu/bridges/split_op.cc +++ b/lite/kernels/npu/bridges/split_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace npu { -int SplitConverter(void* ctx, OpLite* op) { +int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,15 +30,33 @@ int SplitConverter(void* 
ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " << op_type << " ... "; - auto x_var_name = op_info->Input("X").front(); - auto out_var_names = op_info->Output("Out"); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_names = op_info->Output("Out"); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); auto axis = op_info->GetAttr("axis"); auto num = op_info->GetAttr("num"); auto sections = op_info->GetAttr>("sections"); int64_t sections_num = static_cast(sections.size()); - auto split_node = graph->AddNode(op_type + "/" + x_var_name); - split_node->set_input_x(*graph->GetNode(x_var_name)); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Split node + auto split_node = graph->AddNode(op_type + "/" + x_name); + split_node->set_input_x(*x_node); split_node->set_attr_axis(static_cast(axis)); if (num > 0) { split_node->set_attr_output_num(static_cast(num)); @@ -48,12 +66,12 @@ int SplitConverter(void* ctx, OpLite* op) { split_node->set_attr_size_split(size_split); } - split_node->create_dynamic_output_y(out_var_names.size()); + split_node->create_dynamic_output_y(out_names.size()); int idx = 1; - for (auto& out_var_name : out_var_names) { + for (auto& out_name : out_names) { auto zero_const_node = - graph->AddNode(out_var_name + "/zero" + std::to_string(idx), 0); - auto add_node = graph->AddNode(out_var_name); + graph->AddNode(out_name + "/zero" + std::to_string(idx), 0); + auto add_node = graph->AddNode(out_name); 
add_node->set_input_x1(*split_node, "y" + std::to_string(idx)); add_node->set_input_x2(*zero_const_node); idx++; diff --git a/lite/kernels/npu/bridges/sqrt_op.cc b/lite/kernels/npu/bridges/sqrt_op.cc index f10fa7b616..2ee58862fb 100644 --- a/lite/kernels/npu/bridges/sqrt_op.cc +++ b/lite/kernels/npu/bridges/sqrt_op.cc @@ -21,18 +21,38 @@ namespace lite { namespace subgraph { namespace npu { -int SqrtConverter(void* ctx, OpLite* op) { +int SqrtConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); auto op_info = op->op_info(); auto op_type = op_info->Type(); + auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); - auto sqrt_node = graph->AddNode(out_var_name); - sqrt_node->set_input_x(*graph->GetNode(x_var_name)); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Sqrt node + auto sqrt_node = graph->AddNode(out_name); + sqrt_node->set_input_x(*x_node); return SUCCESS; } diff --git a/lite/kernels/npu/bridges/square_op.cc b/lite/kernels/npu/bridges/square_op.cc index f7d1a2535e..3f6676c8a8 100644 --- a/lite/kernels/npu/bridges/square_op.cc +++ b/lite/kernels/npu/bridges/square_op.cc @@ -21,18 +21,38 @@ namespace lite { namespace 
subgraph { namespace npu { -int SquareConverter(void* ctx, OpLite* op) { +int SquareConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); auto op_info = op->op_info(); auto op_type = op_info->Type(); + auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + op_type + "..."; - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); - auto square_node = graph->AddNode(out_var_name); - square_node->set_input_x(*graph->GetNode(x_var_name)); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Square node + auto square_node = graph->AddNode(out_name); + square_node->set_input_x(*x_node); return SUCCESS; } diff --git a/lite/kernels/npu/bridges/transpose_op.cc b/lite/kernels/npu/bridges/transpose_op.cc index 126390e9b7..70449dac7a 100644 --- a/lite/kernels/npu/bridges/transpose_op.cc +++ b/lite/kernels/npu/bridges/transpose_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace npu { -int TransposeConverter(void* ctx, OpLite* op) { +int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,13 +30,28 @@ int TransposeConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " + 
op_type + "..."; - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Input("Out").front(); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Input("Out").front(); auto axis = op_info->GetAttr>("axis"); - auto transpose_node = graph->AddNode(out_var_name); - transpose_node->set_input_x(*graph->GetNode(x_var_name)); - auto w_const_node = graph->AddNode(out_var_name + "/w", 1.0f); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Transpose node + auto transpose_node = graph->AddNode(out_name); + transpose_node->set_input_x(*x_node); + auto w_const_node = graph->AddNode(out_name + "/w", 1.0f); transpose_node->set_input_w(*w_const_node); transpose_node->set_attr_order( ge::AttrValue::LIST_INT(axis.begin(), axis.end())); diff --git a/lite/kernels/npu/bridges/unsqueeze_op.cc b/lite/kernels/npu/bridges/unsqueeze_op.cc index 0da82e2f63..8ff95d4ed8 100644 --- a/lite/kernels/npu/bridges/unsqueeze_op.cc +++ b/lite/kernels/npu/bridges/unsqueeze_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace npu { -int UnsqueezeConverter(void* ctx, OpLite* op) { +int UnsqueezeConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,14 +30,31 @@ int UnsqueezeConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[NPU] Converting " << op_type << "... 
"; - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); - auto out_shape = scope->FindTensor(out_var_name)->dims().Vectorize(); + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + auto out_shape = scope->FindTensor(out_name)->dims().Vectorize(); CHECK(op_info->HasAttr("axes")) << "[NPU] unsqueeze not support axes from tensor now"; - auto unsqueeze_node = graph->AddNode(out_var_name); - unsqueeze_node->set_input_tensor(*graph->GetNode(x_var_name)); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Unsqueeze node + auto unsqueeze_node = graph->AddNode(out_name); + unsqueeze_node->set_input_tensor(*x_node); unsqueeze_node->set_attr_shape( ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); return REBUILD_WHEN_SHAPE_CHANGED; diff --git a/lite/kernels/npu/bridges/utility.cc b/lite/kernels/npu/bridges/utility.cc index fdee98cc86..f79936c5d7 100644 --- a/lite/kernels/npu/bridges/utility.cc +++ b/lite/kernels/npu/bridges/utility.cc @@ -44,12 +44,21 @@ ge::DataType CvtPrecisionType(PrecisionType itype) { case PRECISION(kFloat): otype = ge::DT_FLOAT; break; + case PRECISION(kFP16): + otype = ge::DT_FLOAT16; + break; case PRECISION(kInt8): otype = ge::DT_INT8; break; + case PRECISION(kInt16): + otype = ge::DT_INT16; + break; case PRECISION(kInt32): otype = ge::DT_INT32; break; + case PRECISION(kInt64): + otype = ge::DT_INT64; + break; default: LOG(FATAL) << "[NPU] Can not convert 
precision type(" << PrecisionToStr(itype) << ") from Lite to NPU"; @@ -64,6 +73,9 @@ ge::Format CvtDataLayoutType(DataLayoutType itype) { case DATALAYOUT(kNCHW): otype = ge::FORMAT_NCHW; break; + case DATALAYOUT(kNHWC): + otype = ge::FORMAT_NHWC; + break; // TODO(hong19860320) support more data layout type default: LOG(FATAL) << "[NPU] Can not convert data layout type(" @@ -75,39 +87,22 @@ ge::Format CvtDataLayoutType(DataLayoutType itype) { ge::TensorPtr CvtTensor(const Tensor& in_tensor, std::vector out_shape, - PrecisionType in_ptype, - DataLayoutType in_ltype) { - const uint8_t* in_data = nullptr; + PrecisionType in_precision, + DataLayoutType in_layout) { auto in_size = in_tensor.dims().production(); auto in_shape = in_tensor.dims().Vectorize(); if (out_shape.empty()) { out_shape = in_shape; } - int in_bytes; - if (in_ptype == PRECISION(kFloat)) { - in_data = reinterpret_cast(in_tensor.data()); - in_bytes = in_size * sizeof(float); - } else if (in_ptype == PRECISION(kInt32)) { - in_data = reinterpret_cast(in_tensor.data()); - in_bytes = in_size * sizeof(int32_t); - } else if (in_ptype == PRECISION(kInt8)) { - in_data = reinterpret_cast(in_tensor.data()); - in_bytes = in_size * sizeof(int8_t); - } else { - LOG(FATAL) << "[NPU] Unknow precision type " << PrecisionToStr(in_ptype); - } - ge::DataType out_ptype = CvtPrecisionType(in_ptype); - ge::Format out_ltype = CvtDataLayoutType(in_ltype); - - ge::TensorDesc out_desc(ge::Shape(out_shape), out_ltype, out_ptype); - CHECK_EQ(out_ltype, ge::FORMAT_NCHW); - + ge::TensorDesc out_desc(ge::Shape(out_shape), + CvtDataLayoutType(in_layout), + CvtPrecisionType(in_precision)); auto out_size = out_desc.GetShape().GetShapeSize(); CHECK_EQ(out_size, in_size); - ge::TensorPtr out_tensor = std::make_shared(); out_tensor->SetTensorDesc(out_desc); - out_tensor->SetData(in_data, in_bytes); + out_tensor->SetData(reinterpret_cast(in_tensor.raw_data()), + in_tensor.memory_size()); return out_tensor; } diff --git 
a/lite/kernels/npu/bridges/utility.h b/lite/kernels/npu/bridges/utility.h index db8086edde..e8300a0472 100644 --- a/lite/kernels/npu/bridges/utility.h +++ b/lite/kernels/npu/bridges/utility.h @@ -72,8 +72,8 @@ ge::Format CvtDataLayoutType(DataLayoutType itype); ge::TensorPtr CvtTensor(const Tensor& in_tensor, std::vector out_shape = {}, - PrecisionType in_ptype = PRECISION(kFloat), - DataLayoutType in_ltype = DATALAYOUT(kNCHW)); + PrecisionType in_precision = PRECISION(kFloat), + DataLayoutType in_layout = DATALAYOUT(kNCHW)); template ge::TensorPtr CreateTensorAndFillData(const std::vector& data, @@ -85,8 +85,12 @@ ge::TensorPtr CreateTensorAndFillData(const std::vector& data, type = ge::DT_FLOAT; } else if (info == typeid(int8_t)) { type = ge::DT_INT8; + } else if (info == typeid(int16_t)) { + type = ge::DT_INT16; } else if (info == typeid(int32_t)) { type = ge::DT_INT32; + } else if (info == typeid(int64_t)) { + type = ge::DT_INT64; } else { LOG(FATAL) << "[NPU] Unknow value type " << info.name(); } diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc index def7717a97..c6cbea46fa 100644 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -29,19 +29,9 @@ namespace npu { int SubgraphEngine::BuildDeviceProgram() { int status = 0; - // Convert all of input data vars and added into the HiAI IR graph + // Convert all of ops and their input vars and weights and added into the NPU + // HiAI IR graph subgraph::npu::Graph graph; - for (auto& input_name : input_names_) { - auto input_tensor = scope_->FindMutableTensor(input_name); - CHECK(input_tensor); - auto input_node = - graph.AddNode(input_name, input_tensor->dims().Vectorize()); - CHECK(input_node); - // HiAI DDK doesn't support dynamic dimensions/shapes, so need to rebuild - // the program when the shape of any input tensor is changed. 
- status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED; - } - // Convert all of ops and its weights and added into the HiAI IR graph const auto& bridges = subgraph::Registry::Instance(); for (auto& inst : origin_program_) { auto op = inst.op(); @@ -52,29 +42,56 @@ int SubgraphEngine::BuildDeviceProgram() { if (!bridges.Exists("NPU", op_type)) { return subgraph::FAILED; } + auto kernel = inst.kernel(); status |= bridges.Select("NPU", op_type)(reinterpret_cast(&graph), - const_cast(op)); + const_cast(op), + const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { return subgraph::FAILED; } } - // Set the input and output nodes of the HiAI IR graph - std::vector input_nodes, output_nodes; + // Collect the valid input and output nodes in the HiAI IR graph and update + // the input and output names + device_inames_.clear(); + device_onames_.clear(); + std::vector device_inodes; + std::vector device_onodes; for (auto& input_name : input_names_) { - input_nodes.push_back(*graph.GetNode(input_name)); + if (graph.HasNode(input_name)) { + if (!graph.GetType(input_name).persistable()) { + device_inodes.push_back(*graph.GetNode(input_name)); + device_inames_.push_back(input_name); + } else { + LOG(WARNING) << "[NPU] Input node " << input_name + << " is skipped because it is a persistable node."; + } + } else { + LOG(WARNING) << "[NPU] Input node " << input_name + << " is skipped because it does not exist."; + } } for (auto& output_name : output_names_) { - output_nodes.push_back(*graph.GetNode(output_name)); + if (graph.HasNode(output_name)) { + device_onodes.push_back(*graph.GetNode(output_name)); + device_onames_.push_back(output_name); + } else { + LOG(WARNING) << "[NPU] Output node " << output_name + << " is skipped because it does not exist."; + } } - // Build the HiAI IR graph to HiAI om model - device_program_ = - lite::npu::Device::Global().Build(model_name_, input_nodes, output_nodes); + CHECK(!device_inames_.empty()) + << "[NPU] No input nodes found for building NPU 
model"; + CHECK(!device_onames_.empty()) + << "[NPU] No output nodes found for building NPU model"; + // Build the HiAI IR graph to HiAI om model as the device program + device_program_ = lite::npu::Device::Global().Build( + model_name_, device_inodes, device_onodes); if (device_program_ == nullptr) { LOG(WARNING) << "[NPU] Build model failed!"; return subgraph::FAILED; } - // Query and check the dimensions of input and output tensors + // Query and check the dimensions of valid input and output tensors std::vector device_idims, device_odims; if (device_program_->GetModelIOTensorDim( model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) { @@ -82,44 +99,75 @@ int SubgraphEngine::BuildDeviceProgram() { << "[NPU] Get the dimensions of input and output tensors failed!"; return subgraph::FAILED; } - CHECK_EQ(device_idims.size(), input_names_.size()); - CHECK_EQ(device_odims.size(), output_names_.size()); - origin_idims_.resize(input_names_.size()); - origin_itensors_.resize(input_names_.size()); - device_idatasizes_.resize(input_names_.size()); - device_itensors_.resize(input_names_.size()); - origin_odims_.resize(output_names_.size()); - origin_otensors_.resize(output_names_.size()); - device_odatasizes_.resize(output_names_.size()); - device_otensors_.resize(output_names_.size()); - for (int i = 0; i < input_names_.size(); i++) { - origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); + CHECK_EQ(device_idims.size(), device_inames_.size()); + CHECK_EQ(device_odims.size(), device_onames_.size()); + origin_idims_.resize(device_inames_.size()); + origin_itensors_.resize(device_inames_.size()); + device_itensors_.resize(device_inames_.size()); + origin_odims_.resize(device_onames_.size()); + origin_otensors_.resize(device_onames_.size()); + device_otensors_.resize(device_onames_.size()); + for (int i = 0; i < device_inames_.size(); i++) { + auto type = graph.GetType(device_inames_[i]); + auto precision = type.precision(); + auto layout = type.layout(); 
+ origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); CHECK(origin_itensors_[i]); origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "[NPU] Input dims[" << i << "]: {" << device_idims[i].GetNumber() - << "," << device_idims[i].GetChannel() << "," + VLOG(3) << "[NPU] Inputs[" << i + << "] precision: " << PrecisionToStr(precision) + << " layout: " << DataLayoutToStr(layout) << " dims: {" + << device_idims[i].GetNumber() << "," + << device_idims[i].GetChannel() << "," << device_idims[i].GetHeight() << "," << device_idims[i].GetWidth() << "}"; - device_idatasizes_[i] = - device_idims[i].GetNumber() * device_idims[i].GetChannel() * - device_idims[i].GetHeight() * device_idims[i].GetWidth(); - CHECK_EQ(device_idatasizes_[i], origin_idims_[i].production()); + // Prepare the device input tensors + CHECK_EQ(origin_idims_[i].production(), + device_idims[i].GetNumber() * device_idims[i].GetChannel() * + device_idims[i].GetHeight() * device_idims[i].GetWidth()); device_itensors_[i].reset(new hiai::AiTensor); device_itensors_[i]->Init(&(device_idims[i])); } - for (int i = 0; i < output_names_.size(); i++) { - origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); + for (int i = 0; i < device_onames_.size(); i++) { + auto type = graph.GetType(device_onames_[i]); + auto precision = type.precision(); + auto layout = type.layout(); + origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); CHECK(origin_otensors_[i]); origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "[NPU] Output dims[" << i << "]: {" + VLOG(3) << "[NPU] Outputs[" << i + << "] precision: " << PrecisionToStr(precision) + << " layout: " << DataLayoutToStr(layout) << " dims: {" << device_odims[i].GetNumber() << "," << device_odims[i].GetChannel() << "," << device_odims[i].GetHeight() << "," << device_odims[i].GetWidth() << "}"; - device_odatasizes_[i] = - device_odims[i].GetNumber() * device_odims[i].GetChannel() * - device_odims[i].GetHeight() * 
device_odims[i].GetWidth(); - CHECK_EQ(device_odatasizes_[i], origin_odims_[i].production()); + // Prepare the device output tensors + switch (precision) { + case PRECISION(kFloat): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt8): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt16): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt32): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt64): + origin_otensors_[i]->mutable_data(); + break; + default: + LOG(FATAL) << "[NPU] " << device_onames_[i] + << " can't mutable data with precision type " + << PrecisionToStr(precision); + break; + } + CHECK_EQ(origin_odims_[i].production(), + device_odims[i].GetNumber() * device_odims[i].GetChannel() * + device_odims[i].GetHeight() * device_odims[i].GetWidth()); device_otensors_[i].reset(new hiai::AiTensor); device_otensors_[i]->Init(&(device_odims[i])); } @@ -128,10 +176,10 @@ int SubgraphEngine::BuildDeviceProgram() { int SubgraphEngine::LaunchDeviceProgram() { // Copy the data of origin input tensors to the buffer of input HiAI tensors - for (size_t i = 0; i < input_names_.size(); i++) { - std::memcpy(static_cast(device_itensors_[i]->GetBuffer()), - origin_itensors_[i]->mutable_data(), - sizeof(float) * static_cast(device_idatasizes_[i])); + for (size_t i = 0; i < device_itensors_.size(); i++) { + std::memcpy(device_itensors_[i]->GetBuffer(), + origin_itensors_[i]->raw_data(), + origin_itensors_[i]->memory_size()); } // Run the HiAI model by name std::string key = "model_name"; // Note: key seems must be model_name @@ -149,10 +197,10 @@ int SubgraphEngine::LaunchDeviceProgram() { hiai::AI_SUCCESS); VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; // Copy the data of output HiAI tensor to the buffer of origin output tensors - for (size_t i = 0; i < output_names_.size(); i++) { - std::memcpy(origin_otensors_[i]->mutable_data(), - 
static_cast(device_otensors_[i]->GetBuffer()), - sizeof(float) * static_cast(device_odatasizes_[i])); + for (size_t i = 0; i < device_otensors_.size(); i++) { + std::memcpy(const_cast(origin_otensors_[i]->raw_data()), + device_otensors_[i]->GetBuffer(), + device_otensors_[i]->GetSize()); } return 0; } diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h index cc6ca9c13d..dd0bf82bc9 100644 --- a/lite/kernels/npu/subgraph_compute.h +++ b/lite/kernels/npu/subgraph_compute.h @@ -43,8 +43,8 @@ class SubgraphEngine : public subgraph::Engine { std::string model_name_; hiai::AiContext model_context_; - std::vector device_idatasizes_; - std::vector device_odatasizes_; + std::vector device_inames_; + std::vector device_onames_; std::vector> device_itensors_; std::vector> device_otensors_; std::unique_ptr device_program_{nullptr}; diff --git a/lite/kernels/x86/lookup_table_compute.cc b/lite/kernels/x86/lookup_table_compute.cc index 856a07a94c..baac8c47a6 100644 --- a/lite/kernels/x86/lookup_table_compute.cc +++ b/lite/kernels/x86/lookup_table_compute.cc @@ -24,7 +24,7 @@ //, REGISTER_LITE_KERNEL(lookup_table, kX86, - kInt64, + kFloat, kNCHW, paddle::lite::kernels::x86::LookupTableCompute, def) @@ -34,7 +34,7 @@ REGISTER_LITE_KERNEL(lookup_table, .Finalize(); REGISTER_LITE_KERNEL(lookup_table_v2, kX86, - kInt64, + kFloat, kNCHW, paddle::lite::kernels::x86::LookupTableCompute, def) diff --git a/lite/kernels/x86/lookup_table_compute.h b/lite/kernels/x86/lookup_table_compute.h index d5719f332c..eeafa2e33e 100644 --- a/lite/kernels/x86/lookup_table_compute.h +++ b/lite/kernels/x86/lookup_table_compute.h @@ -24,7 +24,7 @@ namespace kernels { namespace x86 { template -class LookupTableCompute : public KernelLite { +class LookupTableCompute : public KernelLite { public: using param_t = operators::LookupTableParam; diff --git a/lite/kernels/x86/lookup_table_compute_test.cc b/lite/kernels/x86/lookup_table_compute_test.cc index 86b2d39186..9c11c67240 
100644 --- a/lite/kernels/x86/lookup_table_compute_test.cc +++ b/lite/kernels/x86/lookup_table_compute_test.cc @@ -79,4 +79,4 @@ TEST(lookup_table_x86, compute) { } // namespace lite } // namespace paddle -USE_LITE_KERNEL(lookup_table, kX86, kInt64, kNCHW, def); +USE_LITE_KERNEL(lookup_table, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/stack_compute.cc b/lite/kernels/x86/stack_compute.cc index 5f69319a6c..93479b02ee 100644 --- a/lite/kernels/x86/stack_compute.cc +++ b/lite/kernels/x86/stack_compute.cc @@ -21,5 +21,5 @@ REGISTER_LITE_KERNEL(stack, paddle::lite::kernels::x86::StackCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); diff --git a/lite/kernels/xpu/bridges/CMakeLists.txt b/lite/kernels/xpu/bridges/CMakeLists.txt index f44fcecdaf..c0388e8a2c 100644 --- a/lite/kernels/xpu/bridges/CMakeLists.txt +++ b/lite/kernels/xpu/bridges/CMakeLists.txt @@ -14,6 +14,11 @@ lite_cc_library(subgraph_bridge_pool_op_xpu SRCS pool_op.cc DEPS ${subgraph_brid lite_cc_library(subgraph_bridge_softmax_op_xpu SRCS softmax_op.cc DEPS ${subgraph_bridge_deps_xpu}) lite_cc_library(subgraph_bridge_mul_op_xpu SRCS mul_op.cc DEPS ${xpu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_batch_norm_op_xpu SRCS batch_norm_op.cc DEPS ${xpu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_stack_op_xpu SRCS stack_op.cc DEPS ${xpu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_gather_op_xpu SRCS gather_op.cc DEPS ${xpu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_scale_op_xpu SRCS scale_op.cc DEPS ${xpu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_lookup_table_op_xpu SRCS lookup_table_op.cc DEPS ${xpu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_slice_op_xpu SRCS slice_op.cc DEPS ${xpu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_transpose_op_xpu SRCS transpose_op.cc DEPS 
${xpu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_reshape_op_xpu SRCS reshape_op.cc DEPS ${xpu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_layer_norm_op_xpu SRCS layer_norm_op.cc DEPS ${xpu_subgraph_bridge_deps}) @@ -30,6 +35,11 @@ set(xpu_subgraph_bridges subgraph_bridge_softmax_op_xpu subgraph_bridge_mul_op_xpu subgraph_bridge_batch_norm_op_xpu + subgraph_bridge_stack_op_xpu + subgraph_bridge_gather_op_xpu + subgraph_bridge_scale_op_xpu + subgraph_bridge_lookup_table_op_xpu + subgraph_bridge_slice_op_xpu subgraph_bridge_transpose_op_xpu subgraph_bridge_reshape_op_xpu subgraph_bridge_layer_norm_op_xpu diff --git a/lite/kernels/xpu/bridges/act_op.cc b/lite/kernels/xpu/bridges/act_op.cc index 7536ec264d..f674af84ca 100644 --- a/lite/kernels/xpu/bridges/act_op.cc +++ b/lite/kernels/xpu/bridges/act_op.cc @@ -21,21 +21,42 @@ namespace lite { namespace subgraph { namespace xpu { -int ActConverter(void* ctx, OpLite* op) { +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); auto op_info = op->op_info(); auto op_type = op_info->Type(); + auto scope = op->scope(); VLOG(3) << "[XPU] Converting " + op_type + "..."; - // Create act node and set params from op - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); - CHECK(graph->HasNode(x_var_name)); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + + // X node + std::shared_ptr x_node = nullptr; + if 
(graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Act node if (op_type == "relu") { - graph->AddNode(out_var_name, - graph->builder_.CreateRelu(*graph->GetNode(x_var_name))); + graph->AddNode(out_name, graph->builder_.CreateRelu(*x_node)); + } else if (op_type == "tanh") { + graph->AddNode(out_name, graph->builder_.CreateUnaryOp("tanh", *x_node)); + } else if (op_type == "gelu") { + graph->AddNode(out_name, graph->builder_.CreateGelu(*x_node)); } else { // TODO(hong19860320) supports more activation ops LOG(WARNING) << "[XPU] Unsupported activation type " << op_type; @@ -50,3 +71,5 @@ int ActConverter(void* ctx, OpLite* op) { } // namespace paddle REGISTER_SUBGRAPH_BRIDGE(XPU, relu, paddle::lite::subgraph::xpu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(XPU, tanh, paddle::lite::subgraph::xpu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(XPU, gelu, paddle::lite::subgraph::xpu::ActConverter); diff --git a/lite/kernels/xpu/bridges/act_op_test.cc b/lite/kernels/xpu/bridges/act_op_test.cc deleted file mode 100644 index 1a3efab46e..0000000000 --- a/lite/kernels/xpu/bridges/act_op_test.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/xpu/bridges/registry.h" -#include "lite/kernels/xpu/bridges/test_helper.h" -#include "lite/operators/activation_ops.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -void relu_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_data = x->data(); - auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - DDim out_dims = out->dims(); - CHECK_EQ(x_dims.production(), out_dims.production()); - for (int i = 0; i < out_dims.production(); i++) { - out_data[i] = std::max(0.f, x_data[i]); - } -} - -void test_relu(int bs, int ic, int ih, int iw) { - // prepare input&output variables - Scope scope; - std::string x_var_name("x"); - std::string out_var_name("out"); - std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("relu"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - - // create and convert op to XPU model, and run it on XPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - relu_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, relu) { - for (auto bs : 
{1, 3}) { - for (auto ic : {3, 4}) { - for (auto ih : {2, 5}) { - for (auto iw : {5, 9}) { - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw; - test_relu(bs, ic, ih, iw); - } - } - } - } -} - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(relu); -USE_XPU_BRIDGE(relu); diff --git a/lite/kernels/xpu/bridges/batch_norm_op.cc b/lite/kernels/xpu/bridges/batch_norm_op.cc index 4ca107679b..980f241660 100644 --- a/lite/kernels/xpu/bridges/batch_norm_op.cc +++ b/lite/kernels/xpu/bridges/batch_norm_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace xpu { -int BatchNormConverter(void* ctx, OpLite* op) { +int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,35 +30,62 @@ int BatchNormConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[XPU] Converting " + op_type + "..."; - // Get input vars and op attributes - auto x_var_name = op_info->Input("X").front(); - auto scale_var_name = op_info->Input("Scale").front(); - auto* scale = scope->FindMutableTensor(scale_var_name); - auto bias_var_name = op_info->Input("Bias").front(); - auto* bias = scope->FindMutableTensor(bias_var_name); - auto mean_var_name = op_info->Input("Mean").front(); - auto* mean = scope->FindMutableTensor(mean_var_name); - auto variance_var_name = op_info->Input("Variance").front(); - auto* variance = scope->FindMutableTensor(variance_var_name); - auto y_var_name = op_info->Output("Y").front(); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto scale_name = op_info->Input("Scale").front(); + auto 
scale_type = kernel->GetInputDeclType("Scale"); + CHECK(scale_type->precision() == PRECISION(kFloat)); + CHECK(scale_type->layout() == DATALAYOUT(kNCHW)); + auto scale = scope->FindMutableTensor(scale_name); + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto mean_name = op_info->Input("Mean").front(); + auto mean_type = kernel->GetInputDeclType("Mean"); + CHECK(mean_type->precision() == PRECISION(kFloat)); + CHECK(mean_type->layout() == DATALAYOUT(kNCHW)); + auto mean = scope->FindMutableTensor(mean_name); + auto variance_name = op_info->Input("Variance").front(); + auto variance_type = kernel->GetInputDeclType("Variance"); + CHECK(variance_type->precision() == PRECISION(kFloat)); + CHECK(variance_type->layout() == DATALAYOUT(kNCHW)); + auto variance = scope->FindMutableTensor(variance_name); + auto y_name = op_info->Output("Y").front(); + auto y_type = kernel->GetOutputDeclType("Y"); + CHECK(y_type->precision() == PRECISION(kFloat)); + CHECK(y_type->layout() == DATALAYOUT(kNCHW)); auto epsilon = op_info->GetAttr("epsilon"); - // Create scale, bias, mean, variance nodes - auto scale_const_node = graph->AddNode(scale_var_name, *scale); - auto bias_const_node = graph->AddNode(bias_var_name, *bias); - auto mean_const_node = graph->AddNode(mean_var_name, *mean); - auto variance_const_node = graph->AddNode(variance_var_name, *variance); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } - // Create batch_norm node and set params from op - auto batch_norm_node = - graph->builder_.CreateBatchNorm(*graph->GetNode(x_var_name), - *scale_const_node, - *bias_const_node, - *mean_const_node, - *variance_const_node, - 1, - epsilon); - 
graph->AddNode(y_var_name, graph->builder_.GetField(batch_norm_node, 0)); + // Scale, Bias, Mean, Variance node + auto scale_const_node = graph->AddNode(scale_name, *scale); + auto bias_const_node = graph->AddNode(bias_name, *bias); + auto mean_const_node = graph->AddNode(mean_name, *mean); + auto variance_const_node = graph->AddNode(variance_name, *variance); + + // Batch Norm node and extract the first field as the output node + auto batch_norm_node = graph->builder_.CreateBatchNorm(*x_node, + *scale_const_node, + *bias_const_node, + *mean_const_node, + *variance_const_node, + 1, + epsilon); + graph->AddNode(y_name, graph->builder_.GetField(batch_norm_node, 0)); return SUCCESS; } diff --git a/lite/kernels/xpu/bridges/conv_op.cc b/lite/kernels/xpu/bridges/conv_op.cc index 2abddf9e4b..5e9e5448a1 100644 --- a/lite/kernels/xpu/bridges/conv_op.cc +++ b/lite/kernels/xpu/bridges/conv_op.cc @@ -22,7 +22,7 @@ namespace lite { namespace subgraph { namespace xpu { -int ConvConverter(void* ctx, OpLite* op) { +int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -31,14 +31,23 @@ int ConvConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[XPU] Converting " << op_type << "... 
"; - // Get input, filter and op attributes - auto input_var_name = op_info->Input("Input").front(); - auto input = scope->FindVar(input_var_name)->GetMutable(); + // Get input and output vars and op attributes + auto input_name = op_info->Input("Input").front(); + auto input_type = kernel->GetInputDeclType("Input"); + CHECK(input_type->precision() == PRECISION(kFloat)); + CHECK(input_type->layout() == DATALAYOUT(kNCHW)); + auto input = scope->FindMutableTensor(input_name); auto input_dims = input->dims(); - auto filter_var_name = op_info->Input("Filter").front(); - auto filter = scope->FindVar(filter_var_name)->GetMutable(); + auto filter_name = op_info->Input("Filter").front(); + auto filter_type = kernel->GetInputDeclType("Filter"); + CHECK(filter_type->precision() == PRECISION(kFloat)); + CHECK(filter_type->layout() == DATALAYOUT(kNCHW)); + auto filter = scope->FindMutableTensor(filter_name); auto filter_dims = filter->dims(); - auto output_var_name = op_info->Output("Output").front(); + auto output_name = op_info->Output("Output").front(); + auto output_type = kernel->GetOutputDeclType("Output"); + CHECK(output_type->precision() == PRECISION(kFloat)); + CHECK(output_type->layout() == DATALAYOUT(kNCHW)); auto bs = input_dims[0]; auto oc = filter_dims[0]; CHECK_EQ(input_dims.size(), 4); @@ -51,6 +60,14 @@ int ConvConverter(void* ctx, OpLite* op) { CHECK_EQ(strides.size(), 2L); CHECK_EQ(dilations.size(), 2L); + // Input node + std::shared_ptr input_node = nullptr; + if (graph->HasNode(input_name)) { + input_node = graph->GetNode(input_name); + } else { + input_node = graph->AddNode(input_name, input_dims); + } + if (paddings.size() == 2L) { for (size_t i = 0; i < strides.size(); ++i) { int copy_pad = *(paddings.begin() + 2 * i); @@ -81,14 +98,14 @@ int ConvConverter(void* ctx, OpLite* op) { } DDim output_dims(output_shape); - // Create filter node - auto filter_const_node = graph->AddNode(filter_var_name, *filter); + // Filter node + auto filter_const_node = 
graph->AddNode(filter_name, *filter); - // Create conv node and set input, filter, bias nodes and attributes + // Conv node auto conv_attrs = xtcl::make_node(); - conv_attrs->strides = std::move(CvtShape(strides)); - conv_attrs->padding = std::move(CvtShape(paddings)); - conv_attrs->dilation = std::move(CvtShape(dilations)); + conv_attrs->strides = std::move(CvtShape(strides)); + conv_attrs->padding = std::move(CvtShape(paddings)); + conv_attrs->dilation = std::move(CvtShape(dilations)); conv_attrs->groups = groups; // conv_attrs->channels = nullptr; conv_attrs->kernel_size = std::move(xtcl::Array(nullptr)); @@ -96,19 +113,22 @@ int ConvConverter(void* ctx, OpLite* op) { conv_attrs->kernel_layout = "OIHW"; conv_attrs->out_layout = ""; // conv_attrs->out_dtype = ""; - auto conv_node = graph->AddNode( - output_var_name, - graph->builder_.CreateConv2D( - *graph->GetNode(input_var_name), *filter_const_node, conv_attrs)); + auto conv_node = + graph->AddNode(output_name, + graph->builder_.CreateConv2D( + *input_node, *filter_const_node, conv_attrs)); - // Create bias node if exists bias + // Add bias node if exists bias // supports the bias nodes with the following dimensions // 0: {oc} // 1: {1, oc, oh, ow} // 2: {n, oc, oh, ow} if (HasInputArg(op_info, scope, "Bias")) { - auto bias_var_name = op_info->Input("Bias").front(); - auto* bias = scope->FindVar(bias_var_name)->GetMutable(); + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); auto bias_dims = bias->dims(); auto bias_data_size = bias_dims.production(); auto output_data_size = output_dims.production(); @@ -130,21 +150,21 @@ int ConvConverter(void* ctx, OpLite* op) { << output_dims; } std::shared_ptr bias_node = nullptr; - if (graph->HasNode(bias_var_name)) { + if (graph->HasNode(bias_name)) { // Bias node 
from input node - bias_node = graph->GetNode(bias_var_name); + bias_node = graph->GetNode(bias_name); } else { - // Bias node with const tensor - bias_node = graph->AddNode(bias_var_name, *bias, bias_shape); + // Bias node with const data + bias_node = graph->AddNode(bias_name, *bias, bias_shape); } std::shared_ptr add_node = nullptr; if (is_channel_bias) { add_node = graph->AddNode( - output_var_name, + output_name, graph->builder_.CreateBiasAdd(*conv_node, 1, *bias_node)); } else { add_node = graph->AddNode( - output_var_name, + output_name, graph->builder_.CreateBinaryOp("add", *conv_node, *bias_node)); } conv_node = add_node; @@ -152,7 +172,7 @@ int ConvConverter(void* ctx, OpLite* op) { if (fuse_relu) { // Append relu node if fuse_relu is true - graph->AddNode(output_var_name, graph->builder_.CreateRelu(*conv_node)); + graph->AddNode(output_name, graph->builder_.CreateRelu(*conv_node)); } return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/xpu/bridges/elementwise_ops.cc b/lite/kernels/xpu/bridges/elementwise_ops.cc index 0ab6cc1091..49a42c55d6 100644 --- a/lite/kernels/xpu/bridges/elementwise_ops.cc +++ b/lite/kernels/xpu/bridges/elementwise_ops.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace xpu { -int ElementwiseConverter(void* ctx, OpLite* op) { +int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(op != nullptr); CHECK(ctx != nullptr); auto graph = static_cast(ctx); @@ -30,39 +30,49 @@ int ElementwiseConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[XPU] Converting " + op_type + "..."; - // Get input, and attributes - auto x_var_name = op_info->Input("X").front(); - auto y_var_name = op_info->Input("Y").front(); - auto out_var_name = op_info->Output("Out").front(); - auto axis = op_info->GetAttr("axis"); - auto x = scope->FindMutableTensor(x_var_name); - auto y = scope->FindMutableTensor(y_var_name); + // Get input and output vars and op attributes + auto x_name = 
op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto y_name = op_info->Input("Y").front(); + auto y_type = kernel->GetInputDeclType("Y"); + CHECK(y_type->precision() == PRECISION(kFloat)); + CHECK(y_type->layout() == DATALAYOUT(kNCHW)); + auto y = scope->FindMutableTensor(y_name); auto y_dims = y->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + auto axis = op_info->GetAttr("axis"); - // Create x and y node + // X node std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_var_name)) { - x_node = graph->GetNode(x_var_name); + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); } else { - x_node = graph->AddNode(x_var_name, *x); + x_node = graph->AddNode(x_name, x_dims); } + // Y node std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_var_name)) { - y_node = graph->GetNode(y_var_name); + if (graph->HasNode(y_name)) { + y_node = graph->GetNode(y_name); } else { - y_node = graph->AddNode(y_var_name, *y); + y_node = graph->AddNode(y_name, y_dims); } - // Create elementwise node and set input, attributes + // Elementwise node std::shared_ptr elementwise_node = nullptr; if (y_dims.size() == 1) { elementwise_node = graph->AddNode( - out_var_name, graph->builder_.CreateBiasAdd(*x_node, axis, *y_node)); + out_name, graph->builder_.CreateBiasAdd(*x_node, axis, *y_node)); } else if (x_dims.size() == y_dims.size()) { elementwise_node = graph->AddNode( - out_var_name, graph->builder_.CreateBinaryOp("add", *x_node, *y_node)); + out_name, graph->builder_.CreateBinaryOp("add", *x_node, *y_node)); } else { LOG(WARNING) << "[XPU] elementwise_add only support y of one dimension, or x " diff --git 
a/lite/kernels/xpu/bridges/gather_op.cc b/lite/kernels/xpu/bridges/gather_op.cc new file mode 100644 index 0000000000..06d1c67b0d --- /dev/null +++ b/lite/kernels/xpu/bridges/gather_op.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/graph.h" +#include "lite/kernels/xpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace xpu { + +int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[XPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto index_name = op_info->Input("Index").front(); + auto index_type = kernel->GetInputDeclType("Index"); + CHECK(index_type->precision() == PRECISION(kInt32) || + index_type->precision() == PRECISION(kInt64)); + CHECK(index_type->layout() == DATALAYOUT(kNCHW)); + auto index = scope->FindMutableTensor(index_name); + auto 
index_dims = index->dims(); + CHECK(index_dims.size() == 1 || + (index_dims.size() == 2 && index_dims[1] == 1)); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Index node + std::shared_ptr index_node = nullptr; + if (graph->HasNode(index_name)) { + index_node = graph->GetNode(index_name); + } else { + index_node = graph->AddNode( + index_name, index_dims, index_type->precision(), index_type->layout()); + } + // Flatten index node + if (index_dims.size() != 1) { + index_node = + graph->AddNode(index_name + "/reshape", + graph->builder_.CreateReshape(*index_node, {-1}), + index_type->precision(), + index_type->layout()); + } + + // Reshape the gather node with the inferred shape as the output node + auto gather_node = graph->AddNode( + out_name, + graph->builder_.CreateGather(*x_node, *index_node, /* axis= */ 0)); + if (out_dims.size() != 2) { + graph->AddNode(out_name, + graph->builder_.CreateReshape( + *gather_node, CvtShape(out_dims))); + } + return SUCCESS; +} + +} // namespace xpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(XPU, + gather, + paddle::lite::subgraph::xpu::GatherConverter); diff --git a/lite/kernels/xpu/bridges/graph.cc b/lite/kernels/xpu/bridges/graph.cc index 3d35e21972..1691e4b0c5 100644 --- a/lite/kernels/xpu/bridges/graph.cc +++ b/lite/kernels/xpu/bridges/graph.cc @@ -22,7 +22,9 @@ namespace subgraph { namespace xpu { std::shared_ptr Graph::AddNode(const std::string& name, - const xtcl::xExpr& layer) { + const xtcl::xExpr& layer, + PrecisionType precision, + DataLayoutType layout) 
{ auto unique_name = [&](const std::string& key) { int idx = 1; auto it = counts_.find(key); @@ -35,8 +37,9 @@ std::shared_ptr Graph::AddNode(const std::string& name, }; auto it = nodes_.find(name); if (it != nodes_.end()) { - CHECK(params_.find(name) == params_.end()) << "[XPU] Node " << name - << " redefined."; + // Only variable can rebind the name + CHECK(!it->second.second.persistable()) << "[XPU] Node " << name + << " redefined."; // Generate a new unique name as the key to bind the origin node if the // origin node isn't a const node: new_name->node nodes_.insert(std::make_pair(unique_name(name + "_var"), it->second)); @@ -44,7 +47,8 @@ std::shared_ptr Graph::AddNode(const std::string& name, } // Create a new node and bind with the name: name->new_node auto node = std::make_shared(layer); - nodes_.insert(std::make_pair(name, node)); + nodes_.insert(std::make_pair( + name, std::make_pair(node, Type(precision, layout, false)))); builder_.SetLayer(unique_name(name + "_op")); return node; } @@ -52,31 +56,36 @@ std::shared_ptr Graph::AddNode(const std::string& name, // Const node std::shared_ptr Graph::AddNode(const std::string& name, const Tensor& tensor, - PrecisionType ptype, - DataLayoutType ltype) { - return AddNode(name, tensor, tensor.dims().Vectorize(), ptype, ltype); + PrecisionType precision, + DataLayoutType layout) { + return AddNode(name, tensor, tensor.dims().Vectorize(), precision, layout); } std::shared_ptr Graph::AddNode(const std::string& name, const Tensor& tensor, std::vector shape, - PrecisionType ptype, - DataLayoutType ltype) { - auto node = AddNode(name, shape, ptype, ltype); + PrecisionType precision, + DataLayoutType layout) { + CHECK(!HasNode(name)) << "[XPU] Node " << name << " redefined."; + auto node = std::make_shared(builder_.CreateTensor( + name, CvtShape(shape), CvtPrecisionType(precision))); + nodes_.insert(std::make_pair( + name, std::make_pair(node, Type(precision, layout, true)))); params_.emplace( - std::make_pair(name,
*CvtTensor(tensor, shape, ptype, ltype))); + std::make_pair(name, *CvtTensor(tensor, shape, precision, layout))); return node; } // Data node std::shared_ptr Graph::AddNode(const std::string& name, std::vector shape, - PrecisionType ptype, - DataLayoutType ltype) { - CHECK(!HasNode(name)); - auto node = std::make_shared( - builder_.CreateTensor(name, CvtShape(shape), CvtPrecisionType(ptype))); - nodes_.insert(std::make_pair(name, node)); + PrecisionType precision, + DataLayoutType layout) { + CHECK(!HasNode(name)) << "[XPU] Node " << name << " redefined."; + auto node = std::make_shared(builder_.CreateTensor( + name, CvtShape(shape), CvtPrecisionType(precision))); + nodes_.insert(std::make_pair( + name, std::make_pair(node, Type(precision, layout, false)))); return node; } diff --git a/lite/kernels/xpu/bridges/graph.h b/lite/kernels/xpu/bridges/graph.h index 0b43a8435a..3107346851 100644 --- a/lite/kernels/xpu/bridges/graph.h +++ b/lite/kernels/xpu/bridges/graph.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "lite/core/op_lite.h" #include "lite/core/tensor.h" @@ -27,42 +28,75 @@ namespace lite { namespace subgraph { namespace xpu { -// The Context of the converters which used for converting the ops of subgraph -// to the XPU IR graph +// Type of graph nodes +class Type { + public: + Type(PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW), + bool persistable = false) + : precision_(precision), layout_(layout), persistable_(persistable) {} + + void set_precision(PrecisionType precision) { precision_ = precision; } + void set_layout(DataLayoutType layout) { layout_ = layout; } + void set_persistable(bool persistable) { persistable_ = persistable; } + + PrecisionType precision() const { return precision_; } + DataLayoutType layout() const { return layout_; } + bool persistable() const { return persistable_; } + + private: + PrecisionType precision_{PRECISION(kFloat)}; + DataLayoutType
layout_{DATALAYOUT(kNCHW)}; + bool persistable_{false}; +}; + +// Graph to collect all of converted XPU IR nodes class Graph { public: // Layer node - std::shared_ptr AddNode(const std::string& name, - const xtcl::xExpr& layer); + std::shared_ptr AddNode( + const std::string& name, + const xtcl::xExpr& layer, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); // Const node std::shared_ptr AddNode( const std::string& name, const Tensor& tensor, - PrecisionType ptype = PRECISION(kFloat), - DataLayoutType ltype = DATALAYOUT(kNCHW)); + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); std::shared_ptr AddNode( const std::string& name, const Tensor& tensor, std::vector shape, - PrecisionType ptype = PRECISION(kFloat), - DataLayoutType ltype = DATALAYOUT(kNCHW)); + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr AddNode( + const std::string& name, + const Tensor& tensor, + DDim dims, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return AddNode(name, tensor, dims.Vectorize(), precision, layout); + } template std::shared_ptr AddNode( const std::string& name, const std::vector& data, std::vector shape = {}, - DataLayoutType ltype = DATALAYOUT(kNCHW)) { + DataLayoutType layout = DATALAYOUT(kNCHW)) { const std::type_info& info = typeid(T); - PrecisionType ptype = PRECISION(kFloat); + PrecisionType precision = PRECISION(kFloat); if (info == typeid(float)) { - ptype = PRECISION(kFloat); + precision = PRECISION(kFloat); } else if (info == typeid(int8_t)) { - ptype = PRECISION(kFloat); + precision = PRECISION(kFloat); } else if (info == typeid(int32_t)) { - ptype = PRECISION(kInt32); + precision = PRECISION(kInt32); } else { LOG(FATAL) << "[XPU] Unknow data type " << info.name(); } @@ -80,7 +114,16 @@ class Graph { std::memcpy(reinterpret_cast(tensor.mutable_data()), 
reinterpret_cast(data.data()), data.size() * sizeof(T)); - return AddNode(name, tensor, ptype, ltype); + return AddNode(name, tensor, precision, layout); + } + + template + std::shared_ptr AddNode( + const std::string& name, + const std::vector& data, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return AddNode(name, data, dims.Vectorize(), layout); } template @@ -88,25 +131,47 @@ class Graph { const std::string& name, T value, std::vector shape = {1}, - DataLayoutType ltype = DATALAYOUT(kNCHW)) { + DataLayoutType layout = DATALAYOUT(kNCHW)) { int64_t size = 1; for (auto i : shape) { size *= i; } std::vector data(size, value); - return AddNode(name, data, shape, ltype); + return AddNode(name, data, shape, layout); + } + + template + std::shared_ptr AddNode( + const std::string& name, + T value, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return AddNode(name, value, dims.Vectorize(), layout); } // Data node std::shared_ptr AddNode( const std::string& name, std::vector shape, - PrecisionType ptype = PRECISION(kFloat), - DataLayoutType ltype = DATALAYOUT(kNCHW)); + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr AddNode( + const std::string& name, + DDim dims, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return AddNode(name, dims.Vectorize(), precision, layout); + } std::shared_ptr GetNode(const std::string& name) { CHECK(HasNode(name)) << "[XPU] Node " << name << " not found."; - return nodes_.at(name); + return nodes_.at(name).first; + } + + const Type& GetType(const std::string& name) { + CHECK(HasNode(name)) << "[XPU] Node " << name << " not found."; + return nodes_.at(name).second; } bool HasNode(const std::string& name) { @@ -119,7 +184,8 @@ class Graph { xtcl::network::xTensorCompiler::ParamNDArrayMap params_; private: - std::unordered_map> nodes_; + std::unordered_map, Type>> + nodes_; std::unordered_map counts_; 
}; diff --git a/lite/kernels/xpu/bridges/layer_norm_op.cc b/lite/kernels/xpu/bridges/layer_norm_op.cc index 68dcab1888..601dd42770 100644 --- a/lite/kernels/xpu/bridges/layer_norm_op.cc +++ b/lite/kernels/xpu/bridges/layer_norm_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace xpu { -int LayerNormConverter(void* ctx, OpLite* op) { +int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,33 +30,92 @@ int LayerNormConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[XPU] Converting " + op_type + "..."; - // Get input vars and op attributes - auto x_var_name = op_info->Input("X").front(); - - auto scale_var_name = op_info->Input("Scale").front(); - auto* scale = scope->FindMutableTensor(scale_var_name); - auto bias_var_name = op_info->Input("Bias").front(); - auto* bias = scope->FindMutableTensor(bias_var_name); - - auto y_var_name = op_info->Output("Y").front(); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto y_name = op_info->Output("Y").front(); + auto y_type = kernel->GetOutputDeclType("Y"); + CHECK(y_type->precision() == PRECISION(kFloat)); + CHECK(y_type->layout() == DATALAYOUT(kNCHW)); + auto y = scope->FindMutableTensor(y_name); + auto y_dims = y->dims(); auto epsilon = op_info->GetAttr("epsilon"); auto axis = op_info->GetAttr("begin_norm_axis"); + auto x_rank = static_cast(x_dims.size()); + axis = axis < 0 ? 
(x_rank + axis) : axis; + bool reshape = axis != (x_rank - 1); // XPU only support the last dimension + auto x_inner_size = x_dims.Slice(axis, x_rank).production(); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + if (reshape) { + auto reshaped_x_dims = x_dims.Slice(0, axis).Vectorize(); + reshaped_x_dims.push_back(x_inner_size); + x_node = + graph->AddNode(x_name + "/reshape", + graph->builder_.CreateReshape( + *x_node, CvtShape(reshaped_x_dims))); + } + + // Scale node + std::shared_ptr scale_const_node = nullptr; + if (HasInputArg(op_info, scope, "Scale")) { + auto scale_name = op_info->Input("Scale").front(); + auto scale_type = kernel->GetInputDeclType("Scale"); + CHECK(scale_type->precision() == PRECISION(kFloat)); + CHECK(scale_type->layout() == DATALAYOUT(kNCHW)); + auto scale = scope->FindMutableTensor(scale_name); + auto scale_dims = scale->dims(); + CHECK_EQ(scale_dims.size(), 1); + CHECK_EQ(scale_dims.production(), x_inner_size); + scale_const_node = graph->AddNode(scale_name, *scale); + } else { + scale_const_node = + graph->AddNode(y_name + "/scale_one", 1.0f, {x_inner_size}); + } - // Create scale, bias nodes - auto scale_const_node = graph->AddNode(scale_var_name, *scale); - auto bias_const_node = graph->AddNode(bias_var_name, *bias); + // Bias node + std::shared_ptr bias_const_node = nullptr; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + CHECK_EQ(bias_dims.size(), 1); + CHECK_EQ(bias_dims.production(), x_inner_size); + bias_const_node = graph->AddNode(bias_name, *bias); + } else { + bias_const_node = + graph->AddNode(y_name + 
"/bias_zero", 0.0f, {x_inner_size}); + } - // Create node and set params from op + // Layer Norm node auto layer_norm_node = - graph->builder_.CreateLayerNorm(*graph->GetNode(x_var_name), - *scale_const_node, - *bias_const_node, - axis, - epsilon, - true, - true); - graph->AddNode(y_var_name, graph->builder_.GetField(layer_norm_node, 0)); - return SUCCESS; + graph->AddNode(y_name, + graph->builder_.CreateLayerNorm(*x_node, + *scale_const_node, + *bias_const_node, + axis, + epsilon, + true, + true)); + if (reshape) { + graph->AddNode(y_name, + graph->builder_.CreateReshape( + *layer_norm_node, CvtShape(y_dims))); + } + return REBUILD_WHEN_SHAPE_CHANGED; } } // namespace xpu diff --git a/lite/kernels/xpu/bridges/lookup_table_op.cc b/lite/kernels/xpu/bridges/lookup_table_op.cc new file mode 100644 index 0000000000..a03e0c2d24 --- /dev/null +++ b/lite/kernels/xpu/bridges/lookup_table_op.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/graph.h" +#include "lite/kernels/xpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace xpu { + +int LookupTableConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[XPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto ids_name = op_info->Input("Ids").front(); + auto ids_type = kernel->GetInputDeclType("Ids"); + CHECK(ids_type->precision() == PRECISION(kInt64)); + CHECK(ids_type->layout() == DATALAYOUT(kNCHW)); + auto ids = scope->FindMutableTensor(ids_name); + auto ids_dims = ids->dims(); + auto w_name = op_info->Input("W").front(); + auto w_type = kernel->GetInputDeclType("W"); + CHECK(w_type->precision() == PRECISION(kFloat)); + CHECK(w_type->layout() == DATALAYOUT(kNCHW)); + auto w = scope->FindMutableTensor(w_name); + auto w_dims = w->dims(); + CHECK_EQ(w_dims.size(), 2); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); + auto padding_idx = op_info->GetAttr("padding_idx"); + if (padding_idx != -1) { + LOG(WARNING) << "[XPU] Only padding_idx=-1 is supported."; + return FAILED; + } + + // Ids node + std::shared_ptr ids_node = nullptr; + if (graph->HasNode(ids_name)) { + ids_node = graph->GetNode(ids_name); + } else { + ids_node = graph->AddNode( + ids_name, ids_dims, ids_type->precision(), ids_type->layout()); + } + // Flatten Ids node + if (ids_dims.size() != 1) { + ids_node = graph->AddNode(ids_name + "/reshape", + graph->builder_.CreateReshape(*ids_node, {-1}), + 
ids_type->precision(), + ids_type->layout()); + } + auto w_const_node = graph->AddNode(w_name, *w); + + // Reshape the gather node with the inferred shape as the output node + auto gather_node = graph->AddNode( + out_name, + graph->builder_.CreateGather(*w_const_node, *ids_node, /* axis= */ 0)); + if (out_dims.size() != 2) { + graph->AddNode(out_name, + graph->builder_.CreateReshape( + *gather_node, CvtShape(out_dims))); + } + return SUCCESS; +} + +} // namespace xpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(XPU, + lookup_table, + paddle::lite::subgraph::xpu::LookupTableConverter); diff --git a/lite/kernels/xpu/bridges/mul_op.cc b/lite/kernels/xpu/bridges/mul_op.cc index 321c0aa217..9d2684ac40 100644 --- a/lite/kernels/xpu/bridges/mul_op.cc +++ b/lite/kernels/xpu/bridges/mul_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace subgraph { namespace xpu { -int MulConverter(void* ctx, OpLite* op) { +int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -30,45 +30,57 @@ int MulConverter(void* ctx, OpLite* op) { auto scope = op->scope(); VLOG(3) << "[XPU] Converting " + op_type + "..."; - // Get input, and attributes - auto x_var_name = op_info->Input("X").front(); - auto y_var_name = op_info->Input("Y").front(); - auto out_var_name = op_info->Output("Out").front(); - auto y = scope->FindMutableTensor(y_var_name); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto y_name = op_info->Input("Y").front(); + auto y_type = kernel->GetInputDeclType("Y"); + CHECK(y_type->precision() == PRECISION(kFloat)); + CHECK(y_type->layout() == DATALAYOUT(kNCHW)); + auto y = 
scope->FindMutableTensor(y_name); auto y_dims = y->dims(); - CHECK_EQ(y_dims.size(), 2) << "xpu now only support y_dims.size() == 2"; - + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); auto x_num_col_dims = op_info->GetAttr("x_num_col_dims"); - CHECK_EQ(x_num_col_dims, 1) << "xpu now only support x_num_col_dims == 1"; - auto y_num_col_dims = op_info->GetAttr("x_num_col_dims"); - CHECK_EQ(y_num_col_dims, 1) << "xpu now only support y_num_col_dims == 1"; - - // Flatten x node - auto x_node = graph->AddNode( - x_var_name + "/flatten", - graph->builder_.CreateBatchFlatten(*graph->GetNode(x_var_name))); + auto x_matrix_dims = x_dims.Flatten2D(x_num_col_dims); + auto y_num_col_dims = op_info->GetAttr("y_num_col_dims"); + auto y_matrix_dims = y_dims.Flatten2D(y_num_col_dims); + CHECK_EQ(x_matrix_dims[1], y_matrix_dims[0]); - // Transpose y data and create y node - Tensor transpose_y; - DDim transpose_y_dims(std::vector{y_dims[1], y_dims[0]}); - transpose_y.Resize(transpose_y_dims); - auto transpose_y_data = transpose_y.mutable_data(); - auto y_data = y->mutable_data(); - for (int i = 0; i < transpose_y_dims[0]; i++) { - for (int j = 0; j < transpose_y_dims[1]; j++) { - transpose_y_data[i * transpose_y_dims[1] + j] = - y_data[j * transpose_y_dims[0] + i]; - } + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + // Flatten X node + if (x_dims.size() != 2) { + x_node = + graph->AddNode(x_name + "/reshape", + graph->builder_.CreateReshape( + *x_node, {-1, static_cast(y_matrix_dims[0])})); } - auto y_const_node = graph->AddNode(y_var_name + "/transpose", transpose_y); - // Create mul node and set params from op - 
graph->AddNode( - out_var_name, - graph->builder_.CreateDense(*x_node, - static_cast(y_dims[1]), - ::xtcl::NullValue<::xtcl::DataType>(), - *y_const_node)); + // Y node + auto y_const_node = graph->AddNode(y_name, *y, y_matrix_dims); + + // Reshape the matmul node with the inferred shape as the output node + auto matmul_node = graph->AddNode( + out_name, graph->builder_.CreateMatmul2D(*x_node, *y_const_node, false)); + if (out_dims.size() != 2) { + graph->AddNode(out_name, + graph->builder_.CreateReshape( + *matmul_node, CvtShape(out_dims))); + } return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/xpu/bridges/paddle_use_bridges.h b/lite/kernels/xpu/bridges/paddle_use_bridges.h index 2ceab8d37a..9f8cb0a61c 100644 --- a/lite/kernels/xpu/bridges/paddle_use_bridges.h +++ b/lite/kernels/xpu/bridges/paddle_use_bridges.h @@ -15,6 +15,7 @@ #pragma once USE_SUBGRAPH_BRIDGE(XPU, relu); +USE_SUBGRAPH_BRIDGE(XPU, tanh); USE_SUBGRAPH_BRIDGE(XPU, conv2d); USE_SUBGRAPH_BRIDGE(XPU, depthwise_conv2d); USE_SUBGRAPH_BRIDGE(XPU, elementwise_add); @@ -22,8 +23,15 @@ USE_SUBGRAPH_BRIDGE(XPU, pool2d); USE_SUBGRAPH_BRIDGE(XPU, softmax); USE_SUBGRAPH_BRIDGE(XPU, mul); USE_SUBGRAPH_BRIDGE(XPU, batch_norm); +USE_SUBGRAPH_BRIDGE(XPU, stack); +USE_SUBGRAPH_BRIDGE(XPU, gather); +USE_SUBGRAPH_BRIDGE(XPU, scale); +USE_SUBGRAPH_BRIDGE(XPU, lookup_table); +USE_SUBGRAPH_BRIDGE(XPU, slice); USE_SUBGRAPH_BRIDGE(XPU, transpose); USE_SUBGRAPH_BRIDGE(XPU, transpose2); USE_SUBGRAPH_BRIDGE(XPU, reshape); USE_SUBGRAPH_BRIDGE(XPU, reshape2); +USE_SUBGRAPH_BRIDGE(XPU, layer_norm); +USE_SUBGRAPH_BRIDGE(XPU, gelu); USE_SUBGRAPH_BRIDGE(XPU, dropout); diff --git a/lite/kernels/xpu/bridges/pool_op.cc b/lite/kernels/xpu/bridges/pool_op.cc index 0e6cc818c9..60787a3429 100644 --- a/lite/kernels/xpu/bridges/pool_op.cc +++ b/lite/kernels/xpu/bridges/pool_op.cc @@ -21,17 +21,26 @@ namespace lite { namespace subgraph { namespace xpu { -int PoolConverter(void* ctx, OpLite* op) { +int PoolConverter(void* ctx, 
OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); auto op_info = op->op_info(); auto op_type = op_info->Type(); + auto scope = op->scope(); VLOG(3) << "[XPU] Converting " + op_type + "..."; // Get input, and attributes - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); auto pooling_type = op_info->GetAttr("pooling_type"); auto ceil_mode = op_info->GetAttr("ceil_mode"); auto paddings = op_info->GetAttr>("paddings"); @@ -40,35 +49,39 @@ int PoolConverter(void* ctx, OpLite* op) { auto strides = op_info->GetAttr>("strides"); auto exclusive = op_info->GetAttr("exclusive"); - // Create pool node and set params from op + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Pool node if (pooling_type == "max") { if (global_pooling) { - graph->AddNode( - out_var_name, - graph->builder_.CreateGlobalMaxPool2D(*graph->GetNode(x_var_name))); + graph->AddNode(out_name, graph->builder_.CreateGlobalMaxPool2D(*x_node)); } else { graph->AddNode( - out_var_name, - graph->builder_.CreateMaxPool2D(*graph->GetNode(x_var_name), - CvtShape(ksize), - CvtShape(strides), - CvtShape(paddings), + out_name, + graph->builder_.CreateMaxPool2D(*x_node, + CvtShape(ksize), + CvtShape(strides), + CvtShape(paddings), "NCHW", ceil_mode)); } } else if (pooling_type == "avg") { if (global_pooling) { - 
graph->AddNode( - out_var_name, - graph->builder_.CreateGlobalAvgPool2D(*graph->GetNode(x_var_name))); + graph->AddNode(out_name, graph->builder_.CreateGlobalAvgPool2D(*x_node)); } else { // !exclusive ---> count_include_pad graph->AddNode( - out_var_name, - graph->builder_.CreateAvgPool2D(*graph->GetNode(x_var_name), - CvtShape(ksize), - CvtShape(strides), - CvtShape(paddings), + out_name, + graph->builder_.CreateAvgPool2D(*x_node, + CvtShape(ksize), + CvtShape(strides), + CvtShape(paddings), "NCHW", ceil_mode, !exclusive)); diff --git a/lite/kernels/xpu/bridges/reshape_op.cc b/lite/kernels/xpu/bridges/reshape_op.cc index 37d7bf58b0..eeee6c7244 100644 --- a/lite/kernels/xpu/bridges/reshape_op.cc +++ b/lite/kernels/xpu/bridges/reshape_op.cc @@ -22,7 +22,7 @@ namespace lite { namespace subgraph { namespace xpu { -int ReshapeConverter(void* ctx, OpLite* op) { +int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); @@ -31,40 +31,65 @@ int ReshapeConverter(void* ctx, OpLite* op) { auto op_type = op_info->Type(); VLOG(3) << "[XPU] Converting " + op_type + "..."; - // Create node and set params from op - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } 
std::vector shape; - if (op_info->HasInput("ShapeTensor") && - !op_info->Input("ShapeTensor").empty()) { - for (auto var_name : op_info->Input("ShapeTensor")) { - shape.emplace_back(scope->FindMutableTensor(var_name)->data()[0]); + if (HasInputArg(op_info, scope, "ShapeTensor")) { + auto shape_tensor_names = op_info->Input("ShapeTensor"); + // auto shape_tensor_type = kernel->GetInputDeclType("ShapeTensor"); + // CHECK(shape_tensor_type->precision() == PRECISION(kInt32)); + // CHECK(shape_tensor_type->layout() == DATALAYOUT(kNCHW)); + for (auto shape_tensor_name : shape_tensor_names) { + auto shape_tensor = scope->FindMutableTensor(shape_tensor_name); + auto shape_tensor_data = shape_tensor->mutable_data(); + shape.emplace_back(shape_tensor_data[0]); } CHECK_GT(shape.size(), 0) - << "ShapeError: When `shape` in ReshapeOp is a list or tuple " + << "[XPU] ShapeError: When `shape` in ReshapeOp is a list or tuple " "which contains Tensor, the shape's size can't be zero. " "But received shape's size is " << shape.size(); - } else if (op_info->HasInput("Shape") && !op_info->Input("Shape").empty()) { - auto shape_tensor = - scope->FindMutableTensor(op_info->Input("Shape").front()); - auto shape_data = shape_tensor->data(); - shape = std::vector(shape_data, shape_data + shape_tensor->numel()); + } else if (HasInputArg(op_info, scope, "Shape")) { + auto actual_shape_name = op_info->Input("Shape").front(); + // auto actual_shape_type = kernel->GetInputDeclType("Shape"); + // CHECK(actual_shape_type->precision() == PRECISION(kInt32)); + // CHECK(actual_shape_type->layout() == DATALAYOUT(kNCHW)); + auto actual_shape = scope->FindMutableTensor(actual_shape_name); + auto actual_shape_dims = actual_shape->dims(); + auto actual_shape_data = actual_shape->mutable_data(); + shape = std::vector( + actual_shape_data, actual_shape_data + actual_shape_dims.production()); /* assigns the outer shape; re-declaring it here would shadow it and leave it empty */ } else if (op_info->HasAttr("shape")) { shape = op_info->GetAttr>("shape"); } else { - LOG(FATAL) << "no new
shape for reshape op"; + LOG(WARNING) << "[XPU] No new shape for reshape op"; + return FAILED; } - auto out_dims = - operators::ValidateShape(shape, scope->FindTensor(x_var_name)->dims()); - - CHECK(graph->HasNode(x_var_name)); - graph->AddNode(out_var_name, - graph->builder_.CreateReshape(*graph->GetNode(x_var_name), - Cvt2ArrayInt(out_dims))); + auto out_dims = operators::ValidateShape(shape, x_dims); - return SUCCESS; + // Reshape node + graph->AddNode(out_name, + graph->builder_.CreateReshape( + *x_node, CvtShape(out_dims))); + return REBUILD_WHEN_SHAPE_CHANGED; } } // namespace xpu diff --git a/lite/kernels/xpu/bridges/scale_op.cc b/lite/kernels/xpu/bridges/scale_op.cc new file mode 100644 index 0000000000..a3423d290c --- /dev/null +++ b/lite/kernels/xpu/bridges/scale_op.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/graph.h" +#include "lite/kernels/xpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace xpu { + +int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[XPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + float scale = op_info->GetAttr("scale"); + bool bias_after_scale = op_info->GetAttr("bias_after_scale"); + float bias = op_info->GetAttr("bias"); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Scale node + graph->AddNode( + out_name, + graph->builder_.CreateScale(*x_node, scale, bias, bias_after_scale)); + return SUCCESS; +} + +} // namespace xpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(XPU, + scale, + paddle::lite::subgraph::xpu::ScaleConverter); diff --git a/lite/kernels/xpu/bridges/slice_op.cc b/lite/kernels/xpu/bridges/slice_op.cc new file mode 100644 index 0000000000..90c91d3b59 --- /dev/null +++ b/lite/kernels/xpu/bridges/slice_op.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/graph.h" +#include "lite/kernels/xpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace xpu { + +int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[XPU] Converting " + op_type + "..."; + + // Get input, output and op attributes + auto input_name = op_info->Input("Input").front(); + auto input_type = kernel->GetInputDeclType("Input"); + CHECK(input_type->precision() == PRECISION(kFloat)); + CHECK(input_type->layout() == DATALAYOUT(kNCHW)); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + auto axes = op_info->GetAttr>("axes"); + auto starts = op_info->GetAttr>("starts"); + auto ends = op_info->GetAttr>("ends"); + + // Input node + std::shared_ptr input_node = nullptr; + if (graph->HasNode(input_name)) { + input_node = graph->GetNode(input_name); + } else { + input_node = graph->AddNode(input_name, input_dims); + } + + // Calculate the begin and 
end of the slice in all of + // dimensions and Create slice node as the output node + xtcl::Array begin, end, strides; + for (size_t i = 0; i < input_dims.size(); ++i) { + auto it = std::find(axes.cbegin(), axes.cend(), i); + if (it == axes.cend()) { + // If not found, don't slice this axis + int s = 0; + int e = input_dims[i]; + begin.push_back(s); + end.push_back(e); + strides.push_back(1); + } else { + int offset = it - axes.cbegin(); + int s = starts[offset]; + int e = ends[offset]; + begin.push_back(s); + end.push_back(e); + strides.push_back(1); + } + } + graph->AddNode( + out_name, + graph->builder_.CreateStridedSlice(*input_node, begin, end, strides)); + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace xpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(XPU, + slice, + paddle::lite::subgraph::xpu::SliceConverter); diff --git a/lite/kernels/xpu/bridges/softmax_op.cc b/lite/kernels/xpu/bridges/softmax_op.cc index af3f233e2d..6deb536ef1 100644 --- a/lite/kernels/xpu/bridges/softmax_op.cc +++ b/lite/kernels/xpu/bridges/softmax_op.cc @@ -21,23 +21,38 @@ namespace lite { namespace subgraph { namespace xpu { -int SoftmaxConverter(void* ctx, OpLite* op) { +int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); auto op_info = op->op_info(); auto op_type = op_info->Type(); + auto scope = op->scope(); VLOG(3) << "[XPU] Converting " + op_type + "..."; - // Get op's attributes - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = 
op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); auto axis = op_info->GetAttr("axis"); - // Create softmax node and set params from ops - graph->AddNode( - out_var_name, - graph->builder_.CreateSoftmax(*graph->GetNode(x_var_name), axis)); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Softmax node + graph->AddNode(out_name, graph->builder_.CreateSoftmax(*x_node, axis)); return SUCCESS; } diff --git a/lite/kernels/xpu/bridges/stack_op.cc b/lite/kernels/xpu/bridges/stack_op.cc new file mode 100644 index 0000000000..eb7d6d7b79 --- /dev/null +++ b/lite/kernels/xpu/bridges/stack_op.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/graph.h" +#include "lite/kernels/xpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace xpu { + +int StackConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[XPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_names = op_info->Input("X"); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto y_name = op_info->Output("Y").front(); + auto y_type = kernel->GetOutputDeclType("Y"); + CHECK(y_type->precision() == PRECISION(kFloat)); + CHECK(y_type->layout() == DATALAYOUT(kNCHW)); + int axis = op_info->GetAttr("axis"); + + // X nodes + xtcl::Array x_nodes; + for (auto& x_name : x_names) { + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + x_nodes.push_back(*x_node); + } + + // Stack node + graph->AddNode(y_name, + graph->builder_.CreateStack( + xtcl::network::TupleNode::make(x_nodes), axis)); + return SUCCESS; +} + +} // namespace xpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(XPU, + stack, + paddle::lite::subgraph::xpu::StackConverter); diff --git a/lite/kernels/xpu/bridges/transpose_op.cc b/lite/kernels/xpu/bridges/transpose_op.cc index 3d0e87836d..b6823dd6a8 100644 --- a/lite/kernels/xpu/bridges/transpose_op.cc +++ b/lite/kernels/xpu/bridges/transpose_op.cc @@ -21,26 +21,42 @@ namespace lite { namespace subgraph { namespace xpu { -int TransposeConverter(void* ctx, OpLite* op) { 
+int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); auto op_info = op->op_info(); auto op_type = op_info->Type(); + auto scope = op->scope(); VLOG(3) << "[XPU] Converting " + op_type + "..."; - // Create node and set params from op - auto x_var_name = op_info->Input("X").front(); - auto out_var_name = op_info->Output("Out").front(); - + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); auto axis = op_info->GetAttr>("axis"); - CHECK(graph->HasNode(x_var_name)); - graph->AddNode( - out_var_name, - graph->builder_.CreateTranspose( - *graph->GetNode(x_var_name), - Cvt2ArrayInt(std::vector(axis.begin(), axis.end())))); + // X node + std::shared_ptr x_node = nullptr; + if (graph->HasNode(x_name)) { + x_node = graph->GetNode(x_name); + } else { + x_node = graph->AddNode(x_name, x_dims); + } + + // Transpose node + graph->AddNode(out_name, + graph->builder_.CreateTranspose( + *x_node, + CvtShape( + std::vector(axis.begin(), axis.end())))); return SUCCESS; } diff --git a/lite/kernels/xpu/bridges/utility.cc b/lite/kernels/xpu/bridges/utility.cc index cf8d09a53a..79fad7c8b4 100644 --- a/lite/kernels/xpu/bridges/utility.cc +++ b/lite/kernels/xpu/bridges/utility.cc @@ -47,9 +47,15 @@ xtcl::DataType CvtPrecisionType(PrecisionType in_type) { case PRECISION(kInt8): out_type = ::xtcl::Int(8); break; + case PRECISION(kInt16): + out_type = ::xtcl::Int(16); + break; case PRECISION(kInt32): out_type = ::xtcl::Int(32); break; + case 
PRECISION(kInt64): + out_type = ::xtcl::Int(64); + break; default: LOG(FATAL) << "[XPU] Can not convert precision type(" << PrecisionToStr(in_type) << ") from Lite to XPU"; @@ -58,7 +64,7 @@ xtcl::DataType CvtPrecisionType(PrecisionType in_type) { return out_type; } -DLDataType CvtDataType(PrecisionType in_type) { +DLDataType CvtDLDataType(PrecisionType in_type) { DLDataType out_type = {kDLFloat, 32, 1}; switch (in_type) { case PRECISION(kFloat): @@ -67,76 +73,64 @@ DLDataType CvtDataType(PrecisionType in_type) { case PRECISION(kInt8): out_type = {kDLInt, 8, 1}; break; + case PRECISION(kInt16): + out_type = {kDLInt, 16, 1}; + break; case PRECISION(kInt32): out_type = {kDLInt, 32, 1}; break; + case PRECISION(kInt64): + out_type = {kDLInt, 64, 1}; + break; default: - LOG(FATAL) << "[XPU] Can not convert data type(" - << PrecisionToStr(in_type) << ") from Lite to XPU"; + LOG(FATAL) << "[XPU] Can not convert precision type(" + << PrecisionToStr(in_type) << ") from Lite to XPU DLDataType"; break; } return out_type; } -xtcl::Array CvtShape(const std::vector& in_shape) { - xtcl::Array out_shape; - for (auto dim : in_shape) { - out_shape.push_back(dim); +DLDeviceType CvtDLDeviceType(TargetType in_type) { + DLDeviceType out_type = kDLCPU; + switch (in_type) { + case TARGET(kX86): + out_type = kDLCPU; + break; + case TARGET(kHost): + out_type = kDLCPU; + break; + case TARGET(kCUDA): + out_type = kDLGPU; + break; + case TARGET(kXPU): + out_type = kDLCPU; + break; + default: + LOG(FATAL) << "[XPU] Can not convert target type(" << TargetToStr(in_type) + << ") from Lite to XPU DLDeviceType"; + break; } - return out_shape; -} - -xtcl::Array CvtShape(const std::vector& in_shape) { - return CvtShape(std::vector(in_shape.begin(), in_shape.end())); -} - -xtcl::Array CvtShape(const DDim& in_dims) { - return CvtShape(in_dims.Vectorize()); + return out_type; } std::shared_ptr CvtTensor(const Tensor& in_tensor, std::vector out_shape, - PrecisionType in_ptype, - DataLayoutType in_ltype) { 
- const uint8_t* in_data = nullptr; - auto in_size = in_tensor.dims().production(); + PrecisionType in_precision, + DataLayoutType in_layout) { auto in_shape = in_tensor.dims().Vectorize(); if (out_shape.empty()) { out_shape = in_shape; } - int in_bytes; - if (in_ptype == PRECISION(kFloat)) { - in_data = reinterpret_cast(in_tensor.data()); - in_bytes = in_size * sizeof(float); - } else if (in_ptype == PRECISION(kInt32)) { - in_data = reinterpret_cast(in_tensor.data()); - in_bytes = in_size * sizeof(int32_t); - } else if (in_ptype == PRECISION(kInt8)) { - in_data = reinterpret_cast(in_tensor.data()); - in_bytes = in_size * sizeof(int8_t); - } else { - LOG(FATAL) << "[XPU] Unknow precision type " << PrecisionToStr(in_ptype); - } auto out_tensor = std::make_shared( - xtcl::xNDArray::Empty(out_shape, CvtDataType(in_ptype), {kDLCPU, 0})); + xtcl::xNDArray::Empty(out_shape, + CvtDLDataType(in_precision), + {CvtDLDeviceType(TARGET(kHost)), 0})); auto out_data = reinterpret_cast(out_tensor->ToDLPack()->dl_tensor.data); - std::memcpy(out_data, in_data, in_bytes); + std::memcpy(out_data, in_tensor.raw_data(), in_tensor.memory_size()); return out_tensor; } -xtcl::Array Cvt2ArrayInt(const std::vector& input) { - xtcl::Array output; - for (auto i : input) { - output.push_back(i); - } - return output; -} - -xtcl::Array Cvt2ArrayInt(const DDim& input) { - return Cvt2ArrayInt(input.Vectorize()); -} - } // namespace xpu } // namespace subgraph } // namespace lite diff --git a/lite/kernels/xpu/bridges/utility.h b/lite/kernels/xpu/bridges/utility.h index f04488d2c3..a02a5ddff0 100644 --- a/lite/kernels/xpu/bridges/utility.h +++ b/lite/kernels/xpu/bridges/utility.h @@ -33,22 +33,33 @@ bool HasInputArg(const OpInfo* op_info, xtcl::DataType CvtPrecisionType(PrecisionType in_type); -DLDataType CvtDataType(PrecisionType in_type); +DLDataType CvtDLDataType(PrecisionType in_type); +DLDeviceType CvtDLDeviceType(TargetType in_type); -xtcl::Array CvtShape(const std::vector& in_shape); 
+template +xtcl::Array CvtShape(const std::vector& in_shape) { + xtcl::Array out_shape; + for (auto dim : in_shape) { + out_shape.push_back(dim); + } + return out_shape; +} -xtcl::Array CvtShape(const std::vector& in_shape); +template +xtcl::Array CvtShape(const std::vector& in_shape) { + return CvtShape(std::vector(in_shape.begin(), in_shape.end())); +} -xtcl::Array CvtShape(const DDim& in_dims); +template +xtcl::Array CvtShape(const DDim& in_dims) { + return CvtShape(in_dims.Vectorize()); +} std::shared_ptr CvtTensor( const Tensor& in_tensor, std::vector out_shape = {}, - PrecisionType in_ptype = PRECISION(kFloat), - DataLayoutType in_ltype = DATALAYOUT(kNCHW)); - -xtcl::Array Cvt2ArrayInt(const std::vector& input); -xtcl::Array Cvt2ArrayInt(const DDim& input); + PrecisionType in_precision = PRECISION(kFloat), + DataLayoutType in_layout = DATALAYOUT(kNCHW)); } // namespace xpu } // namespace subgraph diff --git a/lite/kernels/xpu/subgraph_compute.cc b/lite/kernels/xpu/subgraph_compute.cc index 899fb074b3..0a7a4d2aa5 100644 --- a/lite/kernels/xpu/subgraph_compute.cc +++ b/lite/kernels/xpu/subgraph_compute.cc @@ -20,6 +20,7 @@ #include "lite/core/op_registry.h" #include "lite/kernels/xpu/bridges/graph.h" #include "lite/kernels/xpu/bridges/paddle_use_bridges.h" +#include "lite/kernels/xpu/bridges/utility.h" namespace paddle { namespace lite { @@ -28,19 +29,9 @@ namespace xpu { int SubgraphEngine::BuildDeviceProgram() { int status = 0; - // Convert all of input data vars and added into the XPU IR graph + // Convert all of ops and their input vars and weights and added into the XPU + // IR graph subgraph::xpu::Graph graph; - for (auto& input_name : input_names_) { - auto input_tensor = scope_->FindMutableTensor(input_name); - CHECK(input_tensor); - auto input_node = - graph.AddNode(input_name, input_tensor->dims().Vectorize()); - CHECK(input_node); - // XTCL doesn't support dynamic dimensions/shapes, so need to rebuild - // the program when the shape of any input 
tensor is changed. - status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED; - } - // Convert all of ops and its weights and added into the XPU IR graph const auto& bridges = subgraph::Registry::Instance(); for (auto& inst : origin_program_) { auto op = inst.op(); @@ -51,62 +42,140 @@ int SubgraphEngine::BuildDeviceProgram() { if (!bridges.Exists("XPU", op_type)) { return subgraph::FAILED; } + auto kernel = inst.kernel(); status |= bridges.Select("XPU", op_type)(reinterpret_cast(&graph), - const_cast(op)); + const_cast(op), + const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { return subgraph::FAILED; } } - // Obtain the output nodes of the XPU IR graph and build the graph to XPU + // Obtain the output nodes of the XPU IR graph and build the graph to the XPU // runtime - std::vector output_nodes; - std::vector valid_output_names; + device_inames_.clear(); + device_onames_.clear(); + std::vector device_inodes; + std::vector device_onodes; + for (auto& input_name : input_names_) { + if (graph.HasNode(input_name)) { + if (!graph.GetType(input_name).persistable()) { + device_inodes.push_back(graph.GetNode(input_name).get()); + device_inames_.push_back(input_name); + } else { + LOG(WARNING) << "[XPU] Input node " << input_name + << " is skipped because it is a persistable node."; + } + } else { + LOG(WARNING) << "[XPU] Input node " << input_name + << " is skipped because it does not exist."; + } + } for (auto& output_name : output_names_) { if (graph.HasNode(output_name)) { - output_nodes.push_back(graph.GetNode(output_name).get()); - valid_output_names.push_back(output_name); + device_onodes.push_back(graph.GetNode(output_name).get()); + device_onames_.push_back(output_name); + } else { + LOG(WARNING) << "[XPU] Output node " << output_name + << " is skipped because it does not exist."; } } - CHECK(!valid_output_names.empty()) << "[XPU] no valid output names"; + CHECK(!device_inames_.empty()) + << "[XPU] No input nodes found for building XPU model"; + 
CHECK(!device_onames_.empty()) + << "[XPU] No output nodes found for building XPU model"; device_program_ = lite::xpu::Device::Global().Build( - &graph.builder_, &graph.params_, &output_nodes); + &graph.builder_, &graph.params_, &device_onodes); if (device_program_ == nullptr) { LOG(WARNING) << "[XPU] Build model failed!"; return subgraph::FAILED; } // Query and check the dimensions of input and output tensors - origin_idims_.resize(input_names_.size()); - origin_itensors_.resize(input_names_.size()); - origin_odims_.resize(valid_output_names.size()); - origin_otensors_.resize(valid_output_names.size()); - for (int i = 0; i < input_names_.size(); i++) { - origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); + origin_idims_.resize(device_inames_.size()); + origin_itensors_.resize(device_inames_.size()); + device_itensors_.resize(device_inames_.size()); + origin_odims_.resize(device_onames_.size()); + origin_otensors_.resize(device_onames_.size()); + device_otensors_.resize(device_onames_.size()); + for (int i = 0; i < device_inames_.size(); i++) { + auto type = graph.GetType(device_inames_[i]); + auto precision = type.precision(); + auto layout = type.layout(); + origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); CHECK(origin_itensors_[i]); origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "[XPU] Input dims[" << i << "]: " << origin_idims_[i]; + VLOG(3) << "[XPU] Inputs[" << i + << "] precision: " << PrecisionToStr(precision) + << " layout: " << DataLayoutToStr(layout) + << " dims: " << origin_idims_[i]; + // Prepare the device input tensors which share data with the origin input + // tensors + device_itensors_[i].data = nullptr; + device_itensors_[i].ctx.device_type = + subgraph::xpu::CvtDLDeviceType(TARGET(kHost)); + device_itensors_[i].ctx.device_id = 0; + device_itensors_[i].ndim = origin_idims_[i].size(); + device_itensors_[i].dtype = subgraph::xpu::CvtDLDataType(precision); + device_itensors_[i].shape = const_cast( + 
static_cast(origin_idims_[i].data().data())); + device_itensors_[i].strides = nullptr; + device_itensors_[i].byte_offset = 0; } - for (int i = 0; i < valid_output_names.size(); i++) { - origin_otensors_[i] = scope_->FindMutableTensor(valid_output_names[i]); + for (int i = 0; i < device_onames_.size(); i++) { + auto type = graph.GetType(device_onames_[i]); + auto precision = type.precision(); + auto layout = type.layout(); + origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); CHECK(origin_otensors_[i]); origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "[XPU] Output dims[" << i << "]: " << origin_odims_[i]; + VLOG(3) << "[XPU] Outputs[" << i + << "] precision: " << PrecisionToStr(precision) + << " layout: " << DataLayoutToStr(layout) + << " dims: " << origin_odims_[i]; + // Prepare the device output tensors which share data with the origin output + // tensors + switch (precision) { + case PRECISION(kFloat): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt8): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt16): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt32): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt64): + origin_otensors_[i]->mutable_data(); + break; + default: + LOG(FATAL) << "[XPU] " << device_onames_[i] + << " can't mutable data with precision type " + << PrecisionToStr(precision); + break; + } + device_otensors_[i].data = nullptr; + device_otensors_[i].ctx.device_type = + subgraph::xpu::CvtDLDeviceType(TARGET(kHost)); + device_otensors_[i].ctx.device_id = 0; + device_otensors_[i].ndim = origin_odims_[i].size(); + device_otensors_[i].dtype = subgraph::xpu::CvtDLDataType(precision); + device_otensors_[i].shape = const_cast( + static_cast(origin_odims_[i].data().data())); + device_otensors_[i].strides = nullptr; + device_otensors_[i].byte_offset = 0; } return status; } int SubgraphEngine::LaunchDeviceProgram() { - // Copy the data of origin 
input tensors to the buffer of input XPU tensors - for (size_t i = 0; i < input_names_.size(); i++) { - auto input_ndarray = - xtcl::xNDArray::Empty(origin_itensors_[i]->dims().Vectorize(), - {kDLFloat, 32, 1}, - {kDLCPU, 0}); - std::memcpy(static_cast(input_ndarray.ToDLPack()->dl_tensor.data), - origin_itensors_[i]->mutable_data(), - sizeof(float) * origin_itensors_[i]->dims().production()); - device_program_->SetInputZeroCopy(input_names_[i], - &input_ndarray.ToDLPack()->dl_tensor); + for (size_t i = 0; i < device_itensors_.size(); i++) { + // Update the data pointer of DLTensor to track the origin input tensors + device_itensors_[i].data = + const_cast(origin_itensors_[i]->raw_data()); + device_program_->SetInputZeroCopy(device_inames_[i], &device_itensors_[i]); } // Run the XPU model auto GetCurrentUS = []() -> double { @@ -117,12 +186,11 @@ int SubgraphEngine::LaunchDeviceProgram() { auto start_time = GetCurrentUS(); device_program_->Run(); VLOG(3) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us"; - // Copy the data of output XPU tensor to the buffer of origin output tensors - for (size_t i = 0; i < origin_otensors_.size(); i++) { - auto output_ndarray = device_program_->GetOutput(i); - std::memcpy(origin_otensors_[i]->mutable_data(), - static_cast(output_ndarray.ToDLPack()->dl_tensor.data), - sizeof(float) * origin_otensors_[i]->dims().production()); + for (size_t i = 0; i < device_otensors_.size(); i++) { + // Update the data pointer of DLTensor to track the origin output tensors + device_otensors_[i].data = + const_cast(origin_otensors_[i]->raw_data()); + device_program_->CopyOutputTo(i, &device_otensors_[i]); } return 0; } diff --git a/lite/kernels/xpu/subgraph_compute.h b/lite/kernels/xpu/subgraph_compute.h index 0d7108a866..2196eb3621 100644 --- a/lite/kernels/xpu/subgraph_compute.h +++ b/lite/kernels/xpu/subgraph_compute.h @@ -41,6 +41,10 @@ class SubgraphEngine : public subgraph::Engine { int BuildDeviceProgram() override; int 
LaunchDeviceProgram() override; + std::vector device_inames_; + std::vector device_onames_; + std::vector device_itensors_; + std::vector device_otensors_; std::unique_ptr device_program_{nullptr}; }; diff --git a/lite/operators/activation_ops.cc b/lite/operators/activation_ops.cc index 6ddcee0cb9..6292c5aef6 100644 --- a/lite/operators/activation_ops.cc +++ b/lite/operators/activation_ops.cc @@ -120,6 +120,7 @@ REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(gelu, paddle::lite::operators::ActivationOp); #ifdef LITE_WITH_TRAIN REGISTER_LITE_OP(square_grad, paddle::lite::operators::ActivationGradOp); diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 36cd759ebf..7cd7f5363c 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -1,10 +1,10 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_fc SRCS fc_compute_test.cc DEPS 
arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) @@ -41,7 +41,7 @@ if(LITE_BUILD_EXTRA) lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework 
${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) @@ -49,6 +49,8 @@ if(LITE_BUILD_EXTRA) #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tests/kernels/activation_compute_test.cc b/lite/tests/kernels/activation_compute_test.cc index dc5252cdad..d049544a7c 100644 --- a/lite/tests/kernels/activation_compute_test.cc +++ b/lite/tests/kernels/activation_compute_test.cc @@ -34,7 +34,8 @@ enum activation_type_test { LOG, EXP, FLOOR, - RSQRT + RSQRT, + 
GELU }; class ActivationComputeTester : public arena::TestCase { @@ -184,6 +185,13 @@ class ActivationComputeTester : public arena::TestCase { } break; } + case GELU: { + for (int i = 0; i < dims_.production(); i++) { + output_data[i] = x_data[i] * 0.5 * + (1.0 + std::erf(x_data[i] * 0.70710678118654752440)); + } + break; + } default: LOG(INFO) << "the type of activation is unknow."; } @@ -243,8 +251,8 @@ class ActivationComputeTester : public arena::TestCase { TEST(Activation_relu, precision) { LOG(INFO) << "test relu op"; - float abs_error = 2e-5; Place place; + float abs_error = 2e-5; #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // Using fp16 in NPU @@ -280,8 +288,8 @@ TEST(Activation_relu, precision) { TEST(Activation_leaky_relu, precision) { LOG(INFO) << "test leaky_relu op"; - float abs_error = 2e-5; Place place; + float abs_error = 2e-5; #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // Using fp16 in NPU @@ -317,8 +325,8 @@ TEST(Activation_leaky_relu, precision) { TEST(Activation_relu_clipped, precision) { LOG(INFO) << "test relu clipped op"; - float abs_error = 2e-5; Place place; + float abs_error = 2e-5; #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // Using fp16 in NPU @@ -384,8 +392,8 @@ TEST(Activation_prelu, precision) { TEST(Activation_sigmoid, precision) { LOG(INFO) << "test sigmoid op"; - float abs_error = 2e-5; Place place; + float abs_error = 2e-5; #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // Using fp16 in NPU @@ -419,13 +427,15 @@ TEST(Activation_sigmoid, precision) { TEST(Activation_tanh, precision) { LOG(INFO) << "test tanh op"; - float abs_error = 2e-5; Place place; + float abs_error = 2e-5; #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); +#elif defined(LITE_WITH_XPU) + place = TARGET(kXPU); #else return; #endif @@ -621,5 +631,25 @@ TEST(Activation_rsqrt, precision) { 
} #endif } + +TEST(Activation_gelu, precision) { + LOG(INFO) << "test gelu op"; + Place place; + float abs_error = 2e-5; +#if defined(LITE_WITH_XPU) + place = TARGET(kXPU); +#else + return; +#endif + + for (auto dims : std::vector>{ + {1, 3, 2, 4}, {2, 3, 4}, {5, 4}, {8}}) { + std::unique_ptr tester(new ActivationComputeTester( + place, "def", 0.01, 6., "all", 0., DDim(dims), "gelu", GELU)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } +} + } // namespace lite } // namespace paddle diff --git a/lite/tests/kernels/fc_compute_test.cc b/lite/tests/kernels/fc_compute_test.cc index ef5baa8185..1dca6d41ed 100644 --- a/lite/tests/kernels/fc_compute_test.cc +++ b/lite/tests/kernels/fc_compute_test.cc @@ -161,7 +161,7 @@ class FcOPTest : public arena::TestCase { } }; -void test_fc(Place place) { +void test_fc(Place place, float abs_error) { for (auto& m : {1, 3, 16}) { for (auto& n : {1, 4, 16, 128, 256, 1024}) { for (auto& k : {1, 16, 128, 1024}) { @@ -172,10 +172,12 @@ void test_fc(Place place) { std::unique_ptr tester( new FcOPTest(place, "def", dim_in, wdim, bdim, 1)); #ifdef LITE_WITH_ARM - auto& ctx = tester->context()->As(); - ctx.SetRunMode(lite_api::LITE_POWER_HIGH, 1); + if (place == TARGET(kARM)) { + auto& ctx = tester->context()->As(); + ctx.SetRunMode(lite_api::LITE_POWER_HIGH, 1); + } #endif - arena::Arena arena(std::move(tester), place, 6e-5); + arena::Arena arena(std::move(tester), place, abs_error); if (!arena.TestPrecision()) { LOG(ERROR) << "run m: " << m << ", n: " << n << ", k: " << k << ", bias: " << (bflag ? 
"true" : "false") << " failed"; @@ -188,13 +190,17 @@ void test_fc(Place place) { } TEST(FcOP, precision) { -#ifdef LITE_WITH_X86 - Place place(TARGET(kX86)); -#endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); - test_fc(place); + Place place; + float abs_error = 6e-5; +#if defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 2e-1; // Using fp16 in NPU +#elif defined(LITE_WITH_ARM) + place = TARGET(kARM); +#else + return; #endif + test_fc(place, abs_error); } } // namespace lite diff --git a/lite/tests/kernels/gather_compute_test.cc b/lite/tests/kernels/gather_compute_test.cc new file mode 100644 index 0000000000..9db225b2cd --- /dev/null +++ b/lite/tests/kernels/gather_compute_test.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { + +class GatherComputeTest : public arena::TestCase { + protected: + // common attributes for this op. 
+ std::string op_type_ = "gather"; + std::string x_ = "x"; + std::string index_ = "index"; + std::string out_ = "out"; + DDim x_dims_{{5, 4, 2, 3}}; + DDim index_dims_{{2, 1}}; + + public: + GatherComputeTest(const Place& place, + const std::string& alias, + const DDim& x_dims, + const DDim& index_dims) + : TestCase(place, alias), x_dims_(x_dims), index_dims_(index_dims) {} + + void RunBaseline(Scope* scope) override { + auto x = scope->FindTensor(x_); + auto index = scope->FindTensor(index_); + auto x_dims = x->dims(); + auto index_dims = index->dims(); + CHECK(index_dims.size() == 1 || + (index_dims.size() == 2 && index_dims[1] == 1)); + + auto out = scope->NewTensor(out_); + CHECK(out); + int batch_size = index_dims[0]; + DDim out_dims = x_dims; + out_dims[0] = batch_size; + out->Resize(out_dims); + + auto x_data = x->data(); + auto index_data = index->data(); + auto out_data = out->mutable_data(); + + auto slice_num = x_dims[0]; + auto slice_size = x_dims.Slice(1, x_dims.size()).production(); + for (int i = 0; i < batch_size; i++) { + auto index = index_data[i]; + CHECK_LT(index, slice_num) << "gather index[i] expected < " << slice_num + << " but got " << index; + CHECK_GE(index, 0) << "gather ids[i] expected >= 0 but got " << index; + memcpy(out_data + i * slice_size, + x_data + index * slice_size, + slice_size * sizeof(float)); + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType(op_type_); + op_desc->SetInput("X", {x_}); + op_desc->SetInput("Index", {index_}); + op_desc->SetOutput("Out", {out_}); + } + + void PrepareData() override { + std::vector x(x_dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production()); + + std::vector index(index_dims_.production()); + fill_data_rand( + index.data(), 0, x_dims_[0] - 1, index_dims_.production()); + + SetCommonTensor(x_, x_dims_, x.data()); + SetCommonTensor(index_, index_dims_, index.data()); + } +}; + +TEST(Gather, precision) { + LOG(INFO) << "test gather op"; + float 
abs_error = 2e-5; + Place place; +#if defined(LITE_WITH_XPU) + place = TARGET(kXPU); +#else + return; +#endif + + for (auto x_dims : + std::vector>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) { + for (auto index_dims : + std::vector>{{3, 1}, {7, 1}, {10, 1}}) { + std::unique_ptr tester( + new GatherComputeTest(place, "def", DDim(x_dims), DDim(index_dims))); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/layer_norm_compute_test.cc b/lite/tests/kernels/layer_norm_compute_test.cc index 5bb122e7b6..a30ac55d42 100644 --- a/lite/tests/kernels/layer_norm_compute_test.cc +++ b/lite/tests/kernels/layer_norm_compute_test.cc @@ -25,10 +25,10 @@ class LayerNormComputeTest : public arena::TestCase { protected: // common attributes for this op. std::string op_type_ = "layer_norm"; - std::string input_ = "x"; + std::string x_ = "x"; std::string scale_ = "scale"; std::string bias_ = "bias"; - std::string output_ = "y"; + std::string y_ = "y"; std::string mean_ = "mean"; std::string variance_ = "variance"; DDim dims_{{4, 5, 19, 19}}; @@ -53,11 +53,11 @@ class LayerNormComputeTest : public arena::TestCase { has_scale_(has_scale) {} void RunBaseline(Scope* scope) override { - auto x = scope->FindTensor(input_); + auto x = scope->FindTensor(x_); auto scale = scope->FindTensor(scale_); auto bias = scope->FindTensor(bias_); - auto y = scope->NewTensor(output_); + auto y = scope->NewTensor(y_); auto mean = scope->NewTensor(mean_); auto variance = scope->NewTensor(variance_); CHECK(y); @@ -74,7 +74,7 @@ class LayerNormComputeTest : public arena::TestCase { auto* x_data = x->data(); auto* scale_data = (scale == nullptr ? nullptr : scale->data()); auto* bias_data = (bias == nullptr ? 
nullptr : bias->data()); - auto* out_data = y->mutable_data(); + auto* y_data = y->mutable_data(); auto* mean_data = mean->mutable_data(); auto* variance_data = variance->mutable_data(); @@ -94,12 +94,12 @@ class LayerNormComputeTest : public arena::TestCase { variance_data[i] = variance_t; variance_t = sqrt(variance_t + epsilon_); for (int j = start; j < end; ++j) { - out_data[j] = (x_data[j] - mean_t) / variance_t; + y_data[j] = (x_data[j] - mean_t) / variance_t; if (scale_data) { - out_data[j] *= scale_data[j - start]; + y_data[j] *= scale_data[j - start]; } if (bias_data) { - out_data[j] += bias_data[j - start]; + y_data[j] += bias_data[j - start]; } } } @@ -107,10 +107,14 @@ class LayerNormComputeTest : public arena::TestCase { void PrepareOpDesc(cpp::OpDesc* op_desc) { op_desc->SetType(op_type_); - op_desc->SetInput("X", {input_}); - op_desc->SetInput("Bias", {bias_}); - op_desc->SetInput("Scale", {scale_}); - op_desc->SetOutput("Y", {output_}); + op_desc->SetInput("X", {x_}); + if (has_scale_) { + op_desc->SetInput("Scale", {scale_}); + } + if (has_bias_) { + op_desc->SetInput("Bias", {bias_}); + } + op_desc->SetOutput("Y", {y_}); op_desc->SetOutput("Mean", {mean_}); op_desc->SetOutput("Variance", {variance_}); op_desc->SetAttr("epsilon", epsilon_); @@ -118,23 +122,24 @@ class LayerNormComputeTest : public arena::TestCase { } void PrepareData() override { - std::vector din(dims_.production()); - fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); - - std::vector scale_v; - for (size_t i = begin_norm_axis_; i < dims_.size(); i++) { - scale_v.push_back(dims_[i]); + std::vector x(dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, dims_.production()); + SetCommonTensor(x_, dims_, x.data()); + + auto scale_bias_size = + dims_.Slice(begin_norm_axis_, dims_.size()).production(); + if (has_scale_) { + DDim scale_dims({scale_bias_size}); + std::vector scale(scale_bias_size); + fill_data_rand(scale.data(), -1.f, 1.f, scale_bias_size); + 
SetCommonTensor(scale_, scale_dims, scale.data()); + } + if (has_bias_) { + DDim bias_dims({scale_bias_size}); + std::vector bias(scale_bias_size); + fill_data_rand(bias.data(), -1.f, 1.f, scale_bias_size); + SetCommonTensor(bias_, bias_dims, bias.data()); } - DDim scale_dim(scale_v); - std::vector scale(scale_dim.production()); - fill_data_rand(scale.data(), -1.f, 1.f, scale_dim.production()); - - std::vector bias(scale_dim.production()); - fill_data_rand(bias.data(), -1.f, 1.f, scale_dim.production()); - - SetCommonTensor(input_, dims_, din.data()); - SetCommonTensor(scale_, scale_dim, scale.data()); - SetCommonTensor(bias_, scale_dim, bias.data()); } }; @@ -151,25 +156,15 @@ TEST(LayerNorm, precision) { return; #endif - std::vector> dims{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}; - for (auto dim_in : dims) { + for (auto dims : + std::vector>{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}) { for (auto epsilon : {1e-5f}) { - for (auto axis : {0, 1, 2, 3}) { + for (auto axis : {1, 2, 3}) { for (bool has_bias : {true, false}) { for (bool has_scale : {true, false}) { - if (axis >= dim_in.size()) continue; - std::unique_ptr tester( - new LayerNormComputeTest(place, - "def", - DDim(dim_in), - epsilon, - axis, - has_bias, - has_scale)); -#ifdef LITE_WITH_ARM - auto& ctx = tester->context()->As(); - ctx.SetRunMode(lite_api::LITE_POWER_HIGH, 4); -#endif + if (axis >= dims.size()) continue; + std::unique_ptr tester(new LayerNormComputeTest( + place, "def", DDim(dims), epsilon, axis, has_bias, has_scale)); arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision({"mean", "variance"}); } diff --git a/lite/tests/kernels/lookup_table_compute_test.cc b/lite/tests/kernels/lookup_table_compute_test.cc new file mode 100644 index 0000000000..5951601ef9 --- /dev/null +++ b/lite/tests/kernels/lookup_table_compute_test.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { + +class LookupTableComputeTest : public arena::TestCase { + protected: + // common attributes for this op. + std::string op_type_ = "lookup_table"; + std::string ids_ = "ids"; + std::string w_ = "w"; + std::string out_ = "out"; + DDim ids_dims_{{2, 1}}; + DDim w_dims_{{8, 4}}; + int64_t padding_idx_ = -1; + + public: + LookupTableComputeTest(const Place& place, + const std::string& alias, + const DDim& ids_dims, + const DDim& w_dims, + int64_t padding_idx) + : TestCase(place, alias), + ids_dims_(ids_dims), + w_dims_(w_dims), + padding_idx_(padding_idx) {} + + void RunBaseline(Scope* scope) override { + auto ids = scope->FindTensor(ids_); + auto w = scope->FindTensor(w_); + auto ids_dims = ids->dims(); + auto w_dims = w->dims(); + + auto out = scope->NewTensor(out_); + CHECK(out); + + int ids_rank = ids_dims.size(); + CHECK_EQ(ids_dims[ids_rank - 1], 1); + CHECK_EQ(w_dims.size(), 2); + + std::vector out_dims; + for (int i = 0; i < ids_rank - 1; ++i) { + out_dims.push_back(ids_dims[i]); + } + out_dims.push_back(w_dims[1]); + out->Resize(out_dims); + out->set_lod(ids->lod()); + + auto ids_data = ids->data(); + auto ids_size = ids_dims.production(); + auto w_data = w->data(); + auto w_rows = w_dims[0]; + 
auto w_cols = w_dims[1]; + auto out_data = out->mutable_data(); + + for (int64_t i = 0; i < ids_size; i++) { + auto id = ids_data[i]; + if (padding_idx_ != -1 && id == padding_idx_) { + memset(out_data + i * w_cols, 0, w_cols * sizeof(float)); + } else { + CHECK_LT(id, w_rows) << "lookup_table ids[i] expected < " << w_rows + << " but got " << id; + CHECK_GE(id, 0) << "lookup_table ids[i] expected >= 0 but got " << id; + memcpy(out_data + i * w_cols, + w_data + id * w_cols, + w_cols * sizeof(float)); + } + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType(op_type_); + op_desc->SetInput("Ids", {ids_}); + op_desc->SetInput("W", {w_}); + op_desc->SetOutput("Out", {out_}); + op_desc->SetAttr("padding_idx", padding_idx_); + } + + void PrepareData() override { + std::vector ids(ids_dims_.production()); + fill_data_rand( + ids.data(), 0, w_dims_[0] - 1, ids_dims_.production()); + + std::vector w(w_dims_.production()); + fill_data_rand(w.data(), -1.f, 1.f, w_dims_.production()); + + SetCommonTensor(ids_, ids_dims_, ids.data()); + SetCommonTensor(w_, w_dims_, w.data()); + } +}; + +TEST(LookupTable, precision) { + LOG(INFO) << "test lookup_table op"; + float abs_error = 2e-5; + Place place; +#if defined(LITE_WITH_XPU) + place = TARGET(kXPU); +#else + return; +#endif + + for (auto ids_dims : + std::vector>{{5, 2, 3, 1}, {2, 3, 1}, {3, 1}}) { + for (auto w_dims : + std::vector>{{4, 2}, {6, 8}, {12, 15}}) { +#if defined(LITE_WITH_XPU) + for (auto padding_idx : + std::vector{-1}) { // Only -1 is supported by XPU +#else + for (auto padding_idx : std::vector{-1, 0, w_dims[0] - 1}) { +#endif + std::unique_ptr tester(new LookupTableComputeTest( + place, "def", DDim(ids_dims), DDim(w_dims), padding_idx)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } + } + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/scale_compute_test.cc b/lite/tests/kernels/scale_compute_test.cc index 
fd254c7495..706936d2b1 100644 --- a/lite/tests/kernels/scale_compute_test.cc +++ b/lite/tests/kernels/scale_compute_test.cc @@ -82,11 +82,17 @@ class ScaleComputeTester : public arena::TestCase { }; TEST(Scale, precision) { -#ifdef LITE_WITH_X86 - Place place(TARGET(kX86)); -#endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); + Place place; + float abs_error = 2e-5; +#if defined(LITE_WITH_ARM) + place = TARGET(kARM); +#elif defined(LITE_WITH_XPU) + place = TARGET(kXPU); + abs_error = 3e-4; // Some operations use fp16 in XPU +#elif defined(LITE_WITH_X86) + place = TARGET(kX86); +#else + return; #endif for (float scale : {0.123, 2., -1.2}) { @@ -94,7 +100,7 @@ TEST(Scale, precision) { for (bool bias_before : {true, false}) { std::unique_ptr tester( new ScaleComputeTester(place, "def", scale, bias, bias_before)); - arena::Arena arena(std::move(tester), place, 2e-5); + arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision(); } } @@ -102,11 +108,13 @@ TEST(Scale, precision) { } TEST(Scale, performance) { -#ifdef LITE_WITH_X86 - Place place(TARGET(kX86)); -#endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); + Place place; +#if defined(LITE_WITH_ARM) + place = TARGET(kARM); +#elif defined(LITE_WITH_X86) + place = TARGET(kX86); +#else + return; #endif std::unique_ptr tester( diff --git a/lite/tests/kernels/slice_compute_test.cc b/lite/tests/kernels/slice_compute_test.cc index 19725d72fb..e8c63e2d72 100644 --- a/lite/tests/kernels/slice_compute_test.cc +++ b/lite/tests/kernels/slice_compute_test.cc @@ -267,14 +267,14 @@ void test_slice_tensor_list(Place place) { } TEST(Slice, precision) { -#ifdef LITE_WITH_X86 - Place place(TARGET(kX86)); -#endif #ifdef LITE_WITH_ARM Place place(TARGET(kARM)); test_slice(place); test_slice_tensor(place); test_slice_tensor_list(place); +#elif defined(LITE_WITH_XPU) + Place place(TARGET(kXPU)); + test_slice(place); #endif } diff --git a/lite/tests/kernels/stack_compute_test.cc 
b/lite/tests/kernels/stack_compute_test.cc index 543409d4ba..10b289e419 100644 --- a/lite/tests/kernels/stack_compute_test.cc +++ b/lite/tests/kernels/stack_compute_test.cc @@ -103,13 +103,15 @@ void test_stack(Place place) { } TEST(Stack, precision) { -// #ifdef LITE_WITH_X86 -// Place place(TARGET(kX86)); -// #endif + Place place; #ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); - test_stack(place); + place = TARGET(kARM); +#elif defined(LITE_WITH_XPU) + place = TARGET(kXPU); +#else + return; #endif + test_stack(place); } } // namespace lite -- GitLab