diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc
index 95339d6175c98f22d542db24f02d6d714ccbe2a8..1afb54c692592ca42d8b120dcf1a91922e19149c 100644
--- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc
+++ b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc
@@ -160,8 +160,8 @@ TEST(NPUSubgraph, compare) {
   TestModel(FLAGS_model_dir,
             FLAGS_model_file,
             FLAGS_params_file,
-            {lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
-             lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}},
+            {lite_api::Place{TARGET(kNPU), PRECISION(kFloat)},
+             lite_api::Place{TARGET(kARM), PRECISION(kFloat)}},
             input_tensor_shape,
             FLAGS_optimized_model_dir + "/NPU");
   // verify results
diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt
index d79fdeb3c16cae2fcc0964697bb31f3746164a9f..79d1bf2fd5fa694d4888d474c321a43d279bab76 100644
--- a/lite/kernels/npu/bridges/CMakeLists.txt
+++ b/lite/kernels/npu/bridges/CMakeLists.txt
@@ -21,6 +21,7 @@ lite_cc_library(npu_bridge_shuffle_channel_op SRCS shuffle_channel_op.cc DEPS ${npu_bridge_deps})
 lite_cc_library(npu_bridge_pad2d_op SRCS pad2d_op.cc DEPS ${npu_bridge_deps})
 lite_cc_library(npu_bridge_square_op SRCS square_op.cc DEPS ${npu_bridge_deps})
 lite_cc_library(npu_bridge_sqrt_op SRCS sqrt_op.cc DEPS ${npu_bridge_deps})
+lite_cc_library(npu_bridge_reduce_mean_op SRCS reduce_mean_op.cc DEPS ${npu_bridge_deps})
 
 set(npu_bridges
     npu_bridge_registry
@@ -43,6 +44,7 @@ set(npu_bridges
     npu_bridge_pad2d_op
     npu_bridge_square_op
     npu_bridge_sqrt_op
+    npu_bridge_reduce_mean_op
     CACHE INTERNAL "npu_bridges")
 
 set(npu_bridge_test_deps ${npu_bridges} ${npu_kernels} ${ops})
@@ -66,5 +68,6 @@ lite_cc_test(test_npu_bridge_shuffle_channel_op SRCS shuffle_channel_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
 lite_cc_test(test_npu_bridge_pad2d_op SRCS pad2d_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
 lite_cc_test(test_npu_bridge_square_op SRCS square_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
 lite_cc_test(test_npu_bridge_sqrt_op SRCS sqrt_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
+lite_cc_test(test_npu_bridge_reduce_mean_op SRCS reduce_mean_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
 
 message(STATUS "+++++ npu_bridges: ${npu_bridges}")
diff --git a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h
index 8b4252de06e8934affe7592fc8ea521ad7d20025..8cdfff1fb4365f9a855c5490ddc4677c6954830a 100644
--- a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h
+++ b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h
@@ -24,6 +24,9 @@ USE_NPU_BRIDGE(pool2d);
 USE_NPU_BRIDGE(relu);
 USE_NPU_BRIDGE(elementwise_add);
 USE_NPU_BRIDGE(fusion_elementwise_add_activation);
+USE_NPU_BRIDGE(elementwise_sub);
+USE_NPU_BRIDGE(elementwise_mul);
+USE_NPU_BRIDGE(elementwise_div);
 USE_NPU_BRIDGE(scale);
 USE_NPU_BRIDGE(softmax);
 USE_NPU_BRIDGE(concat);
@@ -36,3 +39,8 @@ USE_NPU_BRIDGE(bilinear_interp);
 USE_NPU_BRIDGE(conv2d_transpose);
 USE_NPU_BRIDGE(reshape);
 USE_NPU_BRIDGE(reshape2);
+USE_NPU_BRIDGE(sqrt);
+USE_NPU_BRIDGE(square);
+USE_NPU_BRIDGE(reduce_mean);
+USE_NPU_BRIDGE(tanh);
+USE_NPU_BRIDGE(nearest_interp);
diff --git a/lite/kernels/npu/bridges/reduce_mean_op.cc b/lite/kernels/npu/bridges/reduce_mean_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4725bdfb0e17c4f99dfd2359ff34c96f9e5af6e5
--- /dev/null
+++ b/lite/kernels/npu/bridges/reduce_mean_op.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace npu {
+namespace bridges {
+
+node_map_type ReduceMeanConverter(
+    const std::shared_ptr<lite::OpLite> reduce_mean_op,
+    const node_map_type& inputs_map) {
+  auto scope = reduce_mean_op->scope();
+  auto op_info = reduce_mean_op->op_info();
+  auto op_type = op_info->Type();
+  auto unique_op_type = lite::npu::UniqueName(op_type);
+  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+
+  // get input and op attributes
+  auto x_var_name = op_info->Input("X").front();
+  auto x_dims = scope->FindTensor(x_var_name)->dims();
+  auto keep_dim = op_info->GetAttr<bool>("keep_dim");
+  auto dim = op_info->GetAttr<std::vector<int>>("dim");
+  CHECK(!dim.empty()) << "\"dim\" of reduce_mean should not be empty.";
+  for (size_t i = 0; i < dim.size(); i++) {
+    if (dim[i] < 0) {
+      dim[i] += x_dims.size();
+    }
+  }
+  std::sort(dim.begin(), dim.end());
+
+  // create reduce_mean (reduce_sum + scale) nodes and set the input node
+  // from inputs_map
+  // create the reduce_sum node
+  auto unique_reduce_sum = lite::npu::UniqueName("reduce_sum");
+  auto reduce_sum_node =
+      std::make_shared<ge::op::ReduceSum>(unique_reduce_sum);
+  CHECK(inputs_map.count(x_var_name));
+  reduce_sum_node->set_input_x(*inputs_map.at(x_var_name));
+  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
+  lite::npu::OpList::Global().add(reduce_sum_node);
+
+  auto dim_const_node =
+      std::make_shared<ge::op::Const>(unique_reduce_sum + "/dim");
+  dim_const_node->set_attr_value(lite::npu::CreateTensorAndFillData(dim));
+  reduce_sum_node->set_input_w(*dim_const_node);
+  lite::npu::OpList::Global().add(dim_const_node);
+
+  reduce_sum_node->set_attr_keep_dims(keep_dim);
+
+  // create the scale node
+  auto unique_scale = lite::npu::UniqueName("scale");
+  auto scale_node = std::make_shared<ge::op::Scale>(unique_scale);
+  scale_node->set_input_x(*reduce_sum_node);
+  lite::npu::OpList::Global().add(scale_node);
+
+  float scale = 1;
+  for (size_t i = 0; i < dim.size(); i++) {
+    scale /= x_dims[dim[i]];
+  }
+
+  std::vector<int64_t> scale_bias_shape = x_dims.Vectorize();
+  if (keep_dim) {
+    for (size_t i = 0; i < dim.size(); i++) {
+      scale_bias_shape[dim[i]] = 1;
+    }
+  } else {
+    const int64_t kDelFlag = -2;
+    for (size_t i = 0; i < dim.size(); ++i) {
+      scale_bias_shape[dim[i]] = kDelFlag;
+    }
+    scale_bias_shape.erase(
+        remove(scale_bias_shape.begin(), scale_bias_shape.end(), kDelFlag),
+        scale_bias_shape.end());
+  }
+
+  auto filter_const_node =
+      std::make_shared<ge::op::Const>(unique_scale + "/filter");
+  filter_const_node->set_attr_value(
+      lite::npu::CreateTensorAndFillData(scale, scale_bias_shape));
+  scale_node->set_input_filter(*filter_const_node);
+  lite::npu::OpList::Global().add(filter_const_node);
+
+  scale_node->set_attr_axis(1);
+
+  node_map_type outputs_map;
+  outputs_map[op_info->Output("Out").front()] = scale_node;
+  return outputs_map;
+}
+
+}  // namespace bridges
+}  // namespace npu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_NPU_BRIDGE(reduce_mean,
+                    paddle::lite::kernels::npu::bridges::ReduceMeanConverter);
diff --git a/lite/kernels/npu/bridges/reduce_mean_op_test.cc b/lite/kernels/npu/bridges/reduce_mean_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8646ce5c25b367cf3c9055f1ed13a225149a9cc7
--- /dev/null
+++ b/lite/kernels/npu/bridges/reduce_mean_op_test.cc
@@ -0,0 +1,347 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/reduce_mean_op.h"
+#include <algorithm>
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/core/op_registry.h"
+#include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/test_helper.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace npu {
+namespace bridges {
+
+void reduce_mean_n(const float* src,
+                   float* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int hw_size = height_in * width_in;
+  int chw_size = channel_in * hw_size;
+  int data_index, src_index;
+  for (int c = 0; c < channel_in; ++c) {
+    for (int h = 0; h < height_in; ++h) {
+      for (int w = 0; w < width_in; ++w) {
+        data_index = c * hw_size + h * width_in + w;
+        dst[data_index] = 0.0;
+        for (int n = 0; n < num_in; ++n) {
+          src_index = n * chw_size + data_index;
+          dst[data_index] += static_cast<float>(src[src_index]) / num_in;
+        }
+      }
+    }
+  }
+}
+
+void reduce_mean_c(const float* src,
+                   float* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int hw_size = height_in * width_in;
+  int chw_size = hw_size * channel_in;
+  int data_index, src_index0, src_index;
+  for (int n = 0; n < num_in; ++n) {
+    for (int h = 0; h < height_in; ++h) {
+      for (int w = 0; w < width_in; ++w) {
+        data_index = n * hw_size + h * width_in + w;
+        src_index0 = n * chw_size + h * width_in + w;
+        dst[data_index] = 0.0;
+        for (int c = 0; c < channel_in; ++c) {
+          src_index = src_index0 + c * hw_size;
+          dst[data_index] += static_cast<float>(src[src_index]) / channel_in;
+        }
+      }
+    }
+  }
+}
+
+void reduce_mean_h(const float* src,
+                   float* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int cw_size = channel_in * width_in;
+  int chw_size = cw_size * height_in;
+  int hw_size = height_in * width_in;
+  int data_index, src_index, src_index0;
+  for (int n = 0; n < num_in; ++n) {
+    for (int c = 0; c < channel_in; ++c) {
+      for (int w = 0; w < width_in; ++w) {
+        data_index = n * cw_size + c * width_in + w;
+        src_index0 = n * chw_size + c * hw_size + w;
+        dst[data_index] = 0.0;
+        for (int h = 0; h < height_in; ++h) {
+          src_index = src_index0 + h * width_in;
+          dst[data_index] += static_cast<float>(src[src_index]) / height_in;
+        }
+      }
+    }
+  }
+}
+
+void reduce_mean_w(const float* src,
+                   float* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int ch_size = channel_in * height_in;
+  int hw_size = height_in * width_in;
+  int chw_size = ch_size * width_in;
+  int data_index = 0;
+  int src_index0 = 0;
+  int src_index = 0;
+  for (int n = 0; n < num_in; ++n) {
+    for (int c = 0; c < channel_in; ++c) {
+      for (int h = 0; h < height_in; ++h) {
+        data_index = n * ch_size + c * height_in + h;
+        src_index0 = n * chw_size + c * hw_size + h * width_in;
+        dst[data_index] = 0.0;
+        for (int w = 0; w < width_in; ++w) {
+          src_index = src_index0 + w;
+          dst[data_index] += static_cast<float>(src[src_index]) / width_in;
+        }
+      }
+    }
+  }
+}
+
+void reduce_mean_all(const float* src,
+                     float* dst,
+                     int num_in,
+                     int channel_in,
+                     int height_in,
+                     int width_in) {
+  float mean = 0.0;
+  int src_index;
+  int n_id, c_id;
+  int all = num_in * channel_in * height_in * width_in;
+  for (int n = 0; n < num_in; ++n) {
+    n_id = n * channel_in * height_in * width_in;
+    for (int c = 0; c < channel_in; ++c) {
+      c_id = c * height_in * width_in;
+      for (int h = 0; h < height_in; ++h) {
+        for (int w = 0; w < width_in; ++w) {
+          src_index = n_id + c_id + h * width_in + w;
+          // accumulate the mean over every element
+          mean += src[src_index] / all;
+        }
+      }
+    }
+  }
+  dst[0] = mean;
+}
+
+void reduce_mean_nc(const float* src,
+                    float* dst,
+                    int num_in,
+                    int channel_in,
+                    int height_in,
+                    int width_in) {
+  // reduce n first.
+  DDimLite ddimA({1, channel_in, height_in, width_in});
+  lite::Tensor tensor_tmp;
+  tensor_tmp.Resize(ddimA);
+  float* tmp_out = tensor_tmp.mutable_data<float>();
+  reduce_mean_n(src, tmp_out, num_in, channel_in, height_in, width_in);
+  reduce_mean_c(tmp_out, dst, 1, channel_in, height_in, width_in);
+}
+
+void reduce_mean_ch(const float* src,
+                    float* dst,
+                    int num_in,
+                    int channel_in,
+                    int height_in,
+                    int width_in) {
+  // reduce c first
+  DDimLite ddimA({num_in, 1, height_in, width_in});
+  lite::Tensor tensor_tmp;
+  tensor_tmp.Resize(ddimA);
+  float* tmp_out = tensor_tmp.mutable_data<float>();
+  reduce_mean_c(src, tmp_out, num_in, channel_in, height_in, width_in);
+  reduce_mean_h(tmp_out, dst, num_in, 1, height_in, width_in);
+}
+
+void reduce_mean_hw(const float* src,
+                    float* dst,
+                    int num_in,
+                    int channel_in,
+                    int height_in,
+                    int width_in) {
+  // reduce h first
+  DDimLite ddimA({num_in, channel_in, 1, width_in});
+  lite::Tensor tensor_tmp;
+  tensor_tmp.Resize(ddimA);
+  float* tmp_out = tensor_tmp.mutable_data<float>();
+  reduce_mean_h(src, tmp_out, num_in, channel_in, height_in, width_in);
+  reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in);
+}
+
+void reduce_mean_ref(const std::shared_ptr<operators::ReduceMeanOp> op) {
+  Scope* scope = op->scope();
+  const OpInfo* op_info = op->op_info();
+
+  auto x = scope->FindTensor("x");
+  auto x_dims = x->dims();
+  auto x_data = x->data<float>();
+  auto out = scope->FindMutableTensor("out_ref");
+
+  auto dim = op_info->GetAttr<std::vector<int>>("dim");
+  auto keep_dim = op_info->GetAttr<bool>("keep_dim");
+
+  auto x_rank = x_dims.size();
+  if (!dim.empty()) {
+    for (size_t i = 0; i < dim.size(); i++) {
+      if (dim[i] < 0) {
+        dim[i] += x_rank;
+      }
+    }
+  }
+
+  bool reduce_all = false;
+  std::sort(dim.begin(), dim.end());
+  if (dim.size() == 0) {
+    reduce_all = true;
+  }
+
+  std::vector<int64_t> out_dims;
+  if (reduce_all) {
+    if (keep_dim) {
+      for (size_t i = 0; i < x_dims.size(); i++) {
+        out_dims.push_back(1);
+      }
+    } else {
+      out_dims.push_back(1);
+    }
+  } else {
+    for (size_t i = 0; i < x_dims.size(); i++) {
+      out_dims.push_back(x_dims[i]);
+    }
+    if (keep_dim) {
+      for (size_t i = 0; i < dim.size(); ++i) {
+        out_dims[dim[i]] = 1L;
+      }
+    } else {
+      const int64_t kDelFlag = -2;
+      for (size_t i = 0; i < dim.size(); ++i) {
+        out_dims[dim[i]] = kDelFlag;
+      }
+      out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag),
+                     out_dims.end());
+    }
+    out->Resize(DDim(out_dims));
+  }
+
+  auto out_data = out->mutable_data<float>();
+  int in_n = x_dims[0];
+  int in_c = x_dims[1];
+  int in_h = x_dims[2];
+  int in_w = x_dims[3];
+
+  if (dim.size() == 0) {
+    reduce_mean_all(x_data, out_data, in_n, in_c, in_h, in_w);
+  } else if (dim.size() == 1) {
+    switch (dim[0]) {
+      case 0:
+        reduce_mean_n(x_data, out_data, in_n, in_c, in_h, in_w);
+        break;
+      case 1:
+        reduce_mean_c(x_data, out_data, in_n, in_c, in_h, in_w);
+        break;
+      case 2:
+        reduce_mean_h(x_data, out_data, in_n, in_c, in_h, in_w);
+        break;
+      case 3:
+        reduce_mean_w(x_data, out_data, in_n, in_c, in_h, in_w);
+        break;
+      default:
+        LOG(FATAL) << "invalid dim: " << dim[0];
+    }
+  } else if (dim.size() == 2) {
+    if (dim[0] == 0 && dim[1] == 1) {
+      reduce_mean_nc(x_data, out_data, in_n, in_c, in_h, in_w);
+    } else if (dim[0] == 1 && dim[1] == 2) {
+      reduce_mean_ch(x_data, out_data, in_n, in_c, in_h, in_w);
+    } else if (dim[0] == 2 && dim[1] == 3) {
+      reduce_mean_hw(x_data, out_data, in_n, in_c, in_h, in_w);
+    } else {
+      LOG(FATAL) << "invalid dim combination!";
+    }
+  }
+}
+
+void test_reduce_mean(const std::vector<int64_t>& input_shape,
+                      std::vector<int> dim,
+                      bool keep_dim) {
+  // prepare input&output variables
+  Scope scope;
+  std::string x_var_name("x");
+  std::string out_var_name("out");
+  std::string out_ref_var_name("out_ref");
+  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
+  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
+  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
+  x->Resize(input_shape);
+
+  // initialize input&output data
+  FillTensor<float>(x);
+
+  // initialize op desc
+  cpp::OpDesc opdesc;
+  opdesc.SetType("reduce_mean");
+  opdesc.SetInput("X", {x_var_name});
+  opdesc.SetOutput("Out", {out_var_name});
+  opdesc.SetAttr("dim", dim);
+  opdesc.SetAttr("keep_dim", keep_dim);
+
+  // create and convert op to NPU model, then run it on NPU
+  auto op = CreateOp<operators::ReduceMeanOp>(opdesc, &scope);
+  LauchOp(op, {x_var_name}, {out_var_name});
+
+  // execute reference implementation and save to output tensor
+  reduce_mean_ref(op);
+
+  // compare results
+  auto* out_data = out->mutable_data<float>();
+  auto* out_ref_data = out_ref->mutable_data<float>();
+  for (int i = 0; i < out->dims().production(); i++) {
+    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
+  }
+}
+
+TEST(NPUBridges, reduce_mean) {
+  std::vector<std::vector<int>> reduce_dim{
+      {0}, {1}, {2}, {3}, {0, 1}, {1, 2}, {2, 3}, {-2, -1}};
+  for (auto dim : reduce_dim) {
+    for (auto keep_dim : {true, false}) {
+      test_reduce_mean({1, 2, 3, 4}, dim, keep_dim);
+    }
+  }
+}
+
+}  // namespace bridges
+}  // namespace npu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_OP(reduce_mean);
+USE_NPU_BRIDGE(reduce_mean);
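
A minimal standalone sketch of the arithmetic the converter relies on, not part of the patch: reduce_mean over a set of axes equals reduce_sum over those axes multiplied by 1/prod(reduced extents), which is the constant the bridge feeds to the Scale node's filter input. The helper names below (compute_scale_factor, reduced_shape) are illustrative only and do not exist in the patch.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // 1 / product of the reduced extents -- the value broadcast through the
    // Scale node's constant filter (e.g. 1/12 for x_dims {1, 2, 3, 4}, dim {2, 3}).
    float compute_scale_factor(const std::vector<int64_t>& x_dims,
                               const std::vector<int>& dim) {
      float scale = 1.0f;
      for (int d : dim) scale /= x_dims[d];
      return scale;
    }

    // Output shape under the keep_dim attribute: reduced axes collapse to 1 when
    // keep_dim is true and are erased otherwise, mirroring the kDelFlag sentinel
    // trick used in both the converter and reduce_mean_ref above.
    std::vector<int64_t> reduced_shape(std::vector<int64_t> shape,
                                       const std::vector<int>& dim,
                                       bool keep_dim) {
      const int64_t kDelFlag = -2;
      for (int d : dim) shape[d] = keep_dim ? 1 : kDelFlag;
      shape.erase(std::remove(shape.begin(), shape.end(), kDelFlag), shape.end());
      return shape;
    }

For the test's {1, 2, 3, 4} input with dim {2, 3}, this gives a scale of 1/12 and an output shape of {1, 2} (or {1, 2, 1, 1} with keep_dim set).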
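The {-2, -1} case in the test only works because both the converter and reduce_mean_ref wrap negative axes by the tensor rank before sorting, making it equivalent to {2, 3}. A sketch of that normalization step (normalize_axes is an illustrative name, assuming a non-negative rank):

    #include <algorithm>
    #include <vector>

    // Wrap negative axes by the tensor rank, then sort ascending,
    // mirroring the loop at the top of ReduceMeanConverter.
    std::vector<int> normalize_axes(std::vector<int> dim, int rank) {
      for (int& d : dim) {
        if (d < 0) d += rank;  // e.g. -1 -> 3 for a 4-D tensor
      }
      std::sort(dim.begin(), dim.end());
      return dim;
    }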