Commit 1ea8d0ff authored by zhupengyang, committed by GitHub

[NPU] add reduce_mean op bridge and unit test (#2522)

* [NPU] add reduce_mean op bridge and unit test

test=develop

* refine xpu_pass place order; add missing USE_NPU_BRIDGE declarations

test=develop
Parent cd7bdfcf
@@ -160,8 +160,8 @@ TEST(NPUSubgraph, compare) {
   TestModel(FLAGS_model_dir,
             FLAGS_model_file,
             FLAGS_params_file,
-            {lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
-             lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}},
+            {lite_api::Place{TARGET(kNPU), PRECISION(kFloat)},
+             lite_api::Place{TARGET(kARM), PRECISION(kFloat)}},
             input_tensor_shape,
             FLAGS_optimized_model_dir + "/NPU");
   // verify results
@@ -21,6 +21,7 @@ lite_cc_library(npu_bridge_shuffle_channel_op SRCS shuffle_channel_op.cc DEPS ${npu_bridge_deps})
 lite_cc_library(npu_bridge_pad2d_op SRCS pad2d_op.cc DEPS ${npu_bridge_deps})
 lite_cc_library(npu_bridge_square_op SRCS square_op.cc DEPS ${npu_bridge_deps})
 lite_cc_library(npu_bridge_sqrt_op SRCS sqrt_op.cc DEPS ${npu_bridge_deps})
+lite_cc_library(npu_bridge_reduce_mean_op SRCS reduce_mean_op.cc DEPS ${npu_bridge_deps})

 set(npu_bridges
         npu_bridge_registry
@@ -43,6 +44,7 @@ set(npu_bridges
         npu_bridge_pad2d_op
         npu_bridge_square_op
         npu_bridge_sqrt_op
+        npu_bridge_reduce_mean_op
         CACHE INTERNAL "npu_bridges")

 set(npu_bridge_test_deps ${npu_bridges} ${npu_kernels} ${ops})
@@ -66,5 +68,6 @@ lite_cc_test(test_npu_bridge_shuffle_channel_op SRCS shuffle_channel_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
 lite_cc_test(test_npu_bridge_pad2d_op SRCS pad2d_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
 lite_cc_test(test_npu_bridge_square_op SRCS square_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
 lite_cc_test(test_npu_bridge_sqrt_op SRCS sqrt_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
+lite_cc_test(test_npu_bridge_reduce_mean_op SRCS reduce_mean_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})

 message(STATUS "+++++ npu_bridges: ${npu_bridges}")
@@ -24,6 +24,9 @@ USE_NPU_BRIDGE(pool2d);
 USE_NPU_BRIDGE(relu);
 USE_NPU_BRIDGE(elementwise_add);
 USE_NPU_BRIDGE(fusion_elementwise_add_activation);
+USE_NPU_BRIDGE(elementwise_sub);
+USE_NPU_BRIDGE(elementwise_mul);
+USE_NPU_BRIDGE(elementwise_div);
 USE_NPU_BRIDGE(scale);
 USE_NPU_BRIDGE(softmax);
 USE_NPU_BRIDGE(concat);
@@ -36,3 +39,8 @@ USE_NPU_BRIDGE(bilinear_interp);
 USE_NPU_BRIDGE(conv2d_transpose);
 USE_NPU_BRIDGE(reshape);
 USE_NPU_BRIDGE(reshape2);
+USE_NPU_BRIDGE(sqrt);
+USE_NPU_BRIDGE(square);
+USE_NPU_BRIDGE(reduce_mean);
+USE_NPU_BRIDGE(tanh);
+USE_NPU_BRIDGE(nearest_interp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/npu/builder.h"
#include "lite/kernels/npu/bridges/registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace npu {
namespace bridges {

node_map_type ReduceMeanConverter(
    const std::shared_ptr<lite::OpLite> reduce_mean_op,
    const node_map_type& inputs_map) {
  auto scope = reduce_mean_op->scope();
  auto op_info = reduce_mean_op->op_info();
  auto op_type = op_info->Type();
  auto unique_op_type = lite::npu::UniqueName(op_type);
  LOG(INFO) << "[NPU] Converting " + op_type + "...";

  // Get the input and the op attributes.
  auto x_var_name = op_info->Input("X").front();
  auto x_dims = scope->FindTensor(x_var_name)->dims();
  auto keep_dim = op_info->GetAttr<bool>("keep_dim");
  auto dim = op_info->GetAttr<std::vector<int>>("dim");
  CHECK(!dim.empty()) << "\"dim\" of reduce_mean should not be empty.";
  for (size_t i = 0; i < dim.size(); i++) {
    if (dim[i] < 0) {
      dim[i] += x_dims.size();
    }
  }
  std::sort(dim.begin(), dim.end());

  // Build reduce_mean as reduce_sum + scale, taking the input node from
  // inputs_map.
  // Create the reduce_sum node.
  auto unique_reduce_sum = lite::npu::UniqueName("reduce_sum");
  auto reduce_sum_node =
      std::make_shared<ge::op::ReduceSum>(unique_reduce_sum);
  CHECK(inputs_map.count(x_var_name));
  reduce_sum_node->set_input_x(*inputs_map.at(x_var_name));
  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
  lite::npu::OpList::Global().add(reduce_sum_node);

  auto dim_const_node =
      std::make_shared<ge::op::Const>(unique_reduce_sum + "/dim");
  dim_const_node->set_attr_value(lite::npu::CreateTensorAndFillData<int>(dim));
  reduce_sum_node->set_input_w(*dim_const_node);
  lite::npu::OpList::Global().add(dim_const_node);
  reduce_sum_node->set_attr_keep_dims(keep_dim);

  // Create the scale node, which divides the sum by the number of reduced
  // elements.
  auto unique_scale = lite::npu::UniqueName("scale");
  auto scale_node = std::make_shared<ge::op::Scale>(unique_scale);
  scale_node->set_input_x(*reduce_sum_node);
  lite::npu::OpList::Global().add(scale_node);

  float scale = 1;
  for (size_t i = 0; i < dim.size(); i++) {
    scale /= x_dims[dim[i]];
  }

  // Shape of the scale filter: reduced axes become 1 (keep_dim) or are
  // dropped.
  std::vector<int64_t> scale_bias_shape = x_dims.Vectorize();
  if (keep_dim) {
    for (size_t i = 0; i < dim.size(); i++) {
      scale_bias_shape[dim[i]] = 1;
    }
  } else {
    const int64_t kDelFlag = -2;
    for (size_t i = 0; i < dim.size(); ++i) {
      scale_bias_shape[dim[i]] = kDelFlag;
    }
    scale_bias_shape.erase(
        remove(scale_bias_shape.begin(), scale_bias_shape.end(), kDelFlag),
        scale_bias_shape.end());
  }

  auto filter_const_node =
      std::make_shared<ge::op::Const>(unique_scale + "/filter");
  filter_const_node->set_attr_value(
      lite::npu::CreateTensorAndFillData(scale, scale_bias_shape));
  scale_node->set_input_filter(*filter_const_node);
  lite::npu::OpList::Global().add(filter_const_node);
  scale_node->set_attr_axis(1);

  node_map_type outputs_map;
  outputs_map[op_info->Output("Out").front()] = scale_node;
  return outputs_map;
}

}  // namespace bridges
}  // namespace npu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

REGISTER_NPU_BRIDGE(reduce_mean,
                    paddle::lite::kernels::npu::bridges::ReduceMeanConverter);
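
For reference, the following standalone sketch (not part of this diff) checks the decomposition the bridge builds on the NPU graph: a reduce_sum over the selected axes followed by a scale of 1 / (product of the reduced extents). All names below are illustrative; the example reduces over the H and W axes of a small NCHW buffer.

// Standalone sketch, not part of the commit: reduce_mean as reduce_sum * scale.
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

int main() {
  const int N = 1, C = 2, H = 3, W = 4;
  std::vector<float> x(N * C * H * W);
  for (std::size_t i = 0; i < x.size(); ++i) {
    x[i] = 0.1f * static_cast<float>(i);
  }

  // reduce_sum over H and W with keep_dim = false -> output shape {N, C}.
  std::vector<float> sum(N * C, 0.0f);
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w)
          sum[n * C + c] += x[((n * C + c) * H + h) * W + w];

  // The constant fed to the scale step: 1 / prod(reduced extents).
  const float scale = 1.0f / static_cast<float>(H * W);

  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      // Mean computed directly, for comparison.
      float mean = 0.0f;
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w)
          mean += x[((n * C + c) * H + h) * W + w] / static_cast<float>(H * W);
      assert(std::fabs(sum[n * C + c] * scale - mean) < 1e-5f);
    }
  }
  return 0;
}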
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/reduce_mean_op.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/test_helper.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace npu {
namespace bridges {
// Reference: mean over the N axis of an NCHW tensor.
void reduce_mean_n(const float* src,
                   float* dst,
                   int num_in,
                   int channel_in,
                   int height_in,
                   int width_in) {
  int hw_size = height_in * width_in;
  int chw_size = channel_in * hw_size;
  int data_index, src_index;
  for (int c = 0; c < channel_in; ++c) {
    for (int h = 0; h < height_in; ++h) {
      for (int w = 0; w < width_in; ++w) {
        data_index = c * hw_size + h * width_in + w;
        dst[data_index] = 0.0;
        for (int n = 0; n < num_in; ++n) {
          src_index = n * chw_size + data_index;
          dst[data_index] += static_cast<float>(src[src_index]) / num_in;
        }
      }
    }
  }
}

// Reference: mean over the C axis.
void reduce_mean_c(const float* src,
                   float* dst,
                   int num_in,
                   int channel_in,
                   int height_in,
                   int width_in) {
  int hw_size = height_in * width_in;
  int chw_size = hw_size * channel_in;
  int data_index, src_index0, src_index;
  for (int n = 0; n < num_in; ++n) {
    for (int h = 0; h < height_in; ++h) {
      for (int w = 0; w < width_in; ++w) {
        data_index = n * hw_size + h * width_in + w;
        src_index0 = n * chw_size + h * width_in + w;
        dst[data_index] = 0.0;
        for (int c = 0; c < channel_in; ++c) {
          src_index = src_index0 + c * hw_size;
          dst[data_index] += static_cast<float>(src[src_index]) / channel_in;
        }
      }
    }
  }
}

// Reference: mean over the H axis.
void reduce_mean_h(const float* src,
                   float* dst,
                   int num_in,
                   int channel_in,
                   int height_in,
                   int width_in) {
  int cw_size = channel_in * width_in;
  int chw_size = cw_size * height_in;
  int hw_size = height_in * width_in;
  int data_index, src_index, src_index0;
  for (int n = 0; n < num_in; ++n) {
    for (int c = 0; c < channel_in; ++c) {
      for (int w = 0; w < width_in; ++w) {
        data_index = n * cw_size + c * width_in + w;
        src_index0 = n * chw_size + c * hw_size + w;
        dst[data_index] = 0.0;
        for (int h = 0; h < height_in; ++h) {
          src_index = src_index0 + h * width_in;
          dst[data_index] += static_cast<float>(src[src_index]) / height_in;
        }
      }
    }
  }
}

// Reference: mean over the W axis.
void reduce_mean_w(const float* src,
                   float* dst,
                   int num_in,
                   int channel_in,
                   int height_in,
                   int width_in) {
  int ch_size = channel_in * height_in;
  int hw_size = height_in * width_in;
  int chw_size = ch_size * width_in;
  int data_index = 0;
  int src_index0 = 0;
  int src_index = 0;
  for (int n = 0; n < num_in; ++n) {
    for (int c = 0; c < channel_in; ++c) {
      for (int h = 0; h < height_in; ++h) {
        data_index = n * ch_size + c * height_in + h;
        src_index0 = n * chw_size + c * hw_size + h * width_in;
        dst[data_index] = 0.0;
        for (int w = 0; w < width_in; ++w) {
          src_index = src_index0 + w;
          dst[data_index] += static_cast<float>(src[src_index]) / width_in;
        }
      }
    }
  }
}

// Reference: mean over all axes, accumulated into a single scalar.
void reduce_mean_all(const float* src,
                     float* dst,
                     int num_in,
                     int channel_in,
                     int height_in,
                     int width_in) {
  float mean = 0.0;
  int src_index;
  int n_id, c_id;
  int all = num_in * channel_in * height_in * width_in;
  for (int n = 0; n < num_in; ++n) {
    n_id = n * channel_in * height_in * width_in;
    for (int c = 0; c < channel_in; ++c) {
      c_id = c * height_in * width_in;
      for (int h = 0; h < height_in; ++h) {
        for (int w = 0; w < width_in; ++w) {
          src_index = n_id + c_id + h * width_in + w;
          mean += src[src_index] / all;
        }
      }
    }
  }
  dst[0] = mean;
}

// Mean over the N and C axes: reduce N first, then C.
void reduce_mean_nc(const float* src,
                    float* dst,
                    int num_in,
                    int channel_in,
                    int height_in,
                    int width_in) {
  DDimLite ddimA({1, channel_in, height_in, width_in});
  lite::Tensor tensor_tmp;
  tensor_tmp.Resize(ddimA);
  float* tmp_out = tensor_tmp.mutable_data<float>();
  reduce_mean_n(src, tmp_out, num_in, channel_in, height_in, width_in);
  reduce_mean_c(tmp_out, dst, 1, channel_in, height_in, width_in);
}

// Mean over the C and H axes: reduce C first, then H.
void reduce_mean_ch(const float* src,
                    float* dst,
                    int num_in,
                    int channel_in,
                    int height_in,
                    int width_in) {
  DDimLite ddimA({num_in, 1, height_in, width_in});
  lite::Tensor tensor_tmp;
  tensor_tmp.Resize(ddimA);
  float* tmp_out = tensor_tmp.mutable_data<float>();
  reduce_mean_c(src, tmp_out, num_in, channel_in, height_in, width_in);
  reduce_mean_h(tmp_out, dst, num_in, 1, height_in, width_in);
}

// Mean over the H and W axes: reduce H first, then W.
void reduce_mean_hw(const float* src,
                    float* dst,
                    int num_in,
                    int channel_in,
                    int height_in,
                    int width_in) {
  DDimLite ddimA({num_in, channel_in, 1, width_in});
  lite::Tensor tensor_tmp;
  tensor_tmp.Resize(ddimA);
  float* tmp_out = tensor_tmp.mutable_data<float>();
  reduce_mean_h(src, tmp_out, num_in, channel_in, height_in, width_in);
  reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in);
}
// Computes the reference result on the host and writes it to the "out_ref"
// tensor, dispatching to the single- and double-axis helpers above.
void reduce_mean_ref(const std::shared_ptr<operators::ReduceMeanOp> op) {
  Scope* scope = op->scope();
  const OpInfo* op_info = op->op_info();
  auto x = scope->FindTensor("x");
  auto x_dims = x->dims();
  auto x_data = x->data<float>();
  auto out = scope->FindMutableTensor("out_ref");
  auto dim = op_info->GetAttr<std::vector<int>>("dim");
  auto keep_dim = op_info->GetAttr<bool>("keep_dim");

  auto x_rank = x_dims.size();
  if (!dim.empty()) {
    for (size_t i = 0; i < dim.size(); i++) {
      if (dim[i] < 0) {
        dim[i] += x_rank;
      }
    }
  }
  bool reduce_all = false;
  sort(dim.begin(), dim.end());
  if (dim.size() == 0) {
    reduce_all = true;
  }

  // Infer the output shape from "dim" and "keep_dim".
  std::vector<int64_t> out_dims;
  if (reduce_all) {
    if (keep_dim) {
      for (size_t i = 0; i < x_dims.size(); i++) {
        out_dims.push_back(1);
      }
    } else {
      out_dims.push_back(1);
    }
  } else {
    for (int i = 0; i < x_dims.size(); i++) {
      out_dims.push_back(x_dims[i]);
    }
    if (keep_dim) {
      for (size_t i = 0; i < dim.size(); ++i) {
        out_dims[dim[i]] = 1L;
      }
    } else {
      int64_t kDelFlag = -2;
      for (size_t i = 0; i < dim.size(); ++i) {
        out_dims[dim[i]] = kDelFlag;
      }
      out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag),
                     out_dims.end());
    }
    out->Resize(DDim(out_dims));
  }

  auto out_data = out->mutable_data<float>();
  int in_n = x_dims[0];
  int in_c = x_dims[1];
  int in_h = x_dims[2];
  int in_w = x_dims[3];

  if (dim.size() == 0) {
    reduce_mean_all(x_data, out_data, in_n, in_c, in_h, in_w);
  } else if (dim.size() == 1) {
    switch (dim[0]) {
      case 0:
        reduce_mean_n(x_data, out_data, in_n, in_c, in_h, in_w);
        break;
      case 1:
        reduce_mean_c(x_data, out_data, in_n, in_c, in_h, in_w);
        break;
      case 2:
        reduce_mean_h(x_data, out_data, in_n, in_c, in_h, in_w);
        break;
      case 3:
        reduce_mean_w(x_data, out_data, in_n, in_c, in_h, in_w);
        break;
      default:
        LOG(FATAL) << "unsupported reduce dim: " << dim[0];
    }
  } else if (dim.size() == 2) {
    if (dim[0] == 0 && dim[1] == 1) {
      reduce_mean_nc(x_data, out_data, in_n, in_c, in_h, in_w);
    } else if (dim[0] == 1 && dim[1] == 2) {
      reduce_mean_ch(x_data, out_data, in_n, in_c, in_h, in_w);
    } else if (dim[0] == 2 && dim[1] == 3) {
      reduce_mean_hw(x_data, out_data, in_n, in_c, in_h, in_w);
    } else {
      LOG(FATAL) << "unsupported dim combination";
    }
  }
}
void test_reduce_mean(const std::vector<int64_t>& input_shape,
                      std::vector<int> dim,
                      bool keep_dim) {
  // prepare input&output variables
  Scope scope;
  std::string x_var_name("x");
  std::string out_var_name("out");
  std::string out_ref_var_name("out_ref");
  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
  x->Resize(input_shape);

  // initialize input&output data
  FillTensor<float>(x);

  // initialize op desc
  cpp::OpDesc opdesc;
  opdesc.SetType("reduce_mean");
  opdesc.SetInput("X", {x_var_name});
  opdesc.SetOutput("Out", {out_var_name});
  opdesc.SetAttr("dim", dim);
  opdesc.SetAttr("keep_dim", keep_dim);

  // create and convert op to NPU model, then run it on NPU
  auto op = CreateOp<operators::ReduceMeanOp>(opdesc, &scope);
  LauchOp(op, {x_var_name}, {out_var_name});

  // execute reference implementation and save to output tensor
  reduce_mean_ref(op);

  // compare results
  auto* out_data = out->mutable_data<float>();
  auto* out_ref_data = out_ref->mutable_data<float>();
  for (int i = 0; i < out->dims().production(); i++) {
    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
  }
}

TEST(NPUBridges, reduce_mean) {
  std::vector<std::vector<int>> reduce_dim{
      {0}, {1}, {2}, {3}, {0, 1}, {1, 2}, {2, 3}, {-2, -1}};
  for (auto dim : reduce_dim) {
    for (auto keep_dim : {true, false}) {
      test_reduce_mean({1, 2, 3, 4}, dim, keep_dim);
    }
  }
}
} // namespace bridges
} // namespace npu
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_OP(reduce_mean);
USE_NPU_BRIDGE(reduce_mean);
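
For reference, a standalone sketch (not part of this diff) of the output-shape rule that reduce_mean_ref applies: reduced axes become 1 when keep_dim is true and are dropped otherwise, so a {1, 2, 3, 4} input reduced over {2, 3} yields {1, 2, 1, 1} or {1, 2}. The ReducedShape helper below is hypothetical and only for illustration.

// Standalone sketch, not part of the commit: reduce_mean output-shape rule.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical helper mirroring the shape inference in reduce_mean_ref.
std::vector<int64_t> ReducedShape(std::vector<int64_t> shape,
                                  std::vector<int> dim,
                                  bool keep_dim) {
  for (auto& d : dim) {
    if (d < 0) d += static_cast<int>(shape.size());
  }
  const int64_t kDelFlag = -2;
  for (int d : dim) shape[d] = keep_dim ? 1 : kDelFlag;
  if (!keep_dim) {
    shape.erase(std::remove(shape.begin(), shape.end(), kDelFlag), shape.end());
  }
  return shape;
}

int main() {
  const std::vector<int64_t> in{1, 2, 3, 4};
  assert((ReducedShape(in, {2, 3}, true) == std::vector<int64_t>{1, 2, 1, 1}));
  assert((ReducedShape(in, {2, 3}, false) == std::vector<int64_t>{1, 2}));
  assert((ReducedShape(in, {-2, -1}, false) == std::vector<int64_t>{1, 2}));
  return 0;
}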