Unverified commit ae3ebea5, authored by cc and committed by GitHub

Fix gather and concat, add abs op, test=develop (#3395)

Parent 4a7284f9
......@@ -187,6 +187,10 @@ void Run(const std::vector<int64_t>& input_shape,
}
LOG(INFO) << "max_value:" << max_value;
LOG(INFO) << "max_index:" << max_index;
LOG(INFO) << "output data[0:10]:";
for (int i = 0; i < 10; i++) {
LOG(INFO) << out_data[i];
}
}
}
#endif
......@@ -198,32 +202,33 @@ void print_usage() {
std::string help_info =
"Usage: \n"
"./benchmark_bin \n"
" --optimized_model_path (the path of the model that is optimized\n"
" by opt.) type: string \n"
" --model_dir (the path of the model that is not optimized by opt,\n"
" --optimized_model_path (The path of the model that is optimized\n"
" by opt. If the model is optimized, please set the param.) \n"
" type: string \n"
" --model_dir (The path of the model that is not optimized by opt,\n"
" the model and param files is under model_dir.) type: string \n"
" --model_filename (the filename of model file. When the model is\n "
" --model_filename (The filename of model file. When the model is\n "
" combined formate, please set model_file. Otherwise, it is not\n"
" necessary to set it.) type: string \n"
" --param_filename (the filename of param file, set param_file when\n"
" --param_filename (The filename of param file, set param_file when\n"
" the model is combined formate. Otherwise, it is not necessary\n"
" to set it.) type: string \n"
" --input_shape (set input shapes according to the model, separated by\n"
" --input_shape (Tet input shapes according to the model, separated by\n"
" colon and comma, such as 1,3,244,244) type: string\n"
" default: 1,3,224,224 \n"
" --input_img_path (the path of input image, if not set\n"
" --input_img_path (The path of input image, if not set\n"
" input_img_path, the input will be 1.0.) type: string \n "
" --power_mode (arm power mode: 0 for big cluster, 1 for little\n"
" --power_mode (Arm power mode: 0 for big cluster, 1 for little\n"
" cluster, 2 for all cores, 3 for no bind) type: int32 default: 3\n"
" --repeats (repeats times) type: int32 default: 1 \n"
" --result_filename (save the inference time to the file.) type: \n"
" --repeats (Repeats times) type: int32 default: 1 \n"
" --result_filename (Save the inference time to the file.) type: \n"
" string default: result.txt \n"
" --threads (threads num) type: int32 default: 1 \n"
" --warmup (warmup times) type: int32 default: 0 \n"
" --threads (Threads num) type: int32 default: 1 \n"
" --warmup (Warmup times) type: int32 default: 0 \n"
"Note that: \n"
" If load the optimized model, set optimized_model_path, or set\n"
" model_dir, model_filename and param_filename according to the\n"
" model. \n";
" If load the optimized model, set optimized_model_path. Otherwise, \n"
" set model_dir, model_filename and param_filename according to \n"
" the model. \n";
LOG(INFO) << help_info;
}
......
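For reference, a hypothetical invocation of benchmark_bin built from the flags documented above (the model path and values are placeholders, not taken from this patch):

./benchmark_bin \
    --optimized_model_path=/path/to/model_opt.nb \
    --input_shape=1,3,224,224 \
    --power_mode=3 --threads=1 --warmup=10 --repeats=100 \
    --result_filename=result.txt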
......@@ -295,6 +295,10 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kNCHW));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kInt64), DATALAYOUT(kNCHW));
// Analyze whether the model is quantized.
// For quantized model, add place(arm, int8) to inner_places
......
......@@ -744,6 +744,15 @@ void act_reciprocal<float>(const float* din,
}
}
template <>
void act_abs<float>(const float* din, float* dout, int size, int threads) {
for (int i = 0; i < size; ++i) {
dout[0] = (din[0] > 0 ? din[0] : -din[0]);
din++;
dout++;
}
}
#ifdef LITE_WITH_TRAIN
template <>
void act_square_grad(const float* din,
......
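A minimal sketch (not part of this patch) of calling the new act_abs<float> specialization directly; the include path is assumed, and the buffers are illustrative:

#include <vector>
// Assumed location of the act_* declarations touched by this patch.
#include "lite/backends/arm/math/activation.h"

void abs_example() {
  std::vector<float> din = {-1.5f, 0.f, 2.f, -3.f};
  std::vector<float> dout(din.size());
  // The last argument is the thread count; this specialization does not use it.
  paddle::lite::arm::math::act_abs<float>(
      din.data(), dout.data(), static_cast<int>(din.size()), 1);
  // dout now holds {1.5f, 0.f, 2.f, 3.f}.
}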
......@@ -83,6 +83,9 @@ void act_hard_swish(const T* din,
template <typename T>
void act_reciprocal(const T* din, T* dout, int size, int threads);
template <typename T>
void act_abs(const T* din, T* dout, int size, int threads);
#ifdef LITE_WITH_TRAIN
template <typename T>
void act_square_grad(
......
......@@ -16,46 +16,3 @@
#include <algorithm>
#include <limits>
#include <memory>
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void concat_func(const std::vector<lite::Tensor *> &input,
const int axis,
lite::Tensor *output) {
int64_t concat_input_size = 1;
int64_t num_cancats = 1;
auto dim_0 = input[0]->dims();
size_t num = input.size();
for (int i = axis + 1; i < dim_0.size(); i++) {
concat_input_size *= dim_0[i];
}
for (int i = 0; i < axis; i++) {
num_cancats *= dim_0[i];
}
float *dst_ptr = output->mutable_data<float>();
const int out_concat_axis = output->dims()[axis];
int64_t offset_concat_axis = 0;
int64_t out_sum = out_concat_axis * concat_input_size;
for (int n = 0; n < num; n++) {
auto dims = input[n]->dims();
const float *src_ptr = input[n]->data<float>();
int64_t in_concat_axis = dims[axis];
float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
int64_t in_sum = in_concat_axis * concat_input_size;
for (int i = 0; i < num_cancats; i++) {
std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum);
dout_ptr += out_sum;
src_ptr += in_sum;
}
offset_concat_axis += in_concat_axis;
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
......@@ -25,9 +25,39 @@ namespace lite {
namespace arm {
namespace math {
void concat_func(const std::vector<lite::Tensor *> &input,
template <typename T>
void concat_func(const std::vector<lite::Tensor*>& input,
const int axis,
lite::Tensor *output);
lite::Tensor* output) {
size_t num = input.size();
auto dim_0 = input[0]->dims();
int64_t concat_input_size = 1;
int64_t num_cancats = 1;
for (int i = axis + 1; i < dim_0.size(); i++) {
concat_input_size *= dim_0[i];
}
for (int i = 0; i < axis; i++) {
num_cancats *= dim_0[i];
}
auto* dst_ptr = output->mutable_data<T>();
const int out_concat_axis = output->dims()[axis];
int64_t offset_concat_axis = 0;
int64_t out_sum = out_concat_axis * concat_input_size;
for (int n = 0; n < num; n++) {
auto dims = input[n]->dims();
auto* src_ptr = input[n]->data<T>();
int64_t in_concat_axis = dims[axis];
auto* dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
int64_t in_sum = in_concat_axis * concat_input_size;
for (int i = 0; i < num_cancats; i++) {
std::memcpy(dout_ptr, src_ptr, sizeof(T) * in_sum);
dout_ptr += out_sum;
src_ptr += in_sum;
}
offset_concat_axis += in_concat_axis;
}
}
} // namespace math
} // namespace arm
......
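To make the interleaved copy in the templated concat_func easier to follow, here is a standalone sketch of the same pattern on raw float arrays, independent of lite::Tensor (shapes and values are illustrative):

#include <cstring>

// Concatenate two 2x3 blocks along axis 1 into one 2x6 block (row-major).
void concat_axis1_example() {
  const float a[6] = {0, 1, 2, 3, 4, 5};
  const float b[6] = {6, 7, 8, 9, 10, 11};
  float out[12];
  const float* inputs[2] = {a, b};
  const int num_concats = 2;        // product of dims before the axis
  const int concat_input_size = 1;  // product of dims after the axis
  const int in_concat_axis = 3;     // each input's extent on the axis
  const int out_sum = 6 * concat_input_size;  // output step per concat slice
  int offset_concat_axis = 0;
  for (int n = 0; n < 2; n++) {
    const float* src = inputs[n];
    float* dst = out + offset_concat_axis * concat_input_size;
    const int in_sum = in_concat_axis * concat_input_size;
    for (int i = 0; i < num_concats; i++) {
      std::memcpy(dst, src, sizeof(float) * in_sum);
      dst += out_sum;
      src += in_sum;
    }
    offset_concat_axis += in_concat_axis;
  }
  // out is now {0,1,2,6,7,8, 3,4,5,9,10,11}, i.e. rows {0,1,2,6,7,8} and
  // {3,4,5,9,10,11}.
}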
......@@ -207,6 +207,16 @@ void ReciprocalCompute::Run() {
x_data, output_data, x_dims.production(), ctx.threads());
}
void AbsCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
auto x_dims = param.X->dims();
auto x_data = param.X->data<float>();
auto output_data = param.Out->mutable_data<float>();
lite::arm::math::act_abs<float>(
x_data, output_data, x_dims.production(), ctx.threads());
}
} // namespace arm
} // namespace kernels
} // namespace lite
......@@ -321,3 +331,8 @@ REGISTER_LITE_KERNEL(reciprocal,
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(
abs, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AbsCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
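A linked test or demo would typically pull this kernel in with the USE_LITE_KERNEL macro, mirroring how the concat test at the end of this diff references its kernel (a sketch, not part of the patch):

USE_LITE_KERNEL(abs, kARM, kFloat, kNCHW, def);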
......@@ -166,6 +166,15 @@ class ReciprocalCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
virtual ~ReciprocalCompute() = default;
};
class AbsCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~AbsCompute() = default;
};
} // namespace arm
} // namespace kernels
} // namespace lite
......
......@@ -34,40 +34,21 @@ std::vector<size_t> stride_numel(const DDim& ddim) {
return strides;
}
void ConcatCompute::Run() {
auto& param = Param<operators::ConcatParam>();
std::vector<lite::Tensor*> inputs = param.x;
auto* out = param.output;
int axis = param.axis;
auto* axis_tensor = param.axis_tensor;
if (axis_tensor != nullptr) {
auto* axis_tensor_data = axis_tensor->data<int>();
axis = axis_tensor_data[0];
}
out->mutable_data<float>();
/// Sometimes direct copies will be faster, this maybe need deeply analysis.
template <typename T>
void ConcatFunc(const std::vector<lite::Tensor*> inputs,
int axis,
lite::Tensor* out) {
// Sometimes a direct copy is faster; this may need deeper analysis.
if (axis == 0 && inputs.size() < 10) {
size_t output_offset = 0;
for (auto* in : inputs) {
auto in_stride = stride_numel(in->dims());
auto out_stride = stride_numel(out->dims());
void* dst = out->mutable_data<float>() + output_offset;
const void* src = in->data<float>();
#if 0
LOG(INFO) << "out_stride.size():" << out_stride.size();
LOG(INFO) << "out_stride[0]" << out_stride[0];
for (int i=0; i < out_stride.size(); ++i) {
LOG(INFO) << "out_stride[" << i << "]:" << out_stride[i];
}
LOG(INFO) << "in_stride.size():" << in_stride.size();
for (int i=0; i < in_stride.size(); ++i) {
LOG(INFO) << "in_stride[" << i << "]:" << in_stride[i];
}
#endif
void* dst = out->mutable_data<T>() + output_offset;
const void* src = in->data<T>();
// src and dst tensor should have the same dims size.
CHECK(in_stride.size() == out_stride.size());
std::memcpy(dst, src, sizeof(float) * in_stride[0]);
std::memcpy(dst, src, sizeof(T) * in_stride[0]);
output_offset += in_stride[0];
}
} else {
......@@ -75,9 +56,37 @@ void ConcatCompute::Run() {
for (int j = 0; j < inputs.size(); ++j) {
inputs_concat[j] = inputs[j];
}
lite::arm::math::concat_func(inputs_concat, axis, out);
lite::arm::math::concat_func<T>(inputs_concat, axis, out);
}
}
void ConcatCompute::Run() {
auto& param = Param<operators::ConcatParam>();
std::vector<lite::Tensor*> inputs = param.x;
CHECK_GE(inputs.size(), 1);
auto* out = param.output;
int axis = param.axis;
auto* axis_tensor = param.axis_tensor;
if (axis_tensor != nullptr) {
auto* axis_tensor_data = axis_tensor->data<int>();
axis = axis_tensor_data[0];
}
switch (inputs.front()->precision()) {
case PRECISION(kFloat):
ConcatFunc<float>(inputs, axis, out);
break;
case PRECISION(kInt32):
ConcatFunc<int32_t>(inputs, axis, out);
break;
case PRECISION(kInt64):
ConcatFunc<int64_t>(inputs, axis, out);
break;
default:
LOG(FATAL) << "Concat does not implement for the "
<< "input type:"
<< static_cast<int>(inputs.front()->precision());
}
return;
}
} // namespace arm
......@@ -86,9 +95,9 @@ void ConcatCompute::Run() {
} // namespace paddle
REGISTER_LITE_KERNEL(
concat, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ConcatCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
concat, kARM, kAny, kNCHW, paddle::lite::kernels::arm::ConcatCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindInput("AxisTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();
......@@ -22,7 +22,7 @@ namespace lite {
namespace kernels {
namespace arm {
class ConcatCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
class ConcatCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
public:
using param_t = operators::ConcatParam;
......
......@@ -95,7 +95,7 @@ void concat_compute_ref(const operators::ConcatParam& param) {
TEST(concat_arm, init) {
ConcatCompute concat;
ASSERT_EQ(concat.precision(), PRECISION(kFloat));
ASSERT_EQ(concat.precision(), PRECISION(kAny));
ASSERT_EQ(concat.target(), TARGET(kARM));
}
......@@ -222,8 +222,7 @@ TEST(concat_arm, compute_input_multi) {
TEST(concat, retrive_op) {
auto concat =
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
"concat");
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kAny)>("concat");
ASSERT_FALSE(concat.empty());
ASSERT_TRUE(concat.front());
}
......@@ -233,4 +232,4 @@ TEST(concat, retrive_op) {
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kAny, kNCHW, def);
......@@ -20,24 +20,48 @@ namespace lite {
namespace kernels {
namespace arm {
void GatherCompute::Run() {
auto& param = this->Param<operators::GatherParam>();
auto* p_output = param.Out->mutable_data<float>();
auto index_size = param.Index->dims()[0];
template <typename T>
void GatherFunc(const operators::GatherParam& param) {
auto src_dims = param.X->dims();
const float* p_src = param.X->data<float>();
auto index_size = param.Index->dims()[0];
auto* p_src = param.X->data<T>();
const int* p_index = param.Index->data<int>();
auto* p_output = param.Out->mutable_data<T>();
int slice_size = 1;
for (int i = 1; i < src_dims.size(); ++i) {
for (size_t i = 1; i < src_dims.size(); ++i) {
slice_size *= src_dims[i];
}
for (int i = 0; i < index_size; ++i) {
int index_ = p_index[i];
memcpy(p_output + i * slice_size,
p_src + index_ * slice_size,
slice_size * sizeof(float));
slice_size * sizeof(T));
}
}
void GatherCompute::Run() {
auto& param = this->Param<operators::GatherParam>();
switch (param.X->precision()) {
case PRECISION(kFloat):
GatherFunc<float>(param);
break;
case PRECISION(kInt8):
GatherFunc<int8_t>(param);
break;
case PRECISION(kInt16):
GatherFunc<int16_t>(param);
break;
case PRECISION(kInt32):
GatherFunc<int32_t>(param);
break;
case PRECISION(kInt64):
GatherFunc<int64_t>(param);
break;
default:
LOG(FATAL) << "Gather does not implement for the "
<< "input type:" << static_cast<int>(param.X->precision());
}
}
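The slice copy performed by GatherFunc can be sketched on raw arrays as follows (sizes and values are illustrative, not from the patch):

#include <cstring>

// Pick rows 2 and 0 from a 3x4 row-major source; slice_size is the product
// of all dims after the first, i.e. 4 here.
void gather_example() {
  const float src[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  const int index[2] = {2, 0};
  float out[8];
  const int slice_size = 4;
  for (int i = 0; i < 2; ++i) {
    std::memcpy(out + i * slice_size, src + index[i] * slice_size,
                slice_size * sizeof(float));
  }
  // out is now {8, 9, 10, 11, 0, 1, 2, 3}.
}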
......@@ -48,8 +72,8 @@ void GatherCompute::Run() {
REGISTER_LITE_KERNEL(
gather, kARM, kAny, kNCHW, paddle::lite::kernels::arm::GatherCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindInput("Index",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();