Commit 7ad182e1 authored by N nhzlx

Cherry-pick from 16662: Anakin subgraph CPU support

Parent 8643dbc2
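
Summary of the pattern this diff applies everywhere: each Anakin op converter becomes a class template over the saber target place (::anakin::saber::NV for CUDA, ::anakin::saber::X86 for CPU), and the single REGISTER_ANAKIN_OP_CONVERTER macro is split into per-place variants. A minimal sketch, assuming the Paddle/Anakin headers touched in this diff (the Demo converter and op name are hypothetical, for illustration only):

// Converters derive from the templated base. Members of the dependent
// base class must now be qualified with this-> (hence the engine_ ->
// this->engine_ churn below), and calls to member templates through it
// need the `template` keyword, e.g.
//   this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dims", dims);
template <typename TargetT>
class DemoOpConverter : public AnakinOpConverter<TargetT> {
 public:
  void operator()(const framework::proto::OpDesc &op,
                  const framework::BlockDesc &block_desc,
                  const framework::Scope &scope, bool test_mode) override {
    framework::OpDesc op_desc(op, nullptr);
    auto input = op_desc.Input("X").front();
    auto output = op_desc.Output("Out").front();
    auto op_name = op_desc.Type() + ":" + output;
    this->engine_->AddOp(op_name, "Demo", {input}, {output});
  }
};

// Registration happens once per place; the CUDA variant is guarded so
// CPU-only builds still compile.
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(demo, DemoOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(demo, DemoOpConverter<::anakin::saber::X86>);
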
......@@ -25,8 +25,9 @@ endif()
if(ANAKIN_FOUND)
message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
include_directories(${ANAKIN_ROOT})
include_directories(${ANAKIN_ROOT}/include)
include_directories(${ANAKIN_ROOT}/include/saber)
include_directories(${ANAKIN_ROOT}/saber)
link_directories(${ANAKIN_ROOT})
add_definitions(-DPADDLE_WITH_ANAKIN)
endif()
......@@ -16,16 +16,13 @@
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
ActivationOpConverter::ActivationOpConverter(const std::string &op_type)
template <typename TargetT>
ActivationOpConverter<TargetT>::ActivationOpConverter(
const std::string &op_type)
: op_type_(op_type) {
auto it = anakin_op_types_.find(op_type_);
PADDLE_ENFORCE(it != anakin_op_types_.end(),
......@@ -33,10 +30,10 @@ ActivationOpConverter::ActivationOpConverter(const std::string &op_type)
anakin_op_type_ = it->second;
}
void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void ActivationOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
......@@ -44,13 +41,20 @@ void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
auto input_name = op_desc.Input("X").front();
auto output_name = op_desc.Output("Out").front();
engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "type", anakin_op_type_);
this->engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
this->engine_->AddOpAttr(op_name, "type", anakin_op_type_);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(sigmoid,
SigmoidOpConverter<::anakin::saber::NV>);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(sigmoid,
SigmoidOpConverter<::anakin::saber::X86>);
REGISTER_CPU_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter<::anakin::saber::X86>);
......@@ -22,7 +22,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class ActivationOpConverter : public AnakinOpConverter {
template <typename TargetT>
class ActivationOpConverter : public AnakinOpConverter<TargetT> {
public:
explicit ActivationOpConverter(const std::string &op_type);
......@@ -39,14 +40,16 @@ class ActivationOpConverter : public AnakinOpConverter {
{"sigmoid", "Sigmoid"}};
};
class TanhOpConverter : public ActivationOpConverter {
template <typename TargetT>
class TanhOpConverter : public ActivationOpConverter<TargetT> {
public:
TanhOpConverter() : ActivationOpConverter("tanh") {}
TanhOpConverter() : ActivationOpConverter<TargetT>("tanh") {}
};
class SigmoidOpConverter : public ActivationOpConverter {
template <typename TargetT>
class SigmoidOpConverter : public ActivationOpConverter<TargetT> {
public:
SigmoidOpConverter() : ActivationOpConverter("sigmoid") {}
SigmoidOpConverter() : ActivationOpConverter<TargetT>("sigmoid") {}
};
} // namespace anakin
} // namespace inference
......
......@@ -18,19 +18,16 @@
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::PTuple;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void AffineChannelOpConverter::operator()(
template <typename TargetT>
void AffineChannelOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -59,7 +56,7 @@ void AffineChannelOpConverter::operator()(
bias_tensor->Resize(bias_t->dims());
TensorCopySync((*bias_t), platform::CPUPlace(), bias_tensor.get());
engine_->AddOp(op_name, "AffineChannel", {input_name}, {output_name});
this->engine_->AddOp(op_name, "AffineChannel", {input_name}, {output_name});
// Generate the Scale parameter of Anakin.
auto scale_shape = framework::vectorize2int(scale_t->dims());
......@@ -67,7 +64,8 @@ void AffineChannelOpConverter::operator()(
scale_shape.insert(scale_shape.begin(), 1);
}
Shape anakin_scale_shape(scale_shape);
auto *weight1 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
auto *weight1 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_scale_shape);
float *scale_cpu_data =
static_cast<float *>(weight1->h_tensor().mutable_data());
......@@ -75,7 +73,7 @@ void AffineChannelOpConverter::operator()(
scale_cpu_data);
weight1->d_tensor().set_shape(anakin_scale_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
engine_->AddOpAttr(op_name, "weight_1", *weight1);
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
// Generate the Bias parameter of Anakin.
auto bias_shape = framework::vectorize2int(bias_t->dims());
......@@ -83,18 +81,24 @@ void AffineChannelOpConverter::operator()(
bias_shape.insert(bias_shape.begin(), 1);
}
Shape anakin_bias_shape(bias_shape);
auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
auto *weight2 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_bias_shape);
float *bias_cpu_data =
static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(bias_tensor->data<float>(), bias_tensor->numel(), bias_cpu_data);
weight2->d_tensor().set_shape(anakin_bias_shape);
weight2->d_tensor().copy_from(weight2->h_tensor());
engine_->AddOpAttr(op_name, "weight_2", *weight2);
this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(affine_channel, AffineChannelOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(
affine_channel, AffineChannelOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(
affine_channel, AffineChannelOpConverter<::anakin::saber::X86>);
......@@ -21,7 +21,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class AffineChannelOpConverter : public AnakinOpConverter {
template <typename TargetT>
class AffineChannelOpConverter : public AnakinOpConverter<TargetT> {
public:
AffineChannelOpConverter() = default;
......
......@@ -21,17 +21,16 @@
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void BatchNormOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1);
std::map<std::string, std::string> inputs;
......@@ -48,9 +47,9 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
auto bn_op_name = op_name + ":bn";
auto bn_output = bn_op_name + "_output";
engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));
this->engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
this->engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
this->engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));
auto scale_op_name = op_name + ":scale";
auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name,
......@@ -81,48 +80,54 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
Shape shape1(fill_shape(4, framework::vectorize2int(mean_t.dims())));
Shape shape2(fill_shape(4, framework::vectorize2int(variance_t.dims())));
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(shape1);
auto *mean_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(mean_t.data<float>(), mean_t.numel(), mean_data);
engine_->AddOpAttr(bn_op_name, "weight_1", *weight1);
this->engine_->AddOpAttr(bn_op_name, "weight_1", *weight1);
auto *weight2 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape2);
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(shape2);
auto *variance_data =
static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(variance_t.data<float>(), variance_t.numel(), variance_data);
engine_->AddOpAttr(bn_op_name, "weight_2", *weight2);
this->engine_->AddOpAttr(bn_op_name, "weight_2", *weight2);
Shape shape3(std::vector<int>({1, 1, 1, 1}));
auto *weight3 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape3);
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(shape3);
auto *alpha_data = static_cast<float *>(weight3->h_tensor().mutable_data());
float weight3_data[] = {1};
std::copy(std::begin(weight3_data), std::end(weight3_data), alpha_data);
engine_->AddOpAttr(bn_op_name, "weight_3", *weight3);
this->engine_->AddOpAttr(bn_op_name, "weight_3", *weight3);
Shape scale_shape(fill_shape(4, framework::vectorize2int(scale_t.dims())));
auto *scale =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(scale_shape);
auto *scale = GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
scale_shape);
auto *scale_data = static_cast<float *>(scale->h_tensor().mutable_data());
std::copy_n(scale_t.data<float>(), scale_t.numel(), scale_data);
Shape bias_shape(fill_shape(4, framework::vectorize2int(bias_t.dims())));
auto *bias =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(bias_shape);
auto *bias = GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
bias_shape);
auto *bias_data = static_cast<float *>(bias->h_tensor().mutable_data());
std::copy_n(bias_t.data<float>(), bias_t.numel(), bias_data);
engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output});
engine_->AddOpAttr(scale_op_name, "axis", 1);
engine_->AddOpAttr(scale_op_name, "num_axes", 1);
engine_->AddOpAttr(scale_op_name, "bias_term", true);
engine_->AddOpAttr(scale_op_name, "weight_1", *scale);
engine_->AddOpAttr(scale_op_name, "weight_2", *bias);
this->engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output});
this->engine_->AddOpAttr(scale_op_name, "axis", 1);
this->engine_->AddOpAttr(scale_op_name, "num_axes", 1);
this->engine_->AddOpAttr(scale_op_name, "bias_term", true);
this->engine_->AddOpAttr(scale_op_name, "weight_1", *scale);
this->engine_->AddOpAttr(scale_op_name, "weight_2", *bias);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(batch_norm, BatchNormOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(batch_norm,
BatchNormOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(batch_norm,
BatchNormOpConverter<::anakin::saber::X86>);
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class BatchNormOpConverter : public AnakinOpConverter {
template <typename TargetT>
class BatchNormOpConverter : public AnakinOpConverter<TargetT> {
public:
BatchNormOpConverter() = default;
......
......@@ -15,38 +15,32 @@
#include "paddle/fluid/inference/anakin/convert/concat.h"
#include <algorithm>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void ConcatOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void ConcatOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
int axis = boost::get<int>(op_desc.GetAttr("axis"));
auto input_names = op_desc.Input("X");
// PADDLE_ENFORCE(axis > 0,
// "The axis attr of Concat op should be large than 0 for trt");
auto y_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Concat", input_names, {y_name});
engine_->AddOpAttr(op_name, "axis", axis);
this->engine_->AddOp(op_name, "Concat", input_names, {y_name});
this->engine_->AddOpAttr(op_name, "axis", axis);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(concat, ConcatOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(concat,
ConcatOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(concat,
ConcatOpConverter<::anakin::saber::X86>);
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class ConcatOpConverter : public AnakinOpConverter {
template <typename TargetT>
class ConcatOpConverter : public AnakinOpConverter<TargetT> {
public:
ConcatOpConverter() = default;
......
......@@ -18,19 +18,18 @@
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::PTuple;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void Conv2dOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
......@@ -39,7 +38,7 @@ void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
auto input_name = op_desc.Input("Input").front();
auto output_name = op_desc.Output("Output").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
this->engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(filter_v);
......@@ -51,38 +50,44 @@ void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
// const int n_output = weight_tensor->dims()[0];
// const int n_input = weight_tensor->dims()[1];
const int filter_h = weight_tensor->dims()[2];
const int filter_w = weight_tensor->dims()[3];
// auto filter_num = n_input * filter_h * filter_w ;
auto filter_num = weight_tensor->dims()[0];
engine_->AddOpAttr<int>(op_name, "filter_num", filter_num);
engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
this->engine_->template AddOpAttr<int>(op_name, "filter_num", filter_num);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "kernel_size",
{filter_h, filter_w});
auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "strides", strides);
auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
engine_->AddOpAttr<PTuple<int>>(op_name, "dilation_rate", dilations);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dilation_rate",
dilations);
const int groups = boost::get<int>(op_desc.GetAttr("groups"));
engine_->AddOpAttr(op_name, "group", groups);
engine_->AddOpAttr(op_name, "axis", 1);
engine_->AddOpAttr(op_name, "bias_term", false);
this->engine_->AddOpAttr(op_name, "group", groups);
this->engine_->AddOpAttr(op_name, "axis", 1);
this->engine_->AddOpAttr(op_name, "bias_term", false);
auto weight_shape = framework::vectorize2int(filter_t->dims());
Shape anakin_shape(weight_shape);
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_shape);
float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
engine_->AddOpAttr(op_name, "weight_1", *weight1);
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(conv2d, Conv2dOpConverter);
REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d,
Conv2dOpConverter<::anakin::saber::X86>);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d,
Conv2dOpConverter<::anakin::saber::NV>);
#endif
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class Conv2dOpConverter : public AnakinOpConverter {
template <typename TargetT>
class Conv2dOpConverter : public AnakinOpConverter<TargetT> {
public:
Conv2dOpConverter() = default;
......
......@@ -18,19 +18,18 @@
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::PTuple;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void Conv2dFusionOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
......@@ -40,7 +39,7 @@ void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
auto input_name = op_desc.Input("Input").front();
auto output_name = op_desc.Output("Output").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
this->engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(filter_v);
......@@ -63,28 +62,31 @@ void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
const int filter_w = weight_tensor->dims()[3];
// auto filter_num = n_input * filter_h * filter_w ;
auto filter_num = weight_tensor->dims()[0];
engine_->AddOpAttr<int>(op_name, "filter_num", filter_num);
engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
this->engine_->template AddOpAttr<int>(op_name, "filter_num", filter_num);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "kernel_size",
{filter_h, filter_w});
auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "strides", strides);
auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
engine_->AddOpAttr<PTuple<int>>(op_name, "dilation_rate", dilations);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dilation_rate",
dilations);
const int groups = boost::get<int>(op_desc.GetAttr("groups"));
engine_->AddOpAttr(op_name, "group", groups);
engine_->AddOpAttr(op_name, "axis", 1);
engine_->AddOpAttr(op_name, "bias_term", true);
this->engine_->AddOpAttr(op_name, "group", groups);
this->engine_->AddOpAttr(op_name, "axis", 1);
this->engine_->AddOpAttr(op_name, "bias_term", true);
auto weight_shape = framework::vectorize2int(filter_t->dims());
Shape anakin_shape(weight_shape);
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_shape);
float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
engine_->AddOpAttr(op_name, "weight_1", *weight1);
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
auto bias_shape = framework::vectorize2int(b_t->dims());
framework::LoDTensor bias_tensor;
......@@ -98,17 +100,24 @@ void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
// bias_shape.push_back(1);
Shape anakin_bias_shape(bias_shape);
auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
auto *weight2 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_bias_shape);
float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
weight2->d_tensor().set_shape(anakin_bias_shape);
weight2->d_tensor().copy_from(weight2->h_tensor());
engine_->AddOpAttr(op_name, "weight_2", *weight2);
this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(conv2d_fusion, Conv2dFusionOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d_fusion,
Conv2dFusionOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d_fusion,
Conv2dFusionOpConverter<::anakin::saber::X86>);
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class Conv2dFusionOpConverter : public AnakinOpConverter {
template <typename TargetT>
class Conv2dFusionOpConverter : public AnakinOpConverter<TargetT> {
public:
Conv2dFusionOpConverter() = default;
......
......@@ -17,17 +17,14 @@
#include <map>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void DensityPriorBoxOpConverter::operator()(
template <typename TargetT>
void DensityPriorBoxOpConverter<TargetT>::operator()(
const framework::proto::OpDesc& op, const framework::BlockDesc& block_desc,
const framework::Scope& scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -81,27 +78,44 @@ void DensityPriorBoxOpConverter::operator()(
std::vector<float> temp_v = {};
engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", min_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", max_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", aspect_ratios);
engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios);
engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens);
engine_->AddOpAttr(op_name, "is_flip", is_flip);
engine_->AddOpAttr(op_name, "is_clip", is_clip);
engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
engine_->AddOpAttr(op_name, "step_h", step_h);
engine_->AddOpAttr(op_name, "step_w", step_w);
engine_->AddOpAttr(op_name, "offset", offset);
engine_->AddOpAttr<PTuple<std::string>>(op_name, "order", t_order);
this->engine_->AddOp(op_name, "PriorBox", {input_name, image_name},
{output_name});
this->engine_->template AddOpAttr<PTuple<float>>(op_name, "min_size",
min_sizes);
this->engine_->template AddOpAttr<PTuple<float>>(op_name, "max_size",
max_sizes);
this->engine_->template AddOpAttr<PTuple<float>>(op_name, "aspect_ratio",
aspect_ratios);
this->engine_->template AddOpAttr<PTuple<float>>(op_name, "fixed_size",
fixed_sizes);
this->engine_->template AddOpAttr<PTuple<float>>(op_name, "fixed_ratio",
fixed_ratios);
this->engine_->template AddOpAttr<PTuple<float>>(op_name, "density", dens);
this->engine_->AddOpAttr(op_name, "is_flip", is_flip);
this->engine_->AddOpAttr(op_name, "is_clip", is_clip);
this->engine_->template AddOpAttr<PTuple<float>>(op_name, "variance",
variances);
this->engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
this->engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
this->engine_->AddOpAttr(op_name, "step_h", step_h);
this->engine_->AddOpAttr(op_name, "step_w", step_w);
this->engine_->AddOpAttr(op_name, "offset", offset);
this->engine_->template AddOpAttr<PTuple<std::string>>(op_name, "order",
t_order);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(prior_box, DensityPriorBoxOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(
density_prior_box, DensityPriorBoxOpConverter<::anakin::saber::NV>);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(
prior_box, DensityPriorBoxOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(
density_prior_box, DensityPriorBoxOpConverter<::anakin::saber::X86>);
REGISTER_CPU_ANAKIN_OP_CONVERTER(
prior_box, DensityPriorBoxOpConverter<::anakin::saber::X86>);
......@@ -22,7 +22,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class DensityPriorBoxOpConverter : public AnakinOpConverter {
template <typename TargetT>
class DensityPriorBoxOpConverter : public AnakinOpConverter<TargetT> {
public:
DensityPriorBoxOpConverter() = default;
......
......@@ -16,19 +16,14 @@
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void DetectionOutOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
auto target_name = op_desc.Input("TargetBox").front();
auto prior_box_name = op_desc.Input("PriorBox").front();
......@@ -52,22 +47,28 @@ void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
"Not support encode_center_size code_type in DetectionOut of anakin");
}
engine_->AddOp(op_name, "DetectionOutput",
{target_name, scores_name, prior_box_name}, {output_name});
engine_->AddOpAttr(op_name, "share_location", true);
engine_->AddOpAttr(op_name, "variance_encode_in_target", false);
engine_->AddOpAttr(op_name, "class_num", static_cast<int>(0));
engine_->AddOpAttr(op_name, "background_id", background_label);
engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k);
engine_->AddOpAttr(op_name, "code_type", anakin_code_type);
engine_->AddOpAttr(op_name, "conf_thresh", score_threshold);
engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k);
engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold);
engine_->AddOpAttr(op_name, "nms_eta", nms_eta);
this->engine_->AddOp(op_name, "DetectionOutput",
{target_name, scores_name, prior_box_name},
{output_name});
this->engine_->AddOpAttr(op_name, "share_location", true);
this->engine_->AddOpAttr(op_name, "variance_encode_in_target", false);
this->engine_->AddOpAttr(op_name, "class_num", static_cast<int>(0));
this->engine_->AddOpAttr(op_name, "background_id", background_label);
this->engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k);
this->engine_->AddOpAttr(op_name, "code_type", anakin_code_type);
this->engine_->AddOpAttr(op_name, "conf_thresh", score_threshold);
this->engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k);
this->engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold);
this->engine_->AddOpAttr(op_name, "nms_eta", nms_eta);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(detection_out, DetectionOutOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(detection_out,
DetectionOutOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(detection_out,
DetectionOutOpConverter<::anakin::saber::X86>);
......@@ -22,7 +22,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class DetectionOutOpConverter : public AnakinOpConverter {
template <typename TargetT>
class DetectionOutOpConverter : public AnakinOpConverter<TargetT> {
public:
DetectionOutOpConverter() = default;
......
......@@ -19,21 +19,16 @@
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void DropoutOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void DropoutOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Mask").size(), 1);
......@@ -43,25 +38,30 @@ void DropoutOpConverter::operator()(const framework::proto::OpDesc &op,
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Scale", {x_name}, {out_name});
this->engine_->AddOp(op_name, "Scale", {x_name}, {out_name});
auto dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));
auto factor = 1 - dropout_prob;
Shape shape1(std::vector<int>({1, 1, 1, 1}));
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(shape1);
auto *factor_data = static_cast<float *>(weight1->h_tensor().mutable_data());
float weight1_data[] = {factor};
std::copy(std::begin(weight1_data), std::end(weight1_data), factor_data);
engine_->AddOpAttr(op_name, "weight_1", *weight1);
engine_->AddOpAttr(op_name, "axis", 0);
engine_->AddOpAttr(op_name, "num_axes", 0);
engine_->AddOpAttr(op_name, "bias_term", false);
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
this->engine_->AddOpAttr(op_name, "axis", 0);
this->engine_->AddOpAttr(op_name, "num_axes", 0);
this->engine_->AddOpAttr(op_name, "bias_term", false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(dropout, DropoutOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(dropout,
DropoutOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(dropout,
DropoutOpConverter<::anakin::saber::X86>);
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class DropoutOpConverter : public AnakinOpConverter {
template <typename TargetT>
class DropoutOpConverter : public AnakinOpConverter<TargetT> {
public:
DropoutOpConverter() = default;
......
......@@ -19,18 +19,15 @@
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void ElementwiseAddOpConverter::operator()(
template <typename TargetT>
void ElementwiseAddOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -43,14 +40,16 @@ void ElementwiseAddOpConverter::operator()(
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
std::string elementwise_type = "Add";
engine_->AddOpAttr<std::string>(op_name, "type", elementwise_type);
this->engine_->template AddOpAttr<std::string>(op_name, "type",
elementwise_type);
std::vector<float> coeff = {1.0, 1.0};
engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
this->engine_->template AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
}
void ElementwiseMulOpConverter::operator()(
template <typename TargetT>
void ElementwiseMulOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -63,26 +62,25 @@ void ElementwiseMulOpConverter::operator()(
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Scale", {x_name, y_name}, {out_name});
// Fill a number to weight_1 as a placeholder.
Shape shape1(std::vector<int>({1, 1, 1, 1}));
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
auto *placeholder_data =
static_cast<float *>(weight1->h_tensor().mutable_data());
float weight1_data[] = {1};
std::copy(std::begin(weight1_data), std::end(weight1_data), placeholder_data);
engine_->AddOpAttr(op_name, "weight_1", *weight1);
auto axis = boost::get<int>(op_desc.GetAttr("axis"));
engine_->AddOpAttr(op_name, "axis", axis);
engine_->AddOpAttr(op_name, "num_axes", 1);
engine_->AddOpAttr(op_name, "bias_term", false);
this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
std::string elementwise_type = "Prod";
this->engine_->template AddOpAttr<std::string>(op_name, "type",
elementwise_type);
std::vector<float> coeff = {1.0, 1.0};
this->engine_->template AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(elementwise_add, ElementwiseAddOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(elementwise_mul, ElementwiseMulOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(
elementwise_add, ElementwiseAddOpConverter<::anakin::saber::NV>);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(
elementwise_mul, ElementwiseMulOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(
elementwise_add, ElementwiseAddOpConverter<::anakin::saber::X86>);
REGISTER_CPU_ANAKIN_OP_CONVERTER(
elementwise_mul, ElementwiseMulOpConverter<::anakin::saber::X86>);
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class ElementwiseAddOpConverter : public AnakinOpConverter {
template <typename TargetT>
class ElementwiseAddOpConverter : public AnakinOpConverter<TargetT> {
public:
ElementwiseAddOpConverter() = default;
......@@ -33,7 +34,8 @@ class ElementwiseAddOpConverter : public AnakinOpConverter {
private:
};
class ElementwiseMulOpConverter : public AnakinOpConverter {
template <typename TargetT>
class ElementwiseMulOpConverter : public AnakinOpConverter<TargetT> {
public:
ElementwiseMulOpConverter() = default;
......
......@@ -19,17 +19,16 @@
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void FcBaseOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
auto input_names = op_desc.InputNames();
bool with_bias = input_names.size() == 3;
......@@ -51,13 +50,13 @@ void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
auto input_name = op_desc.Input(i_name).front();
auto output_name = op_desc.Output("Out").front();
engine_->AddOp(op_name, "Dense", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "bias_term", with_bias);
engine_->AddOpAttr(op_name, "axis", 1);
this->engine_->AddOp(op_name, "Dense", {input_name}, {output_name});
this->engine_->AddOpAttr(op_name, "bias_term", with_bias);
this->engine_->AddOpAttr(op_name, "axis", 1);
auto weight_shape = framework::vectorize2int(y_t->dims());
int out_dim = weight_shape[1];
engine_->AddOpAttr(op_name, "out_dim", out_dim);
this->engine_->AddOpAttr(op_name, "out_dim", out_dim);
const int w_m = weight_shape[0];
const int w_k = weight_shape[1];
......@@ -79,12 +78,13 @@ void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
}
}
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_shape);
float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(trans_weight_data.data(), weight_tensor.numel(), cpu_data);
weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
engine_->AddOpAttr(op_name, "weight_1", *weight1);
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
// get bias
if (with_bias) {
......@@ -104,13 +104,14 @@ void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
// bias_shape.push_back(1);
Shape anakin_bias_shape(bias_shape);
auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
auto *weight2 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_bias_shape);
float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
weight2->d_tensor().set_shape(anakin_bias_shape);
weight2->d_tensor().copy_from(weight2->h_tensor());
engine_->AddOpAttr(op_name, "weight_2", *weight2);
this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
}
}
......@@ -118,5 +119,10 @@ void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(mul, MulOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(fc, FcOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(mul, MulOpConverter<::anakin::saber::NV>);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(fc, FcOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(mul, MulOpConverter<::anakin::saber::X86>);
REGISTER_CPU_ANAKIN_OP_CONVERTER(fc, FcOpConverter<::anakin::saber::X86>);
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class FcBaseOpConverter : public AnakinOpConverter {
template <typename TargetT>
class FcBaseOpConverter : public AnakinOpConverter<TargetT> {
public:
FcBaseOpConverter() = default;
......@@ -32,13 +33,15 @@ class FcBaseOpConverter : public AnakinOpConverter {
};
// with bias
class FcOpConverter : public FcBaseOpConverter {
template <typename TargetT>
class FcOpConverter : public FcBaseOpConverter<TargetT> {
public:
FcOpConverter() = default;
};
// without bias
class MulOpConverter : public FcBaseOpConverter {
template <typename TargetT>
class MulOpConverter : public FcBaseOpConverter<TargetT> {
public:
MulOpConverter() = default;
};
......
......@@ -15,20 +15,16 @@
#include "paddle/fluid/inference/anakin/convert/flatten.h"
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void FlattenOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL);
......@@ -41,12 +37,17 @@ void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
std::vector<int> out_dims = {0, -1, 1, 1};
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Reshape", {input}, {output});
engine_->AddOpAttr<PTuple<int>>(op_name, "dims", out_dims);
this->engine_->AddOp(op_name, "Reshape", {input}, {output});
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dims", out_dims);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(flatten, FlattenOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(flatten,
FlattenOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(flatten,
FlattenOpConverter<::anakin::saber::X86>);
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class FlattenOpConverter : public AnakinOpConverter {
template <typename TargetT>
class FlattenOpConverter : public AnakinOpConverter<TargetT> {
public:
FlattenOpConverter() = default;
......
......@@ -17,23 +17,16 @@
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void Im2SequenceConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 0);
......@@ -43,21 +36,24 @@ void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op,
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Im2Sequence", {x_name}, {out_name});
this->engine_->AddOp(op_name, "Im2Sequence", {x_name}, {out_name});
std::vector<int> dilations = {1, 1};
auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
auto kernels = boost::get<std::vector<int>>(op_desc.GetAttr("kernels"));
engine_->AddOpAttr<PTuple<int>>(op_name, "paddings", paddings);
engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
engine_->AddOpAttr<PTuple<int>>(op_name, "window_size", kernels);
engine_->AddOpAttr<PTuple<int>>(op_name, "dilations", dilations);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "paddings", paddings);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "strides", strides);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "window_size",
kernels);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dilations",
dilations);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(im2sequence, Im2SequenceConverter);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(im2sequence,
Im2SequenceConverter<::anakin::saber::NV>);
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class Im2SequenceConverter : public AnakinOpConverter {
template <typename TargetT>
class Im2SequenceConverter : public AnakinOpConverter<TargetT> {
public:
Im2SequenceConverter() = default;
......
......@@ -32,10 +32,10 @@ namespace paddle {
namespace inference {
namespace anakin {
using AnakinNvEngine =
AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>;
template <typename TargetT>
class AnakinOpConverter {
using AnakinEngineT = AnakinEngine<TargetT, ::anakin::Precision::FP32>;
public:
AnakinOpConverter() = default;
......@@ -45,7 +45,7 @@ class AnakinOpConverter {
void ConvertOp(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const std::unordered_set<std::string> &parameters,
const framework::Scope &scope, AnakinNvEngine *engine,
const framework::Scope &scope, AnakinEngineT *engine,
bool test_mode = false) {
framework::OpDesc op_desc(op, nullptr);
std::string op_type = op_desc.Type();
......@@ -65,7 +65,7 @@ class AnakinOpConverter {
void ConvertBlock(framework::BlockDesc *block_desc,
const std::unordered_set<std::string> &parameters,
const framework::Scope &scope, AnakinNvEngine *engine) {
const framework::Scope &scope, AnakinEngineT *engine) {
std::unique_lock<std::mutex> lock(mutex_);
framework::proto::BlockDesc *block = block_desc->Proto();
for (auto i = 0; i < block->ops_size(); i++) {
......@@ -79,7 +79,7 @@ class AnakinOpConverter {
framework::BlockDesc *block_desc, framework::Scope *scope,
const std::vector<std::string> &inputs,
const std::unordered_set<std::string> &parameters,
const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
const std::vector<std::string> &outputs, AnakinEngineT *engine) {
ConvertBlock(block_desc, parameters, *scope, engine);
// if the max_batch size
int max_batch_size = engine->GetMaxBatchSize();
......@@ -128,40 +128,60 @@ class AnakinOpConverter {
engine->InitNet();
}
void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
void SetEngine(AnakinEngineT *engine) { engine_ = engine; }
virtual ~AnakinOpConverter() {}
protected:
bool test_mode_;
AnakinNvEngine *engine_{nullptr};
AnakinEngineT *engine_{nullptr};
private:
std::unordered_map<std::string, AnakinOpConverter *> converters_;
std::unordered_map<std::string, AnakinOpConverter<TargetT> *> converters_;
framework::Scope *scope_{nullptr};
std::mutex mutex_;
};
template class AnakinOpConverter<::anakin::saber::NV>;
template class AnakinOpConverter<::anakin::saber::X86>;
} // namespace anakin
} // namespace inference
} // namespace paddle
#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
struct anakin_##op_type__##_converter \
#define REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, \
place_type__, place_class__) \
struct anakin_##op_type__##_##place_type__##_converter \
: public ::paddle::framework::Registrar { \
anakin_##op_type__##_converter() { \
LOG(INFO) << "register convert " << #op_type__; \
anakin_##op_type__##_##place_type__##_converter() { \
LOG(INFO) << "register convert " << #op_type__ << " "; \
::paddle::inference::Registry< \
::paddle::inference::anakin::AnakinOpConverter>::Global() \
::paddle::inference::anakin::AnakinOpConverter<place_class__>>:: \
Global() \
.Register<::paddle::inference::anakin::Converter__>(#op_type__); \
} \
}; \
anakin_##op_type__##_converter anakin_##op_type__##_converter__; \
int TouchConverterRegister_anakin_##op_type__() { \
anakin_##op_type__##_converter__.Touch(); \
anakin_##op_type__##_##place_type__##_converter \
anakin_##op_type__##_##place_type__##_converter__; \
int TouchConverterRegister_anakin_##op_type__##_##place_type__() { \
anakin_##op_type__##_##place_type__##_converter__.Touch(); \
return 0; \
}
#define REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CUDA, \
::anakin::saber::NV)
#define REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CPU, \
::anakin::saber::X86)
#define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__) \
extern int TouchConverterRegister_anakin_##op_type__##_##place_type__(); \
int use_op_converter_anakin_##op_type__##_##place_type__ \
__attribute__((unused)) = \
TouchConverterRegister_anakin_##op_type__##_##place_type__();
#define USE_ANAKIN_CONVERTER(op_type__) \
extern int TouchConverterRegister_anakin_##op_type__(); \
int use_op_converter_anakin_##op_type__ __attribute__((unused)) = \
TouchConverterRegister_anakin_##op_type__();
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA)
#define USE_CPU_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU)
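
A hedged usage sketch of the macro pair defined above (the demo op name is illustrative): the REGISTER_* macros in a converter's .cc file create a Registrar that inserts the converter into the per-place Registry, and the USE_* macros force the linker to keep that registration alive from another translation unit:

// In a file that needs the converters linked in:
USE_ANAKIN_CONVERTER(demo);      // expands to USE_ANAKIN_CONVERTER_BASE(demo, CUDA)
USE_CPU_ANAKIN_CONVERTER(demo);  // expands to USE_ANAKIN_CONVERTER_BASE(demo, CPU)

// Each expansion declares and calls
//   extern int TouchConverterRegister_anakin_demo_CUDA();
// assigning the result to an unused int, so the registrar object in the
// converter's object file cannot be dropped by the linker. The CUDA
// variant only resolves when the guarded CUDA registration was compiled
// in (PADDLE_WITH_CUDA).
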
......@@ -17,23 +17,16 @@
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void Pool2dOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
......@@ -65,17 +58,22 @@ void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
PADDLE_THROW("TensorRT unsupported pooling type!");
}
engine_->AddOp(op_name, "Pooling", {x_name}, {y_name});
engine_->AddOpAttr<PTuple<int>>(op_name, "pool_size", ksize);
engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
engine_->AddOpAttr(op_name, "method", anakin_pool_type);
engine_->AddOpAttr(op_name, "global_pooling", global_pooling);
engine_->AddOpAttr(op_name, "cmp_out_shape_floor_as_conv", !ceil_mode);
this->engine_->AddOp(op_name, "Pooling", {x_name}, {y_name});
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "pool_size", ksize);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "strides", strides);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
this->engine_->AddOpAttr(op_name, "method", anakin_pool_type);
this->engine_->AddOpAttr(op_name, "global_pooling", global_pooling);
this->engine_->AddOpAttr(op_name, "cmp_out_shape_floor_as_conv", !ceil_mode);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(pool2d, Pool2dOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(pool2d,
Pool2dOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(pool2d,
Pool2dOpConverter<::anakin::saber::X86>);
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class Pool2dOpConverter : public AnakinOpConverter {
template <typename TargetT>
class Pool2dOpConverter : public AnakinOpConverter<TargetT> {
public:
Pool2dOpConverter() = default;
......
......@@ -16,19 +16,14 @@
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void ReluOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void ReluOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
......@@ -37,14 +32,14 @@ void ReluOpConverter::operator()(const framework::proto::OpDesc &op,
auto input_name = op_desc.Input("X").front();
auto output_name = op_desc.Output("Out").front();
engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "alpha", 0);
this->engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
this->engine_->AddOpAttr(op_name, "alpha", 0);
}
void LeakyReluOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void LeakyReluOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
......@@ -54,13 +49,19 @@ void LeakyReluOpConverter::operator()(const framework::proto::OpDesc &op,
auto output_name = op_desc.Output("Out").front();
float alpha = boost::get<float>(op_desc.GetAttr("alpha"));
engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "alpha", alpha);
this->engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
this->engine_->AddOpAttr(op_name, "alpha", alpha);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(leaky_relu, LeakyReluOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(relu, ReluOpConverter<::anakin::saber::NV>);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(leaky_relu,
LeakyReluOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(relu, ReluOpConverter<::anakin::saber::X86>);
REGISTER_CPU_ANAKIN_OP_CONVERTER(leaky_relu,
LeakyReluOpConverter<::anakin::saber::X86>);
......@@ -22,7 +22,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class ReluOpConverter : public AnakinOpConverter {
template <typename TargetT>
class ReluOpConverter : public AnakinOpConverter<TargetT> {
public:
ReluOpConverter() = default;
......@@ -33,7 +34,8 @@ class ReluOpConverter : public AnakinOpConverter {
virtual ~ReluOpConverter() {}
};
class LeakyReluOpConverter : public AnakinOpConverter {
template <typename TargetT>
class LeakyReluOpConverter : public AnakinOpConverter<TargetT> {
public:
LeakyReluOpConverter() = default;
......
......@@ -15,20 +15,16 @@
#include "paddle/fluid/inference/anakin/convert/reshape.h"
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void ReshapeOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL);
......@@ -37,17 +33,23 @@ void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op,
auto output = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Reshape", {input}, {output});
this->engine_->AddOp(op_name, "Reshape", {input}, {output});
auto shape = boost::get<std::vector<int>>(op_desc.GetAttr("shape"));
if (shape.size() < 4) {
shape.insert(shape.end(), 4 - shape.size(), 1);
}
engine_->AddOpAttr<PTuple<int>>(op_name, "dims", shape);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dims", shape);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(reshape, ReshapeOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(reshape,
ReshapeOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(reshape,
ReshapeOpConverter<::anakin::saber::X86>);
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class ReshapeOpConverter : public AnakinOpConverter {
template <typename TargetT>
class ReshapeOpConverter : public AnakinOpConverter<TargetT> {
public:
ReshapeOpConverter() = default;
......
......@@ -25,10 +25,10 @@ namespace paddle {
namespace inference {
namespace anakin {
void RoiAlignOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void RoiAlignOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("ROIs").size(), 1);
......@@ -44,16 +44,21 @@ void RoiAlignOpConverter::operator()(const framework::proto::OpDesc &op,
auto pooled_width = boost::get<int>(op_desc.GetAttr("pooled_width"));
auto sampling_ratio = boost::get<int>(op_desc.GetAttr("sampling_ratio"));
engine_->AddOp(op_name, "RoiAlign", {input_x_name, input_rois_name},
this->engine_->AddOp(op_name, "RoiAlign", {input_x_name, input_rois_name},
{output_name});
engine_->AddOpAttr(op_name, "spatial_scale", spatial_scale);
engine_->AddOpAttr(op_name, "pooled_height", pooled_height);
engine_->AddOpAttr(op_name, "pooled_width", pooled_width);
engine_->AddOpAttr(op_name, "sampling_ratio", sampling_ratio);
this->engine_->AddOpAttr(op_name, "spatial_scale", spatial_scale);
this->engine_->AddOpAttr(op_name, "pooled_height", pooled_height);
this->engine_->AddOpAttr(op_name, "pooled_width", pooled_width);
this->engine_->AddOpAttr(op_name, "sampling_ratio", sampling_ratio);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(roi_align, RoiAlignOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(roi_align,
RoiAlignOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(roi_align,
RoiAlignOpConverter<::anakin::saber::X86>);
......@@ -22,7 +22,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class RoiAlignOpConverter : public AnakinOpConverter {
template <typename TargetT>
class RoiAlignOpConverter : public AnakinOpConverter<TargetT> {
public:
RoiAlignOpConverter() = default;
......
......@@ -16,19 +16,14 @@
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void ScaleOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void ScaleOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
......@@ -44,14 +39,14 @@ void ScaleOpConverter::operator()(const framework::proto::OpDesc &op,
PADDLE_ENFORCE(bias_after_scale,
"The anakin scale layer only support bias after scale now.");
engine_->AddOp(op_name, "Power", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "shift", bias);
engine_->AddOpAttr(op_name, "scale", scale);
engine_->AddOpAttr(op_name, "power", static_cast<float>(1.0));
this->engine_->AddOp(op_name, "Power", {input_name}, {output_name});
this->engine_->AddOpAttr(op_name, "shift", bias);
this->engine_->AddOpAttr(op_name, "scale", scale);
this->engine_->AddOpAttr(op_name, "power", static_cast<float>(1.0));
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter<::anakin::saber::NV>);
#endif
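Paddle's scale op is lowered to Anakin's Power layer with power pinned to 1. Assuming Power computes out = (scale * x + shift)^power, this reduces exactly to scale-then-bias, which is why bias_after_scale is enforced above. A tiny numeric check under that assumption:

#include <cassert>
#include <cmath>

int main() {
  float x = 2.0f, scale = 3.0f, bias = 0.5f;
  // Assumed Anakin Power semantics: out = (scale * x + shift)^power.
  // With power fixed to 1.0f this is Paddle's scale op with
  // bias_after_scale == true: out = scale * x + bias.
  float out = std::pow(scale * x + bias, 1.0f);
  assert(std::fabs(out - 6.5f) < 1e-6f);
  return 0;
}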
......@@ -22,7 +22,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class ScaleOpConverter : public AnakinOpConverter {
template <typename TargetT>
class ScaleOpConverter : public AnakinOpConverter<TargetT> {
public:
ScaleOpConverter() = default;
......
......@@ -14,19 +14,14 @@
#include "paddle/fluid/inference/anakin/convert/softmax.h"
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void SoftMaxOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
......@@ -41,12 +36,18 @@ void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
auto input_shape_in_fluid = input_var_desc->GetShape();
size_t input_dims = input_shape_in_fluid.size();
engine_->AddOp(op_name, "Softmax", {input}, {output});
engine_->AddOpAttr(op_name, "axis", static_cast<int>(input_dims - 1));
this->engine_->AddOp(op_name, "Softmax", {input}, {output});
this->engine_->AddOpAttr(op_name, "axis", static_cast<int>(input_dims - 1));
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(softmax, SoftMaxOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(softmax,
SoftMaxOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(softmax,
SoftMaxOpConverter<::anakin::saber::X86>);
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class SoftMaxOpConverter : public AnakinOpConverter {
template <typename TargetT>
class SoftMaxOpConverter : public AnakinOpConverter<TargetT> {
public:
SoftMaxOpConverter() = default;
......
......@@ -16,23 +16,16 @@
#include <algorithm>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void SplitOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void SplitOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
auto input_name = op_desc.Input("X").front();
auto y_names = op_desc.Output("Out");
......@@ -51,14 +44,19 @@ void SplitOpConverter::operator()(const framework::proto::OpDesc &op,
num_sum += output_lengths[i];
slice_point.push_back(num_sum);
}
engine_->AddOp(op_name, "Slice", {input_name}, y_names);
engine_->AddOpAttr(op_name, "axis", axis);
engine_->AddOpAttr<PTuple<int>>(op_name, "slice_point", slice_point);
this->engine_->AddOp(op_name, "Slice", {input_name}, y_names);
this->engine_->AddOpAttr(op_name, "axis", axis);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "slice_point",
slice_point);
// slice_dim is useless in anakin
engine_->AddOpAttr(op_name, "slice_dim", 4);
this->engine_->AddOpAttr(op_name, "slice_dim", 4);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(split, SplitOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(split, SplitOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(split, SplitOpConverter<::anakin::saber::X86>);
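Anakin's Slice takes interior cut points rather than section sizes, so the converter accumulates the split sections into slice_point. A standalone sketch of the accumulation, assuming the loop in the elided hunk runs over all but the last section:

#include <cassert>
#include <vector>

int main() {
  // The split op's sections attr, e.g. {2, 1} along the chosen axis.
  std::vector<int> output_lengths = {2, 1};
  std::vector<int> slice_point;
  int num_sum = 0;
  // Assumed bound: only interior cut points are needed, so the last
  // section contributes no slice point.
  for (size_t i = 0; i + 1 < output_lengths.size(); ++i) {
    num_sum += output_lengths[i];
    slice_point.push_back(num_sum);
  }
  assert((slice_point == std::vector<int>{2}));  // cut a size-3 axis at 2
  return 0;
}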
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class SplitOpConverter : public AnakinOpConverter {
template <typename TargetT>
class SplitOpConverter : public AnakinOpConverter<TargetT> {
public:
SplitOpConverter() = default;
......
......@@ -17,22 +17,17 @@
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void SumOpConverter::operator()(const framework::proto::OpDesc &op,
template <typename TargetT>
void SumOpConverter<TargetT>::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
......@@ -43,13 +38,17 @@ void SumOpConverter::operator()(const framework::proto::OpDesc &op,
std::vector<float> coeff = {1, 1};
std::string elementwise_type = "Add";
engine_->AddOp(op_name, "Eltwise", input_names, {out_name});
engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
engine_->AddOpAttr<std::string>(op_name, "type", elementwise_type);
this->engine_->AddOp(op_name, "Eltwise", input_names, {out_name});
this->engine_->template AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
this->engine_->template AddOpAttr<std::string>(op_name, "type",
elementwise_type);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(sum, SumOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(sum, SumOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(sum, SumOpConverter<::anakin::saber::X86>);
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class SumOpConverter : public AnakinOpConverter {
template <typename TargetT>
class SumOpConverter : public AnakinOpConverter<TargetT> {
public:
SumOpConverter() = default;
......
......@@ -21,12 +21,14 @@ namespace paddle {
namespace inference {
namespace anakin {
static void test_activation_op(const std::string &op_type) {
auto *converter = Registry<AnakinOpConverter>::Global().Lookup(op_type);
PADDLE_ENFORCE(converter != nullptr);
template <typename TargetT>
static void test_activation_op(const std::string& op_type,
const platform::DeviceContext& context,
bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("act-X", {10, 6, 1, 1});
validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
framework::OpDesc desc;
......@@ -41,13 +43,42 @@ static void test_activation_op(const std::string &op_type) {
validator.Execute(5);
}
TEST(sigm_op, test) { test_activation_op("sigmoid"); }
TEST(tanh_op, test) { test_activation_op("tanh"); }
#ifdef PADDLE_WITH_CUDA
TEST(sigm_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_activation_op<::anakin::saber::NV>("sigmoid", ctx, true);
}
TEST(tanh_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_activation_op<::anakin::saber::NV>("tanh", ctx, true);
}
#endif
TEST(sigm_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("sigmoid", ctx, false);
}
TEST(tanh_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("tanh", ctx, false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(sigmoid);
USE_OP(tanh);
USE_CPU_ANAKIN_CONVERTER(sigmoid);
USE_CPU_ANAKIN_CONVERTER(tanh);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(sigmoid);
USE_ANAKIN_CONVERTER(tanh);
#endif
......@@ -21,16 +21,19 @@ namespace paddle {
namespace inference {
namespace anakin {
TEST(affine_channel, native) {
template <typename TargetT>
void test_affine_channel_op(const platform::DeviceContext& context,
bool use_gpu) {
// Declare the difference between the inputs.
std::unordered_set<std::string> parameters({"scale", "bias"});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("x", {1, 3, 5, 2});
validator.DeclOutputVar("out", {1, 3, 5, 2});
validator.DeclParamVar("scale", {1, 3, 1, 1});
validator.DeclParamVar("bias", {1, 3, 1, 1});
validator.DeclParamVar("scale", {3});
validator.DeclParamVar("bias", {3});
// Prepare Op descriptions.
framework::OpDesc desc;
......@@ -47,9 +50,26 @@ TEST(affine_channel, native) {
validator.Execute(1);
}
#ifdef PADDLE_WITH_CUDA
TEST(affine_channel_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_affine_channel_op<::anakin::saber::NV>(ctx, true);
}
#endif
TEST(affine_channel_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_affine_channel_op<::anakin::saber::X86>(ctx, false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(affine_channel);
USE_CPU_ANAKIN_CONVERTER(affine_channel);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(affine_channel);
#endif
......@@ -19,12 +19,14 @@ namespace paddle {
namespace inference {
namespace anakin {
TEST(batch_norm_op, test) {
template <typename TargetT>
void test_batchnorm_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters(
{"batch_norm_scale", "batch_norm_bias", "batch_norm_mean",
"batch_norm_variance"});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
std::vector<int> param_shape{2};
validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5});
......@@ -64,8 +66,26 @@ TEST(batch_norm_op, test) {
validator.Execute(1, neglected_output);
}
#ifdef PADDLE_WITH_CUDA
TEST(batch_norm_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_batchnorm_op<::anakin::saber::NV>(ctx, true);
}
#endif
TEST(batch_norm_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_batchnorm_op<::anakin::saber::X86>(ctx, false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(batch_norm);
USE_CPU_ANAKIN_CONVERTER(batch_norm);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(batch_norm);
#endif
......@@ -21,10 +21,12 @@ namespace paddle {
namespace inference {
namespace anakin {
TEST(concat_op, test) {
template <typename TargetT>
void test_concat_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("concat_x1", {1, 2, 1, 1});
validator.DeclInputVar("concat_x2", {1, 3, 1, 1});
validator.DeclInputVar("concat_x3", {1, 1, 1, 1});
......@@ -44,31 +46,26 @@ TEST(concat_op, test) {
validator.Execute(1);
}
TEST(concat_op, test2) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("concat_x1", {1, 4});
validator.DeclInputVar("concat_x2", {3, 4});
validator.DeclInputVar("concat_x3", {2, 4});
validator.DeclOutputVar("concat_out", {6, 4});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("concat");
desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
desc.SetOutput("Out", {"concat_out"});
int axis = 0;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
#ifdef PADDLE_WITH_CUDA
TEST(concat_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_concat_op<::anakin::saber::NV>(ctx, true);
}
#endif
validator.Execute(1);
TEST(concat_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_concat_op<::anakin::saber::X86>(ctx, false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(concat);
USE_CPU_ANAKIN_CONVERTER(concat);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(concat);
#endif
......@@ -21,13 +21,12 @@ namespace paddle {
namespace inference {
namespace anakin {
TEST(conv2d_op, test) {
auto* conv2d_converter =
Registry<AnakinOpConverter>::Global().Lookup("conv2d");
ASSERT_TRUE(conv2d_converter != nullptr);
template <typename TargetT>
void test_conv2d_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters({"conv2d-Y"});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("conv2d-X", {1, 3, 3, 3});
validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1});
validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3});
......@@ -54,9 +53,27 @@ TEST(conv2d_op, test) {
validator.Execute(3);
}
#ifdef PADDLE_WITH_CUDA
TEST(conv2d_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_conv2d_op<::anakin::saber::NV>(ctx, true);
}
#endif
TEST(conv2d_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_conv2d_op<::anakin::saber::X86>(ctx, false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(conv2d);
USE_CPU_ANAKIN_CONVERTER(conv2d);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(conv2d);
#endif
......@@ -21,10 +21,12 @@ namespace paddle {
namespace inference {
namespace anakin {
TEST(dropout_op, native) {
template <typename TargetT>
void test_dropout_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("x", {1, 1, 2, 2});
validator.DeclOutputVar("out", {1, 1, 2, 2});
validator.DeclOutputVar("mask", {1, 1, 2, 2});
......@@ -45,9 +47,26 @@ TEST(dropout_op, native) {
validator.Execute(1, neglected_output);
}
#ifdef PADDLE_WITH_CUDA
TEST(dropout_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_dropout_op<::anakin::saber::NV>(ctx, true);
}
#endif
TEST(dropout_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_dropout_op<::anakin::saber::X86>(ctx, false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(dropout);
USE_CPU_ANAKIN_CONVERTER(dropout);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(dropout);
#endif
......@@ -21,10 +21,14 @@ namespace paddle {
namespace inference {
namespace anakin {
static void test_elementwise_op(const std::string &op_type) {
template <typename TargetT>
static void test_elementwise_op(const std::string& op_type,
const platform::DeviceContext& context,
bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("x", {1, 1, 2, 2});
validator.DeclInputVar("y", {1, 1, 2, 2});
validator.DeclOutputVar("out", {1, 1, 2, 2});
......@@ -43,14 +47,41 @@ static void test_elementwise_op(const std::string &op_type) {
validator.Execute(1);
}
TEST(elementwise_op, native_add) { test_elementwise_op("elementwise_add"); }
TEST(elementwise_op, native_mul) { test_elementwise_op("elementwise_mul"); }
#ifdef PADDLE_WITH_CUDA
TEST(elementwise_op, native_add_gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_elementwise_op<::anakin::saber::NV>("elementwise_add", ctx, true);
}
TEST(elementwise_op, native_mul_gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_elementwise_op<::anakin::saber::NV>("elementwise_mul", ctx, true);
}
#endif
TEST(elementwise_op, native_add_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_elementwise_op<::anakin::saber::X86>("elementwise_add", ctx, false);
}
TEST(elementwise_op, native_mul_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_elementwise_op<::anakin::saber::X86>("elementwise_mul", ctx, false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(elementwise_add);
USE_ANAKIN_CONVERTER(elementwise_add);
USE_OP(elementwise_mul);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(elementwise_add);
USE_ANAKIN_CONVERTER(elementwise_mul);
#endif
USE_CPU_ANAKIN_CONVERTER(elementwise_add);
USE_CPU_ANAKIN_CONVERTER(elementwise_mul);
......@@ -20,13 +20,13 @@ namespace paddle {
namespace inference {
namespace anakin {
TEST(fc_op, test) {
auto* fc_converter = Registry<AnakinOpConverter>::Global().Lookup("fc");
ASSERT_TRUE(fc_converter);
template <typename TargetT>
void test_mul_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters({"mul_y"});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("mul_x", {1, 1, 2, 2});
validator.DeclParamVar("mul_y", {4, 2});
validator.DeclOutputVar("mul_out", {1, 2});
......@@ -42,9 +42,26 @@ TEST(fc_op, test) {
validator.Execute(10);
}
#ifdef PADDLE_WITH_CUDA
TEST(mul_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_mul_op<::anakin::saber::NV>(ctx, true);
}
#endif
TEST(mul_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_mul_op<::anakin::saber::X86>(ctx, false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(mul);
USE_CPU_ANAKIN_CONVERTER(fc);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(fc);
#endif
......@@ -20,13 +20,12 @@ namespace paddle {
namespace inference {
namespace anakin {
TEST(flatten_op, test) {
auto *converter = Registry<AnakinOpConverter>::Global().Lookup("flatten");
ASSERT_TRUE(converter);
template <typename TargetT>
void test_flatten_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("flatten-X", {3, 10, 10, 4});
validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1});
framework::OpDesc desc;
......@@ -42,10 +41,27 @@ TEST(flatten_op, test) {
validator.Execute(5);
}
#ifdef PADDLE_WITH_CUDA
TEST(flatten_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_flatten_op<::anakin::saber::NV>(ctx, true);
}
#endif
TEST(flatten_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_flatten_op<::anakin::saber::X86>(ctx, false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(reshape);
USE_OP_ITSELF(flatten);
USE_CPU_ANAKIN_CONVERTER(flatten);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(flatten);
#endif
......@@ -19,15 +19,14 @@ namespace paddle {
namespace inference {
namespace anakin {
void test_pool2d(bool global_pooling, bool ceil_mode,
template <typename TargetT>
void test_pool2d(const platform::DeviceContext& context, bool use_gpu,
bool global_pooling, bool ceil_mode,
std::string pool_type = "max") {
auto* pool2d_converter =
Registry<AnakinOpConverter>::Global().Lookup("pool2d");
ASSERT_TRUE(pool2d_converter);
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
// The ITensor's Dims should not contain the batch size.
// So, the ITensor's Dims of input and output should be C * H * W.
......@@ -64,56 +63,61 @@ void test_pool2d(bool global_pooling, bool ceil_mode,
validator.Execute(1);
}
void test_pool2d2(bool global_pooling, bool ceil_mode,
std::string pool_type = "max") {
auto* pool2d_converter =
Registry<AnakinOpConverter>::Global().Lookup("pool2d");
ASSERT_TRUE(pool2d_converter);
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
// The ITensor's Dims should not contain the batch size.
// So, the ITensor's Dims of input and output should be C * H * W.
validator.DeclInputVar("pool2d_x", {1, 1, 17, 17});
validator.DeclOutputVar("pool2d_out", {1, 1, 17, 17});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("pool2d");
desc.SetInput("X", {"pool2d_x"});
desc.SetOutput("Out", {"pool2d_out"});
std::vector<int> ksize({3, 3});
std::vector<int> strides({1, 1});
std::vector<int> paddings({1, 1});
std::string pooling_t = pool_type;
#ifdef PADDLE_WITH_CUDA
TEST(Pool2dOpConverter, normal) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_pool2d<::anakin::saber::NV>(ctx, true, false, false);
}
TEST(Pool2dOpConverter, test_global_pooling) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_pool2d<::anakin::saber::NV>(ctx, true, true, false);
}
desc.SetAttr("pooling_type", pooling_t);
desc.SetAttr("ksize", ksize);
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("global_pooling", global_pooling);
desc.SetAttr("ceil_mode", true);
TEST(Pool2dOpConverter, max_ceil_test) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_pool2d<::anakin::saber::NV>(ctx, true, false, true);
}
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
TEST(Pool2dOpConverter, avg_ceil_test) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_pool2d<::anakin::saber::NV>(ctx, true, false, true, "avg");
}
#endif
validator.Execute(1);
TEST(Pool2dOpConverter, normal_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_pool2d<::anakin::saber::X86>(ctx, false, false, false);
}
TEST(Pool2dOpConverter, test_global_pooling_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_pool2d<::anakin::saber::X86>(ctx, false, true, false);
}
TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); }
TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); }
TEST(Pool2dOpConverter, max_ceil_test_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_pool2d<::anakin::saber::X86>(ctx, false, false, true);
}
TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); }
TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); }
TEST(Pool2dOpConverter, avg_ceil_test2) { test_pool2d2(false, true, "avg"); }
TEST(Pool2dOpConverter, avg_ceil_test_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_pool2d<::anakin::saber::X86>(ctx, false, false, true, "avg");
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(pool2d);
USE_CPU_ANAKIN_CONVERTER(pool2d);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(pool2d);
#endif
......@@ -21,12 +21,14 @@ namespace paddle {
namespace inference {
namespace anakin {
static void test_relu_op(const std::string &op_type) {
auto *converter = Registry<AnakinOpConverter>::Global().Lookup(op_type);
PADDLE_ENFORCE(converter != nullptr);
template <typename TargetT>
static void test_activation_op(const std::string& op_type,
const platform::DeviceContext& context,
bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("act-X", {10, 6, 1, 1});
validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
framework::OpDesc desc;
......@@ -44,14 +46,44 @@ static void test_relu_op(const std::string &op_type) {
validator.Execute(5);
}
TEST(activation, relu) { test_relu_op("relu"); }
TEST(activation, leaky_relu) { test_relu_op("leaky_relu"); }
#ifdef PADDLE_WITH_CUDA
TEST(relu_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_activation_op<::anakin::saber::NV>("relu", ctx, true);
}
TEST(leaky_relu_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_activation_op<::anakin::saber::NV>("leaky_relu", ctx, true);
}
#endif
/* there seems to be a bug here; the CPU activation tests are disabled for now
TEST(relu_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("relu", ctx, false);
}
TEST(leaky_relu_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("leaky_relu", ctx, false);
}
*/
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(relu);
USE_ANAKIN_CONVERTER(relu);
USE_OP(leaky_relu);
USE_CPU_ANAKIN_CONVERTER(relu);
USE_CPU_ANAKIN_CONVERTER(leaky_relu);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(relu);
USE_ANAKIN_CONVERTER(leaky_relu);
#endif
......@@ -20,12 +20,12 @@ namespace paddle {
namespace inference {
namespace anakin {
TEST(reshape, test) {
auto* converter = Registry<AnakinOpConverter>::Global().Lookup("reshape");
ASSERT_TRUE(converter);
template <typename TargetT>
void test_reshape1_op(const platform::DeviceContext& context, bool use_gpu) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
// validator.DeclInputVar("reshape-X", {2, 3, 3, 1});
// validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3});
......@@ -45,10 +45,12 @@ TEST(reshape, test) {
validator.Execute(1);
}
TEST(reshape, test2) {
template <typename TargetT>
void test_reshape2_op(const platform::DeviceContext& context, bool use_gpu) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("reshape-X", {1, 2, 4});
validator.DeclOutputVar("reshape-Out", {1, 4, 2});
......@@ -66,9 +68,39 @@ TEST(reshape, test2) {
validator.Execute(1);
}
#ifdef PADDLE_WITH_CUDA
TEST(reshape1_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_reshape1_op<::anakin::saber::NV>(ctx, true);
}
TEST(reshape2_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_reshape2_op<::anakin::saber::NV>(ctx, true);
}
#endif
TEST(reshape1_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_reshape1_op<::anakin::saber::X86>(ctx, false);
}
TEST(reshape2_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_reshape2_op<::anakin::saber::X86>(ctx, false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(reshape);
USE_CPU_ANAKIN_CONVERTER(reshape);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(reshape);
#endif
......@@ -20,12 +20,12 @@ namespace paddle {
namespace inference {
namespace anakin {
TEST(softmax, test) {
auto* converter = Registry<AnakinOpConverter>::Global().Lookup("softmax");
ASSERT_TRUE(converter);
template <typename TargetT>
void test_softmax_op(const platform::DeviceContext& context, bool use_gpu) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("softmax-X", {1, 10, 2});
validator.DeclOutputVar("softmax-Out", {1, 10, 2});
......@@ -41,9 +41,27 @@ TEST(softmax, test) {
validator.Execute(1);
}
#ifdef PADDLE_WITH_CUDA
TEST(softmax_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_softmax_op<::anakin::saber::NV>(ctx, true);
}
#endif
TEST(softmax_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_softmax_op<::anakin::saber::X86>(ctx, false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(softmax);
USE_CPU_ANAKIN_CONVERTER(softmax);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(softmax);
#endif
......@@ -21,12 +21,14 @@ namespace paddle {
namespace inference {
namespace anakin {
template <int Axis>
void AnakinSliceTest(const std::vector<int> &in_shape,
template <typename TargetT, int Axis>
void AnakinSliceTest(const platform::DeviceContext &context, bool use_gpu,
const std::vector<int> &in_shape,
const std::vector<int> &sections) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("split_input", in_shape);
std::vector<std::string> output_vars;
......@@ -55,51 +57,58 @@ void AnakinSliceTest(const std::vector<int> &in_shape,
// batch = 0, axis = 1, same shape
TEST(split_op, test_same_shape_axis1_batch1) {
AnakinSliceTest<1>({1, 4, 2, 2}, {2, 2});
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
AnakinSliceTest<::anakin::saber::NV, 1>(ctx, true, {1, 4, 2, 2}, {2, 2});
}
// batch = 0, axis = 1, different shape
TEST(split_op, test_different_shape_axis1_batch1) {
AnakinSliceTest<1>({1, 3, 2, 2}, {2, 1});
}
// batch = 10, axis = 1, same shape
TEST(split_op, test_same_shape_axis1_batch10) {
AnakinSliceTest<1>({1, 4, 2, 2}, {2, 2});
}
// batch = 10, axis = 1, different shape
TEST(split_op, test_different_shape_axis1_batch10) {
AnakinSliceTest<1>({1, 3, 2, 2}, {2, 1});
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
AnakinSliceTest<::anakin::saber::NV, 1>(ctx, true, {1, 3, 2, 2}, {2, 1});
}
// batch = 0, axis = 2, same shape
TEST(split_op, test_same_shape_axis2_batch1) {
AnakinSliceTest<2>({1, 3, 4, 2}, {2, 2});
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
AnakinSliceTest<::anakin::saber::NV, 2>(ctx, true, {1, 3, 4, 2}, {2, 2});
}
// batch = 0, axis = 2, different shape
TEST(split_op, test_different_shape_axis2_batch1) {
AnakinSliceTest<2>({1, 3, 3, 2}, {2, 1});
}
// batch = 10, axis = 2, same shape
TEST(split_op, test_same_shape_axis2_batch10) {
AnakinSliceTest<2>({1, 3, 4, 2}, {2, 2});
}
// batch = 10, axis = 2, different shape
TEST(split_op, test_different_shape_axis2_batch10) {
AnakinSliceTest<2>({1, 3, 3, 2}, {2, 1});
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
AnakinSliceTest<::anakin::saber::NV, 2>(ctx, true, {1, 3, 3, 2}, {2, 1});
}
// batch = 0, axis = 3, same shape
TEST(split_op, test_same_shape_axis3_batch1) {
AnakinSliceTest<3>({1, 3, 2, 4}, {2, 2});
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 4}, {2, 2});
}
// batch = 0, axis = 3, different shape
TEST(split_op, test_different_shape_axis3_batch1) {
AnakinSliceTest<3>({1, 3, 2, 3}, {2, 1});
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 3}, {2, 1});
}
// batch = 10, axis = 3, same shape
TEST(split_op, test_same_shape_axis3_batch10) {
AnakinSliceTest<3>({1, 3, 2, 4}, {2, 2});
TEST(split_op, test_different_shape_axis1_batch1_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
AnakinSliceTest<::anakin::saber::X86, 1>(ctx, false, {1, 3, 2, 3}, {2, 1});
}
TEST(split_op, test_different_shape_axis2_batch1_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
AnakinSliceTest<::anakin::saber::X86, 2>(ctx, false, {1, 3, 4, 2}, {2, 2});
}
// batch = 10, axis = 3, different shape
TEST(split_op, test_different_shape_axis3_batch10) {
AnakinSliceTest<3>({1, 3, 2, 3}, {2, 1});
TEST(split_op, test_different_shape_axis3_batch1_cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
AnakinSliceTest<::anakin::saber::X86, 3>(ctx, false, {1, 3, 2, 4}, {2, 2});
}
} // namespace anakin
......@@ -107,4 +116,7 @@ TEST(split_op, test_different_shape_axis3_batch10) {
} // namespace paddle
USE_OP(split);
USE_CPU_ANAKIN_CONVERTER(split);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(split);
#endif
......@@ -22,10 +22,12 @@ namespace paddle {
namespace inference {
namespace anakin {
TEST(sum, native) {
template <typename TargetT>
static void test_sum_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("sum_x1", {1, 2, 1, 2});
validator.DeclInputVar("sum_x2", {1, 2, 1, 2});
validator.DeclOutputVar("sum_out", {1, 2, 1, 2});
......@@ -40,9 +42,26 @@ TEST(sum, native) {
validator.Execute(1);
}
#ifdef PADDLE_WITH_CUDA
TEST(sum_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_sum_op<::anakin::saber::NV>(ctx, true);
}
#endif
TEST(sum_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_sum_op<::anakin::saber::X86>(ctx, false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(sum);
USE_CPU_ANAKIN_CONVERTER(sum);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(sum);
#endif
......@@ -20,12 +20,12 @@ namespace paddle {
namespace inference {
namespace anakin {
TEST(transpose_op, test) {
auto* converter = Registry<AnakinOpConverter>::Global().Lookup("transpose");
ASSERT_TRUE(converter != nullptr);
template <typename TargetT>
void test_transpose1_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("transpose-X", {2, 3, 4, 5});
validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3});
......@@ -43,11 +43,12 @@ TEST(transpose_op, test) {
validator.Execute(3);
}
// test input shape's dims < 4
TEST(transpose_op, test2) {
template <typename TargetT>
void test_transpose2_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
validator.DeclInputVar("transpose-X", {3, 4, 5});
validator.DeclOutputVar("transpose-Out", {3, 5, 4});
......@@ -65,9 +66,38 @@ TEST(transpose_op, test2) {
validator.Execute(1);
}
#ifdef PADDLE_WITH_CUDA
TEST(transpose1_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_transpose1_op<::anakin::saber::NV>(ctx, true);
}
TEST(transpose2_op, gpu) {
platform::CUDAPlace gpu_place(0);
platform::CUDADeviceContext ctx(gpu_place);
test_transpose2_op<::anakin::saber::NV>(ctx, true);
}
#endif
TEST(transpose1_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_transpose1_op<::anakin::saber::X86>(ctx, false);
}
TEST(transpose2_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_transpose2_op<::anakin::saber::X86>(ctx, false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(transpose);
USE_CPU_ANAKIN_CONVERTER(transpose);
#ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(transpose);
#endif
......@@ -17,20 +17,16 @@
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT>
void TransposeOpConverter<TargetT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
......@@ -38,7 +34,7 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
auto input = op_desc.Input("X").front();
auto output = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Permute", {input}, {output});
this->engine_->AddOp(op_name, "Permute", {input}, {output});
auto axis = boost::get<std::vector<int>>(op_desc.GetAttr("axis"));
size_t axis_size = axis.size();
......@@ -46,11 +42,17 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
axis.push_back(axis_size);
axis_size += 1;
}
engine_->AddOpAttr<PTuple<int>>(op_name, "dims", axis);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dims", axis);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(transpose, TransposeOpConverter);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(transpose,
TransposeOpConverter<::anakin::saber::NV>);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(transpose,
TransposeOpConverter<::anakin::saber::X86>);
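As with reshape, a permutation of rank < 4 is extended with the trailing identity axes so Anakin's Permute sees a 4-D dims attribute. A self-contained sketch, assuming the elided loop condition is axis_size < 4:

#include <cassert>
#include <vector>

int main() {
  std::vector<int> axis = {0, 2, 1};  // 3-D permutation from the op attr
  size_t axis_size = axis.size();
  // Assumed loop condition: append the remaining axes in order, which
  // leaves them untouched by the permutation.
  while (axis_size < 4) {
    axis.push_back(axis_size);
    axis_size += 1;
  }
  assert((axis == std::vector<int>{0, 2, 1, 3}));
  return 0;
}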
......@@ -20,7 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
class TransposeOpConverter : public AnakinOpConverter {
template <typename TargetT>
class TransposeOpConverter : public AnakinOpConverter<TargetT> {
public:
TransposeOpConverter() = default;
......
......@@ -32,14 +32,8 @@ limitations under the License. */
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h"
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
......@@ -55,8 +49,8 @@ float random(float low, float high) {
return dist(mt);
}
void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
const platform::DeviceContext& ctx) {
void RandomizeTensor(framework::LoDTensor* tensor,
const platform::Place& place) {
auto dims = tensor->dims();
size_t num_elements = analysis::AccuDims(dims, dims.size());
PADDLE_ENFORCE_GT(num_elements, 0);
......@@ -78,17 +72,19 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
* anakin
* layer.
*/
template <typename TargetT>
class AnakinConvertValidation {
using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
using AnakinNvEngineT = AnakinEngine<TargetT, Precision::FP32>;
public:
AnakinConvertValidation() = delete;
AnakinConvertValidation(const std::unordered_set<std::string>& parameters,
framework::Scope* scope)
: parameters_(parameters), scope_(scope), place_(0) {
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
engine_.reset(new AnakinEngine<NV, Precision::FP32>(true));
framework::Scope* scope,
const platform::DeviceContext& ctx,
bool use_gpu = true)
: parameters_(parameters), scope_(scope), ctx_(ctx), use_gpu_(use_gpu) {
engine_.reset(new AnakinEngine<TargetT, Precision::FP32>(true));
}
// Declare a Variable as input with random initialization.
......@@ -108,11 +104,10 @@ class AnakinConvertValidation {
}
void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CUDADeviceContext ctx(place_);
auto* x = scope_->Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place_, ctx);
RandomizeTensor(x_tensor, ctx_.GetPlace());
std::vector<int64_t> dim_vec_int64;
for (auto& ele : dim_vec) {
......@@ -132,7 +127,7 @@ class AnakinConvertValidation {
// should init anakin engine here.
auto& block_desc = program_desc_.Block(framework::kRootBlockIndex);
Singleton<AnakinOpConverter>::Global().ConvertOp(
Singleton<AnakinOpConverter<TargetT>>::Global().ConvertOp(
desc, block_desc, parameters_, *scope_, engine_.get(),
true /*test_mode*/);
engine_->Freeze();
......@@ -160,11 +155,8 @@ class AnakinConvertValidation {
void Execute(int batch_size,
std::unordered_set<std::string> neglected_output = {}) {
// Execute Fluid Op
platform::CUDADeviceContext ctx(place_);
op_->Run(*scope_, place_);
op_->Run(*scope_, ctx_.GetPlace());
// std::vector<framework::LoDTensor> input_vector;
// std::vector<framework::LoDTensor> output_vector;
std::map<std::string, framework::LoDTensor*> inputs;
for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue;
......@@ -180,20 +172,27 @@ class AnakinConvertValidation {
std::vector<float> fluid_out;
auto* var = scope_->FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &fluid_out);
framework::TensorToVector(*tensor, ctx_, &fluid_out);
fluid_outputs.push_back(fluid_out);
outputs.insert({output, tensor});
}
engine_->Execute(inputs, outputs, stream_);
if (!use_gpu_) {
engine_->Execute(inputs, outputs);
} else {
cudaStream_t stream;
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream), 0);
engine_->Execute(inputs, outputs, stream);
}
int i_output = 0;
for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue;
std::vector<float> anakin_out;
auto* var = scope_->FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &anakin_out);
framework::TensorToVector(*tensor, ctx_, &anakin_out);
size_t anakin_out_size = anakin_out.size();
auto fluid_out = fluid_outputs[i_output++];
......@@ -205,15 +204,17 @@ class AnakinConvertValidation {
private:
std::unique_ptr<AnakinNvEngineT> engine_{nullptr};
cudaStream_t stream_;
std::unique_ptr<framework::OperatorBase> op_;
std::unique_ptr<framework::OpDesc> op_desc_;
framework::ProgramDesc program_desc_;
const std::unordered_set<std::string>& parameters_;
framework::Scope* scope_;
platform::CUDAPlace place_;
const platform::DeviceContext& ctx_;
bool use_gpu_{true};
};
template class AnakinConvertValidation<::anakin::saber::NV>;
template class AnakinConvertValidation<::anakin::saber::X86>;
} // namespace anakin
} // namespace inference
} // namespace paddle
......@@ -69,11 +69,11 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::AddOp(
}
template <typename TargetT, Precision PrecisionType, OpRunType RunType>
void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
const std::map<std::string, framework::LoDTensor *> &inputs,
const std::map<std::string, framework::LoDTensor *> &outputs,
cudaStream_t stream) {
void AnakinEngine<TargetT, PrecisionType, RunType>::BindInput(
const std::map<std::string, framework::LoDTensor *> &inputs) {
#ifdef PADDLE_WITH_CUDA
cudaDeviceSynchronize();
#endif
for (const auto &input : inputs) {
auto *tensor = input.second;
auto *data = tensor->data<float>();
......@@ -105,6 +105,35 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
fluid_input_shape);
anakin_input->copy_from(tmp_anakin_tensor);
}
}
template <typename TargetT, Precision PrecisionType, OpRunType RunType>
void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
const std::map<std::string, framework::LoDTensor *> &inputs,
const std::map<std::string, framework::LoDTensor *> &outputs) {
BindInput(inputs);
net_->prediction();
for (const auto &output : outputs) {
platform::CPUPlace cpu_place;
auto *tensor = output.second;
auto *anakin_output = net_->get_out(output.first);
auto *anakin_data = anakin_output->data();
auto anakin_output_shape = anakin_output->valid_shape();
tensor->Resize(framework::make_ddim(anakin_output_shape));
auto *fluid_data = tensor->mutable_data<float>(cpu_place);
memory::Copy(cpu_place, static_cast<void *>(fluid_data), cpu_place,
static_cast<void *>(anakin_data),
tensor->numel() * sizeof(float));
}
}
#ifdef PADDLE_WITH_CUDA
template <typename TargetT, Precision PrecisionType, OpRunType RunType>
void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
const std::map<std::string, framework::LoDTensor *> &inputs,
const std::map<std::string, framework::LoDTensor *> &outputs,
cudaStream_t stream) {
BindInput(inputs);
net_->prediction();
cudaDeviceSynchronize();
for (const auto &output : outputs) {
......@@ -121,6 +150,7 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
}
cudaDeviceSynchronize();
}
#endif
template <typename TargetT, Precision PrecisionType, OpRunType RunType>
void AnakinEngine<TargetT, PrecisionType, RunType>::Freeze() {
......@@ -140,7 +170,15 @@ AnakinEngine<TargetT, PrecisionType, RunType>::Clone() {
return std::unique_ptr<AnakinEngine>(engine);
}
#ifdef PADDLE_WITH_CUDA
template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>;
template class AnakinEngineManager<::anakin::saber::NV>;
#endif
template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>;
template class AnakinEngineManager<::anakin::saber::X86>;
} // namespace anakin
} // namespace inference
} // namespace paddle
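The refactor above splits the old stream-taking Execute into a shared BindInput step plus two overloads: a portable one used on CPU and a CUDA-guarded one that still takes a stream and synchronizes. A minimal standalone sketch of that shape (Engine and its members are hypothetical stand-ins; the CUDA overload is left declared only):

#include <iostream>
#include <map>
#include <string>

struct Engine {
  // Shared input binding, factored out of both Execute overloads.
  void BindInput(const std::map<std::string, float>& inputs) {
    std::cout << "bound " << inputs.size() << " inputs\n";
  }
  // Portable (CPU) path: no stream parameter, outputs copied on host.
  void Execute(const std::map<std::string, float>& inputs) {
    BindInput(inputs);
    std::cout << "prediction on host\n";
  }
#ifdef PADDLE_WITH_CUDA
  // GPU path: would run prediction and cudaDeviceSynchronize afterwards.
  void Execute(const std::map<std::string, float>& inputs,
               cudaStream_t stream);
#endif
};

int main() {
  Engine e;
  e.Execute({{"x", 1.0f}});
  return 0;
}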
......@@ -32,7 +32,6 @@
#include "saber/saber_types.h"
using anakin::Precision;
using anakin::saber::NV;
namespace anakin {
......@@ -94,9 +93,16 @@ class AnakinEngine {
void Save(std::string path) { graph_->save(path); }
bool IsInit() { return initialized_; }
int GetDevice() { return device_; }
void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
const std::map<std::string, framework::LoDTensor *> &outputs);
#ifdef PADDLE_WITH_CUDA
void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
const std::map<std::string, framework::LoDTensor *> &outputs,
cudaStream_t stream);
#endif
private:
void BindInput(const std::map<std::string, framework::LoDTensor *> &inputs);
private:
bool initialized_{false};
......@@ -108,24 +114,25 @@ class AnakinEngine {
std::vector<std::string> program_inputs_;
};
template <typename TargetT>
class AnakinEngineManager {
using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
using AnakinEngineT = AnakinEngine<TargetT, Precision::FP32>;
public:
bool HasEngine(const std::string &name) const {
if (engines_.count(name) == 0) return false;
return engines_.at(name).get() != nullptr;
}
AnakinNvEngineT *Get(const std::string &name) const {
AnakinEngineT *Get(const std::string &name) const {
return engines_.at(name).get();
}
AnakinNvEngineT *Create(
bool need_summary, int device, int max_batch_size,
AnakinEngineT *Create(bool need_summary, int device, int max_batch_size,
std::map<std::string, std::vector<int>> max_input_shape,
std::vector<std::string> program_inputs, std::string engine_name) {
std::vector<std::string> program_inputs,
std::string engine_name) {
std::unique_lock<std::mutex> lk(mut_);
auto *p = new AnakinEngine<NV, Precision::FP32>(
auto *p = new AnakinEngine<TargetT, Precision::FP32>(
need_summary, device, max_batch_size, max_input_shape, program_inputs);
engines_[engine_name].reset(p);
return p;
......@@ -138,7 +145,7 @@ class AnakinEngineManager {
}
private:
std::unordered_map<std::string, std::unique_ptr<AnakinNvEngineT>> engines_;
std::unordered_map<std::string, std::unique_ptr<AnakinEngineT>> engines_;
std::mutex mut_;
};
} // namespace anakin
......
......@@ -67,7 +67,7 @@ struct Argument {
#define DECL_ARGUMENT_FIELD(field__, Field, type__) \
public: \
type__& field__() { \
PADDLE_ENFORCE(Has(#field__)); \
PADDLE_ENFORCE(Has(#field__), "There is no such field"); \
return field__##_; \
} \
void Set##Field(const type__& x) { \
......
......@@ -114,6 +114,7 @@ void IRPassManager::CreatePasses(Argument *argument,
if (pass_name == "anakin_subgraph_pass") {
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
pass->Set("use_gpu", new bool(argument->use_gpu()));
pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
pass->Set("model_from_memory", new bool(argument->model_from_memory()));
pass->Set("engine_opt_info", new std::map<std::string, std::string>(
......
......@@ -194,20 +194,49 @@ void AnakinSubgraphPass::CreateAnakinOp(
auto max_batch_size = Get<int>("max_batch_size");
auto program_inputs = program_desc->GetFeedTargetNames();
auto *anakin_engine =
inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
true, Get<int>("gpu_device_id"), max_batch_size, max_input_shape,
program_inputs, engine_key);
bool use_gpu = Get<bool>("use_gpu");
SetAttr(op_desc->Proto(), "use_gpu", use_gpu);
if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
inference::Singleton<
anakin::AnakinEngineManager<::anakin::saber::NV>>::Global()
.Create(true, Get<int>("gpu_device_id"), max_batch_size,
max_input_shape, program_inputs, engine_key);
#endif
} else {
inference::Singleton<
anakin::AnakinEngineManager<::anakin::saber::X86>>::Global()
.Create(true, Get<int>("gpu_device_id"), max_batch_size,
max_input_shape, program_inputs, engine_key);
}
auto *scope = param_scope();
std::unordered_set<std::string> param_set(params.begin(), params.end());
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
if (use_gpu) {
auto *anakin_engine =
inference::Singleton<inference::anakin::AnakinEngineManager<
::anakin::saber::NV>>::Global()
.Get(engine_key);
inference::Singleton<
inference::anakin::AnakinOpConverter<::anakin::saber::NV>>::Global()
.ConvertBlockToAnakinEngine(
&block_desc_temp, scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, anakin_engine);
} else {
auto *anakin_engine =
inference::Singleton<inference::anakin::AnakinEngineManager<
::anakin::saber::X86>>::Global()
.Get(engine_key);
inference::Singleton<
inference::anakin::AnakinOpConverter<::anakin::saber::X86>>::Global()
.ConvertBlockToAnakinEngine(
&block_desc_temp, scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, anakin_engine);
}
}
} // namespace analysis
......
......@@ -70,4 +70,3 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
anakin_target(inference_anakin_api)
anakin_target(inference_anakin_api_shared)
endif()
inference_analysis_test(faster_rcnn_test SRCS faster_rcnn_test.cc EXTRA_DEPS paddle_fluid)
......@@ -268,9 +268,11 @@ void AnalysisConfig::Update() {
PADDLE_ENFORCE(!use_tensorrt_,
"Anakin sub-graph and TensorRT sub-graph are not allowed to "
"run at the same time!");
PADDLE_ENFORCE(
use_gpu_,
"Anakin sub-graph engine need gpu, please use the EnableGpu API.");
if (use_gpu_) {
LOG(INFO) << "Run Anakin GPU mode";
} else {
LOG(INFO) << "Run Anakin CPU mode";
}
pass_builder()->ClearPasses();
for (const auto &pass : kAnakinSubgraphPasses) {
......
......@@ -382,7 +382,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
}
if (config_.use_gpu() && config_.anakin_engine_enabled()) {
if (config_.anakin_engine_enabled()) {
argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_);
argument_.SetAnakinMinSubgraphSize(config_.anakin_min_subgraph_size_);
......
......@@ -34,28 +34,16 @@ limitations under the License. */
namespace paddle {
namespace operators {
using FluidDT = framework::proto::VarType_Type;
using inference::Singleton;
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
using inference::anakin::AnakinEngine;
class AnakinEngineOp : public framework::OperatorBase {
using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
private:
std::vector<std::string> input_names_;
std::unordered_set<std::string> param_names_;
mutable AnakinNvEngineT *anakin_engine_;
std::string engine_key_;
std::string engine_serialized_data_;
bool use_gpu_;
public:
AnakinEngineOp(const std::string &type,
......@@ -66,10 +54,10 @@ class AnakinEngineOp : public framework::OperatorBase {
input_names_ = Inputs("Xs");
engine_key_ = Attr<std::string>("engine_key");
auto params = Attr<std::vector<std::string>>("parameters");
use_gpu_ = Attr<bool>("use_gpu");
for (const auto &param : params) {
param_names_.insert(param);
}
anakin_engine_ = nullptr;
}
protected:
......@@ -80,7 +68,6 @@ class AnakinEngineOp : public framework::OperatorBase {
void RunAnakin(const framework::Scope &scope,
const platform::Place &dev_place) const {
auto *engine = GetEngine(scope, dev_place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
auto stream =
......@@ -92,7 +79,6 @@ class AnakinEngineOp : public framework::OperatorBase {
Attr<std::vector<std::string>>("output_name_mapping");
std::map<std::string, framework::LoDTensor *> inputs;
// Convert input tensor from fluid to engine.
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
auto &t =
......@@ -110,17 +96,21 @@ class AnakinEngineOp : public framework::OperatorBase {
outputs.insert({output_maps[output_index], fluid_t});
output_index += 1;
}
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
auto *engine =
inference::Singleton<inference::anakin::AnakinEngineManager<
::anakin::saber::NV>>::Global()
.Get(engine_key_);
engine->Execute(inputs, outputs, stream);
}
AnakinNvEngineT *GetEngine(const framework::Scope &scope,
const platform::Place &dev_place) const {
if (anakin_engine_ == nullptr) {
anakin_engine_ =
inference::Singleton<inference::anakin::AnakinEngineManager>::Global()
#endif
} else {
auto *engine =
inference::Singleton<inference::anakin::AnakinEngineManager<
::anakin::saber::X86>>::Global()
.Get(engine_key_);
engine->Execute(inputs, outputs);
}
return anakin_engine_;
}
};
......