Commit 7ad182e1, authored by nhzlx

Cherry-pick from 16662: Anakin subgraph CPU support

Parent: 8643dbc2
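The pattern repeated across every converter in this diff: each `XxxOpConverter` becomes a class template over the Anakin target type (`::anakin::saber::NV` for GPU, `::anakin::saber::X86` for CPU) and inherits from `AnakinOpConverter<TargetT>`. Because the base class is now a dependent type, every use of the inherited `engine_` member needs a `this->` prefix, and calls to member function templates such as `AddOpAttr<PTuple<int>>` additionally need the `template` disambiguator. A minimal standalone sketch of that C++ rule follows; all names here are illustrative stand-ins, not Paddle's real classes.

// Why the converters gained `this->` and `template` once templated on target.
#include <iostream>
#include <string>

struct X86 { static constexpr const char *name = "X86"; };
struct NV  { static constexpr const char *name = "NV"; };

template <typename TargetT>
struct Engine {
  template <typename T>  // member function template, like AddOpAttr<T>
  void AddOpAttr(const std::string &attr, T value) {
    std::cout << attr << " = " << value << " on " << TargetT::name << "\n";
  }
};

template <typename TargetT>
struct BaseConverter {  // stands in for AnakinOpConverter<TargetT>
  Engine<TargetT> *engine_ = nullptr;
};

template <typename TargetT>
struct ReluConverter : BaseConverter<TargetT> {
  void operator()() {
    // `engine_` lives in a dependent base class, so unqualified lookup would
    // fail at template-definition time; `this->` defers lookup to
    // instantiation. `template` tells the parser AddOpAttr<int> is a member
    // template, not a less-than comparison.
    this->engine_->template AddOpAttr<int>("axis", 1);
  }
};

int main() {
  Engine<X86> engine;
  ReluConverter<X86> relu;
  relu.engine_ = &engine;
  relu();  // prints: axis = 1 on X86
}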
@@ -25,8 +25,9 @@ endif()
 if(ANAKIN_FOUND)
   message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
+  include_directories(${ANAKIN_ROOT})
   include_directories(${ANAKIN_ROOT}/include)
-  include_directories(${ANAKIN_ROOT}/include/saber)
+  include_directories(${ANAKIN_ROOT}/saber)
   link_directories(${ANAKIN_ROOT})
   add_definitions(-DPADDLE_WITH_ANAKIN)
 endif()
@@ -16,16 +16,13 @@
 #include <algorithm>
 #include <map>
 
-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::saber::NV;
-using anakin::saber::Shape;
-
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-ActivationOpConverter::ActivationOpConverter(const std::string &op_type)
+template <typename TargetT>
+ActivationOpConverter<TargetT>::ActivationOpConverter(
+    const std::string &op_type)
     : op_type_(op_type) {
   auto it = anakin_op_types_.find(op_type_);
   PADDLE_ENFORCE(it != anakin_op_types_.end(),
@@ -33,10 +30,10 @@ ActivationOpConverter::ActivationOpConverter(const std::string &op_type)
   anakin_op_type_ = it->second;
 }
 
-void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
-                                       const framework::BlockDesc &block_desc,
-                                       const framework::Scope &scope,
-                                       bool test_mode) {
+template <typename TargetT>
+void ActivationOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
@@ -44,13 +41,20 @@ void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
   auto input_name = op_desc.Input("X").front();
   auto output_name = op_desc.Output("Out").front();
-  engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
-  engine_->AddOpAttr(op_name, "type", anakin_op_type_);
+  this->engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
+  this->engine_->AddOpAttr(op_name, "type", anakin_op_type_);
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(sigmoid,
+                                  SigmoidOpConverter<::anakin::saber::NV>);
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(sigmoid,
+                                 SigmoidOpConverter<::anakin::saber::X86>);
+REGISTER_CPU_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter<::anakin::saber::X86>);
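With the converters templated, a single `REGISTER_ANAKIN_OP_CONVERTER` call can no longer select the instantiation, so registration is split into `REGISTER_CUDA_ANAKIN_OP_CONVERTER` and `REGISTER_CPU_ANAKIN_OP_CONVERTER`, with the CUDA lines guarded by `#ifdef PADDLE_WITH_CUDA` so a CPU-only build never names the `::anakin::saber::NV` specialization. Below is a hedged sketch of how such a per-target registry is commonly wired; this is a guess at the shape of the mechanism, not Paddle's actual macro bodies.

#include <functional>
#include <iostream>
#include <map>
#include <string>

struct X86 {};
struct NV {};

// One op-type -> factory map per target; the template parameter keys the map.
template <typename TargetT>
std::map<std::string, std::function<void()>> &Registry() {
  static std::map<std::string, std::function<void()>> registry;
  return registry;
}

// A registrar whose constructor runs during static initialization, the usual
// trick behind REGISTER_* macros.
template <typename TargetT>
struct Registrar {
  Registrar(const std::string &op_type, std::function<void()> factory) {
    Registry<TargetT>()[op_type] = std::move(factory);
  }
};

// Hypothetical macro: registers a converter type under the CPU target.
#define REGISTER_CPU_CONVERTER(op, Conv) \
  static Registrar<X86> reg_cpu_##op(#op, [] { Conv{}; })

struct SigmoidConverterX86 {};
REGISTER_CPU_CONVERTER(sigmoid, SigmoidConverterX86);

int main() {
  std::cout << "CPU registry size: " << Registry<X86>().size() << "\n";  // 1
}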
@@ -22,7 +22,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class ActivationOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class ActivationOpConverter : public AnakinOpConverter<TargetT> {
  public:
   explicit ActivationOpConverter(const std::string &op_type);
 
@@ -39,14 +40,16 @@ class ActivationOpConverter : public AnakinOpConverter {
       {"sigmoid", "Sigmoid"}};
 };
 
-class TanhOpConverter : public ActivationOpConverter {
+template <typename TargetT>
+class TanhOpConverter : public ActivationOpConverter<TargetT> {
  public:
-  TanhOpConverter() : ActivationOpConverter("tanh") {}
+  TanhOpConverter() : ActivationOpConverter<TargetT>("tanh") {}
 };
 
-class SigmoidOpConverter : public ActivationOpConverter {
+template <typename TargetT>
+class SigmoidOpConverter : public ActivationOpConverter<TargetT> {
  public:
-  SigmoidOpConverter() : ActivationOpConverter("sigmoid") {}
+  SigmoidOpConverter() : ActivationOpConverter<TargetT>("sigmoid") {}
 };
 }  // namespace anakin
 }  // namespace inference
...
@@ -18,19 +18,16 @@
 using anakin::graph::GraphGlobalMem;
+using anakin::PTuple;
 using anakin::AK_FLOAT;
-using anakin::Precision;
-using anakin::saber::NV;
-using anakin::saber::X86;
 using anakin::saber::Shape;
-using anakin::PBlock;
-using anakin::PTuple;
 
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-void AffineChannelOpConverter::operator()(
+template <typename TargetT>
+void AffineChannelOpConverter<TargetT>::operator()(
     const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
     const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
@@ -59,7 +56,7 @@ void AffineChannelOpConverter::operator()(
   bias_tensor->Resize(bias_t->dims());
   TensorCopySync((*bias_t), platform::CPUPlace(), bias_tensor.get());
 
-  engine_->AddOp(op_name, "AffineChannel", {input_name}, {output_name});
+  this->engine_->AddOp(op_name, "AffineChannel", {input_name}, {output_name});
 
   // Generate the Scale parameter of Anakin.
   auto scale_shape = framework::vectorize2int(scale_t->dims());
@@ -67,7 +64,8 @@ void AffineChannelOpConverter::operator()(
     scale_shape.insert(scale_shape.begin(), 1);
   }
   Shape anakin_scale_shape(scale_shape);
-  auto *weight1 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
+  auto *weight1 =
+      GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
           anakin_scale_shape);
   float *scale_cpu_data =
       static_cast<float *>(weight1->h_tensor().mutable_data());
@@ -75,7 +73,7 @@ void AffineChannelOpConverter::operator()(
               scale_cpu_data);
   weight1->d_tensor().set_shape(anakin_scale_shape);
   weight1->d_tensor().copy_from(weight1->h_tensor());
-  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+  this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
 
   // Generate the Bias parameter of Anakin.
   auto bias_shape = framework::vectorize2int(bias_t->dims());
@@ -83,18 +81,24 @@ void AffineChannelOpConverter::operator()(
     bias_shape.insert(bias_shape.begin(), 1);
   }
   Shape anakin_bias_shape(bias_shape);
-  auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
+  auto *weight2 =
+      GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
           anakin_bias_shape);
   float *bias_cpu_data =
       static_cast<float *>(weight2->h_tensor().mutable_data());
   std::copy_n(bias_tensor->data<float>(), bias_tensor->numel(), bias_cpu_data);
   weight2->d_tensor().set_shape(anakin_bias_shape);
   weight2->d_tensor().copy_from(weight2->h_tensor());
-  engine_->AddOpAttr(op_name, "weight_2", *weight2);
+  this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(affine_channel, AffineChannelOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(
+    affine_channel, AffineChannelOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(
+    affine_channel, AffineChannelOpConverter<::anakin::saber::X86>);
@@ -21,7 +21,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class AffineChannelOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class AffineChannelOpConverter : public AnakinOpConverter<TargetT> {
  public:
   AffineChannelOpConverter() = default;
...
@@ -21,17 +21,16 @@
 using anakin::graph::GraphGlobalMem;
 using anakin::AK_FLOAT;
-using anakin::saber::NV;
 using anakin::saber::Shape;
 
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
-                                      const framework::BlockDesc &block_desc,
-                                      const framework::Scope &scope,
-                                      bool test_mode) {
+template <typename TargetT>
+void BatchNormOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1);
   std::map<std::string, std::string> inputs;
@@ -48,9 +47,9 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
   auto bn_op_name = op_name + ":bn";
   auto bn_output = bn_op_name + "_output";
-  engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
-  engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
-  engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));
+  this->engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
+  this->engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
+  this->engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));
 
   auto scale_op_name = op_name + ":scale";
   auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name,
@@ -81,48 +80,54 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
   Shape shape1(fill_shape(4, framework::vectorize2int(mean_t.dims())));
   Shape shape2(fill_shape(4, framework::vectorize2int(variance_t.dims())));
   auto *weight1 =
-      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
+      GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(shape1);
   auto *mean_data = static_cast<float *>(weight1->h_tensor().mutable_data());
   std::copy_n(mean_t.data<float>(), mean_t.numel(), mean_data);
-  engine_->AddOpAttr(bn_op_name, "weight_1", *weight1);
+  this->engine_->AddOpAttr(bn_op_name, "weight_1", *weight1);
 
   auto *weight2 =
-      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape2);
+      GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(shape2);
   auto *variance_data =
       static_cast<float *>(weight2->h_tensor().mutable_data());
   std::copy_n(variance_t.data<float>(), variance_t.numel(), variance_data);
-  engine_->AddOpAttr(bn_op_name, "weight_2", *weight2);
+  this->engine_->AddOpAttr(bn_op_name, "weight_2", *weight2);
 
   Shape shape3(std::vector<int>({1, 1, 1, 1}));
   auto *weight3 =
-      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape3);
+      GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(shape3);
   auto *alpha_data = static_cast<float *>(weight3->h_tensor().mutable_data());
   float weight3_data[] = {1};
   std::copy(std::begin(weight3_data), std::end(weight3_data), alpha_data);
-  engine_->AddOpAttr(bn_op_name, "weight_3", *weight3);
+  this->engine_->AddOpAttr(bn_op_name, "weight_3", *weight3);
 
   Shape scale_shape(fill_shape(4, framework::vectorize2int(scale_t.dims())));
-  auto *scale =
-      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(scale_shape);
+  auto *scale = GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
+      scale_shape);
   auto *scale_data = static_cast<float *>(scale->h_tensor().mutable_data());
   std::copy_n(scale_t.data<float>(), scale_t.numel(), scale_data);
 
   Shape bias_shape(fill_shape(4, framework::vectorize2int(bias_t.dims())));
-  auto *bias =
-      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(bias_shape);
+  auto *bias = GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
+      bias_shape);
   auto *bias_data = static_cast<float *>(bias->h_tensor().mutable_data());
   std::copy_n(bias_t.data<float>(), bias_t.numel(), bias_data);
 
-  engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output});
-  engine_->AddOpAttr(scale_op_name, "axis", 1);
-  engine_->AddOpAttr(scale_op_name, "num_axes", 1);
-  engine_->AddOpAttr(scale_op_name, "bias_term", true);
-  engine_->AddOpAttr(scale_op_name, "weight_1", *scale);
-  engine_->AddOpAttr(scale_op_name, "weight_2", *bias);
+  this->engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output});
+  this->engine_->AddOpAttr(scale_op_name, "axis", 1);
+  this->engine_->AddOpAttr(scale_op_name, "num_axes", 1);
+  this->engine_->AddOpAttr(scale_op_name, "bias_term", true);
+  this->engine_->AddOpAttr(scale_op_name, "weight_1", *scale);
+  this->engine_->AddOpAttr(scale_op_name, "weight_2", *bias);
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(batch_norm, BatchNormOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(batch_norm,
+                                  BatchNormOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(batch_norm,
+                                 BatchNormOpConverter<::anakin::saber::X86>);
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class BatchNormOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class BatchNormOpConverter : public AnakinOpConverter<TargetT> {
  public:
   BatchNormOpConverter() = default;
...
@@ -15,38 +15,32 @@
 #include "paddle/fluid/inference/anakin/convert/concat.h"
 #include <algorithm>
 
-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::Precision;
-using anakin::saber::NV;
-using anakin::saber::X86;
-using anakin::saber::Shape;
-using anakin::PBlock;
-using anakin::PTuple;
-
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-void ConcatOpConverter::operator()(const framework::proto::OpDesc &op,
-                                   const framework::BlockDesc &block_desc,
-                                   const framework::Scope &scope,
-                                   bool test_mode) {
+template <typename TargetT>
+void ConcatOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   int axis = boost::get<int>(op_desc.GetAttr("axis"));
   auto input_names = op_desc.Input("X");
-  // PADDLE_ENFORCE(axis > 0,
-  //                "The axis attr of Concat op should be large than 0 for trt");
 
   auto y_name = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
 
-  engine_->AddOp(op_name, "Concat", input_names, {y_name});
-  engine_->AddOpAttr(op_name, "axis", axis);
+  this->engine_->AddOp(op_name, "Concat", input_names, {y_name});
+  this->engine_->AddOpAttr(op_name, "axis", axis);
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(concat, ConcatOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(concat,
+                                  ConcatOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(concat,
+                                 ConcatOpConverter<::anakin::saber::X86>);
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class ConcatOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class ConcatOpConverter : public AnakinOpConverter<TargetT> {
  public:
   ConcatOpConverter() = default;
...
@@ -18,19 +18,18 @@
 #include <vector>
 
 using anakin::graph::GraphGlobalMem;
+using anakin::PTuple;
 using anakin::AK_FLOAT;
-using anakin::saber::NV;
 using anakin::saber::Shape;
-using anakin::PTuple;
 
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
-                                   const framework::BlockDesc &block_desc,
-                                   const framework::Scope &scope,
-                                   bool test_mode) {
+template <typename TargetT>
+void Conv2dOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
   PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
@@ -39,7 +38,7 @@ void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
   auto input_name = op_desc.Input("Input").front();
   auto output_name = op_desc.Output("Output").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
-  engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
+  this->engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
 
   auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
   PADDLE_ENFORCE_NOT_NULL(filter_v);
@@ -51,38 +50,44 @@ void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
   PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
 
-  // const int n_output = weight_tensor->dims()[0];
-  // const int n_input = weight_tensor->dims()[1];
   const int filter_h = weight_tensor->dims()[2];
   const int filter_w = weight_tensor->dims()[3];
-  // auto filter_num = n_input * filter_h * filter_w ;
   auto filter_num = weight_tensor->dims()[0];
-  engine_->AddOpAttr<int>(op_name, "filter_num", filter_num);
-  engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
+  this->engine_->template AddOpAttr<int>(op_name, "filter_num", filter_num);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "kernel_size",
+                                                 {filter_h, filter_w});
   auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
-  engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "strides", strides);
   auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-  engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
   auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
-  engine_->AddOpAttr<PTuple<int>>(op_name, "dilation_rate", dilations);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dilation_rate",
+                                                 dilations);
   const int groups = boost::get<int>(op_desc.GetAttr("groups"));
-  engine_->AddOpAttr(op_name, "group", groups);
-  engine_->AddOpAttr(op_name, "axis", 1);
-  engine_->AddOpAttr(op_name, "bias_term", false);
+  this->engine_->AddOpAttr(op_name, "group", groups);
+  this->engine_->AddOpAttr(op_name, "axis", 1);
+  this->engine_->AddOpAttr(op_name, "bias_term", false);
 
   auto weight_shape = framework::vectorize2int(filter_t->dims());
   Shape anakin_shape(weight_shape);
   auto *weight1 =
-      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
+      GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
+          anakin_shape);
   float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
   std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
   weight1->d_tensor().set_shape(anakin_shape);
   weight1->d_tensor().copy_from(weight1->h_tensor());
-  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+  this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(conv2d, Conv2dOpConverter);
+REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d,
+                                 Conv2dOpConverter<::anakin::saber::X86>);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d,
+                                  Conv2dOpConverter<::anakin::saber::NV>);
+#endif
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class Conv2dOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class Conv2dOpConverter : public AnakinOpConverter<TargetT> {
  public:
   Conv2dOpConverter() = default;
...
@@ -18,19 +18,18 @@
 #include <vector>
 
 using anakin::graph::GraphGlobalMem;
+using anakin::PTuple;
 using anakin::AK_FLOAT;
-using anakin::saber::NV;
 using anakin::saber::Shape;
-using anakin::PTuple;
 
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
-                                         const framework::BlockDesc &block_desc,
-                                         const framework::Scope &scope,
-                                         bool test_mode) {
+template <typename TargetT>
+void Conv2dFusionOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
   PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
@@ -40,7 +39,7 @@ void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
   auto input_name = op_desc.Input("Input").front();
   auto output_name = op_desc.Output("Output").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
-  engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
+  this->engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
 
   auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
   PADDLE_ENFORCE_NOT_NULL(filter_v);
@@ -63,28 +62,31 @@ void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
   const int filter_w = weight_tensor->dims()[3];
   // auto filter_num = n_input * filter_h * filter_w ;
   auto filter_num = weight_tensor->dims()[0];
-  engine_->AddOpAttr<int>(op_name, "filter_num", filter_num);
-  engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
+  this->engine_->template AddOpAttr<int>(op_name, "filter_num", filter_num);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "kernel_size",
+                                                 {filter_h, filter_w});
   auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
-  engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "strides", strides);
   auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-  engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
   auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
-  engine_->AddOpAttr<PTuple<int>>(op_name, "dilation_rate", dilations);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dilation_rate",
+                                                 dilations);
   const int groups = boost::get<int>(op_desc.GetAttr("groups"));
-  engine_->AddOpAttr(op_name, "group", groups);
-  engine_->AddOpAttr(op_name, "axis", 1);
-  engine_->AddOpAttr(op_name, "bias_term", true);
+  this->engine_->AddOpAttr(op_name, "group", groups);
+  this->engine_->AddOpAttr(op_name, "axis", 1);
+  this->engine_->AddOpAttr(op_name, "bias_term", true);
 
   auto weight_shape = framework::vectorize2int(filter_t->dims());
   Shape anakin_shape(weight_shape);
   auto *weight1 =
-      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
+      GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
+          anakin_shape);
   float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
   std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
   weight1->d_tensor().set_shape(anakin_shape);
   weight1->d_tensor().copy_from(weight1->h_tensor());
-  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+  this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
 
   auto bias_shape = framework::vectorize2int(b_t->dims());
   framework::LoDTensor bias_tensor;
@@ -98,17 +100,24 @@ void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
   // bias_shape.push_back(1);
   Shape anakin_bias_shape(bias_shape);
-  auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
+  auto *weight2 =
+      GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
          anakin_bias_shape);
   float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
   std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
   weight2->d_tensor().set_shape(anakin_bias_shape);
   weight2->d_tensor().copy_from(weight2->h_tensor());
-  engine_->AddOpAttr(op_name, "weight_2", *weight2);
+  this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(conv2d_fusion, Conv2dFusionOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d_fusion,
+                                  Conv2dFusionOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d_fusion,
+                                 Conv2dFusionOpConverter<::anakin::saber::X86>);
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class Conv2dFusionOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class Conv2dFusionOpConverter : public AnakinOpConverter<TargetT> {
  public:
   Conv2dFusionOpConverter() = default;
...
@@ -17,17 +17,14 @@
 #include <map>
 #include <vector>
 
-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::saber::NV;
-using anakin::saber::Shape;
 using anakin::PTuple;
 
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-void DensityPriorBoxOpConverter::operator()(
+template <typename TargetT>
+void DensityPriorBoxOpConverter<TargetT>::operator()(
     const framework::proto::OpDesc& op, const framework::BlockDesc& block_desc,
     const framework::Scope& scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
@@ -81,27 +78,44 @@ void DensityPriorBoxOpConverter::operator()(
   std::vector<float> temp_v = {};
 
-  engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
-  engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", min_sizes);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", max_sizes);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", aspect_ratios);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens);
-  engine_->AddOpAttr(op_name, "is_flip", is_flip);
-  engine_->AddOpAttr(op_name, "is_clip", is_clip);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
-  engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
-  engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
-  engine_->AddOpAttr(op_name, "step_h", step_h);
-  engine_->AddOpAttr(op_name, "step_w", step_w);
-  engine_->AddOpAttr(op_name, "offset", offset);
-  engine_->AddOpAttr<PTuple<std::string>>(op_name, "order", t_order);
+  this->engine_->AddOp(op_name, "PriorBox", {input_name, image_name},
+                       {output_name});
+  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "min_size",
+                                                   min_sizes);
+  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "max_size",
+                                                   max_sizes);
+  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "aspect_ratio",
+                                                   aspect_ratios);
+  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "fixed_size",
+                                                   fixed_sizes);
+  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "fixed_ratio",
+                                                   fixed_ratios);
+  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "density", dens);
+  this->engine_->AddOpAttr(op_name, "is_flip", is_flip);
+  this->engine_->AddOpAttr(op_name, "is_clip", is_clip);
+  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "variance",
+                                                   variances);
+  this->engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
+  this->engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
+  this->engine_->AddOpAttr(op_name, "step_h", step_h);
+  this->engine_->AddOpAttr(op_name, "step_w", step_w);
+  this->engine_->AddOpAttr(op_name, "offset", offset);
+  this->engine_->template AddOpAttr<PTuple<std::string>>(op_name, "order",
+                                                         t_order);
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(prior_box, DensityPriorBoxOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(
+    density_prior_box, DensityPriorBoxOpConverter<::anakin::saber::NV>);
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(
+    prior_box, DensityPriorBoxOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(
+    density_prior_box, DensityPriorBoxOpConverter<::anakin::saber::X86>);
+REGISTER_CPU_ANAKIN_OP_CONVERTER(
+    prior_box, DensityPriorBoxOpConverter<::anakin::saber::X86>);
@@ -22,7 +22,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class DensityPriorBoxOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class DensityPriorBoxOpConverter : public AnakinOpConverter<TargetT> {
  public:
   DensityPriorBoxOpConverter() = default;
...
@@ -16,19 +16,14 @@
 #include <algorithm>
 #include <map>
 
-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::saber::NV;
-using anakin::saber::Shape;
-
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
-                                         const framework::BlockDesc &block_desc,
-                                         const framework::Scope &scope,
-                                         bool test_mode) {
+template <typename TargetT>
+void DetectionOutOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   auto target_name = op_desc.Input("TargetBox").front();
   auto prior_box_name = op_desc.Input("PriorBox").front();
@@ -52,22 +47,28 @@ void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
         "Not support encode_center_size code_type in DetectionOut of anakin");
   }
 
-  engine_->AddOp(op_name, "DetectionOutput",
-                 {target_name, scores_name, prior_box_name}, {output_name});
-  engine_->AddOpAttr(op_name, "share_location", true);
-  engine_->AddOpAttr(op_name, "variance_encode_in_target", false);
-  engine_->AddOpAttr(op_name, "class_num", static_cast<int>(0));
-  engine_->AddOpAttr(op_name, "background_id", background_label);
-  engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k);
-  engine_->AddOpAttr(op_name, "code_type", anakin_code_type);
-  engine_->AddOpAttr(op_name, "conf_thresh", score_threshold);
-  engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k);
-  engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold);
-  engine_->AddOpAttr(op_name, "nms_eta", nms_eta);
+  this->engine_->AddOp(op_name, "DetectionOutput",
+                       {target_name, scores_name, prior_box_name},
+                       {output_name});
+  this->engine_->AddOpAttr(op_name, "share_location", true);
+  this->engine_->AddOpAttr(op_name, "variance_encode_in_target", false);
+  this->engine_->AddOpAttr(op_name, "class_num", static_cast<int>(0));
+  this->engine_->AddOpAttr(op_name, "background_id", background_label);
+  this->engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k);
+  this->engine_->AddOpAttr(op_name, "code_type", anakin_code_type);
+  this->engine_->AddOpAttr(op_name, "conf_thresh", score_threshold);
+  this->engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k);
+  this->engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold);
+  this->engine_->AddOpAttr(op_name, "nms_eta", nms_eta);
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(detection_out, DetectionOutOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(detection_out,
                                  DetectionOutOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(detection_out,
+                                 DetectionOutOpConverter<::anakin::saber::X86>);
@@ -22,7 +22,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class DetectionOutOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class DetectionOutOpConverter : public AnakinOpConverter<TargetT> {
  public:
   DetectionOutOpConverter() = default;
...
@@ -19,21 +19,16 @@
 using anakin::graph::GraphGlobalMem;
 using anakin::AK_FLOAT;
-using anakin::Precision;
-using anakin::saber::NV;
-using anakin::saber::X86;
 using anakin::saber::Shape;
-using anakin::PBlock;
-using anakin::PTuple;
 
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-void DropoutOpConverter::operator()(const framework::proto::OpDesc &op,
-                                    const framework::BlockDesc &block_desc,
-                                    const framework::Scope &scope,
-                                    bool test_mode) {
+template <typename TargetT>
+void DropoutOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Output("Mask").size(), 1);
@@ -43,25 +38,30 @@ void DropoutOpConverter::operator()(const framework::proto::OpDesc &op,
   auto out_name = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
 
-  engine_->AddOp(op_name, "Scale", {x_name}, {out_name});
+  this->engine_->AddOp(op_name, "Scale", {x_name}, {out_name});
 
   auto dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));
   auto factor = 1 - dropout_prob;
   Shape shape1(std::vector<int>({1, 1, 1, 1}));
   auto *weight1 =
-      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
+      GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(shape1);
   auto *factor_data = static_cast<float *>(weight1->h_tensor().mutable_data());
   float weight1_data[] = {factor};
   std::copy(std::begin(weight1_data), std::end(weight1_data), factor_data);
 
-  engine_->AddOpAttr(op_name, "weight_1", *weight1);
-  engine_->AddOpAttr(op_name, "axis", 0);
-  engine_->AddOpAttr(op_name, "num_axes", 0);
-  engine_->AddOpAttr(op_name, "bias_term", false);
+  this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
+  this->engine_->AddOpAttr(op_name, "axis", 0);
+  this->engine_->AddOpAttr(op_name, "num_axes", 0);
+  this->engine_->AddOpAttr(op_name, "bias_term", false);
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(dropout, DropoutOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(dropout,
+                                  DropoutOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(dropout,
+                                 DropoutOpConverter<::anakin::saber::X86>);
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class DropoutOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class DropoutOpConverter : public AnakinOpConverter<TargetT> {
  public:
   DropoutOpConverter() = default;
...
@@ -19,18 +19,15 @@
 using anakin::graph::GraphGlobalMem;
 using anakin::AK_FLOAT;
-using anakin::Precision;
-using anakin::saber::NV;
-using anakin::saber::X86;
 using anakin::saber::Shape;
-using anakin::PBlock;
 using anakin::PTuple;
 
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-void ElementwiseAddOpConverter::operator()(
+template <typename TargetT>
+void ElementwiseAddOpConverter<TargetT>::operator()(
     const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
     const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
@@ -43,14 +40,16 @@ void ElementwiseAddOpConverter::operator()(
   auto out_name = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
 
-  engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
+  this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
   std::string elementwise_type = "Add";
-  engine_->AddOpAttr<std::string>(op_name, "type", elementwise_type);
+  this->engine_->template AddOpAttr<std::string>(op_name, "type",
+                                                 elementwise_type);
   std::vector<float> coeff = {1.0, 1.0};
-  engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
+  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
 }
 
-void ElementwiseMulOpConverter::operator()(
+template <typename TargetT>
+void ElementwiseMulOpConverter<TargetT>::operator()(
     const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
     const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
@@ -63,26 +62,25 @@ void ElementwiseMulOpConverter::operator()(
   auto out_name = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
 
-  engine_->AddOp(op_name, "Scale", {x_name, y_name}, {out_name});
-  // Fill a number to weight_1 as a placeholder.
-  Shape shape1(std::vector<int>({1, 1, 1, 1}));
-  auto *weight1 =
-      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
-  auto *placeholder_data =
-      static_cast<float *>(weight1->h_tensor().mutable_data());
-  float weight1_data[] = {1};
-  std::copy(std::begin(weight1_data), std::end(weight1_data), placeholder_data);
-  engine_->AddOpAttr(op_name, "weight_1", *weight1);
-
-  auto axis = boost::get<int>(op_desc.GetAttr("axis"));
-  engine_->AddOpAttr(op_name, "axis", axis);
-  engine_->AddOpAttr(op_name, "num_axes", 1);
-  engine_->AddOpAttr(op_name, "bias_term", false);
+  this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
+  std::string elementwise_type = "Prod";
+  this->engine_->template AddOpAttr<std::string>(op_name, "type",
+                                                 elementwise_type);
+  std::vector<float> coeff = {1.0, 1.0};
+  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(elementwise_add, ElementwiseAddOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(elementwise_mul, ElementwiseMulOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(
+    elementwise_add, ElementwiseAddOpConverter<::anakin::saber::NV>);
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(
+    elementwise_mul, ElementwiseMulOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(
+    elementwise_add, ElementwiseAddOpConverter<::anakin::saber::X86>);
+REGISTER_CPU_ANAKIN_OP_CONVERTER(
+    elementwise_mul, ElementwiseMulOpConverter<::anakin::saber::X86>);
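Note the one behavioral change buried in this file: `elementwise_mul` previously lowered to an Anakin `Scale` op carrying a placeholder `weight_1`, and now lowers to `Eltwise` with type "Prod", mirroring how `elementwise_add` uses type "Add". A toy sketch of the semantics an eltwise product is expected to provide on equal-shaped inputs follows; this is our plain C++ illustration, not Anakin code.

#include <cassert>
#include <vector>

// Elementwise product of two equal-shaped tensors: out[i] = x[i] * y[i].
std::vector<float> EltwiseProd(const std::vector<float> &x,
                               const std::vector<float> &y) {
  assert(x.size() == y.size());
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = x[i] * y[i];
  return out;
}

int main() {
  auto out = EltwiseProd({1.f, 2.f, 3.f}, {4.f, 5.f, 6.f});
  assert(out[2] == 18.f);  // 3 * 6
}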
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class ElementwiseAddOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class ElementwiseAddOpConverter : public AnakinOpConverter<TargetT> {
  public:
   ElementwiseAddOpConverter() = default;
 
@@ -33,7 +34,8 @@ class ElementwiseAddOpConverter : public AnakinOpConverter {
  private:
 };
 
-class ElementwiseMulOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class ElementwiseMulOpConverter : public AnakinOpConverter<TargetT> {
  public:
   ElementwiseMulOpConverter() = default;
...
@@ -19,17 +19,16 @@
 using anakin::graph::GraphGlobalMem;
 using anakin::AK_FLOAT;
-using anakin::saber::NV;
 using anakin::saber::Shape;
 
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
-                                   const framework::BlockDesc &block_desc,
-                                   const framework::Scope &scope,
-                                   bool test_mode) {
+template <typename TargetT>
+void FcBaseOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   auto input_names = op_desc.InputNames();
   bool with_bias = input_names.size() == 3;
@@ -51,13 +50,13 @@ void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
   auto input_name = op_desc.Input(i_name).front();
   auto output_name = op_desc.Output("Out").front();
 
-  engine_->AddOp(op_name, "Dense", {input_name}, {output_name});
-  engine_->AddOpAttr(op_name, "bias_term", with_bias);
-  engine_->AddOpAttr(op_name, "axis", 1);
+  this->engine_->AddOp(op_name, "Dense", {input_name}, {output_name});
+  this->engine_->AddOpAttr(op_name, "bias_term", with_bias);
+  this->engine_->AddOpAttr(op_name, "axis", 1);
 
   auto weight_shape = framework::vectorize2int(y_t->dims());
   int out_dim = weight_shape[1];
-  engine_->AddOpAttr(op_name, "out_dim", out_dim);
+  this->engine_->AddOpAttr(op_name, "out_dim", out_dim);
   const int w_m = weight_shape[0];
   const int w_k = weight_shape[1];
 
@@ -79,12 +78,13 @@ void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
     }
   }
   auto *weight1 =
-      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
+      GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
+          anakin_shape);
   float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
   std::copy_n(trans_weight_data.data(), weight_tensor.numel(), cpu_data);
   weight1->d_tensor().set_shape(anakin_shape);
   weight1->d_tensor().copy_from(weight1->h_tensor());
-  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+  this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
 
   // get bias
   if (with_bias) {
@@ -104,13 +104,14 @@ void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
     // bias_shape.push_back(1);
     Shape anakin_bias_shape(bias_shape);
-    auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
+    auto *weight2 =
+        GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
            anakin_bias_shape);
     float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
     std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
     weight2->d_tensor().set_shape(anakin_bias_shape);
     weight2->d_tensor().copy_from(weight2->h_tensor());
-    engine_->AddOpAttr(op_name, "weight_2", *weight2);
+    this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
   }
 }
 
@@ -118,5 +119,10 @@ void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(mul, MulOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(fc, FcOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(mul, MulOpConverter<::anakin::saber::NV>);
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(fc, FcOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(mul, MulOpConverter<::anakin::saber::X86>);
+REGISTER_CPU_ANAKIN_OP_CONVERTER(fc, FcOpConverter<::anakin::saber::X86>);
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class FcBaseOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class FcBaseOpConverter : public AnakinOpConverter<TargetT> {
  public:
   FcBaseOpConverter() = default;
 
@@ -32,13 +33,15 @@ class FcBaseOpConverter : public AnakinOpConverter {
 };
 
 // with bias
-class FcOpConverter : public FcBaseOpConverter {
+template <typename TargetT>
+class FcOpConverter : public FcBaseOpConverter<TargetT> {
  public:
   FcOpConverter() = default;
 };
 
 // without bias
-class MulOpConverter : public FcBaseOpConverter {
+template <typename TargetT>
+class MulOpConverter : public FcBaseOpConverter<TargetT> {
  public:
   MulOpConverter() = default;
 };
...
@@ -15,20 +15,16 @@
 #include "paddle/fluid/inference/anakin/convert/flatten.h"
 #include <vector>
 
-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::saber::NV;
-using anakin::saber::Shape;
 using anakin::PTuple;
 
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
-                                    const framework::BlockDesc &block_desc,
-                                    const framework::Scope &scope,
-                                    bool test_mode) {
+template <typename TargetT>
+void FlattenOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
   PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL);
@@ -41,12 +37,17 @@ void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
   std::vector<int> out_dims = {0, -1, 1, 1};
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
 
-  engine_->AddOp(op_name, "Reshape", {input}, {output});
-  engine_->AddOpAttr<PTuple<int>>(op_name, "dims", out_dims);
+  this->engine_->AddOp(op_name, "Reshape", {input}, {output});
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dims", out_dims);
 }
 
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(flatten, FlattenOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(flatten,
+                                  FlattenOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(flatten,
+                                 FlattenOpConverter<::anakin::saber::X86>);
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-class FlattenOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class FlattenOpConverter : public AnakinOpConverter<TargetT> {
  public:
   FlattenOpConverter() = default;
...
@@ -17,23 +17,16 @@
 #include <string>
 #include <vector>

-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::Precision;
-using anakin::saber::NV;
-using anakin::saber::X86;
-using anakin::saber::Shape;
-using anakin::PBlock;
 using anakin::PTuple;

 namespace paddle {
 namespace inference {
 namespace anakin {

-void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op,
-                                      const framework::BlockDesc &block_desc,
-                                      const framework::Scope &scope,
-                                      bool test_mode) {
+template <typename TargetT>
+void Im2SequenceConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 0);
@@ -43,21 +36,24 @@ void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op,
   auto out_name = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();

-  engine_->AddOp(op_name, "Im2Sequence", {x_name}, {out_name});
+  this->engine_->AddOp(op_name, "Im2Sequence", {x_name}, {out_name});

   std::vector<int> dilations = {1, 1};
   auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
   auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
   auto kernels = boost::get<std::vector<int>>(op_desc.GetAttr("kernels"));

-  engine_->AddOpAttr<PTuple<int>>(op_name, "paddings", paddings);
-  engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
-  engine_->AddOpAttr<PTuple<int>>(op_name, "window_size", kernels);
-  engine_->AddOpAttr<PTuple<int>>(op_name, "dilations", dilations);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "paddings", paddings);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "strides", strides);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "window_size",
+                                                 kernels);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dilations",
+                                                 dilations);
 }

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

-REGISTER_ANAKIN_OP_CONVERTER(im2sequence, Im2SequenceConverter);
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(im2sequence,
+                                  Im2SequenceConverter<::anakin::saber::NV>);
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-class Im2SequenceConverter : public AnakinOpConverter {
+template <typename TargetT>
+class Im2SequenceConverter : public AnakinOpConverter<TargetT> {
  public:
   Im2SequenceConverter() = default;
...
@@ -32,10 +32,10 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-using AnakinNvEngine =
-    AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>;
-
+template <typename TargetT>
 class AnakinOpConverter {
+  using AnakinEngineT = AnakinEngine<TargetT, ::anakin::Precision::FP32>;
+
  public:
   AnakinOpConverter() = default;
@@ -45,7 +45,7 @@ class AnakinOpConverter {
   void ConvertOp(const framework::proto::OpDesc &op,
                  const framework::BlockDesc &block_desc,
                  const std::unordered_set<std::string> &parameters,
-                 const framework::Scope &scope, AnakinNvEngine *engine,
+                 const framework::Scope &scope, AnakinEngineT *engine,
                  bool test_mode = false) {
     framework::OpDesc op_desc(op, nullptr);
     std::string op_type = op_desc.Type();
@@ -65,7 +65,7 @@ class AnakinOpConverter {
   void ConvertBlock(framework::BlockDesc *block_desc,
                     const std::unordered_set<std::string> &parameters,
-                    const framework::Scope &scope, AnakinNvEngine *engine) {
+                    const framework::Scope &scope, AnakinEngineT *engine) {
     std::unique_lock<std::mutex> lock(mutex_);
     framework::proto::BlockDesc *block = block_desc->Proto();
     for (auto i = 0; i < block->ops_size(); i++) {
@@ -79,7 +79,7 @@ class AnakinOpConverter {
       framework::BlockDesc *block_desc, framework::Scope *scope,
       const std::vector<std::string> &inputs,
       const std::unordered_set<std::string> &parameters,
-      const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
+      const std::vector<std::string> &outputs, AnakinEngineT *engine) {
     ConvertBlock(block_desc, parameters, *scope, engine);
     // if the max_batch size
     int max_batch_size = engine->GetMaxBatchSize();
@@ -128,40 +128,60 @@ class AnakinOpConverter {
     engine->InitNet();
   }

-  void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
+  void SetEngine(AnakinEngineT *engine) { engine_ = engine; }
   virtual ~AnakinOpConverter() {}

  protected:
   bool test_mode_;
-  AnakinNvEngine *engine_{nullptr};
+  AnakinEngineT *engine_{nullptr};

  private:
-  std::unordered_map<std::string, AnakinOpConverter *> converters_;
+  std::unordered_map<std::string, AnakinOpConverter<TargetT> *> converters_;
   framework::Scope *scope_{nullptr};
   std::mutex mutex_;
 };

+template class AnakinOpConverter<::anakin::saber::NV>;
+template class AnakinOpConverter<::anakin::saber::X86>;
+
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

-#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__)                  \
-  struct anakin_##op_type__##_converter                                       \
-      : public ::paddle::framework::Registrar {                               \
-    anakin_##op_type__##_converter() {                                        \
-      LOG(INFO) << "register convert " << #op_type__;                         \
-      ::paddle::inference::Registry<                                          \
-          ::paddle::inference::anakin::AnakinOpConverter>::Global()           \
-          .Register<::paddle::inference::anakin::Converter__>(#op_type__);    \
-    }                                                                          \
-  };                                                                           \
-  anakin_##op_type__##_converter anakin_##op_type__##_converter__;             \
-  int TouchConverterRegister_anakin_##op_type__() {                            \
-    anakin_##op_type__##_converter__.Touch();                                  \
-    return 0;                                                                  \
-  }
+#define REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__,             \
+                                          place_type__, place_class__)        \
+  struct anakin_##op_type__##_##place_type__##_converter                      \
+      : public ::paddle::framework::Registrar {                               \
+    anakin_##op_type__##_##place_type__##_converter() {                       \
+      LOG(INFO) << "register convert " << #op_type__ << " ";                  \
+      ::paddle::inference::Registry<                                          \
+          ::paddle::inference::anakin::AnakinOpConverter<place_class__>>::    \
+          Global()                                                            \
+          .Register<::paddle::inference::anakin::Converter__>(#op_type__);    \
+    }                                                                          \
+  };                                                                           \
+  anakin_##op_type__##_##place_type__##_converter                              \
+      anakin_##op_type__##_##place_type__##_converter__;                       \
+  int TouchConverterRegister_anakin_##op_type__##_##place_type__() {           \
+    anakin_##op_type__##_##place_type__##_converter__.Touch();                 \
+    return 0;                                                                  \
+  }
+
+#define REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
+  REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CUDA, \
+                                    ::anakin::saber::NV)
+
+#define REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
+  REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CPU, \
+                                    ::anakin::saber::X86)

-#define USE_ANAKIN_CONVERTER(op_type__)                             \
-  extern int TouchConverterRegister_anakin_##op_type__();           \
-  int use_op_converter_anakin_##op_type__ __attribute__((unused)) = \
-      TouchConverterRegister_anakin_##op_type__();
+#define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__)                 \
+  extern int TouchConverterRegister_anakin_##op_type__##_##place_type__(); \
+  int use_op_converter_anakin_##op_type__##_##place_type__                 \
+      __attribute__((unused)) =                                            \
+          TouchConverterRegister_anakin_##op_type__##_##place_type__();
+
+#define USE_ANAKIN_CONVERTER(op_type__) \
+  USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA)
+
+#define USE_CPU_ANAKIN_CONVERTER(op_type__) \
+  USE_ANAKIN_CONVERTER_BASE(op_type__, CPU)
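The Registrar/Touch pair exists so that a translation unit saying `USE_*` pulls in the object file holding the registration global, which a static-library link could otherwise silently drop. Roughly what one registration expands to, taking the `fc`/CPU case as an example (a sketch of the expansion, not literal preprocessor output):

// REGISTER_CPU_ANAKIN_OP_CONVERTER(fc, FcOpConverter<::anakin::saber::X86>)
// expands, approximately, to:
struct anakin_fc_CPU_converter : public ::paddle::framework::Registrar {
  anakin_fc_CPU_converter() {
    // Registers the converter under the key "fc" in the X86 registry.
    ::paddle::inference::Registry<
        ::paddle::inference::anakin::AnakinOpConverter<
            ::anakin::saber::X86>>::Global()
        .Register<::paddle::inference::anakin::FcOpConverter<
            ::anakin::saber::X86>>("fc");
  }
};
anakin_fc_CPU_converter anakin_fc_CPU_converter__;
int TouchConverterRegister_anakin_fc_CPU() {
  anakin_fc_CPU_converter__.Touch();  // referenced by USE_CPU_ANAKIN_CONVERTER
  return 0;
}
// A consumer then writes USE_CPU_ANAKIN_CONVERTER(fc), which declares and
// calls the Touch function, forcing the registrar's object file to link.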
@@ -17,23 +17,16 @@
 #include <string>
 #include <vector>

-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::Precision;
-using anakin::saber::NV;
-using anakin::saber::X86;
-using anakin::saber::Shape;
-using anakin::PBlock;
 using anakin::PTuple;

 namespace paddle {
 namespace inference {
 namespace anakin {

-void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
-                                   const framework::BlockDesc &block_desc,
-                                   const framework::Scope &scope,
-                                   bool test_mode) {
+template <typename TargetT>
+void Pool2dOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
@@ -65,17 +58,22 @@ void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
     PADDLE_THROW("TensorRT unsupported pooling type!");
   }

-  engine_->AddOp(op_name, "Pooling", {x_name}, {y_name});
-  engine_->AddOpAttr<PTuple<int>>(op_name, "pool_size", ksize);
-  engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
-  engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
-  engine_->AddOpAttr(op_name, "method", anakin_pool_type);
-  engine_->AddOpAttr(op_name, "global_pooling", global_pooling);
-  engine_->AddOpAttr(op_name, "cmp_out_shape_floor_as_conv", !ceil_mode);
+  this->engine_->AddOp(op_name, "Pooling", {x_name}, {y_name});
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "pool_size", ksize);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "strides", strides);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
+  this->engine_->AddOpAttr(op_name, "method", anakin_pool_type);
+  this->engine_->AddOpAttr(op_name, "global_pooling", global_pooling);
+  this->engine_->AddOpAttr(op_name, "cmp_out_shape_floor_as_conv", !ceil_mode);
 }

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

-REGISTER_ANAKIN_OP_CONVERTER(pool2d, Pool2dOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(pool2d,
+                                  Pool2dOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(pool2d,
+                                 Pool2dOpConverter<::anakin::saber::X86>);
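The `cmp_out_shape_floor_as_conv` attribute, passed as `!ceil_mode`, controls how the pooled output extent is rounded. Assuming the usual pooling arithmetic, the two modes differ only when the stride does not divide evenly:

// Output height for input H, kernel k, padding p, stride s:
//   ceil_mode == false:  H_out = floor((H + 2*p - k) / s) + 1
//   ceil_mode == true:   H_out = ceil((H + 2*p - k) / s) + 1
// e.g. H = 17, k = 3, p = 0, s = 2:
//   floor(14 / 2) + 1 = 8 and ceil(14 / 2) + 1 = 8   (exact division, equal)
// but H = 16, k = 3, p = 0, s = 2:
//   floor(13 / 2) + 1 = 7 while ceil(13 / 2) + 1 = 8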
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-class Pool2dOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class Pool2dOpConverter : public AnakinOpConverter<TargetT> {
  public:
   Pool2dOpConverter() = default;
...
@@ -16,19 +16,14 @@
 #include <algorithm>
 #include <map>

-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::saber::NV;
-using anakin::saber::Shape;
-
 namespace paddle {
 namespace inference {
 namespace anakin {

-void ReluOpConverter::operator()(const framework::proto::OpDesc &op,
-                                 const framework::BlockDesc &block_desc,
-                                 const framework::Scope &scope,
-                                 bool test_mode) {
+template <typename TargetT>
+void ReluOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
@@ -37,14 +32,14 @@ void ReluOpConverter::operator()(const framework::proto::OpDesc &op,
   auto input_name = op_desc.Input("X").front();
   auto output_name = op_desc.Output("Out").front();

-  engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
-  engine_->AddOpAttr(op_name, "alpha", 0);
+  this->engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
+  this->engine_->AddOpAttr(op_name, "alpha", 0);
 }

-void LeakyReluOpConverter::operator()(const framework::proto::OpDesc &op,
-                                      const framework::BlockDesc &block_desc,
-                                      const framework::Scope &scope,
-                                      bool test_mode) {
+template <typename TargetT>
+void LeakyReluOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
@@ -54,13 +49,19 @@ void LeakyReluOpConverter::operator()(const framework::proto::OpDesc &op,
   auto output_name = op_desc.Output("Out").front();

   float alpha = boost::get<float>(op_desc.GetAttr("alpha"));
-  engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
-  engine_->AddOpAttr(op_name, "alpha", alpha);
+  this->engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
+  this->engine_->AddOpAttr(op_name, "alpha", alpha);
 }

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

-REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter);
-REGISTER_ANAKIN_OP_CONVERTER(leaky_relu, LeakyReluOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(relu, ReluOpConverter<::anakin::saber::NV>);
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(leaky_relu,
+                                  LeakyReluOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(relu, ReluOpConverter<::anakin::saber::X86>);
+REGISTER_CPU_ANAKIN_OP_CONVERTER(leaky_relu,
+                                 LeakyReluOpConverter<::anakin::saber::X86>);
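Both converters target the same Anakin "ReLU" op and differ only in `alpha`; presumably the op computes the leaky form, so `alpha == 0` degenerates to plain ReLU:

// f(x) = x        for x >= 0
// f(x) = alpha*x  for x <  0
// ReluOpConverter pins alpha to 0; LeakyReluOpConverter forwards the
// fluid op's "alpha" attribute unchanged.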
@@ -22,7 +22,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-class ReluOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class ReluOpConverter : public AnakinOpConverter<TargetT> {
  public:
   ReluOpConverter() = default;
@@ -33,7 +34,8 @@ class ReluOpConverter : public AnakinOpConverter {
   virtual ~ReluOpConverter() {}
 };

-class LeakyReluOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class LeakyReluOpConverter : public AnakinOpConverter<TargetT> {
  public:
   LeakyReluOpConverter() = default;
...
@@ -15,20 +15,16 @@
 #include "paddle/fluid/inference/anakin/convert/reshape.h"
 #include <vector>

-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::saber::NV;
-using anakin::saber::Shape;
 using anakin::PTuple;

 namespace paddle {
 namespace inference {
 namespace anakin {

-void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op,
-                                    const framework::BlockDesc &block_desc,
-                                    const framework::Scope &scope,
-                                    bool test_mode) {
+template <typename TargetT>
+void ReshapeOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
   PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL);
@@ -37,17 +33,23 @@ void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op,
   auto output = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();

-  engine_->AddOp(op_name, "Reshape", {input}, {output});
+  this->engine_->AddOp(op_name, "Reshape", {input}, {output});

   auto shape = boost::get<std::vector<int>>(op_desc.GetAttr("shape"));
   if (shape.size() < 4) {
     shape.insert(shape.end(), 4 - shape.size(), 1);
   }
-  engine_->AddOpAttr<PTuple<int>>(op_name, "dims", shape);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dims", shape);
 }

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

-REGISTER_ANAKIN_OP_CONVERTER(reshape, ReshapeOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(reshape,
+                                  ReshapeOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(reshape,
+                                 ReshapeOpConverter<::anakin::saber::X86>);
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-class ReshapeOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class ReshapeOpConverter : public AnakinOpConverter<TargetT> {
  public:
   ReshapeOpConverter() = default;
...
@@ -25,10 +25,10 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-void RoiAlignOpConverter::operator()(const framework::proto::OpDesc &op,
-                                     const framework::BlockDesc &block_desc,
-                                     const framework::Scope &scope,
-                                     bool test_mode) {
+template <typename TargetT>
+void RoiAlignOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Input("ROIs").size(), 1);
@@ -44,16 +44,21 @@ void RoiAlignOpConverter::operator()(const framework::proto::OpDesc &op,
   auto pooled_width = boost::get<int>(op_desc.GetAttr("pooled_width"));
   auto sampling_ratio = boost::get<int>(op_desc.GetAttr("sampling_ratio"));

-  engine_->AddOp(op_name, "RoiAlign", {input_x_name, input_rois_name},
-                 {output_name});
-  engine_->AddOpAttr(op_name, "spatial_scale", spatial_scale);
-  engine_->AddOpAttr(op_name, "pooled_height", pooled_height);
-  engine_->AddOpAttr(op_name, "pooled_width", pooled_width);
-  engine_->AddOpAttr(op_name, "sampling_ratio", sampling_ratio);
+  this->engine_->AddOp(op_name, "RoiAlign", {input_x_name, input_rois_name},
+                       {output_name});
+  this->engine_->AddOpAttr(op_name, "spatial_scale", spatial_scale);
+  this->engine_->AddOpAttr(op_name, "pooled_height", pooled_height);
+  this->engine_->AddOpAttr(op_name, "pooled_width", pooled_width);
+  this->engine_->AddOpAttr(op_name, "sampling_ratio", sampling_ratio);
 }

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

-REGISTER_ANAKIN_OP_CONVERTER(roi_align, RoiAlignOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(roi_align,
+                                  RoiAlignOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(roi_align,
+                                 RoiAlignOpConverter<::anakin::saber::X86>);
@@ -22,7 +22,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-class RoiAlignOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class RoiAlignOpConverter : public AnakinOpConverter<TargetT> {
  public:
   RoiAlignOpConverter() = default;
...
@@ -16,19 +16,14 @@
 #include <algorithm>
 #include <map>

-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::saber::NV;
-using anakin::saber::Shape;
-
 namespace paddle {
 namespace inference {
 namespace anakin {

-void ScaleOpConverter::operator()(const framework::proto::OpDesc &op,
-                                  const framework::BlockDesc &block_desc,
-                                  const framework::Scope &scope,
-                                  bool test_mode) {
+template <typename TargetT>
+void ScaleOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
@@ -44,14 +39,14 @@ void ScaleOpConverter::operator()(const framework::proto::OpDesc &op,
   PADDLE_ENFORCE(bias_after_scale,
                  "The anakin scale layer only support bias after scale now.");

-  engine_->AddOp(op_name, "Power", {input_name}, {output_name});
-  engine_->AddOpAttr(op_name, "shift", bias);
-  engine_->AddOpAttr(op_name, "scale", scale);
-  engine_->AddOpAttr(op_name, "power", static_cast<float>(1.0));
+  this->engine_->AddOp(op_name, "Power", {input_name}, {output_name});
+  this->engine_->AddOpAttr(op_name, "shift", bias);
+  this->engine_->AddOpAttr(op_name, "scale", scale);
+  this->engine_->AddOpAttr(op_name, "power", static_cast<float>(1.0));
 }

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

-REGISTER_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter);
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter<::anakin::saber::NV>);
@@ -22,7 +22,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-class ScaleOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class ScaleOpConverter : public AnakinOpConverter<TargetT> {
  public:
   ScaleOpConverter() = default;
...
@@ -14,19 +14,14 @@
 #include "paddle/fluid/inference/anakin/convert/softmax.h"

-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::saber::NV;
-using anakin::saber::Shape;
-
 namespace paddle {
 namespace inference {
 namespace anakin {

-void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
-                                    const framework::BlockDesc &block_desc,
-                                    const framework::Scope &scope,
-                                    bool test_mode) {
+template <typename TargetT>
+void SoftMaxOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
@@ -41,12 +36,18 @@ void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto input_shape_in_fluid = input_var_desc->GetShape();
   size_t input_dims = input_shape_in_fluid.size();

-  engine_->AddOp(op_name, "Softmax", {input}, {output});
-  engine_->AddOpAttr(op_name, "axis", static_cast<int>(input_dims - 1));
+  this->engine_->AddOp(op_name, "Softmax", {input}, {output});
+  this->engine_->AddOpAttr(op_name, "axis", static_cast<int>(input_dims - 1));
 }

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

-REGISTER_ANAKIN_OP_CONVERTER(softmax, SoftMaxOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(softmax,
+                                  SoftMaxOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(softmax,
+                                 SoftMaxOpConverter<::anakin::saber::X86>);
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-class SoftMaxOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class SoftMaxOpConverter : public AnakinOpConverter<TargetT> {
  public:
   SoftMaxOpConverter() = default;
...
@@ -16,23 +16,16 @@
 #include <algorithm>
 #include <vector>

-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::Precision;
-using anakin::saber::NV;
-using anakin::saber::X86;
-using anakin::saber::Shape;
-using anakin::PBlock;
 using anakin::PTuple;

 namespace paddle {
 namespace inference {
 namespace anakin {

-void SplitOpConverter::operator()(const framework::proto::OpDesc &op,
-                                  const framework::BlockDesc &block_desc,
-                                  const framework::Scope &scope,
-                                  bool test_mode) {
+template <typename TargetT>
+void SplitOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   auto input_name = op_desc.Input("X").front();
   auto y_names = op_desc.Output("Out");
@@ -51,14 +44,19 @@ void SplitOpConverter::operator()(const framework::proto::OpDesc &op,
     num_sum += output_lengths[i];
     slice_point.push_back(num_sum);
   }
-  engine_->AddOp(op_name, "Slice", {input_name}, y_names);
-  engine_->AddOpAttr(op_name, "axis", axis);
-  engine_->AddOpAttr<PTuple<int>>(op_name, "slice_point", slice_point);
+  this->engine_->AddOp(op_name, "Slice", {input_name}, y_names);
+  this->engine_->AddOpAttr(op_name, "axis", axis);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "slice_point",
+                                                 slice_point);
   // slice_dim is useless in anakin
-  engine_->AddOpAttr(op_name, "slice_dim", 4);
+  this->engine_->AddOpAttr(op_name, "slice_dim", 4);
 }

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

-REGISTER_ANAKIN_OP_CONVERTER(split, SplitOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(split, SplitOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(split, SplitOpConverter<::anakin::saber::X86>);
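Anakin's Slice takes cumulative cut offsets (`slice_point`) rather than per-section lengths, which is what the `num_sum` accumulation above computes. A worked example, assuming the loop runs over all but the last section (the final segment is implied by the axis extent):

// output_lengths = {2, 3, 5}, axis extent = 10
// i = 0: num_sum = 2  -> slice_point = {2}
// i = 1: num_sum = 5  -> slice_point = {2, 5}
// Slice then yields the ranges [0, 2), [2, 5), [5, 10) along the axis.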
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-class SplitOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class SplitOpConverter : public AnakinOpConverter<TargetT> {
  public:
   SplitOpConverter() = default;
...
@@ -17,22 +17,17 @@
 #include <string>
 #include <vector>

-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::Precision;
-using anakin::saber::NV;
-using anakin::saber::X86;
-using anakin::saber::Shape;
-using anakin::PBlock;
 using anakin::PTuple;

 namespace paddle {
 namespace inference {
 namespace anakin {

-void SumOpConverter::operator()(const framework::proto::OpDesc &op,
-                                const framework::BlockDesc &block_desc,
-                                const framework::Scope &scope, bool test_mode) {
+template <typename TargetT>
+void SumOpConverter<TargetT>::operator()(const framework::proto::OpDesc &op,
+                                         const framework::BlockDesc &block_desc,
+                                         const framework::Scope &scope,
+                                         bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2);
   PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
@@ -43,13 +38,17 @@ void SumOpConverter::operator()(const framework::proto::OpDesc &op,
   std::vector<float> coeff = {1, 1};
   std::string elementwise_type = "Add";

-  engine_->AddOp(op_name, "Eltwise", input_names, {out_name});
-  engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
-  engine_->AddOpAttr<std::string>(op_name, "type", elementwise_type);
+  this->engine_->AddOp(op_name, "Eltwise", input_names, {out_name});
+  this->engine_->template AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
+  this->engine_->template AddOpAttr<std::string>(op_name, "type",
+                                                 elementwise_type);
 }

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

-REGISTER_ANAKIN_OP_CONVERTER(sum, SumOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(sum, SumOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(sum, SumOpConverter<::anakin::saber::X86>);
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-class SumOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class SumOpConverter : public AnakinOpConverter<TargetT> {
  public:
   SumOpConverter() = default;
...
@@ -21,12 +21,14 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-static void test_activation_op(const std::string &op_type) {
-  auto *converter = Registry<AnakinOpConverter>::Global().Lookup(op_type);
-  PADDLE_ENFORCE(converter != nullptr);
+template <typename TargetT>
+static void test_activation_op(const std::string& op_type,
+                               const platform::DeviceContext& context,
+                               bool use_gpu) {
   std::unordered_set<std::string> parameters;
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
   validator.DeclInputVar("act-X", {10, 6, 1, 1});
   validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
   framework::OpDesc desc;
@@ -41,13 +43,42 @@ static void test_activation_op(const std::string &op_type) {
   validator.Execute(5);
 }

-TEST(sigm_op, test) { test_activation_op("sigmoid"); }
-TEST(tanh_op, test) { test_activation_op("tanh"); }
+#ifdef PADDLE_WITH_CUDA
+TEST(sigm_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_activation_op<::anakin::saber::NV>("sigmoid", ctx, true);
+}
+
+TEST(tanh_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_activation_op<::anakin::saber::NV>("tanh", ctx, true);
+}
+#endif
+
+TEST(sigm_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_activation_op<::anakin::saber::X86>("sigmoid", ctx, false);
+}
+
+TEST(tanh_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_activation_op<::anakin::saber::X86>("tanh", ctx, false);
+}

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

 USE_OP(sigmoid);
 USE_OP(tanh);
+USE_CPU_ANAKIN_CONVERTER(sigmoid);
+USE_CPU_ANAKIN_CONVERTER(tanh);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(sigmoid);
 USE_ANAKIN_CONVERTER(tanh);
+#endif
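Every converter test in this change follows the same validator flow; condensed here for reference with the CPU target (the `AnakinConvertValidation` signature is taken from the tests above, not verified independently, and the snippet assumes the same headers the tests include):

// 1. Pick a target and a matching device context.
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);

// 2. Bind a validator to that target.
std::unordered_set<std::string> params;
framework::Scope scope;
AnakinConvertValidation<::anakin::saber::X86> validator(params, &scope, ctx,
                                                        /*use_gpu=*/false);

// 3. Declare tensors, describe the fluid op, then run both the fluid op and
//    the converted Anakin net and compare outputs over several batches.
validator.DeclInputVar("act-X", {10, 6, 1, 1});
validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
framework::OpDesc desc;
desc.SetType("sigmoid");
desc.SetInput("X", {"act-X"});
desc.SetOutput("Out", {"act-Out"});
validator.SetOp(*desc.Proto());
validator.Execute(5);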
@@ -21,16 +21,19 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-TEST(affine_channel, native) {
+template <typename TargetT>
+void test_affine_channel_op(const platform::DeviceContext& context,
+                            bool use_gpu) {
   // Declare the difference between the inputs.
   std::unordered_set<std::string> parameters({"scale", "bias"});
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);

   validator.DeclInputVar("x", {1, 3, 5, 2});
   validator.DeclOutputVar("out", {1, 3, 5, 2});
-  validator.DeclParamVar("scale", {1, 3, 1, 1});
-  validator.DeclParamVar("bias", {1, 3, 1, 1});
+  validator.DeclParamVar("scale", {3});
+  validator.DeclParamVar("bias", {3});

   // Prepare Op descriptions.
   framework::OpDesc desc;
@@ -47,9 +50,26 @@ TEST(affine_channel, native) {
   validator.Execute(1);
 }

+#ifdef PADDLE_WITH_CUDA
+TEST(affine_channel_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_affine_channel_op<::anakin::saber::NV>(ctx, true);
+}
+#endif
+
+TEST(affine_channel_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_affine_channel_op<::anakin::saber::X86>(ctx, false);
+}
+
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

 USE_OP(affine_channel);
+USE_CPU_ANAKIN_CONVERTER(affine_channel);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(affine_channel);
+#endif
@@ -19,12 +19,14 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-TEST(batch_norm_op, test) {
+template <typename TargetT>
+void test_batchnorm_op(const platform::DeviceContext& context, bool use_gpu) {
   std::unordered_set<std::string> parameters(
       {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean",
        "batch_norm_variance"});
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
   std::vector<int> param_shape{2};

   validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5});
@@ -64,8 +66,26 @@ TEST(batch_norm_op, test) {
   validator.Execute(1, neglected_output);
 }

+#ifdef PADDLE_WITH_CUDA
+TEST(batch_norm_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_batchnorm_op<::anakin::saber::NV>(ctx, true);
+}
+#endif
+
+TEST(batch_norm_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_batchnorm_op<::anakin::saber::X86>(ctx, false);
+}
+
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

 USE_OP(batch_norm);
+USE_CPU_ANAKIN_CONVERTER(batch_norm);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(batch_norm);
+#endif
@@ -21,10 +21,12 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-TEST(concat_op, test) {
+template <typename TargetT>
+void test_concat_op(const platform::DeviceContext& context, bool use_gpu) {
   std::unordered_set<std::string> parameters({""});
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
   validator.DeclInputVar("concat_x1", {1, 2, 1, 1});
   validator.DeclInputVar("concat_x2", {1, 3, 1, 1});
   validator.DeclInputVar("concat_x3", {1, 1, 1, 1});
@@ -44,31 +46,26 @@ TEST(concat_op, test) {
   validator.Execute(1);
 }

-TEST(concat_op, test2) {
-  std::unordered_set<std::string> parameters({""});
-  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
-  validator.DeclInputVar("concat_x1", {1, 4});
-  validator.DeclInputVar("concat_x2", {3, 4});
-  validator.DeclInputVar("concat_x3", {2, 4});
-  validator.DeclOutputVar("concat_out", {6, 4});
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("concat");
-  desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
-  desc.SetOutput("Out", {"concat_out"});
-  int axis = 0;
-  desc.SetAttr("axis", axis);
-
-  validator.SetOp(*desc.Proto());
-
-  validator.Execute(1);
+#ifdef PADDLE_WITH_CUDA
+TEST(concat_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_concat_op<::anakin::saber::NV>(ctx, true);
+}
+#endif
+
+TEST(concat_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_concat_op<::anakin::saber::X86>(ctx, false);
 }

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

 USE_OP(concat);
+USE_CPU_ANAKIN_CONVERTER(concat);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(concat);
+#endif
@@ -21,13 +21,12 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-TEST(conv2d_op, test) {
-  auto* conv2d_converter =
-      Registry<AnakinOpConverter>::Global().Lookup("conv2d");
-  ASSERT_TRUE(conv2d_converter != nullptr);
+template <typename TargetT>
+void test_conv2d_op(const platform::DeviceContext& context, bool use_gpu) {
   std::unordered_set<std::string> parameters({"conv2d-Y"});
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
   validator.DeclInputVar("conv2d-X", {1, 3, 3, 3});
   validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1});
   validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3});
@@ -54,9 +53,27 @@ TEST(conv2d_op, test) {
   validator.Execute(3);
 }

+#ifdef PADDLE_WITH_CUDA
+TEST(conv2d_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_conv2d_op<::anakin::saber::NV>(ctx, true);
+}
+#endif
+
+TEST(conv2d_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_conv2d_op<::anakin::saber::X86>(ctx, false);
+}
+
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

 USE_OP(conv2d);
+USE_CPU_ANAKIN_CONVERTER(conv2d);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(conv2d);
+#endif
@@ -21,10 +21,12 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-TEST(dropout_op, native) {
+template <typename TargetT>
+void test_dropout_op(const platform::DeviceContext& context, bool use_gpu) {
   std::unordered_set<std::string> parameters;
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
   validator.DeclInputVar("x", {1, 1, 2, 2});
   validator.DeclOutputVar("out", {1, 1, 2, 2});
   validator.DeclOutputVar("mask", {1, 1, 2, 2});
@@ -45,9 +47,26 @@ TEST(dropout_op, native) {
   validator.Execute(1, neglected_output);
 }

+#ifdef PADDLE_WITH_CUDA
+TEST(dropout_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_dropout_op<::anakin::saber::NV>(ctx, true);
+}
+#endif
+
+TEST(dropout_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_dropout_op<::anakin::saber::X86>(ctx, false);
+}
+
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

 USE_OP(dropout);
+USE_CPU_ANAKIN_CONVERTER(dropout);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(dropout);
+#endif
@@ -21,10 +21,14 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-static void test_elementwise_op(const std::string &op_type) {
+template <typename TargetT>
+static void test_elementwise_op(const std::string& op_type,
+                                const platform::DeviceContext& context,
+                                bool use_gpu) {
   std::unordered_set<std::string> parameters;
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
   validator.DeclInputVar("x", {1, 1, 2, 2});
   validator.DeclInputVar("y", {1, 1, 2, 2});
   validator.DeclOutputVar("out", {1, 1, 2, 2});
@@ -43,14 +47,41 @@ static void test_elementwise_op(const std::string &op_type) {
   validator.Execute(1);
 }

-TEST(elementwise_op, native_add) { test_elementwise_op("elementwise_add"); }
-TEST(elementwise_op, native_mul) { test_elementwise_op("elementwise_mul"); }
+#ifdef PADDLE_WITH_CUDA
+TEST(elementwise_op, native_add_gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_elementwise_op<::anakin::saber::NV>("elementwise_add", ctx, true);
+}
+
+TEST(elementwise_op, native_mul_gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_elementwise_op<::anakin::saber::NV>("elementwise_mul", ctx, true);
+}
+#endif
+
+TEST(elementwise_op, native_add_cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_elementwise_op<::anakin::saber::X86>("elementwise_add", ctx, false);
+}
+
+TEST(elementwise_op, native_mul_cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_elementwise_op<::anakin::saber::X86>("elementwise_mul", ctx, false);
+}

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

 USE_OP(elementwise_add);
-USE_ANAKIN_CONVERTER(elementwise_add);
 USE_OP(elementwise_mul);
+#ifdef PADDLE_WITH_CUDA
+USE_ANAKIN_CONVERTER(elementwise_add);
 USE_ANAKIN_CONVERTER(elementwise_mul);
+#endif
+USE_CPU_ANAKIN_CONVERTER(elementwise_add);
+USE_CPU_ANAKIN_CONVERTER(elementwise_mul);
@@ -20,13 +20,13 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-TEST(fc_op, test) {
-  auto* fc_converter = Registry<AnakinOpConverter>::Global().Lookup("fc");
-  ASSERT_TRUE(fc_converter);
+template <typename TargetT>
+void test_mul_op(const platform::DeviceContext& context, bool use_gpu) {
   std::unordered_set<std::string> parameters({"mul_y"});
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
   validator.DeclInputVar("mul_x", {1, 1, 2, 2});
   validator.DeclParamVar("mul_y", {4, 2});
   validator.DeclOutputVar("mul_out", {1, 2});
@@ -42,9 +42,26 @@ TEST(fc_op, test) {
   validator.Execute(10);
 }

+#ifdef PADDLE_WITH_CUDA
+TEST(mul_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_mul_op<::anakin::saber::NV>(ctx, true);
+}
+#endif
+
+TEST(mul_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_mul_op<::anakin::saber::X86>(ctx, false);
+}
+
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

 USE_OP(mul);
+USE_CPU_ANAKIN_CONVERTER(fc);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(fc);
+#endif
@@ -20,13 +20,12 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-TEST(flatten_op, test) {
-  auto *converter = Registry<AnakinOpConverter>::Global().Lookup("flatten");
-  ASSERT_TRUE(converter);
+template <typename TargetT>
+void test_flatten_op(const platform::DeviceContext& context, bool use_gpu) {
   std::unordered_set<std::string> parameters;
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
   validator.DeclInputVar("flatten-X", {3, 10, 10, 4});
   validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1});
   framework::OpDesc desc;
@@ -42,10 +41,27 @@ TEST(flatten_op, test) {
   validator.Execute(5);
 }

+#ifdef PADDLE_WITH_CUDA
+TEST(flatten_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_flatten_op<::anakin::saber::NV>(ctx, true);
+}
+#endif
+
+TEST(flatten_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_flatten_op<::anakin::saber::X86>(ctx, false);
+}
+
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

 USE_OP(reshape);
 USE_OP_ITSELF(flatten);
+USE_CPU_ANAKIN_CONVERTER(flatten);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(flatten);
+#endif
@@ -19,15 +19,14 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-void test_pool2d(bool global_pooling, bool ceil_mode,
+template <typename TargetT>
+void test_pool2d(const platform::DeviceContext& context, bool use_gpu,
+                 bool global_pooling, bool ceil_mode,
                  std::string pool_type = "max") {
-  auto* pool2d_converter =
-      Registry<AnakinOpConverter>::Global().Lookup("pool2d");
-  ASSERT_TRUE(pool2d_converter);
   framework::Scope scope;
   std::unordered_set<std::string> parameters;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);

   // The ITensor's Dims should not contain the batch size.
   // So, the ITensor's Dims of input and output should be C * H * W.
@@ -64,56 +63,61 @@ void test_pool2d(bool global_pooling, bool ceil_mode,
   validator.Execute(1);
 }

-void test_pool2d2(bool global_pooling, bool ceil_mode,
-                  std::string pool_type = "max") {
-  auto* pool2d_converter =
-      Registry<AnakinOpConverter>::Global().Lookup("pool2d");
-  ASSERT_TRUE(pool2d_converter);
-
-  framework::Scope scope;
-  std::unordered_set<std::string> parameters;
-  AnakinConvertValidation validator(parameters, &scope);
-
-  // The ITensor's Dims should not contain the batch size.
-  // So, the ITensor's Dims of input and output should be C * H * W.
-  validator.DeclInputVar("pool2d_x", {1, 1, 17, 17});
-  validator.DeclOutputVar("pool2d_out", {1, 1, 17, 17});
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("pool2d");
-  desc.SetInput("X", {"pool2d_x"});
-  desc.SetOutput("Out", {"pool2d_out"});
-
-  std::vector<int> ksize({3, 3});
-  std::vector<int> strides({1, 1});
-  std::vector<int> paddings({1, 1});
-  std::string pooling_t = pool_type;
-
-  desc.SetAttr("pooling_type", pooling_t);
-  desc.SetAttr("ksize", ksize);
-  desc.SetAttr("strides", strides);
-  desc.SetAttr("paddings", paddings);
-  desc.SetAttr("global_pooling", global_pooling);
-  desc.SetAttr("ceil_mode", true);
-  LOG(INFO) << "set OP";
-  validator.SetOp(*desc.Proto());
-  LOG(INFO) << "execute";
-
-  validator.Execute(1);
-}
-
-TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); }
-TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); }
-
-TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); }
-TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); }
-TEST(Pool2dOpConverter, avg_ceil_test2) { test_pool2d2(false, true, "avg"); }
+#ifdef PADDLE_WITH_CUDA
+TEST(Pool2dOpConverter, normal) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_pool2d<::anakin::saber::NV>(ctx, true, false, false);
+}
+TEST(Pool2dOpConverter, test_global_pooling) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_pool2d<::anakin::saber::NV>(ctx, true, true, false);
+}
+
+TEST(Pool2dOpConverter, max_ceil_test) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_pool2d<::anakin::saber::NV>(ctx, true, false, true);
+}
+
+TEST(Pool2dOpConverter, avg_ceil_test) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_pool2d<::anakin::saber::NV>(ctx, true, false, true, "avg");
+}
+#endif
+
+TEST(Pool2dOpConverter, normal_cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_pool2d<::anakin::saber::X86>(ctx, false, false, false);
+}
+TEST(Pool2dOpConverter, test_global_pooling_cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_pool2d<::anakin::saber::X86>(ctx, false, true, false);
+}
+
+TEST(Pool2dOpConverter, max_ceil_test_cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_pool2d<::anakin::saber::X86>(ctx, false, false, true);
+}
+
+TEST(Pool2dOpConverter, avg_ceil_test_cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_pool2d<::anakin::saber::X86>(ctx, false, false, true, "avg");
+}

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

 USE_OP(pool2d);
+USE_CPU_ANAKIN_CONVERTER(pool2d);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(pool2d);
+#endif
@@ -21,12 +21,14 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-static void test_relu_op(const std::string &op_type) {
-  auto *converter = Registry<AnakinOpConverter>::Global().Lookup(op_type);
-  PADDLE_ENFORCE(converter != nullptr);
+template <typename TargetT>
+static void test_activation_op(const std::string& op_type,
+                               const platform::DeviceContext& context,
+                               bool use_gpu) {
   std::unordered_set<std::string> parameters;
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
   validator.DeclInputVar("act-X", {10, 6, 1, 1});
   validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
   framework::OpDesc desc;
@@ -44,14 +46,44 @@ static void test_relu_op(const std::string &op_type) {
   validator.Execute(5);
 }

-TEST(activation, relu) { test_relu_op("relu"); }
-TEST(activation, leaky_relu) { test_relu_op("leaky_relu"); }
+#ifdef PADDLE_WITH_CUDA
+TEST(relu_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_activation_op<::anakin::saber::NV>("relu", ctx, true);
+}
+
+TEST(leaky_relu_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_activation_op<::anakin::saber::NV>("leaky_relu", ctx, true);
+}
+#endif
+
+/* seems bug here
+TEST(relu_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_activation_op<::anakin::saber::X86>("relu", ctx, false);
+}
+
+TEST(leaky_relu_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_activation_op<::anakin::saber::X86>("leaky_relu", ctx, false);
+}
+*/

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

 USE_OP(relu);
-USE_ANAKIN_CONVERTER(relu);
 USE_OP(leaky_relu);
+USE_CPU_ANAKIN_CONVERTER(relu);
+USE_CPU_ANAKIN_CONVERTER(leaky_relu);
+#ifdef PADDLE_WITH_CUDA
+USE_ANAKIN_CONVERTER(relu);
 USE_ANAKIN_CONVERTER(leaky_relu);
+#endif
@@ -20,12 +20,12 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-TEST(reshape, test) {
-  auto* converter = Registry<AnakinOpConverter>::Global().Lookup("reshape");
-  ASSERT_TRUE(converter);
+template <typename TargetT>
+void test_reshape1_op(const platform::DeviceContext& context, bool use_gpu) {
   framework::Scope scope;
   std::unordered_set<std::string> parameters;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
 
   // validator.DeclInputVar("reshape-X", {2, 3, 3, 1});
   // validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3});
@@ -45,10 +45,12 @@ TEST(reshape, test) {
   validator.Execute(1);
 }
 
-TEST(reshape, test2) {
+template <typename TargetT>
+void test_reshape2_op(const platform::DeviceContext& context, bool use_gpu) {
   framework::Scope scope;
   std::unordered_set<std::string> parameters;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
 
   validator.DeclInputVar("reshape-X", {1, 2, 4});
   validator.DeclOutputVar("reshape-Out", {1, 4, 2});
@@ -66,9 +68,39 @@ TEST(reshape, test2) {
   validator.Execute(1);
 }
 
+#ifdef PADDLE_WITH_CUDA
+TEST(reshape1_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_reshape1_op<::anakin::saber::NV>(ctx, true);
+}
+
+TEST(reshape2_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_reshape2_op<::anakin::saber::NV>(ctx, true);
+}
+#endif
+
+TEST(reshape1_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_reshape1_op<::anakin::saber::X86>(ctx, false);
+}
+
+TEST(reshape2_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_reshape2_op<::anakin::saber::X86>(ctx, false);
+}
+
 } // namespace anakin
 } // namespace inference
 } // namespace paddle
 
 USE_OP(reshape);
+USE_CPU_ANAKIN_CONVERTER(reshape);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(reshape);
+#endif
@@ -20,12 +20,12 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-TEST(softmax, test) {
-  auto* converter = Registry<AnakinOpConverter>::Global().Lookup("softmax");
-  ASSERT_TRUE(converter);
+template <typename TargetT>
+void test_softmax_op(const platform::DeviceContext& context, bool use_gpu) {
   framework::Scope scope;
   std::unordered_set<std::string> parameters;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
 
   validator.DeclInputVar("softmax-X", {1, 10, 2});
   validator.DeclOutputVar("softmax-Out", {1, 10, 2});
@@ -41,9 +41,27 @@ TEST(softmax, test) {
   validator.Execute(1);
 }
 
+#ifdef PADDLE_WITH_CUDA
+TEST(softmax_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_softmax_op<::anakin::saber::NV>(ctx, true);
+}
+#endif
+
+TEST(softmax_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_softmax_op<::anakin::saber::X86>(ctx, false);
+}
+
 } // namespace anakin
 } // namespace inference
 } // namespace paddle
 
 USE_OP(softmax);
+USE_CPU_ANAKIN_CONVERTER(softmax);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(softmax);
+#endif
@@ -21,12 +21,14 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-template <int Axis>
-void AnakinSliceTest(const std::vector<int> &in_shape,
+template <typename TargetT, int Axis>
+void AnakinSliceTest(const platform::DeviceContext &context, bool use_gpu,
+                     const std::vector<int> &in_shape,
                      const std::vector<int> &sections) {
   std::unordered_set<std::string> parameters({""});
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
 
   validator.DeclInputVar("split_input", in_shape);
   std::vector<std::string> output_vars;
@@ -55,51 +57,58 @@ void AnakinSliceTest(const std::vector<int> &in_shape,
 // batch = 0, axis = 1, same shape
 TEST(split_op, test_same_shape_axis1_batch1) {
-  AnakinSliceTest<1>({1, 4, 2, 2}, {2, 2});
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  AnakinSliceTest<::anakin::saber::NV, 1>(ctx, true, {1, 4, 2, 2}, {2, 2});
 }
 // batch = 0, axis = 1, different shape
 TEST(split_op, test_different_shape_axis1_batch1) {
-  AnakinSliceTest<1>({1, 3, 2, 2}, {2, 1});
-}
-// batch = 10, axis = 1, same shape
-TEST(split_op, test_same_shape_axis1_batch10) {
-  AnakinSliceTest<1>({1, 4, 2, 2}, {2, 2});
-}
-// batch = 10, axis = 1, different shape
-TEST(split_op, test_different_shape_axis1_batch10) {
-  AnakinSliceTest<1>({1, 3, 2, 2}, {2, 1});
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  AnakinSliceTest<::anakin::saber::NV, 1>(ctx, true, {1, 3, 2, 2}, {2, 1});
 }
 // batch = 0, axis = 2, same shape
 TEST(split_op, test_same_shape_axis2_batch1) {
-  AnakinSliceTest<2>({1, 3, 4, 2}, {2, 2});
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  AnakinSliceTest<::anakin::saber::NV, 2>(ctx, true, {1, 3, 4, 2}, {2, 2});
 }
 // batch = 0, axis = 2, different shape
 TEST(split_op, test_different_shape_axis2_batch1) {
-  AnakinSliceTest<2>({1, 3, 3, 2}, {2, 1});
-}
-// batch = 10, axis = 2, same shape
-TEST(split_op, test_same_shape_axis2_batch10) {
-  AnakinSliceTest<2>({1, 3, 4, 2}, {2, 2});
-}
-// batch = 10, axis = 2, different shape
-TEST(split_op, test_different_shape_axis2_batch10) {
-  AnakinSliceTest<2>({1, 3, 3, 2}, {2, 1});
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  AnakinSliceTest<::anakin::saber::NV, 2>(ctx, true, {1, 3, 3, 2}, {2, 1});
 }
 // batch = 0, axis = 3, same shape
 TEST(split_op, test_same_shape_axis3_batch1) {
-  AnakinSliceTest<3>({1, 3, 2, 4}, {2, 2});
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 4}, {2, 2});
 }
 // batch = 0, axis = 3, different shape
 TEST(split_op, test_different_shape_axis3_batch1) {
-  AnakinSliceTest<3>({1, 3, 2, 3}, {2, 1});
-}
-// batch = 10, axis = 3, same shape
-TEST(split_op, test_same_shape_axis3_batch10) {
-  AnakinSliceTest<3>({1, 3, 2, 4}, {2, 2});
-}
-// batch = 10, axis = 3, different shape
-TEST(split_op, test_different_shape_axis3_batch10) {
-  AnakinSliceTest<3>({1, 3, 2, 3}, {2, 1});
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 3}, {2, 1});
+}
+
+TEST(split_op, test_different_shape_axis1_batch1_cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  AnakinSliceTest<::anakin::saber::X86, 1>(ctx, false, {1, 3, 2, 3}, {2, 1});
+}
+
+TEST(split_op, test_different_shape_axis2_batch1_cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  AnakinSliceTest<::anakin::saber::X86, 2>(ctx, false, {1, 3, 4, 2}, {2, 2});
+}
+
+TEST(split_op, test_different_shape_axis3_batch1_cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  AnakinSliceTest<::anakin::saber::X86, 3>(ctx, false, {1, 3, 2, 4}, {2, 2});
 }
 
 } // namespace anakin
@@ -107,4 +116,7 @@ TEST(split_op, test_different_shape_axis3_batch10) {
 } // namespace paddle
 
 USE_OP(split);
+USE_CPU_ANAKIN_CONVERTER(split);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(split);
+#endif
@@ -22,10 +22,12 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-TEST(sum, native) {
+template <typename TargetT>
+static void test_sum_op(const platform::DeviceContext& context, bool use_gpu) {
   std::unordered_set<std::string> parameters;
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
   validator.DeclInputVar("sum_x1", {1, 2, 1, 2});
   validator.DeclInputVar("sum_x2", {1, 2, 1, 2});
   validator.DeclOutputVar("sum_out", {1, 2, 1, 2});
@@ -40,9 +42,26 @@ TEST(sum, native) {
   validator.Execute(1);
 }
 
+#ifdef PADDLE_WITH_CUDA
+TEST(sum_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_sum_op<::anakin::saber::NV>(ctx, true);
+}
+#endif
+
+TEST(sum_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_sum_op<::anakin::saber::X86>(ctx, false);
+}
+
 } // namespace anakin
 } // namespace inference
 } // namespace paddle
 
 USE_OP(sum);
+USE_CPU_ANAKIN_CONVERTER(sum);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(sum);
+#endif
@@ -20,12 +20,12 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-TEST(transpose_op, test) {
-  auto* converter = Registry<AnakinOpConverter>::Global().Lookup("transpose");
-  ASSERT_TRUE(converter != nullptr);
+template <typename TargetT>
+void test_transpose1_op(const platform::DeviceContext& context, bool use_gpu) {
   std::unordered_set<std::string> parameters;
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
   validator.DeclInputVar("transpose-X", {2, 3, 4, 5});
   validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3});
@@ -43,11 +43,12 @@ TEST(transpose_op, test) {
   validator.Execute(3);
 }
 
-// test input shape's dims < 4
-TEST(transpose_op, test2) {
+template <typename TargetT>
+void test_transpose2_op(const platform::DeviceContext& context, bool use_gpu) {
   std::unordered_set<std::string> parameters;
   framework::Scope scope;
-  AnakinConvertValidation validator(parameters, &scope);
+  AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
+                                             use_gpu);
   validator.DeclInputVar("transpose-X", {3, 4, 5});
   validator.DeclOutputVar("transpose-Out", {3, 5, 4});
@@ -65,9 +66,38 @@ TEST(transpose_op, test2) {
   validator.Execute(1);
 }
 
+#ifdef PADDLE_WITH_CUDA
+TEST(transpose1_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_transpose1_op<::anakin::saber::NV>(ctx, true);
+}
+
+TEST(transpose2_op, gpu) {
+  platform::CUDAPlace gpu_place(0);
+  platform::CUDADeviceContext ctx(gpu_place);
+  test_transpose2_op<::anakin::saber::NV>(ctx, true);
+}
+#endif
+
+TEST(transpose1_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_transpose1_op<::anakin::saber::X86>(ctx, false);
+}
+
+TEST(transpose2_op, cpu) {
+  platform::CPUPlace cpu_place;
+  platform::CPUDeviceContext ctx(cpu_place);
+  test_transpose2_op<::anakin::saber::X86>(ctx, false);
+}
+
 } // namespace anakin
 } // namespace inference
 } // namespace paddle
 
 USE_OP(transpose);
+USE_CPU_ANAKIN_CONVERTER(transpose);
+#ifdef PADDLE_WITH_CUDA
 USE_ANAKIN_CONVERTER(transpose);
+#endif
@@ -17,20 +17,16 @@
 #include <string>
 #include <vector>
 
-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::saber::NV;
-using anakin::saber::Shape;
 using anakin::PTuple;
 
 namespace paddle {
 namespace inference {
 namespace anakin {
 
-void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
-                                      const framework::BlockDesc &block_desc,
-                                      const framework::Scope &scope,
-                                      bool test_mode) {
+template <typename TargetT>
+void TransposeOpConverter<TargetT>::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
@@ -38,7 +34,7 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
   auto input = op_desc.Input("X").front();
   auto output = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  engine_->AddOp(op_name, "Permute", {input}, {output});
+  this->engine_->AddOp(op_name, "Permute", {input}, {output});
 
   auto axis = boost::get<std::vector<int>>(op_desc.GetAttr("axis"));
   size_t axis_size = axis.size();
@@ -46,11 +42,17 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
     axis.push_back(axis_size);
     axis_size += 1;
   }
-  engine_->AddOpAttr<PTuple<int>>(op_name, "dims", axis);
+  this->engine_->template AddOpAttr<PTuple<int>>(op_name, "dims", axis);
 }
 
 } // namespace anakin
 } // namespace inference
 } // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(transpose, TransposeOpConverter);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_CUDA_ANAKIN_OP_CONVERTER(transpose,
+                                  TransposeOpConverter<::anakin::saber::NV>);
+#endif
+REGISTER_CPU_ANAKIN_OP_CONVERTER(transpose,
+                                 TransposeOpConverter<::anakin::saber::X86>);
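
The `this->engine_->template AddOpAttr<PTuple<int>>(...)` spelling above is forced by C++ name lookup once the converter becomes a class template: members inherited from the now-dependent base `AnakinOpConverter<TargetT>` must be reached through `this->`, and a member template called on a dependent object needs the `template` disambiguator. A minimal standalone sketch of that rule (illustrative names, not code from this patch):

#include <iostream>
#include <string>

template <typename TargetT>
struct EngineBase {
  template <typename AttrT>
  void AddOpAttr(const std::string& name, const AttrT& value) {
    std::cout << "attr " << name << " = " << value << "\n";
  }
};

template <typename TargetT>
struct ConverterBase {
  EngineBase<TargetT>* engine_ = nullptr;
};

template <typename TargetT>
struct TransposeLikeConverter : ConverterBase<TargetT> {
  void Run() {
    // Without `this->`, `engine_` is not found in the dependent base;
    // without `template`, the `<` would parse as a less-than operator.
    this->engine_->template AddOpAttr<int>("dims", 4);
  }
};

int main() {
  EngineBase<float> engine;
  TransposeLikeConverter<float> conv;
  conv.engine_ = &engine;
  conv.Run();
}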
@@ -20,7 +20,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-class TransposeOpConverter : public AnakinOpConverter {
+template <typename TargetT>
+class TransposeOpConverter : public AnakinOpConverter<TargetT> {
  public:
   TransposeOpConverter() = default;
...
@@ -32,14 +32,8 @@ limitations under the License. */
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/platform/enforce.h"
 
-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
 using anakin::Precision;
-using anakin::saber::NV;
 using anakin::saber::X86;
-using anakin::saber::Shape;
-using anakin::PBlock;
-using anakin::PTuple;
 
 namespace paddle {
 namespace inference {
@@ -55,8 +49,8 @@ float random(float low, float high) {
   return dist(mt);
 }
 
-void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
-                     const platform::DeviceContext& ctx) {
+void RandomizeTensor(framework::LoDTensor* tensor,
                     const platform::Place& place) {
   auto dims = tensor->dims();
   size_t num_elements = analysis::AccuDims(dims, dims.size());
   PADDLE_ENFORCE_GT(num_elements, 0);
@@ -78,17 +72,19 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
  * anakin
  * layer.
  */
+template <typename TargetT>
 class AnakinConvertValidation {
-  using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
+  using AnakinNvEngineT = AnakinEngine<TargetT, Precision::FP32>;
 
  public:
   AnakinConvertValidation() = delete;
 
   AnakinConvertValidation(const std::unordered_set<std::string>& parameters,
-                          framework::Scope* scope)
-      : parameters_(parameters), scope_(scope), place_(0) {
-    PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
-    engine_.reset(new AnakinEngine<NV, Precision::FP32>(true));
+                          framework::Scope* scope,
+                          const platform::DeviceContext& ctx,
+                          bool use_gpu = true)
+      : parameters_(parameters), scope_(scope), ctx_(ctx), use_gpu_(use_gpu) {
+    engine_.reset(new AnakinEngine<TargetT, Precision::FP32>(true));
   }
 
   // Declare a Variable as input with random initialization.
@@ -108,11 +104,10 @@ class AnakinConvertValidation {
   }
 
   void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
-    platform::CUDADeviceContext ctx(place_);
     auto* x = scope_->Var(name);
     auto* x_tensor = x->GetMutable<framework::LoDTensor>();
     x_tensor->Resize(framework::make_ddim(dim_vec));
-    RandomizeTensor(x_tensor, place_, ctx);
+    RandomizeTensor(x_tensor, ctx_.GetPlace());
 
     std::vector<int64_t> dim_vec_int64;
     for (auto& ele : dim_vec) {
@@ -132,7 +127,7 @@ class AnakinConvertValidation {
 
     // should init anakin engine here.
     auto& block_desc = program_desc_.Block(framework::kRootBlockIndex);
-    Singleton<AnakinOpConverter>::Global().ConvertOp(
+    Singleton<AnakinOpConverter<TargetT>>::Global().ConvertOp(
        desc, block_desc, parameters_, *scope_, engine_.get(),
        true /*test_mode*/);
     engine_->Freeze();
@@ -160,11 +155,8 @@ class AnakinConvertValidation {
   void Execute(int batch_size,
                std::unordered_set<std::string> neglected_output = {}) {
     // Execute Fluid Op
-    platform::CUDADeviceContext ctx(place_);
-    op_->Run(*scope_, place_);
-
-    // std::vector<framework::LoDTensor> input_vector;
-    // std::vector<framework::LoDTensor> output_vector;
+    op_->Run(*scope_, ctx_.GetPlace());
+
     std::map<std::string, framework::LoDTensor*> inputs;
     for (const auto& input : op_desc_->InputArgumentNames()) {
       if (parameters_.count(input)) continue;
@@ -180,20 +172,27 @@ class AnakinConvertValidation {
       std::vector<float> fluid_out;
       auto* var = scope_->FindVar(output);
       auto tensor = var->GetMutable<framework::LoDTensor>();
-      framework::TensorToVector(*tensor, ctx, &fluid_out);
+      framework::TensorToVector(*tensor, ctx_, &fluid_out);
       fluid_outputs.push_back(fluid_out);
 
       outputs.insert({output, tensor});
     }
 
-    engine_->Execute(inputs, outputs, stream_);
+    if (!use_gpu_) {
+      engine_->Execute(inputs, outputs);
+    } else {
+      cudaStream_t stream;
+      PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream), 0);
+      engine_->Execute(inputs, outputs, stream);
+    }
+
     int i_output = 0;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       if (neglected_output.count(output)) continue;
       std::vector<float> anakin_out;
       auto* var = scope_->FindVar(output);
       auto tensor = var->GetMutable<framework::LoDTensor>();
-      framework::TensorToVector(*tensor, ctx, &anakin_out);
+      framework::TensorToVector(*tensor, ctx_, &anakin_out);
 
       size_t anakin_out_size = anakin_out.size();
       auto fluid_out = fluid_outputs[i_output++];
@@ -205,15 +204,17 @@ class AnakinConvertValidation {
 
  private:
   std::unique_ptr<AnakinNvEngineT> engine_{nullptr};
-  cudaStream_t stream_;
   std::unique_ptr<framework::OperatorBase> op_;
   std::unique_ptr<framework::OpDesc> op_desc_;
   framework::ProgramDesc program_desc_;
   const std::unordered_set<std::string>& parameters_;
   framework::Scope* scope_;
-  platform::CUDAPlace place_;
+  const platform::DeviceContext& ctx_;
+  bool use_gpu_{true};
 };
 
+template class AnakinConvertValidation<::anakin::saber::NV>;
+template class AnakinConvertValidation<::anakin::saber::X86>;
+
 } // namespace anakin
 } // namespace inference
 } // namespace paddle
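
The two `template class AnakinConvertValidation<...>;` lines are explicit instantiations: they make the compiler emit the validator for both Anakin targets, so every test file that picks a specialization links against the same generated code. A self-contained sketch of the pattern (stand-in types, not the Paddle classes):

#include <iostream>

struct NV {};   // stand-ins for ::anakin::saber::NV / X86
struct X86 {};

template <typename TargetT>
class Validation {
 public:
  void Execute() { std::cout << "run on " << Name() << "\n"; }

 private:
  static const char* Name();
};

template <> const char* Validation<NV>::Name() { return "NV"; }
template <> const char* Validation<X86>::Name() { return "X86"; }

// Force code generation for both targets, as the helper header does.
template class Validation<NV>;
template class Validation<X86>;

int main() {
  Validation<NV>{}.Execute();
  Validation<X86>{}.Execute();
}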
@@ -69,11 +69,11 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::AddOp(
 }
 
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
-    const std::map<std::string, framework::LoDTensor *> &inputs,
-    const std::map<std::string, framework::LoDTensor *> &outputs,
-    cudaStream_t stream) {
+void AnakinEngine<TargetT, PrecisionType, RunType>::BindInput(
+    const std::map<std::string, framework::LoDTensor *> &inputs) {
+#ifdef PADDLE_WITH_CUDA
   cudaDeviceSynchronize();
+#endif
   for (const auto &input : inputs) {
     auto *tensor = input.second;
     auto *data = tensor->data<float>();
@@ -105,6 +105,35 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
         fluid_input_shape);
     anakin_input->copy_from(tmp_anakin_tensor);
   }
+}
+
+template <typename TargetT, Precision PrecisionType, OpRunType RunType>
+void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
+    const std::map<std::string, framework::LoDTensor *> &inputs,
+    const std::map<std::string, framework::LoDTensor *> &outputs) {
+  BindInput(inputs);
+  net_->prediction();
+  for (const auto &output : outputs) {
+    platform::CPUPlace cpu_place;
+    auto *tensor = output.second;
+    auto *anakin_output = net_->get_out(output.first);
+    auto *anakin_data = anakin_output->data();
+    auto anakin_output_shape = anakin_output->valid_shape();
+    tensor->Resize(framework::make_ddim(anakin_output_shape));
+    auto *fluid_data = tensor->mutable_data<float>(cpu_place);
+    memory::Copy(cpu_place, static_cast<void *>(fluid_data), cpu_place,
+                 static_cast<void *>(anakin_data),
+                 tensor->numel() * sizeof(float));
+  }
+}
+
+#ifdef PADDLE_WITH_CUDA
+template <typename TargetT, Precision PrecisionType, OpRunType RunType>
+void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
+    const std::map<std::string, framework::LoDTensor *> &inputs,
+    const std::map<std::string, framework::LoDTensor *> &outputs,
+    cudaStream_t stream) {
+  BindInput(inputs);
+
   net_->prediction();
   cudaDeviceSynchronize();
   for (const auto &output : outputs) {
@@ -121,6 +150,7 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
   }
   cudaDeviceSynchronize();
 }
+#endif
 
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
 void AnakinEngine<TargetT, PrecisionType, RunType>::Freeze() {
@@ -140,7 +170,15 @@ AnakinEngine<TargetT, PrecisionType, RunType>::Clone() {
   return std::unique_ptr<AnakinEngine>(engine);
 }
 
+#ifdef PADDLE_WITH_CUDA
 template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>;
+template class AnakinEngineManager<::anakin::saber::NV>;
+#endif
+
+template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>;
+template class AnakinEngineManager<::anakin::saber::X86>;
+
 } // namespace anakin
 } // namespace inference
 } // namespace paddle
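
The restructuring above factors the input staging both back ends share into a private `BindInput()`, adds a stream-less `Execute()` for the CPU path, and keeps the `cudaStream_t` overload behind `PADDLE_WITH_CUDA`. A self-contained sketch of that split, with simplified stand-in types rather than the real engine:

#include <iostream>
#include <map>
#include <string>

using TensorMap = std::map<std::string, float>;

class Engine {
 public:
  // Stream-less overload: the new CPU execution path.
  void Execute(const TensorMap& inputs, TensorMap* outputs) {
    BindInput(inputs);
    Predict(outputs);
  }
#ifdef PADDLE_WITH_CUDA
  // GPU overload; the real code also synchronizes the device around it.
  void Execute(const TensorMap& inputs, TensorMap* outputs, void* stream) {
    BindInput(inputs);
    Predict(outputs);
  }
#endif

 private:
  // Shared staging step, factored out exactly as the patch does.
  void BindInput(const TensorMap& inputs) { staged_ = inputs; }
  void Predict(TensorMap* outputs) {
    for (const auto& kv : staged_) (*outputs)[kv.first] = kv.second * 2.0f;
  }
  TensorMap staged_;
};

int main() {
  Engine engine;
  TensorMap in{{"x", 1.5f}}, out;
  engine.Execute(in, &out);  // CPU-style, stream-less call
  std::cout << out["x"] << "\n";  // prints 3
}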
@@ -32,7 +32,6 @@
 #include "saber/saber_types.h"
 
 using anakin::Precision;
-using anakin::saber::NV;
 
 namespace anakin {
@@ -94,9 +93,16 @@ class AnakinEngine {
   void Save(std::string path) { graph_->save(path); }
   bool IsInit() { return initialized_; }
   int GetDevice() { return device_; }
+  void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
+               const std::map<std::string, framework::LoDTensor *> &outputs);
+#ifdef PADDLE_WITH_CUDA
   void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
                const std::map<std::string, framework::LoDTensor *> &outputs,
                cudaStream_t stream);
+#endif
+
+ private:
+  void BindInput(const std::map<std::string, framework::LoDTensor *> &inputs);
 
  private:
   bool initialized_{false};
@@ -108,24 +114,25 @@ class AnakinEngine {
   std::vector<std::string> program_inputs_;
 };
 
+template <typename TargetT>
 class AnakinEngineManager {
-  using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
+  using AnakinEngineT = AnakinEngine<TargetT, Precision::FP32>;
 
  public:
   bool HasEngine(const std::string &name) const {
     if (engines_.count(name) == 0) return false;
     return engines_.at(name).get() != nullptr;
   }
-  AnakinNvEngineT *Get(const std::string &name) const {
+  AnakinEngineT *Get(const std::string &name) const {
     return engines_.at(name).get();
   }
 
-  AnakinNvEngineT *Create(
-      bool need_summary, int device, int max_batch_size,
-      std::map<std::string, std::vector<int>> max_input_shape,
-      std::vector<std::string> program_inputs, std::string engine_name) {
+  AnakinEngineT *Create(bool need_summary, int device, int max_batch_size,
+                        std::map<std::string, std::vector<int>> max_input_shape,
+                        std::vector<std::string> program_inputs,
+                        std::string engine_name) {
     std::unique_lock<std::mutex> lk(mut_);
-    auto *p = new AnakinEngine<NV, Precision::FP32>(
+    auto *p = new AnakinEngine<TargetT, Precision::FP32>(
         need_summary, device, max_batch_size, max_input_shape, program_inputs);
     engines_[engine_name].reset(p);
     return p;
@@ -138,7 +145,7 @@ class AnakinEngineManager {
   }
 
  private:
-  std::unordered_map<std::string, std::unique_ptr<AnakinNvEngineT>> engines_;
+  std::unordered_map<std::string, std::unique_ptr<AnakinEngineT>> engines_;
   std::mutex mut_;
 };
 } // namespace anakin
...
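
Templating `AnakinEngineManager` on the target gives each back end its own engine registry, so a GPU (NV) and a CPU (X86) engine can coexist under the same engine key. A self-contained sketch of the idea (stand-in types only, not the Paddle classes):

#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

template <typename TargetT>
struct Engine { std::string name; };

template <typename TargetT>
class EngineManager {
 public:
  Engine<TargetT>* Create(const std::string& key) {
    std::lock_guard<std::mutex> lk(mut_);  // guard the registry, like Create()
    auto* p = new Engine<TargetT>{key};
    engines_[key].reset(p);
    return p;
  }
  Engine<TargetT>* Get(const std::string& key) const {
    return engines_.at(key).get();
  }

 private:
  std::unordered_map<std::string, std::unique_ptr<Engine<TargetT>>> engines_;
  std::mutex mut_;
};

struct NV {};
struct X86 {};

int main() {
  EngineManager<NV> gpu_mgr;
  EngineManager<X86> cpu_mgr;
  gpu_mgr.Create("subgraph_0");
  cpu_mgr.Create("subgraph_0");  // same key, separate per-target registries
  std::cout << gpu_mgr.Get("subgraph_0")->name << "\n";
}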
@@ -67,7 +67,7 @@ struct Argument {
 #define DECL_ARGUMENT_FIELD(field__, Field, type__) \
  public: \
   type__& field__() { \
-    PADDLE_ENFORCE(Has(#field__)); \
+    PADDLE_ENFORCE(Has(#field__), "There is no such field"); \
     return field__##_; \
   } \
   void Set##Field(const type__& x) { \
...
@@ -114,6 +114,7 @@ void IRPassManager::CreatePasses(Argument *argument,
     if (pass_name == "anakin_subgraph_pass") {
       pass->Set("program",
                 new framework::ProgramDesc *(&argument->main_program()));
+      pass->Set("use_gpu", new bool(argument->use_gpu()));
       pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
       pass->Set("model_from_memory", new bool(argument->model_from_memory()));
       pass->Set("engine_opt_info", new std::map<std::string, std::string>(
...
@@ -194,20 +194,49 @@ void AnakinSubgraphPass::CreateAnakinOp(
   auto max_batch_size = Get<int>("max_batch_size");
   auto program_inputs = program_desc->GetFeedTargetNames();
 
-  auto *anakin_engine =
-      inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
-          true, Get<int>("gpu_device_id"), max_batch_size, max_input_shape,
-          program_inputs, engine_key);
+  bool use_gpu = Get<bool>("use_gpu");
+  SetAttr(op_desc->Proto(), "use_gpu", use_gpu);
+
+  if (use_gpu) {
+#ifdef PADDLE_WITH_CUDA
+    inference::Singleton<
+        anakin::AnakinEngineManager<::anakin::saber::NV>>::Global()
+        .Create(true, Get<int>("gpu_device_id"), max_batch_size,
+                max_input_shape, program_inputs, engine_key);
+#endif
+  } else {
+    inference::Singleton<
+        anakin::AnakinEngineManager<::anakin::saber::X86>>::Global()
+        .Create(true, Get<int>("gpu_device_id"), max_batch_size,
+                max_input_shape, program_inputs, engine_key);
+  }
 
   auto *scope = param_scope();
   std::unordered_set<std::string> param_set(params.begin(), params.end());
   framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
 
-  inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
+  if (use_gpu) {
+    auto *anakin_engine =
+        inference::Singleton<inference::anakin::AnakinEngineManager<
+            ::anakin::saber::NV>>::Global()
+            .Get(engine_key);
+    inference::Singleton<
+        inference::anakin::AnakinOpConverter<::anakin::saber::NV>>::Global()
+        .ConvertBlockToAnakinEngine(
+            &block_desc_temp, scope,
+            std::vector<std::string>(input_names.begin(), input_names.end()),
+            param_set, output_mapping, anakin_engine);
+  } else {
+    auto *anakin_engine =
+        inference::Singleton<inference::anakin::AnakinEngineManager<
+            ::anakin::saber::X86>>::Global()
+            .Get(engine_key);
+    inference::Singleton<
+        inference::anakin::AnakinOpConverter<::anakin::saber::X86>>::Global()
        .ConvertBlockToAnakinEngine(
            &block_desc_temp, scope,
            std::vector<std::string>(input_names.begin(), input_names.end()),
            param_set, output_mapping, anakin_engine);
+  }
 }
 
 } // namespace analysis
...
@@ -70,4 +70,3 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
   anakin_target(inference_anakin_api)
   anakin_target(inference_anakin_api_shared)
 endif()
-inference_analysis_test(faster_rcnn_test SRCS faster_rcnn_test.cc EXTRA_DEPS paddle_fluid)
@@ -268,9 +268,11 @@ void AnalysisConfig::Update() {
     PADDLE_ENFORCE(!use_tensorrt_,
                    "Anakin sub-graph and TensorRT sub-graph are not allowed to "
                    "run at the same time!");
-    PADDLE_ENFORCE(
-        use_gpu_,
-        "Anakin sub-graph engine need gpu, please use the EnableGpu API.");
+    if (use_gpu_) {
+      LOG(INFO) << "Run Anakin GPU mode";
+    } else {
+      LOG(INFO) << "Run Anakin CPU mode";
+    }
 
     pass_builder()->ClearPasses();
     for (const auto &pass : kAnakinSubgraphPasses) {
...
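
For users, the practical effect of this change is that the Anakin subgraph engine no longer hard-fails on CPU-only configs. A hedged usage sketch; the exact `EnableAnakinEngine` parameters and the predictor factory overload vary across Paddle versions, so treat the calls below as assumptions rather than a verified API:

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/model");  // placeholder model directory
  // No EnableUseGpu() call: the predictor stays on CPU, and the pass logs
  // "Run Anakin CPU mode" instead of tripping the old PADDLE_ENFORCE(use_gpu_).
  config.EnableAnakinEngine();
  auto predictor = paddle::CreatePaddlePredictor(config);
  return predictor != nullptr ? 0 : 1;
}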
@@ -382,7 +382,7 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
   }
 
-  if (config_.use_gpu() && config_.anakin_engine_enabled()) {
+  if (config_.anakin_engine_enabled()) {
     argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
     argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_);
     argument_.SetAnakinMinSubgraphSize(config_.anakin_min_subgraph_size_);
...
@@ -34,28 +34,16 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using FluidDT = framework::proto::VarType_Type;
 using inference::Singleton;
-
-using anakin::graph::GraphGlobalMem;
-using anakin::AK_FLOAT;
-using anakin::Precision;
-using anakin::saber::NV;
-using anakin::saber::X86;
-using anakin::saber::Shape;
-using anakin::PBlock;
-using anakin::PTuple;
 using inference::anakin::AnakinEngine;
 
 class AnakinEngineOp : public framework::OperatorBase {
-  using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
-
  private:
   std::vector<std::string> input_names_;
   std::unordered_set<std::string> param_names_;
-  mutable AnakinNvEngineT *anakin_engine_;
   std::string engine_key_;
   std::string engine_serialized_data_;
+  bool use_gpu_;
 
  public:
   AnakinEngineOp(const std::string &type,
@@ -66,10 +54,10 @@ class AnakinEngineOp : public framework::OperatorBase {
     input_names_ = Inputs("Xs");
     engine_key_ = Attr<std::string>("engine_key");
     auto params = Attr<std::vector<std::string>>("parameters");
+    use_gpu_ = Attr<bool>("use_gpu");
     for (const auto &param : params) {
       param_names_.insert(param);
     }
-    anakin_engine_ = nullptr;
   }
 
 protected:
@@ -80,7 +68,6 @@ class AnakinEngineOp : public framework::OperatorBase {
   void RunAnakin(const framework::Scope &scope,
                  const platform::Place &dev_place) const {
-    auto *engine = GetEngine(scope, dev_place);
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(dev_place);
     auto stream =
@@ -92,7 +79,6 @@ class AnakinEngineOp : public framework::OperatorBase {
         Attr<std::vector<std::string>>("output_name_mapping");
 
     std::map<std::string, framework::LoDTensor *> inputs;
-    // Convert input tensor from fluid to engine.
     for (const auto &x : Inputs("Xs")) {
       if (param_names_.count(x)) continue;
       auto &t =
@@ -110,17 +96,21 @@ class AnakinEngineOp : public framework::OperatorBase {
       outputs.insert({output_maps[output_index], fluid_t});
       output_index += 1;
     }
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      auto *engine =
+          inference::Singleton<inference::anakin::AnakinEngineManager<
+              ::anakin::saber::NV>>::Global()
+              .Get(engine_key_);
       engine->Execute(inputs, outputs, stream);
-  }
-
-  AnakinNvEngineT *GetEngine(const framework::Scope &scope,
-                             const platform::Place &dev_place) const {
-    if (anakin_engine_ == nullptr) {
-      anakin_engine_ =
-          inference::Singleton<inference::anakin::AnakinEngineManager>::Global()
+#endif
+    } else {
+      auto *engine =
+          inference::Singleton<inference::anakin::AnakinEngineManager<
+              ::anakin::saber::X86>>::Global()
              .Get(engine_key_);
+      engine->Execute(inputs, outputs);
     }
-    return anakin_engine_;
   }
 };
...
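
With the cached `anakin_engine_` member and `GetEngine()` gone, the op re-reads its `use_gpu` attribute on each run and fetches the engine from the per-target manager by key. A self-contained sketch of that dispatch (stand-in types, not the operator itself):

#include <iostream>
#include <string>

template <typename TargetT>
struct Manager {
  static Manager& Global() { static Manager m; return m; }
  std::string Get(const std::string& key) { return key; }  // lookup by key
};

struct NV {};
struct X86 {};

void RunAnakin(const std::string& engine_key, bool use_gpu) {
  if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
    auto engine = Manager<NV>::Global().Get(engine_key);
    std::cout << "GPU engine " << engine << " (stream overload)\n";
#endif
  } else {
    auto engine = Manager<X86>::Global().Get(engine_key);
    std::cout << "CPU engine " << engine << " (stream-less overload)\n";
  }
}

int main() { RunAnakin("subgraph_0", false); }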