Commit e14ab180 authored by nhzlx

Cherry-pick from 1662, 16797..: add anakin int8 support

Parent 7ad182e1
......@@ -48,8 +48,9 @@ void FCFusePass::ApplyImpl(ir::Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
auto base_op_desc = *mul->Op()->Proto();
// Create an FC Node.
OpDesc desc;
OpDesc desc(base_op_desc, nullptr);
std::string fc_x_in = subgraph.at(x)->Name();
std::string fc_Y_in = w->Name();
std::string fc_bias_in = fc_bias->Name();
......
......@@ -1640,7 +1640,8 @@ PDNode *patterns::FillConstantElementWiseMulFuse::operator()(
void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
const std::string &op_type,
const std::string &weight_name,
int times) {
int times,
const std::string &quant_type) {
const int kNumFields = 5;
const int kQuantizedWeightOffset = 0;
const int kQuantizedOpOffset = 1;
......@@ -1648,24 +1649,22 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
const int kDequantOpOffset = 3;
const int kDequantOpOutOffset = 4;
// there is always exactly one quant op.
auto quant_op_in_scale =
pattern->NewNode(GetNodeName("quant_op_in_scale"))
->assert_is_op_input("fake_quantize_range_abs_max", "InScale")
->AsInput();
auto quant_op = pattern->NewNode(GetNodeName("quant_op"))
->assert_is_op("fake_quantize_range_abs_max");
auto quant_op_in_scale = pattern->NewNode(GetNodeName("quant_op_in_scale"))
->assert_is_op_input(quant_type, "InScale")
->AsInput();
auto quant_op =
pattern->NewNode(GetNodeName("quant_op"))->assert_is_op(quant_type);
auto quant_op_out_scale =
pattern->NewNode(GetNodeName("quant_op_out_scale"))
->assert_is_op_output("fake_quantize_range_abs_max", "OutScale")
->assert_is_op_output(quant_type, "OutScale")
->assert_is_op_input("fake_dequantize_max_abs", "Scale")
->AsIntermediate();
auto quant_op_out =
pattern->NewNode(GetNodeName("quant_op_out"))
->assert_is_op_output("fake_quantize_range_abs_max", "Out")
->assert_is_op_input(op_type)
->AsIntermediate();
auto quant_op_out = pattern->NewNode(GetNodeName("quant_op_out"))
->assert_is_op_output(quant_type, "Out")
->assert_is_op_input(op_type)
->AsIntermediate();
// there are 'times' pairs of quantized and dequant ops
std::vector<PDNode *> nodes;
......
......@@ -880,7 +880,8 @@ struct QuantDequantOpFuse : public PatternBase {
: PatternBase(pattern, name_scope, "quant_dequant_fuse") {}
void operator()(PDNode* quant_op_input, const std::string& op_name,
const std::string& weight_name, int times = 1);
const std::string& weight_name, int times,
const std::string& quant_type);
std::string GetNodeName(const std::string& op_type) {
return PDNodeName(name_scope_, repr_, id_, op_type);
......
......@@ -25,7 +25,8 @@ namespace framework {
namespace ir {
void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
std::string op_type) {
const std::string& op_type,
const std::string& quant_type) {
const std::string pattern_name = "quant_dequant_fuse";
// FusePassBase::Init(pattern_name, graph);
const int kNumFields = 5;
......@@ -38,7 +39,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
->NewNode("x")
->assert_is_op_input("fake_quantize_range_abs_max", "X")
->assert_is_op_input(quant_type, "X")
->AsInput();
std::string quantized_op_type = "";
......@@ -46,6 +47,9 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
if (op_type == "conv2d") {
quantized_op_type = "conv2d";
weight_name = "Filter";
} else if (op_type == "depthwise_conv2d") {
quantized_op_type = "depthwise_conv2d";
weight_name = "Filter";
} else if (op_type == "conv2d_fusion") {
quantized_op_type = "conv2d_fusion";
weight_name = "Filter";
......@@ -62,7 +66,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
}
patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name);
pattern(x, quantized_op_type, weight_name, times);
pattern(x, quantized_op_type, weight_name, times, quant_type);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
......@@ -103,7 +107,6 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
std::unordered_set<const Node*> delete_nodes;
for (int i = 0; i < times; i++) {
// max_range = (range * range) / weight_scale
float max_range = boost::get<float>(
nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr("max_range"));
float weight_scale = (range * range) / max_range;
......@@ -118,7 +121,8 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
new_op_desc.SetType(quantized_op_type);
if (quantized_op_type == "conv2d" ||
quantized_op_type == "conv2d_fusion") {
quantized_op_type == "conv2d_fusion" ||
quantized_op_type == "depthwise_conv2d") {
new_op_desc.SetInput("Input", {new_input});
new_op_desc.SetOutput("Output", {new_output});
} else if (quantized_op_type == "fc") {
......@@ -156,11 +160,17 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "quant_dequant_fuse";
FusePassBase::Init(pattern_name, graph);
std::unordered_set<std::string> quantized_op_types = {"conv2d", "mul"};
std::unordered_set<std::string> quant_types = {
"fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
std::unordered_set<std::string> quantized_op_types = {"conv2d", "mul",
"depthwise_conv2d"};
auto* scope = param_scope();
for (auto& op_type : quantized_op_types) {
for (int i = 1; i <= 6; i++) {
RunQuantDequant(graph, scope, i, op_type);
for (auto& quant_type : quant_types) {
for (auto& op_type : quantized_op_types) {
for (int i = 6; i >= 1; i--) {
RunQuantDequant(graph, scope, i, op_type, quant_type);
}
}
}
}
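For reference, the core of the pass is unchanged: it still recovers each quantized op's weight scale from the dequant op's max_range attribute via weight_scale = (range * range) / max_range; what this commit adds is the outer iteration over both quant op types (fake_quantize_range_abs_max and fake_quantize_moving_average_abs_max) and the depthwise_conv2d op. A minimal sketch of that scale recovery, assuming range = 127 as in 8-bit fake quantization (how range is computed is not shown in the hunks above):

// Sketch only: invert the relation stored by fake_dequantize_max_abs,
// max_range = (range * range) / weight_scale, to get the weight scale back.
float RecoverWeightScale(float max_range, float range = 127.f) {
  return (range * range) / max_range;
}
// Example: a layer quantized with weight_scale = 64 carries
// max_range = 127 * 127 / 64 ≈ 252, and RecoverWeightScale(252.f) ≈ 64.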
......
cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc affine_channel.cc roi_align.cc DEPS anakin_engine framework_proto scope op_registry)
cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc
batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc
detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc affine_channel.cc
roi_align.cc helper.cc DEPS anakin_engine framework_proto scope op_registry
gtest)
cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL)
cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
......
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
ActivationOpConverter<TargetT>::ActivationOpConverter(
template <typename TargetT, ::anakin::Precision PrecisionT>
ActivationOpConverter<TargetT, PrecisionT>::ActivationOpConverter(
const std::string &op_type)
: op_type_(op_type) {
auto it = anakin_op_types_.find(op_type_);
......@@ -30,8 +30,8 @@ ActivationOpConverter<TargetT>::ActivationOpConverter(
anakin_op_type_ = it->second;
}
template <typename TargetT>
void ActivationOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void ActivationOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -50,11 +50,40 @@ void ActivationOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(sigmoid,
SigmoidOpConverter<::anakin::saber::NV>);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter<::anakin::saber::NV>);
using sigmoid_nv_fp32 =
::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using sigmoid_nv_int8 =
::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
using tanh_nv_fp32 =
::paddle::inference::anakin::TanhOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using tanh_nv_int8 =
::paddle::inference::anakin::TanhOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_nv_int8);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(tanh, tanh_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(tanh, tanh_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(sigmoid,
SigmoidOpConverter<::anakin::saber::X86>);
REGISTER_CPU_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter<::anakin::saber::X86>);
using sigmoid_cpu_fp32 =
::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using sigmoid_cpu_int8 =
::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
using tanh_cpu_fp32 =
::paddle::inference::anakin::TanhOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using tanh_cpu_int8 =
::paddle::inference::anakin::TanhOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_cpu_int8);
REGISTER_CPU_ANAKIN_OP_CONVERTER(tanh, tanh_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(tanh, tanh_cpu_int8);
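Every converter touched by this commit follows the registration pattern shown above: instantiate the class template once per (target, precision) pair, alias it, and register the FP32 alias with REGISTER_*_ANAKIN_OP_CONVERTER and the INT8 alias with REGISTER_*_INT8_ANAKIN_OP_CONVERTER. As an illustration, a relu converter would be registered like this (its actual registration is not part of this diff, so the class name is assumed):

#ifdef PADDLE_WITH_CUDA
using relu_nv_fp32 =
    ::paddle::inference::anakin::ReluOpConverter<::anakin::saber::NV,
                                                 ::anakin::Precision::FP32>;
using relu_nv_int8 =
    ::paddle::inference::anakin::ReluOpConverter<::anakin::saber::NV,
                                                 ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(relu, relu_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(relu, relu_nv_int8);
#endif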
......@@ -22,8 +22,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class ActivationOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class ActivationOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
explicit ActivationOpConverter(const std::string &op_type);
......@@ -40,16 +40,17 @@ class ActivationOpConverter : public AnakinOpConverter<TargetT> {
{"sigmoid", "Sigmoid"}};
};
template <typename TargetT>
class TanhOpConverter : public ActivationOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class TanhOpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
public:
TanhOpConverter() : ActivationOpConverter<TargetT>("tanh") {}
TanhOpConverter() : ActivationOpConverter<TargetT, PrecisionT>("tanh") {}
};
template <typename TargetT>
class SigmoidOpConverter : public ActivationOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class SigmoidOpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
public:
SigmoidOpConverter() : ActivationOpConverter<TargetT>("sigmoid") {}
SigmoidOpConverter()
: ActivationOpConverter<TargetT, PrecisionT>("sigmoid") {}
};
} // namespace anakin
} // namespace inference
......
......@@ -16,18 +16,14 @@
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::PTuple;
using anakin::AK_FLOAT;
using anakin::saber::Shape;
#include "paddle/fluid/inference/anakin/convert/helper.h"
namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void AffineChannelOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void AffineChannelOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -35,60 +31,20 @@ void AffineChannelOpConverter<TargetT>::operator()(
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
auto input_name = op_desc.Input("X").front();
auto output_name = op_desc.Output("Out").front();
this->engine_->AddOp(op_name, "AffineChannel", {input_name}, {output_name});
// Copy the Scale to CPUPlace and get the pointer.
auto *scale_v = scope.FindVar(op_desc.Input("Scale").front());
PADDLE_ENFORCE_NOT_NULL(scale_v);
auto *scale_t = scale_v->GetMutable<framework::LoDTensor>();
std::unique_ptr<framework::LoDTensor> scale_tensor(
new framework::LoDTensor());
scale_tensor->Resize(scale_t->dims());
TensorCopySync((*scale_t), platform::CPUPlace(), scale_tensor.get());
auto weight1 = pblock_from_var<TargetT>(*scale_v);
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
// Copy the Bias to CPUPlace and get the pointer.
auto *bias_v = scope.FindVar(op_desc.Input("Bias").front());
PADDLE_ENFORCE_NOT_NULL(bias_v);
auto *bias_t = bias_v->GetMutable<framework::LoDTensor>();
std::unique_ptr<framework::LoDTensor> bias_tensor(new framework::LoDTensor());
bias_tensor->Resize(bias_t->dims());
TensorCopySync((*bias_t), platform::CPUPlace(), bias_tensor.get());
this->engine_->AddOp(op_name, "AffineChannel", {input_name}, {output_name});
// Generate the Scale parameter of Anakin.
auto scale_shape = framework::vectorize2int(scale_t->dims());
while (scale_shape.size() < 4) {
scale_shape.insert(scale_shape.begin(), 1);
}
Shape anakin_scale_shape(scale_shape);
auto *weight1 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_scale_shape);
float *scale_cpu_data =
static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(scale_tensor->data<float>(), scale_tensor->numel(),
scale_cpu_data);
weight1->d_tensor().set_shape(anakin_scale_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
// Generate the Bias parameter of Anakin.
auto bias_shape = framework::vectorize2int(bias_t->dims());
while (bias_shape.size() < 4) {
bias_shape.insert(bias_shape.begin(), 1);
}
Shape anakin_bias_shape(bias_shape);
auto *weight2 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_bias_shape);
float *bias_cpu_data =
static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(bias_tensor->data<float>(), bias_tensor->numel(), bias_cpu_data);
weight2->d_tensor().set_shape(anakin_bias_shape);
weight2->d_tensor().copy_from(weight2->h_tensor());
auto weight2 = pblock_from_var<TargetT>(*bias_v);
this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
}
......@@ -97,8 +53,21 @@ void AffineChannelOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(
affine_channel, AffineChannelOpConverter<::anakin::saber::NV>);
using affine_channel_nv_fp32 =
::paddle::inference::anakin::AffineChannelOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using affine_channel_nv_int8 =
::paddle::inference::anakin::AffineChannelOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(
affine_channel, AffineChannelOpConverter<::anakin::saber::X86>);
using affine_channel_cpu_fp32 =
::paddle::inference::anakin::AffineChannelOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using affine_channel_cpu_int8 =
::paddle::inference::anakin::AffineChannelOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_cpu_int8);
......@@ -21,8 +21,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class AffineChannelOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class AffineChannelOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
AffineChannelOpConverter() = default;
......
......@@ -18,17 +18,14 @@
#include <map>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::Shape;
#include "paddle/fluid/inference/anakin/convert/helper.h"
namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void BatchNormOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void BatchNormOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -36,87 +33,46 @@ void BatchNormOpConverter<TargetT>::operator()(
std::map<std::string, std::string> inputs;
for (auto k : {"X", "Scale", "Bias", "Mean", "Variance"}) {
PADDLE_ENFORCE_EQ(op_desc.Input(k).size(), 1UL);
auto v = op_desc.Input(k).front();
inputs.insert({k, v});
}
auto input = op_desc.Input("X").front();
auto output = op_desc.Output("Y").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front();
auto epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));
// auto momentum = boost::get<float>(op_desc.GetAttr("momentum"));
auto bn_op_name = op_name + ":bn";
auto bn_output = bn_op_name + "_output";
this->engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
this->engine_->AddOp(bn_op_name, "BatchNorm", {input}, {bn_output});
this->engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
this->engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));
auto scale_op_name = op_name + ":scale";
auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name,
framework::LoDTensor *tensor) {
auto *v = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(v);
auto *t = v->GetMutable<framework::LoDTensor>();
tensor->Resize(t->dims());
TensorCopySync(*t, platform::CPUPlace(), tensor);
};
framework::LoDTensor bias_t;
framework::LoDTensor mean_t;
framework::LoDTensor scale_t;
framework::LoDTensor variance_t;
get_lod_tensor(inputs["Bias"], &bias_t);
get_lod_tensor(inputs["Mean"], &mean_t);
get_lod_tensor(inputs["Scale"], &scale_t);
get_lod_tensor(inputs["Variance"], &variance_t);
this->engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output});
this->engine_->AddOpAttr(scale_op_name, "axis", 1);
this->engine_->AddOpAttr(scale_op_name, "num_axes", 1);
this->engine_->AddOpAttr(scale_op_name, "bias_term", true);
auto fill_shape = [](size_t n, std::vector<int> shape) {
shape.insert(shape.begin(), 1);
if (shape.size() < n) {
shape.insert(shape.end(), n - shape.size(), 1);
}
return shape;
};
Shape shape1(fill_shape(4, framework::vectorize2int(mean_t.dims())));
Shape shape2(fill_shape(4, framework::vectorize2int(variance_t.dims())));
auto *weight1 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(shape1);
auto *mean_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(mean_t.data<float>(), mean_t.numel(), mean_data);
auto *mean_v = scope.FindVar(op_desc.Input("Mean").front());
PADDLE_ENFORCE_NOT_NULL(mean_v);
auto weight1 = pblock_from_var<TargetT>(*mean_v);
this->engine_->AddOpAttr(bn_op_name, "weight_1", *weight1);
auto *weight2 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(shape2);
auto *variance_data =
static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(variance_t.data<float>(), variance_t.numel(), variance_data);
auto *variance_v = scope.FindVar(op_desc.Input("Variance").front());
PADDLE_ENFORCE_NOT_NULL(variance_v);
auto weight2 = pblock_from_var<TargetT>(*variance_v);
this->engine_->AddOpAttr(bn_op_name, "weight_2", *weight2);
Shape shape3(std::vector<int>({1, 1, 1, 1}));
auto *weight3 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(shape3);
auto *alpha_data = static_cast<float *>(weight3->h_tensor().mutable_data());
float weight3_data[] = {1};
std::copy(std::begin(weight3_data), std::end(weight3_data), alpha_data);
auto *weight3 = pblock_from_vector<TargetT>(std::vector<float>({1}));
this->engine_->AddOpAttr(bn_op_name, "weight_3", *weight3);
Shape scale_shape(fill_shape(4, framework::vectorize2int(scale_t.dims())));
auto *scale = GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
scale_shape);
auto *scale_data = static_cast<float *>(scale->h_tensor().mutable_data());
std::copy_n(scale_t.data<float>(), scale_t.numel(), scale_data);
Shape bias_shape(fill_shape(4, framework::vectorize2int(bias_t.dims())));
auto *bias = GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
bias_shape);
auto *bias_data = static_cast<float *>(bias->h_tensor().mutable_data());
std::copy_n(bias_t.data<float>(), bias_t.numel(), bias_data);
this->engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output});
this->engine_->AddOpAttr(scale_op_name, "axis", 1);
this->engine_->AddOpAttr(scale_op_name, "num_axes", 1);
this->engine_->AddOpAttr(scale_op_name, "bias_term", true);
auto *scale_v = scope.FindVar(op_desc.Input("Scale").front());
PADDLE_ENFORCE_NOT_NULL(scale_v);
auto scale = pblock_from_var<TargetT>(*scale_v);
this->engine_->AddOpAttr(scale_op_name, "weight_1", *scale);
auto *bias_v = scope.FindVar(op_desc.Input("Bias").front());
PADDLE_ENFORCE_NOT_NULL(bias_v);
auto bias = pblock_from_var<TargetT>(*bias_v);
this->engine_->AddOpAttr(scale_op_name, "weight_2", *bias);
}
......@@ -125,9 +81,17 @@ void BatchNormOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(batch_norm,
BatchNormOpConverter<::anakin::saber::NV>);
using bn_nv_fp32 = ::paddle::inference::anakin::BatchNormOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using bn_nv_int8 = ::paddle::inference::anakin::BatchNormOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(batch_norm, bn_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(batch_norm, bn_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(batch_norm,
BatchNormOpConverter<::anakin::saber::X86>);
using bn_cpu_fp32 = ::paddle::inference::anakin::BatchNormOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using bn_cpu_int8 = ::paddle::inference::anakin::BatchNormOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(batch_norm, bn_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(batch_norm, bn_cpu_int8);
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class BatchNormOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class BatchNormOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
BatchNormOpConverter() = default;
......
......@@ -19,8 +19,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void ConcatOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void ConcatOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -39,8 +39,21 @@ void ConcatOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(concat,
ConcatOpConverter<::anakin::saber::NV>);
using concat_nv_fp32 =
::paddle::inference::anakin::ConcatOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using concat_nv_int8 =
::paddle::inference::anakin::ConcatOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(concat, concat_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(concat, concat_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(concat,
ConcatOpConverter<::anakin::saber::X86>);
using concat_cpu_fp32 =
::paddle::inference::anakin::ConcatOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using concat_cpu_int8 =
::paddle::inference::anakin::ConcatOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(concat, concat_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(concat, concat_cpu_int8);
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class ConcatOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class ConcatOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
ConcatOpConverter() = default;
......
......@@ -16,18 +16,16 @@
#include <algorithm>
#include <memory>
#include <vector>
#include "paddle/fluid/inference/anakin/convert/helper.h"
using anakin::graph::GraphGlobalMem;
using anakin::PTuple;
using anakin::AK_FLOAT;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void Conv2dOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void Conv2dOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -42,11 +40,8 @@ void Conv2dOpConverter<TargetT>::operator()(
auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(filter_v);
auto *filter_t = filter_v->GetMutable<framework::LoDTensor>();
std::unique_ptr<framework::LoDTensor> weight_tensor(
new framework::LoDTensor());
weight_tensor->Resize(filter_t->dims());
TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get());
auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace());
auto weight_shape = framework::vectorize2int(weight_tensor->dims());
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
......@@ -69,25 +64,61 @@ void Conv2dOpConverter<TargetT>::operator()(
this->engine_->AddOpAttr(op_name, "axis", 1);
this->engine_->AddOpAttr(op_name, "bias_term", false);
auto weight_shape = framework::vectorize2int(filter_t->dims());
Shape anakin_shape(weight_shape);
auto *weight1 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_shape);
float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
::anakin::saber::Shape anakin_shape(weight_shape);
bool enable_int8 = op_desc.HasAttr("enable_int8");
if (enable_int8) {
const float int8_range = 127.;
float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
float weight_scale = boost::get<float>(op_desc.GetAttr("weight_scale"));
auto *weight1 = ::anakin::graph::GraphGlobalMem<TargetT>::Global()
.template new_block<::anakin::AK_INT8>(anakin_shape);
float *weight_data = weight_tensor->data<float>();
std::vector<char> weight_int8;
int weight_num = weight_tensor->numel();
for (int i = 0; i < weight_tensor->numel(); i++) {
bool is_valid_int8 =
((weight_data[i] >= -128) && (weight_data[i] <= 127));
PADDLE_ENFORCE(is_valid_int8,
"We are in anakin subgraph int8 mode, the weight of conv "
"should be in range [-128, 127]");
weight_int8.push_back(static_cast<char>(weight_data[i]));
}
memcpy(static_cast<void *>(weight1->h_tensor().mutable_data()),
static_cast<void *>(weight_int8.data()), sizeof(char) * weight_num);
weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8);
this->engine_->Graph()->SetWeightsScale(op_name,
{weight_scale / int8_range}, false);
this->engine_->AddTensorScale(input_name, in_scale / int8_range);
} else {
auto *weight1 = pblock_from_tensor<TargetT>(*weight_tensor, weight_shape);
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
}
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d,
Conv2dOpConverter<::anakin::saber::X86>);
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d,
Conv2dOpConverter<::anakin::saber::NV>);
using conv2d_nv_fp32 =
::paddle::inference::anakin::Conv2dOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using conv2d_nv_int8 =
::paddle::inference::anakin::Conv2dOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d, conv2d_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(conv2d, conv2d_nv_int8);
#endif
using conv2d_cpu_fp32 =
::paddle::inference::anakin::Conv2dOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using conv2d_cpu_int8 =
::paddle::inference::anakin::Conv2dOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d, conv2d_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(conv2d, conv2d_cpu_int8);
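The enable_int8 branch above expects the conv filter to already hold integer values in [-128, 127] (presumably rewritten by the quant_dequant fuse pass earlier in this commit); the converter only range-checks, narrows each element to char, and hands Anakin the weight and input scales normalized by 127. A compact sketch of the narrowing step written as a standalone helper (hypothetical, not part of this commit):

// Sketch only (hypothetical helper): narrow an already-quantized float filter
// to int8 bytes, mirroring the loop in the enable_int8 branch above.
#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<char> PackInt8Weights(const float* data, int64_t numel) {
  std::vector<char> packed;
  packed.reserve(numel);
  for (int64_t i = 0; i < numel; ++i) {
    if (data[i] < -128.f || data[i] > 127.f) {
      throw std::runtime_error(
          "anakin int8 mode expects conv weights already in [-128, 127]");
    }
    packed.push_back(static_cast<char>(data[i]));
  }
  return packed;
}
// The converter memcpy()s these bytes into an AK_INT8 PBlock and then calls
// SetOpPrec(op_name, AK_INT8), SetWeightsScale(op_name, {weight_scale / 127.f},
// false), and AddTensorScale(input_name, in_scale / 127.f), as shown above.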
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class Conv2dOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class Conv2dOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
Conv2dOpConverter() = default;
......
......@@ -16,18 +16,16 @@
#include <algorithm>
#include <memory>
#include <vector>
#include "paddle/fluid/inference/anakin/convert/helper.h"
using anakin::graph::GraphGlobalMem;
using anakin::PTuple;
using anakin::AK_FLOAT;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void Conv2dFusionOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void Conv2dFusionOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -43,24 +41,16 @@ void Conv2dFusionOpConverter<TargetT>::operator()(
auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(filter_v);
auto *filter_t = filter_v->GetMutable<framework::LoDTensor>();
auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace());
auto weight_shape = framework::vectorize2int(weight_tensor->dims());
auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
PADDLE_ENFORCE_NOT_NULL(b_v);
auto *b_t = b_v->GetMutable<framework::LoDTensor>();
std::unique_ptr<framework::LoDTensor> weight_tensor(
new framework::LoDTensor());
weight_tensor->Resize(filter_t->dims());
TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get());
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
// const int n_output = weight_tensor->dims()[0];
// const int n_input = weight_tensor->dims()[1];
const int filter_h = weight_tensor->dims()[2];
const int filter_w = weight_tensor->dims()[3];
// auto filter_num = n_input * filter_h * filter_w ;
auto filter_num = weight_tensor->dims()[0];
this->engine_->template AddOpAttr<int>(op_name, "filter_num", filter_num);
this->engine_->template AddOpAttr<PTuple<int>>(op_name, "kernel_size",
......@@ -77,37 +67,42 @@ void Conv2dFusionOpConverter<TargetT>::operator()(
this->engine_->AddOpAttr(op_name, "axis", 1);
this->engine_->AddOpAttr(op_name, "bias_term", true);
auto weight_shape = framework::vectorize2int(filter_t->dims());
Shape anakin_shape(weight_shape);
auto *weight1 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_shape);
float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
auto bias_shape = framework::vectorize2int(b_t->dims());
framework::LoDTensor bias_tensor;
bias_tensor.Resize(b_t->dims());
TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor);
auto *bias_data = bias_tensor.data<float>();
bias_shape.insert(bias_shape.begin(), 1);
bias_shape.insert(bias_shape.begin(), 1);
bias_shape.insert(bias_shape.begin(), 1);
// bias_shape.push_back(1);
// bias_shape.push_back(1);
Shape anakin_bias_shape(bias_shape);
auto *weight2 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_bias_shape);
float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
weight2->d_tensor().set_shape(anakin_bias_shape);
weight2->d_tensor().copy_from(weight2->h_tensor());
this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
::anakin::saber::Shape anakin_shape(weight_shape);
bool enable_int8 = op_desc.HasAttr("enable_int8");
if (enable_int8) {
const float int8_range = 127.;
float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
float weight_scale = boost::get<float>(op_desc.GetAttr("weight_scale"));
auto *weight1 = ::anakin::graph::GraphGlobalMem<TargetT>::Global()
.template new_block<::anakin::AK_INT8>(anakin_shape);
float *weight_data = weight_tensor->data<float>();
std::vector<char> weight_int8;
int weight_num = weight_tensor->numel();
for (int i = 0; i < weight_tensor->numel(); i++) {
bool is_valid_int8 =
((weight_data[i] >= -128) && (weight_data[i] <= 127));
PADDLE_ENFORCE(is_valid_int8,
"We are in anakin subgraph int8 mode, the weight of conv "
"should be in range [-128, 127]");
weight_int8.push_back(static_cast<char>(weight_data[i]));
}
memcpy(static_cast<void *>(weight1->h_tensor().mutable_data()),
static_cast<void *>(weight_int8.data()), sizeof(char) * weight_num);
weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8);
this->engine_->Graph()->SetWeightsScale(op_name,
{weight_scale / int8_range}, false);
this->engine_->AddTensorScale(input_name, in_scale / int8_range);
} else {
auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace());
auto weight_shape = framework::vectorize2int(weight_tensor->dims());
auto *weight1 = pblock_from_tensor<TargetT>(*weight_tensor, weight_shape);
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
auto weight2 = pblock_from_var<TargetT>(*b_v);
this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
}
}
} // namespace anakin
......@@ -115,9 +110,21 @@ void Conv2dFusionOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d_fusion,
Conv2dFusionOpConverter<::anakin::saber::NV>);
using conv2d_fusion_nv_fp32 =
::paddle::inference::anakin::Conv2dFusionOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using conv2d_fusion_nv_int8 =
::paddle::inference::anakin::Conv2dFusionOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d_fusion, conv2d_fusion_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(conv2d_fusion, conv2d_fusion_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d_fusion,
Conv2dFusionOpConverter<::anakin::saber::X86>);
using conv2d_fusion_cpu_fp32 =
::paddle::inference::anakin::Conv2dFusionOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using conv2d_fusion_cpu_int8 =
::paddle::inference::anakin::Conv2dFusionOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d_fusion, conv2d_fusion_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(conv2d_fusion, conv2d_fusion_cpu_int8);
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class Conv2dFusionOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class Conv2dFusionOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
Conv2dFusionOpConverter() = default;
......
......@@ -23,8 +23,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void DensityPriorBoxOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void DensityPriorBoxOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc& op, const framework::BlockDesc& block_desc,
const framework::Scope& scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -109,13 +109,24 @@ void DensityPriorBoxOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(
density_prior_box, DensityPriorBoxOpConverter<::anakin::saber::NV>);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(
prior_box, DensityPriorBoxOpConverter<::anakin::saber::NV>);
using ds_pr_nv_fp32 = ::paddle::inference::anakin::DensityPriorBoxOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using ds_pr_nv_int8 = ::paddle::inference::anakin::DensityPriorBoxOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(density_prior_box, ds_pr_nv_fp32);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(prior_box, ds_pr_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(density_prior_box, ds_pr_nv_int8);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(prior_box, ds_pr_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(
density_prior_box, DensityPriorBoxOpConverter<::anakin::saber::X86>);
REGISTER_CPU_ANAKIN_OP_CONVERTER(
prior_box, DensityPriorBoxOpConverter<::anakin::saber::X86>);
using ds_pr_cpu_fp32 = ::paddle::inference::anakin::DensityPriorBoxOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using ds_pr_cpu_int8 = ::paddle::inference::anakin::DensityPriorBoxOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(density_prior_box, ds_pr_cpu_fp32);
REGISTER_CPU_ANAKIN_OP_CONVERTER(prior_box, ds_pr_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(density_prior_box, ds_pr_cpu_int8);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(prior_box, ds_pr_cpu_int8);
......@@ -22,8 +22,9 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class DensityPriorBoxOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class DensityPriorBoxOpConverter
: public AnakinOpConverter<TargetT, PrecisionT> {
public:
DensityPriorBoxOpConverter() = default;
......
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void DetectionOutOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void DetectionOutOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -67,8 +67,21 @@ void DetectionOutOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(detection_out,
DetectionOutOpConverter<::anakin::saber::NV>);
using detection_out_nv_fp32 =
::paddle::inference::anakin::DetectionOutOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using detection_out_nv_int8 =
::paddle::inference::anakin::DetectionOutOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(detection_out, detection_out_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(detection_out, detection_out_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(detection_out,
DetectionOutOpConverter<::anakin::saber::X86>);
using detection_out_cpu_fp32 =
::paddle::inference::anakin::DetectionOutOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using detection_out_cpu_int8 =
::paddle::inference::anakin::DetectionOutOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(detection_out, detection_out_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(detection_out, detection_out_cpu_int8);
......@@ -22,8 +22,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class DetectionOutOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class DetectionOutOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
DetectionOutOpConverter() = default;
......
......@@ -16,17 +16,14 @@
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::Shape;
#include "paddle/fluid/inference/anakin/convert/helper.h"
namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void DropoutOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void DropoutOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -42,12 +39,7 @@ void DropoutOpConverter<TargetT>::operator()(
auto dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));
auto factor = 1 - dropout_prob;
Shape shape1(std::vector<int>({1, 1, 1, 1}));
auto *weight1 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(shape1);
auto *factor_data = static_cast<float *>(weight1->h_tensor().mutable_data());
float weight1_data[] = {factor};
std::copy(std::begin(weight1_data), std::end(weight1_data), factor_data);
auto *weight1 = pblock_from_vector<TargetT>(std::vector<float>({factor}));
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
this->engine_->AddOpAttr(op_name, "axis", 0);
......@@ -60,8 +52,21 @@ void DropoutOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(dropout,
DropoutOpConverter<::anakin::saber::NV>);
using dropout_nv_fp32 =
::paddle::inference::anakin::DropoutOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using dropout_nv_int8 =
::paddle::inference::anakin::DropoutOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(dropout, dropout_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(dropout, dropout_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(dropout,
DropoutOpConverter<::anakin::saber::X86>);
using dropout_cpu_fp32 =
::paddle::inference::anakin::DropoutOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using dropout_cpu_int8 =
::paddle::inference::anakin::DropoutOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(dropout, dropout_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(dropout, dropout_cpu_int8);
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class DropoutOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class DropoutOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
DropoutOpConverter() = default;
......
......@@ -17,17 +17,14 @@
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void ElementwiseAddOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void ElementwiseAddOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -48,8 +45,8 @@ void ElementwiseAddOpConverter<TargetT>::operator()(
this->engine_->template AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
}
template <typename TargetT>
void ElementwiseMulOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void ElementwiseMulOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -75,12 +72,31 @@ void ElementwiseMulOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(
elementwise_add, ElementwiseAddOpConverter<::anakin::saber::NV>);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(
elementwise_mul, ElementwiseMulOpConverter<::anakin::saber::NV>);
using elet_nv_fp32 = ::paddle::inference::anakin::ElementwiseAddOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using elet_nv_int8 = ::paddle::inference::anakin::ElementwiseAddOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
using eletmul_nv_fp32 = ::paddle::inference::anakin::ElementwiseMulOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using eletmul_nv_int8 = ::paddle::inference::anakin::ElementwiseMulOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(elementwise_add, elet_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(elementwise_add, elet_nv_int8);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(elementwise_mul, eletmul_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(elementwise_mul, eletmul_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(
elementwise_add, ElementwiseAddOpConverter<::anakin::saber::X86>);
REGISTER_CPU_ANAKIN_OP_CONVERTER(
elementwise_mul, ElementwiseMulOpConverter<::anakin::saber::X86>);
using elet_cpu_fp32 = ::paddle::inference::anakin::ElementwiseAddOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using elet_cpu_int8 = ::paddle::inference::anakin::ElementwiseAddOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
using eletmul_cpu_fp32 = ::paddle::inference::anakin::ElementwiseMulOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using eletmul_cpu_int8 = ::paddle::inference::anakin::ElementwiseMulOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(elementwise_add, elet_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(elementwise_add, elet_cpu_int8);
REGISTER_CPU_ANAKIN_OP_CONVERTER(elementwise_mul, eletmul_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(elementwise_mul, eletmul_cpu_int8);
......@@ -20,8 +20,9 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class ElementwiseAddOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class ElementwiseAddOpConverter
: public AnakinOpConverter<TargetT, PrecisionT> {
public:
ElementwiseAddOpConverter() = default;
......@@ -34,8 +35,9 @@ class ElementwiseAddOpConverter : public AnakinOpConverter<TargetT> {
private:
};
template <typename TargetT>
class ElementwiseMulOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class ElementwiseMulOpConverter
: public AnakinOpConverter<TargetT, PrecisionT> {
public:
ElementwiseMulOpConverter() = default;
......
......@@ -16,22 +16,19 @@
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::Shape;
#include "paddle/fluid/inference/anakin/convert/helper.h"
namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void FcBaseOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void FcBaseOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
auto input_names = op_desc.InputNames();
bool with_bias = input_names.size() == 3;
bool with_bias = input_names.size() >= 3;
std::string w_name = "Y";
std::string i_name = "X";
......@@ -45,7 +42,12 @@ void FcBaseOpConverter<TargetT>::operator()(
// get weights
auto *y_v = scope.FindVar(op_desc.Input(w_name).front());
PADDLE_ENFORCE_NOT_NULL(y_v);
auto *y_t = y_v->GetMutable<framework::LoDTensor>();
auto weight_tensor = tensor_from_var(*y_v, platform::CPUPlace());
auto weight_shape = framework::vectorize2int(weight_tensor->dims());
int out_dim = weight_shape[1];
const int w_m = weight_shape[0];
const int w_k = weight_shape[1];
auto input_name = op_desc.Input(i_name).front();
auto output_name = op_desc.Output("Out").front();
......@@ -53,64 +55,58 @@ void FcBaseOpConverter<TargetT>::operator()(
this->engine_->AddOp(op_name, "Dense", {input_name}, {output_name});
this->engine_->AddOpAttr(op_name, "bias_term", with_bias);
this->engine_->AddOpAttr(op_name, "axis", 1);
auto weight_shape = framework::vectorize2int(y_t->dims());
int out_dim = weight_shape[1];
this->engine_->AddOpAttr(op_name, "out_dim", out_dim);
const int w_m = weight_shape[0];
const int w_k = weight_shape[1];
if (weight_shape.size() < 4UL) {
weight_shape.insert(weight_shape.begin(), 4UL - weight_shape.size(), 1);
}
Shape anakin_shape(weight_shape);
framework::LoDTensor weight_tensor;
weight_tensor.Resize(y_t->dims());
TensorCopySync((*y_t), platform::CPUPlace(), &weight_tensor);
auto *weight_data = weight_tensor.data<float>();
PADDLE_ENFORCE(w_m * w_k == weight_tensor.numel());
auto *weight_data = weight_tensor->data<float>();
PADDLE_ENFORCE(w_m * w_k == weight_tensor->numel());
std::vector<float> trans_weight_data(weight_tensor.numel());
std::vector<float> trans_weight_data(weight_tensor->numel());
for (int i = 0; i < w_m; i++) {
for (int j = 0; j < w_k; j++) {
trans_weight_data[i + j * w_m] = weight_data[i * w_k + j];
}
}
auto *weight1 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_shape);
float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(trans_weight_data.data(), weight_tensor.numel(), cpu_data);
weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
int weight_num = weight_tensor->numel();
bool enable_int8 = op_desc.HasAttr("enable_int8");
if (enable_int8) {
if (weight_shape.size() < 4UL) {
weight_shape.insert(weight_shape.begin(), 4UL - weight_shape.size(), 1);
}
::anakin::saber::Shape anakin_shape(weight_shape);
const float int8_range = 127.;
float in_scale = boost::get<float>(op_desc.GetAttr("input_scale"));
float weight_scale = boost::get<float>(op_desc.GetAttr("weight_scale"));
auto *weight1 = ::anakin::graph::GraphGlobalMem<TargetT>::Global()
.template new_block<::anakin::AK_INT8>(anakin_shape);
std::vector<char> weight_int8;
for (int i = 0; i < weight_num; i++) {
bool is_valid_int8 =
((trans_weight_data[i] >= -128) && (trans_weight_data[i] <= 127));
PADDLE_ENFORCE(is_valid_int8,
"We are in anakin subgraph int8 mode, the weight of fc "
"should be in range [-128, 127]");
weight_int8.push_back(static_cast<char>(trans_weight_data[i]));
}
memcpy(static_cast<void *>(weight1->h_tensor().mutable_data()),
static_cast<void *>(weight_int8.data()), sizeof(char) * weight_num);
weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8);
this->engine_->Graph()->SetWeightsScale(op_name,
{weight_scale / int8_range}, false);
this->engine_->AddTensorScale(input_name, in_scale / int8_range);
} else {
auto *weight1 = pblock_from_vector<TargetT>(trans_weight_data);
this->engine_->AddOpAttr(op_name, "weight_1", *weight1);
}
// get bias
if (with_bias) {
auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
PADDLE_ENFORCE_NOT_NULL(b_v);
auto *b_t = b_v->GetMutable<framework::LoDTensor>();
auto bias_shape = framework::vectorize2int(b_t->dims());
framework::LoDTensor bias_tensor;
bias_tensor.Resize(b_t->dims());
TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor);
auto *bias_data = bias_tensor.data<float>();
bias_shape.insert(bias_shape.begin(), 1);
bias_shape.insert(bias_shape.begin(), 1);
bias_shape.insert(bias_shape.begin(), 1);
// bias_shape.push_back(1);
// bias_shape.push_back(1);
Shape anakin_bias_shape(bias_shape);
auto *weight2 =
GraphGlobalMem<TargetT>::Global().template new_block<AK_FLOAT>(
anakin_bias_shape);
float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
weight2->d_tensor().set_shape(anakin_bias_shape);
weight2->d_tensor().copy_from(weight2->h_tensor());
auto weight2 = pblock_from_var<TargetT>(*b_v);
this->engine_->AddOpAttr(op_name, "weight_2", *weight2);
}
}
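For reference, the loop above (trans_weight_data[i + j * w_m] = weight_data[i * w_k + j]) re-packs Paddle's [w_m, w_k] row-major weight into column-major order, presumably because Anakin's Dense op consumes the transposed layout; the INT8 branch then narrows this transposed buffer exactly as the conv converters do. A tiny self-contained illustration of the index mapping:

// Sketch only: the fc/mul weight re-packing shown above, on a 2x3 example.
#include <vector>

int main() {
  const int w_m = 2, w_k = 3;           // [in_dim, out_dim] as stored by Paddle
  std::vector<float> w = {1, 2, 3,      // row 0 of Y
                          4, 5, 6};     // row 1 of Y
  std::vector<float> trans(w.size());
  for (int i = 0; i < w_m; i++)
    for (int j = 0; j < w_k; j++)
      trans[i + j * w_m] = w[i * w_k + j];
  // trans == {1, 4, 2, 5, 3, 6}: element (i, j) of Y now sits at column-major
  // position (i, j), i.e. the buffer handed to Anakin is Y transposed.
  return 0;
}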
......@@ -120,9 +116,39 @@ void FcBaseOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(mul, MulOpConverter<::anakin::saber::NV>);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(fc, FcOpConverter<::anakin::saber::NV>);
using mul_nv_fp32 =
::paddle::inference::anakin::MulOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using fc_nv_fp32 =
::paddle::inference::anakin::FcOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using mul_nv_int8 =
::paddle::inference::anakin::MulOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
using fc_nv_int8 =
::paddle::inference::anakin::FcOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(mul, mul_nv_fp32);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(fc, fc_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(mul, mul_nv_int8);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(fc, fc_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(mul, MulOpConverter<::anakin::saber::X86>);
REGISTER_CPU_ANAKIN_OP_CONVERTER(fc, FcOpConverter<::anakin::saber::X86>);
using mul_cpu_fp32 =
::paddle::inference::anakin::MulOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using fc_cpu_fp32 =
::paddle::inference::anakin::FcOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using mul_cpu_int8 =
::paddle::inference::anakin::MulOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
using fc_cpu_int8 =
::paddle::inference::anakin::FcOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(mul, mul_cpu_fp32);
REGISTER_CPU_ANAKIN_OP_CONVERTER(fc, fc_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(mul, mul_cpu_int8);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(fc, fc_cpu_int8);
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class FcBaseOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class FcBaseOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
FcBaseOpConverter() = default;
......@@ -33,15 +33,15 @@ class FcBaseOpConverter : public AnakinOpConverter<TargetT> {
};
// with bias
template <typename TargetT>
class FcOpConverter : public FcBaseOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class FcOpConverter : public FcBaseOpConverter<TargetT, PrecisionT> {
public:
FcOpConverter() = default;
};
// without bias
template <typename TargetT>
class MulOpConverter : public FcBaseOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class MulOpConverter : public FcBaseOpConverter<TargetT, PrecisionT> {
public:
MulOpConverter() = default;
};
......
......@@ -21,8 +21,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void FlattenOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void FlattenOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -46,8 +46,21 @@ void FlattenOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(flatten,
FlattenOpConverter<::anakin::saber::NV>);
using flatten_nv_fp32 =
::paddle::inference::anakin::FlattenOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using flatten_nv_int8 =
::paddle::inference::anakin::FlattenOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(flatten, flatten_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(flatten, flatten_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(flatten,
FlattenOpConverter<::anakin::saber::X86>);
using flatten_cpu_fp32 =
::paddle::inference::anakin::FlattenOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using flatten_cpu_int8 =
::paddle::inference::anakin::FlattenOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(flatten, flatten_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(flatten, flatten_cpu_int8);
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class FlattenOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class FlattenOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
FlattenOpConverter() = default;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/helper.h"
namespace paddle {
namespace inference {
namespace anakin {
std::unique_ptr<framework::LoDTensor> tensor_from_var(
const framework::Variable& var, const platform::Place& place) {
auto& src = var.Get<framework::LoDTensor>();
std::unique_ptr<framework::LoDTensor> dst(new framework::LoDTensor());
dst->Resize(src.dims());
TensorCopySync((src), place, dst.get());
return dst;
}
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "framework/core/net/net.h"
#include "framework/core/types.h"
#include "framework/graph/graph.h"
#include "framework/graph/graph_global_mem.h"
#include "saber/saber_types.h"
using anakin::saber::Shape;
using anakin::AK_FLOAT;
using anakin::PBlock;
using anakin::graph::GraphGlobalMem;
namespace paddle {
namespace inference {
namespace anakin {
std::unique_ptr<framework::LoDTensor> tensor_from_var(
const framework::Variable& var, const platform::Place& place);
template <typename T>
PBlock<T>* pblock_from_tensor(const framework::LoDTensor& tensor,
std::vector<int> shape) {
while (shape.size() < 4) {
shape.insert(shape.begin(), 1);
}
Shape anakin_shape(shape);
auto* weight =
GraphGlobalMem<T>::Global().template new_block<AK_FLOAT>(anakin_shape);
float* cpu_data = static_cast<float*>(weight->h_tensor().mutable_data());
std::copy_n(tensor.data<float>(), tensor.numel(), cpu_data);
weight->d_tensor().set_shape(anakin_shape);
weight->d_tensor().copy_from(weight->h_tensor());
return weight;
}
template <typename T>
PBlock<T>* pblock_from_vector(const std::vector<float>& vec,
std::vector<int> shape_vec) {
while (shape_vec.size() < 4) {
shape_vec.insert(shape_vec.begin(), 1);
}
Shape shape(shape_vec);
auto* weight =
GraphGlobalMem<T>::Global().template new_block<AK_FLOAT>(shape);
auto* weight_data = static_cast<float*>(weight->h_tensor().mutable_data());
std::copy(std::begin(vec), std::end(vec), weight_data);
weight->d_tensor().set_shape(shape);
weight->d_tensor().copy_from(weight->h_tensor());
return weight;
}
template <typename T>
PBlock<T>* pblock_from_vector(const std::vector<float>& vec) {
int size = vec.size();
return pblock_from_vector<T>(vec, std::vector<int>({1, 1, 1, size}));
}
template <typename T>
PBlock<T>* pblock_from_var(const framework::Variable& var) {
auto tensor = tensor_from_var(var, platform::CPUPlace());
auto shape = framework::vectorize2int(tensor->dims());
return pblock_from_tensor<T>(*tensor, shape);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
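For illustration, a minimal sketch (not part of this commit) of how an op converter could use the helpers above to hand a Paddle weight to Anakin; the variable name "fc_w", the op name and the AddOpAttr attachment are assumptions, not code from this diff.
// Illustrative sketch only; assumes the paddle::inference::anakin namespace,
// a converter body where this->engine_ is set, and a weight variable "fc_w".
auto* fc_w_var = scope.FindVar("fc_w");                      // Paddle-side weight variable
auto* fc_w_block = pblock_from_var<TargetT>(*fc_w_var);      // copy it into an Anakin PBlock
this->engine_->AddOpAttr(op_name, "weight_1", *fc_w_block);  // attach the block to the Anakin op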
......@@ -23,8 +23,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void Im2SequenceConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void Im2SequenceConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -55,5 +55,18 @@ void Im2SequenceConverter<TargetT>::operator()(
} // namespace inference
} // namespace paddle
REGISTER_CUDA_ANAKIN_OP_CONVERTER(im2sequence,
Im2SequenceConverter<::anakin::saber::NV>);
#ifdef PADDLE_WITH_CUDA
using im2sequence_nv_fp32 = ::paddle::inference::anakin::Im2SequenceConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using im2sequence_nv_int8 = ::paddle::inference::anakin::Im2SequenceConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(im2sequence, im2sequence_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(im2sequence, im2sequence_nv_int8);
#endif
using im2sequence_cpu_fp32 = ::paddle::inference::anakin::Im2SequenceConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using im2sequence_cpu_int8 = ::paddle::inference::anakin::Im2SequenceConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(im2sequence, im2sequence_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(im2sequence, im2sequence_cpu_int8);
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class Im2SequenceConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class Im2SequenceConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
Im2SequenceConverter() = default;
......
......@@ -32,9 +32,9 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
template <typename TargetT, ::anakin::Precision PrecisionT>
class AnakinOpConverter {
using AnakinEngineT = AnakinEngine<TargetT, ::anakin::Precision::FP32>;
using AnakinEngineT = AnakinEngine<TargetT, PrecisionT>;
public:
AnakinOpConverter() = default;
......@@ -96,6 +96,13 @@ class AnakinOpConverter {
engine->Graph()->RegistVar(output);
}
engine->Freeze();
// Add scale for tensor in int8 mode.
auto tensor_scales = engine->GetTensorScales();
for (auto &item : tensor_scales) {
engine->Graph()->SetVarScale(item.first, item.second);
}
for (auto &input : inputs) {
if (parameters.count(input)) continue;
std::vector<int> input_shape;
......@@ -136,52 +143,78 @@ class AnakinOpConverter {
AnakinEngineT *engine_{nullptr};
private:
std::unordered_map<std::string, AnakinOpConverter<TargetT> *> converters_;
std::unordered_map<std::string, AnakinOpConverter<TargetT, PrecisionT> *>
converters_;
framework::Scope *scope_{nullptr};
std::mutex mutex_;
};
template class AnakinOpConverter<::anakin::saber::NV>;
template class AnakinOpConverter<::anakin::saber::X86>;
template class AnakinOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
template class AnakinOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
template class AnakinOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
template class AnakinOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
} // namespace anakin
} // namespace inference
} // namespace paddle
#define REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, \
place_type__, place_class__) \
struct anakin_##op_type__##_##place_type__##_converter \
place_type__, place_class__, \
precision_type__, precision_class__) \
struct anakin_##op_type__##_##place_type__##_##precision_type__##_converter \
: public ::paddle::framework::Registrar { \
anakin_##op_type__##_##place_type__##_converter() { \
anakin_##op_type__##_##place_type__##_##precision_type__##_converter() { \
LOG(INFO) << "register convert " << #op_type__ << " "; \
::paddle::inference::Registry< \
::paddle::inference::anakin::AnakinOpConverter<place_class__>>:: \
Global() \
.Register<::paddle::inference::anakin::Converter__>(#op_type__); \
::paddle::inference::anakin::AnakinOpConverter< \
place_class__, precision_class__>>::Global() \
.Register<Converter__>(#op_type__); \
} \
}; \
anakin_##op_type__##_##place_type__##_converter \
anakin_##op_type__##_##place_type__##_converter__; \
int TouchConverterRegister_anakin_##op_type__##_##place_type__() { \
anakin_##op_type__##_##place_type__##_converter__.Touch(); \
anakin_##op_type__##_##place_type__##_##precision_type__##_converter \
anakin_##op_type__##_##place_type__##_##precision_type__##_converter__; \
int Touch_anakin_##op_type__##_##place_type__##_##precision_type__() { \
anakin_##op_type__##_##place_type__##_##precision_type__##_converter__ \
.Touch(); \
return 0; \
}
#define REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CUDA, \
::anakin::saber::NV)
::anakin::saber::NV, FP32, \
::anakin::Precision::FP32)
#define REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CUDA, \
::anakin::saber::NV, INT8, \
::anakin::Precision::INT8)
#define REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CPU, \
::anakin::saber::X86)
::anakin::saber::X86, FP32, \
::anakin::Precision::FP32)
#define REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CPU, \
::anakin::saber::X86, INT8, \
::anakin::Precision::INT8)
#define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__) \
extern int TouchConverterRegister_anakin_##op_type__##_##place_type__(); \
int use_op_converter_anakin_##op_type__##_##place_type__ \
__attribute__((unused)) = \
TouchConverterRegister_anakin_##op_type__##_##place_type__();
#define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__) \
extern int Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); \
int use_converter_anakin_##op_type__##_##place_type__##_##precision_type__ \
__attribute__((unused)) = \
Touch_anakin_##op_type__##_##place_type__##_##precision_type__();
#define USE_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA)
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32)
#define USE_INT8_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8)
#define USE_CPU_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU)
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32)
#define USE_CPU_INT8_ANAKIN_CONVERTER(op_type__) \
USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8)
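As a usage sketch of the reworked registration macros (MyOpConverter and my_op below are hypothetical names, mirroring the fc/pool2d/relu registrations in this commit): a converter source file aliases each precision instantiation and registers it, and a consumer translation unit pulls both in with the matching USE_* macro.
using my_op_nv_fp32 =
    ::paddle::inference::anakin::MyOpConverter<::anakin::saber::NV,
                                               ::anakin::Precision::FP32>;
using my_op_nv_int8 =
    ::paddle::inference::anakin::MyOpConverter<::anakin::saber::NV,
                                               ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(my_op, my_op_nv_fp32);       // FP32 registrar
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(my_op, my_op_nv_int8);  // INT8 registrar
// In a translation unit that needs the converters linked in:
USE_ANAKIN_CONVERTER(my_op);       // expands to USE_ANAKIN_CONVERTER_BASE(my_op, CUDA, FP32)
USE_INT8_ANAKIN_CONVERTER(my_op);  // expands to USE_ANAKIN_CONVERTER_BASE(my_op, CUDA, INT8)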
......@@ -23,8 +23,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void Pool2dOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void Pool2dOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -72,8 +72,21 @@ void Pool2dOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(pool2d,
Pool2dOpConverter<::anakin::saber::NV>);
using pool2d_nv_float32 =
::paddle::inference::anakin::Pool2dOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using pool2d_nv_int8 =
::paddle::inference::anakin::Pool2dOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(pool2d, pool2d_nv_float32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(pool2d, pool2d_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(pool2d,
Pool2dOpConverter<::anakin::saber::X86>);
using pool2d_cpu_float32 =
::paddle::inference::anakin::Pool2dOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using pool2d_cpu_int8 =
::paddle::inference::anakin::Pool2dOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(pool2d, pool2d_cpu_float32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(pool2d, pool2d_cpu_int8);
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class Pool2dOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class Pool2dOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
Pool2dOpConverter() = default;
......
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void ReluOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void ReluOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -36,8 +36,8 @@ void ReluOpConverter<TargetT>::operator()(
this->engine_->AddOpAttr(op_name, "alpha", 0);
}
template <typename TargetT>
void LeakyReluOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void LeakyReluOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -58,10 +58,35 @@ void LeakyReluOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(relu, ReluOpConverter<::anakin::saber::NV>);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(leaky_relu,
LeakyReluOpConverter<::anakin::saber::NV>);
using relu_nv_fp32 =
::paddle::inference::anakin::ReluOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using leaky_nv_fp32 = ::paddle::inference::anakin::LeakyReluOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using relu_nv_int8 =
::paddle::inference::anakin::ReluOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
using leaky_nv_int8 = ::paddle::inference::anakin::LeakyReluOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(relu, relu_nv_fp32);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(leaky_relu, leaky_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(relu, relu_nv_int8);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(leaky_relu, leaky_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(relu, ReluOpConverter<::anakin::saber::X86>);
REGISTER_CPU_ANAKIN_OP_CONVERTER(leaky_relu,
LeakyReluOpConverter<::anakin::saber::X86>);
using relu_cpu_fp32 =
::paddle::inference::anakin::ReluOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using leaky_cpu_fp32 = ::paddle::inference::anakin::LeakyReluOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using relu_cpu_int8 =
::paddle::inference::anakin::ReluOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
using leaky_cpu_int8 = ::paddle::inference::anakin::LeakyReluOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(relu, relu_cpu_fp32);
REGISTER_CPU_ANAKIN_OP_CONVERTER(leaky_relu, leaky_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(relu, relu_cpu_int8);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(leaky_relu, leaky_cpu_int8);
......@@ -22,8 +22,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class ReluOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class ReluOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
ReluOpConverter() = default;
......@@ -34,8 +34,8 @@ class ReluOpConverter : public AnakinOpConverter<TargetT> {
virtual ~ReluOpConverter() {}
};
template <typename TargetT>
class LeakyReluOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class LeakyReluOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
LeakyReluOpConverter() = default;
......
......@@ -21,8 +21,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void ReshapeOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void ReshapeOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -47,9 +47,21 @@ void ReshapeOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(reshape,
ReshapeOpConverter<::anakin::saber::NV>);
using reshape_nv_fp32 =
::paddle::inference::anakin::ReshapeOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using reshape_nv_int8 =
::paddle::inference::anakin::ReshapeOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(reshape, reshape_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(reshape, reshape_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(reshape,
ReshapeOpConverter<::anakin::saber::X86>);
using reshape_cpu_fp32 =
::paddle::inference::anakin::ReshapeOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using reshape_cpu_int8 =
::paddle::inference::anakin::ReshapeOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(reshape, reshape_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(reshape, reshape_cpu_int8);
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class ReshapeOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class ReshapeOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
ReshapeOpConverter() = default;
......
......@@ -16,17 +16,12 @@
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void RoiAlignOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void RoiAlignOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -57,8 +52,21 @@ void RoiAlignOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(roi_align,
RoiAlignOpConverter<::anakin::saber::NV>);
using roi_align_nv_fp32 =
::paddle::inference::anakin::RoiAlignOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using roi_align_nv_int8 =
::paddle::inference::anakin::RoiAlignOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(roi_align, roi_align_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(roi_align, roi_align_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(roi_align,
RoiAlignOpConverter<::anakin::saber::X86>);
using roi_align_cpu_fp32 =
::paddle::inference::anakin::RoiAlignOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using roi_align_cpu_int8 =
::paddle::inference::anakin::RoiAlignOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(roi_align, roi_align_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(roi_align, roi_align_cpu_int8);
......@@ -22,8 +22,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class RoiAlignOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class RoiAlignOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
RoiAlignOpConverter() = default;
......
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void ScaleOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void ScaleOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -49,4 +49,22 @@ void ScaleOpConverter<TargetT>::operator()(
} // namespace inference
} // namespace paddle
REGISTER_CUDA_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter<::anakin::saber::NV>);
#ifdef PADDLE_WITH_CUDA
using scale_nv_fp32 =
::paddle::inference::anakin::ScaleOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using scale_nv_int8 =
::paddle::inference::anakin::ScaleOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(scale, scale_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(scale, scale_nv_int8);
#endif
using scale_cpu_fp32 =
::paddle::inference::anakin::ScaleOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using scale_cpu_int8 =
::paddle::inference::anakin::ScaleOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(scale, scale_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(scale, scale_cpu_int8);
......@@ -22,8 +22,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class ScaleOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class ScaleOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
ScaleOpConverter() = default;
......
......@@ -18,8 +18,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void SoftMaxOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void SoftMaxOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -45,9 +45,22 @@ void SoftMaxOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(softmax,
SoftMaxOpConverter<::anakin::saber::NV>);
using sm_nv_fp32 =
::paddle::inference::anakin::SoftMaxOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using sm_nv_int8 =
::paddle::inference::anakin::SoftMaxOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(softmax, sm_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(softmax, sm_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(softmax,
SoftMaxOpConverter<::anakin::saber::X86>);
using sm_cpu_fp32 =
::paddle::inference::anakin::SoftMaxOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using sm_cpu_int8 =
::paddle::inference::anakin::SoftMaxOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(softmax, sm_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(softmax, sm_cpu_int8);
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class SoftMaxOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class SoftMaxOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
SoftMaxOpConverter() = default;
......
......@@ -22,8 +22,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void SplitOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void SplitOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -56,7 +56,22 @@ void SplitOpConverter<TargetT>::operator()(
} // namespace inference
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(split, SplitOpConverter<::anakin::saber::NV>);
using split_nv_fp32 =
::paddle::inference::anakin::SplitOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using split_nv_int8 =
::paddle::inference::anakin::SplitOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(split, split_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(split, split_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(split, SplitOpConverter<::anakin::saber::X86>);
using split_cpu_fp32 =
::paddle::inference::anakin::SplitOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using split_cpu_int8 =
::paddle::inference::anakin::SplitOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(split, split_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(split, split_cpu_int8);
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class SplitOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class SplitOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
SplitOpConverter() = default;
......
......@@ -23,11 +23,10 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void SumOpConverter<TargetT>::operator()(const framework::proto::OpDesc &op,
const framework::BlockDesc &block_desc,
const framework::Scope &scope,
bool test_mode) {
template <typename TargetT, ::anakin::Precision PrecisionT>
void SumOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
......@@ -49,6 +48,21 @@ void SumOpConverter<TargetT>::operator()(const framework::proto::OpDesc &op,
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(sum, SumOpConverter<::anakin::saber::NV>);
using sum_nv_fp32 =
::paddle::inference::anakin::SumOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using sum_nv_int8 =
::paddle::inference::anakin::SumOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(sum, sum_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(sum, sum_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(sum, SumOpConverter<::anakin::saber::X86>);
using sum_cpu_fp32 =
::paddle::inference::anakin::SumOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using sum_cpu_int8 =
::paddle::inference::anakin::SumOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(sum, sum_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(sum, sum_cpu_int8);
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class SumOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class SumOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
SumOpConverter() = default;
......
......@@ -27,8 +27,8 @@ static void test_activation_op(const std::string& op_type,
bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("act-X", {10, 6, 1, 1});
validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
framework::OpDesc desc;
......@@ -57,6 +57,7 @@ TEST(tanh_op, gpu) {
}
#endif
/*
TEST(sigm_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
......@@ -68,6 +69,7 @@ TEST(tanh_op, cpu) {
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("tanh", ctx, false);
}
*/
} // namespace anakin
} // namespace inference
......
......@@ -28,8 +28,8 @@ void test_affine_channel_op(const platform::DeviceContext& context,
std::unordered_set<std::string> parameters({"scale", "bias"});
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("x", {1, 3, 5, 2});
validator.DeclOutputVar("out", {1, 3, 5, 2});
validator.DeclParamVar("scale", {3});
......
......@@ -25,8 +25,8 @@ void test_batchnorm_op(const platform::DeviceContext& context, bool use_gpu) {
{"batch_norm_scale", "batch_norm_bias", "batch_norm_mean",
"batch_norm_variance"});
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
std::vector<int> param_shape{2};
validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5});
......
......@@ -25,8 +25,8 @@ template <typename TargetT>
void test_concat_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("concat_x1", {1, 2, 1, 1});
validator.DeclInputVar("concat_x2", {1, 3, 1, 1});
validator.DeclInputVar("concat_x3", {1, 1, 1, 1});
......
......@@ -25,8 +25,8 @@ template <typename TargetT>
void test_conv2d_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters({"conv2d-Y"});
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("conv2d-X", {1, 3, 3, 3});
validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1});
validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3});
......
......@@ -25,8 +25,8 @@ template <typename TargetT>
void test_dropout_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("x", {1, 1, 2, 2});
validator.DeclOutputVar("out", {1, 1, 2, 2});
validator.DeclOutputVar("mask", {1, 1, 2, 2});
......
......@@ -27,8 +27,8 @@ static void test_elementwise_op(const std::string& op_type,
bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("x", {1, 1, 2, 2});
validator.DeclInputVar("y", {1, 1, 2, 2});
validator.DeclOutputVar("out", {1, 1, 2, 2});
......
......@@ -25,8 +25,8 @@ void test_mul_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters({"mul_y"});
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("mul_x", {1, 1, 2, 2});
validator.DeclParamVar("mul_y", {4, 2});
validator.DeclOutputVar("mul_out", {1, 2});
......
......@@ -24,8 +24,8 @@ template <typename TargetT>
void test_flatten_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("flatten-X", {3, 10, 10, 4});
validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1});
framework::OpDesc desc;
......
......@@ -25,8 +25,8 @@ void test_pool2d(const platform::DeviceContext& context, bool use_gpu,
std::string pool_type = "max") {
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
// The ITensor's Dims should not contain the batch size.
// So, the ITensor's Dims of input and output should be C * H * W.
......
......@@ -27,8 +27,8 @@ static void test_activation_op(const std::string& op_type,
bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("act-X", {10, 6, 1, 1});
validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
framework::OpDesc desc;
......@@ -60,20 +60,6 @@ TEST(leaky_relu_op, gpu) {
}
#endif
/* seems bug here
TEST(relu_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("relu", ctx, false);
}
TEST(leaky_relu_op, cpu) {
platform::CPUPlace cpu_place;
platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("leaky_relu", ctx, false);
}
*/
} // namespace anakin
} // namespace inference
} // namespace paddle
......
......@@ -24,8 +24,8 @@ template <typename TargetT>
void test_reshape1_op(const platform::DeviceContext& context, bool use_gpu) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
// validator.DeclInputVar("reshape-X", {2, 3, 3, 1});
// validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3});
......@@ -49,8 +49,8 @@ template <typename TargetT>
void test_reshape2_op(const platform::DeviceContext& context, bool use_gpu) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("reshape-X", {1, 2, 4});
validator.DeclOutputVar("reshape-Out", {1, 4, 2});
......
......@@ -24,8 +24,8 @@ template <typename TargetT>
void test_softmax_op(const platform::DeviceContext& context, bool use_gpu) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("softmax-X", {1, 10, 2});
validator.DeclOutputVar("softmax-Out", {1, 10, 2});
......
......@@ -27,8 +27,8 @@ void AnakinSliceTest(const platform::DeviceContext &context, bool use_gpu,
const std::vector<int> &sections) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("split_input", in_shape);
std::vector<std::string> output_vars;
......
......@@ -26,8 +26,8 @@ template <typename TargetT>
static void test_sum_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("sum_x1", {1, 2, 1, 2});
validator.DeclInputVar("sum_x2", {1, 2, 1, 2});
validator.DeclOutputVar("sum_out", {1, 2, 1, 2});
......
......@@ -24,8 +24,8 @@ template <typename TargetT>
void test_transpose1_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("transpose-X", {2, 3, 4, 5});
validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3});
......@@ -47,8 +47,8 @@ template <typename TargetT>
void test_transpose2_op(const platform::DeviceContext& context, bool use_gpu) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation<TargetT> validator(parameters, &scope, context,
use_gpu);
AnakinConvertValidation<TargetT, ::anakin::Precision::FP32> validator(
parameters, &scope, context, use_gpu);
validator.DeclInputVar("transpose-X", {3, 4, 5});
validator.DeclOutputVar("transpose-Out", {3, 5, 4});
......
......@@ -23,8 +23,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
void TransposeOpConverter<TargetT>::operator()(
template <typename TargetT, ::anakin::Precision PrecisionT>
void TransposeOpConverter<TargetT, PrecisionT>::operator()(
const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
......@@ -50,9 +50,17 @@ void TransposeOpConverter<TargetT>::operator()(
} // namespace paddle
#ifdef PADDLE_WITH_CUDA
REGISTER_CUDA_ANAKIN_OP_CONVERTER(transpose,
TransposeOpConverter<::anakin::saber::NV>);
using transpose_nv_fp32 = ::paddle::inference::anakin::TransposeOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using transpose_nv_int8 = ::paddle::inference::anakin::TransposeOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(transpose, transpose_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(transpose, transpose_nv_int8);
#endif
REGISTER_CPU_ANAKIN_OP_CONVERTER(transpose,
TransposeOpConverter<::anakin::saber::X86>);
using transpose_cpu_fp32 = ::paddle::inference::anakin::TransposeOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using transpose_cpu_int8 = ::paddle::inference::anakin::TransposeOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(transpose, transpose_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(transpose, transpose_cpu_int8);
......@@ -20,8 +20,8 @@ namespace paddle {
namespace inference {
namespace anakin {
template <typename TargetT>
class TransposeOpConverter : public AnakinOpConverter<TargetT> {
template <typename TargetT, ::anakin::Precision PrecisionT>
class TransposeOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
public:
TransposeOpConverter() = default;
......
......@@ -61,7 +61,7 @@ void RandomizeTensor(framework::LoDTensor* tensor,
auto* temp_data = temp_tensor.mutable_data<float>(cpu_place);
for (size_t i = 0; i < num_elements; i++) {
*(temp_data + i) = random(-128., 128.);
*(temp_data + i) = random(0., 1.);
}
TensorCopySync(temp_tensor, place, tensor);
......@@ -72,9 +72,9 @@ void RandomizeTensor(framework::LoDTensor* tensor,
* anakin
* layer.
*/
template <typename TargetT>
template <typename TargetT, ::anakin::Precision PrecisionT>
class AnakinConvertValidation {
using AnakinNvEngineT = AnakinEngine<TargetT, Precision::FP32>;
using AnakinNvEngineT = AnakinEngine<TargetT, PrecisionT>;
public:
AnakinConvertValidation() = delete;
......@@ -84,7 +84,7 @@ class AnakinConvertValidation {
const platform::DeviceContext& ctx,
bool use_gpu = true)
: parameters_(parameters), scope_(scope), ctx_(ctx), use_gpu_(use_gpu) {
engine_.reset(new AnakinEngine<TargetT, Precision::FP32>(true));
engine_.reset(new AnakinEngine<TargetT, PrecisionT>(true));
}
// Declare a Variable as input with random initialization.
......@@ -127,7 +127,7 @@ class AnakinConvertValidation {
// should init anakin engine here.
auto& block_desc = program_desc_.Block(framework::kRootBlockIndex);
Singleton<AnakinOpConverter<TargetT>>::Global().ConvertOp(
Singleton<AnakinOpConverter<TargetT, PrecisionT>>::Global().ConvertOp(
desc, block_desc, parameters_, *scope_, engine_.get(),
true /*test_mode*/);
engine_->Freeze();
......@@ -213,8 +213,15 @@ class AnakinConvertValidation {
bool use_gpu_{true};
};
template class AnakinConvertValidation<::anakin::saber::NV>;
template class AnakinConvertValidation<::anakin::saber::X86>;
template class AnakinConvertValidation<::anakin::saber::NV,
::anakin::Precision::FP32>;
template class AnakinConvertValidation<::anakin::saber::X86,
::anakin::Precision::FP32>;
template class AnakinConvertValidation<::anakin::saber::NV,
::anakin::Precision::INT8>;
template class AnakinConvertValidation<::anakin::saber::X86,
::anakin::Precision::INT8>;
} // namespace anakin
} // namespace inference
} // namespace paddle
......@@ -172,11 +172,20 @@ AnakinEngine<TargetT, PrecisionType, RunType>::Clone() {
#ifdef PADDLE_WITH_CUDA
template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>;
template class AnakinEngineManager<::anakin::saber::NV>;
template class AnakinEngineManager<::anakin::saber::NV,
::anakin::Precision::FP32>;
template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::INT8>;
template class AnakinEngineManager<::anakin::saber::NV,
::anakin::Precision::INT8>;
#endif
template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>;
template class AnakinEngineManager<::anakin::saber::X86>;
template class AnakinEngineManager<::anakin::saber::X86,
::anakin::Precision::FP32>;
template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::INT8>;
template class AnakinEngineManager<::anakin::saber::X86,
::anakin::Precision::INT8>;
// template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>;
} // namespace anakin
......
......@@ -93,6 +93,12 @@ class AnakinEngine {
void Save(std::string path) { graph_->save(path); }
bool IsInit() { return initialized_; }
int GetDevice() { return device_; }
void AddTensorScale(const std::string &tensor_name, float scale) {
tensor_scales_[tensor_name] = scale;
}
std::unordered_map<std::string, float> GetTensorScales() {
return tensor_scales_;
}
void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
const std::map<std::string, framework::LoDTensor *> &outputs);
#ifdef PADDLE_WITH_CUDA
......@@ -112,11 +118,12 @@ class AnakinEngine {
std::unique_ptr<GraphT> graph_;
std::unique_ptr<NetT> net_;
std::vector<std::string> program_inputs_;
std::unordered_map<std::string, float> tensor_scales_;
};
template <typename TargetT>
template <typename TargetT, ::anakin::Precision PrecisionType>
class AnakinEngineManager {
using AnakinEngineT = AnakinEngine<TargetT, Precision::FP32>;
using AnakinEngineT = AnakinEngine<TargetT, PrecisionType>;
public:
bool HasEngine(const std::string &name) const {
......@@ -132,7 +139,7 @@ class AnakinEngineManager {
std::vector<std::string> program_inputs,
std::string engine_name) {
std::unique_lock<std::mutex> lk(mut_);
auto *p = new AnakinEngine<TargetT, Precision::FP32>(
auto *p = new AnakinEngine<TargetT, PrecisionType>(
need_summary, device, max_batch_size, max_input_shape, program_inputs);
engines_[engine_name].reset(p);
return p;
......
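To illustrate the new per-tensor scale plumbing (the tensor name and scale value below are made up): an INT8-aware converter records a scale on the engine, and after Freeze() the converter framework copies every recorded scale into the Anakin graph, as the op_converter.h hunk earlier in this diff shows.
// Hypothetical converter body: remember the output scale produced by the quant op.
this->engine_->AddTensorScale("conv2d_0.tmp_0", 0.017f);
// Done once for the whole engine after Freeze(); this loop is taken from op_converter.h above:
auto tensor_scales = engine->GetTensorScales();
for (auto &item : tensor_scales) {
  engine->Graph()->SetVarScale(item.first, item.second);
}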
......@@ -169,7 +169,13 @@ struct Argument {
anakin_max_shape_t);
DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int);
DECL_ARGUMENT_FIELD(anakin_min_subgraph_size, AnakinMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(anakin_precision_mode, AnakinPrecisionMode,
AnalysisConfig::Precision);
DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);
DECL_ARGUMENT_FIELD(anakin_passes_filter, AnakinPassesFilter,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(anakin_ops_filter, AnakinOpsFilter,
std::vector<std::string>);
// Memory optimized related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
......
......@@ -123,6 +123,11 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
argument->anakin_max_input_shape()));
pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
bool enable_int8 =
argument->anakin_precision_mode() == AnalysisConfig::Precision::kInt8;
pass->Set("enable_int8", new bool(enable_int8));
pass->Set("anakin_ops_filter",
new std::vector<std::string>(argument->anakin_ops_filter()));
}
pre_pass = pass_name;
......
......@@ -39,8 +39,14 @@ void analysis::AnakinSubgraphPass::ApplyImpl(
framework::ir::Graph *graph) const {
framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph);
auto teller = [](const framework::ir::Node *node) {
if (!node->IsOp() || !node->Op()) return false;
auto &anakin_ops_filter = Get<std::vector<std::string>>("anakin_ops_filter");
auto teller = [&anakin_ops_filter](const framework::ir::Node *node) {
if (!node->IsOp() || !node->Op())
return false;
else if (std::find(anakin_ops_filter.begin(), anakin_ops_filter.end(),
node->Op()->Type()) != anakin_ops_filter.end())
return false;
return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
};
......@@ -191,47 +197,71 @@ void AnakinSubgraphPass::CreateAnakinOp(
SetAttr(op_desc->Proto(), "engine_key", engine_key);
auto max_input_shape =
Get<std::map<std::string, std::vector<int>>>("max_input_shape");
auto max_batch_size = Get<int>("max_batch_size");
auto program_inputs = program_desc->GetFeedTargetNames();
bool use_gpu = Get<bool>("use_gpu");
SetAttr(op_desc->Proto(), "use_gpu", use_gpu);
bool enable_int8 = Get<bool>("enable_int8");
SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
if (enable_int8) {
CreateAnakinEngine<::anakin::Precision::INT8>(&block_desc, params,
input_names, output_mapping,
program_inputs, engine_key);
} else {
CreateAnakinEngine<::anakin::Precision::FP32>(&block_desc, params,
input_names, output_mapping,
program_inputs, engine_key);
}
}
template <::anakin::Precision PrecisionT>
void AnakinSubgraphPass::CreateAnakinEngine(
framework::BlockDesc *block_desc, const std::vector<std::string> &params,
const std::set<std::string> &input_names,
const std::vector<std::string> &output_mapping,
const std::vector<std::string> &program_inputs,
const std::string &engine_key) const {
framework::BlockDesc block_desc_temp(nullptr, block_desc->Proto());
bool use_gpu = Get<bool>("use_gpu");
auto max_batch_size = Get<int>("max_batch_size");
auto max_input_shape =
Get<std::map<std::string, std::vector<int>>>("max_input_shape");
if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
inference::Singleton<
anakin::AnakinEngineManager<::anakin::saber::NV>>::Global()
anakin::AnakinEngineManager<::anakin::saber::NV, PrecisionT>>::Global()
.Create(true, Get<int>("gpu_device_id"), max_batch_size,
max_input_shape, program_inputs, engine_key);
#endif
} else {
inference::Singleton<
anakin::AnakinEngineManager<::anakin::saber::X86>>::Global()
anakin::AnakinEngineManager<::anakin::saber::X86, PrecisionT>>::Global()
.Create(true, Get<int>("gpu_device_id"), max_batch_size,
max_input_shape, program_inputs, engine_key);
}
auto *scope = param_scope();
std::unordered_set<std::string> param_set(params.begin(), params.end());
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
auto *anakin_engine =
inference::Singleton<inference::anakin::AnakinEngineManager<
::anakin::saber::NV>>::Global()
::anakin::saber::NV, PrecisionT>>::Global()
.Get(engine_key);
inference::Singleton<
inference::anakin::AnakinOpConverter<::anakin::saber::NV>>::Global()
inference::Singleton<inference::anakin::AnakinOpConverter<
::anakin::saber::NV, PrecisionT>>::Global()
.ConvertBlockToAnakinEngine(
&block_desc_temp, scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, anakin_engine);
#endif
} else {
auto *anakin_engine =
inference::Singleton<inference::anakin::AnakinEngineManager<
::anakin::saber::X86>>::Global()
::anakin::saber::X86, PrecisionT>>::Global()
.Get(engine_key);
inference::Singleton<
inference::anakin::AnakinOpConverter<::anakin::saber::X86>>::Global()
inference::Singleton<inference::anakin::AnakinOpConverter<
::anakin::saber::X86, PrecisionT>>::Global()
.ConvertBlockToAnakinEngine(
&block_desc_temp, scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
......
......@@ -15,6 +15,7 @@
#pragma once
#include <paddle/fluid/framework/ir/fuse_pass_base.h>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass.h"
......@@ -36,6 +37,13 @@ class AnakinSubgraphPass : public framework::ir::FusePassBase {
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const;
void CleanIntermediateOutputs(framework::ir::Node *node);
template <::anakin::Precision PrecisionT>
void CreateAnakinEngine(framework::BlockDesc *block_desc,
const std::vector<std::string> &params,
const std::set<std::string> &input_names,
const std::vector<std::string> &output_mapping,
const std::vector<std::string> &program_inputs,
const std::string &engine_key) const;
};
} // namespace analysis
......
......@@ -116,6 +116,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(anakin_max_batchsize_);
CP_MEMBER(anakin_max_input_shape_);
CP_MEMBER(anakin_min_subgraph_size_);
CP_MEMBER(anakin_precision_mode_);
CP_MEMBER(anakin_passes_filter_);
CP_MEMBER(anakin_ops_filter_);
// Ir related.
CP_MEMBER(enable_ir_optim_);
......@@ -276,7 +279,10 @@ void AnalysisConfig::Update() {
pass_builder()->ClearPasses();
for (const auto &pass : kAnakinSubgraphPasses) {
pass_builder()->AppendPass(pass);
if (std::find(anakin_passes_filter_.begin(), anakin_passes_filter_.end(),
pass) == anakin_passes_filter_.end()) {
pass_builder()->AppendPass(pass);
}
}
}
......@@ -391,11 +397,16 @@ void AnalysisConfig::SwitchIrDebug(int x) {
}
void AnalysisConfig::EnableAnakinEngine(
int max_batch_size, std::map<std::string, std::vector<int>> max_input_shape,
int min_subgraph_size) {
int min_subgraph_size, AnalysisConfig::Precision precision_mode,
std::vector<std::string> passes_filter,
std::vector<std::string> ops_filter) {
anakin_max_batchsize_ = max_batch_size;
anakin_max_input_shape_ = max_input_shape;
anakin_min_subgraph_size_ = min_subgraph_size;
anakin_passes_filter_ = passes_filter;
anakin_ops_filter_ = ops_filter;
use_anakin_ = true;
anakin_precision_mode_ = precision_mode;
Update();
}
} // namespace paddle
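For example, a caller that wants the Anakin INT8 path could configure the predictor roughly as below; the model path, input shape and filtered op name are placeholders, and CreatePaddlePredictor is the existing analysis-predictor factory rather than something added by this commit.
AnalysisConfig config;
config.SetModel("/path/to/model");  // placeholder model directory
config.EnableAnakinEngine(
    /*max_batch_size=*/1,
    /*max_input_shape=*/{{"image", {1, 3, 224, 224}}},  // placeholder shape
    /*min_subgraph_size=*/6,
    /*precision_mode=*/AnalysisConfig::Precision::kInt8,
    /*passes_filter=*/{},                 // keep every pass in kAnakinSubgraphPasses
    /*ops_filter=*/{"im2sequence"});      // example: keep this op out of the Anakin subgraph
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);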
......@@ -386,6 +386,9 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_);
argument_.SetAnakinMinSubgraphSize(config_.anakin_min_subgraph_size_);
argument_.SetAnakinPrecisionMode(config_.anakin_precision_mode_);
argument_.SetAnakinPassesFilter(config_.anakin_passes_filter_);
argument_.SetAnakinOpsFilter(config_.anakin_ops_filter_);
LOG(INFO) << "Anakin subgraph engine is enabled";
}
......
......@@ -152,7 +152,9 @@ struct AnalysisConfig {
void EnableAnakinEngine(
int max_batch_size = 1,
std::map<std::string, std::vector<int>> max_input_shape = {},
int min_subgraph_size = 6);
int min_subgraph_size = 6, Precision precision = Precision::kFloat32,
std::vector<std::string> passes_filter = {},
std::vector<std::string> ops_filter = {});
/** A boolean state indicating whether the Anakin sub-graph engine is used.
*/
......@@ -291,6 +293,9 @@ struct AnalysisConfig {
int anakin_max_batchsize_;
int anakin_min_subgraph_size_{6};
std::map<std::string, std::vector<int>> anakin_max_input_shape_;
Precision anakin_precision_mode_;
std::vector<std::string> anakin_passes_filter_;
std::vector<std::string> anakin_ops_filter_;
std::map<std::string, std::string> engine_opt_info_;
bool use_mkldnn_quantizer_{false};
......
......@@ -73,15 +73,21 @@ void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
// The following passes works for Anakin sub-graph engine.
const std::vector<std::string> kAnakinSubgraphPasses({
"infer_clean_graph_pass", //
"graph_viz_pass", //
"quant_conv2d_dequant_fuse_pass", //
"graph_viz_pass", //
"simplify_anakin_priorbox_detection_out_pass", //
"fillconstant_elementwisemul_fuse", //
"fc_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"fc_gru_fuse_pass", //
"quant_conv2d_dequant_fuse_pass", //
"anakin_subgraph_pass",
// "conv_bn_fuse_pass", //
// "conv_elementwise_add_fuse_pass", //
"fc_gru_fuse_pass", //
"graph_viz_pass", //
"anakin_subgraph_pass", //
"graph_viz_pass", //
"fc_gru_fuse_pass", //
"graph_viz_pass", //
});
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
......
......@@ -44,6 +44,7 @@ class AnakinEngineOp : public framework::OperatorBase {
std::string engine_key_;
std::string engine_serialized_data_;
bool use_gpu_;
bool enable_int8_;
public:
AnakinEngineOp(const std::string &type,
......@@ -55,6 +56,7 @@ class AnakinEngineOp : public framework::OperatorBase {
engine_key_ = Attr<std::string>("engine_key");
auto params = Attr<std::vector<std::string>>("parameters");
use_gpu_ = Attr<bool>("use_gpu");
enable_int8_ = Attr<bool>("enable_int8");
for (const auto &param : params) {
param_names_.insert(param);
}
......@@ -68,11 +70,6 @@ class AnakinEngineOp : public framework::OperatorBase {
void RunAnakin(const framework::Scope &scope,
const platform::Place &dev_place) const {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
std::vector<std::string> output_maps =
......@@ -96,18 +93,35 @@ class AnakinEngineOp : public framework::OperatorBase {
outputs.insert({output_maps[output_index], fluid_t});
output_index += 1;
}
if (enable_int8_) {
Execute<::anakin::Precision::INT8>(inputs, outputs, dev_place);
} else {
Execute<::anakin::Precision::FP32>(inputs, outputs, dev_place);
}
}
template <::anakin::Precision PrecisionT>
void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
const std::map<std::string, framework::LoDTensor *> &outputs,
const platform::Place &dev_place) const {
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx)
.stream();
auto *engine =
inference::Singleton<inference::anakin::AnakinEngineManager<
::anakin::saber::NV>>::Global()
::anakin::saber::NV, PrecisionT>>::Global()
.Get(engine_key_);
engine->Execute(inputs, outputs, stream);
#endif
} else {
auto *engine =
inference::Singleton<inference::anakin::AnakinEngineManager<
::anakin::saber::X86>>::Global()
::anakin::saber::X86, PrecisionT>>::Global()
.Get(engine_key_);
engine->Execute(inputs, outputs);
}
......
......@@ -16,6 +16,7 @@
#include <pybind11/stl.h>
#include <cstring>
#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/analysis_predictor.h"
......@@ -230,8 +231,13 @@ void BindAnalysisConfig(py::module *m) {
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
py::arg("use_static") = true)
.def("enable_anakin_engine", &AnalysisConfig::EnableAnakinEngine,
py::arg("max_batch_size") = 1, py::arg("max_input_shape") = {},
py::arg("min_subgraph_size") = 6)
py::arg("max_batch_size") = 1,
py::arg("max_input_shape") =
std::map<std::string, std::vector<int>>(),
py::arg("min_subgraph_size") = 6,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
py::arg("passes_filter") = std::vector<std::string>(),
py::arg("ops_filter") = std::vector<std::string>())
.def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
.def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
py::arg("x") = true)
......