未验证 提交 cf5af3b5 编写于 作者: 石晓伟 提交者: GitHub

Merge pull request #16847 from NHZlX/cherry_pick_anakin_to_1.4_2

Cherry-pick from 16837 & 16846 to 1.4
...@@ -43,47 +43,22 @@ void ActivationOpConverter<TargetT, PrecisionT>::operator()( ...@@ -43,47 +43,22 @@ void ActivationOpConverter<TargetT, PrecisionT>::operator()(
auto output_name = op_desc.Output("Out").front(); auto output_name = op_desc.Output("Out").front();
this->engine_->AddOp(op_name, "Activation", {input_name}, {output_name}); this->engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
this->engine_->AddOpAttr(op_name, "type", anakin_op_type_); this->engine_->AddOpAttr(op_name, "type", anakin_op_type_);
if (op_type_ == "swish") {
float beta = boost::get<float>(op_desc.GetAttr("beta"));
this->engine_->AddOpAttr(op_name, "clip_relu_num", beta);
}
if (op_type_ == "relu6") {
float threshold = boost::get<float>(op_desc.GetAttr("threshold"));
this->engine_->AddOpAttr(op_name, "clip_relu_num", threshold);
}
} }
} // namespace anakin } // namespace anakin
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter);
using sigmoid_nv_fp32 = REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter);
::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::NV, REGISTER_ANAKIN_OP_CONVERTER(swish, SwishOpConverter);
::anakin::Precision::FP32>; REGISTER_ANAKIN_OP_CONVERTER(relu6, Relu6OpConverter);
using sigmoid_nv_int8 =
::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
using tanh_nv_fp32 =
::paddle::inference::anakin::TanhOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using tanh_nv_int8 =
::paddle::inference::anakin::TanhOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_nv_int8);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(tanh, tanh_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(tanh, tanh_nv_int8);
#endif
using sigmoid_cpu_fp32 =
::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using sigmoid_cpu_int8 =
::paddle::inference::anakin::SigmoidOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
using tanh_cpu_fp32 =
::paddle::inference::anakin::TanhOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using tanh_cpu_int8 =
::paddle::inference::anakin::TanhOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(sigmoid, sigmoid_cpu_int8);
REGISTER_CPU_ANAKIN_OP_CONVERTER(tanh, tanh_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(tanh, tanh_cpu_int8);
...@@ -37,7 +37,9 @@ class ActivationOpConverter : public AnakinOpConverter<TargetT, PrecisionT> { ...@@ -37,7 +37,9 @@ class ActivationOpConverter : public AnakinOpConverter<TargetT, PrecisionT> {
std::string op_type_; std::string op_type_;
std::string anakin_op_type_; std::string anakin_op_type_;
std::map<std::string, std::string> anakin_op_types_{{"tanh", "TanH"}, std::map<std::string, std::string> anakin_op_types_{{"tanh", "TanH"},
{"sigmoid", "Sigmoid"}}; {"sigmoid", "Sigmoid"},
{"relu6", "ClippedRelu"},
{"swish", "Swish"}};
}; };
template <typename TargetT, ::anakin::Precision PrecisionT> template <typename TargetT, ::anakin::Precision PrecisionT>
...@@ -52,6 +54,19 @@ class SigmoidOpConverter : public ActivationOpConverter<TargetT, PrecisionT> { ...@@ -52,6 +54,19 @@ class SigmoidOpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
SigmoidOpConverter() SigmoidOpConverter()
: ActivationOpConverter<TargetT, PrecisionT>("sigmoid") {} : ActivationOpConverter<TargetT, PrecisionT>("sigmoid") {}
}; };
// Converter for the Paddle `relu6` op.
// It adds no logic of its own: it only hands the op type string "relu6" to
// ActivationOpConverter, which looks the type up in its op-type table
// ("relu6" -> "ClippedRelu") and performs the actual conversion.
template <typename TargetT, ::anakin::Precision PrecisionT>
class Relu6OpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
  // Shorthand for the templated base so the constructor stays readable.
  using ActivationBase = ActivationOpConverter<TargetT, PrecisionT>;

 public:
  Relu6OpConverter() : ActivationBase("relu6") {}
};
// Converter for the Paddle `swish` op.
// It adds no logic of its own: it only hands the op type string "swish" to
// ActivationOpConverter, which looks the type up in its op-type table
// ("swish" -> "Swish") and performs the actual conversion.
template <typename TargetT, ::anakin::Precision PrecisionT>
class SwishOpConverter : public ActivationOpConverter<TargetT, PrecisionT> {
  // Shorthand for the templated base so the constructor stays readable.
  using ActivationBase = ActivationOpConverter<TargetT, PrecisionT>;

 public:
  SwishOpConverter() : ActivationBase("swish") {}
};
} // namespace anakin } // namespace anakin
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -52,22 +52,4 @@ void AffineChannelOpConverter<TargetT, PrecisionT>::operator()( ...@@ -52,22 +52,4 @@ void AffineChannelOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(affine_channel, AffineChannelOpConverter);
using affine_channel_nv_fp32 =
::paddle::inference::anakin::AffineChannelOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using affine_channel_nv_int8 =
::paddle::inference::anakin::AffineChannelOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_nv_int8);
#endif
using affine_channel_cpu_fp32 =
::paddle::inference::anakin::AffineChannelOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using affine_channel_cpu_int8 =
::paddle::inference::anakin::AffineChannelOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(affine_channel, affine_channel_cpu_int8);
...@@ -82,18 +82,4 @@ void BatchNormOpConverter<TargetT, PrecisionT>::operator()( ...@@ -82,18 +82,4 @@ void BatchNormOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(batch_norm, BatchNormOpConverter);
using bn_nv_fp32 = ::paddle::inference::anakin::BatchNormOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using bn_nv_int8 = ::paddle::inference::anakin::BatchNormOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(batch_norm, bn_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(batch_norm, bn_nv_int8);
#endif
using bn_cpu_fp32 = ::paddle::inference::anakin::BatchNormOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using bn_cpu_int8 = ::paddle::inference::anakin::BatchNormOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(batch_norm, bn_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(batch_norm, bn_cpu_int8);
...@@ -38,22 +38,4 @@ void ConcatOpConverter<TargetT, PrecisionT>::operator()( ...@@ -38,22 +38,4 @@ void ConcatOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(concat, ConcatOpConverter);
using concat_nv_fp32 =
::paddle::inference::anakin::ConcatOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using concat_nv_int8 =
::paddle::inference::anakin::ConcatOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(concat, concat_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(concat, concat_nv_int8);
#endif
using concat_cpu_fp32 =
::paddle::inference::anakin::ConcatOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using concat_cpu_int8 =
::paddle::inference::anakin::ConcatOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(concat, concat_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(concat, concat_cpu_int8);
...@@ -105,22 +105,4 @@ void Conv2dOpConverter<TargetT, PrecisionT>::operator()( ...@@ -105,22 +105,4 @@ void Conv2dOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(conv2d, Conv2dOpConverter);
using conv2d_nv_fp32 =
::paddle::inference::anakin::Conv2dOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using conv2d_nv_int8 =
::paddle::inference::anakin::Conv2dOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d, conv2d_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(conv2d, conv2d_nv_int8);
#endif
using conv2d_cpu_fp32 =
::paddle::inference::anakin::Conv2dOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using conv2d_cpu_int8 =
::paddle::inference::anakin::Conv2dOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d, conv2d_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(conv2d, conv2d_cpu_int8);
...@@ -111,22 +111,4 @@ void Conv2dFusionOpConverter<TargetT, PrecisionT>::operator()( ...@@ -111,22 +111,4 @@ void Conv2dFusionOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(conv2d_fusion, Conv2dFusionOpConverter);
using conv2d_fusion_nv_fp32 =
::paddle::inference::anakin::Conv2dFusionOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using conv2d_fusion_nv_int8 =
::paddle::inference::anakin::Conv2dFusionOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(conv2d_fusion, conv2d_fusion_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(conv2d_fusion, conv2d_fusion_nv_int8);
#endif
using conv2d_fusion_cpu_fp32 =
::paddle::inference::anakin::Conv2dFusionOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using conv2d_fusion_cpu_int8 =
::paddle::inference::anakin::Conv2dFusionOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(conv2d_fusion, conv2d_fusion_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(conv2d_fusion, conv2d_fusion_cpu_int8);
...@@ -108,25 +108,5 @@ void DensityPriorBoxOpConverter<TargetT, PrecisionT>::operator()( ...@@ -108,25 +108,5 @@ void DensityPriorBoxOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter);
using ds_pr_nv_fp32 = ::paddle::inference::anakin::DensityPriorBoxOpConverter< REGISTER_ANAKIN_OP_CONVERTER(prior_box, DensityPriorBoxOpConverter);
::anakin::saber::NV, ::anakin::Precision::FP32>;
using ds_pr_nv_int8 = ::paddle::inference::anakin::DensityPriorBoxOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(density_prior_box, ds_pr_nv_fp32);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(prior_box, ds_pr_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(density_prior_box, ds_pr_nv_int8);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(prior_box, ds_pr_nv_int8);
#endif
using ds_pr_cpu_fp32 = ::paddle::inference::anakin::DensityPriorBoxOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using ds_pr_cpu_int8 = ::paddle::inference::anakin::DensityPriorBoxOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(density_prior_box, ds_pr_cpu_fp32);
REGISTER_CPU_ANAKIN_OP_CONVERTER(prior_box, ds_pr_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(density_prior_box, ds_pr_cpu_int8);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(prior_box, ds_pr_cpu_int8);
...@@ -66,22 +66,4 @@ void DetectionOutOpConverter<TargetT, PrecisionT>::operator()( ...@@ -66,22 +66,4 @@ void DetectionOutOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(detection_out, DetectionOutOpConverter);
using detection_out_nv_fp32 =
::paddle::inference::anakin::DetectionOutOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using detection_out_nv_int8 =
::paddle::inference::anakin::DetectionOutOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(detection_out, detection_out_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(detection_out, detection_out_nv_int8);
#endif
using detection_out_cpu_fp32 =
::paddle::inference::anakin::DetectionOutOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using detection_out_cpu_int8 =
::paddle::inference::anakin::DetectionOutOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(detection_out, detection_out_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(detection_out, detection_out_cpu_int8);
...@@ -52,22 +52,4 @@ void DropoutOpConverter<TargetT, PrecisionT>::operator()( ...@@ -52,22 +52,4 @@ void DropoutOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(dropout, DropoutOpConverter);
using dropout_nv_fp32 =
::paddle::inference::anakin::DropoutOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using dropout_nv_int8 =
::paddle::inference::anakin::DropoutOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(dropout, dropout_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(dropout, dropout_nv_int8);
#endif
using dropout_cpu_fp32 =
::paddle::inference::anakin::DropoutOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using dropout_cpu_int8 =
::paddle::inference::anakin::DropoutOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(dropout, dropout_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(dropout, dropout_cpu_int8);
...@@ -71,32 +71,5 @@ void ElementwiseMulOpConverter<TargetT, PrecisionT>::operator()( ...@@ -71,32 +71,5 @@ void ElementwiseMulOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(elementwise_add, ElementwiseAddOpConverter);
using elet_nv_fp32 = ::paddle::inference::anakin::ElementwiseAddOpConverter< REGISTER_ANAKIN_OP_CONVERTER(elementwise_mul, ElementwiseMulOpConverter);
::anakin::saber::NV, ::anakin::Precision::FP32>;
using elet_nv_int8 = ::paddle::inference::anakin::ElementwiseAddOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
using eletmul_nv_fp32 = ::paddle::inference::anakin::ElementwiseMulOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using eletmul_nv_int8 = ::paddle::inference::anakin::ElementwiseMulOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(elementwise_add, elet_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(elementwise_add, elet_nv_int8);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(elementwise_mul, eletmul_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(elementwise_mul, eletmul_nv_int8);
#endif
using elet_cpu_fp32 = ::paddle::inference::anakin::ElementwiseAddOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using elet_cpu_int8 = ::paddle::inference::anakin::ElementwiseAddOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
using eletmul_cpu_fp32 = ::paddle::inference::anakin::ElementwiseMulOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using eletmul_cpu_int8 = ::paddle::inference::anakin::ElementwiseMulOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(elementwise_add, elet_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(elementwise_add, elet_cpu_int8);
REGISTER_CPU_ANAKIN_OP_CONVERTER(elementwise_mul, eletmul_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(elementwise_mul, eletmul_cpu_int8);
...@@ -117,40 +117,5 @@ void FcBaseOpConverter<TargetT, PrecisionT>::operator()( ...@@ -117,40 +117,5 @@ void FcBaseOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(mul, MulOpConverter);
using mul_nv_fp32 = REGISTER_ANAKIN_OP_CONVERTER(fc, FcOpConverter);
::paddle::inference::anakin::MulOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using fc_nv_fp32 =
::paddle::inference::anakin::FcOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using mul_nv_int8 =
::paddle::inference::anakin::MulOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
using fc_nv_int8 =
::paddle::inference::anakin::FcOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(mul, mul_nv_fp32);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(fc, fc_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(mul, mul_nv_int8);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(fc, fc_nv_int8);
#endif
using mul_cpu_fp32 =
::paddle::inference::anakin::MulOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using fc_cpu_fp32 =
::paddle::inference::anakin::FcOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using mul_cpu_int8 =
::paddle::inference::anakin::MulOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
using fc_cpu_int8 =
::paddle::inference::anakin::FcOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(mul, mul_cpu_fp32);
REGISTER_CPU_ANAKIN_OP_CONVERTER(fc, fc_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(mul, mul_cpu_int8);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(fc, fc_cpu_int8);
...@@ -45,22 +45,4 @@ void FlattenOpConverter<TargetT, PrecisionT>::operator()( ...@@ -45,22 +45,4 @@ void FlattenOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(flatten, FlattenOpConverter);
using flatten_nv_fp32 =
::paddle::inference::anakin::FlattenOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using flatten_nv_int8 =
::paddle::inference::anakin::FlattenOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(flatten, flatten_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(flatten, flatten_nv_int8);
#endif
using flatten_cpu_fp32 =
::paddle::inference::anakin::FlattenOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using flatten_cpu_int8 =
::paddle::inference::anakin::FlattenOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(flatten, flatten_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(flatten, flatten_cpu_int8);
...@@ -55,18 +55,4 @@ void Im2SequenceConverter<TargetT, PrecisionT>::operator()( ...@@ -55,18 +55,4 @@ void Im2SequenceConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(im2sequence, Im2SequenceConverter);
using im2sequence_nv_fp32 = ::paddle::inference::anakin::Im2SequenceConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using im2sequence_nv_int8 = ::paddle::inference::anakin::Im2SequenceConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(im2sequence, im2sequence_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(im2sequence, im2sequence_nv_int8);
#endif
using im2sequence_cpu_fp32 = ::paddle::inference::anakin::Im2SequenceConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using im2sequence_cpu_int8 = ::paddle::inference::anakin::Im2SequenceConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(im2sequence, im2sequence_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(im2sequence, im2sequence_cpu_int8);
...@@ -183,25 +183,37 @@ template class AnakinOpConverter<::anakin::saber::X86, ...@@ -183,25 +183,37 @@ template class AnakinOpConverter<::anakin::saber::X86,
return 0; \ return 0; \
} }
#define REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ #define WRAP(...) __VA_ARGS__
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CUDA, \
::anakin::saber::NV, FP32, \ #define REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, \
::anakin::Precision::FP32) precision_type__) \
REGISTER_ANAKIN_OP_CONVERTER_BASE( \
#define REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ op_type__, \
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CUDA, \ ::paddle::inference::anakin::Converter__<WRAP( \
::anakin::saber::NV, INT8, \ ::anakin::saber::NV, ::anakin::Precision::precision_type__)>, \
::anakin::Precision::INT8) CUDA, ::anakin::saber::NV, precision_type__, \
::anakin::Precision::precision_type__)
#define REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CPU, \ #define REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, \
::anakin::saber::X86, FP32, \ precision_type__) \
::anakin::Precision::FP32) REGISTER_ANAKIN_OP_CONVERTER_BASE( \
op_type__, \
#define REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ ::paddle::inference::anakin::Converter__<WRAP( \
REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, CPU, \ ::anakin::saber::X86, ::anakin::Precision::precision_type__)>, \
::anakin::saber::X86, INT8, \ CPU, ::anakin::saber::X86, precision_type__, \
::anakin::Precision::INT8) ::anakin::Precision::precision_type__)
#ifdef PADDLE_WITH_CUDA
#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \
REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8); \
REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \
REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8)
#else
#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \
REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8)
#endif
#define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__) \ #define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__) \
extern int Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); \ extern int Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); \
......
...@@ -71,22 +71,4 @@ void Pool2dOpConverter<TargetT, PrecisionT>::operator()( ...@@ -71,22 +71,4 @@ void Pool2dOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(pool2d, Pool2dOpConverter);
using pool2d_nv_float32 =
::paddle::inference::anakin::Pool2dOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using pool2d_nv_int8 =
::paddle::inference::anakin::Pool2dOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(pool2d, pool2d_nv_float32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(pool2d, pool2d_nv_int8);
#endif
using pool2d_cpu_float32 =
::paddle::inference::anakin::Pool2dOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using pool2d_cpu_int8 =
::paddle::inference::anakin::Pool2dOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(pool2d, pool2d_cpu_float32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(pool2d, pool2d_cpu_int8);
...@@ -57,36 +57,5 @@ void LeakyReluOpConverter<TargetT, PrecisionT>::operator()( ...@@ -57,36 +57,5 @@ void LeakyReluOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter);
using relu_nv_fp32 = REGISTER_ANAKIN_OP_CONVERTER(leaky_relu, LeakyReluOpConverter);
::paddle::inference::anakin::ReluOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using leaky_nv_fp32 = ::paddle::inference::anakin::LeakyReluOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using relu_nv_int8 =
::paddle::inference::anakin::ReluOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
using leaky_nv_int8 = ::paddle::inference::anakin::LeakyReluOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(relu, relu_nv_fp32);
REGISTER_CUDA_ANAKIN_OP_CONVERTER(leaky_relu, leaky_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(relu, relu_nv_int8);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(leaky_relu, leaky_nv_int8);
#endif
using relu_cpu_fp32 =
::paddle::inference::anakin::ReluOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using leaky_cpu_fp32 = ::paddle::inference::anakin::LeakyReluOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using relu_cpu_int8 =
::paddle::inference::anakin::ReluOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
using leaky_cpu_int8 = ::paddle::inference::anakin::LeakyReluOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(relu, relu_cpu_fp32);
REGISTER_CPU_ANAKIN_OP_CONVERTER(leaky_relu, leaky_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(relu, relu_cpu_int8);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(leaky_relu, leaky_cpu_int8);
...@@ -46,22 +46,4 @@ void ReshapeOpConverter<TargetT, PrecisionT>::operator()( ...@@ -46,22 +46,4 @@ void ReshapeOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(reshape, ReshapeOpConverter);
using reshape_nv_fp32 =
::paddle::inference::anakin::ReshapeOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using reshape_nv_int8 =
::paddle::inference::anakin::ReshapeOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(reshape, reshape_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(reshape, reshape_nv_int8);
#endif
using reshape_cpu_fp32 =
::paddle::inference::anakin::ReshapeOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using reshape_cpu_int8 =
::paddle::inference::anakin::ReshapeOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(reshape, reshape_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(reshape, reshape_cpu_int8);
...@@ -51,22 +51,4 @@ void RoiAlignOpConverter<TargetT, PrecisionT>::operator()( ...@@ -51,22 +51,4 @@ void RoiAlignOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(roi_align, RoiAlignOpConverter);
using roi_align_nv_fp32 =
::paddle::inference::anakin::RoiAlignOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using roi_align_nv_int8 =
::paddle::inference::anakin::RoiAlignOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(roi_align, roi_align_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(roi_align, roi_align_nv_int8);
#endif
using roi_align_cpu_fp32 =
::paddle::inference::anakin::RoiAlignOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using roi_align_cpu_int8 =
::paddle::inference::anakin::RoiAlignOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(roi_align, roi_align_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(roi_align, roi_align_cpu_int8);
...@@ -49,22 +49,4 @@ void ScaleOpConverter<TargetT, PrecisionT>::operator()( ...@@ -49,22 +49,4 @@ void ScaleOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter);
using scale_nv_fp32 =
::paddle::inference::anakin::ScaleOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using scale_nv_int8 =
::paddle::inference::anakin::ScaleOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(scale, scale_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(scale, scale_nv_int8);
#endif
using scale_cpu_fp32 =
::paddle::inference::anakin::ScaleOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using scale_cpu_int8 =
::paddle::inference::anakin::ScaleOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(scale, scale_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(scale, scale_cpu_int8);
...@@ -44,23 +44,4 @@ void SoftMaxOpConverter<TargetT, PrecisionT>::operator()( ...@@ -44,23 +44,4 @@ void SoftMaxOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(softmax, SoftMaxOpConverter);
using sm_nv_fp32 =
::paddle::inference::anakin::SoftMaxOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using sm_nv_int8 =
::paddle::inference::anakin::SoftMaxOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(softmax, sm_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(softmax, sm_nv_int8);
#endif
using sm_cpu_fp32 =
::paddle::inference::anakin::SoftMaxOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using sm_cpu_int8 =
::paddle::inference::anakin::SoftMaxOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(softmax, sm_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(softmax, sm_cpu_int8);
...@@ -55,23 +55,5 @@ void SplitOpConverter<TargetT, PrecisionT>::operator()( ...@@ -55,23 +55,5 @@ void SplitOpConverter<TargetT, PrecisionT>::operator()(
} // namespace anakin } // namespace anakin
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA
using split_nv_fp32 =
::paddle::inference::anakin::SplitOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using split_nv_int8 =
::paddle::inference::anakin::SplitOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(split, split_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(split, split_nv_int8);
#endif
using split_cpu_fp32 = REGISTER_ANAKIN_OP_CONVERTER(split, SplitOpConverter);
::paddle::inference::anakin::SplitOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using split_cpu_int8 =
::paddle::inference::anakin::SplitOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(split, split_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(split, split_cpu_int8);
...@@ -47,22 +47,4 @@ void SumOpConverter<TargetT, PrecisionT>::operator()( ...@@ -47,22 +47,4 @@ void SumOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(sum, SumOpConverter);
using sum_nv_fp32 =
::paddle::inference::anakin::SumOpConverter<::anakin::saber::NV,
::anakin::Precision::FP32>;
using sum_nv_int8 =
::paddle::inference::anakin::SumOpConverter<::anakin::saber::NV,
::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(sum, sum_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(sum, sum_nv_int8);
#endif
using sum_cpu_fp32 =
::paddle::inference::anakin::SumOpConverter<::anakin::saber::X86,
::anakin::Precision::FP32>;
using sum_cpu_int8 =
::paddle::inference::anakin::SumOpConverter<::anakin::saber::X86,
::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(sum, sum_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(sum, sum_cpu_int8);
...@@ -36,6 +36,14 @@ static void test_activation_op(const std::string& op_type, ...@@ -36,6 +36,14 @@ static void test_activation_op(const std::string& op_type,
desc.SetInput("X", {"act-X"}); desc.SetInput("X", {"act-X"});
desc.SetOutput("Out", {"act-Out"}); desc.SetOutput("Out", {"act-Out"});
if (op_type == "swish") {
desc.SetAttr("beta", 1.0f);
}
if (op_type == "relu6") {
desc.SetAttr("threshold", 6.0f);
}
LOG(INFO) << "set OP"; LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto()); validator.SetOp(*desc.Proto());
LOG(INFO) << "execute"; LOG(INFO) << "execute";
...@@ -55,6 +63,18 @@ TEST(tanh_op, gpu) { ...@@ -55,6 +63,18 @@ TEST(tanh_op, gpu) {
platform::CUDADeviceContext ctx(gpu_place); platform::CUDADeviceContext ctx(gpu_place);
test_activation_op<::anakin::saber::NV>("tanh", ctx, true); test_activation_op<::anakin::saber::NV>("tanh", ctx, true);
} }
// Runs the shared activation-op test for "relu6" on the NV target,
// with a device context built from CUDAPlace(0).
TEST(relu6_op, gpu) {
  platform::CUDAPlace device_place(0);
  platform::CUDADeviceContext device_ctx(device_place);
  // GPU cases in this file pass `true` as the last argument and CPU cases
  // pass `false`; presumably a use-GPU flag -- confirm in test_activation_op.
  test_activation_op<::anakin::saber::NV>("relu6", device_ctx, true);
}
// Runs the shared activation-op test for "swish" on the NV target,
// with a device context built from CUDAPlace(0).
TEST(swish_op, gpu) {
  platform::CUDAPlace device_place(0);
  platform::CUDADeviceContext device_ctx(device_place);
  // GPU cases in this file pass `true` as the last argument and CPU cases
  // pass `false`; presumably a use-GPU flag -- confirm in test_activation_op.
  test_activation_op<::anakin::saber::NV>("swish", device_ctx, true);
}
#endif #endif
/* /*
...@@ -69,6 +89,18 @@ TEST(tanh_op, cpu) { ...@@ -69,6 +89,18 @@ TEST(tanh_op, cpu) {
platform::CPUDeviceContext ctx(cpu_place); platform::CPUDeviceContext ctx(cpu_place);
test_activation_op<::anakin::saber::X86>("tanh", ctx, false); test_activation_op<::anakin::saber::X86>("tanh", ctx, false);
} }
// Runs the shared activation-op test for "relu6" on the X86 target.
// NOTE(review): this case appears to sit inside a block-commented region,
// i.e. it is currently disabled -- confirm before relying on it.
TEST(relu6_op, cpu) {
  platform::CPUPlace host_place;
  platform::CPUDeviceContext host_ctx(host_place);
  test_activation_op<::anakin::saber::X86>("relu6", host_ctx, false);
}
// Runs the shared activation-op test for "swish" on the X86 target.
// NOTE(review): this case appears to sit inside a block-commented region,
// i.e. it is currently disabled -- confirm before relying on it.
TEST(swish_op, cpu) {
  platform::CPUPlace host_place;
  platform::CPUDeviceContext host_ctx(host_place);
  test_activation_op<::anakin::saber::X86>("swish", host_ctx, false);
}
*/ */
} // namespace anakin } // namespace anakin
...@@ -77,10 +109,16 @@ TEST(tanh_op, cpu) { ...@@ -77,10 +109,16 @@ TEST(tanh_op, cpu) {
USE_OP(sigmoid); USE_OP(sigmoid);
USE_OP(tanh); USE_OP(tanh);
USE_OP(relu6);
USE_OP(swish);
USE_CPU_ANAKIN_CONVERTER(sigmoid); USE_CPU_ANAKIN_CONVERTER(sigmoid);
USE_CPU_ANAKIN_CONVERTER(tanh); USE_CPU_ANAKIN_CONVERTER(tanh);
USE_CPU_ANAKIN_CONVERTER(relu6);
USE_CPU_ANAKIN_CONVERTER(swish);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
USE_ANAKIN_CONVERTER(sigmoid); USE_ANAKIN_CONVERTER(sigmoid);
USE_ANAKIN_CONVERTER(tanh); USE_ANAKIN_CONVERTER(tanh);
USE_ANAKIN_CONVERTER(relu6);
USE_ANAKIN_CONVERTER(swish);
#endif #endif
...@@ -49,18 +49,4 @@ void TransposeOpConverter<TargetT, PrecisionT>::operator()( ...@@ -49,18 +49,4 @@ void TransposeOpConverter<TargetT, PrecisionT>::operator()(
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA REGISTER_ANAKIN_OP_CONVERTER(transpose, TransposeOpConverter);
using transpose_nv_fp32 = ::paddle::inference::anakin::TransposeOpConverter<
::anakin::saber::NV, ::anakin::Precision::FP32>;
using transpose_nv_int8 = ::paddle::inference::anakin::TransposeOpConverter<
::anakin::saber::NV, ::anakin::Precision::INT8>;
REGISTER_CUDA_ANAKIN_OP_CONVERTER(transpose, transpose_nv_fp32);
REGISTER_CUDA_INT8_ANAKIN_OP_CONVERTER(transpose, transpose_nv_int8);
#endif
using transpose_cpu_fp32 = ::paddle::inference::anakin::TransposeOpConverter<
::anakin::saber::X86, ::anakin::Precision::FP32>;
using transpose_cpu_int8 = ::paddle::inference::anakin::TransposeOpConverter<
::anakin::saber::X86, ::anakin::Precision::INT8>;
REGISTER_CPU_ANAKIN_OP_CONVERTER(transpose, transpose_cpu_fp32);
REGISTER_CPU_INT8_ANAKIN_OP_CONVERTER(transpose, transpose_cpu_int8);
...@@ -36,13 +36,14 @@ template <typename TargetT, Precision PrecisionType, OpRunType RunType> ...@@ -36,13 +36,14 @@ template <typename TargetT, Precision PrecisionType, OpRunType RunType>
AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine( AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(
bool need_summary, int device, int max_batch_size, bool need_summary, int device, int max_batch_size,
std::map<std::string, std::vector<int>> max_input_shape, std::map<std::string, std::vector<int>> max_input_shape,
std::vector<std::string> program_inputs) std::vector<std::string> program_inputs, bool auto_config_layout)
: graph_(new AnakinGraphT<TargetT, PrecisionType>()), : graph_(new AnakinGraphT<TargetT, PrecisionType>()),
net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) { net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {
device_ = device; device_ = device;
max_batch_size_ = max_batch_size; max_batch_size_ = max_batch_size;
max_input_shape_ = max_input_shape; max_input_shape_ = max_input_shape;
program_inputs_ = program_inputs; program_inputs_ = program_inputs;
auto_config_layout_ = auto_config_layout;
} }
template <typename TargetT, Precision PrecisionType, OpRunType RunType> template <typename TargetT, Precision PrecisionType, OpRunType RunType>
...@@ -57,7 +58,7 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::SetInputShape( ...@@ -57,7 +58,7 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::SetInputShape(
template <typename TargetT, Precision PrecisionType, OpRunType RunType> template <typename TargetT, Precision PrecisionType, OpRunType RunType>
void AnakinEngine<TargetT, PrecisionType, RunType>::InitNet() { void AnakinEngine<TargetT, PrecisionType, RunType>::InitNet() {
net_->init(*graph_); net_->init(*graph_, auto_config_layout_);
} }
template <typename TargetT, Precision PrecisionType, OpRunType RunType> template <typename TargetT, Precision PrecisionType, OpRunType RunType>
......
...@@ -58,7 +58,8 @@ class AnakinEngine { ...@@ -58,7 +58,8 @@ class AnakinEngine {
explicit AnakinEngine( explicit AnakinEngine(
bool need_summary = false, int device = 0, int max_batch_size = 1, bool need_summary = false, int device = 0, int max_batch_size = 1,
std::map<std::string, std::vector<int>> max_input_shape = {}, std::map<std::string, std::vector<int>> max_input_shape = {},
std::vector<std::string> program_inputs = {}); std::vector<std::string> program_inputs = {},
bool auto_config_layout = false);
~AnakinEngine(); ~AnakinEngine();
void InitNet(); void InitNet();
void SetInputShape(const std::string &name, std::vector<int> shape); void SetInputShape(const std::string &name, std::vector<int> shape);
...@@ -120,6 +121,8 @@ class AnakinEngine { ...@@ -120,6 +121,8 @@ class AnakinEngine {
std::unique_ptr<NetT> net_; std::unique_ptr<NetT> net_;
std::vector<std::string> program_inputs_; std::vector<std::string> program_inputs_;
std::unordered_map<std::string, float> tensor_scales_; std::unordered_map<std::string, float> tensor_scales_;
// Always false in GPU mode, but true in most CPU cases.
bool auto_config_layout_;
}; };
template <typename TargetT, ::anakin::Precision PrecisionType> template <typename TargetT, ::anakin::Precision PrecisionType>
...@@ -138,10 +141,11 @@ class AnakinEngineManager { ...@@ -138,10 +141,11 @@ class AnakinEngineManager {
AnakinEngineT *Create(bool need_summary, int device, int max_batch_size, AnakinEngineT *Create(bool need_summary, int device, int max_batch_size,
std::map<std::string, std::vector<int>> max_input_shape, std::map<std::string, std::vector<int>> max_input_shape,
std::vector<std::string> program_inputs, std::vector<std::string> program_inputs,
std::string engine_name) { bool auto_config_layout, std::string engine_name) {
std::unique_lock<std::mutex> lk(mut_); std::unique_lock<std::mutex> lk(mut_);
auto *p = new AnakinEngine<TargetT, PrecisionType>( auto *p = new AnakinEngine<TargetT, PrecisionType>(
need_summary, device, max_batch_size, max_input_shape, program_inputs); need_summary, device, max_batch_size, max_input_shape, program_inputs,
auto_config_layout);
engines_[engine_name].reset(p); engines_[engine_name].reset(p);
return p; return p;
} }
......
...@@ -46,6 +46,8 @@ struct SimpleOpTypeSetTeller : public Teller { ...@@ -46,6 +46,8 @@ struct SimpleOpTypeSetTeller : public Teller {
teller_set.insert("prior_box"); teller_set.insert("prior_box");
teller_set.insert("leaky_relu"); teller_set.insert("leaky_relu");
teller_set.insert("affine_channel"); teller_set.insert("affine_channel");
teller_set.insert("relu6");
teller_set.insert("swish");
} }
bool operator()(const std::string& op_type, bool operator()(const std::string& op_type,
......
...@@ -171,6 +171,7 @@ struct Argument { ...@@ -171,6 +171,7 @@ struct Argument {
DECL_ARGUMENT_FIELD(anakin_min_subgraph_size, AnakinMinSubgraphSize, int); DECL_ARGUMENT_FIELD(anakin_min_subgraph_size, AnakinMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(anakin_precision_mode, AnakinPrecisionMode, DECL_ARGUMENT_FIELD(anakin_precision_mode, AnakinPrecisionMode,
AnalysisConfig::Precision); AnalysisConfig::Precision);
DECL_ARGUMENT_FIELD(anakin_auto_config_layout, AnakinAutoConfigLayout, bool);
DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool); DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);
DECL_ARGUMENT_FIELD(anakin_passes_filter, AnakinPassesFilter, DECL_ARGUMENT_FIELD(anakin_passes_filter, AnakinPassesFilter,
std::vector<std::string>); std::vector<std::string>);
......
...@@ -128,6 +128,8 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -128,6 +128,8 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("enable_int8", new bool(enable_int8)); pass->Set("enable_int8", new bool(enable_int8));
pass->Set("anakin_ops_filter", pass->Set("anakin_ops_filter",
new std::vector<std::string>(argument->anakin_ops_filter())); new std::vector<std::string>(argument->anakin_ops_filter()));
pass->Set("auto_config_layout",
new bool(argument->anakin_auto_config_layout()));
} }
pre_pass = pass_name; pre_pass = pass_name;
......
...@@ -226,18 +226,20 @@ void AnakinSubgraphPass::CreateAnakinEngine( ...@@ -226,18 +226,20 @@ void AnakinSubgraphPass::CreateAnakinEngine(
auto max_batch_size = Get<int>("max_batch_size"); auto max_batch_size = Get<int>("max_batch_size");
auto max_input_shape = auto max_input_shape =
Get<std::map<std::string, std::vector<int>>>("max_input_shape"); Get<std::map<std::string, std::vector<int>>>("max_input_shape");
bool auto_config_layout = Get<bool>("auto_config_layout");
if (use_gpu) { if (use_gpu) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
inference::Singleton< inference::Singleton<
anakin::AnakinEngineManager<::anakin::saber::NV, PrecisionT>>::Global() anakin::AnakinEngineManager<::anakin::saber::NV, PrecisionT>>::Global()
.Create(true, Get<int>("gpu_device_id"), max_batch_size, .Create(true, Get<int>("gpu_device_id"), max_batch_size,
max_input_shape, program_inputs, engine_key); max_input_shape, program_inputs, false, engine_key);
#endif #endif
} else { } else {
inference::Singleton< inference::Singleton<
anakin::AnakinEngineManager<::anakin::saber::X86, PrecisionT>>::Global() anakin::AnakinEngineManager<::anakin::saber::X86, PrecisionT>>::Global()
.Create(true, Get<int>("gpu_device_id"), max_batch_size, .Create(true, Get<int>("gpu_device_id"), max_batch_size,
max_input_shape, program_inputs, engine_key); max_input_shape, program_inputs, auto_config_layout,
engine_key);
} }
auto *scope = param_scope(); auto *scope = param_scope();
......
...@@ -117,6 +117,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -117,6 +117,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(anakin_max_input_shape_); CP_MEMBER(anakin_max_input_shape_);
CP_MEMBER(anakin_min_subgraph_size_); CP_MEMBER(anakin_min_subgraph_size_);
CP_MEMBER(anakin_precision_mode_); CP_MEMBER(anakin_precision_mode_);
CP_MEMBER(anakin_auto_config_layout_);
CP_MEMBER(anakin_passes_filter_); CP_MEMBER(anakin_passes_filter_);
CP_MEMBER(anakin_ops_filter_); CP_MEMBER(anakin_ops_filter_);
...@@ -398,7 +399,7 @@ void AnalysisConfig::SwitchIrDebug(int x) { ...@@ -398,7 +399,7 @@ void AnalysisConfig::SwitchIrDebug(int x) {
void AnalysisConfig::EnableAnakinEngine( void AnalysisConfig::EnableAnakinEngine(
int max_batch_size, std::map<std::string, std::vector<int>> max_input_shape, int max_batch_size, std::map<std::string, std::vector<int>> max_input_shape,
int min_subgraph_size, AnalysisConfig::Precision precision_mode, int min_subgraph_size, AnalysisConfig::Precision precision_mode,
std::vector<std::string> passes_filter, bool auto_config_layout, std::vector<std::string> passes_filter,
std::vector<std::string> ops_filter) { std::vector<std::string> ops_filter) {
anakin_max_batchsize_ = max_batch_size; anakin_max_batchsize_ = max_batch_size;
anakin_max_input_shape_ = max_input_shape; anakin_max_input_shape_ = max_input_shape;
...@@ -407,6 +408,7 @@ void AnalysisConfig::EnableAnakinEngine( ...@@ -407,6 +408,7 @@ void AnalysisConfig::EnableAnakinEngine(
anakin_ops_filter_ = ops_filter; anakin_ops_filter_ = ops_filter;
use_anakin_ = true; use_anakin_ = true;
anakin_precision_mode_ = precision_mode; anakin_precision_mode_ = precision_mode;
anakin_auto_config_layout_ = auto_config_layout;
Update(); Update();
} }
} // namespace paddle } // namespace paddle
...@@ -387,6 +387,7 @@ void AnalysisPredictor::PrepareArgument() { ...@@ -387,6 +387,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_); argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_);
argument_.SetAnakinMinSubgraphSize(config_.anakin_min_subgraph_size_); argument_.SetAnakinMinSubgraphSize(config_.anakin_min_subgraph_size_);
argument_.SetAnakinPrecisionMode(config_.anakin_precision_mode_); argument_.SetAnakinPrecisionMode(config_.anakin_precision_mode_);
argument_.SetAnakinAutoConfigLayout(config_.anakin_auto_config_layout_);
argument_.SetAnakinPassesFilter(config_.anakin_passes_filter_); argument_.SetAnakinPassesFilter(config_.anakin_passes_filter_);
argument_.SetAnakinOpsFilter(config_.anakin_ops_filter_); argument_.SetAnakinOpsFilter(config_.anakin_ops_filter_);
LOG(INFO) << "Anakin subgraph engine is enabled"; LOG(INFO) << "Anakin subgraph engine is enabled";
...@@ -893,4 +894,6 @@ USE_ANAKIN_CONVERTER(sum); ...@@ -893,4 +894,6 @@ USE_ANAKIN_CONVERTER(sum);
USE_ANAKIN_CONVERTER(prior_box); USE_ANAKIN_CONVERTER(prior_box);
USE_ANAKIN_CONVERTER(leaky_relu); USE_ANAKIN_CONVERTER(leaky_relu);
USE_ANAKIN_CONVERTER(affine_channel); USE_ANAKIN_CONVERTER(affine_channel);
USE_ANAKIN_CONVERTER(relu6);
USE_ANAKIN_CONVERTER(swish);
#endif #endif
...@@ -153,6 +153,7 @@ struct AnalysisConfig { ...@@ -153,6 +153,7 @@ struct AnalysisConfig {
int max_batch_size = 1, int max_batch_size = 1,
std::map<std::string, std::vector<int>> max_input_shape = {}, std::map<std::string, std::vector<int>> max_input_shape = {},
int min_subgraph_size = 6, Precision precision = Precision::kFloat32, int min_subgraph_size = 6, Precision precision = Precision::kFloat32,
bool auto_config_layout = false,
std::vector<std::string> passes_filter = {}, std::vector<std::string> passes_filter = {},
std::vector<std::string> ops_filter = {}); std::vector<std::string> ops_filter = {});
...@@ -294,6 +295,7 @@ struct AnalysisConfig { ...@@ -294,6 +295,7 @@ struct AnalysisConfig {
int anakin_min_subgraph_size_{6}; int anakin_min_subgraph_size_{6};
std::map<std::string, std::vector<int>> anakin_max_input_shape_; std::map<std::string, std::vector<int>> anakin_max_input_shape_;
Precision anakin_precision_mode_; Precision anakin_precision_mode_;
bool anakin_auto_config_layout_{false};
std::vector<std::string> anakin_passes_filter_; std::vector<std::string> anakin_passes_filter_;
std::vector<std::string> anakin_ops_filter_; std::vector<std::string> anakin_ops_filter_;
std::map<std::string, std::string> engine_opt_info_; std::map<std::string, std::string> engine_opt_info_;
......
...@@ -78,8 +78,6 @@ const std::vector<std::string> kAnakinSubgraphPasses({ ...@@ -78,8 +78,6 @@ const std::vector<std::string> kAnakinSubgraphPasses({
"fillconstant_elementwisemul_fuse", // "fillconstant_elementwisemul_fuse", //
"fc_fuse_pass", // "fc_fuse_pass", //
"conv_elementwise_add_fuse_pass", // "conv_elementwise_add_fuse_pass", //
// "conv_bn_fuse_pass", //
// "conv_elementwise_add_fuse_pass", //
"fc_gru_fuse_pass", // "fc_gru_fuse_pass", //
"anakin_subgraph_pass", // "anakin_subgraph_pass", //
"fc_gru_fuse_pass", // "fc_gru_fuse_pass", //
......
...@@ -236,6 +236,7 @@ void BindAnalysisConfig(py::module *m) { ...@@ -236,6 +236,7 @@ void BindAnalysisConfig(py::module *m) {
std::map<std::string, std::vector<int>>(), std::map<std::string, std::vector<int>>(),
py::arg("min_subgraph_size") = 6, py::arg("min_subgraph_size") = 6,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
py::arg("auto_config_layout") = false,
py::arg("passes_filter") = std::vector<std::string>(), py::arg("passes_filter") = std::vector<std::string>(),
py::arg("ops_filter") = std::vector<std::string>()) py::arg("ops_filter") = std::vector<std::string>())
.def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册