Commit ae4e9075 authored by xiaoli.liu@intel.com

extract functions for primitive cache

Parent e02a8025
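
The change below applies a parameter-object refactor: the long argument lists of CreateFP32Primitive and CreateINT8Primitive are collapsed into two structs, ConvInfo (tensor shapes and convolution geometry) and MkldnnInfo (engine, fusion flags, and cached MKL-DNN handles), which Compute() fills once and passes by pointer. A minimal sketch of the pattern, using hypothetical names (WorkInfo, DoWorkBefore/DoWorkAfter) rather than the operator's real types:

#include <iostream>
#include <string>
#include <vector>

// Before: every value travels as its own parameter, so call sites grow unbounded.
void DoWorkBefore(const std::vector<int>& strides, bool fuse_relu,
                  const std::string& key) {
  std::cout << key << ": " << strides.size() << " " << fuse_relu << "\n";
}

// After: related values are grouped in a struct the caller fills once.
// The pointers are non-owning, mirroring how ConvInfo/MkldnnInfo point at
// locals of Compute() that outlive the helper call.
struct WorkInfo {
  const std::vector<int>* strides;
  bool fuse_relu;
  const std::string* key;
};

void DoWorkAfter(const WorkInfo* info) {
  std::cout << *info->key << ": " << info->strides->size() << " "
            << info->fuse_relu << "\n";
}

int main() {
  std::vector<int> strides = {1, 1};
  std::string key = "conv2d_k3s1";  // hypothetical cache key
  DoWorkBefore(strides, true, key);
  WorkInfo info{&strides, true, &key};
  DoWorkAfter(&info);
}

As in the sketch, the structs hold non-owning pointers into Compute()'s locals, so they are only valid for the duration of the helper call.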
@@ -33,6 +33,35 @@ using platform::GetMKLDNNFormat;
 template <typename T>
 class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
+  struct ConvInfo{
+    const paddle::framework::Tensor* input;
+    const paddle::framework::Tensor* bias;
+    const paddle::framework::Tensor* output;
+    const paddle::framework::Tensor* weight;
+    std::vector<int>* strides;
+    std::vector<int>* paddings;
+    std::vector<int>* dilations;
+    std::vector<int>* src_tz;
+    std::vector<int>* weights_tz;
+    std::vector<int>* dst_tz;
+    int g;
+  };
+  struct MkldnnInfo{
+    bool fuse_relu;
+    bool fuse_residual_conn;
+    bool force_fp32_output;
+    bool is_test;
+    const mkldnn::engine* mkldnn_engine;
+    std::vector<primitive>* pipeline;
+    const std::string* key_conv_pd;
+    std::string* key;
+    std::shared_ptr<platform::ConvMKLDNNHandler> handler;
+    std::shared_ptr<mkldnn::memory> src_memory_p;
+    std::shared_ptr<mkldnn::memory> user_src_memory_p;
+    std::shared_ptr<mkldnn::memory> dst_memory_p;
+    std::shared_ptr<mkldnn::convolution_forward> conv_p;
+    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
+  };
   void Compute(const paddle::framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
@@ -85,7 +114,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                    "dilation in convolution is not implemented yet");
 
     const T* input_data = input->data<T>();
-    const float* filter_data = filter->data<float>();
 
     std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
     std::vector<int> weights_tz =
@@ -127,13 +155,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       need_s8_to_u8 = true;
     }
 
-    std::shared_ptr<mkldnn::convolution_forward> conv_p;
-    std::shared_ptr<mkldnn::memory> src_memory_p;
-    std::shared_ptr<mkldnn::memory> user_src_memory_p;
-    std::shared_ptr<mkldnn::memory> dst_memory_p;
+    std::shared_ptr<mkldnn::convolution_forward> conv_p = nullptr;
+    std::shared_ptr<mkldnn::memory> src_memory_p = nullptr;
+    std::shared_ptr<mkldnn::memory> user_src_memory_p = nullptr;
+    std::shared_ptr<mkldnn::memory> dst_memory_p = nullptr;
     std::vector<primitive> pipeline;
-    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
-    std::shared_ptr<platform::ConvMKLDNNHandler> handler;
+    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd = nullptr;
+    std::shared_ptr<platform::ConvMKLDNNHandler> handler = nullptr;
 
     auto prim_key = key + "@conv_p";
     auto dst_key = key + "@dst_mem_p";
@@ -142,42 +170,38 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto src_reorder_key = key + "@src_mem_preorder_p";
     conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(dev_ctx.GetBlob(prim_key));
     if(conv_p == nullptr){
+      struct ConvInfo convinfo;
+      struct MkldnnInfo mkldnninfo;
+      convinfo.strides = &strides;
+      convinfo.paddings = &paddings;
+      convinfo.dilations = &dilations;
+      convinfo.src_tz = &src_tz;
+      convinfo.weights_tz = &weights_tz;
+      convinfo.dst_tz = &dst_tz;
+      convinfo.g = g;
+      mkldnninfo.fuse_relu = fuse_relu;
+      mkldnninfo.fuse_residual_conn = fuse_residual_conn;
+      mkldnninfo.force_fp32_output = force_fp32_output;
+      mkldnninfo.is_test = is_test;
+      mkldnninfo.mkldnn_engine = &mkldnn_engine;
+      mkldnninfo.handler = handler;
+      mkldnninfo.pipeline = &pipeline;
+      mkldnninfo.key_conv_pd = &key_conv_pd;
+      mkldnninfo.key = &key;
+      mkldnninfo.src_memory_p = src_memory_p;
+      mkldnninfo.user_src_memory_p = user_src_memory_p;
+      mkldnninfo.dst_memory_p = dst_memory_p;
+      mkldnninfo.conv_p = conv_p;
+      mkldnninfo.conv_pd = conv_pd;
       if(is_INT8){
-        CreateINT8Primitive(ctx, is_test, dev_ctx, mkldnn_engine, input, //filter,
-                            bias, output,
-                            strides, paddings,
-                            dilations, fuse_relu,
-                            fuse_residual_conn, input_data,
-                            filter_data, src_tz,
-                            weights_tz, g,
-                            dst_tz, key,
-                            dst_memory_p,
-                            pipeline,
-                            key_conv_pd,
-                            src_memory_p,
-                            user_src_memory_p,
-                            conv_p,
-                            conv_pd,
-                            handler,
-                            force_fp32_output);
+        CreateINT8Primitive(ctx, &dev_ctx, input, filter, bias, output, &convinfo, &mkldnninfo);
       }else{
-        CreateFP32Primitive(ctx, is_test, dev_ctx, mkldnn_engine, input, //filter,
-                            bias, output,
-                            strides, paddings,
-                            dilations, fuse_relu,
-                            fuse_residual_conn, input_data,
-                            filter_data, src_tz,
-                            weights_tz, g,
-                            dst_tz, key,
-                            dst_memory_p,
-                            pipeline,
-                            key_conv_pd,
-                            src_memory_p,
-                            user_src_memory_p,
-                            conv_p,
-                            conv_pd,
-                            handler);
+        CreateFP32Primitive(ctx, &dev_ctx, input, filter, bias, output, &convinfo, &mkldnninfo);
       }
+      //src_memory_p = mkldnninfo.src_memory_p;
+      //user_src_memory_p = mkldnninfo.user_src_memory_p;
+      dst_memory_p = mkldnninfo.dst_memory_p;
+      //conv_p = mkldnninfo.conv_p;
     } else {
       auto src_memory_reorder_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(src_reorder_key));
       src_memory_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(src_key));
@@ -267,33 +291,18 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  private:
   void CreateFP32Primitive(
-      paddle::framework::ExecutionContext ctx, bool is_test,
-      const paddle::platform::MKLDNNDeviceContext& dev_ctx,
-      const mkldnn::engine& mkldnn_engine,
-      const paddle::framework::Tensor* input,// const paddle::framework::Tensor* filter,
+      const paddle::framework::ExecutionContext& ctx,
+      const paddle::platform::MKLDNNDeviceContext* dev_ctx,
+      const paddle::framework::Tensor* input, const paddle::framework::Tensor* filter,
       const paddle::framework::Tensor* bias, paddle::framework::Tensor* output,
-      std::vector<int> strides, std::vector<int> paddings,
-      std::vector<int> dilations, bool fuse_relu,
-      bool fuse_residual_conn, const T* input_data,
-      const float* filter_data, std::vector<int> src_tz,
-      std::vector<int> weights_tz, int g,
-      std::vector<int> dst_tz, const std::string key,
-      std::shared_ptr<mkldnn::memory> &dst_memory_p,
-      std::vector<primitive>& pipeline,
-      const std::string &key_conv_pd,
-      std::shared_ptr<mkldnn::memory> src_memory_p,
-      std::shared_ptr<mkldnn::memory> user_src_memory_p,
-      std::shared_ptr<mkldnn::convolution_forward> conv_p,
-      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
-      std::shared_ptr<platform::ConvMKLDNNHandler> handler) const{
-    //const T* input_data = input->data<T>();
+      ConvInfo* convinfo, MkldnnInfo* mkldnninfo) const{
+    const T* input_data = input->data<T>();
+    const float* filter_data = filter->data<float>();
 
     auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+        {*(convinfo->src_tz)}, platform::MKLDNNGetDataType<T>(), input->format());
     auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(),
-        (g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw);
+        {*(convinfo->weights_tz)}, platform::MKLDNNGetDataType<T>(),
+        (convinfo->g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw);
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
@@ -304,46 +313,51 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         platform::data_format_to_memory_format(data_format);
 
     auto src_md = platform::MKLDNNMemDesc(
-        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        *(convinfo->src_tz), platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        *(convinfo->weights_tz), platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
                                // Currently used whenever bias is != nullptr.
     auto dst_md = platform::MKLDNNMemDesc(
-        dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        *(convinfo->dst_tz), platform::MKLDNNGetDataType<T>(), chosen_memory_format);
 
     // create a conv primitive descriptor and save it for usage in backward
     if (bias) {
       bias_tz = paddle::framework::vectorize2int(bias->dims());
       auto bias_md = platform::MKLDNNMemDesc(
           bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
-      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
-                                     strides, paddings, mkldnn_engine,
-                                     fuse_relu, fuse_residual_conn, is_test);
+      mkldnninfo->conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
+                                     *convinfo->strides, *convinfo->paddings,
+                                     *mkldnninfo->mkldnn_engine,
+                                     mkldnninfo->fuse_relu, mkldnninfo->fuse_residual_conn,
+                                     mkldnninfo->is_test);
     } else {
-      conv_pd =
-          ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
-                               mkldnn_engine, fuse_relu, fuse_residual_conn, is_test);
+      mkldnninfo->conv_pd =
+          ConvFwdPrimitiveDesc(src_md, weights_md, dst_md,
+                               *convinfo->strides, *convinfo->paddings,
+                               *mkldnninfo->mkldnn_engine,
+                               mkldnninfo->fuse_relu, mkldnninfo->fuse_residual_conn,
+                               mkldnninfo->is_test);
     }
     // Save conv_pd/src_memory/weights_memory for backward pass
-    dev_ctx.SetBlob(key_conv_pd, conv_pd);
-    handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key));
+    dev_ctx->SetBlob(*mkldnninfo->key_conv_pd, mkldnninfo->conv_pd);
+    mkldnninfo->handler.reset(new platform::ConvMKLDNNHandler(mkldnninfo->conv_pd, *dev_ctx, *mkldnninfo->mkldnn_engine, *mkldnninfo->key));
 
     // create mkldnn memory from input tensors (data/weights)
-    user_src_memory_p =
-        handler->AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
-    auto user_weights_memory_p = handler->AcquireWeightsMemory(
+    mkldnninfo->user_src_memory_p =
+        mkldnninfo->handler->AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
+    auto user_weights_memory_p = mkldnninfo->handler->AcquireWeightsMemory(
         user_weights_md, to_void_cast<float>(filter_data));
 
     // create reorder primitive if the input format is not the preferred one
-    src_memory_p =
-        handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
-    auto weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive(
-        user_weights_memory_p, pipeline, is_test);
+    mkldnninfo->src_memory_p =
+        mkldnninfo->handler->AcquireSrcMemoryFromPrimitive(mkldnninfo->user_src_memory_p, *mkldnninfo->pipeline);
+    auto weights_memory_p = mkldnninfo->handler->AcquireWeightsMemoryFromPrimitive(
+        user_weights_memory_p, *mkldnninfo->pipeline, mkldnninfo->is_test);
 
-    if (fuse_residual_conn) {
+    if (mkldnninfo->fuse_residual_conn) {
       auto residual_param = ctx.Input<Tensor>("ResidualData");
       auto residual_param_data = residual_param->data<T>();
@@ -354,9 +368,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                         "Output and elementwise parameter need to have the "
                         "same dimension sizes");
 
-      if (residual_param->format() != handler->GetDstFormat()) {
+      if (residual_param->format() != mkldnninfo->handler->GetDstFormat()) {
         auto output_data =
-            output->mutable_data<T>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
+            output->mutable_data<T>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, mkldnninfo->handler->GetDstMemorySize());
         auto residual_data_tz =
             paddle::framework::vectorize2int(residual_param->dims());
         auto residual_data_type =
@@ -364,21 +378,21 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         auto user_residual_md = platform::MKLDNNMemDesc(
             residual_data_tz, residual_data_type, residual_param->format());
-        auto user_residual_memory_p = handler->AcquireResidualDataMemory(
+        auto user_residual_memory_p = mkldnninfo->handler->AcquireResidualDataMemory(
             user_residual_md, to_void_cast<T>(residual_param_data));
-        dst_memory_p = handler->AcquireDstMemoryFromResidualDataMemory(
-            user_residual_memory_p, to_void_cast<T>(output_data), pipeline);
+        mkldnninfo->dst_memory_p = mkldnninfo->handler->AcquireDstMemoryFromResidualDataMemory(
+            user_residual_memory_p, to_void_cast<T>(output_data), *mkldnninfo->pipeline);
       } else {
         output->ShareDataWith(*residual_param);
         auto output_data = output->mutable_data<T>(ctx.GetPlace());
-        dst_memory_p =
-            handler->AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
+        mkldnninfo->dst_memory_p =
+            mkldnninfo->handler->AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
       }
     } else {
       auto output_data =
-          output->mutable_data<T>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
-      dst_memory_p =
-          handler->AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
+          output->mutable_data<T>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, mkldnninfo->handler->GetDstMemorySize());
+      mkldnninfo->dst_memory_p =
+          mkldnninfo->handler->AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
     }
 
     // create convolution op primitive
@@ -387,72 +401,41 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       auto user_bias_md = platform::MKLDNNMemDesc(
           {bias_tz}, platform::MKLDNNGetDataType<T>(), memory::format::x);
       auto user_bias_memory_p =
-          handler->AcquireBiasMemory(user_bias_md, to_void_cast<T>(bias_data));
+          mkldnninfo->handler->AcquireBiasMemory(user_bias_md, to_void_cast<T>(bias_data));
       auto bias_memory_p =
-          handler->AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline, is_test);
-      conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
-                                           bias_memory_p, dst_memory_p);
+          mkldnninfo->handler->AcquireBiasMemoryFromPrimitive(user_bias_memory_p, *mkldnninfo->pipeline, mkldnninfo->is_test);
+      mkldnninfo->conv_p = mkldnninfo->handler->AcquireConvolution(
+          mkldnninfo->src_memory_p, weights_memory_p,
+          bias_memory_p, mkldnninfo->dst_memory_p);
     } else {
-      conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
-                                           dst_memory_p);
+      mkldnninfo->conv_p = mkldnninfo->handler->AcquireConvolution(
+          mkldnninfo->src_memory_p, weights_memory_p,
+          mkldnninfo->dst_memory_p);
     }
 
     // push primitive to stream and wait until it's executed
-    pipeline.push_back(*conv_p);
+    mkldnninfo->pipeline->push_back(*mkldnninfo->conv_p);
   };
 
   void CreateINT8Primitive(
-      const paddle::framework::ExecutionContext& ctx, bool is_test,
-      const paddle::platform::MKLDNNDeviceContext & dev_ctx,
-      const mkldnn::engine & mkldnn_engine,
-      const paddle::framework::Tensor* input, //const paddle::framework::Tensor* filter,
+      const paddle::framework::ExecutionContext& ctx,
+      const paddle::platform::MKLDNNDeviceContext* dev_ctx,
+      const paddle::framework::Tensor* input, const paddle::framework::Tensor* filter,
       const paddle::framework::Tensor* bias, paddle::framework::Tensor* output,
-      std::vector<int> strides, std::vector<int> paddings,
-      std::vector<int> dilations, bool fuse_relu,
-      bool fuse_residual_conn, const T* input_data,
-      const float* filter_data, std::vector<int> src_tz,
-      std::vector<int> weights_tz, int g,
-      std::vector<int> dst_tz, const std::string key,
-      std::shared_ptr<mkldnn::memory>& dst_memory_p,
-      std::vector<primitive>& pipeline,
-      const std::string &key_conv_pd,
-      std::shared_ptr<mkldnn::memory> src_memory_p,
-      std::shared_ptr<mkldnn::memory> user_src_memory_p,
-      std::shared_ptr<mkldnn::convolution_forward> conv_p,
-      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
-      std::shared_ptr<platform::ConvMKLDNNHandler> handler,
-      bool force_fp32_output) const {
-    //const T* input_data = input->data<T>();
+      ConvInfo* convinfo, MkldnnInfo* mkldnninfo) const {
+    const T* input_data = input->data<T>();
+    const float* filter_data = filter->data<float>();
 
     bool is_INT8 = true;
     auto scale_in_data = ctx.Attr<float>("Scale_in");
     auto scale_in_eltwise_data = ctx.Attr<float>("Scale_in_eltwise");
     auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
-    auto scale_out_data = force_fp32_output? 1.0f : ctx.Attr<float>("Scale_out");
+    auto scale_out_data = mkldnninfo->force_fp32_output? 1.0f : ctx.Attr<float>("Scale_out");
     bool is_multi_channel = scale_weights_data.size() > 1 ? true : false;
-    auto scale_in_key = key + "@scale_in";
-    auto scale_weights_key = key + "@scale_weights";
-    auto scale_out_key = key + "@scale_out";
-    auto output_shift_scale_key = key + "@output_shift_scale";
-    auto sum_scale_key = key + "@sum_scale";
-    auto scale_in_eltwise_key = key + "@scale_in_eltwise";
-    //std::vector<float> scale_in_data;
-    //std::vector<float> scale_out_data = {1.0f};
-    //std::vector<float> scale_weights_data;
-    //std::vector<float> scale_in_eltwise_data;
     std::vector<float> output_shift_scale;
     float sum_scale = 1.0f;
-    int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1;
-    //scale_in_data = {scale_in};
-    //scale_weights_data.resize(count);
-    //#pragma omp parallel for if (count > 1)
-    //for(int i=0; i<count; i++){
-    //scale_weights_data[i] =*(scale_weights->data<float>() + i);
-    //}
-    //if(!force_fp32_output)
-    //scale_out_data = {*(scale_out->data<float>())};
+    int count = is_multi_channel? (convinfo->g>1? (*convinfo->weights_tz)[1] * (*convinfo->weights_tz)[0] : (*convinfo->weights_tz)[0]) : 1;
     output_shift_scale.resize(count);
     #pragma omp parallel for if (count > 1)
     for(int i=0; i<count; i++){
@@ -461,16 +444,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       else
         output_shift_scale[i] = scale_out_data / (scale_in_data * scale_weights_data[i]);
     }
-    if(fuse_residual_conn){
-      //scale_in_eltwise_data = {*(scale_in_eltwise->data<float>())};
+    if(mkldnninfo->fuse_residual_conn){
       sum_scale = scale_out_data / scale_in_eltwise_data;
     }
 
     auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, paddle::framework::ToMKLDNNDataType(input->type()), input->format());
+        {*convinfo->src_tz}, paddle::framework::ToMKLDNNDataType(input->type()), input->format());
     auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<float>(),
-        (g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw);
+        {*convinfo->weights_tz}, platform::MKLDNNGetDataType<float>(),
+        ((convinfo->g) == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw);
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
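
The INT8 path above derives its requantization factors from the calibration attributes: for each output channel i, output_shift_scale[i] = Scale_out / (Scale_in * Scale_weights[i]) maps the int32 accumulator back into the output's quantized domain, and when a residual connection is fused, sum_scale = Scale_out / Scale_in_eltwise rescales the residual input before accumulation. A small worked sketch with made-up scale values:

#include <cstdio>
#include <vector>

int main() {
  // Made-up calibration scales, purely illustrative.
  float scale_in = 50.0f;                               // Scale_in
  std::vector<float> scale_weights = {100.0f, 200.0f};  // per-channel Scale_weights
  float scale_out = 25.0f;                              // Scale_out
  float scale_in_eltwise = 30.0f;                       // Scale_in_eltwise

  // The int32 accumulator carries scale_in * scale_weights[i]; multiplying by
  // output_shift_scale[i] lands the result in the output's quantized domain.
  std::vector<float> output_shift_scale(scale_weights.size());
  for (std::size_t i = 0; i < scale_weights.size(); ++i)
    output_shift_scale[i] = scale_out / (scale_in * scale_weights[i]);

  // Residual data arrives in its own domain and is rescaled before the sum.
  float sum_scale = scale_out / scale_in_eltwise;

  std::printf("shift[0]=%g shift[1]=%g sum=%g\n",
              output_shift_scale[0], output_shift_scale[1], sum_scale);
}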
@@ -483,123 +465,123 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto bias_tz = paddle::framework::vectorize2int(bias->dims());
 
     auto src_md = platform::MKLDNNMemDesc(
-        src_tz, memory::data_type::u8, chosen_memory_format);
+        *convinfo->src_tz, memory::data_type::u8, chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, memory::data_type::s8, chosen_memory_format);
+        *convinfo->weights_tz, memory::data_type::s8, chosen_memory_format);
 
-    auto dst_dt = fuse_relu?
+    auto dst_dt = mkldnninfo->fuse_relu?
         paddle::framework::ToMKLDNNDataType(std::type_index(typeid(unsigned char)))
         : paddle::framework::ToMKLDNNDataType(std::type_index(typeid(signed char)));
 
-    if(force_fp32_output){
+    if(mkldnninfo->force_fp32_output){
       dst_dt = paddle::framework::ToMKLDNNDataType(std::type_index(typeid(float)));
     }
 
-    if(fuse_residual_conn){
+    if(mkldnninfo->fuse_residual_conn){
       auto residual = ctx.Input<Tensor>("ResidualData");
       auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type());
       if(dst_dt != residual_dt)
         dst_dt = residual_dt;
     }
 
-    auto dst_md = platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format);
+    auto dst_md = platform::MKLDNNMemDesc(*convinfo->dst_tz, dst_dt, chosen_memory_format);
 
     // create a conv primitive descriptor and save it for usage in backward
     if (bias) {
       auto bias_md = platform::MKLDNNMemDesc(
           bias_tz, memory::data_type::s32, memory::format::x);
-      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
-                                     strides, paddings, mkldnn_engine,
-                                     fuse_relu, fuse_residual_conn,
-                                     output_shift_scale, sum_scale, is_test);
+      mkldnninfo->conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
+                                     *convinfo->strides, *convinfo->paddings, *mkldnninfo->mkldnn_engine,
+                                     mkldnninfo->fuse_relu, mkldnninfo->fuse_residual_conn,
+                                     output_shift_scale, sum_scale, mkldnninfo->is_test);
     } else {
-      conv_pd =
-          ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
-                               mkldnn_engine, fuse_relu, fuse_residual_conn,
-                               output_shift_scale, sum_scale, is_test);
+      mkldnninfo->conv_pd =
+          ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, *convinfo->strides, *convinfo->paddings,
+                               *mkldnninfo->mkldnn_engine, mkldnninfo->fuse_relu, mkldnninfo->fuse_residual_conn,
+                               output_shift_scale, sum_scale, mkldnninfo->is_test);
     }
     // Save conv_pd/src_memory/weights_memory for backward pass
-    dev_ctx.SetBlob(key_conv_pd, conv_pd);
-    handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key));
+    dev_ctx->SetBlob(*mkldnninfo->key_conv_pd, mkldnninfo->conv_pd);
+    mkldnninfo->handler.reset(new platform::ConvMKLDNNHandler(mkldnninfo->conv_pd, *dev_ctx, *mkldnninfo->mkldnn_engine, *mkldnninfo->key));
 
     // create mkldnn memory from input tensors (data/weights)
-    user_src_memory_p =
-        handler->AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
-    auto user_weights_memory_p = handler->AcquireWeightsMemory(
+    mkldnninfo->user_src_memory_p =
+        mkldnninfo->handler->AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
+    auto user_weights_memory_p = mkldnninfo->handler->AcquireWeightsMemory(
         user_weights_md, to_void_cast<float>(filter_data));
 
     // create reorder primitive if the input format is not the preferred one
-    src_memory_p =
-        handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
+    mkldnninfo->src_memory_p =
+        mkldnninfo->handler->AcquireSrcMemoryFromPrimitive(mkldnninfo->user_src_memory_p, *mkldnninfo->pipeline);
 
     std::shared_ptr<mkldnn::memory> weights_memory_p;
-    int mask_reorder = is_multi_channel? ((g!= 1) ? (1<<1)+(1<<0) : 1<<0) : 0;
-    weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive(
-        user_weights_memory_p, pipeline, is_test, is_INT8, scale_weights_data, mask_reorder);
+    int mask_reorder = is_multi_channel? ((convinfo->g!= 1) ? (1<<1)+(1<<0) : 1<<0) : 0;
+    weights_memory_p = mkldnninfo->handler->AcquireWeightsMemoryFromPrimitive(
+        user_weights_memory_p, *mkldnninfo->pipeline, mkldnninfo->is_test, is_INT8, scale_weights_data, mask_reorder);
 
-    if(fuse_residual_conn) {
+    if(mkldnninfo->fuse_residual_conn) {
       auto residual_param = ctx.Input<Tensor>("ResidualData");
       PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
                         "Output and elementwise parameter need to have the "
                         "same dimension sizes");
       auto residual_dt = paddle::framework::ToMKLDNNDataType(residual_param->type());
-      PADDLE_ENFORCE_EQ(residual_param->format(), handler->GetDstFormat(),
+      PADDLE_ENFORCE_EQ(residual_param->format(), mkldnninfo->handler->GetDstFormat(),
                         "Conv input dimension and filter dimension should be the same.");
       output->ShareDataWith(*residual_param);
       if(residual_dt == mkldnn::memory::data_type::u8){
         uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
-        dst_memory_p =
-            handler->AcquireDstMemoryFromPrimitive(to_void_cast<uint8_t>(output_data));
+        mkldnninfo->dst_memory_p =
+            mkldnninfo->handler->AcquireDstMemoryFromPrimitive(to_void_cast<uint8_t>(output_data));
       } else{
         int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace());
-        dst_memory_p =
-            handler->AcquireDstMemoryFromPrimitive(to_void_cast<int8_t>(output_data));
+        mkldnninfo->dst_memory_p =
+            mkldnninfo->handler->AcquireDstMemoryFromPrimitive(to_void_cast<int8_t>(output_data));
       }
-    } else if(!force_fp32_output){
-      if(fuse_relu){
-        uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
-        dst_memory_p =
-            handler->AcquireDstMemoryFromPrimitive(to_void_cast<uint8_t>(output_data));
+    } else if(!mkldnninfo->force_fp32_output){
+      if(mkldnninfo->fuse_relu){
+        uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, mkldnninfo->handler->GetDstMemorySize());
+        mkldnninfo->dst_memory_p =
+            mkldnninfo->handler->AcquireDstMemoryFromPrimitive(to_void_cast<uint8_t>(output_data));
       } else{
-        int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
-        dst_memory_p =
-            handler->AcquireDstMemoryFromPrimitive(to_void_cast<int8_t>(output_data));
+        int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, mkldnninfo->handler->GetDstMemorySize());
+        mkldnninfo->dst_memory_p =
+            mkldnninfo->handler->AcquireDstMemoryFromPrimitive(to_void_cast<int8_t>(output_data));
       }
     } else {
-      float* output_data = output->mutable_data<float>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
-      dst_memory_p =
-          handler->AcquireDstMemoryFromPrimitive(to_void_cast<float>(output_data));
+      float* output_data = output->mutable_data<float>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, mkldnninfo->handler->GetDstMemorySize());
+      mkldnninfo->dst_memory_p =
+          mkldnninfo->handler->AcquireDstMemoryFromPrimitive(to_void_cast<float>(output_data));
     }
 
     // create convolution op primitive
     std::vector<float> scale_bias_data;
-    auto scale_bias_key = key + "@scale_bias";
+    auto scale_bias_key = *mkldnninfo->key + "@scale_bias";
     if (bias) {
       const float* bias_data = bias->data<float>();
       auto user_bias_md = platform::MKLDNNMemDesc(
           {bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x);
       auto user_bias_memory_p =
-          handler->AcquireBiasMemory(user_bias_md, to_void_cast<float>(bias_data));
+          mkldnninfo->handler->AcquireBiasMemory(user_bias_md, to_void_cast<float>(bias_data));
       std::shared_ptr<mkldnn::memory> bias_memory_p;
       int mask_reorder = is_multi_channel? 1<<0 : 1;
-      int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1;
+      int count = is_multi_channel? (convinfo->g>1? (*convinfo->weights_tz)[1] * (*convinfo->weights_tz)[0] : (*convinfo->weights_tz)[0]) : 1;
       scale_bias_data.resize(count);
       #pragma omp parallel for if (count > 1)
       for(int i=0; i<count; i++){
         scale_bias_data[i] = scale_in_data * scale_weights_data[i];
       }
       bias_memory_p =
-          handler->AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline, is_test, is_INT8, scale_bias_data, mask_reorder);
-      conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
-                                           bias_memory_p, dst_memory_p);
+          mkldnninfo->handler->AcquireBiasMemoryFromPrimitive(user_bias_memory_p, *mkldnninfo->pipeline, mkldnninfo->is_test, is_INT8, scale_bias_data, mask_reorder);
+      mkldnninfo->conv_p = mkldnninfo->handler->AcquireConvolution(mkldnninfo->src_memory_p, weights_memory_p,
+                                           bias_memory_p, mkldnninfo->dst_memory_p);
     } else {
-      conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
-                                           dst_memory_p);
+      mkldnninfo->conv_p = mkldnninfo->handler->AcquireConvolution(mkldnninfo->src_memory_p, weights_memory_p,
+                                           mkldnninfo->dst_memory_p);
     }
 
     // push primitive to stream and wait until it's executed
-    pipeline.push_back(*conv_p);
+    mkldnninfo->pipeline->push_back(*mkldnninfo->conv_p);
   };
 
   void AppendKey(std::string& key, mkldnn::memory::dims& input_dims,  // NOLINT
......
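
The cache these helpers populate is the device context's blob map: each MKL-DNN object is stored as a type-erased shared_ptr under a string key derived from the operator key (key + "@conv_p", "@dst_mem_p", "@src_mem_preorder_p", and so on), and Compute() tries GetBlob before creating anything. A minimal sketch of that lookup-or-create pattern, using a plain std::map as an assumed stand-in for the real MKLDNNDeviceContext storage:

#include <map>
#include <memory>
#include <string>

// Assumed stand-in for MKLDNNDeviceContext's blob storage; the real class
// exposes SetBlob/GetBlob with this shape, but the map here is illustrative.
struct DeviceCtx {
  std::map<std::string, std::shared_ptr<void>> blobs;
  void SetBlob(const std::string& k, std::shared_ptr<void> v) { blobs[k] = v; }
  std::shared_ptr<void> GetBlob(const std::string& k) {
    auto it = blobs.find(k);
    return it == blobs.end() ? nullptr : it->second;
  }
};

struct ConvPrimitive {  // stand-in for mkldnn::convolution_forward
  int id;
};

// Lookup-or-create, mirroring Compute(): cast the blob back to its concrete
// type; on a miss, build the primitive once and publish it under the key.
std::shared_ptr<ConvPrimitive> AcquireConv(DeviceCtx* ctx, const std::string& key) {
  auto prim_key = key + "@conv_p";
  auto conv_p = std::static_pointer_cast<ConvPrimitive>(ctx->GetBlob(prim_key));
  if (conv_p == nullptr) {
    conv_p = std::make_shared<ConvPrimitive>(ConvPrimitive{42});  // expensive build
    ctx->SetBlob(prim_key, conv_p);
  }
  return conv_p;
}

int main() {
  DeviceCtx ctx;
  auto first = AcquireConv(&ctx, "conv2d_k3s1");   // miss: builds and caches
  auto second = AcquireConv(&ctx, "conv2d_k3s1");  // hit: returns the cached object
  return first == second ? 0 : 1;
}

On a cache hit the whole creation path (memory descriptors, reorders, primitive_desc) is skipped, which is what makes extracting the FP32/INT8 creation code into separate functions worthwhile.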