diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index d6d8dafa6eb8adb9359e97c912f520334357764b..fb55e8e5b242f70a7204d1e9fca359e726eb88d8 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -132,6 +132,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::shared_ptr user_src_memory_p; std::shared_ptr dst_memory_p; std::vector pipeline; + std::shared_ptr conv_pd; + std::shared_ptr handler; auto prim_key = key + "@conv_p"; auto dst_key = key + "@dst_mem_p"; @@ -139,144 +141,62 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto user_src_key = key + "@user_src_mem_p"; auto src_reorder_key = key + "@src_mem_p" + "reorder_p"; conv_p = std::static_pointer_cast(dev_ctx.GetBlob(prim_key)); - auto src_memory_reorder_p = std::static_pointer_cast(dev_ctx.GetBlob(src_reorder_key)); - src_memory_p = std::static_pointer_cast(dev_ctx.GetBlob(src_key)); - if(src_memory_reorder_p){ - user_src_memory_p = std::static_pointer_cast(dev_ctx.GetBlob(user_src_key)); - user_src_memory_p->set_data_handle(to_void_cast(input_data)); - } else if(src_memory_p){ - src_memory_p->set_data_handle(to_void_cast(input_data)); - } - - dst_memory_p = std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); - - std::shared_ptr conv_pd; - conv_pd = std::static_pointer_cast(dev_ctx.GetBlob(key_conv_pd)); - std::shared_ptr handler; - if(conv_pd){ - handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key)); - } - if (!is_INT8 && dst_memory_p){ - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - auto residual_param_data = residual_param->data(); - if (residual_param->format() != handler->GetDstFormat()) { - auto output_data = - output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); - auto residual_data_tz = - paddle::framework::vectorize2int(residual_param->dims()); - auto residual_data_type = - paddle::framework::ToMKLDNNDataType(residual_param->type()); - - auto user_residual_md = platform::MKLDNNMemDesc( - residual_data_tz, residual_data_type, residual_param->format()); - auto user_residual_memory_p = handler->AcquireResidualDataMemory( - user_residual_md, to_void_cast(residual_param_data)); - dst_memory_p = handler->AcquireDstMemoryFromResidualDataMemory( - user_residual_memory_p, to_void_cast(output_data), pipeline); - } else { - output->ShareDataWith(*residual_param); - auto output_data = output->mutable_data(ctx.GetPlace()); - dst_memory_p->set_data_handle(to_void_cast(output_data)); - } - } else { - auto output_data = - output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); - dst_memory_p->set_data_handle(to_void_cast(output_data)); + if(conv_p == nullptr){ + if(is_INT8){ + CreateINT8Primitive(ctx, is_test, dev_ctx, mkldnn_engine, input, //filter, + bias, output, + strides, paddings, + dilations, fuse_relu, + fuse_residual_conn, input_data, + filter_data, src_tz, + weights_tz, g, + dst_tz, key, + dst_memory_p, + pipeline, + key_conv_pd, + src_memory_p, + user_src_memory_p, + conv_p, + conv_pd, + handler, + force_fp32_output); + }else{ + CreateFP32Primitive(ctx, is_test, dev_ctx, mkldnn_engine, input, //filter, + bias, output, + strides, paddings, + dilations, fuse_relu, + fuse_residual_conn, input_data, + filter_data, src_tz, + weights_tz, g, + dst_tz, key, + dst_memory_p, + pipeline, + key_conv_pd, + src_memory_p, + user_src_memory_p, + conv_p, + conv_pd, + handler); } - } else if(is_INT8 && dst_memory_p){ - if(fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - auto residual_dt = paddle::framework::ToMKLDNNDataType(residual_param->type()); - output->ShareDataWith(*residual_param); - if(residual_dt == mkldnn::memory::data_type::u8){ - uint8_t* output_data = output->mutable_data(ctx.GetPlace()); - dst_memory_p->set_data_handle(to_void_cast(output_data)); - } else{ - int8_t* output_data = output->mutable_data(ctx.GetPlace()); - dst_memory_p->set_data_handle(to_void_cast(output_data)); - } - } else if(!force_fp32_output){ - if(fuse_relu){ - uint8_t* output_data = output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); - dst_memory_p->set_data_handle(to_void_cast(output_data)); - } else{ - int8_t* output_data = output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); - dst_memory_p->set_data_handle(to_void_cast(output_data)); - } - } else { - float* output_data = output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); - dst_memory_p->set_data_handle(to_void_cast(output_data)); + } else { + auto src_memory_reorder_p = std::static_pointer_cast(dev_ctx.GetBlob(src_reorder_key)); + src_memory_p = std::static_pointer_cast(dev_ctx.GetBlob(src_key)); + if(src_memory_reorder_p){ + user_src_memory_p = std::static_pointer_cast(dev_ctx.GetBlob(user_src_key)); + user_src_memory_p->set_data_handle(to_void_cast(input_data)); + } else if(src_memory_p){ + src_memory_p->set_data_handle(to_void_cast(input_data)); } - } - - if(!is_INT8){ - if(conv_p == nullptr){ - auto user_src_md = platform::MKLDNNMemDesc( - {src_tz}, platform::MKLDNNGetDataType(), input->format()); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), - (g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw); - - /* create memory descriptor for convolution without specified format - * ('any') which lets a primitive (convolution in this case) choose - * the memory format preferred for best performance - */ - std::string data_format = ctx.Attr("data_format"); - auto chosen_memory_format = - platform::data_format_to_memory_format(data_format); - - auto src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. - // Currently used whenever bias is != nullptr. - - auto dst_md = platform::MKLDNNMemDesc( - dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - - // create a conv primitive descriptor and save it for usage in backward - if (bias) { - bias_tz = paddle::framework::vectorize2int(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc( - bias_tz, platform::MKLDNNGetDataType(), memory::format::x); - conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, - strides, paddings, mkldnn_engine, - fuse_relu, fuse_residual_conn, is_test); - } else { - conv_pd = - ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, - mkldnn_engine, fuse_relu, fuse_residual_conn, is_test); - } - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx.SetBlob(key_conv_pd, conv_pd); - + + dst_memory_p = std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); + conv_pd = std::static_pointer_cast(dev_ctx.GetBlob(key_conv_pd)); + if(conv_pd){ handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key)); - - // create mkldnn memory from input tensors (data/weights) - user_src_memory_p = - handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); - auto user_weights_memory_p = handler->AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); - - // create reorder primitive if the input format is not the preferred one - src_memory_p = - handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); - auto weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline, is_test); - + } + if (!is_INT8){ if (fuse_residual_conn) { auto residual_param = ctx.Input("ResidualData"); auto residual_param_data = residual_param->data(); - - PADDLE_ENFORCE( - residual_param_data != nullptr, - "Provide data if you want MKLDNN conv+elementwise_add fusion"); - PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(), - "Output and elementwise parameter need to have the " - "same dimension sizes"); - if (residual_param->format() != handler->GetDstFormat()) { auto output_data = output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); @@ -284,7 +204,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(residual_param->dims()); auto residual_data_type = paddle::framework::ToMKLDNNDataType(residual_param->type()); - + auto user_residual_md = platform::MKLDNNMemDesc( residual_data_tz, residual_data_type, residual_param->format()); auto user_residual_memory_p = handler->AcquireResidualDataMemory( @@ -294,254 +214,394 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } else { output->ShareDataWith(*residual_param); auto output_data = output->mutable_data(ctx.GetPlace()); - dst_memory_p = - handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + dst_memory_p->set_data_handle(to_void_cast(output_data)); } } else { auto output_data = output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); - dst_memory_p = - handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); - } - - // create convolution op primitive - if (bias) { - const T* bias_data = bias->data(); - auto user_bias_md = platform::MKLDNNMemDesc( - {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); - auto user_bias_memory_p = - handler->AcquireBiasMemory(user_bias_md, to_void_cast(bias_data)); - - auto bias_memory_p = - handler->AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline, is_test); - conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, - bias_memory_p, dst_memory_p); - } else { - conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, - dst_memory_p); - } - // push primitive to stream and wait until it's executed - pipeline.push_back(*conv_p); - stream(stream::kind::eager).submit(pipeline).wait(); - - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); - } else { - if(src_memory_reorder_p){ - pipeline.push_back(*src_memory_reorder_p); - } - pipeline.push_back(*conv_p); - stream(stream::kind::eager).submit(pipeline).wait(); - - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); - } - } else{ - if(conv_p == nullptr){ - auto* scale_in = ctx.HasInput("Scale_in") ? ctx.Input("Scale_in") : nullptr; - auto* scale_in_eltwise = ctx.HasInput("Scale_in_eltwise")? ctx.Input("Scale_in_eltwise") : nullptr; - auto* scale_weights = ctx.HasInput("Scale_weights")? ctx.Input("Scale_weights") : nullptr; - auto* scale_out = ctx.HasInput("Scale_out")? ctx.Input("Scale_out") : nullptr; - - bool is_multi_channel = (scale_weights->memory_size() > 1) ? true : false; - - auto scale_in_key = key + "@scale_in"; - auto scale_weights_key = key + "@scale_weights"; - auto scale_out_key = key + "@scale_out"; - auto output_shift_scale_key = key + "@output_shift_scale"; - auto sum_scale_key = key + "@sum_scale"; - auto scale_in_eltwise_key = key + "@scale_in_eltwise"; - std::vector scale_in_data; - std::vector scale_out_data = {1.0f}; - std::vector scale_weights_data; - std::vector scale_in_eltwise_data; - std::vector output_shift_scale; - std::vector sum_scale = {1.0f}; - std::vector none_scale = {0}; - - int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1; - scale_in_data = {*(scale_in->data())}; - scale_weights_data.resize(count); - #pragma omp parallel for if (count > 1) - for(int i=0; idata() + i); - } - if(!force_fp32_output) - scale_out_data = {*(scale_out->data())}; - output_shift_scale.resize(count); - #pragma omp parallel for if (count > 1) - for(int i=0; idata())}; - sum_scale[0] = scale_out_data[0] / scale_in_eltwise_data[0]; + dst_memory_p->set_data_handle(to_void_cast(output_data)); } - - std::vector pipeline; - - auto user_src_md = platform::MKLDNNMemDesc( - {src_tz}, paddle::framework::ToMKLDNNDataType(input->type()), input->format()); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), - (g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw); - - /* create memory descriptor for convolution without specified format - * ('any') which lets a primitive (convolution in this case) choose - * the memory format preferred for best performance - */ - std::string data_format = ctx.Attr("data_format"); - auto chosen_memory_format = - platform::data_format_to_memory_format(data_format); - - auto bias_tz = paddle::framework::vectorize2int(bias->dims()); - - auto src_md = platform::MKLDNNMemDesc( - src_tz, memory::data_type::u8, chosen_memory_format); - auto weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::s8, chosen_memory_format); - - auto dst_dt = fuse_relu? - paddle::framework::ToMKLDNNDataType(std::type_index(typeid(unsigned char))) - : paddle::framework::ToMKLDNNDataType(std::type_index(typeid(signed char))); - - if(force_fp32_output){ - dst_dt = paddle::framework::ToMKLDNNDataType(std::type_index(typeid(float))); - } - - if(fuse_residual_conn){ - auto residual = ctx.Input("ResidualData"); - auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type()); - if(dst_dt != residual_dt) - dst_dt = residual_dt; - } - auto dst_md = platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); - - // create a conv primitive descriptor and save it for usage in backward - if (bias) { - auto bias_md = platform::MKLDNNMemDesc( - bias_tz, memory::data_type::s32, memory::format::x); - conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, - strides, paddings, mkldnn_engine, - fuse_relu, fuse_residual_conn, - output_shift_scale, sum_scale[0], is_test); - } else { - conv_pd = - ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, - mkldnn_engine, fuse_relu, fuse_residual_conn, - output_shift_scale, sum_scale[0], is_test); - } - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx.SetBlob(key_conv_pd, conv_pd); - - handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key)); - - // create mkldnn memory from input tensors (data/weights) - user_src_memory_p = - handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); - auto user_weights_memory_p = handler->AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); - - // create reorder primitive if the input format is not the preferred one - src_memory_p = - handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); - - std::shared_ptr weights_memory_p; - int mask_reorder = is_multi_channel? ((g!= 1) ? (1<<1)+(1<<0) : 1<<0) : 0; - weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline, is_test, is_INT8, scale_weights_data, mask_reorder); - + } else if(is_INT8){ if(fuse_residual_conn) { auto residual_param = ctx.Input("ResidualData"); - PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(), - "Output and elementwise parameter need to have the " - "same dimension sizes"); auto residual_dt = paddle::framework::ToMKLDNNDataType(residual_param->type()); - PADDLE_ENFORCE_EQ(residual_param->format(), handler->GetDstFormat(), - "Conv input dimension and filter dimension should be the same."); output->ShareDataWith(*residual_param); if(residual_dt == mkldnn::memory::data_type::u8){ uint8_t* output_data = output->mutable_data(ctx.GetPlace()); - dst_memory_p = - handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + dst_memory_p->set_data_handle(to_void_cast(output_data)); } else{ int8_t* output_data = output->mutable_data(ctx.GetPlace()); - dst_memory_p = - handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + dst_memory_p->set_data_handle(to_void_cast(output_data)); } } else if(!force_fp32_output){ if(fuse_relu){ uint8_t* output_data = output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); - dst_memory_p = - handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + dst_memory_p->set_data_handle(to_void_cast(output_data)); } else{ int8_t* output_data = output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); - dst_memory_p = - handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + dst_memory_p->set_data_handle(to_void_cast(output_data)); } } else { float* output_data = output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); - dst_memory_p = - handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + dst_memory_p->set_data_handle(to_void_cast(output_data)); } + } - // create convolution op primitive - std::vector scale_bias_data; - auto scale_bias_key = key + "@scale_bias"; - if (bias) { - const float* bias_data = bias->data(); - auto user_bias_md = platform::MKLDNNMemDesc( - {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); - auto user_bias_memory_p = - handler->AcquireBiasMemory(user_bias_md, to_void_cast(bias_data)); - std::shared_ptr bias_memory_p; - int mask_reorder = is_multi_channel? 1<<0 : 1; - int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1; - scale_bias_data.resize(count); - #pragma omp parallel for if (count > 1) - for(int i=0; iAcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline, is_test, is_INT8, scale_bias_data, mask_reorder); - conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, - bias_memory_p, dst_memory_p); + if(src_memory_reorder_p){ + pipeline.push_back(*src_memory_reorder_p); + } + pipeline.push_back(*conv_p); + } + + // push primitive to stream and wait until it's executed + //pipeline.push_back(*conv_p); + stream(stream::kind::eager).submit(pipeline).wait(); + + if (need_s8_to_u8) { + output->mutable_data(ctx.GetPlace()); + } + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(*dst_memory_p)); + }; + + private: + void CreateFP32Primitive( + paddle::framework::ExecutionContext ctx, bool is_test, + const paddle::platform::MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine& mkldnn_engine, + const paddle::framework::Tensor* input,// const paddle::framework::Tensor* filter, + const paddle::framework::Tensor* bias, paddle::framework::Tensor* output, + std::vector strides, std::vector paddings, + std::vector dilations, bool fuse_relu, + bool fuse_residual_conn, const T* input_data, + const float* filter_data, std::vector src_tz, + std::vector weights_tz, int g, + std::vector dst_tz, const std::string key, + std::shared_ptr &dst_memory_p, + std::vector& pipeline, + const std::string &key_conv_pd, + std::shared_ptr src_memory_p, + std::shared_ptr user_src_memory_p, + std::shared_ptr conv_p, + std::shared_ptr conv_pd, + std::shared_ptr handler) const{ + + //const T* input_data = input->data(); + + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), + (g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. + // Currently used whenever bias is != nullptr. + + auto dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + + // create a conv primitive descriptor and save it for usage in backward + if (bias) { + bias_tz = paddle::framework::vectorize2int(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc( + bias_tz, platform::MKLDNNGetDataType(), memory::format::x); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine, + fuse_relu, fuse_residual_conn, is_test); + } else { + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine, fuse_relu, fuse_residual_conn, is_test); + } + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key)); + + // create mkldnn memory from input tensors (data/weights) + user_src_memory_p = + handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler->AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); + + // create reorder primitive if the input format is not the preferred one + src_memory_p = + handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); + auto weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline, is_test); + + if (fuse_residual_conn) { + auto residual_param = ctx.Input("ResidualData"); + auto residual_param_data = residual_param->data(); + + PADDLE_ENFORCE( + residual_param_data != nullptr, + "Provide data if you want MKLDNN conv+elementwise_add fusion"); + PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(), + "Output and elementwise parameter need to have the " + "same dimension sizes"); + + if (residual_param->format() != handler->GetDstFormat()) { + auto output_data = + output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); + auto residual_data_tz = + paddle::framework::vectorize2int(residual_param->dims()); + auto residual_data_type = + paddle::framework::ToMKLDNNDataType(residual_param->type()); + + auto user_residual_md = platform::MKLDNNMemDesc( + residual_data_tz, residual_data_type, residual_param->format()); + auto user_residual_memory_p = handler->AcquireResidualDataMemory( + user_residual_md, to_void_cast(residual_param_data)); + dst_memory_p = handler->AcquireDstMemoryFromResidualDataMemory( + user_residual_memory_p, to_void_cast(output_data), pipeline); } else { - conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, - dst_memory_p); + output->ShareDataWith(*residual_param); + auto output_data = output->mutable_data(ctx.GetPlace()); + dst_memory_p = + handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); } + } else { + auto output_data = + output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); + dst_memory_p = + handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + } - // push primitive to stream and wait until it's executed - pipeline.push_back(*conv_p); - stream(stream::kind::eager).submit(pipeline).wait(); + // create convolution op primitive + if (bias) { + const T* bias_data = bias->data(); + auto user_bias_md = platform::MKLDNNMemDesc( + {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); + auto user_bias_memory_p = + handler->AcquireBiasMemory(user_bias_md, to_void_cast(bias_data)); + + auto bias_memory_p = + handler->AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline, is_test); + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + bias_memory_p, dst_memory_p); + } else { + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + dst_memory_p); + } + // push primitive to stream and wait until it's executed + pipeline.push_back(*conv_p); + }; + + void CreateINT8Primitive( + const paddle::framework::ExecutionContext& ctx, bool is_test, + const paddle::platform::MKLDNNDeviceContext & dev_ctx, + const mkldnn::engine & mkldnn_engine, + const paddle::framework::Tensor* input, //const paddle::framework::Tensor* filter, + const paddle::framework::Tensor* bias, paddle::framework::Tensor* output, + std::vector strides, std::vector paddings, + std::vector dilations, bool fuse_relu, + bool fuse_residual_conn, const T* input_data, + const float* filter_data, std::vector src_tz, + std::vector weights_tz, int g, + std::vector dst_tz, const std::string key, + std::shared_ptr& dst_memory_p, + std::vector& pipeline, + const std::string &key_conv_pd, + std::shared_ptr src_memory_p, + std::shared_ptr user_src_memory_p, + std::shared_ptr conv_p, + std::shared_ptr conv_pd, + std::shared_ptr handler, + bool force_fp32_output) const { + //const T* input_data = input->data(); + bool is_INT8 = true; + auto scale_in_data = ctx.Attr("Scale_in"); + auto scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + auto scale_out_data = force_fp32_output? 1.0f : ctx.Attr("Scale_out"); + + bool is_multi_channel = scale_weights_data.size() > 1 ? true : false; + + auto scale_in_key = key + "@scale_in"; + auto scale_weights_key = key + "@scale_weights"; + auto scale_out_key = key + "@scale_out"; + auto output_shift_scale_key = key + "@output_shift_scale"; + auto sum_scale_key = key + "@sum_scale"; + auto scale_in_eltwise_key = key + "@scale_in_eltwise"; + //std::vector scale_in_data; + //std::vector scale_out_data = {1.0f}; + //std::vector scale_weights_data; + //std::vector scale_in_eltwise_data; + std::vector output_shift_scale; + float sum_scale = 1.0f; + + int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1; + //scale_in_data = {scale_in}; + //scale_weights_data.resize(count); + //#pragma omp parallel for if (count > 1) + //for(int i=0; idata() + i); + //} + //if(!force_fp32_output) + //scale_out_data = {*(scale_out->data())}; + output_shift_scale.resize(count); + #pragma omp parallel for if (count > 1) + for(int i=0; idata())}; + sum_scale = scale_out_data / scale_in_eltwise_data; + } - if(need_s8_to_u8){ - output->mutable_data(ctx.GetPlace()); - } + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, paddle::framework::ToMKLDNNDataType(input->type()), input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), + (g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + + auto bias_tz = paddle::framework::vectorize2int(bias->dims()); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + auto src_md = platform::MKLDNNMemDesc( + src_tz, memory::data_type::u8, chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::s8, chosen_memory_format); + + auto dst_dt = fuse_relu? + paddle::framework::ToMKLDNNDataType(std::type_index(typeid(unsigned char))) + : paddle::framework::ToMKLDNNDataType(std::type_index(typeid(signed char))); + + if(force_fp32_output){ + dst_dt = paddle::framework::ToMKLDNNDataType(std::type_index(typeid(float))); + } + + if(fuse_residual_conn){ + auto residual = ctx.Input("ResidualData"); + auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type()); + if(dst_dt != residual_dt) + dst_dt = residual_dt; + } + auto dst_md = platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); + + // create a conv primitive descriptor and save it for usage in backward + if (bias) { + auto bias_md = platform::MKLDNNMemDesc( + bias_tz, memory::data_type::s32, memory::format::x); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine, + fuse_relu, fuse_residual_conn, + output_shift_scale, sum_scale, is_test); } else { - if(src_memory_reorder_p){ - pipeline.push_back(*src_memory_reorder_p); + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine, fuse_relu, fuse_residual_conn, + output_shift_scale, sum_scale, is_test); + } + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key)); + + // create mkldnn memory from input tensors (data/weights) + user_src_memory_p = + handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler->AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); + + // create reorder primitive if the input format is not the preferred one + src_memory_p = + handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); + + std::shared_ptr weights_memory_p; + int mask_reorder = is_multi_channel? ((g!= 1) ? (1<<1)+(1<<0) : 1<<0) : 0; + weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline, is_test, is_INT8, scale_weights_data, mask_reorder); + + if(fuse_residual_conn) { + auto residual_param = ctx.Input("ResidualData"); + PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(), + "Output and elementwise parameter need to have the " + "same dimension sizes"); + auto residual_dt = paddle::framework::ToMKLDNNDataType(residual_param->type()); + PADDLE_ENFORCE_EQ(residual_param->format(), handler->GetDstFormat(), + "Conv input dimension and filter dimension should be the same."); + output->ShareDataWith(*residual_param); + if(residual_dt == mkldnn::memory::data_type::u8){ + uint8_t* output_data = output->mutable_data(ctx.GetPlace()); + dst_memory_p = + handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + } else{ + int8_t* output_data = output->mutable_data(ctx.GetPlace()); + dst_memory_p = + handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); } - pipeline.push_back(*conv_p); - stream(stream::kind::eager).submit(pipeline).wait(); - - if (need_s8_to_u8) { - output->mutable_data(ctx.GetPlace()); + } else if(!force_fp32_output){ + if(fuse_relu){ + uint8_t* output_data = output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); + dst_memory_p = + handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + } else{ + int8_t* output_data = output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); + dst_memory_p = + handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); } + } else { + float* output_data = output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); + dst_memory_p = + handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + } - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + // create convolution op primitive + std::vector scale_bias_data; + auto scale_bias_key = key + "@scale_bias"; + if (bias) { + const float* bias_data = bias->data(); + auto user_bias_md = platform::MKLDNNMemDesc( + {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); + auto user_bias_memory_p = + handler->AcquireBiasMemory(user_bias_md, to_void_cast(bias_data)); + std::shared_ptr bias_memory_p; + int mask_reorder = is_multi_channel? 1<<0 : 1; + int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1; + scale_bias_data.resize(count); + #pragma omp parallel for if (count > 1) + for(int i=0; iAcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline, is_test, is_INT8, scale_bias_data, mask_reorder); + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + bias_memory_p, dst_memory_p); + } else { + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + dst_memory_p); } - } - } - private: + + // push primitive to stream and wait until it's executed + pipeline.push_back(*conv_p); + }; + void AppendKey(std::string& key, mkldnn::memory::dims& input_dims, // NOLINT mkldnn::memory::dims& weights_dims, // NOLINT std::vector& strides, // NOLINT diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 3677a68a7f59527655c5aa8c9b446d603d972d9a..b902423e7f201923e5b4561a93b3914e97ec6b56 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -131,21 +131,14 @@ void Conv2DOpMaker::Make() { "The format of output tensor is X (one-dimensional) of size equal" "to the number of output channels. Only used with MKL-DNN.") .AsDispensable(); - AddInput("Scale_in", - "(Tensor) Scale_in to be used for int8 input data." - "Only used with INT8.") - .AsDispensable(); - AddInput("Scale_in_eltwise", - "(Tensor) Scale_in_eltwise to be used for int8 eltwise input data." - "Only used with MKL-DNN.") - .AsDispensable(); - AddInput("Scale_weights", - "(Tensor) Scale_weights to be used for int8 weights data." - "Only used with MKL-DNN.") - .AsDispensable(); - AddInput("Scale_out", - "(Tensor) Scale_out to be used for int8 output data." - "Only used with MKL-DNN.") + AddOutput("Output", + "(Tensor) The output tensor of convolution operator. " + "The format of output tensor is also NCHW."); + + AddInput("ResidualData", + "(Tensor) Tensor with residual data " + "to which convolution output will be added." + "Used with fuse_residual_connection fusion.") .AsDispensable(); AddOutput("Output", "(Tensor) The output tensor of convolution operator. " @@ -193,6 +186,22 @@ void Conv2DOpMaker::Make() { "whenever convolution output is as an input to residual " "connection.") .SetDefault(false); + AddAttr("Scale_in", + "Scale_in to be used for int8 input data." + "Only used with INT8.") + .SetDefault(1.0f); + AddAttr("Scale_out", + "Scale_out to be used for int8 output data." + "Only used with MKL-DNN.") + .SetDefault(1.0f); + AddAttr("Scale_in_eltwise", + "Scale_in_eltwise to be used for int8 eltwise input data." + "Only used with MKL-DNN.") + .SetDefault(1.0f); + AddAttr>("Scale_weights", + "Scale_weights to be used for int8 weights data." + "Only used with MKL-DNN.") + .SetDefault({1.0f}); AddAttr("force_fp32_output", "(bool, default false) Force INT8 kernel output FP32, only used in mkldnn kernel") .SetDefault(false); AddAttr( diff --git a/paddle/fluid/operators/dequantize_op.cc b/paddle/fluid/operators/dequantize_op.cc index 30c152de36344fb3c6e6ce776fd733b3b24d46b1..8fed7d6e3da160d5baf477f83aeb1197dca55e8b 100644 --- a/paddle/fluid/operators/dequantize_op.cc +++ b/paddle/fluid/operators/dequantize_op.cc @@ -37,7 +37,7 @@ class DeQuantOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("Input"); - auto* scale = ctx.Input("Scale"); + auto scale_data = ctx.Attr("Scale"); auto* output = ctx.Output("Output"); auto& dev_ctx = ctx.template device_context(); @@ -45,8 +45,7 @@ class DeQuantOpKernel : public framework::OpKernel { const T* input_data = input->data(); float* output_data = output->mutable_data(ctx.GetPlace()); - std::vector scale_data = {*(scale->data())}; - std::vector reorder_scale = {1.0f / scale_data[0]}; + std::vector reorder_scale = {1.0f / scale_data}; std::vector pipeline; std::vector src_tz = paddle::framework::vectorize2int(input->dims()); @@ -99,8 +98,8 @@ framework::OpKernelType DeQuantOp::GetExpectedKernelType(const framework::Execut void DeQuantOpMaker::Make() { AddInput("Input","input data"); - AddInput("Scale","scale data"); AddOutput("Output","output data"); + AddAttr("Scale","scale data").SetDefault({1.0f}); AddComment(R"DOC(This op will quantize data from INT8 to FP32)DOC"); } diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc index f7c84bcb58f49babeba70e762844a5e77fb40a88..aba7ad51d0a114b4e3d992aca2b6775eaf4c0885 100644 --- a/paddle/fluid/operators/quantize_op.cc +++ b/paddle/fluid/operators/quantize_op.cc @@ -35,7 +35,7 @@ class QuantOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("Input"); - auto* scale = ctx.Input("Scale"); + auto scale_data = ctx.Attr("Scale"); auto* output = ctx.Output("Output"); auto& dev_ctx = ctx.template device_context(); @@ -47,11 +47,9 @@ class QuantOpKernel : public framework::OpKernel { const T* input_data = input->data(); - std::vector scale_data = {*(scale->data())}; - mkldnn::primitive_attr attri; int mask = 0; - attri.set_output_scales(mask, scale_data); + attri.set_output_scales(mask, {scale_data}); auto src_md = platform::MKLDNNMemDesc( {src_tz}, memory::data_type::f32, input->format()); @@ -108,11 +106,12 @@ framework::OpKernelType QuantOp::GetExpectedKernelType(const framework::Executio void QuantOpMaker::Make() { AddInput("Input","input data"); - AddInput("Scale","scale data"); AddOutput("Output","output data"); AddAttr("is_negative_input", "(bool, default false) Only used in mkldnn INT8 kernel") .SetDefault(false); + AddAttr("Scale","scale data") + .SetDefault({1.0f}); AddComment(R"DOC(This op will quantize data from FP32 to INT8)DOC"); }