diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 758a8f4a41f686aa9ad5ee965d0586df5b89476f..beab7ad8d7405eff48ec63238d8890d1878f81bf 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -300,7 +300,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
 
-
     const bool is_test = ctx.Attr<bool>("is_test");
 
     auto& dev_ctx =
@@ -335,8 +334,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     bool fuse_relu = ctx.Attr<bool>("fuse_relu");
     bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
+    bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
     int groups = ctx.Attr<int>("groups");
 
+    if (fuse_residual_conn) {
+      PADDLE_ENFORCE(force_fp32_output != true,
+                     "residual fusion does not support force output with fp32");
+    }
+
     // TODO(tpatejko): add support for dilation
     PADDLE_ENFORCE(
         dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
@@ -378,20 +383,27 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     std::shared_ptr<mkldnn::convolution_forward> conv_p;
     std::shared_ptr<mkldnn::memory> src_memory_p;
+    std::shared_ptr<mkldnn::memory> user_src_memory_p;
     std::shared_ptr<mkldnn::memory> dst_memory_p;
     std::vector<primitive> pipeline;
 
     auto prim_key = key + "@conv_p";
     auto dst_key = key + "@dst_mem_p";
     auto src_key = key + "@src_mem_p";
+    auto user_src_key = key + "@user_src_mem_p";
+    auto src_reorder_key = key + "@src_mem_p" + "reorder_p";
 
     conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(dev_ctx.GetBlob(prim_key));
+    auto src_memory_reorder_p = std::static_pointer_cast<mkldnn::reorder>(dev_ctx.GetBlob(src_reorder_key));
     src_memory_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(src_key));
-    dst_memory_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(dst_key));
-
-    if (src_memory_p) {
+    if(src_memory_reorder_p){
+      user_src_memory_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(user_src_key));
+      user_src_memory_p->set_data_handle(to_void_cast<T>(input_data));
+    } else if(src_memory_p){
       src_memory_p->set_data_handle(to_void_cast<T>(input_data));
     }
+    dst_memory_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(dst_key));
+
     std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
     conv_pd = std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(dev_ctx.GetBlob(key_conv_pd));
     std::shared_ptr<ConvMKLDNNHandler> handler;
@@ -414,7 +426,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           residual_data_tz, residual_data_type, residual_param->format());
       auto user_residual_memory_p = handler->AcquireResidualDataMemory(
           user_residual_md, to_void_cast<T>(residual_param_data));
-
       dst_memory_p = handler->AcquireDstMemoryFromResidualDataMemory(
           user_residual_memory_p, to_void_cast<T>(output_data), pipeline);
     } else {
@@ -427,6 +438,30 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
             output->mutable_data<T>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
         dst_memory_p->set_data_handle(to_void_cast<T>(output_data));
       }
+    } else if(is_INT8 && dst_memory_p){
+      if(fuse_residual_conn) {
+        auto residual_param = ctx.Input<Tensor>("ResidualData");
+        auto residual_dt = paddle::framework::ToMKLDNNDataType(residual_param->type());
+        output->ShareDataWith(*residual_param);
+        if(residual_dt == mkldnn::memory::data_type::u8){
+          uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
+          dst_memory_p->set_data_handle(to_void_cast<uint8_t>(output_data));
+        } else{
+          int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace());
+          dst_memory_p->set_data_handle(to_void_cast<int8_t>(output_data));
+        }
+      } else if(!force_fp32_output){
+        if(fuse_relu){
+          uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
+          dst_memory_p->set_data_handle(to_void_cast<uint8_t>(output_data));
+        } else{
+          int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
+          dst_memory_p->set_data_handle(to_void_cast<int8_t>(output_data));
+        }
+      } else {
+        float* output_data = output->mutable_data<float>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
+        dst_memory_p->set_data_handle(to_void_cast<float>(output_data));
+      }
     }
 
     if(!is_INT8){
@@ -462,11 +497,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
       conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
                                      strides, paddings, mkldnn_engine,
-                                     fuse_relu, fuse_residual_conn);
+                                     fuse_relu, fuse_residual_conn, is_test);
     } else {
       conv_pd =
           ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
-                               mkldnn_engine, fuse_relu, fuse_residual_conn);
+                               mkldnn_engine, fuse_relu, fuse_residual_conn, is_test);
     }
     // Save conv_pd/src_memory/weights_memory for backward pass
     dev_ctx.SetBlob(key_conv_pd, conv_pd);
@@ -474,7 +509,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     handler.reset(new ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key));
 
     // create mkldnn memory from input tensors (data/weights)
-    auto user_src_memory_p =
+    user_src_memory_p =
         handler->AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
     auto user_weights_memory_p = handler->AcquireWeightsMemory(
         user_weights_md, to_void_cast<T>(filter_data));
@@ -508,7 +543,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           residual_data_tz, residual_data_type, residual_param->format());
       auto user_residual_memory_p = handler->AcquireResidualDataMemory(
           user_residual_md, to_void_cast<T>(residual_param_data));
-
       dst_memory_p = handler->AcquireDstMemoryFromResidualDataMemory(
           user_residual_memory_p, to_void_cast<T>(output_data), pipeline);
     } else {
@@ -546,10 +580,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
       output->set_layout(DataLayout::kMKLDNN);
       output->set_format(GetMKLDNNFormat(*dst_memory_p));
-    } else {
+    } else {
+      if(src_memory_reorder_p){
+        pipeline.push_back(*src_memory_reorder_p);
+      }
       pipeline.push_back(*conv_p);
       stream(stream::kind::eager).submit(pipeline).wait();
-
+
       output->set_layout(DataLayout::kMKLDNN);
       output->set_format(GetMKLDNNFormat(*dst_memory_p));
     }
@@ -572,7 +609,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto sum_scale_key = key + "@sum_scale";
     auto scale_in_eltwise_key = key + "@scale_in_eltwise";
     std::vector<float> scale_in_data;
-    std::vector<float> scale_out_data;
+    std::vector<float> scale_out_data = {1.0f};
    std::vector<float> scale_weights_data;
    std::vector<float> scale_in_eltwise_data;
    std::vector<float> output_shift_scale;
@@ -591,7 +628,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       for(int i=0; i<count; i++){
         scale_weights_data[i] = *(scale_weights->data<float>() + i);
       }
-      scale_out_data = {*(scale_out->data<float>())};
+      if(!force_fp32_output)
+        scale_out_data = {*(scale_out->data<float>())};
       output_shift_scale.resize(count);
       #pragma omp parallel for if (count > 1)
       for(int i=0; i<count; i++){
@@ ... @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           paddle::framework::ToMKLDNNDataType(std::type_index(typeid(unsigned char)))
           : paddle::framework::ToMKLDNNDataType(std::type_index(typeid(signed char)));
+      if(force_fp32_output){
+        dst_dt = paddle::framework::ToMKLDNNDataType(std::type_index(typeid(float)));
+      }
+
       if(fuse_residual_conn){
ctx.Input("ResidualData"); auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type()); @@ -678,7 +720,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { handler.reset(new ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key)); // create mkldnn memory from input tensors (data/weights) - auto user_src_memory_p = + user_src_memory_p = handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); auto user_weights_memory_p = handler->AcquireWeightsMemory( user_weights_md, to_void_cast(filter_data)); @@ -710,7 +752,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dst_memory_p = handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); } - } else { + } else if(!force_fp32_output){ if(fuse_relu){ uint8_t* output_data = output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); dst_memory_p = @@ -720,6 +762,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dst_memory_p = handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); } + } else { + float* output_data = output->mutable_data(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); + dst_memory_p = + handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); } // create convolution op primitive @@ -765,6 +811,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory_p)); } else { + if(src_memory_reorder_p){ + pipeline.push_back(*src_memory_reorder_p); + } pipeline.push_back(*conv_p); stream(stream::kind::eager).submit(pipeline).wait(); @@ -1141,7 +1190,8 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace, ops::ConvMKLDNNOpKernel, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel, + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::ConvMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/dequantize_op.cc b/paddle/fluid/operators/dequantize_op.cc index 609f5cf6172db41c7d07719070f79de0c8b2888e..30c152de36344fb3c6e6ce776fd733b3b24d46b1 100644 --- a/paddle/fluid/operators/dequantize_op.cc +++ b/paddle/fluid/operators/dequantize_op.cc @@ -30,7 +30,6 @@ using Tensor = framework::Tensor; using framework::DataLayout; using mkldnn::stream; using platform::GetMKLDNNFormat; -//using MKLDNNDataType = mkldnn::memory::data_type; template class DeQuantOpKernel : public framework::OpKernel { @@ -46,7 +45,6 @@ class DeQuantOpKernel : public framework::OpKernel { const T* input_data = input->data(); float* output_data = output->mutable_data(ctx.GetPlace()); - //T scale_data = *(scale->data()); std::vector scale_data = {*(scale->data())}; std::vector reorder_scale = {1.0f / scale_data[0]}; @@ -77,7 +75,6 @@ class DeQuantOpKernel : public framework::OpKernel { pipeline.push_back(*reorder_p); stream(stream::kind::eager).submit(pipeline).wait(); - //output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(dst_memory)); } @@ -114,5 +111,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OP_KERNEL(dequantize, MKLDNN, ::paddle::platform::CPUPlace, ops::DeQuantOpKernel); +REGISTER_OP_KERNEL(dequantize, MKLDNN, ::paddle::platform::CPUPlace, ops::DeQuantOpKernel, ops::DeQuantOpKernel); diff --git a/paddle/fluid/operators/quantize_op.cc 
diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc
index 3d2cd64fcdcc228dc9d7d1f357b7b41a62f65980..f7c84bcb58f49babeba70e762844a5e77fb40a88 100644
--- a/paddle/fluid/operators/quantize_op.cc
+++ b/paddle/fluid/operators/quantize_op.cc
@@ -46,7 +46,7 @@ class QuantOpKernel : public framework::OpKernel<T> {
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
     const T* input_data = input->data<T>();
-    uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
+
     std::vector<float> scale_data = {*(scale->data<float>())};
 
     mkldnn::primitive_attr attri;
@@ -59,20 +59,32 @@ class QuantOpKernel : public framework::OpKernel<T> {
     auto src_memory = std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
     std::shared_ptr<primitive::at> src_memory_p =
         std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
 
-    auto dst_md = platform::MKLDNNMemDesc(
-        {dst_tz}, memory::data_type::u8, memory::format::nhwc);
-    auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
-    auto dst_memory = mkldnn::memory(dst_pd, to_void_cast<uint8_t>(output_data));
+    bool is_negative = ctx.Attr<bool>("is_negative_input");
+    mkldnn::memory::primitive_desc dst_pd;
+    std::shared_ptr<mkldnn::memory> dst_memory;
+    if (is_negative) {
+      int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace());
+      auto dst_md = platform::MKLDNNMemDesc(
+          {dst_tz}, memory::data_type::s8, memory::format::nhwc);
+      dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
+      dst_memory.reset(new mkldnn::memory(dst_pd, to_void_cast<int8_t>(output_data)));
+    } else {
+      uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
+      auto dst_md = platform::MKLDNNMemDesc(
+          {dst_tz}, memory::data_type::u8, memory::format::nhwc);
+      dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
+      dst_memory.reset(new mkldnn::memory(dst_pd, to_void_cast<uint8_t>(output_data)));
+    }
 
     auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
         new reorder::primitive_desc(src_pd, dst_pd, attri));
-    auto reorder_p= std::shared_ptr<reorder>(new reorder(*reorder_pd, *src_memory_p, dst_memory));
+    auto reorder_p= std::shared_ptr<reorder>(new reorder(*reorder_pd, *src_memory_p, *dst_memory));
 
     pipeline.push_back(*reorder_p);
     stream(stream::kind::eager).submit(pipeline).wait();
 
     output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetMKLDNNFormat(dst_memory));
+    output->set_format(GetMKLDNNFormat(*dst_memory));
   }
 };
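
The quantize kernel above makes the analogous source-side choice: s8 when the input may contain negative values, u8 otherwise, with the scale applied through the reorder primitive's output-scale attribute. A minimal sketch under the same MKL-DNN 0.x assumption, with illustrative helper names that are not part of the patch:

    // Illustrative helpers mirroring QuantOpKernel's choices; not part of the patch.
    #include <mkldnn.hpp>
    #include <vector>

    inline mkldnn::memory::data_type ChooseQuantDstType(bool is_negative_input) {
      // A signed destination keeps negative inputs representable.
      return is_negative_input ? mkldnn::memory::data_type::s8
                               : mkldnn::memory::data_type::u8;
    }

    inline mkldnn::primitive_attr MakeQuantAttr(float scale) {
      mkldnn::primitive_attr attr;
      const int mask = 0;  // one scale for the whole tensor
      attr.set_output_scales(mask, std::vector<float>{scale});
      return attr;
    }
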