From fcbe4898b45d62e00a86248cfd7a4b50d1ab133d Mon Sep 17 00:00:00 2001
From: xiaolil1
Date: Tue, 16 Oct 2018 15:41:44 +0800
Subject: [PATCH] modify for eltwise with some useless log

---
 paddle/fluid/framework/operator.cc       |   1 +
 paddle/fluid/operators/conv_mkldnn_op.cc | 143 +++++++++++++++--------
 paddle/fluid/operators/conv_op.cc        |   8 +-
 paddle/fluid/operators/dequantize_op.cc  |   2 +-
 paddle/fluid/operators/quantize_op.cc    |   7 +-
 paddle/fluid/platform/mkldnn_helper.h    |   7 ++
 6 files changed, 113 insertions(+), 55 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index a103be7191d..56db6766c73 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -821,6 +821,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
             "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)",
             Type(), last_input_name, data_type, ipt_name, tmp);
         data_type = tmp;
+        std::cout<<"data_type = "<<data_type<<std::endl;
         last_input_name = ipt_name;
       }
     }

diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -294,7 +294,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   void Compute(const paddle::framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
-
+std::cout<<"this is conv kernel op....................."<<std::endl;
+
     const bool is_test = ctx.Attr<bool>("is_test");
 
     auto& dev_ctx =
@@ -324,7 +325,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                    "dilation in convolution is not implemented yet");
 
     const T* input_data = input->data<T>();
-    const T* filter_data = filter->data<T>();
+    const float* filter_data = filter->data<float>();
 
     std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
     std::vector<int> weights_tz =
@@ -344,17 +345,17 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     }
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    std::vector<T> output_shift_scale;
-    T sum_scale = 1.0f;
+    std::vector<float> output_shift_scale;
+    float sum_scale = 1.0f;
     if(is_INT8){
+std::cout<<"this is conv int8 op .............."<<std::endl;
       int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1;
-      T scale_in_data = *(scale_in->data<T>());
-      T scale_in_eltwise_data = *(scale_in_eltwise->data<T>());
-      std::vector<T> scale_weights_data(count);
+      float scale_in_data = *(scale_in->data<float>());
+      std::vector<float> scale_weights_data(count);
       for(int i=0; i<count; i++){
        scale_weights_data[i] =*(scale_weights->data<float>() + i);
       }
-      T scale_out_data = *(scale_out->data<T>());
+      float scale_out_data = *(scale_out->data<float>());
       output_shift_scale.resize(count);
       for(int i=0; i<count; i++){
         if(scale_weights_data[i] == 0.0)
           output_shift_scale[i] = scale_out_data;
         else
           output_shift_scale[i] = scale_out_data / (scale_in_data * scale_weights_data[i]);
       }
-
-      sum_scale = scale_out_data / scale_in_eltwise_data;
+      if(fuse_residual_conn){
+        float scale_in_eltwise_data = *(scale_in_eltwise->data<float>());
+        sum_scale = scale_out_data / scale_in_eltwise_data;
+      }
     }
 
     // Get unique name for storing MKLDNN primitives
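The hunk above is the core of the INT8 change: each output channel gets its own requantization factor, and the residual (eltwise) scale is now computed only when the sum fusion is active. For reference, the same arithmetic as a standalone C++ sketch (not part of the diff; names mirror the kernel's variables):

    #include <vector>

    // scale_in: quantization scale of the u8/s8 input
    // scale_weights: per-output-channel scales of the s8 weights
    // scale_out: quantization scale of the conv output
    // The conv accumulator holds values scaled by scale_in * scale_weights[i],
    // so mapping it into the output's quantized range needs
    // scale_out / (scale_in * scale_weights[i]) per channel.
    std::vector<float> OutputShiftScale(float scale_in,
                                        const std::vector<float>& scale_weights,
                                        float scale_out) {
      std::vector<float> shift(scale_weights.size());
      for (size_t i = 0; i < scale_weights.size(); ++i) {
        shift[i] = (scale_weights[i] == 0.0f)
                       ? scale_out
                       : scale_out / (scale_in * scale_weights[i]);
      }
      return shift;
    }

    // The fused residual input lives in its own quantized range, so it is
    // folded in with one extra factor (used only when fuse_residual_conn):
    float SumScale(float scale_out, float scale_in_eltwise) {
      return scale_out / scale_in_eltwise;
    }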
@@ -378,7 +381,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto user_src_md = platform::MKLDNNMemDesc(
         {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
     auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(),
+        {weights_tz}, platform::MKLDNNGetDataType<float>(),
         (g == 1) ? filter->format() : mkldnn::memory::format::goihw);
 
     /* create memory descriptor for convolution without specified format
@@ -399,12 +402,28 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto dst_md = platform::MKLDNNMemDesc(
         dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
 
+    if(is_INT8){
+      src_md = platform::MKLDNNMemDesc(
+          src_tz, memory::data_type::u8, chosen_memory_format);
+      weights_md = platform::MKLDNNMemDesc(
+          weights_tz, memory::data_type::s8,
+          (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw);
+      dst_md = platform::MKLDNNMemDesc(
+          dst_tz,
+          fuse_relu?memory::data_type::u8:memory::data_type::s8,
+          chosen_memory_format);
+    }
+
     // create a conv primitive descriptor and save it for usage in backward
     std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
 
     if (bias) {
       bias_tz = paddle::framework::vectorize2int(bias->dims());
       auto bias_md = platform::MKLDNNMemDesc(
-          bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
+          bias_tz, platform::MKLDNNGetDataType<float>(), memory::format::x);
+      if(is_INT8){
+        bias_md = platform::MKLDNNMemDesc(
+            bias_tz, memory::data_type::s32, memory::format::x);
+      }
       if(is_INT8){
         conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
                                        strides, paddings, mkldnn_engine,
@@ -436,62 +455,85 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto user_src_memory_p =
         handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
     auto user_weights_memory_p = handler.AcquireWeightsMemory(
-        user_weights_md, to_void_cast<T>(filter_data));
-
-    T* output_data = nullptr;
-
-    if (fuse_residual_conn) {
-      auto residual_param = ctx.Input<Tensor>("ResidualData");
-      auto residual_param_data = residual_param->data<T>();
-
-      PADDLE_ENFORCE(
-          residual_param_data != nullptr,
-          "Provide data if you want MKLDNN conv+elementwise_add fusion");
-      PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
-                        "Output and elementwise parameter need to have the "
-                        "same dimension sizes");
-
-      output->ShareDataWith(*residual_param);
-      output_data = output->mutable_data<T>(ctx.GetPlace());
-    } else {
-      output_data =
-          output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
-    }
+        user_weights_md, to_void_cast<float>(filter_data));
 
     // create reorder primitive if the input format is not the preferred one
     auto src_memory_p =
-        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
+        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline, is_INT8);
     auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
         user_weights_memory_p, pipeline, is_test);
     if(is_INT8){
       int mask_reorder = is_multi_channel? 0 : ((g!= 1) ? (1<<1)+(1<<0) : 1<<0);
       int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1;
-      std::vector<T> scale_weights_data(count);
+      std::vector<float> scale_weights_data(count);
       for(int i=0; i<count; i++){
         scale_weights_data[i] = *(scale_weights->data<float>() + i);
       }
       auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
           user_weights_memory_p, pipeline, is_test, is_INT8, scale_weights_data, mask_reorder);
     }
-    auto dst_memory_p =
-        handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
+
+    std::shared_ptr<mkldnn::memory> dst_memory_p;
+    if(is_INT8){
+      int8_t* output_data = nullptr;
+      if (fuse_residual_conn) {
+        auto residual_param = ctx.Input<Tensor>("ResidualData");
+        PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
+                          "Output and elementwise parameter need to have the "
+                          "same dimension sizes");
+
+        output->ShareDataWith(*residual_param);
+        output_data = output->mutable_data<int8_t>(ctx.GetPlace());
+      } else {
+        std::cout<<"conv log 1 ....................."<<std::endl;
+        output_data =
+            output->mutable_data<int8_t>(ctx.GetPlace(), handler.GetDstMemorySize());
+        std::cout<<"conv log 2 //////////////////////"<<std::endl;
+      }
+      dst_memory_p =
+          handler.AcquireDstMemoryFromPrimitive(to_void_cast<int8_t>(output_data));
+std::cout<<"input fmt = "<<input->format()<<"  output fmt = "<<output->format()<<"  dst fmt = "<<dst_memory_p->get_primitive_desc().desc().data.format<<std::endl;
+    } else {
+      T* output_data = nullptr;
+      if (fuse_residual_conn) {
+        auto residual_param = ctx.Input<Tensor>("ResidualData");
+        auto residual_param_data = residual_param->data<T>();
+
+        PADDLE_ENFORCE(
+            residual_param_data != nullptr,
+            "Provide data if you want MKLDNN conv+elementwise_add fusion");
+        PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
+                          "Output and elementwise parameter need to have the "
+                          "same dimension sizes");
+
+        output->ShareDataWith(*residual_param);
+        output_data = output->mutable_data<T>(ctx.GetPlace());
+      } else {
+        std::cout<<"conv log 1 ....................."<<std::endl;
+        output_data =
+            output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
+        std::cout<<"conv log 2 //////////////////////"<<std::endl;
+      }
+      dst_memory_p =
+          handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
+    }
 
     // create convolution op primitive
     std::shared_ptr<mkldnn::convolution_forward> conv_p;
     if (bias) {
-      const T* bias_data = bias->data<T>();
+      const float* bias_data = bias->data<float>();
       auto user_bias_md = platform::MKLDNNMemDesc(
-          {bias_tz}, platform::MKLDNNGetDataType<T>(), memory::format::x);
+          {bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x);
       auto user_bias_memory_p =
-          handler.AcquireBiasMemory(user_bias_md, to_void_cast<T>(bias_data));
+          handler.AcquireBiasMemory(user_bias_md, to_void_cast<float>(bias_data));
 
       auto bias_memory_p =
           handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline);
       if(is_INT8){
         int mask_reorder = is_multi_channel? 0 : 1<<0;
         int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1;
-        std::vector<T> scale_bias_data(count);
+        std::vector<float> scale_bias_data(count);
         for(int i=0; i<count; i++){
-          scale_bias_data[i] = (*scale_in->data<T>()) * (*(scale_weights->data<T>() + i));
+          scale_bias_data[i] = (*scale_in->data<float>()) * (*(scale_weights->data<float>() + i));
         }
         auto bias_memory_p =
             handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline, is_INT8, scale_bias_data, mask_reorder);
@@ -503,17 +545,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                                         dst_memory_p);
     }
 
+    // push primitive to stream and wait until it's executed
     pipeline.push_back(*conv_p);
     stream(stream::kind::eager).submit(pipeline).wait();
 
     output->set_layout(DataLayout::kMKLDNN);
     output->set_format(GetMKLDNNFormat(*dst_memory_p));
+std::cout<<"input fmt = "<<input->format()<<"  output fmt = "<<output->format()<<"  dst fmt = "<<dst_memory_p->get_primitive_desc().desc().data.format<<std::endl;
   }
 
  private:
@@ -556,7 +600,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   mkldnn::primitive_attr CreatePostOps(
       bool fuse_relu, bool fuse_residual_conn,
-      const std::vector<T> output_shift_scale, T sum_scale) const {
+      const std::vector<float> output_shift_scale, float sum_scale) const {
     mkldnn::primitive_attr conv_attr;
     mkldnn::post_ops post_operations;
     // Fusion with Elementwise layer relies on adding a sum post-operation with
@@ -568,7 +612,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       const std::vector<int>& paddings, const mkldnn::engine& engine,
       const bool fuse_relu, const bool fuse_residual_conn,
-      const std::vector<T> output_shift_scale, const T sum_scale) const {
+      const std::vector<float> output_shift_scale, const float sum_scale) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};
 
@@ -617,7 +661,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       const std::vector<int>& paddings, const mkldnn::engine& engine,
       const bool fuse_relu, const bool fuse_residual_conn,
-      const std::vector<T> output_shift_scale, const T sum_scale) const {
+      const std::vector<float> output_shift_scale, const float sum_scale) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};
 
@@ -841,7 +885,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 namespace ops = paddle::operators;
 
 REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ConvMKLDNNOpKernel<float>);
+                   ops::ConvMKLDNNOpKernel<float>,
+                   ops::ConvMKLDNNOpKernel<uint8_t>);
 
 REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
                    ops::ConvMKLDNNGradOpKernel<float>);

diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 068ec7e46c0..8a9253dea92 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -94,10 +94,10 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
 
   auto input_data_type =
       framework::ToDataType(ctx.Input<Tensor>("Input")->type());
-  auto filter_data_type =
-      framework::ToDataType(ctx.Input<Tensor>("Filter")->type());
-  PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
-                    "input and filter data type should be consistent");
+  //auto filter_data_type =
+  //    framework::ToDataType(ctx.Input<Tensor>("Filter")->type());
+  //PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
+  //                  "input and filter data type should be consistent");
 
   if (input_data_type == framework::proto::VarType::FP16) {
     PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
ctx.Input("Scale"); auto* output = ctx.Output("Output"); - +std::cout<<"this is dequant op ***********"<(); const auto& engine = dev_ctx.GetEngine(); diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc index a18c6f74137..cb5a9e4c1c8 100644 --- a/paddle/fluid/operators/quantize_op.cc +++ b/paddle/fluid/operators/quantize_op.cc @@ -37,7 +37,7 @@ class QuantOpKernel : public framework::OpKernel { auto* input = ctx.Input("Input"); auto* scale = ctx.Input("Scale"); auto* output = ctx.Output("Output"); - +std::cout<<"this is quantize op!!!!!!!!!!!!!!"<(); const auto& engine = dev_ctx.GetEngine(); @@ -68,7 +68,12 @@ class QuantOpKernel : public framework::OpKernel { auto reorder_pd = std::shared_ptr( new reorder::primitive_desc(dst_pd, src_pd, attri)); auto reorder_p= std::shared_ptr(new reorder(*reorder_pd, *src_memory_p, dst_memory)); + pipeline.push_back(*reorder_p); + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(dst_memory)); } }; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index c5026194e0a..c99966dbcf2 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -153,8 +153,11 @@ class MKLDNNHandler { std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), "Fail to find mem primitive in device context"); + //mem_p = nullptr; if (mem_p == nullptr) { mem_p = std::make_shared(mdp, ptr); +std::cout<<"mem_p == null"<( mkldnn::memory::primitive_desc{md, engine_}, ptr); dev_ctx_.SetBlob(local_key, mem_p); @@ -184,6 +190,7 @@ class MKLDNNHandler { // should be reused or none of them. So we check consistency is_reusing_ = true; } +std::cout<<"md fmt = "<