From a042d86bce18d1a5480f3316511171c0ba1060a0 Mon Sep 17 00:00:00 2001 From: xiaolil1 Date: Thu, 15 Nov 2018 11:27:27 +0800 Subject: [PATCH] adjust structure for PR, peel int8 from fp32 --- paddle/fluid/operators/conv_mkldnn_op.cc | 820 +++++++++++++---------- 1 file changed, 477 insertions(+), 343 deletions(-) diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index f1ecfe41b96..171125b7353 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -296,384 +296,518 @@ template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); - const bool is_test = ctx.Attr("is_test"); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; - auto* output = ctx.Output("Output"); - - auto* scale_in = ctx.HasInput("Scale_in") ? ctx.Input("Scale_in") : nullptr; - auto* scale_in_eltwise = ctx.HasInput("Scale_in_eltwise")? ctx.Input("Scale_in_eltwise") : nullptr; - auto* scale_weights = ctx.HasInput("Scale_weights")? ctx.Input("Scale_weights") : nullptr; - auto* scale_out = ctx.HasInput("Scale_out")? ctx.Input("Scale_out") : nullptr; bool is_INT8 = ctx.HasInput("Scale_in")? true : false; - bool is_multi_channel = (is_INT8 && scale_weights->memory_size() > 1) ? true : false; - PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && - input->format() != memory::format::format_undef, - "Wrong layout/format set for Input tensor"); - PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && - filter->format() != memory::format::format_undef, - "Wrong layout/format set for Filter tensor"); - PADDLE_ENFORCE(input->dims().size() == 4, - "Input must be with 4 dimensions, i.e. NCHW"); - PADDLE_ENFORCE(filter->dims().size() == 4, - "Filter must be with 4 dimensions, i.e. OIHW"); - if (bias) { - PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && - bias->format() != memory::format::format_undef, - "Wrong layout/format set for Bias tensor"); - PADDLE_ENFORCE(bias->dims().size() == 1, - "Bias must only have 1 dimension, i.e. X"); - } + if(!is_INT8){ + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + const bool is_test = ctx.Attr("is_test"); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); + + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->dims().size() == 4, + "Input must be with 4 dimensions, i.e. NCHW"); + PADDLE_ENFORCE(filter->dims().size() == 4, + "Filter must be with 4 dimensions, i.e. OIHW"); + if (bias) { + PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && + bias->format() != memory::format::format_undef, + "Wrong layout/format set for Bias tensor"); + PADDLE_ENFORCE(bias->dims().size() == 1, + "Bias must only have 1 dimension, i.e. X"); + } - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - bool fuse_relu = ctx.Attr("fuse_relu"); - bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); - int groups = ctx.Attr("groups"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + bool fuse_relu = ctx.Attr("fuse_relu"); + bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + int groups = ctx.Attr("groups"); + + // TODO(tpatejko): add support for dilation + PADDLE_ENFORCE( + dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + "dilation in convolution is not implemented yet"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector weights_tz = + paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + if (g > 1) { + int o = weights_tz[0]; + int i = weights_tz[1]; + int h = weights_tz[2]; + int w = weights_tz[3]; + weights_tz.resize(5); + weights_tz[0] = g; + weights_tz[1] = o / g; + weights_tz[2] = i; + weights_tz[3] = h; + weights_tz[4] = w; + } + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + // Get unique name for storing MKLDNN primitives + const std::string key = ConvMKLDNNHandler::GetHash( + src_tz, weights_tz, strides, paddings, dilations, groups, + ctx.op().Output("Output")); + const std::string key_conv_pd = key + "@conv_pd"; + + std::vector pipeline; + + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), + (g == 1) ? filter->format() : mkldnn::memory::format::goihw); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. + // Currently used whenever bias is != nullptr. + auto dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + + // create a conv primitive descriptor and save it for usage in backward + std::shared_ptr conv_pd; + if (bias) { + bias_tz = paddle::framework::vectorize2int(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc( + bias_tz, platform::MKLDNNGetDataType(), memory::format::x); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine, + fuse_relu, fuse_residual_conn); + } else { + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine, fuse_relu, fuse_residual_conn); + } + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); - // TODO(tpatejko): add support for dilation - PADDLE_ENFORCE( - dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, - "dilation in convolution is not implemented yet"); + ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); - const T* input_data = input->data(); - const float* filter_data = filter->data(); + // create mkldnn memory from input tensors (data/weights) + auto user_src_memory_p = + handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler.AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); - std::vector weights_tz = - paddle::framework::vectorize2int(filter->dims()); - int g = std::max(groups, 1); - if (g > 1) { - int o = weights_tz[0]; - int i = weights_tz[1]; - int h = weights_tz[2]; - int w = weights_tz[3]; - weights_tz.resize(5); - weights_tz[0] = g; - weights_tz[1] = o / g; - weights_tz[2] = i; - weights_tz[3] = h; - weights_tz[4] = w; - } - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + // create reorder primitive if the input format is not the preferred one + auto src_memory_p = + handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); + auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline, is_test); - // Get unique name for storing MKLDNN primitives - const std::string key = ConvMKLDNNHandler::GetHash( - src_tz, weights_tz, strides, paddings, dilations, groups, - ctx.op().Output("Output")); - const std::string key_conv_pd = key + "@conv_pd"; - static std::unordered_map> scale_map; - //scale_map.insert({key_conv_pd,{1.0f}}); - //scale_map[key_conv_pd]={0.1f}; - bool scale_reuse = false; - auto scale_in_key = key + "@scale_in"; - auto scale_weights_key = key + "@scale_weights"; - auto scale_out_key = key + "@scale_out"; - auto output_shift_scale_key = key + "@output_shift_scale"; - auto sum_scale_key = key + "@sum_scale"; - auto scale_in_eltwise_key = key + "@scale_in_eltwise"; - std::vector scale_in_data; - std::vector scale_out_data; - std::vector scale_weights_data; - std::vector scale_in_eltwise_data; - std::vector output_shift_scale; - std::vector sum_scale = {1.0f}; - std::vector none_scale = {0}; - - if (is_INT8 && GetScaleMap(scale_map, scale_in_key) == none_scale){ - scale_reuse = true; - } -//std::cout<<"scale_reuse = "<1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1; - scale_in_data = {*(scale_in->data())}; - scale_weights_data.resize(count); - #pragma omp parallel for if (count > 1) - for(int i=0; idata() + i); - } - scale_out_data = {*(scale_out->data())}; - output_shift_scale.resize(count); - #pragma omp parallel for if (count > 1) - for(int i=0; idata())}; - sum_scale[0] = scale_out_data[0] / scale_in_eltwise_data[0]; - SetScaleMap(scale_map, scale_in_eltwise_key, scale_in_eltwise_data); - } + std::shared_ptr dst_memory_p; - //scale reuse - SetScaleMap(scale_map, scale_in_key, scale_in_data); - SetScaleMap(scale_map, scale_weights_key, scale_weights_data); - SetScaleMap(scale_map, scale_out_key, scale_out_data); - SetScaleMap(scale_map, output_shift_scale_key, output_shift_scale); - SetScaleMap(scale_map, sum_scale_key, sum_scale); - } else{ - scale_in_data = GetScaleMap(scale_map, scale_in_key); - scale_out_data = GetScaleMap(scale_map, scale_out_key); - scale_weights_data = GetScaleMap(scale_map, scale_weights_key); - if(fuse_residual_conn){ - scale_in_eltwise_data = GetScaleMap(scale_map, scale_in_eltwise_key); - } - output_shift_scale = GetScaleMap(scale_map, output_shift_scale_key); - sum_scale = GetScaleMap(scale_map, sum_scale_key); - //printf("pause!!!"); - } + if (fuse_residual_conn) { + auto residual_param = ctx.Input("ResidualData"); + auto residual_param_data = residual_param->data(); - } + PADDLE_ENFORCE( + residual_param_data != nullptr, + "Provide data if you want MKLDNN conv+elementwise_add fusion"); + PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(), + "Output and elementwise parameter need to have the " + "same dimension sizes"); + + if (residual_param->format() != handler.GetDstFormat()) { + auto output_data = + output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); + auto residual_data_tz = + paddle::framework::vectorize2int(residual_param->dims()); + auto residual_data_type = + paddle::framework::ToMKLDNNDataType(residual_param->type()); + + auto user_residual_md = platform::MKLDNNMemDesc( + residual_data_tz, residual_data_type, residual_param->format()); + auto user_residual_memory_p = handler.AcquireResidualDataMemory( + user_residual_md, to_void_cast(residual_param_data)); + + dst_memory_p = handler.AcquireDstMemoryFromResidualDataMemory( + user_residual_memory_p, to_void_cast(output_data), pipeline); + } else { + output->ShareDataWith(*residual_param); + auto output_data = output->mutable_data(ctx.GetPlace()); + dst_memory_p = + handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + } + } else { + auto output_data = + output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); + dst_memory_p = + handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + } + // create convolution op primitive + std::shared_ptr conv_p; + if (bias) { + const T* bias_data = bias->data(); + auto user_bias_md = platform::MKLDNNMemDesc( + {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); + auto user_bias_memory_p = + handler.AcquireBiasMemory(user_bias_md, to_void_cast(bias_data)); + + auto bias_memory_p = + handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline); + conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, + bias_memory_p, dst_memory_p); + } else { + conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, + dst_memory_p); + } - std::vector pipeline; - auto user_src_md = platform::MKLDNNMemDesc( - {src_tz}, paddle::framework::ToMKLDNNDataType(input->type()), input->format()); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), - (g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw); + // push primitive to stream and wait until it's executed + pipeline.push_back(*conv_p); + stream(stream::kind::eager).submit(pipeline).wait(); - /* create memory descriptor for convolution without specified format - * ('any') which lets a primitive (convolution in this case) choose - * the memory format preferred for best performance - */ - std::string data_format = ctx.Attr("data_format"); - auto chosen_memory_format = - platform::data_format_to_memory_format(data_format); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(*dst_memory_p)); + } else{ + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + const bool is_test = ctx.Attr("is_test"); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); + + auto* scale_in = ctx.HasInput("Scale_in") ? ctx.Input("Scale_in") : nullptr; + auto* scale_in_eltwise = ctx.HasInput("Scale_in_eltwise")? ctx.Input("Scale_in_eltwise") : nullptr; + auto* scale_weights = ctx.HasInput("Scale_weights")? ctx.Input("Scale_weights") : nullptr; + auto* scale_out = ctx.HasInput("Scale_out")? ctx.Input("Scale_out") : nullptr; + + bool is_multi_channel = (scale_weights->memory_size() > 1) ? true : false; + + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->dims().size() == 4, + "Input must be with 4 dimensions, i.e. NCHW"); + PADDLE_ENFORCE(filter->dims().size() == 4, + "Filter must be with 4 dimensions, i.e. OIHW"); + if (bias) { + PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && + bias->format() != memory::format::format_undef, + "Wrong layout/format set for Bias tensor"); + PADDLE_ENFORCE(bias->dims().size() == 1, + "Bias must only have 1 dimension, i.e. X"); + } - std::shared_ptr conv_pd; - auto bias_tz = paddle::framework::vectorize2int(bias->dims()); - if(is_INT8){ - auto src_md = platform::MKLDNNMemDesc( - src_tz, memory::data_type::u8, chosen_memory_format); - auto weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::s8, - (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw); - auto dst_dt = fuse_relu? paddle::framework::ToMKLDNNDataType(std::type_index(typeid(unsigned char))) : paddle::framework::ToMKLDNNDataType(std::type_index(typeid(signed char))); - if(fuse_residual_conn){ - auto residual = ctx.Input("ResidualData"); - auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type()); - if(dst_dt != residual_dt) - dst_dt = residual_dt; + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + bool fuse_relu = ctx.Attr("fuse_relu"); + bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + int groups = ctx.Attr("groups"); + + // TODO(tpatejko): add support for dilation + PADDLE_ENFORCE( + dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + "dilation in convolution is not implemented yet"); + + const T* input_data = input->data(); + const float* filter_data = filter->data(); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector weights_tz = + paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + if (g > 1) { + int o = weights_tz[0]; + int i = weights_tz[1]; + int h = weights_tz[2]; + int w = weights_tz[3]; + weights_tz.resize(5); + weights_tz[0] = g; + weights_tz[1] = o / g; + weights_tz[2] = i; + weights_tz[3] = h; + weights_tz[4] = w; + } + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + // Get unique name for storing MKLDNN primitives + const std::string key = ConvMKLDNNHandler::GetHash( + src_tz, weights_tz, strides, paddings, dilations, groups, + ctx.op().Output("Output")); + const std::string key_conv_pd = key + "@conv_pd"; + static std::unordered_map> scale_map; + + bool scale_reuse = false; + auto scale_in_key = key + "@scale_in"; + auto scale_weights_key = key + "@scale_weights"; + auto scale_out_key = key + "@scale_out"; + auto output_shift_scale_key = key + "@output_shift_scale"; + auto sum_scale_key = key + "@sum_scale"; + auto scale_in_eltwise_key = key + "@scale_in_eltwise"; + std::vector scale_in_data; + std::vector scale_out_data; + std::vector scale_weights_data; + std::vector scale_in_eltwise_data; + std::vector output_shift_scale; + std::vector sum_scale = {1.0f}; + std::vector none_scale = {0}; + + if (GetScaleMap(scale_map, scale_in_key) == none_scale){ + scale_reuse = true; + } +//std::cout<<"scale_reuse = "<1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1; + scale_in_data = {*(scale_in->data())}; + scale_weights_data.resize(count); + #pragma omp parallel for if (count > 1) + for(int i=0; idata() + i); } - auto dst_md = platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); - - // create a conv primitive descriptor and save it for usage in backward - if (bias) { - auto bias_md = platform::MKLDNNMemDesc( - bias_tz, memory::data_type::s32, memory::format::x); - conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, - strides, paddings, mkldnn_engine, - fuse_relu, fuse_residual_conn, - output_shift_scale, sum_scale[0], is_test); - } else { - conv_pd = - ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, - mkldnn_engine, fuse_relu, fuse_residual_conn, - output_shift_scale, sum_scale[0], is_test); + scale_out_data = {*(scale_out->data())}; + output_shift_scale.resize(count); + #pragma omp parallel for if (count > 1) + for(int i=0; i(), chosen_memory_format); - auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), - (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw); - auto dst_md = platform::MKLDNNMemDesc( - dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - // create a conv primitive descriptor and save it for usage in backward - if (bias) { - auto bias_md = platform::MKLDNNMemDesc( - bias_tz, platform::MKLDNNGetDataType(), memory::format::x); - conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, - strides, paddings, mkldnn_engine, - fuse_relu, fuse_residual_conn, is_test); - } else { - conv_pd = - ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, - mkldnn_engine, fuse_relu, fuse_residual_conn, is_test); + if(fuse_residual_conn){ + scale_in_eltwise_data = {*(scale_in_eltwise->data())}; + sum_scale[0] = scale_out_data[0] / scale_in_eltwise_data[0]; + SetScaleMap(scale_map, scale_in_eltwise_key, scale_in_eltwise_data); } - } - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx.SetBlob(key_conv_pd, conv_pd); - - ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); - // create mkldnn memory from input tensors (data/weights) - auto user_src_memory_p = - handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); - auto user_weights_memory_p = handler.AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); - - // create reorder primitive if the input format is not the preferred one - auto src_memory_p = - handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); - - std::shared_ptr weights_memory_p; - if(is_INT8){ - int mask_reorder = is_multi_channel? ((g!= 1) ? (1<<1)+(1<<0) : 1<<0) : 0; - weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline, is_test, is_INT8, scale_weights_data, mask_reorder); - } else{ - weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline, is_test); - } + //scale reuse + SetScaleMap(scale_map, scale_in_key, scale_in_data); + SetScaleMap(scale_map, scale_weights_key, scale_weights_data); + SetScaleMap(scale_map, scale_out_key, scale_out_data); + SetScaleMap(scale_map, output_shift_scale_key, output_shift_scale); + SetScaleMap(scale_map, sum_scale_key, sum_scale); + } else{ + scale_in_data = GetScaleMap(scale_map, scale_in_key); + scale_out_data = GetScaleMap(scale_map, scale_out_key); + scale_weights_data = GetScaleMap(scale_map, scale_weights_key); + if(fuse_residual_conn){ + scale_in_eltwise_data = GetScaleMap(scale_map, scale_in_eltwise_key); + } + output_shift_scale = GetScaleMap(scale_map, output_shift_scale_key); + sum_scale = GetScaleMap(scale_map, sum_scale_key); + //printf("pause!!!"); + } - std::shared_ptr dst_memory_p; - bool need_s8_to_u8 = false; - if(fuse_residual_conn) { + std::vector pipeline; + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, paddle::framework::ToMKLDNNDataType(input->type()), input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), + (g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + + std::shared_ptr conv_pd; + auto bias_tz = paddle::framework::vectorize2int(bias->dims()); + + auto src_md = platform::MKLDNNMemDesc( + src_tz, memory::data_type::u8, chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::s8, + (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw); + auto dst_dt = fuse_relu? paddle::framework::ToMKLDNNDataType(std::type_index(typeid(unsigned char))) : paddle::framework::ToMKLDNNDataType(std::type_index(typeid(signed char))); + if(fuse_residual_conn){ + auto residual = ctx.Input("ResidualData"); + auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type()); + if(dst_dt != residual_dt) + dst_dt = residual_dt; + } + auto dst_md = platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); + + // create a conv primitive descriptor and save it for usage in backward + if (bias) { + auto bias_md = platform::MKLDNNMemDesc( + bias_tz, memory::data_type::s32, memory::format::x); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine, + fuse_relu, fuse_residual_conn, + output_shift_scale, sum_scale[0], is_test); + } else { + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine, fuse_relu, fuse_residual_conn, + output_shift_scale, sum_scale[0], is_test); + } + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); + + // create mkldnn memory from input tensors (data/weights) + auto user_src_memory_p = + handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler.AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); + + // create reorder primitive if the input format is not the preferred one + auto src_memory_p = + handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); + + std::shared_ptr weights_memory_p; + int mask_reorder = is_multi_channel? ((g!= 1) ? (1<<1)+(1<<0) : 1<<0) : 0; + weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline, is_test, is_INT8, scale_weights_data, mask_reorder); + + std::shared_ptr dst_memory_p; + bool need_s8_to_u8 = false; + if(fuse_residual_conn) { auto residual_param = ctx.Input("ResidualData"); PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(), "Output and elementwise parameter need to have the " "same dimension sizes"); auto residual_dt = paddle::framework::ToMKLDNNDataType(residual_param->type()); if(residual_param->format() != handler.GetDstFormat()) { - auto residual_data_tz = - paddle::framework::vectorize2int(residual_param->dims()); - auto residual_data_type = - paddle::framework::ToMKLDNNDataType(residual_param->type()); - auto user_residual_md = platform::MKLDNNMemDesc( - residual_data_tz, residual_data_type, residual_param->format()); - if(is_INT8){ - if(residual_dt == mkldnn::memory::data_type::u8){ - auto residual_param_data = residual_param->data(); - auto user_residual_memory_p = handler.AcquireResidualDataMemory( - user_residual_md, to_void_cast(residual_param_data)); - PADDLE_ENFORCE( - residual_param_data != nullptr, - "Provide data if you want MKLDNN conv+elementwise_add fusion"); - uint8_t* output_data = output->mutable_data(ctx.GetPlace()); - dst_memory_p = - handler.AcquireDstMemoryFromResidualDataMemory( - user_residual_memory_p, to_void_cast(output_data), pipeline); - } else{ - auto residual_param_data = residual_param->data(); - auto user_residual_memory_p = handler.AcquireResidualDataMemory( - user_residual_md, to_void_cast(residual_param_data)); - PADDLE_ENFORCE( - residual_param_data != nullptr, - "Provide data if you want MKLDNN conv+elementwise_add fusion"); - int8_t* output_data = output->mutable_data(ctx.GetPlace()); - dst_memory_p = - handler.AcquireDstMemoryFromResidualDataMemory( - user_residual_memory_p, to_void_cast(output_data), pipeline); - if(fuse_relu) - need_s8_to_u8 = true; - } + auto residual_data_tz = + paddle::framework::vectorize2int(residual_param->dims()); + auto residual_data_type = + paddle::framework::ToMKLDNNDataType(residual_param->type()); + auto user_residual_md = platform::MKLDNNMemDesc( + residual_data_tz, residual_data_type, residual_param->format()); + if(residual_dt == mkldnn::memory::data_type::u8){ + auto residual_param_data = residual_param->data(); + auto user_residual_memory_p = handler.AcquireResidualDataMemory( + user_residual_md, to_void_cast(residual_param_data)); + PADDLE_ENFORCE( + residual_param_data != nullptr, + "Provide data if you want MKLDNN conv+elementwise_add fusion"); + uint8_t* output_data = output->mutable_data(ctx.GetPlace()); + dst_memory_p = + handler.AcquireDstMemoryFromResidualDataMemory( + user_residual_memory_p, to_void_cast(output_data), pipeline); } else{ - auto residual_param_data = residual_param->data(); - auto user_residual_memory_p = handler.AcquireResidualDataMemory( - user_residual_md, to_void_cast(residual_param_data)); - PADDLE_ENFORCE( - residual_param_data != nullptr, - "Provide data if you want MKLDNN conv+elementwise_add fusion"); - auto output_data = - output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); - dst_memory_p = handler.AcquireDstMemoryFromResidualDataMemory( - user_residual_memory_p, to_void_cast(output_data), pipeline); + auto residual_param_data = residual_param->data(); + auto user_residual_memory_p = handler.AcquireResidualDataMemory( + user_residual_md, to_void_cast(residual_param_data)); + PADDLE_ENFORCE( + residual_param_data != nullptr, + "Provide data if you want MKLDNN conv+elementwise_add fusion"); + int8_t* output_data = output->mutable_data(ctx.GetPlace()); + dst_memory_p = + handler.AcquireDstMemoryFromResidualDataMemory( + user_residual_memory_p, to_void_cast(output_data), pipeline); + if(fuse_relu) + need_s8_to_u8 = true; } } else { - output->ShareDataWith(*residual_param); - if(is_INT8){ - if(residual_dt == mkldnn::memory::data_type::u8){ - - uint8_t* output_data = output->mutable_data(ctx.GetPlace()); - dst_memory_p = - handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); - } else{ - int8_t* output_data = output->mutable_data(ctx.GetPlace()); - dst_memory_p = - handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); - if(fuse_relu) - need_s8_to_u8 = true; - } - } else{ - auto output_data = output->mutable_data(ctx.GetPlace()); - dst_memory_p = - handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); - } - } - } else { - if(is_INT8){ - if(fuse_relu){ - uint8_t* output_data = output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); - dst_memory_p = - handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + output->ShareDataWith(*residual_param); + if(residual_dt == mkldnn::memory::data_type::u8){ + uint8_t* output_data = output->mutable_data(ctx.GetPlace()); + dst_memory_p = + handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); } else{ - int8_t* output_data = output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); - dst_memory_p = - handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + int8_t* output_data = output->mutable_data(ctx.GetPlace()); + dst_memory_p = + handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + if(fuse_relu) + need_s8_to_u8 = true; } + } + } else { + if(fuse_relu){ + uint8_t* output_data = output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); + dst_memory_p = + handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); } else{ - auto output_data = - output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); - dst_memory_p = - handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + int8_t* output_data = output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); + dst_memory_p = + handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); } - } + } - // create convolution op primitive - std::shared_ptr conv_p; - std::vector scale_bias_data; - auto scale_bias_key = key + "@scale_bias"; - if (bias) { - const float* bias_data = bias->data(); - auto user_bias_md = platform::MKLDNNMemDesc( - {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); - auto user_bias_memory_p = - handler.AcquireBiasMemory(user_bias_md, to_void_cast(bias_data)); - std::shared_ptr bias_memory_p; - if(is_INT8){ - int mask_reorder = is_multi_channel? 1<<0 : 1; - if(scale_reuse){ - int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1; - scale_bias_data.resize(count); - #pragma omp parallel for if (count > 1) - for(int i=0; i conv_p; + std::vector scale_bias_data; + auto scale_bias_key = key + "@scale_bias"; + if (bias) { + const float* bias_data = bias->data(); + auto user_bias_md = platform::MKLDNNMemDesc( + {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); + auto user_bias_memory_p = + handler.AcquireBiasMemory(user_bias_md, to_void_cast(bias_data)); + std::shared_ptr bias_memory_p; + int mask_reorder = is_multi_channel? 1<<0 : 1; + if(scale_reuse){ + int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1; + scale_bias_data.resize(count); + #pragma omp parallel for if (count > 1) + for(int i=0; imutable_data(ctx.GetPlace()); - } + if(need_s8_to_u8){ + output->mutable_data(ctx.GetPlace()); + } - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(*dst_memory_p)); + } } private: @@ -780,7 +914,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const memory::desc& dst, const std::vector& strides, const std::vector& paddings, const mkldnn::engine& engine, const bool fuse_relu, - const bool fuse_residual_conn, bool is_test) const{ + const bool fuse_residual_conn, bool is_test=false) const{ memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -834,7 +968,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const std::vector& strides, const std::vector& paddings, const mkldnn::engine& engine, const bool fuse_relu, - const bool fuse_residual_conn, bool is_test) const{ + const bool fuse_residual_conn, bool is_test=false) const{ memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; -- GitLab