diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc index ed265edf003e01261286b95bd101f98b6bd71477..db1127b055c31e6baf69b263b79f9da0e7bf71fd 100644 --- a/paddle/fluid/operators/mkldnn/axpy_handler.cc +++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 1b69dd7ea00c7cce45e2ef2691ea14f03e59318d..c663ba2f886809f55f8da09fe15d4477c82f99ea 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,27 +12,16 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/data_layout_transform.h" +#include + #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { - -using framework::DataLayout; -using mkldnn::memory; -using mkldnn::primitive; -using mkldnn::reorder; -using mkldnn::stream; -using platform::GetMKLDNNFormat; -using platform::to_void_cast; +namespace { inline MKLDNNMemoryFormat GetWeightsFormat(const MKLDNNMemoryFormat format, const int groups, @@ -78,7 +67,7 @@ class ConvMKLDNNHandlerT mkldnn::convolution_backward_data, mkldnn::convolution_backward_weights> { public: - ConvMKLDNNHandlerT(const paddle::framework::ExecutionContext& ctx, + ConvMKLDNNHandlerT(const framework::ExecutionContext& ctx, const platform::MKLDNNDeviceContext& dev_ctx, const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* input, @@ -92,19 +81,19 @@ class ConvMKLDNNHandlerT unique_name)) { if (!this->isCached()) { PADDLE_ENFORCE_EQ( - input->layout(), DataLayout::kMKLDNN, + input->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, input->layout())); + framework::DataLayout::kMKLDNN, input->layout())); PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for Input tensor")); PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, + filter->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The Filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); + framework::DataLayout::kMKLDNN, filter->layout())); PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for Filter tensor")); @@ -137,10 +126,10 @@ class ConvMKLDNNHandlerT if (bias) { PADDLE_ENFORCE_EQ( - bias->layout(), DataLayout::kMKLDNN, + bias->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The Bias tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, bias->layout())); + framework::DataLayout::kMKLDNN, bias->layout())); PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Bias tensor.")); @@ -188,12 +177,12 @@ class ConvMKLDNNHandlerT std::transform(dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { return i - 1; }); - const auto src_tz = paddle::framework::vectorize(input->dims()); + const auto src_tz = framework::vectorize(input->dims()); - auto weights_tz = paddle::framework::vectorize(filter->dims()); + auto weights_tz = framework::vectorize(filter->dims()); platform::GetGroupConvWeightsTz(weights_tz, groups); - const auto dst_tz = paddle::framework::vectorize(output->dims()); + const auto dst_tz = framework::vectorize(output->dims()); const mkldnn::memory::dims stride_dims = strides; const auto mkldnn_paddings = platform::ToMkldnnPadding(paddings); @@ -204,29 +193,48 @@ class ConvMKLDNNHandlerT * the memory format preferred for best performance */ auto chosen_memory_format = MKLDNNMemoryFormat::any; - auto data_type = mkldnn::memory::data_type::f32; if (ctx.Attr("mkldnn_data_type") == "bfloat16" || std::is_same::value) data_type = mkldnn::memory::data_type::bf16; - const auto src_md = - platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format); - const auto weights_md = platform::MKLDNNMemDesc(weights_tz, data_type, - MKLDNNMemoryFormat::any); + mkldnn::memory::desc src_md, weights_md; + if (platform::is_int8()) { + src_md = platform::MKLDNNMemDesc( + src_tz, framework::ToMKLDNNDataType(input->type()), + chosen_memory_format); + weights_md = platform::MKLDNNMemDesc( + weights_tz, mkldnn::memory::data_type::s8, chosen_memory_format); + } else { + src_md = + platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format); + weights_md = platform::MKLDNNMemDesc(weights_tz, data_type, + MKLDNNMemoryFormat::any); + } + const auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training; + float sum_scale; + std::vector output_shift_scale; + std::tie(sum_scale, output_shift_scale) = get_int8_scales(ctx); + const mkldnn::primitive_attr conv_attr = CreatePostOps( - fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn); + fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, + output_shift_scale, sum_scale); // for INT8 only! if (bias) { auto bias_tz = framework::vectorize(bias->dims()); - auto bias_md = - platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x); + mkldnn::memory::desc bias_md; + if (platform::is_int8()) { + bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::s32, MKLDNNMemoryFormat::x); + } else { + bias_md = platform::MKLDNNMemDesc(bias_tz, data_type, + MKLDNNMemoryFormat::x); + } this->AcquireForwardPrimitiveDescriptor( conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct, @@ -255,28 +263,28 @@ class ConvMKLDNNHandlerT unique_name)) { if (!this->isBwdCached()) { PADDLE_ENFORCE_EQ( - in->layout(), DataLayout::kMKLDNN, + in->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, in->layout())); + framework::DataLayout::kMKLDNN, in->layout())); PADDLE_ENFORCE_NE(in->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Input tensor.")); PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, + filter->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); + framework::DataLayout::kMKLDNN, filter->layout())); PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Filter tensor.")); PADDLE_ENFORCE_EQ( - out_grad->layout(), DataLayout::kMKLDNN, + out_grad->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The output_grad tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, out_grad->layout())); + framework::DataLayout::kMKLDNN, out_grad->layout())); PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for output_grad tensor")); @@ -296,28 +304,25 @@ class ConvMKLDNNHandlerT std::vector dilations(begin(dilations_temp), end(dilations_temp)); - std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - int groups = ctx.Attr("groups"); - auto input_dims = in->dims(); auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); auto filter_dims = filter->dims(); auto filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); - auto ksize = framework::vectorize(filter_data_dims); + std::string padding_algorithm = + ctx.Attr("padding_algorithm"); UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, data_dims, strides, ksize); auto src_tz = framework::vectorize(in->dims()); auto weights_tz = framework::vectorize(filter->dims()); + int groups = ctx.Attr("groups"); int g = std::max(groups, 1); platform::GetGroupConvWeightsTz(weights_tz, g); - auto dst_tz = paddle::framework::vectorize(out_grad->dims()); + auto dst_tz = framework::vectorize(out_grad->dims()); /* create memory descriptor for conv backward without specified format * ('any') which lets a primitive (conv backward in this case) choose @@ -349,8 +354,14 @@ class ConvMKLDNNHandlerT mkldnn::primitive_attr conv_attr; if (bias) { auto bias_tz = framework::vectorize(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc( - bias_tz, mkldnn::memory::data_type::f32, MKLDNNMemoryFormat::x); + mkldnn::memory::desc bias_md; + if (platform::is_int8()) { + bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::s32, MKLDNNMemoryFormat::x); + } else { + bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::f32, MKLDNNMemoryFormat::x); + } this->AcquireForwardPrimitiveDescriptor( conv_attr, mkldnn::prop_kind::forward_training, @@ -377,6 +388,71 @@ class ConvMKLDNNHandlerT } } + std::tuple> get_int8_scales( + const framework::ExecutionContext& ctx) const { + const auto* filter = ctx.Input("Filter"); + const auto& weights_tz = framework::vectorize(filter->dims()); + + const bool& force_fp32_output = ctx.Attr("force_fp32_output"); + const bool& fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + const int groups = std::max(ctx.Attr("groups"), 1); + + const auto& scale_in_data = ctx.Attr("Scale_in"); + const auto& scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + bool is_multi_channel = scale_weights_data.size() > 1; + auto scale_out_data = + force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); + float sum_scale = + fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; + int count = + is_multi_channel + ? (groups > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector output_shift_scale(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + // weights data will contain 0 in some models, then weights + // scale couldn't be calculated + output_shift_scale[i] = scale_out_data; + else + output_shift_scale[i] = + static_cast(static_cast(scale_out_data) / + (static_cast(scale_in_data) * + static_cast(scale_weights_data[i]))); + } + + return std::make_tuple(sum_scale, output_shift_scale); + } + + std::tuple> get_int8_bias_scales( + const framework::ExecutionContext& ctx) const { + const auto* filter = ctx.Input("Filter"); + const auto& weights_tz = framework::vectorize(filter->dims()); + const int groups = std::max(ctx.Attr("groups"), 1); + + const auto& scale_weights_data = + ctx.Attr>("Scale_weights"); + const auto& scale_in_data = ctx.Attr("Scale_in"); + + bool is_multi_channel = scale_weights_data.size() > 1; + int mask_reorder = is_multi_channel ? 1 << 0 : 1; + int count = + is_multi_channel + ? (groups > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector scale_bias_data(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + scale_bias_data[i] = scale_in_data * scale_weights_data[i]; + } + + return std::make_tuple(mask_reorder, scale_bias_data); + } + mkldnn::primitive_attr CreatePostOps( std::string fuse_activation, float fuse_alpha, float fuse_beta, bool fuse_residual_conn, const std::vector output_shift_scale = {}, @@ -433,7 +509,7 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_src_md, this->bwd_pd_->weights_desc(), - to_void_cast(filter_data), "@weights_mem_d_p", false); + platform::to_void_cast(filter_data), "@weights_mem_d_p", false); } std::shared_ptr AcquireSrcMemoryWithReorder( @@ -480,11 +556,11 @@ class ConvMKLDNNHandlerT framework::vectorize(in_mem->dims()), platform::MKLDNNGetDataType(), in_mem->format()); return this->AcquireMemoryWithReorder( - user_mem_md, mem_md, to_void_cast(in_mem_data), key_mem); + user_mem_md, mem_md, platform::to_void_cast(in_mem_data), key_mem); } else { const std::string target_key_suffix{key_mem_target}; const auto target_mem_p = this->AcquireMemory(target_key_suffix); - user_mem_p->set_data_handle(to_void_cast(in_mem_data)); + user_mem_p->set_data_handle(platform::to_void_cast(in_mem_data)); if (user_mem_p != target_mem_p) { this->AcquireReorder(user_mem_p, target_mem_p, key_mem); } @@ -494,7 +570,8 @@ class ConvMKLDNNHandlerT std::shared_ptr AcquireWeightsMemoryWithReorder( const framework::Tensor* filter, const int groups, const bool is_conv3d, - const bool is_test) { + const bool is_test, const std::vector& scale_data = {1.0f}, + int mask = 0) { // This is workaround to make execution faster, delete // if statement after including md inside Tensor auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target"); @@ -511,12 +588,14 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_src_md, this->fwd_pd_->weights_desc(), - to_void_cast(filter_data), "@weights_mem_p", is_test); + platform::to_void_cast(filter_data), "@weights_mem_p", is_test, {}, + scale_data, mask); } } std::shared_ptr AcquireBiasMemoryWithReorder( - const framework::Tensor* bias, const bool is_test) { + const framework::Tensor* bias, const bool is_test, + const std::vector& scale_data = {1.0f}, int mask = 0) { auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target"); if (is_test && bias_mem_p) { return bias_mem_p; @@ -527,8 +606,9 @@ class ConvMKLDNNHandlerT MKLDNNMemoryFormat::x); return this->AcquireMemoryWithReorder( - user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast(bias_data), - "@bias_mem_p", is_test); + user_bias_md, this->fwd_pd_->bias_desc(), + platform::to_void_cast(bias_data), "@bias_mem_p", is_test, {}, + scale_data, mask); } } @@ -536,8 +616,8 @@ class ConvMKLDNNHandlerT const framework::Tensor* residual_param) { void* residual_data = residual_param->type() == framework::DataTypeTrait::DataType() - ? to_void_cast(residual_param->data()) - : to_void_cast(residual_param->data()); + ? platform::to_void_cast(residual_param->data()) + : platform::to_void_cast(residual_param->data()); auto residual_mem_p = this->AcquireMemory("@user_residual_data_mem_p"); if (residual_mem_p) { residual_mem_p->set_data_handle(residual_data); @@ -572,12 +652,14 @@ class ConvMKLDNNHandlerT } }; +} // anonymous namespace + template -class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { +class ConvMKLDNNOpKernel : public framework::OpKernel { public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet( + platform::errors::PreconditionNotMet( "Operator DNNL Conv must use CPUPlace")); bool is_INT8 = std::is_same::value || std::is_same::value; @@ -607,9 +689,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } template - void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const { + void ComputeFP32(const framework::ExecutionContext& ctx) const { auto& dev_ctx = - ctx.template device_context(); + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); const bool is_test = ctx.Attr("is_test"); @@ -656,407 +738,112 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { conv_p->execute(astream, args); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + output->set_layout(framework::DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } template - void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { - const bool is_test = ctx.Attr("is_test"); - + void ComputeINT8(const framework::ExecutionContext& ctx) const { auto& dev_ctx = - ctx.template device_context(); + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - auto* input = ctx.Input("Input"); - auto* output = ctx.Output("Output"); - - PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, input->layout())); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Input tensor.")); - - PADDLE_ENFORCE_GE(input->dims().size(), 4, - platform::errors::InvalidArgument( - "Input must be with 4 or 5 dimensions, i.e. NCHW or " - "NCDHW, but got dimension = %d .", - input->dims().size())); - PADDLE_ENFORCE_LE(input->dims().size(), 5, - platform::errors::InvalidArgument( - "Input must be with 4 or 5 dimensions, i.e. NCHW or " - "NCDHW, but got dimension = %d .", - input->dims().size())); + const std::string& fuse_activation = + ctx.Attr("fuse_activation"); + const bool& fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + const bool& force_fp32_output = ctx.Attr("force_fp32_output"); + const bool is_conv3d = ctx.Attr>("strides").size() == 3U; - std::string fuse_activation = ctx.Attr("fuse_activation"); - bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); bool unsigned_output = (fuse_activation == "relu" || fuse_activation == "relu6"); - - const T* input_data = input->data(); - - auto src_tz = paddle::framework::vectorize(input->dims()); - - mkldnn::memory::data_type src_dt = - paddle::framework::ToMKLDNNDataType(input->type()); - - std::string key = - platform::CreateKey(dev_ctx, src_tz, src_dt, - ctx.InputName("Input") + ctx.InputName("Filter")); - bool need_s8_to_u8 = false; - std::shared_ptr conv_p; - std::shared_ptr src_memory_p; - std::shared_ptr user_src_memory_p; - std::shared_ptr dst_memory_p; - std::vector pipeline; - std::shared_ptr conv_pd; - std::shared_ptr handler; - - // This is workaround for hacky implementation - // of conv int8 mkl-dnn. Once conv fp32 and conv int8 - // are merged/unified, this will disappear - auto key_tid = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - - const std::string key_conv_pd = key_tid + "@conv_pd"; - auto prim_key = key_tid + "@conv_p"; - auto dst_key = key_tid + "@dst_mem_p"; - auto src_key = key_tid + "@src_mem_p"; - auto weights_key = key_tid + "@weights_mem_p"; - auto bias_key = key_tid + "@bias_mem_p"; - auto user_src_key = key_tid + "@user_src_mem_p"; - auto user_residual_key = key_tid + "@user_residual_data_mem_p"; - auto src_reorder_key = key_tid + "@src_mem_preorder_p"; - auto residual_reorder_key = key_tid + "@residual_data_mem_preorder_p"; - - conv_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + PADDLE_ENFORCE_NE( + is_conv3d, true, + platform::errors::Unimplemented( + "OneDNN int8 convolution does not support 3D inputs currently")); + PADDLE_ENFORCE_EQ( + fuse_residual_conn && force_fp32_output, false, + platform::errors::Unimplemented( + "residual fusion does not support force output with fp32")); - if (conv_pd == nullptr || !is_test) { - float fuse_alpha = ctx.Attr("fuse_alpha"); - float fuse_beta = ctx.Attr("fuse_beta"); - bool force_fp32_output = ctx.Attr("force_fp32_output"); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); - auto* filter = ctx.Input("Filter"); + ConvMKLDNNHandlerT handler( + ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias, + output, ctx.InputName("Input") + ctx.InputName("Filter")); - PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); - PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Filter tensor.")); + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); - PADDLE_ENFORCE_GE(filter->dims().size(), 4, - platform::errors::InvalidArgument( - "Filter must be with 4 or 5 dimensions, i.e. OIHW " - "or OIDHW, but got dimensions = %d .", - filter->dims().size())); - PADDLE_ENFORCE_LE(filter->dims().size(), 5, - platform::errors::InvalidArgument( - "Filter must be with 4 or 5 dimensions, i.e. OIHW " - "or OIDHW, but got dimensions = %d .", - filter->dims().size())); + const auto& scale_weights_data = + ctx.Attr>("Scale_weights"); + const bool is_multi_channel = scale_weights_data.size() > 1; + const int& groups = ctx.Attr("groups"); + const bool& is_test = ctx.Attr("is_test"); + int mask_reorder = + is_multi_channel ? ((groups != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0; + auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( + filter, groups, false, is_test, scale_weights_data, mask_reorder); + std::shared_ptr dst_memory_p; + if (fuse_residual_conn) { + auto* residual_param = ctx.Input("ResidualData"); PADDLE_ENFORCE_EQ( - !fuse_residual_conn || !force_fp32_output, true, - platform::errors::Unimplemented( - "residual fusion does not support force output with fp32")); - - auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; - - if (bias) { - PADDLE_ENFORCE_EQ( - bias->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The bias tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, bias->layout())); - PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Bias tensor.")); - - PADDLE_ENFORCE_EQ(bias->dims().size(), 1, - platform::errors::InvalidArgument( - "Bias must only have 1 dimension, i.e. X, but " - "got dimension = %d .", - bias->dims().size())); - } - - std::vector strides_temp = ctx.Attr>("strides"); - std::vector strides(begin(strides_temp), end(strides_temp)); - - std::vector paddings_temp = ctx.Attr>("paddings"); - std::vector paddings(begin(paddings_temp), end(paddings_temp)); - - std::vector dilations_temp = ctx.Attr>("dilations"); - std::vector dilations(begin(dilations_temp), - end(dilations_temp)); - - std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - bool is_conv3d = strides.size() == 3U; - - PADDLE_ENFORCE_NE(is_conv3d, true, - platform::errors::Unimplemented( - "int8 does not support conv3d currently")); - - auto input_dims = input->dims(); - auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); - auto filter_dims = filter->dims(); - auto filter_data_dims = - framework::slice_ddim(filter_dims, 2, filter_dims.size()); - - auto ksize = framework::vectorize(filter_data_dims); - - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - data_dims, strides, ksize); - - int groups = ctx.Attr("groups"); - auto weights_tz = paddle::framework::vectorize(filter->dims()); - int g = std::max(groups, 1); - - platform::GetGroupConvWeightsTz(weights_tz, g); - auto dst_tz = paddle::framework::vectorize(output->dims()); - - std::transform(dilations.begin(), dilations.end(), dilations.begin(), - [](int64_t i) { return i - 1; }); - - const K* filter_data = filter->data(); - auto scale_in_data = ctx.Attr("Scale_in"); - auto scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); - auto scale_weights_data = ctx.Attr>("Scale_weights"); - auto scale_out_data = - force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); - float sum_scale = - fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; - - bool is_multi_channel = scale_weights_data.size() > 1; - - int count = is_multi_channel ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] - : (weights_tz)[0]) - : 1; - std::vector output_shift_scale(count); -#pragma omp parallel for if (count > 1) - for (int i = 0; i < count; i++) { - if (scale_weights_data[i] == 0.0) - output_shift_scale[i] = - scale_out_data; // weights data will contain 0 - // in some models, then weights - // scale couldn't be calculated - else - output_shift_scale[i] = - static_cast(static_cast(scale_out_data) / - (static_cast(scale_in_data) * - static_cast(scale_weights_data[i]))); - } - - auto user_src_md = - platform::MKLDNNMemDesc({src_tz}, src_dt, input->format()); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), - ((g) == 1) ? MKLDNNMemoryFormat::oihw : MKLDNNMemoryFormat::goihw); - - /* create memory descriptor for convolution without specified format - * ('any') which lets a primitive (convolution in this case) choose - * the memory format preferred for best performance - */ - auto chosen_memory_format = MKLDNNMemoryFormat::any; - - std::vector bias_tz; - - auto src_md = - platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); - auto weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::s8, chosen_memory_format); - auto dst_md = platform::MKLDNNMemDesc( - dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - - handler.reset( - new platform::ConvMKLDNNHandler(dev_ctx, mkldnn_engine, key)); - // create a conv primitive descriptor and save it for usage in backward - auto propagation = is_test ? mkldnn::prop_kind::forward_scoring - : mkldnn::prop_kind::forward_training; - - if (bias) { - bias_tz = paddle::framework::vectorize(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, - MKLDNNMemoryFormat::x); - conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, bias_md, dst_md, strides, dilations, paddings, - mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, propagation, output_shift_scale, sum_scale); - } else { - conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, paddle::none, dst_md, strides, dilations, - paddings, mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, propagation, output_shift_scale, sum_scale); - } - - // create mkldnn memory from input tensors (data/weights) - user_src_memory_p = - handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); - auto user_weights_memory_p = handler->AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); - - // create reorder primitive if the input format is not the preferred one - src_memory_p = - handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); - - std::shared_ptr weights_memory_p; - int mask_reorder = - is_multi_channel ? ((g != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0; - weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline, is_test, true, scale_weights_data, - mask_reorder); - - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - PADDLE_ENFORCE_EQ( - output->dims(), residual_param->dims(), - platform::errors::InvalidArgument( - "Output and elementwise parameter need to have the " - "same dimension sizes, but got output's dimension = %d" - " and residual param's dimension =%d .", - output->dims().size(), residual_param->dims().size())); - auto residual_dt = - paddle::framework::ToMKLDNNDataType(residual_param->type()); - if (residual_param->format() != handler->GetDstFormat()) { - auto residual_data_tz = - paddle::framework::vectorize(residual_param->dims()); - auto user_residual_md = platform::MKLDNNMemDesc( - residual_data_tz, residual_dt, residual_param->format()); - dst_memory_p = platform::SetDstMemory( - ctx, output, residual_param, user_residual_md, handler, - &pipeline); - } else { - output->ShareDataWith(*residual_param); - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } - need_s8_to_u8 = - (platform::MKLDNNGetDataType() == memory::data_type::s8) && - unsigned_output; - } else { - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } - - // create convolution op primitive - conv_p = handler->AcquireConvolution(); - if (bias) { - const K* bias_data = bias->data(); - auto user_bias_md = platform::MKLDNNMemDesc( - {bias_tz}, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); - auto user_bias_memory_p = handler->AcquireBiasMemory( - user_bias_md, to_void_cast(bias_data)); - std::shared_ptr bias_memory_p; - int mask_reorder = is_multi_channel ? 1 << 0 : 1; - int count = - is_multi_channel - ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) - : 1; - std::vector scale_bias_data(count); -#pragma omp parallel for if (count > 1) - for (int i = 0; i < count; i++) { - scale_bias_data[i] = scale_in_data * scale_weights_data[i]; - } - bias_memory_p = handler->AcquireBiasMemoryFromPrimitive( - user_bias_memory_p, pipeline, is_test, true, scale_bias_data, - mask_reorder); - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_BIAS, *bias_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } else { - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } - } else { - auto src_memory_reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(src_reorder_key)); - src_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(src_key)); - if (src_memory_reorder_p) { - user_src_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(user_src_key)); - user_src_memory_p->set_data_handle(to_void_cast(input_data)); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - src_memory_reorder_p->execute(astream, *user_src_memory_p, - *src_memory_p); - astream.wait(); - } - } else if (src_memory_p) { - src_memory_p->set_data_handle(to_void_cast(input_data)); - } - auto weights_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(weights_key)); + output->dims(), residual_param->dims(), + platform::errors::InvalidArgument( + "Output and elementwise parameter need to have the " + "same dimension sizes, but got output's dimension = %d" + " and residual param's dimension =%d .", + output->dims().size(), residual_param->dims().size())); dst_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); - conv_p = std::static_pointer_cast( - dev_ctx.GetBlob(prim_key)); - handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, - mkldnn_engine, key)); - - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - output->ShareDataWith(*residual_param); - need_s8_to_u8 = - (platform::MKLDNNGetDataType() == memory::data_type::s8) && - unsigned_output; - } - platform::SetDstMemoryHandler(ctx, output, handler, dst_memory_p); + handler.AcquireDstMemoryWithResidual(output, residual_param); + need_s8_to_u8 = (platform::MKLDNNGetDataType() == + mkldnn::memory::data_type::s8) && + unsigned_output; + } else { + dst_memory_p = handler.template AcquireDstMemory(output); + } - auto residual_reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(residual_reorder_key)); - if (residual_reorder_p) { - auto user_residual_data_p = std::static_pointer_cast( - dev_ctx.GetBlob(user_residual_key)); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - residual_reorder_p->execute(astream, *user_residual_data_p, - *dst_memory_p); - astream.wait(); - } - } + auto conv_p = handler.AcquireForwardPrimitive(); + + std::unordered_map args = { + {MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}; - auto bias_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(bias_key)); + if (bias) { + float mask_reorder; + std::vector scale_bias_data; + std::tie(mask_reorder, scale_bias_data) = + handler.get_int8_bias_scales(ctx); - if (bias_memory_p) { - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_BIAS, *bias_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } else { - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } + auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( + bias, is_test, scale_bias_data, mask_reorder); + args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + conv_p->execute(astream, args); astream.wait(); + if (need_s8_to_u8) { output->mutable_data(ctx.GetPlace()); } - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + + output->set_layout(framework::DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } }; template -class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { +class ConvMKLDNNGradOpKernel : public framework::OpKernel { public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet( + platform::errors::PreconditionNotMet( "Operator DNNL ConvGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); @@ -1105,18 +892,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_DIFF_WEIGHTS, *diff_weights_memory_p}}); astream.wait(); - filter_grad->set_layout(DataLayout::kMKLDNN); + filter_grad->set_layout(framework::DataLayout::kMKLDNN); // in OneDNN groups in convolution are treated as separate dimension // which is not the case in paddlepaddle - auto filter_fmt = GetMKLDNNFormat(*diff_weights_memory_p); + auto filter_fmt = platform::GetMKLDNNFormat(*diff_weights_memory_p); // For convolution with groups convert from blocked to NCHW // otherwise there will be problems in next operators working on this data if (g > 1) { - memory::data_type in_type = framework::ToMKLDNNDataType(filter->type()); + mkldnn::memory::data_type in_type = + framework::ToMKLDNNDataType(filter->type()); // for 3d conv with groups (six dimensional data reorder to goidhw) // for 2d conv with groups (five dimensional data reorder to goihw) - // auto weights_tz = paddle::framework::vectorize(filter->dims()); + // auto weights_tz = framework::vectorize(filter->dims()); auto weights_tz = diff_weights_memory_p->get_desc().dims(); mkldnn::memory::format_tag out_format = @@ -1168,8 +956,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_DIFF_SRC, *diff_src_memory_p}}); astream.wait(); - input_grad->set_layout(DataLayout::kMKLDNN); - input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); + input_grad->set_layout(framework::DataLayout::kMKLDNN); + input_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory_p)); } } }; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index f14f92cb51fdb1012e050b6df19f3764999d8e62..37fa58e423db779eec3f08aaeb1b71fb49c5c8d6 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -531,7 +531,13 @@ inline bool HasOpBFLOAT16DataType(const paddle::framework::OpDesc* op) { inline bool HasOpFLOAT32DataType(const paddle::framework::OpDesc* op) { return op->GetAttrIfExists("mkldnn_data_type") == "float32"; } + enum class RNNReorderType { PP_NTC, PP_TNC, NTC_PP, TNC_PP }; +template +bool constexpr is_int8() { + return std::is_same::value || std::is_same::value; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 1aa8c0cdb57f977bc8d8f016394091612b0290af..084b47bb3c7a3b3b0622a79a985ea97e3854b4be 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -527,7 +527,8 @@ class MKLDNNHandlerT { const mkldnn::memory::desc& user_md, const mkldnn::memory::desc& target_md, void* ptr, const std::string& suffix, bool is_persistent = false, - std::function(const F*)> custom_reorder_func = {}) { + std::function(const F*)> custom_reorder_func = {}, + const std::vector& scale_data = {1.0f}, int mask = 0) { const auto target_key = key_ + suffix + "_target"; const auto key_reorder_p = key_ + suffix + "reorder_p"; const auto user_key = key_ + suffix + "_user"; @@ -546,8 +547,17 @@ class MKLDNNHandlerT { std::make_shared(user_md, engine_, ptr); if (user_md != target_md) { target_memory_p = std::make_shared(target_md, engine_); - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); + dnnl::reorder::primitive_desc reorder_pdesc; + if (is_int8()) { + dnnl::primitive_attr attr; + attr.set_output_scales(mask, scale_data); + reorder_pdesc = dnnl::reorder::primitive_desc(*user_memory_p, + *target_memory_p, attr); + } else { + reorder_pdesc = + dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p); + } + auto reorder_p = std::make_shared(reorder_pdesc); dev_ctx_.SetBlob(key_reorder_p, reorder_p); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -597,201 +607,6 @@ class MKLDNNHandlerT { std::shared_ptr bwd_w_pd_; }; -// TODO(grygielski) this class will be deleted later. -class MKLDNNHandler { - public: - MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : dev_ctx_(dev_ctx), - engine_(engine), - key_(platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, base_key)) { - platform::MKLDNNDeviceContext::tls().log_lib_version(); - } - - std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_src_mem_p"); - } - - std::shared_ptr AcquireDstMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); - } - - std::shared_ptr AcquireDiffSrcMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p"); - } - - std::shared_ptr AcquireDiffDstMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p"); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md, void* ptr, const std::string& suffix) { - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - mem_p = std::make_shared(md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md, const std::string& suffix) { - const auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - mem_p = std::make_shared(md, engine_); - dev_ctx_.SetBlob(local_key, mem_p); - } - return mem_p; - } - - // This incarnation of AcquireMemory can call user function eg. custom reorder - // or preprocessing routine if needed - std::shared_ptr AcquireMemory( - const mkldnn::memory::desc& md, void* ptr, const std::string& suffix, - user_function custom_func = {}) { - /*Generate key*/ - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - // Call custom reorder/preprocessing func if available - if (custom_func) { - auto reordered_data = custom_func(reinterpret_cast(ptr)); - dev_ctx_.SetBlob(local_key + "-custom_reorder", reordered_data); - ptr = reinterpret_cast(reordered_data.get()); - } - - mem_p = std::make_shared(md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireMemory( - const std::vector& dims, const mkldnn::memory::data_type dtype, - const MKLDNNMemoryFormat& fmt, void* ptr, const std::string& suffix) { - /*Generate key*/ - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - auto md = mkldnn::memory::desc(dims, dtype, fmt); - - mem_p = std::make_shared(md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireMemory( - const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p, - const std::string& suffix, - std::vector& pipeline) { // NOLINT - auto local_key = key_ + suffix; - auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto stored_reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - - if (stored_reorder_p) { - pipeline.push_back(*stored_reorder_p); - } else { - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - - return target_memory_p; - } - - std::shared_ptr AcquireMemory( - mkldnn::memory::desc& md, // NOLINT - mkldnn::memory::desc& user_md, // NOLINT - const std::shared_ptr user_memory_p, - const std::string& suffix, - std::vector& pipeline, // NOLINT - bool is_persistent = false, bool is_INT8 = false, - std::vector scale_data = {1.0f}, int mask = 0) { - // create reorder primitive if the input format is not the preferred one - auto local_key = key_ + suffix; - auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto target_memory_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (target_memory_p == nullptr) { - target_memory_p = user_memory_p; - if (md != user_md) { - target_memory_p = std::make_shared(md, engine_); - std::shared_ptr reorder_pd; - if (is_INT8) { - mkldnn::primitive_attr - attri; // attribute for int8 weights and bias data reorder. - attri.set_output_scales(mask, scale_data); - - reorder_pd = std::shared_ptr( - new mkldnn::reorder::primitive_desc(*user_memory_p, - *target_memory_p, attri)); - } else { - reorder_pd = std::shared_ptr( - new mkldnn::reorder::primitive_desc(*user_memory_p, - *target_memory_p)); - } - auto reorder_p = - std::shared_ptr(new mkldnn::reorder(*reorder_pd)); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - dev_ctx_.SetBlob(local_key, target_memory_p); - } else if (!is_persistent) { - // Make reorder if needed - auto reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - if (reorder_p != nullptr) { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - } - return target_memory_p; - } - - protected: - const MKLDNNDeviceContext& dev_ctx_; - mkldnn::engine engine_; - std::string key_; -}; - template class BinaryMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT { @@ -1143,362 +958,6 @@ class ReorderMKLDNNHandler { mkldnn::engine engine_; }; -template -struct convolutional_algorithm; - -template <> -struct convolutional_algorithm { - static constexpr mkldnn::algorithm T = mkldnn::algorithm::convolution_direct; -}; - -template <> -struct convolutional_algorithm { - static constexpr mkldnn::algorithm T = - mkldnn::algorithm::deconvolution_direct; -}; - -template -class ConvMKLDNNTemplateHandler : public MKLDNNHandler { - public: - ConvMKLDNNTemplateHandler(const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) {} - - // TODO(jczaja): remove after conv int8 is adapted - ConvMKLDNNTemplateHandler( - std::shared_ptr conv_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) { - conv_pd_ = conv_pd; - } - - ConvMKLDNNTemplateHandler( - std::shared_ptr conv_pd, - std::shared_ptr - conv_bwd_data_pd, - std::shared_ptr - conv_bwd_weights_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - conv_pd_(conv_pd), - conv_bwd_weights_pd_(conv_bwd_weights_pd), - conv_bwd_data_pd_(conv_bwd_data_pd) { - // If we are in Grad operatgor then update a key with BWD suffix to - // distinguish from FWD memory primitives - key_ += "-BWD"; - } - - size_t GetDstMemorySize() const { return conv_pd_->dst_desc().get_size(); } - - MKLDNNMemoryFormat GetDstFormat() const { - return paddle::platform::GetMKLDNNFormat(conv_pd_->dst_desc()); - } - - size_t GetDiffWeightsMemorySize() const { - return conv_bwd_weights_pd_->diff_weights_desc().get_size(); - } - - size_t GetDiffSourceMemorySize() const { - return conv_bwd_data_pd_->diff_src_desc().get_size(); - } - - std::shared_ptr AcquireSrcMemoryFromWeightsPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto src_pd = conv_bwd_weights_pd_->src_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(src_pd, user_pd, user_memory_p, - "@weights-src_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffDstMemoryFromWeightsPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, - "@weights-diff_dst_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( - void* ptr) { - return this->AcquireMemoryFromPrimitive( - conv_bwd_weights_pd_->diff_weights_desc(), ptr, "@diff_weights_mem_p"); - } - - std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( - void) { - return this->AcquireMemoryFromPrimitive( - conv_bwd_weights_pd_->diff_weights_desc(), "@diff_weights_mem_p"); - } - - std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, - "@data-diff_dst_mem_p", pipeline); - } - - std::shared_ptr AcquireWeightsMemoryFromDataPrimitive( - const std::shared_ptr user_weights_memory_p, - std::vector& pipeline) { // NOLINT - auto weights_pd = conv_bwd_data_pd_->weights_desc(); - auto user_pd = user_weights_memory_p->get_desc(); - return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, - "@data-weights_mem_p", pipeline); - } - - std::shared_ptr AcquireResidualDataMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p"); - } - - std::shared_ptr AcquireDstMemoryFromResidualDataMemory( - const std::shared_ptr& user_residual_memory_p, - void* dst_ptr, - std::vector& pipeline) { // NOLINT - return this->AcquireMemory(user_residual_memory_p, - this->AcquireDstMemoryFromPrimitive(dst_ptr), - "@residual_data_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffSrcMemoryFromDataPrimitive( - void* ptr) { - return this->AcquireMemoryFromPrimitive(conv_bwd_data_pd_->diff_src_desc(), - ptr, "@diff_src_mem_p"); - } - - std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive(conv_pd_->dst_desc(), ptr, - "@dst_mem_p"); - } - - std::shared_ptr AcquireSrcMemoryFromPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto src_pd = conv_pd_->src_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", - pipeline); - } - - std::shared_ptr AcquireWeightsMemory( - const mkldnn::memory::desc& md, void* ptr, - user_function custom_func = {}) { - return this->AcquireMemory(md, ptr, "@user_weights_mem_p", custom_func); - } - - std::shared_ptr AcquireBiasMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_bias_mem_p"); - } - - std::shared_ptr AcquireWeightsMemoryFromPrimitive( - const std::shared_ptr user_weights_memory_p, - std::vector& pipeline, // NOLINT - bool is_persistent = false, bool is_INT8 = false, - std::vector scale_data = {1.0f}, int mask = 0) { - auto user_weights_pd = user_weights_memory_p->get_desc(); - auto weights_pd = conv_pd_->weights_desc(); - return this->AcquireMemory( - weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", - pipeline, is_persistent, is_INT8, scale_data, mask); - } - - std::shared_ptr AcquireBiasMemoryFromPrimitive( - const std::shared_ptr user_bias_memory_p, - std::vector& pipeline, // NOLINT - bool is_persistent = false, bool is_INT8 = false, - std::vector scale_data = {1.0f}, - int mask = 0) { // NOLINT - auto user_bias_pd = user_bias_memory_p->get_desc(); - auto bias_pd = conv_pd_->bias_desc(); - return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, - "@bias_mem_p", pipeline, is_persistent, is_INT8, - scale_data, mask); - } - - mkldnn::primitive_attr CreatePostOps( - std::string fuse_activation, float fuse_alpha, float fuse_beta, - bool fuse_residual_conn, const std::vector output_shift_scale = {}, - float sum_scale = 1.0f) const { - mkldnn::primitive_attr conv_attr; - mkldnn::post_ops post_operations; - if (output_shift_scale.size() > 0) { - int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; - conv_attr.set_output_scales(mask, output_shift_scale); - } - // Fusion with Elementwise layer relies on adding a sum post-operation with - // the scale parameter. It is assumed that when fuse_residual_connection is - // true, the output tensor contains the data coming from residual - // connection. The result of this post_op is: - // Output = scale * Output + Conv_Out. - if (fuse_residual_conn) { - post_operations.append_sum(sum_scale); - } - // Fusion with ReLU layer is executed through the PostOps feature. Create a - // PostOps object and configure it to execute an eltwise relu operation. - if (fuse_activation == "relu" || fuse_activation == "leaky_relu") { - constexpr float scale = 1.0f; - post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, - fuse_alpha, fuse_beta); - } else if (fuse_activation == "relu6") { - constexpr float scale = 1.0f; - post_operations.append_eltwise(scale, - mkldnn::algorithm::eltwise_bounded_relu, - fuse_alpha, fuse_beta); - } else if (fuse_activation == "swish") { - constexpr float scale = 1.0f; - post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_swish, - fuse_alpha, fuse_beta); - } - conv_attr.set_post_ops(post_operations); - return conv_attr; - } - - std::shared_ptr - AcquireConvolutionPrimitiveDescriptor( - const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, - paddle::optional bias, - const mkldnn::memory::desc& dst, const std::vector& strides, - const std::vector& dilations, - const std::vector& paddings, const mkldnn::engine& engine, - const std::string& fuse_activation, float fuse_alpha, float fuse_beta, - const bool fuse_residual_conn, mkldnn::prop_kind fwd_prop_kind, - const std::vector output_shift_scale = {}, - const float sum_scale = 1.0f) { - // Conv PD has to be passed to Grad op that - // may be exxecuted by diffrent thread, hence - // for that one we use key that does not contain TID - const std::string key_conv_pd = key_ + "@conv_pd"; - - conv_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_conv_pd)); - - if (conv_pd_ == nullptr) { - mkldnn::memory::dims stride_dims = strides; - mkldnn::memory::dims dilations_dims = dilations; - auto mkldnn_paddings = ToMkldnnPadding(paddings); - - auto conv_desc = - bias ? typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, src, - weights, *bias, dst, stride_dims, dilations_dims, - mkldnn_paddings[0], mkldnn_paddings[1]) - : typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, src, - weights, dst, stride_dims, dilations_dims, - mkldnn_paddings[0], mkldnn_paddings[1]); - - mkldnn::primitive_attr conv_attr = - CreatePostOps(fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, output_shift_scale, sum_scale); - - conv_pd_.reset( - new typename forward_t::primitive_desc(conv_desc, conv_attr, engine)); - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx_.SetBlob(key_conv_pd, conv_pd_); - } - - return conv_pd_; - } - - std::shared_ptr AcquireConvolution() { - auto prim_key = key_ + "@conv_p"; - auto conv_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (conv_p == nullptr) { - conv_p = std::make_shared(*conv_pd_); - - dev_ctx_.SetBlob(prim_key, conv_p); - } - return conv_p; - } - - std::shared_ptr AcquireConvolutionBackwardWeights() { - auto prim_key = key_ + "@conv_bwd_weights_p"; - auto conv_bwd_weights_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - if (conv_bwd_weights_p == nullptr) { - // create backward conv primitive for weights - conv_bwd_weights_p = - std::make_shared(*conv_bwd_weights_pd_); - dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p); - } - return conv_bwd_weights_p; - } - - std::shared_ptr AcquireConvolutionBackwardData() { - auto prim_key = key_ + "@conv_bwd_data_p"; - auto conv_bwd_data_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (conv_bwd_data_p == nullptr) { - conv_bwd_data_p = std::make_shared(*conv_bwd_data_pd_); - dev_ctx_.SetBlob(prim_key, conv_bwd_data_p); - } - return conv_bwd_data_p; - } - - private: - std::shared_ptr conv_pd_; - std::shared_ptr - conv_bwd_weights_pd_; - std::shared_ptr conv_bwd_data_pd_; -}; - -using ConvMKLDNNHandler = - ConvMKLDNNTemplateHandler; - -template -static std::shared_ptr SetDstMemory( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const std::shared_ptr& handler) { - T* output_data = - output->mutable_data(ctx.GetPlace(), handler->GetDstMemorySize()); - std::shared_ptr dst_memory_p = - handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); - return dst_memory_p; -} - -template -static std::shared_ptr SetDstMemory( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const framework::Tensor* residual_param, - const mkldnn::memory::desc& user_residual_md, - const std::shared_ptr& handler, - std::vector* pipeline) { - const T* residual_param_data = residual_param->data(); - PADDLE_ENFORCE_NOT_NULL( - residual_param_data, - platform::errors::PreconditionNotMet("Residual parameter is required for " - "the DNNL conv+elementwise_add " - "fusion, but now it is missing.")); - std::shared_ptr user_residual_memory_p = - handler->AcquireResidualDataMemory(user_residual_md, - to_void_cast(residual_param_data)); - T* output_data = output->mutable_data(ctx.GetPlace()); - std::shared_ptr dst_memory_p = - handler->AcquireDstMemoryFromResidualDataMemory( - user_residual_memory_p, to_void_cast(output_data), *pipeline); - return dst_memory_p; -} - -template -static void SetDstMemoryHandler( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const std::shared_ptr& handler, - std::shared_ptr dst_memory_p) { - T* output_data = - output->mutable_data(ctx.GetPlace(), handler->GetDstMemorySize()); - dst_memory_p->set_data_handle(to_void_cast(output_data)); -} - template static void SetDstMemoryQuantized( const framework::ExecutionContext& ctx, framework::Tensor* output, @@ -1524,5 +983,6 @@ static void SetDstMemoryQuantized( dst_memory.reset( new mkldnn::memory(*dst_md, engine, to_void_cast(output_data))); } + } // namespace platform } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index db05801c7227b03d7f7a06639abe9d3a779d5faf..8ea4e369d323619db35ebf41d4dd053fc5ffe4d9 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -20,7 +20,8 @@ import numpy as np import paddle import paddle.fluid.core as core import paddle.fluid as fluid -from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from paddle.fluid.tests.unittests.op_test import ( + OpTest, convert_float_to_uint16, get_numeric_gradient) from paddle.fluid.tests.unittests.testsuite import create_op from paddle.fluid import Program, program_guard diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index 027c806fc02e90d792a7316bd9169482ede766fa..89125dc326d15b6538618f044d62642e0c11d609 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -22,7 +22,7 @@ import paddle.nn as nn paddle.enable_static() import paddle.fluid.core as core import paddle.fluid as fluid -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest def conv2dtranspose_forward_naive(input_, filter_, attrs):