diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 72c50518af08b9c1b2f97e6864e5836e806c77fc..10aa7a59422f4508dda8d0bcd960583056e25938 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, out_layout = out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; - auto& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = dynamic_cast( - pool.Get(expected_kernel_type.place_)); - auto& cpu_engine = dev_ctx->GetEngine(); - std::vector in_tz = paddle::framework::vectorize2int(in.dims()); std::vector out_tz = in_tz; @@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, "Input tensor type is not supported: %s", in.type()); memory::data_type out_type = in_type; - auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); - auto out_format = - platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); - // output tensor has the same dims as input. Reorder don't change dims out->Resize(in.dims()); - if (in_format != out_format) { + // tempory mem pd fr out , to make reorder + auto out_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(out->dims()), + mkldnn::memory::format::blocked, out_type); + if (in.get_mkldnn_prim_desc() != out_mem_pd) { void* in_data = GetDataFromTensor(in, in_type); auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); - auto in_memory = - memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); - auto out_memory = - memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); + auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data); + auto out_memory = memory(out_mem_pd, out_data); platform::Reorder(in_memory, out_memory); } else { out->ShareDataWith(in); } out->set_layout(out_layout); - // reset format since the out tensor will be feed to non-MKLDNN OPkernel - out->set_format(memory::format::format_undef); #endif } diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 82872224501709080ff02a13464d58543a0abda8..f0203edf05635452bf347335066dadc24ecc3138 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type, #ifdef PADDLE_WITH_MKLDNN // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. No real transform occur - - auto out_format = platform::MKLDNNFormatForSize(in.dims().size(), - ToMKLDNNFormat(lin)); - out.ShareDataWith(input_tensor); - out.set_layout(DataLayout::kMKLDNN); - out.set_format(out_format); + // TODO(jczaja): Remove that once all mkldnn ops + // are modified to work with mkldnn_blocked + auto mkldnn_fmt = [&](int rank) { + switch (rank) { + case 5: + return mkldnn::memory::format::ncdhw; + case 4: + return mkldnn::memory::format::nchw; + case 3: + return mkldnn::memory::format::ncw; + case 2: + return mkldnn::memory::format::nc; + case 1: + return mkldnn::memory::format::x; + default: + return mkldnn::memory::format::blocked; + } + }; + + auto out_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(out.dims()), + mkldnn_fmt(out.dims().size())); + + out.set_mkldnn_prim_desc(out_mem_pd); #endif } else { // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 40606d9b06baf4dbebf87f3c02580e49ae6e2a70..88f5b757a8111f6a7e269ff71054dab425c0de01 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -27,6 +27,10 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_utils.h" +#endif + namespace paddle { namespace framework { @@ -37,10 +41,34 @@ class Tensor { #ifdef PADDLE_WITH_MKLDNN public: - inline mkldnn::memory::format format() const { return format_; } + // TODO(jczaja): This is depracted and will be removed + inline mkldnn::memory::format format() const { + if (layout_ == DataLayout::kMKLDNN) { + return static_cast(mem_pd_.desc().data.format); + } else { + return mkldnn::memory::format::format_undef; + } + } - inline void set_format(const mkldnn::memory::format format) { - format_ = format; + // TODO(jczaja): This is depracted and will be removed + inline void set_format( + const mkldnn::memory::format fmt, + mkldnn::memory::data_type data_type = mkldnn::memory::f32) { + mem_pd_ = paddle::platform::create_prim_desc_from_format( + paddle::framework::vectorize2int(dims()), fmt, data_type); + layout_ = DataLayout::kMKLDNN; + } + + inline mkldnn::memory::primitive_desc get_mkldnn_prim_desc() const { + return mem_pd_; + } + + inline void set_mkldnn_prim_desc( + const mkldnn::memory::primitive_desc& mem_pd) { + // Internally MKL-DNN is just copying (increasing reference counter) + // to shared_ptr. So asignment should be quite cheap + mem_pd_ = mem_pd; + layout_ = DataLayout::kMKLDNN; } protected: @@ -48,12 +76,9 @@ class Tensor { * @brief the detail format of memory block which have layout as kMKLDNN * * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, - * nChw16c, etc. For a MKLDNN memory block, layout will be set as - * DataLayout::kMKLDNN meanwhile detail memory format will be kept in - * this field. + * nChw16c, etc. For a MKLDNN memory block, we store memory descriptor */ - - mkldnn::memory::format format_ = mkldnn::memory::format::format_undef; + mutable mkldnn::memory::primitive_desc mem_pd_; #endif public: diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 0ce174654e85175f0b949f860a00afafc548ed3e..7ac64e6ba134c034acc58c7310cd51da0f03d16d 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -96,12 +96,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; auto* output = ctx.Output("Output"); - PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && - input->format() != memory::format::format_undef, - "Wrong layout/format set for Input tensor"); - PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && - filter->format() != memory::format::format_undef, - "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN); PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, @@ -148,14 +144,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector pipeline; - auto src_format = input->format(); - mkldnn::memory::format weights_format = - GetWeightsFormat(filter->format(), g, is_conv3d); - - auto user_src_md = platform::MKLDNNMemDesc( - {src_tz}, platform::MKLDNNGetDataType(), src_format); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), weights_format); + // For convolution with groups we need to recreate primitive descriptor + // as Paddle tensor is not having group dims while mkldnn treats + // group as another dimensions + mkldnn::memory::primitive_desc user_weights_mpd = + filter->get_mkldnn_prim_desc(); + if (g > 1) { + mkldnn::memory::format weights_format = + GetWeightsFormat(filter->format(), g, is_conv3d); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), weights_format); + user_weights_mpd = + mkldnn::memory::primitive_desc(user_weights_md, mkldnn_engine); + } /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -165,7 +166,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); - weights_format = mkldnn::memory::format::any; + mkldnn::memory::format weights_format = mkldnn::memory::format::any; // Check the format for user's special output if (chosen_memory_format != mkldnn::memory::format::any) { if (is_conv3d) { @@ -205,10 +206,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); // create mkldnn memory from input tensors (data/weights) - auto user_src_memory_p = - handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_src_memory_p = handler.AcquireSrcMemory( + input->get_mkldnn_prim_desc(), to_void_cast(input_data)); auto user_weights_memory_p = handler.AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); + user_weights_mpd, to_void_cast(filter_data)); // create reorder primitive if the input format is not the preferred one auto src_memory_p = @@ -281,8 +282,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*conv_p); stream(stream::kind::eager).submit(pipeline).wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + auto dst_mpd = dst_memory_p->get_primitive_desc(); + output->set_mkldnn_prim_desc(dst_mpd); } void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { const bool is_test = ctx.Attr("is_test"); @@ -947,8 +948,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // push primitive to stream and wait until it's executed pipeline.push_back(*conv_bwd_weights_p); - filter_grad->set_layout(DataLayout::kMKLDNN); - filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p)); + auto filter_grad_mpd = diff_weights_memory_p->get_primitive_desc(); + filter_grad->set_mkldnn_prim_desc(filter_grad_mpd); } if (input_grad) { diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index 76b00b396c1349eff5db1059268e7cf280a8fc64..d01e8dbf4ce0c92bb81fc76df68d5424f9da0717 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -42,8 +42,12 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { // The format of output is set as the mkldnn's format // TODO(@mozga-intel) The format of matrix sets inside the another layers. - tensor->set_layout(DataLayout::kMKLDNN); - tensor->set_format(mkldnn::memory::format::oihw); + // TODO(jczaja): Remove this hack after checking performance on block layout + + auto tensor_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(tensor->dims()), + mkldnn::memory::format::oihw); + tensor->set_mkldnn_prim_desc(tensor_mem_pd); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index e6df7028f540d0928e2bb0763bd4cfef12059665..e41bfb80dfc0452955f7978f74ccfea184886b69 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn_engine, key); auto transpose_src_memory_p = handler.AcquireSrcMemory( - input->format(), platform::to_void_cast(input_data)); + input->get_mkldnn_prim_desc(), platform::to_void_cast(input_data)); auto transpose_dst_memory_p = handler.AcquireDstMemory(output, ctx.GetPlace()); auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, @@ -61,6 +61,15 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector pipeline; pipeline.push_back(*transpose_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + + // Transpose did change logical dimensions of Tensor, but reorder does not. + // Reorder does change only physical layout eg. format , strides + // so we need to create new primitive descriptor with changed logical layout + // so it match output shape + auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(output->dims()), + mkldnn::memory::format::blocked); + output->set_mkldnn_prim_desc(output_mem_pd); } }; @@ -102,8 +111,9 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx, mkldnn_engine, key); - auto transpose_src_memory_p = handler.AcquireSrcMemory( - out_grad->format(), platform::to_void_cast(out_grad_data)); + auto transpose_src_memory_p = + handler.AcquireSrcMemory(out_grad->get_mkldnn_prim_desc(), + platform::to_void_cast(out_grad_data)); auto transpose_dst_memory_p = handler.AcquireDstMemory(x_grad, ctx.GetPlace()); auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, @@ -112,6 +122,15 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::vector pipeline; pipeline.push_back(*transpose_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + + // Transpose did change logical dimensions of Tensor, but reorder does not. + // Reorder does change only physical layout eg. format , strides + // so we need to create new primitive descriptor with changed logical layout + // so it match output shape + auto x_grad_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(x_grad->dims()), + mkldnn::memory::format::blocked); + x_grad->set_mkldnn_prim_desc(x_grad_mem_pd); } }; diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 908499e0d8dc679a714a332c8dfe5f16bfbdcd3d..4a674ca526f455314613d43847faa7e01f4d7802 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -39,6 +39,45 @@ class MKLDNNHandler { return this->AcquireMemory(md, ptr, "@user_src_mem_p"); } + // TODO(jczaja): extract common part and make AcquireMemory + std::shared_ptr AcquireSrcMemory( + const mkldnn::memory::primitive_desc& mpd, void* ptr) { + auto local_key = key_ + "@user_src_mem_p"; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + " find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared(mpd, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + + std::shared_ptr AcquireWeightsMemory( + const mkldnn::memory::primitive_desc& mpd, void* ptr) { + auto local_key = key_ + "@user_weights_mem_p"; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + " find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared(mpd, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + std::shared_ptr AcquireWeightsMemory( const mkldnn::memory::desc& md, void* ptr, user_function custom_func = {}) { @@ -273,37 +312,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key), dims_(dims), - axis_(axis), - logical_axis_(dims.size(), 0) {} - - std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::format& fmt, void* ptr) { - auto local_key = key_ + "@user_src_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), - " find mem primitive in device context"); - if (mem_p == nullptr) { - // Make memory descriptor using input format, unless it - // cannot be trusted (nchw) then make up memory fmt manually - for (size_t i = 0; i < logical_axis_.size(); ++i) { - logical_axis_[i] = i; - } - auto src_md = fmt != mkldnn::memory::format::nchw - ? platform::MKLDNNMemDesc( - dims_, platform::MKLDNNGetDataType(), fmt) - : Axis2MemoryDesc(dims_, logical_axis_); - mem_p = std::make_shared( - mkldnn::memory::primitive_desc{src_md, engine_}, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - // Mark that reusing happenned. All primitives from operator instance - // should be reused or none of them. So we check consistency - is_reusing_ = true; - } - return mem_p; - } + axis_(axis) {} std::shared_ptr AcquireDstMemory(framework::Tensor* output, platform::Place place) { @@ -388,7 +397,6 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { private: std::vector dims_; std::vector axis_; - std::vector logical_axis_; }; template diff --git a/paddle/fluid/platform/mkldnn_utils.h b/paddle/fluid/platform/mkldnn_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..8c511f97d12cfe299ad5629eff1871e8d156c850 --- /dev/null +++ b/paddle/fluid/platform/mkldnn_utils.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace platform { + +inline mkldnn::memory::primitive_desc create_prim_desc_from_dims( + const std::vector& ltz, mkldnn::memory::format fmt, + mkldnn::memory::data_type data_type = mkldnn::memory::data_type::f32) { + mkldnn_memory_desc_t mem_fmt; + + mem_fmt.primitive_kind = mkldnn_memory; + mem_fmt.ndims = ltz.size(); + for (unsigned int i = 0; i < ltz.size(); ++i) { + mem_fmt.dims[i] = ltz[i]; // logical dimensions (nchw format, + // regardless physical layout) + } + mem_fmt.data_type = static_cast(data_type); + mem_fmt.format = static_cast(fmt); + + unsigned int total_stride = 1; + for (int i = ltz.size() - 1; i >= 0; --i) { + mem_fmt.layout_desc.blocking.padding_dims[i] = + ltz[i]; // logical dimensions (nchw format, regardless physical + // layout) + mem_fmt.layout_desc.blocking.block_dims[i] = 1; + mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset + mem_fmt.layout_desc.blocking.strides[0][i] = total_stride; + mem_fmt.layout_desc.blocking.strides[1][i] = 1; + total_stride *= ltz[i]; + } + mem_fmt.layout_desc.blocking.offset_padding = 0; // no initial offset + + auto& pool = platform::DeviceContextPool::Instance(); + auto place = paddle::platform::CPUPlace(); + auto* dev_ctx = dynamic_cast(pool.Get(place)); + auto& cpu_engine = dev_ctx->GetEngine(); + return mkldnn::memory::primitive_desc(mem_fmt, cpu_engine); +} + +inline mkldnn::memory::primitive_desc create_prim_desc_from_format( + const std::vector& ltz, const mkldnn::memory::format format, + const mkldnn::memory::data_type data_type) { + auto md = mkldnn::memory::desc({ltz}, data_type, format); + auto& pool = platform::DeviceContextPool::Instance(); + auto place = paddle::platform::CPUPlace(); + auto dev_ctx = dynamic_cast(pool.Get(place)); + PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device"); + auto& cpu_engine = dev_ctx->GetEngine(); + return mkldnn::memory::primitive_desc(md, cpu_engine); +} + +} // namespace platform +} // namespace paddle