提交 dec9cf53 编写于 作者: J Jacek Czaja 提交者: Tao Luo

[MKL-DNN] MKL-DNN specific Tensor modification (#15429)

* - Implemented draft of primitive desc keeping in Tensor

test=develop

- TransposeMKLDNNHandler::AcquireSrcMemory was reimplemented

- Added nchw and nc formats setting for sake of compatiblity

Fixed unit tests

- Worakaround to problem with 5D data in conv

- Added 3D and 1D MKL-DNN formats for name handles for tensor

test=develop

- Fix to UTs

test=develop

- Conv fp32 op was updated

Cosmetic fixes

test=develop

- tensor mkldnn cosmetics

test=develop

- Moved most of mkl-dnn specific code from Tensor to mkl-dnn utils

* - Lint fixes

test=develop

* - setting prim dec in Tensor , sets also layout to kMKLDNN

test=develop

* - Moved creation of prim desc totally out of Tensor

test=develop

* - Cosmetic fixes adter review

test=develop
上级 e41db6e1
......@@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
pool.Get(expected_kernel_type.place_));
auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
std::vector<int> out_tz = in_tz;
......@@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: %s", in.type());
memory::data_type out_type = in_type;
auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
auto out_format =
platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
// output tensor has the same dims as input. Reorder don't change dims
out->Resize(in.dims());
if (in_format != out_format) {
// tempory mem pd fr out , to make reorder
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out->dims()),
mkldnn::memory::format::blocked, out_type);
if (in.get_mkldnn_prim_desc() != out_mem_pd) {
void* in_data = GetDataFromTensor(in, in_type);
auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
auto in_memory =
memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
auto out_memory =
memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data);
auto out_memory = memory(out_mem_pd, out_data);
platform::Reorder(in_memory, out_memory);
} else {
out->ShareDataWith(in);
}
out->set_layout(out_layout);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel
out->set_format(memory::format::format_undef);
#endif
}
......
......@@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type,
#ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur
auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
ToMKLDNNFormat(lin));
out.ShareDataWith(input_tensor);
out.set_layout(DataLayout::kMKLDNN);
out.set_format(out_format);
// TODO(jczaja): Remove that once all mkldnn ops
// are modified to work with mkldnn_blocked
auto mkldnn_fmt = [&](int rank) {
switch (rank) {
case 5:
return mkldnn::memory::format::ncdhw;
case 4:
return mkldnn::memory::format::nchw;
case 3:
return mkldnn::memory::format::ncw;
case 2:
return mkldnn::memory::format::nc;
case 1:
return mkldnn::memory::format::x;
default:
return mkldnn::memory::format::blocked;
}
};
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out.dims()),
mkldnn_fmt(out.dims().size()));
out.set_mkldnn_prim_desc(out_mem_pd);
#endif
} else {
// Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
......
......@@ -27,6 +27,10 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_utils.h"
#endif
namespace paddle {
namespace framework {
......@@ -37,10 +41,34 @@ class Tensor {
#ifdef PADDLE_WITH_MKLDNN
public:
inline mkldnn::memory::format format() const { return format_; }
// TODO(jczaja): This is depracted and will be removed
inline mkldnn::memory::format format() const {
if (layout_ == DataLayout::kMKLDNN) {
return static_cast<mkldnn::memory::format>(mem_pd_.desc().data.format);
} else {
return mkldnn::memory::format::format_undef;
}
}
inline void set_format(const mkldnn::memory::format format) {
format_ = format;
// TODO(jczaja): This is depracted and will be removed
inline void set_format(
const mkldnn::memory::format fmt,
mkldnn::memory::data_type data_type = mkldnn::memory::f32) {
mem_pd_ = paddle::platform::create_prim_desc_from_format(
paddle::framework::vectorize2int(dims()), fmt, data_type);
layout_ = DataLayout::kMKLDNN;
}
inline mkldnn::memory::primitive_desc get_mkldnn_prim_desc() const {
return mem_pd_;
}
inline void set_mkldnn_prim_desc(
const mkldnn::memory::primitive_desc& mem_pd) {
// Internally MKL-DNN is just copying (increasing reference counter)
// to shared_ptr. So asignment should be quite cheap
mem_pd_ = mem_pd;
layout_ = DataLayout::kMKLDNN;
}
protected:
......@@ -48,12 +76,9 @@ class Tensor {
* @brief the detail format of memory block which have layout as kMKLDNN
*
* @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
* nChw16c, etc. For a MKLDNN memory block, layout will be set as
* DataLayout::kMKLDNN meanwhile detail memory format will be kept in
* this field.
* nChw16c, etc. For a MKLDNN memory block, we store memory descriptor
*/
mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
mutable mkldnn::memory::primitive_desc mem_pd_;
#endif
public:
......
......@@ -96,12 +96,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
auto* output = ctx.Output<Tensor>("Output");
PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
input->format() != memory::format::format_undef,
"Wrong layout/format set for Input tensor");
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
filter->format() != memory::format::format_undef,
"Wrong layout/format set for Filter tensor");
PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN);
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN);
PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
"Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
......@@ -148,14 +144,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<primitive> pipeline;
auto src_format = input->format();
mkldnn::memory::format weights_format =
GetWeightsFormat(filter->format(), g, is_conv3d);
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
// For convolution with groups we need to recreate primitive descriptor
// as Paddle tensor is not having group dims while mkldnn treats
// group as another dimensions
mkldnn::memory::primitive_desc user_weights_mpd =
filter->get_mkldnn_prim_desc();
if (g > 1) {
mkldnn::memory::format weights_format =
GetWeightsFormat(filter->format(), g, is_conv3d);
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
user_weights_mpd =
mkldnn::memory::primitive_desc(user_weights_md, mkldnn_engine);
}
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
......@@ -165,7 +166,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
weights_format = mkldnn::memory::format::any;
mkldnn::memory::format weights_format = mkldnn::memory::format::any;
// Check the format for user's special output
if (chosen_memory_format != mkldnn::memory::format::any) {
if (is_conv3d) {
......@@ -205,10 +206,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
// create mkldnn memory from input tensors (data/weights)
auto user_src_memory_p =
handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
auto user_src_memory_p = handler.AcquireSrcMemory(
input->get_mkldnn_prim_desc(), to_void_cast<T>(input_data));
auto user_weights_memory_p = handler.AcquireWeightsMemory(
user_weights_md, to_void_cast<T>(filter_data));
user_weights_mpd, to_void_cast<T>(filter_data));
// create reorder primitive if the input format is not the preferred one
auto src_memory_p =
......@@ -281,8 +282,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*conv_p);
stream(stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(*dst_memory_p));
auto dst_mpd = dst_memory_p->get_primitive_desc();
output->set_mkldnn_prim_desc(dst_mpd);
}
void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const {
const bool is_test = ctx.Attr<bool>("is_test");
......@@ -947,8 +948,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// push primitive to stream and wait until it's executed
pipeline.push_back(*conv_bwd_weights_p);
filter_grad->set_layout(DataLayout::kMKLDNN);
filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p));
auto filter_grad_mpd = diff_weights_memory_p->get_primitive_desc();
filter_grad->set_mkldnn_prim_desc(filter_grad_mpd);
}
if (input_grad) {
......
......@@ -42,8 +42,12 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
// The format of output is set as the mkldnn's format
// TODO(@mozga-intel) The format of matrix sets inside the another layers.
tensor->set_layout(DataLayout::kMKLDNN);
tensor->set_format(mkldnn::memory::format::oihw);
// TODO(jczaja): Remove this hack after checking performance on block layout
auto tensor_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(tensor->dims()),
mkldnn::memory::format::oihw);
tensor->set_mkldnn_prim_desc(tensor_mem_pd);
}
};
} // namespace operators
......
......@@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mkldnn_engine, key);
auto transpose_src_memory_p = handler.AcquireSrcMemory(
input->format(), platform::to_void_cast<T>(input_data));
input->get_mkldnn_prim_desc(), platform::to_void_cast<T>(input_data));
auto transpose_dst_memory_p =
handler.AcquireDstMemory(output, ctx.GetPlace());
auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
......@@ -61,6 +61,15 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(*transpose_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
// Transpose did change logical dimensions of Tensor, but reorder does not.
// Reorder does change only physical layout eg. format , strides
// so we need to create new primitive descriptor with changed logical layout
// so it match output shape
auto output_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(output->dims()),
mkldnn::memory::format::blocked);
output->set_mkldnn_prim_desc(output_mem_pd);
}
};
......@@ -102,8 +111,9 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx,
mkldnn_engine, key);
auto transpose_src_memory_p = handler.AcquireSrcMemory(
out_grad->format(), platform::to_void_cast<T>(out_grad_data));
auto transpose_src_memory_p =
handler.AcquireSrcMemory(out_grad->get_mkldnn_prim_desc(),
platform::to_void_cast<T>(out_grad_data));
auto transpose_dst_memory_p =
handler.AcquireDstMemory(x_grad, ctx.GetPlace());
auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p,
......@@ -112,6 +122,15 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(*transpose_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
// Transpose did change logical dimensions of Tensor, but reorder does not.
// Reorder does change only physical layout eg. format , strides
// so we need to create new primitive descriptor with changed logical layout
// so it match output shape
auto x_grad_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(x_grad->dims()),
mkldnn::memory::format::blocked);
x_grad->set_mkldnn_prim_desc(x_grad_mem_pd);
}
};
......
......@@ -39,6 +39,45 @@ class MKLDNNHandler {
return this->AcquireMemory(md, ptr, "@user_src_mem_p");
}
// TODO(jczaja): extract common part and make AcquireMemory
std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
const mkldnn::memory::primitive_desc& mpd, void* ptr) {
auto local_key = key_ + "@user_src_mem_p";
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
" find mem primitive in device context");
if (mem_p == nullptr) {
mem_p = std::make_shared<mkldnn::memory>(mpd, ptr);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_ = true;
}
return mem_p;
}
std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
const mkldnn::memory::primitive_desc& mpd, void* ptr) {
auto local_key = key_ + "@user_weights_mem_p";
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
" find mem primitive in device context");
if (mem_p == nullptr) {
mem_p = std::make_shared<mkldnn::memory>(mpd, ptr);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_ = true;
}
return mem_p;
}
std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
const mkldnn::memory::desc& md, void* ptr,
user_function custom_func = {}) {
......@@ -273,37 +312,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
mkldnn::engine engine, const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key),
dims_(dims),
axis_(axis),
logical_axis_(dims.size(), 0) {}
std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
const mkldnn::memory::format& fmt, void* ptr) {
auto local_key = key_ + "@user_src_mem_p";
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
" find mem primitive in device context");
if (mem_p == nullptr) {
// Make memory descriptor using input format, unless it
// cannot be trusted (nchw) then make up memory fmt manually
for (size_t i = 0; i < logical_axis_.size(); ++i) {
logical_axis_[i] = i;
}
auto src_md = fmt != mkldnn::memory::format::nchw
? platform::MKLDNNMemDesc(
dims_, platform::MKLDNNGetDataType<float>(), fmt)
: Axis2MemoryDesc(dims_, logical_axis_);
mem_p = std::make_shared<mkldnn::memory>(
mkldnn::memory::primitive_desc{src_md, engine_}, ptr);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_ = true;
}
return mem_p;
}
axis_(axis) {}
std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output,
platform::Place place) {
......@@ -388,7 +397,6 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
private:
std::vector<int> dims_;
std::vector<int> axis_;
std::vector<int> logical_axis_;
};
template <class forward_t, class backward_data_t, class backward_weights_t>
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mkldnn.h>
#include <string>
namespace paddle {
namespace platform {
inline mkldnn::memory::primitive_desc create_prim_desc_from_dims(
const std::vector<int>& ltz, mkldnn::memory::format fmt,
mkldnn::memory::data_type data_type = mkldnn::memory::data_type::f32) {
mkldnn_memory_desc_t mem_fmt;
mem_fmt.primitive_kind = mkldnn_memory;
mem_fmt.ndims = ltz.size();
for (unsigned int i = 0; i < ltz.size(); ++i) {
mem_fmt.dims[i] = ltz[i]; // logical dimensions (nchw format,
// regardless physical layout)
}
mem_fmt.data_type = static_cast<mkldnn_data_type_t>(data_type);
mem_fmt.format = static_cast<mkldnn_memory_format_t>(fmt);
unsigned int total_stride = 1;
for (int i = ltz.size() - 1; i >= 0; --i) {
mem_fmt.layout_desc.blocking.padding_dims[i] =
ltz[i]; // logical dimensions (nchw format, regardless physical
// layout)
mem_fmt.layout_desc.blocking.block_dims[i] = 1;
mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset
mem_fmt.layout_desc.blocking.strides[0][i] = total_stride;
mem_fmt.layout_desc.blocking.strides[1][i] = 1;
total_stride *= ltz[i];
}
mem_fmt.layout_desc.blocking.offset_padding = 0; // no initial offset
auto& pool = platform::DeviceContextPool::Instance();
auto place = paddle::platform::CPUPlace();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
auto& cpu_engine = dev_ctx->GetEngine();
return mkldnn::memory::primitive_desc(mem_fmt, cpu_engine);
}
inline mkldnn::memory::primitive_desc create_prim_desc_from_format(
const std::vector<int>& ltz, const mkldnn::memory::format format,
const mkldnn::memory::data_type data_type) {
auto md = mkldnn::memory::desc({ltz}, data_type, format);
auto& pool = platform::DeviceContextPool::Instance();
auto place = paddle::platform::CPUPlace();
auto dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device");
auto& cpu_engine = dev_ctx->GetEngine();
return mkldnn::memory::primitive_desc(md, cpu_engine);
}
} // namespace platform
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册