From ecd9f330c9ae86414ef31e6808f87a48ce99dac3 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Fri, 30 Aug 2019 10:00:35 +0200 Subject: [PATCH] [MKL-DNN] Fix to face model on AVX512 platforms (#19282) - Refactor step 1 - Compilation fix - Yet another compilation fix - Even more compilation fix - Lint fixes test=develop - Removed deprectaed PADDLE_ENFORCE occurance test=develop - Candidate fix to BN forward - Lint fixes test=develop - Refactoring in data_layout_transform - compilation fix - Another comppilation fix - Step further into darkness - Yet another compilation fix - Yet another compilation fix - missing header - compilation fix - Added MKLDNN -> Paddle conversion in fetch op test=develop - Compilation fix test=develop - Lint test=develop - Mul fix - Fix to MKLDNN MUL op and Elementwise MUL UT test=develop - Workaround for diffrent weights with groups representation Paddle vs MKL-DNN. test=develop - Candidate fix for 5D convolution with groups - Refactor of fix for conv3d and conv2d in fetch op test=develop - Compilation fix - Still same compilation fix - Compilation fix - Compilation fix - Reverted refactoring of fixes - Adapted test_conv2d_int8_mkldnn so it exects data in NCHW format not NHWC test=develop - minor fix in UT test=develop - Lint fixes test=develop --- .../fluid/framework/data_layout_transform.cc | 12 +++- .../fluid/framework/data_layout_transform.h | 4 ++ .../fluid/operators/controlflow/fetch_op.cc | 12 +++- .../operators/mkldnn/activation_mkldnn_op.cc | 3 +- .../operators/mkldnn/batch_norm_mkldnn_op.cc | 15 ++++- .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 3 +- paddle/fluid/platform/mkldnn_helper.h | 7 +++ paddle/fluid/platform/mkldnn_reuse.h | 57 ++++++++++--------- .../mkldnn/test_conv2d_int8_mkldnn_op.py | 17 ++---- .../mkldnn/test_elementwise_mul_mkldnn_op.py | 4 +- 10 files changed, 83 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index bbcd34260e..393b7e9887 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -121,12 +121,19 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, const Tensor& in, Tensor* out) { auto in_layout = kernel_type_for_var.data_layout_; auto out_layout = expected_kernel_type.data_layout_; + auto place = expected_kernel_type.place_; PADDLE_ENFORCE( in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN, "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to " "non-MKLDNN"); + innerTransDataLayoutFromMKLDNN(in_layout, out_layout, in, out, place); +} + +void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, + const Tensor& in, Tensor* out, + platform::Place place) { #ifdef PADDLE_WITH_MKLDNN PADDLE_ENFORCE(in.format() != memory::format::format_undef && in.format() != memory::format::any, @@ -137,8 +144,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; auto& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = dynamic_cast( - pool.Get(expected_kernel_type.place_)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto& cpu_engine = dev_ctx->GetEngine(); std::vector in_tz = paddle::framework::vectorize2int(in.dims()); @@ -165,7 +171,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, auto reorder_src_memory_p = handler.AcquireSrcMemory(in_format, in_data); auto reorder_dst_memory_p = - handler.AcquireDstMemory(out, out_format, expected_kernel_type.place_); + handler.AcquireDstMemory(out, out_format, place); auto reorder_p = handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 2c0a34b881..e34e3dc3af 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -69,6 +69,10 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, const Tensor& in, Tensor* out); +void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, + const Tensor& in, Tensor* out, + platform::Place place); + std::vector GetAxis(const DataLayout& from, const DataLayout& to); void TransDataLayout(const OpKernelType& kernel_type_for_var, diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index 85d36c5c3a..39fdf07f05 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" @@ -55,7 +56,16 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): Should we assume the fetch operator always generate // CPU outputs? if (src_item.IsInitialized() && src_item.numel() > 0) { - TensorCopySync(src_item, platform::CPUPlace(), &dst_item); + // Conversion from MKL-DNN to Paddle + if (src_item.layout() == framework::DataLayout::kMKLDNN) { + framework::Tensor out; + framework::innerTransDataLayoutFromMKLDNN( + src_item.layout(), framework::DataLayout::kNCHW, src_item, &out, + platform::CPUPlace()); + TensorCopySync(out, platform::CPUPlace(), &dst_item); + } else { + TensorCopySync(src_item, platform::CPUPlace(), &dst_item); + } } else { // Not copy, if the src tensor is empty. dst_item.clear(); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 3533418670..52de20258a 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -87,7 +87,6 @@ void eltwise_forward(const framework::ExecutionContext &ctx, auto *y = ctx.Output("Out"); const T *x_data = x->data(); - T *y_data = y->mutable_data(ctx.GetPlace()); const T alpha = ctx.op().HasAttr("alpha") ? ctx.Attr("alpha") : 0; const T beta = ctx.op().HasAttr("beta") ? ctx.Attr("beta") : 0; @@ -119,7 +118,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx, auto src_memory_p = handler.AcquireSrcMemory(md, to_void_cast(x_data)); auto dst_memory_p = - handler.AcquireDstMemoryFromPrimitive(to_void_cast(y_data)); + handler.AcquireDstMemoryFromPrimitive(y, ctx.GetPlace()); auto activation_p = handler.AcquireActivation(dst_memory_p, src_memory_p); // push primitive to stream and wait until it's executed diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index 40f7231c12..ffe908d667 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -58,6 +58,15 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandler { batch_norm_pd_->variance_primitive_desc(), ptr, "@variance_mem_p"); } + template + std::shared_ptr AcquireDstMemoryFromPrimitive( + framework::Tensor *output, platform::Place place) { + T *ptr = output->mutable_data( + place, batch_norm_pd_->dst_primitive_desc().get_size()); + return this->AcquireMemoryFromPrimitive( + batch_norm_pd_->dst_primitive_desc(), ptr, "@dst_mem_p"); + } + std::shared_ptr AcquireBatchNormPrimitiveDescriptor(const batch_norm_fwd::desc &bn_fwd_desc, const mkldnn::engine &engine) { @@ -189,7 +198,6 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { const T *x_data = x->data(); const T *mean_data = mean->data(); const T *variance_data = variance->data(); - T *y_data = y->mutable_data(ctx.GetPlace()); T *mean_out_data = mean_out->mutable_data(ctx.GetPlace()); T *variance_out_data = variance_out->mutable_data(ctx.GetPlace()); T *batch_mean_data = nullptr; @@ -250,8 +258,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { handler.AcquireScaleshiftMemoryFromPrimitive(scaleshift_data.data()); // create mkldnn memory for output y tensor - auto dst_memory = handler.AcquireDstMemory( - batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data); + auto dst_memory = + handler.AcquireDstMemoryFromPrimitive(y, ctx.GetPlace()); std::shared_ptr batch_norm_p; if (global_stats) { @@ -334,6 +342,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { const T *scale_data = scale->data(); const T *shift_data = shift->data(); T *diff_x_data = diff_x->mutable_data(ctx.GetPlace()); + T *diff_scale_data = diff_scale->mutable_data(ctx.GetPlace()); T *diff_shift_data = diff_shift->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 404f9f74d1..6d870b43ac 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -421,7 +421,8 @@ class MulMKLDNNKernel : public framework::OpKernel { out->Resize(out_dims); } out->set_layout(DataLayout::kMKLDNN); - out->set_format(out->format()); + out->set_format(platform::MKLDNNFormatForSize( + out_dims.size(), mkldnn::memory::format::nchw)); } }; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 8bcb8acee9..4f8c4b7cc8 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -131,7 +131,14 @@ inline mkldnn::memory::format MKLDNNFormatForSize( } else if (data_format == mkldnn::memory::format::nhwc) { return mkldnn::memory::format::nwc; } + } else if (dims_size == 4) { + if (data_format == mkldnn::memory::format::goihw) { + return mkldnn::memory::format::oihw; + } } else if (dims_size == 5) { + if (data_format == mkldnn::memory::format::goidhw) { + return mkldnn::memory::format::oidhw; + } if (data_format == mkldnn::memory::format::nchw) { return mkldnn::memory::format::ncdhw; } else if (data_format == mkldnn::memory::format::nhwc) { diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index b3d7ff3190..cec6bb792e 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -337,27 +337,26 @@ class ActivationMKLDNNHandler : public MKLDNNHandler { // may be executed by diffrent thread, hence // for that one we use key that does not contain TID const std::string key_activation_pd = key_common_ + "@activation_pd"; - activation_pd_ = - std::static_pointer_cast( - dev_ctx_.GetBlob(key_activation_pd)); - if (activation_pd_ == nullptr) { + fwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_activation_pd)); + if (fwd_pd_ == nullptr) { static std::mutex acquire_barrier; std::lock_guard block_threads_until_finish_this_job( acquire_barrier); - activation_pd_ = + fwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_activation_pd)); - if (activation_pd_ == nullptr) { + if (fwd_pd_ == nullptr) { auto activation_desc = mkldnn::eltwise_forward::desc( prop_kind, algorithm, md, alpha, beta); - activation_pd_.reset(new mkldnn::eltwise_forward::primitive_desc( + fwd_pd_.reset(new mkldnn::eltwise_forward::primitive_desc( activation_desc, engine_)); - dev_ctx_.SetBlob(key_activation_pd, activation_pd_); + dev_ctx_.SetBlob(key_activation_pd, fwd_pd_); } } - return activation_pd_; + return fwd_pd_; } std::shared_ptr @@ -366,23 +365,22 @@ class ActivationMKLDNNHandler : public MKLDNNHandler { const mkldnn::memory::desc& src_md, float alpha, float beta) { const std::string key_activation_pd = key_common_ + "@activation_pd"; const std::string key_activation_bwd_pd = key_ + "@activation_bwd_pd"; - activation_bwd_pd_ = + bwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_activation_bwd_pd)); - if (activation_bwd_pd_ == nullptr) { - activation_pd_ = + if (bwd_pd_ == nullptr) { + fwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_activation_pd)); // PD from FWD op has to exist. - PADDLE_ENFORCE(activation_pd_ != nullptr, - "Eltwise MKL-DNN not found in cache!"); + PADDLE_ENFORCE_NOT_NULL(fwd_pd_, "Eltwise MKL-DNN not found in cache!"); auto backward_desc = mkldnn::eltwise_backward::desc( algorithm, diff_dst_md, src_md, alpha, beta); - activation_bwd_pd_.reset(new mkldnn::eltwise_backward::primitive_desc( - backward_desc, engine_, *activation_pd_)); - dev_ctx_.SetBlob(key_activation_bwd_pd, activation_bwd_pd_); + bwd_pd_.reset(new mkldnn::eltwise_backward::primitive_desc( + backward_desc, engine_, *fwd_pd_)); + dev_ctx_.SetBlob(key_activation_bwd_pd, bwd_pd_); } - return activation_bwd_pd_; + return bwd_pd_; } std::shared_ptr AcquireActivation( @@ -395,22 +393,25 @@ class ActivationMKLDNNHandler : public MKLDNNHandler { dev_ctx_.GetBlob(prim_key)); if (eltwise_p == nullptr) { eltwise_p = std::make_shared( - *activation_pd_, *(src_memory_p), *(dst_memory_p)); + *fwd_pd_, *(src_memory_p), *(dst_memory_p)); dev_ctx_.SetBlob(prim_key, eltwise_p); } return eltwise_p; } - // TODO(jczaja): Merge all AcquireDstMemoryFromPrimitive into one - std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive( - activation_pd_->dst_primitive_desc(), ptr, "@dst_mem_p"); + template + std::shared_ptr AcquireDstMemoryFromPrimitive( + framework::Tensor* output, platform::Place place) { + T* ptr = output->mutable_data(place, + fwd_pd_->dst_primitive_desc().get_size()); + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_primitive_desc(), ptr, + "@dst_mem_p"); } std::shared_ptr AcquireDiffSrcMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive( - activation_bwd_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p"); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_primitive_desc(), + ptr, "@diff_src_mem_p"); } std::shared_ptr AcquireActivationBackward( @@ -424,7 +425,7 @@ class ActivationMKLDNNHandler : public MKLDNNHandler { dev_ctx_.GetBlob(prim_key)); if (eltwise_bwd_p == nullptr) { eltwise_bwd_p = std::make_shared( - *activation_bwd_pd_, *(src_memory_p), *(diff_dst_memory_p), + *bwd_pd_, *(src_memory_p), *(diff_dst_memory_p), *(diff_src_memory_p)); dev_ctx_.SetBlob(prim_key, eltwise_bwd_p); } @@ -449,8 +450,8 @@ class ActivationMKLDNNHandler : public MKLDNNHandler { } private: - std::shared_ptr activation_pd_; - std::shared_ptr activation_bwd_pd_; + std::shared_ptr fwd_pd_; + std::shared_ptr bwd_pd_; }; class LRNMKLDNNHandler : public MKLDNNHandler { diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py index c2ebec04a1..9413554db9 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py @@ -20,14 +20,12 @@ import numpy as np import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp -from mkldnn_op_test import format_reorder def conv2d_forward_refer(input, filter, group, conv_param): out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group, conv_param) - size = [in_n, out_c, out_h, out_w] - return format_reorder(out, size) + return out class TestConv2dInt8Op(TestConv2dOp): @@ -79,10 +77,8 @@ class TestConv2dInt8Op(TestConv2dOp): if self.fuse_residual: input_residual = np.random.randint( -5, 5, self.input_residual_size).astype(self.srctype) - output_tmp = np.round(output1 - output2 + format_reorder( - input_residual, self.input_residual_size).astype( - self.srctype) * (self.scale_out / self.scale_in_eltwise - )) + output_tmp = np.round(output1 - output2 + input_residual.astype( + self.srctype) * (self.scale_out / self.scale_in_eltwise)) if self.fuse_activation == "relu": output = np.maximum(output_tmp, 0).astype(self.dsttype) else: @@ -109,10 +105,9 @@ class TestConv2dInt8Op(TestConv2dOp): input_residual = np.random.randint( 0, 10, self.input_residual_size).astype(self.srctype) output_tmp_res = np.round(output1 * (self.scale_out / ( - self.scale_in * self.scale_weights[0])) + format_reorder( - input_residual, self.input_residual_size).astype( - np.int32) * (self.scale_out / self.scale_in_eltwise - )) + self.scale_in * self.scale_weights[ + 0])) + input_residual.astype(np.int32) * ( + self.scale_out / self.scale_in_eltwise)) if self.fuse_activation == "relu": output = np.maximum(output_tmp_res, 0).astype(self.dsttype) else: diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py index 438a2ce5dc..043c544f26 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py @@ -182,7 +182,7 @@ class TestElementwiseMulMKLDNNOp_FallbackNCHW16C(ElementwiseMulOp): y = np.random.rand(1, 16, 2, 2).astype(self.dtype) self.y = y.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2) - self.out = self.x * self.y + self.out = x * y def setUp(self): super(TestElementwiseMulMKLDNNOp_FallbackNCHW16C, self).setUp() @@ -213,7 +213,7 @@ class TestElementwiseMulMKLDNNOp_FallbackNoReorders(ElementwiseMulOp): y = np.random.rand(1, 16, 2, 2).astype(self.dtype) self.y = y.transpose(0, 2, 3, 1).reshape(1, 16, 2, 2) - self.out = self.x * self.y + self.out = x * y def setUp(self): super(TestElementwiseMulMKLDNNOp_FallbackNoReorders, self).setUp() -- GitLab