diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e4a56d4a45a732cfcf43b09228bc0c44df5924c
--- /dev/null
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -0,0 +1,325 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/operators/batch_norm_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+using paddle::platform::MKLDNNMemDesc;
+using mkldnn::memory;
+
+template <typename T>
+using EigenArrayMap =
+    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using ConstEigenArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
+
+namespace {
+template <typename T>
+struct bn_type_traits {
+  using op_type = T;
+  using op_desc = typename op_type::desc;
+  using op_prim = typename op_type::primitive_desc;
+};
+
+template <typename T, typename Container>
+void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end,
+                     Container *c) {
+  auto it = std::begin(*c);
+
+  std::copy(scale_begin, scale_end, std::inserter(*c, it));
+  std::copy(
+      shift_begin, shift_end,
+      std::inserter(*c, std::next(it, std::distance(scale_begin, scale_end))));
+}
+
+template <typename Op, typename... Args>
+void run_batch_norm_op(Args &&... args) {
+  Op batch_norm_op{args...};
+
+  std::vector<mkldnn::primitive> pipeline;
+  pipeline.push_back(batch_norm_op);
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+}
+
+template <typename T>
+inline void *cast_const_to_void(const T *t) {
+  return static_cast<void *>(const_cast<T *>(t));
+}
+}  // namespace
+
+template <typename T>
+class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto data_layout_str = ctx.Attr<std::string>("data_layout");
+    auto data_layout = framework::StringToDataLayout(data_layout_str);
+    PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
+                   "MKLDNN batch normalization handles only NCHW data layout");
+
+    const float epsilon = ctx.Attr<float>("epsilon");
+    const float momentum = ctx.Attr<float>("momentum");
+    const bool is_test = ctx.Attr<bool>("is_test");
+
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *mean = ctx.Input<Tensor>("Mean");
+    const auto *variance = ctx.Input<Tensor>("Variance");
+
+    auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    auto mkldnn_engine = dev_ctx.GetEngine();
+
+    auto *y = ctx.Output<Tensor>("Y");
+    auto *mean_out = ctx.Output<Tensor>("MeanOut");
+    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+    auto *batch_mean = ctx.Output<Tensor>("SavedMean");
+    auto *batch_variance = ctx.Output<Tensor>("SavedVariance");
+
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *shift = ctx.Input<Tensor>("Bias");
+
+    y->mutable_data<T>(ctx.GetPlace());
+    mean_out->mutable_data<T>(ctx.GetPlace());
+    variance_out->mutable_data<T>(ctx.GetPlace());
+
+    if (!is_test) {
+      batch_mean->mutable_data<T>(ctx.GetPlace());
+      batch_variance->mutable_data<T>(ctx.GetPlace());
+    }
+
+    auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
+                                       : mkldnn::prop_kind::forward_training;
+
+    auto dims = paddle::framework::vectorize2int(x->dims());
+
+    auto src_md =
+        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+    auto dst_md =
+        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+
+    auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
+    auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine};
+
+    auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data<T>())};
+    auto dst = mkldnn::memory{dst_pd, y->data<T>()};
+
+    unsigned flags = mkldnn::use_scale_shift;
+    if (is_test) flags |= mkldnn::use_global_stats;
+
+    using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
+    auto batch_norm_fwd_desc =
+        bn_fwd_types::op_desc{propagation, src_md, epsilon, flags};
+    auto batch_norm_fwd_pd =
+        bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
+
+    const unsigned int ic = dims[1];
+
+    // MKLDNN requires a single piece of memory for scale and shift/bias data
+    const size_t scaleshift_size = 2 * ic;
+    std::vector<T> scaleshift_data;
+    scaleshift_data.reserve(scaleshift_size);
+
+    copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
+                    shift->data<T>() + ic, &scaleshift_data);
+
+    auto scaleshift_memory = mkldnn::memory{
+        batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()};
+
+    if (is_test) {
+      auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
+                                        cast_const_to_void(mean->data<T>())};
+
+      auto variance_memory =
+          mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
+                         cast_const_to_void(variance->data<T>())};
+
+      run_batch_norm_op<bn_fwd_types::op_type>(
+          batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory,
+          (const mkldnn::primitive::at &)variance_memory, scaleshift_memory,
+          dst);
+    } else {
+      auto mean_memory =
+          mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
+                         cast_const_to_void(batch_mean->data<T>())};
+
+      auto variance_memory =
+          mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
+                         cast_const_to_void(batch_variance->data<T>())};
+
+      run_batch_norm_op<bn_fwd_types::op_type>(batch_norm_fwd_pd, src,
+                                               scaleshift_memory, dst,
+                                               mean_memory, variance_memory);
+    }
+
+    if (!is_test) {
+      const unsigned int in = dims[0];
+      const unsigned int sample_size = x->numel() / in / ic;
+
+      // saved_xx is use just in this batch of data
+      EigenVectorArrayMap<T> saved_mean_e(
+          batch_mean->mutable_data<T>(ctx.GetPlace()), ic);
+      EigenVectorArrayMap<T> saved_variance_e(
+          batch_variance->mutable_data<T>(ctx.GetPlace()), ic);
+      saved_mean_e.setZero();
+      saved_variance_e.setZero();
+
+      const unsigned int x_arr_size = in * ic;
+      ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, x_arr_size);
+      for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
+        saved_mean_e(nc % ic) += x_arr.col(nc).sum();
+      }
+      saved_mean_e /= in * sample_size;
+      for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
+        saved_variance_e(nc % ic) +=
+            (x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm();
+      }
+      saved_variance_e /= in * sample_size;
+
+      ConstEigenVectorArrayMap<T> mean_arr{mean->data<T>(), ic};
+      ConstEigenVectorArrayMap<T> variance_arr{variance->data<T>(), ic};
+
+      EigenVectorArrayMap<T> running_mean_arr(
+          mean_out->mutable_data<T>(ctx.GetPlace()), ic);
+      EigenVectorArrayMap<T> running_var_arr(
+          variance_out->mutable_data<T>(ctx.GetPlace()), ic);
+
+      auto one_minus_momentum = 1. - momentum;
+      running_mean_arr =
+          mean_arr * momentum + saved_mean_e * one_minus_momentum;
+      running_var_arr =
+          variance_arr * momentum + saved_variance_e * one_minus_momentum;
+    }
+  }
+};
+
+template <typename T>
+class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
+    auto data_layout_str = ctx.Attr<std::string>("data_layout");
+    auto data_layout = framework::StringToDataLayout(data_layout_str);
+    PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
+                   "MKLDNN batch normalization handles only NCHW data layout");
+
+    auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    auto mkldnn_engine = dev_ctx.GetEngine();
+
+    const float epsilon = ctx.Attr<float>("epsilon");
+
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *shift = ctx.Input<Tensor>("Bias");
+    const auto *batch_mean = ctx.Input<Tensor>("SavedMean");
+    const auto *batch_variance = ctx.Input<Tensor>("SavedVariance");
+
+    const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    diff_x->mutable_data<T>(ctx.GetPlace());
+    diff_scale->mutable_data<T>(ctx.GetPlace());
+    diff_shift->mutable_data<T>(ctx.GetPlace());
+
+    auto dims = paddle::framework::vectorize2int(x->dims());
+    unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats;
+
+    auto src_md =
+        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+    auto dst_md =
+        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+    auto diff_src_md =
+        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+    auto diff_dst_md =
+        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+
+    using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
+    using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
+
+    auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
+        mkldnn::prop_kind::forward_training, src_md, epsilon, flags};
+    auto batch_norm_fwd_pd =
+        bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
+
+    auto batch_norm_bwd_desc = bn_bwd_types::op_desc{
+        mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags};
+    auto batch_norm_bwd_pd = bn_bwd_types::op_prim{
+        batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd};
+
+    auto src = mkldnn::memory{{src_md, mkldnn_engine},
+                              cast_const_to_void(x->data<T>())};
+
+    auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(),
+                               cast_const_to_void(batch_mean->data<T>())};
+
+    auto variance =
+        mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(),
+                       cast_const_to_void(batch_variance->data<T>())};
+
+    auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine},
+                                   cast_const_to_void(diff_y->data<T>())};
+
+    const unsigned int ic = dims[1];
+
+    const size_t scaleshift_size = 2 * ic;
+
+    std::vector<T> scaleshift_data;
+    scaleshift_data.reserve(scaleshift_size);
+    copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
+                    shift->data<T>() + ic, &scaleshift_data);
+
+    auto scaleshift_memory = mkldnn::memory{
+        batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()};
+
+    std::vector<T> diff_scaleshift_data;
+    diff_scaleshift_data.reserve(scaleshift_size);
+    copy_to_weights(diff_scale->data<T>(), diff_scale->data<T>() + ic,
+                    diff_shift->data<T>(), diff_shift->data<T>() + ic,
+                    &diff_scaleshift_data);
+
+    auto diff_scaleshift_memory =
+        mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(),
+                       diff_scaleshift_data.data()};
+
+    auto diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine},
+                                   static_cast<void *>(diff_x->data<T>())};
+
+    run_batch_norm_op<bn_bwd_types::op_type>(
+        batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory,
+        diff_src, diff_scaleshift_memory);
+
+    auto it = std::begin(diff_scaleshift_data);
+    std::copy(it, std::next(it, ic), diff_scale->data<T>());
+    std::copy(std::next(it, ic), std::end(diff_scaleshift_data),
+              diff_shift->data<T>());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace,
+                   ops::BatchNormMKLDNNOpKernel<float>);
+REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace,
+                   ops::BatchNormMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index f8b2505ccfb143f9f74cf0b16d92e8e1ca059709..b4bd40d0311bf10ec1fddabab2ee131fe02baf52 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -15,6 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/batch_norm_op.h"
 #include <string>
 #include "paddle/fluid/framework/data_layout.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 namespace paddle {
 namespace operators {
@@ -106,7 +109,18 @@ class BatchNormOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType(
                                          ctx.Input<Tensor>("Variance")->type()),
                       "Variance input should be of float type");
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+    }
+#endif
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                   library_);
   }
 };
 
@@ -151,6 +165,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
               "Variance of the current mini batch, "
              "will apply to output when training")
         .AsIntermediate();
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 Batch Normalization.
 
@@ -349,8 +366,19 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     if (t == nullptr) {
       PADDLE_THROW("can't find Y@GRAD");
     }
-    return framework::OpKernelType(framework::ToDataType(t->type()),
-                                   ctx.GetPlace());
+
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+    }
+#endif
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+        layout, library_);
   }
 };
 
@@ -474,6 +502,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
 
     op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
     op->SetInput("Scale", Input("Scale"));
+    op->SetInput("Bias", Input("Bias"));
     op->SetInput("SavedMean", Output("SavedMean"));
     op->SetInput("SavedVariance", Output("SavedVariance"));
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 7f16bf2a0c430213b2f52dafe8fa948b9e350f96..93e8d0bf296d4846ad5f5924e70a6fbba823d82e 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1496,6 +1496,7 @@ def batch_norm(input,
                bias_attr=None,
                data_layout='NCHW',
                in_place=False,
+               use_mkldnn=False,
                name=None,
                moving_mean_name=None,
                moving_variance_name=None,
@@ -1574,9 +1575,12 @@ def batch_norm(input,
             "SavedMean": saved_mean,
             "SavedVariance": saved_variance
         },
-        attrs={"momentum": momentum,
-               "epsilon": epsilon,
-               "is_test": is_test})
+        attrs={
+            "momentum": momentum,
+            "epsilon": epsilon,
+            "is_test": is_test,
+            "use_mkldnn": use_mkldnn
+        })
 
     return helper.append_activation(batch_norm_out)
 
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6097d4b846e8da1c4ee3cc49b31f9873660056d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest
+from paddle.fluid.framework import grad_var_name
+from test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad
+
+
+class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_formats = ["NCHW"]
+
+    def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
+                             epsilon, momentum, shape, data_layout):
+        # run forward
+        y, saved_mean, saved_variance = _reference_training(
+            x, scale, bias, epsilon, data_layout)
+        mean_out = saved_mean * (1. - momentum) + momentum * mean
+        variance_out = saved_variance * (1. - momentum) + momentum * variance
+        # run backward
+        x_grad, scale_grad, bias_grad = _reference_grad(
+            x, y_grad, scale, saved_mean, saved_variance, epsilon, data_layout)
+
+        return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
+
+
+class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def test_check_output(self):
+        place = core.CPUPlace()
+        data_format = "NCHW"
+
+        self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index a0e78a460703778b46191b50c75e92bfbcaec411..4216d83653b27ec7f18034e576fbedbecc3f1cfe 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -158,6 +158,8 @@ def set_output_grad(scope, outputs, place, feed_dict=None):
 class TestBatchNormOpInference(unittest.TestCase):
     def setUp(self):
         self.dtype = np.float32
+        self.use_mkldnn = False
+        self.init_kernel_type()
 
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
@@ -230,6 +232,7 @@ class TestBatchNormOpInference(unittest.TestCase):
             # attrs
             is_test=True,
             data_layout=data_layout,
+            use_mkldnn=self.use_mkldnn,
             epsilon=epsilon)
 
         batch_norm_op.run(scope, place)
@@ -254,10 +257,15 @@
                                       [2, 3, 4, 5])
                 self.check_with_place(place, data_format, self.dtype, [2, 3])
 
+    def init_kernel_type(self):
+        pass
+
 
 class TestFP16BatchNormOpInference(TestBatchNormOpInference):
     def setUp(self):
         self.dtype = np.float16
+        self.use_mkldnn = False
+        self.init_kernel_type()
 
     def test_check_output(self):
         places = []
@@ -274,9 +282,28 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
 
 
 class TestBatchNormOpTraining(unittest.TestCase):
+    def setUp(self):
+        self.use_mkldnn = False
+        self.data_formats = ["NCHW", "NHWC"]
+        self.init_kernel_type()
+
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         np.allclose(np.array(tensor), np_array, atol=atol)
 
+    def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
+                             epsilon, momentum, shape, data_layout):
+        # run forward
+        y, saved_mean, var_ref = _reference_training(x, scale, bias, epsilon,
+                                                     data_layout)
+        mean_out = saved_mean * (1. - momentum) + momentum * mean
+        variance_out = var_ref * (1. - momentum) + momentum * variance
+        saved_variance = 1. / np.sqrt(var_ref + epsilon)
+        # run backward
+        x_grad, scale_grad, bias_grad = _reference_grad(
+            x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout)
+
+        return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
+
     def test_forward_backward(self):
         def test_with_place(place, data_layout, shape):
             # attr
@@ -295,16 +322,11 @@ class TestBatchNormOpTraining(unittest.TestCase):
             mean = np.zeros(scale_shape).astype(np.float32)
             variance = np.ones(scale_shape).astype(np.float32)
 
-            # run forward
-            y, saved_mean, var_ref = _reference_training(x, scale, bias,
-                                                         epsilon, data_layout)
-            mean_out = saved_mean * (1. - momentum) + momentum * mean
-            variance_out = var_ref * (1. - momentum) + momentum * variance
-            saved_variance = 1. / np.sqrt(var_ref + epsilon)
-            # run backward
             y_grad = np.random.random_sample(shape).astype(np.float32)
-            x_grad, scale_grad, bias_grad = _reference_grad(
-                x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout)
+
+            y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward(
+                x, y_grad, scale, bias, mean, variance, epsilon, momentum,
+                shape, data_layout)
 
             var_dict = locals()
             var_dict['y@GRAD'] = y_grad
@@ -344,7 +366,8 @@ class TestBatchNormOpTraining(unittest.TestCase):
                     "momentum": momentum,
                     "epsilon": epsilon,
                     "is_test": False,
-                    "data_layout": data_layout
+                    "data_layout": data_layout,
+                    "use_mkldnn": self.use_mkldnn
                 })
             block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
 
@@ -387,13 +410,17 @@ class TestBatchNormOpTraining(unittest.TestCase):
             print "op test forward passed: ", str(place), data_layout
 
         places = [core.CPUPlace()]
+
        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
             places.append(core.CUDAPlace(0))
 
         for place in places:
-            for data_format in ["NCHW", "NHWC"]:
+            for data_format in self.data_formats:
                 test_with_place(place, data_format, [2, 3, 4, 5])
 
+    def init_kernel_type(self):
+        pass
+
 
 if __name__ == '__main__':
     unittest.main()
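
For reference, a minimal Python usage sketch of the new attribute (not part of the patch itself): it assumes a CPU build of PaddlePaddle compiled with MKLDNN support (PADDLE_WITH_MKLDNN), and the layer names and shapes below are illustrative only.

    import paddle.fluid as fluid

    # Hypothetical toy network: setting use_mkldnn=True asks the batch_norm op
    # to dispatch to the MKLDNN kernel added by this patch (CPU, NCHW layout).
    image = fluid.layers.data(name='image', shape=[3, 32, 32], dtype='float32')
    conv = fluid.layers.conv2d(input=image, num_filters=16, filter_size=3)
    bn = fluid.layers.batch_norm(input=conv, is_test=False, use_mkldnn=True)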