From 5092f5291c17e46b4c5e176c00b46a69f5e0d466 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Sat, 3 Feb 2018 15:05:55 +0800 Subject: [PATCH] Separate GPU and CPU implementation --- paddle/operators/layer_norm_op.cc | 186 +++++++++++++++++- paddle/operators/layer_norm_op.h | 29 ++- .../v2/fluid/tests/test_layer_norm_op.py | 11 +- 3 files changed, 202 insertions(+), 24 deletions(-) diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc index edc26dfb96f..910b8ec0a4d 100644 --- a/paddle/operators/layer_norm_op.cc +++ b/paddle/operators/layer_norm_op.cc @@ -21,6 +21,13 @@ using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; +template +using EigenMatrixMapRowMajor = Eigen::Map< + Eigen::Matrix>; +template +using ConstEigenMatrixMapRowMajor = Eigen::Map< + const Eigen::Matrix>; + class LayerNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -101,7 +108,6 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Layer Normalization. - Layer Norm has been implemented as discussed in the paper: https://arxiv.org/abs/1607.06450 ... @@ -109,6 +115,75 @@ https://arxiv.org/abs/1607.06450 } }; +template +class LayerNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + auto *output = ctx.Output("Y"); + auto *mean = ctx.Output("Mean"); + auto *var = ctx.Output("Variance"); + output->mutable_data(ctx.GetPlace()); + mean->mutable_data(ctx.GetPlace()); + var->mutable_data(ctx.GetPlace()); + + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + + auto input_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); + + auto mean_map = EigenMatrixMapRowMajor(mean->data(), left, 1); + auto var_map = EigenMatrixMapRowMajor(var->data(), left, 1); + auto output_map = EigenMatrixMapRowMajor(output->data(), left, right); + + auto squre = [](T ele) { return ele * ele; }; + auto add_epslion = [epsilon](T ele) { return ele + epsilon; }; + + mean_map = input_map.rowwise().mean(); + var_map = (input_map - mean_map.replicate(1, right)) + .unaryExpr(squre) + .rowwise() + .mean() + .unaryExpr(add_epslion); + + auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; + // TODO(zcd): Some thinking about output_map, is it appropriate that + // `output_map` and `input_map` point to the same memory. + auto inv_std = var_map.unaryExpr(inv_std_func); + if (scale && bias) { + auto scale_map = + ConstEigenMatrixMapRowMajor(scale->data(), 1, right); + auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std.replicate(1, right)) + .cwiseProduct(scale_map.replicate(left, 1)) + + bias_map.replicate(left, 1); + } else if (scale) { + auto scale_map = + ConstEigenMatrixMapRowMajor(scale->data(), 1, right); + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std.replicate(1, right)) + .cwiseProduct(scale_map.replicate(left, 1)); + } else if (bias) { + auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std.replicate(1, right)) + + bias_map.replicate(left, 1); + } else { + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std.replicate(1, right)); + } + } +}; + class LayerNormGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -161,6 +236,115 @@ class LayerNormGradOp : public framework::OperatorWithKernel { } }; +template +class LayerNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + const auto *mean = ctx.Input("Mean"); + const auto *var = ctx.Input("Variance"); + const auto *scale = ctx.Input("Scale"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + + const auto &x_dims = x->dims(); + + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + auto x_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); + auto d_y_map = ConstEigenMatrixMapRowMajor(d_y->data(), left, right); + auto mean_map = ConstEigenMatrixMapRowMajor(mean->data(), left, 1); + auto var_map = ConstEigenMatrixMapRowMajor(var->data(), left, 1); + + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + auto d_bias_map = EigenMatrixMapRowMajor(d_bias->data(), 1, right); + d_bias_map = d_y_map.colwise().sum(); + } + if (d_scale) { + d_scale->mutable_data(ctx.GetPlace()); + auto d_scale_map = + EigenMatrixMapRowMajor(d_scale->data(), 1, right); + auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; + // There are two equation to compute d_scale. One uses "Y" and the other + // does not use "Y" + d_scale_map = + ((x_map - mean_map.replicate(1, right)) + .cwiseProduct( + var_map.unaryExpr(inv_std_func).replicate(1, right)) + .cwiseProduct(d_y_map)) + .colwise() + .sum(); + } + + if (d_x) { + d_x->mutable_data(ctx.GetPlace()); + auto d_x_map = EigenMatrixMapRowMajor(d_x->data(), left, right); + auto triple_product_func = [](T ele) { return ele * ele * ele; }; + auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; + + auto inv_std_map = var_map.unaryExpr(inv_std_func).eval(); + // TODO(zcd): these code can be refined + if (d_scale) { + auto scale_map = + ConstEigenMatrixMapRowMajor(scale->data(), 1, right); + // dy_dx + auto dx_end = + inv_std_map.replicate(1, right).cwiseProduct(d_y_map).cwiseProduct( + scale_map.replicate(left, 1)); + + // dy_dmean_dx + auto dx_mean = + (T(-1.0) / right) * dx_end.rowwise().sum().replicate(1, right); + + // dy_var_dx + auto dvar_end_part = (x_map - mean_map.replicate(1, right)) + .cwiseProduct(scale_map.replicate(left, 1)) + .cwiseProduct(d_y_map) + .rowwise() + .sum(); + auto dvar_end = inv_std_map.unaryExpr(triple_product_func) + .cwiseProduct(dvar_end_part) + .replicate(1, right); + auto dx_var = + (T(-1.0) / right) * + (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); + + d_x_map = dx_end + dx_mean + dx_var; + } else { + // dy_dx + auto dx_end = inv_std_map.replicate(1, right).cwiseProduct(d_y_map); + + // dy_dmean_dx + auto dx_mean = + (T(-1.0) / right) * dx_end.rowwise().sum().replicate(1, right); + + // dy_var_dx + auto dvar_end_part = (x_map - mean_map.replicate(1, right)) + .cwiseProduct(d_y_map) + .rowwise() + .sum(); + auto dvar_end = inv_std_map.unaryExpr(triple_product_func) + .cwiseProduct(dvar_end_part) + .replicate(1, right); + auto dx_var = + (T(-1.0) / right) * + (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); + + d_x_map = dx_end + dx_mean + dx_var; + } + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/operators/layer_norm_op.h b/paddle/operators/layer_norm_op.h index 309f1b87a26..2de58186fbd 100644 --- a/paddle/operators/layer_norm_op.h +++ b/paddle/operators/layer_norm_op.h @@ -78,7 +78,7 @@ class LayerNormKernel : public framework::OpKernel { auto *var = ctx.Output("Variance"); const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - const auto &x_dims = x.dims(); + const auto x_dims = x.dims(); y->mutable_data(ctx.GetPlace()); mean->mutable_data(ctx.GetPlace()); @@ -87,11 +87,12 @@ class LayerNormKernel : public framework::OpKernel { auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); int left = static_cast(matrix_dim[0]); int right = static_cast(matrix_dim[1]); - framework::DDim matrix_shape({left, right}); x.Resize(matrix_shape); - y->Resize(matrix_shape); + Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); auto &dev_ctx = ctx.template device_context(); math::RowwiseMean row_mean; @@ -101,30 +102,24 @@ class LayerNormKernel : public framework::OpKernel { // functor-> get variance ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), y); - row_mean(dev_ctx, *y, var); + ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), &out); + row_mean(dev_ctx, out, var); // functor-> get norm_out ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), y); + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &out); ElementwiseComputeEx, DeviceContext, T>( - ctx, y, var, /*axis*/ 0, DivAndSqrtFunctor(static_cast(epsilon)), - y); + ctx, &out, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &out); - framework::DDim scale_shape({right}); if (scale) { - Tensor scale_matrix = *scale; - scale_matrix.Resize(scale_shape); ElementwiseComputeEx, DeviceContext, T>( - ctx, y, &scale_matrix, /*axis*/ 1, MulFunctor(), y); + ctx, &out, scale, /*axis*/ 1, MulFunctor(), &out); } if (bias) { - Tensor bias_matrix = *bias; - bias_matrix.Resize(scale_shape); ElementwiseComputeEx, DeviceContext, T>( - ctx, y, &bias_matrix, /*axis*/ 1, AddFunctor(), y); + ctx, &out, bias, /*axis*/ 1, AddFunctor(), &out); } - y->Resize(x_dims); } }; @@ -184,6 +179,7 @@ class LayerNormGradKernel : public framework::OpKernel { if (d_x) { framework::DDim vec_shape({left}); d_x->mutable_data(ctx.GetPlace()); + auto dx_dim = d_x->dims(); Tensor temp_vec; temp_vec.mutable_data(vec_shape, ctx.GetPlace()); @@ -227,6 +223,7 @@ class LayerNormGradKernel : public framework::OpKernel { ElementwiseComputeEx, DeviceContext, T>( ctx, d_x, &var, /*axis*/ 0, DivAndSqrtFunctor(static_cast(epsilon)), d_x); + d_x->Resize(dx_dim); } } }; diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py index 68cf8673cd4..f456b1194c5 100644 --- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py @@ -62,9 +62,9 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1): grad_x = dx_end + d_mean + d_std - grad_y.shape = x_shape - x.shape = x_shape + grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape scale.shape = scale_shape + var.shape, mean.shape = [N, ], [N, ] return grad_x, d_scale, d_bias @@ -112,10 +112,7 @@ def set_output_grad(scope, outputs, place, feed_dict=None): class TestLayerNormdOp(OpTest): def __assert_close(self, tensor, np_array, msg, atol=1e-4): - self.assertTrue( - np.allclose( - np.array(tensor).reshape(np_array.shape), np_array, atol=atol), - msg) + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) def __assert_grad_close(self, tensor, @@ -123,7 +120,7 @@ class TestLayerNormdOp(OpTest): name, place, max_relative_error=0.02): - a = np.array(tensor).reshape(np_array.shape) + a = np.array(tensor) b = np_array abs_a = np.abs(a) abs_a[abs_a < 1e-5] = 1 -- GitLab