From 87b5559cd15a28d515b16f3ad04ca9919c7edd32 Mon Sep 17 00:00:00 2001
From: chengduoZH
Date: Mon, 29 Jan 2018 20:41:08 +0800
Subject: [PATCH] fix scale and bias dim

---
 paddle/operators/layer_norm_op.cc             | 84 +++++++++----------
 .../v2/fluid/tests/test_layer_norm_op.py      | 16 ++--
 2 files changed, 52 insertions(+), 48 deletions(-)

diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc
index 9e618d10d2..07ca8ac222 100644
--- a/paddle/operators/layer_norm_op.cc
+++ b/paddle/operators/layer_norm_op.cc
@@ -38,10 +38,6 @@ class LayerNormOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("Bias"), "");
     PADDLE_ENFORCE(ctx->HasOutput("Y"), "");
 
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], 1);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], 1);
     auto x_dim = ctx->GetInputDim("X");
     auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
     PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(),
@@ -50,6 +46,11 @@ class LayerNormOp : public framework::OperatorWithKernel {
     auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis);
     int left = static_cast<int>(matrix_dim[0]);
 
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], left);
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], left);
+
     ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
     ctx->SetOutputDim("Mean", {left});
     ctx->SetOutputDim("Variance", {left});
@@ -64,10 +65,10 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor");
     AddInput("Scale",
-             "Scale is a 1-dimensional tensor of size 1 "
+             "Scale is a 1-dimensional tensor of size H "
              "that is applied to the output");
     AddInput("Bias",
-             "Bias is a 1-dimensional tensor of size 1 "
+             "Bias is a 1-dimensional tensor of size H "
             "that is applied to the output");
     AddOutput("Y", "result after normalization");
     AddOutput("Mean", "Mean of the current mini batch.");
@@ -110,9 +111,6 @@ class LayerNormKernel
     const auto &x_dims = x->dims();
     const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
 
-    auto scale_data = scale->data<T>()[0];
-    auto bias_data = bias->data<T>()[0];
-
     auto *output = ctx.Output<Tensor>("Y");
     auto *mean = ctx.Output<Tensor>("Mean");
     auto *var = ctx.Output<Tensor>("Variance");
@@ -123,7 +121,10 @@ class LayerNormKernel
     auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
     int left = static_cast<int>(matrix_dim[0]);
     int right = static_cast<int>(matrix_dim[1]);
+
     auto input_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
+    auto scale_map = ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), left, 1);
+    auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), left, 1);
     auto mean_map = EigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
     auto var_map = EigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
     auto output_map = EigenMatrixMapRowMajor<T>(output->data<T>(), left, right);
@@ -138,18 +139,15 @@ class LayerNormKernel
                   .mean()
                   .unaryExpr(add_epslion);
 
-    auto scale_inv_std = [scale_data](T ele) {
-      return std::sqrt(1 / ele) * scale_data;
-    };
-    auto sub_bias = [bias_data](T ele) { return bias_data - ele; };
+    auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
+
     // TODO(zcd): Some thinking about output_map, is it appropriate that
     // `output_map` and `input_map` point to the same memory.
-    output_map = (var_map.unaryExpr(scale_inv_std).replicate(1, right))
-                     .cwiseProduct(input_map) +
-                 var_map.unaryExpr(scale_inv_std)
-                     .cwiseProduct(mean_map)
-                     .unaryExpr(sub_bias)
-                     .replicate(1, right);
+    auto inv_std_scale =
+        var_map.unaryExpr(inv_std_func).cwiseProduct(scale_map);
+    output_map =
+        inv_std_scale.replicate(1, right).cwiseProduct(input_map) +
+        (bias_map - inv_std_scale.cwiseProduct(mean_map)).replicate(1, right);
   }
 };
 
@@ -165,17 +163,17 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("Variance"), "");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "");
 
-    const auto x_dims = ctx->GetInputDim("X");
-
     // check output
     if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
     }
     if (ctx->HasOutput(framework::GradVarName("Scale"))) {
-      ctx->SetOutputDim(framework::GradVarName("Scale"), {1});
+      ctx->SetOutputDim(framework::GradVarName("Scale"),
+                        ctx->GetInputDim("Scale"));
     }
     if (ctx->HasOutput(framework::GradVarName("Bias"))) {
-      ctx->SetOutputDim(framework::GradVarName("Bias"), {1});
+      ctx->SetOutputDim(framework::GradVarName("Bias"),
+                        ctx->GetInputDim("Bias"));
     }
   }
 
@@ -210,20 +208,20 @@ class LayerNormGradKernel
     const auto *var = ctx.Input<Tensor>("Variance");
     const auto *scale = ctx.Input<Tensor>("Scale");
     const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto scale_data = scale->data<T>()[0];
 
     const auto &x_dims = x->dims();
     const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
 
     auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
-    int left = static_cast<int>(matrix_dim[0]),
-        right = static_cast<int>(matrix_dim[1]);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
 
     // init output
     auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
     auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
 
+    auto scale_map = ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), left, 1);
     auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
     auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
     auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
@@ -231,36 +229,38 @@ class LayerNormGradKernel
     if (d_bias) {
       d_bias->mutable_data<T>(ctx.GetPlace());
-      d_bias->data<T>()[0] = d_y_map.sum();
+      auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), left, 1);
+      d_bias_map = d_y_map.colwise().mean();
     }
 
     if (d_scale) {
       d_scale->mutable_data<T>(ctx.GetPlace());
-      auto inv_std = [](T ele) { return std::sqrt(1 / ele); };
+      auto d_scale_map = EigenMatrixMapRowMajor<T>(d_scale->data<T>(), left, 1);
+      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
       // There are two equation to compute d_scale. One uses "Y" and the other
       // does not use "Y"
-      d_scale->data<T>()[0] =
+      d_scale_map =
           ((x_map - mean_map.replicate(1, right))
-               .cwiseProduct(var_map.unaryExpr(inv_std).replicate(1, right))
+               .cwiseProduct(
+                   var_map.unaryExpr(inv_std_func).replicate(1, right))
               .cwiseProduct(d_y_map))
-              .sum();
+              .colwise()
+              .mean();
     }
 
     if (d_x) {
       d_x->mutable_data<T>(ctx.GetPlace());
       auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
       auto triple_product_func = [](T ele) { return ele * ele * ele; };
-      auto scale_func = [scale_data](T ele) { return ele * scale_data; };
       auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
-      auto inv_std_scale_func = [scale_data](T ele) {
-        return std::sqrt(1 / ele) * scale_data;
-      };
       // dy_dx
-      auto dx_end = var_map.unaryExpr(inv_std_scale_func)
+      auto dx_end = var_map.unaryExpr(inv_std_func)
+                        .cwiseProduct(scale_map)
                         .replicate(1, right)
                         .cwiseProduct(d_y_map);
       // dy_dmean_dx
       auto dx_mean = (T(-1.0) / right) *
-                     var_map.unaryExpr(inv_std_scale_func)
+                     var_map.unaryExpr(inv_std_func)
+                         .cwiseProduct(scale_map)
                          .replicate(1, right)
                          .cwiseProduct(d_y_map)
                          .rowwise()
@@ -274,11 +274,11 @@ class LayerNormGradKernel
       auto dvar_end = var_map.unaryExpr(inv_std_func)
                           .unaryExpr(triple_product_func)
                           .cwiseProduct(dvar_end_part)
+                          .cwiseProduct(scale_map)
                           .replicate(1, right);
-      auto dx_var = (T(-1.0) / right) *
-                    (x_map - mean_map.replicate(1, right))
-                        .cwiseProduct(dvar_end)
-                        .unaryExpr(scale_func);
+      auto dx_var =
+          (T(-1.0) / right) *
+          (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
 
       d_x_map = dx_end + dx_mean + dx_var;
     }
diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
index 8ce327436f..9264cf4b79 100644
--- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
@@ -39,8 +39,9 @@ def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
     x.shape = [N, D]
     mean = np.mean(x, axis=1)
     var = np.var(x, axis=1) + epsilon
-    output = scale * np.divide((x - mean.reshape([N, 1])),
-                               (np.sqrt(var)).reshape([N, 1])) + beta
+    output = scale.reshape([1, D]) * np.divide(
+        (x - mean.reshape([N, 1])),
+        (np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D])
     output.shape = old_shape
     x.shape = old_shape
     return output, mean, var
@@ -55,8 +56,10 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
     mean.shape = [N, 1]
     var.shape = [N, 1]
 
-    d_scale = np.sum(grad_y).reshape([1, ])
-    d_bias = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y).reshape([1, ])
+    d_scale = np.sum(grad_y, axis=1).reshape([1, D])
+    d_bias = scale.reshape([1, D]) * np.sum((
+        (x - mean) * np.sqrt(1 / var)) * grad_y,
+                                            axis=1).reshape([1, D])
 
     dx_end = np.sqrt(1.0 / var) * grad_y
 
@@ -69,7 +72,7 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
     d_std = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape([N, 1]) * (
         1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
 
-    grad_x = scale * (dx_end + d_mean + d_std)
+    grad_x = scale.reshape([1, D]) * (dx_end + d_mean + d_std)
 
     grad_y.shape = x_shape
     x.shape = x_shape
@@ -146,7 +149,8 @@ class TestLayerNormdOp(OpTest):
         # attr
         epsilon = 0.00001
         x_shape = shape
-        scale_shape = [1]
+        D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+        scale_shape = [D]
         np.random.random(123)
         x_val = np.random.random_sample(x_shape).astype(np.float32)
         scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-- 
GitLab
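
Note on the new shapes, for readers following the change: the updated test reference treats Scale and Bias as 1-D tensors of length D, where D is the product of x.shape[begin_norm_axis:], and broadcasts them across the N rows of the input flattened to [N, D]. The standalone NumPy sketch below mirrors _reference_layer_norm_naive from test_layer_norm_op.py under that assumption; the function name layer_norm_naive and the example shapes are illustrative only and not part of the patch.

# Illustrative NumPy sketch (not part of the patch): forward layer norm with a
# per-feature scale/bias of length D, mirroring _reference_layer_norm_naive.
import numpy as np


def layer_norm_naive(x, scale, bias, epsilon=1e-5, begin_norm_axis=1):
    # Flatten x to [N, D]: N is the product of the leading dims,
    # D is the product of the normalized dims.
    shape = x.shape
    N = int(np.prod(shape[:begin_norm_axis]))
    D = int(np.prod(shape[begin_norm_axis:]))
    x2d = x.reshape([N, D])
    mean = np.mean(x2d, axis=1).reshape([N, 1])
    var = np.var(x2d, axis=1).reshape([N, 1]) + epsilon
    # scale and bias are 1-D of length D and broadcast over the N rows.
    y2d = (scale.reshape([1, D]) * (x2d - mean) / np.sqrt(var)
           + bias.reshape([1, D]))
    return y2d.reshape(shape), mean.reshape([N]), var.reshape([N])


# Example: normalize the last two axes of a [2, 3, 4] input, so D = 12.
x = np.random.random_sample([2, 3, 4]).astype(np.float32)
scale = np.random.random_sample([12]).astype(np.float32)
bias = np.random.random_sample([12]).astype(np.float32)
y, mean, var = layer_norm_naive(x, scale, bias)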