From 7e0d21de6d3352c1238d35d2586f40e48b6da27f Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 30 Jan 2018 11:11:04 +0800 Subject: [PATCH] fix scale and bias dim --- paddle/operators/layer_norm_op.cc | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc index 07ca8ac222..125ac9f53f 100644 --- a/paddle/operators/layer_norm_op.cc +++ b/paddle/operators/layer_norm_op.cc @@ -123,8 +123,8 @@ class LayerNormKernel int right = static_cast<int>(matrix_dim[1]); auto input_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right); - auto scale_map = ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), left, 1); - auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), left, 1); + auto scale_map = ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right); + auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right); auto mean_map = EigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1); auto var_map = EigenMatrixMapRowMajor<T>(var->data<T>(), left, 1); auto output_map = EigenMatrixMapRowMajor<T>(output->data<T>(), left, right); @@ -143,11 +143,11 @@ class LayerNormKernel // TODO(zcd): Some thinking about output_map, is it appropriate that // `output_map` and `input_map` point to the same memory. 
- auto inv_std_scale = - var_map.unaryExpr(inv_std_func).cwiseProduct(scale_map); - output_map = - inv_std_scale.replicate(1, right).cwiseProduct(input_map) + - (bias_map - inv_std_scale.cwiseProduct(mean_map)).replicate(1, right); + auto inv_std_scale = var_map.unaryExpr(inv_std_func); + output_map = (input_map - mean_map.replicate(1, right)) + .cwiseProduct(inv_std_scale.replicate(1, right)) + .cwiseProduct(scale_map.replicate(left, 1)) - + bias_map.replicate(left, 1); } }; @@ -221,7 +221,7 @@ class LayerNormGradKernel auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale")); auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias")); - auto scale_map = ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), left, 1); + auto scale_map = ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right); auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right); auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right); auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1); @@ -229,12 +229,13 @@ class LayerNormGradKernel if (d_bias) { d_bias->mutable_data<T>(ctx.GetPlace()); - auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), left, 1); + auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), 1, right); d_bias_map = d_y_map.colwise().mean(); } if (d_scale) { d_scale->mutable_data<T>(ctx.GetPlace()); - auto d_scale_map = EigenMatrixMapRowMajor<T>(d_scale->data<T>(), left, 1); + auto d_scale_map = + EigenMatrixMapRowMajor<T>(d_scale->data<T>(), 1, right); auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; // There are two equation to compute d_scale. 
One uses "Y" and the other // does not use "Y" @@ -254,15 +255,15 @@ class LayerNormGradKernel auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; // dy_dx auto dx_end = var_map.unaryExpr(inv_std_func) - .cwiseProduct(scale_map) .replicate(1, right) - .cwiseProduct(d_y_map); + .cwiseProduct(d_y_map) + .cwiseProduct(scale_map.replicate(left, 1)); // dy_dmean_dx auto dx_mean = (T(-1.0) / right) * var_map.unaryExpr(inv_std_func) - .cwiseProduct(scale_map) .replicate(1, right) .cwiseProduct(d_y_map) + .cwiseProduct(scale_map.replicate(left, 1)) .rowwise() .sum() .replicate(1, right); @@ -274,8 +275,8 @@ class LayerNormGradKernel auto dvar_end = var_map.unaryExpr(inv_std_func) .unaryExpr(triple_product_func) .cwiseProduct(dvar_end_part) - .cwiseProduct(scale_map) - .replicate(1, right); + .replicate(1, right) + .cwiseProduct(scale_map.replicate(left, 1)); auto dx_var = (T(-1.0) / right) * (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); -- GitLab