diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h index b275fd75b3512343825170fc38565dd27f7f1c75..79b8c6f59c7ad3d77aa969f6b4f36f8050cfe823 100644 --- a/paddle/operators/compare_op.h +++ b/paddle/operators/compare_op.h @@ -62,7 +62,7 @@ class CompareOpKernel z->mutable_data(context.GetPlace()); int axis = context.Attr("axis"); ElementwiseComputeEx(context, x, y, axis, - z); + Functor(), z); } }; diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index c32288d6984f126f2374a13973541f4f663b25a4..c24f97a85092ff14e8211ca8bc4bb9b155510a2c 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -35,7 +35,8 @@ class ElementwiseAddKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + AddFunctor(), z); } }; diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h index 07ebade31ff5b3d5c89156e28ff5fa0670a9a842..dc863cc598ec6015067f166b1544a5d20223662a 100644 --- a/paddle/operators/elementwise_div_op.h +++ b/paddle/operators/elementwise_div_op.h @@ -35,7 +35,8 @@ class ElementwiseDivKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + DivFunctor(), z); } }; diff --git a/paddle/operators/elementwise_max_op.h b/paddle/operators/elementwise_max_op.h index 717e45ab31db9b9a6629fb33e17654dbf986d8c5..67efe4e1511e054d54f91b5aa22ce28f222ed20a 100644 --- a/paddle/operators/elementwise_max_op.h +++ b/paddle/operators/elementwise_max_op.h @@ -35,7 +35,8 @@ class ElementwiseMaxKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MaxFunctor(), z); } }; diff --git a/paddle/operators/elementwise_min_op.h b/paddle/operators/elementwise_min_op.h index 0de9a91c52b0ab82cd62604de318ce68e56b767d..cf11759404d3342b8a1c0080fa09f6cd57e735db 100644 --- a/paddle/operators/elementwise_min_op.h +++ b/paddle/operators/elementwise_min_op.h @@ -35,7 +35,8 @@ class ElementwiseMinKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MinFunctor(), z); } }; diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h index ae7a71e0244dfb8ad3e55683ac081f92bc36bea5..773125f5ca54e7b529df47a2823d56a5ad71e50d 100644 --- a/paddle/operators/elementwise_mul_op.h +++ b/paddle/operators/elementwise_mul_op.h @@ -34,7 +34,8 @@ class ElementwiseMulKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MulFunctor(), z); } }; diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 213fe1f5a818873e8b666464cb112637261c598c..74abf7c4a58788eb0e53025886f10f5a43021a9e 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -365,10 +365,10 @@ template void ElementwiseComputeEx(const framework::ExecutionContext& ctx, const framework::Tensor* x, - const framework::Tensor* y, int axis, + const framework::Tensor* y, int axis, Functor func, framework::Tensor* z) { TransformFunctor functor( - x, y, z, ctx.template device_context(), Functor()); + x, y, z, ctx.template device_context(), func); auto x_dims = x->dims(); auto y_dims = y->dims(); diff --git a/paddle/operators/elementwise_pow_op.h b/paddle/operators/elementwise_pow_op.h index 874fd3f09f2afaccfbfca75799cc3448f7393b03..0c5dd031ec46ebecaabb701839c0f69c02678eb0 100644 --- a/paddle/operators/elementwise_pow_op.h +++ b/paddle/operators/elementwise_pow_op.h @@ -36,7 +36,8 @@ class ElementwisePowKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + PowFunctor(), z); } }; diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h index c2749a8e6ba689233dab4f3c72de10bf01f39fab..6a88c5f6b4c869f8ab5b4fa3b112ffc264be7145 100644 --- a/paddle/operators/elementwise_sub_op.h +++ b/paddle/operators/elementwise_sub_op.h @@ -34,7 +34,8 @@ class ElementwiseSubKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, z); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + SubFunctor(), z); } }; diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc index 1c6d2ae4d05becaeed34d66cad398cc90f9d3ece..76d5d571c31c0cdec207cd171291da1f58d29b61 100644 --- a/paddle/operators/layer_norm_op.cc +++ b/paddle/operators/layer_norm_op.cc @@ -21,13 +21,6 @@ using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; -template -using EigenMatrixMapRowMajor = Eigen::Map< - Eigen::Matrix>; -template -using ConstEigenMatrixMapRowMajor = Eigen::Map< - const Eigen::Matrix>; - class LayerNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -108,7 +101,6 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Layer Normalization. - Layer Norm has been implemented as discussed in the paper: https://arxiv.org/abs/1607.06450 ... @@ -116,75 +108,6 @@ https://arxiv.org/abs/1607.06450 } }; -template -class LayerNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - auto *output = ctx.Output("Y"); - auto *mean = ctx.Output("Mean"); - auto *var = ctx.Output("Variance"); - output->mutable_data(ctx.GetPlace()); - mean->mutable_data(ctx.GetPlace()); - var->mutable_data(ctx.GetPlace()); - - auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - - auto input_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); - - auto mean_map = EigenMatrixMapRowMajor(mean->data(), left, 1); - auto var_map = EigenMatrixMapRowMajor(var->data(), left, 1); - auto output_map = EigenMatrixMapRowMajor(output->data(), left, right); - - auto squre = [](T ele) { return ele * ele; }; - auto add_epslion = [epsilon](T ele) { return ele + epsilon; }; - - mean_map = input_map.rowwise().mean(); - var_map = (input_map - mean_map.replicate(1, right)) - .unaryExpr(squre) - .rowwise() - .mean() - .unaryExpr(add_epslion); - - auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - // TODO(zcd): Some thinking about output_map, is it appropriate that - // `output_map` and `input_map` point to the same memory. - auto inv_std = var_map.unaryExpr(inv_std_func); - if (scale && bias) { - auto scale_map = - ConstEigenMatrixMapRowMajor(scale->data(), 1, right); - auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)) + - bias_map.replicate(left, 1); - } else if (scale) { - auto scale_map = - ConstEigenMatrixMapRowMajor(scale->data(), 1, right); - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)); - } else if (bias) { - auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)) + - bias_map.replicate(left, 1); - } else { - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)); - } - } -}; - class LayerNormGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -237,125 +160,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel { } }; -template -class LayerNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *mean = ctx.Input("Mean"); - const auto *var = ctx.Input("Variance"); - const auto *scale = ctx.Input("Scale"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - - // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - auto x_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); - auto d_y_map = ConstEigenMatrixMapRowMajor(d_y->data(), left, right); - auto mean_map = ConstEigenMatrixMapRowMajor(mean->data(), left, 1); - auto var_map = ConstEigenMatrixMapRowMajor(var->data(), left, 1); - - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - auto d_bias_map = EigenMatrixMapRowMajor(d_bias->data(), 1, right); - d_bias_map = d_y_map.colwise().sum(); - } - if (d_scale) { - d_scale->mutable_data(ctx.GetPlace()); - auto d_scale_map = - EigenMatrixMapRowMajor(d_scale->data(), 1, right); - auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - // There are two equation to compute d_scale. One uses "Y" and the other - // does not use "Y" - d_scale_map = - ((x_map - mean_map.replicate(1, right)) - .cwiseProduct( - var_map.unaryExpr(inv_std_func).replicate(1, right)) - .cwiseProduct(d_y_map)) - .colwise() - .sum(); - } - - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - auto d_x_map = EigenMatrixMapRowMajor(d_x->data(), left, right); - auto triple_product_func = [](T ele) { return ele * ele * ele; }; - auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - // TODO(zcd): these code can be refined - if (d_scale) { - auto scale_map = - ConstEigenMatrixMapRowMajor(scale->data(), 1, right); - // dy_dx - auto dx_end = var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map) - .cwiseProduct(scale_map.replicate(left, 1)); - // dy_dmean_dx - auto dx_mean = (T(-1.0) / right) * - var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map) - .cwiseProduct(scale_map.replicate(left, 1)) - .rowwise() - .sum() - .replicate(1, right); - // dy_var_dx - auto dvar_end_part = (x_map - mean_map.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)) - .cwiseProduct(d_y_map) - .rowwise() - .sum(); - auto dvar_end = var_map.unaryExpr(inv_std_func) - .unaryExpr(triple_product_func) - .cwiseProduct(dvar_end_part) - .replicate(1, right); - auto dx_var = - (T(-1.0) / right) * - (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); - - d_x_map = dx_end + dx_mean + dx_var; - } else { - // dy_dx - auto dx_end = var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map); - // dy_dmean_dx - auto dx_mean = (T(-1.0) / right) * - var_map.unaryExpr(inv_std_func) - .replicate(1, right) - .cwiseProduct(d_y_map) - .rowwise() - .sum() - .replicate(1, right); - // dy_var_dx - auto dvar_end_part = (x_map - mean_map.replicate(1, right)) - .cwiseProduct(d_y_map) - .rowwise() - .sum(); - auto dvar_end = var_map.unaryExpr(inv_std_func) - .unaryExpr(triple_product_func) - .cwiseProduct(dvar_end_part) - .replicate(1, right); - auto dx_var = - (T(-1.0) / right) * - (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); - - d_x_map = dx_end + dx_mean + dx_var; - } - } - } -}; - } // namespace operators } // namespace paddle @@ -363,8 +167,9 @@ namespace ops = paddle::operators; REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, layer_norm_grad, ops::LayerNormGradOp); REGISTER_OP_CPU_KERNEL( - layer_norm, - ops::LayerNormKernel); + layer_norm, ops::LayerNormKernel, + ops::LayerNormKernel); REGISTER_OP_CPU_KERNEL( layer_norm_grad, - ops::LayerNormGradKernel); + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); diff --git a/paddle/operators/layer_norm_op.cu b/paddle/operators/layer_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..77d13b216f0e8d6d4434742908437f1eb74818c9 --- /dev/null +++ b/paddle/operators/layer_norm_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/layer_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + layer_norm, + ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CUDA_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); diff --git a/paddle/operators/layer_norm_op.h b/paddle/operators/layer_norm_op.h index bca35b91e6f52d35dee14aac9d080b52914942e3..3c436b89263758bbc0abcd1bb71cef3e1370d2a5 100644 --- a/paddle/operators/layer_norm_op.h +++ b/paddle/operators/layer_norm_op.h @@ -16,19 +16,222 @@ limitations under the License. */ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/elementwise_op_function.h" +#include "paddle/operators/math/math_function.h" + namespace paddle { namespace operators { +template +struct SubAndSquareFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } +}; + +template +struct DivAndSqrtFunctor { + explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } + inline HOSTDEVICE T operator()(T a, T b) const { + return a / (sqrt(b + epsilon_)); + } + + private: + T epsilon_; +}; + +template +struct MulFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a * b; } +}; + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +template +struct SubFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } +}; + +template +struct MulInvVarFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { + return a * std::sqrt(1.0 / b); + } +}; + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + template class LayerNormKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override; + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto x = *ctx.Input("X"); + + auto *y = ctx.Output("Y"); + auto *mean = ctx.Output("Mean"); + auto *var = ctx.Output("Variance"); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + const auto x_dims = x.dims(); + + y->mutable_data(ctx.GetPlace()); + mean->mutable_data(ctx.GetPlace()); + var->mutable_data(ctx.GetPlace()); + + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + framework::DDim matrix_shape({left, right}); + + x.Resize(matrix_shape); + Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + auto &dev_ctx = ctx.template device_context(); + math::RowwiseMean row_mean; + + // get mean + row_mean(dev_ctx, x, mean); + + // get variance + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), &out); + row_mean(dev_ctx, out, var); + + // get x_norm + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &out); + + if (scale) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, scale, /*axis*/ 1, MulFunctor(), &out); + } + if (bias) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, &out, bias, /*axis*/ 1, AddFunctor(), &out); + } + } }; template class LayerNormGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override; + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + auto x = *ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *mean = ctx.Input("Mean"); + auto *var = ctx.Input("Variance"); + auto *scale = ctx.Input("Scale"); + auto *bias = ctx.Input("Bias"); + auto d_y = *ctx.Input(framework::GradVarName("Y")); + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + const auto &x_dims = x.dims(); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + framework::DDim matrix_shape({left, right}); + + d_y.Resize(matrix_shape); + auto &dev_ctx = ctx.template device_context(); + math::ColwiseSum colwise_sum; + + Tensor temp; + Tensor temp_norm; + if (d_scale || d_x) { + x.Resize(matrix_shape); + temp.mutable_data(matrix_shape, ctx.GetPlace()); + + if (!(bias && scale)) { + temp_norm.ShareDataWith(*y); + temp_norm.Resize(matrix_shape); + } else { + temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); + // get x_norm + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); + } + } + + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + colwise_sum(dev_ctx, d_y, d_bias); + } + if (d_scale) { + d_scale->mutable_data(ctx.GetPlace()); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor(), &temp); + colwise_sum(dev_ctx, temp, d_scale); + } + + if (d_x) { + framework::DDim vec_shape({left}); + d_x->mutable_data(ctx.GetPlace()); + auto dx_dim = d_x->dims(); + Tensor temp_vec; + temp_vec.mutable_data(vec_shape, ctx.GetPlace()); + + math::RowwiseMean row_mean; + + if (d_scale) { + // dy_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &d_y, scale, /*axis*/ 1, MulFunctor(), &temp); + framework::Copy(temp, ctx.GetPlace(), ctx.device_context(), d_x); + + // dy_dmean_dx + row_mean(dev_ctx, temp, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); + + // dy_var_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); + } else { + // dy_dx + framework::Copy(d_y, ctx.GetPlace(), ctx.device_context(), d_x); + + // dy_dmean_dx + row_mean(dev_ctx, d_y, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); + + // dy_var_dx + ElementwiseComputeEx, DeviceContext, T>( + ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); + } + // dy_var_dx + row_mean(dev_ctx, temp, &temp_vec); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor(), &temp); + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, &temp, /*axis*/ 0, SubFunctor(), d_x); + + ElementwiseComputeEx, DeviceContext, T>( + ctx, d_x, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), d_x); + d_x->Resize(dx_dim); + } + } }; } // namespace operators diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index dcf4b85e1aadf88e4b1ca70ac7e8b5416fc58cd8..ce0a5f6cff873166e3308a625978ecefaed2aa29 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -331,6 +331,12 @@ template struct RowwiseAdd; template struct ColwiseSum; template struct ColwiseSum; +template struct RowwiseSum; +template struct RowwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index d47a7f818ded61baf31e46ea3b8ae3101324111f..c0a107470a4629506fc06dabc78a4a4716be6649 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -325,6 +325,31 @@ void ColwiseSum::operator()( vector->data()); } +template struct RowwiseSum; +// template struct RowwiseSum; +// TODO(zcd): Following ColwiseSum format, need to confirm. +// The RowwiseSum failed in debug mode, +// and only failed for this case. So reimplemented it. +template <> +void RowwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, + framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]); + framework::Tensor one; + one.mutable_data({size}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + gemv( + context, true, static_cast(in_dims[1]), static_cast(in_dims[0]), + 1.0, one.data(), input.data(), 0.0, + vector->data()); +} + +template struct RowwiseMean; +template struct RowwiseMean; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index 8cc03c2ba0facae691a0d2b8a4f2ea768cfa5491..cb14d1e57468564710640773fdabd41896c178e0 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -128,6 +128,18 @@ struct ColwiseSum { framework::Tensor* vec); }; +template +struct RowwiseSum { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + +template +struct RowwiseMean { + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h index de591626df28e2bc3391b609f909612411398247..af4127788af0aaeb99199f7d6e2138a449b9fe51 100644 --- a/paddle/operators/math/math_function_impl.h +++ b/paddle/operators/math/math_function_impl.h @@ -87,6 +87,88 @@ class ColwiseSum { } }; +template +void RowwiseMean::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.mean(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class RowwiseMean { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), height); + auto inv_size = 1.0 / size; + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum * inv_size; + } + } +}; + +template +void RowwiseSum::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* out) { + auto in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.sum(Eigen::array({{1}})); +} +// TODO(zcd): Following ColwiseSum format, need to confirm. +// Specialize for CPU, since Eigen implement a general reduce. However, +// rowwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class RowwiseSum { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), size); + + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < static_cast(height); ++i) { + T sum = 0; + for (size_t j = 0; j < static_cast(size); ++j) { + sum += in_buf[i * size + j]; + } + out_buf[i] = sum; + } + } +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py index 121b407cae41fa477843b7252ebacc9053d5f7aa..c5f1d51bd718acf32d173b97ee7bb7cdeb443c63 100644 --- a/python/paddle/v2/fluid/distribute_transpiler.py +++ b/python/paddle/v2/fluid/distribute_transpiler.py @@ -300,6 +300,9 @@ class DistributeTranspiler: pass return orig_shape + def _op_input_var(self, op, varname): + pass + def _is_op_on_pserver(self, endpoint, all_ops, idx): """ Recursively check if the op need to run on current server. @@ -309,29 +312,35 @@ class DistributeTranspiler: p.name for p in self.param_grad_ep_mapping[endpoint]["params"] ] op = all_ops[idx] - if op.inputs.has_key("Param"): - if op.inputs["Param"].name in param_names: + input_names = set(op.input_names) + # TODO(typhoonzero): using Param and Grad input name to identify + # that the operator is an optimization operator, need a better way. + if "Param" in input_names: + if op.input("Param")[0] in param_names: return True else: for n in param_names: - if same_or_split_var(n, op.inputs[ - "Param"].name) and n != op.inputs["Param"].name: + if same_or_split_var(n, op.input("Param")[0]) \ + and n != op.input("Param")[0]: return True return False else: j = idx - 1 while j >= 0: prev_op = all_ops[j] - prev_output_names = [o.name for o in prev_op.outputs.values()] - prev_input_names = [o.name for o in prev_op.inputs.values()] + # prev_output_names = [o.name for o in prev_op.outputs.values()] + # prev_input_names = [o.name for o in prev_op.inputs.values()] + # NOTE(typhoonzero): consider list input/output + prev_output_names = prev_op.desc.output_arg_names() + prev_input_names = prev_op.desc.input_arg_names() found1 = False found2 = False - for _, v in op.inputs.iteritems(): - if v.name in prev_output_names: + for varname in op.desc.input_arg_names(): + if varname in prev_output_names: found1 = self._is_op_on_pserver(endpoint, all_ops, j) # later ops may produce output for prev op's next batch use. - for _, v in op.outputs.iteritems(): - if v.name in prev_input_names: + for varname in op.desc.output_arg_names(): + if varname in prev_input_names: found2 = self._is_op_on_pserver(endpoint, all_ops, j) if found1 or found2: return True @@ -342,11 +351,11 @@ class DistributeTranspiler: new_inputs = dict() # update param/grad shape first, then other inputs like # moment can use the updated shape - for key, var in opt_op.inputs.iteritems(): + for key in opt_op.input_names: if key == "Grad": grad_block = None for g in self.param_grad_ep_mapping[endpoint]["grads"]: - if same_or_split_var(g.name, var.name): + if same_or_split_var(g.name, opt_op.input(key)[0]): grad_block = g break if not grad_block: @@ -376,7 +385,7 @@ class DistributeTranspiler: # param is already created on global program param_block = None for p in self.param_grad_ep_mapping[endpoint]["params"]: - if same_or_split_var(p.name, var.name): + if same_or_split_var(p.name, opt_op.input(key)[0]): param_block = p break if not param_block: @@ -389,11 +398,12 @@ class DistributeTranspiler: new_inputs[key] = tmpvar - for key, var in opt_op.inputs.iteritems(): + for key in opt_op.input_names: if key in ["Param", "Grad"]: continue # update accumulator variable shape param_shape = new_inputs["Param"].shape + var = program.global_block().vars[opt_op.input(key)[0]] new_shape = self._get_optimizer_input_shape(opt_op.type, key, var.shape, param_shape) tmpvar = program.global_block().create_var( @@ -412,30 +422,44 @@ class DistributeTranspiler: shape=new_shape) # change output's ParamOut variable - opt_op.outputs["ParamOut"] = new_inputs["Param"] + outputs = self._get_output_map_from_op(program.global_block(), opt_op) + outputs["ParamOut"] = new_inputs["Param"] program.global_block().append_op( type=opt_op.type, inputs=new_inputs, - outputs=opt_op.outputs, + outputs=outputs, attrs=opt_op.attrs) def _append_pserver_non_opt_ops(self, program, pserver_program, opt_op): # Append the ops for parameters that do not need to be optimized/updated - for _, var in opt_op.inputs.iteritems(): - program.global_block().create_var( - name=var.name, - persistable=var.persistable, - dtype=var.dtype, - shape=var.shape) - pserver_program.global_block().create_var( - name=var.name, - persistable=var.persistable, - dtype=var.dtype, - shape=var.shape) + inputs = self._get_input_map_from_op(self.program.global_block().vars, + opt_op) + for var in inputs.itervalues(): + if type(var) == list: + varlist = var + else: + varlist = [var] + for var in varlist: + # TODO(typhoonzero): will remove below line later. + program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + if not pserver_program.global_block().vars.has_key(var.name): + pserver_program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + + outputs = self._get_output_map_from_op(self.program.global_block().vars, + opt_op) + program.global_block().append_op( type=opt_op.type, - inputs=opt_op.inputs, - outputs=opt_op.outputs, + inputs=inputs, + outputs=outputs, attrs=opt_op.attrs) def get_pserver_program(self, endpoint): @@ -472,7 +496,7 @@ class DistributeTranspiler: self.optimize_ops, idx) if not is_op_on_pserver: continue - if opt_op.inputs.has_key("Grad"): + if "Grad" in opt_op.desc.input_arg_names(): self._append_pserver_ops(optimize_sub_program, pserver_program, opt_op, endpoint) else: @@ -499,6 +523,30 @@ class DistributeTranspiler: pserver_program.sync_with_cpp() return pserver_program + def _get_input_map_from_op(self, varmap, op): + iomap = dict() + for key in op.input_names: + vars = [] + for varname in op.input(key): + vars.append(varmap[varname]) + if len(vars) == 1: + iomap[key] = vars[0] + else: + iomap[key] = vars + return iomap + + def _get_output_map_from_op(self, varmap, op): + iomap = dict() + for key in op.output_names: + vars = [] + for varname in op.output(key): + vars.append(varmap[varname]) + if len(vars) == 1: + iomap[key] = vars[0] + else: + iomap[key] = vars + return iomap + def get_startup_program(self, endpoint, pserver_program): """ Get startup program for current parameter server. @@ -529,17 +577,21 @@ class DistributeTranspiler: # 2. rename op outputs for op in orig_s_prog.global_block().ops: + new_inputs = dict() new_outputs = dict() # do not append startup op if var is not on this pserver op_on_pserver = False - for key, var in op.outputs.iteritems(): - newname, _ = _get_splited_name_and_shape(var.name) + for key in op.output_names: + newname, _ = _get_splited_name_and_shape(op.output(key)[0]) if newname: op_on_pserver = True new_outputs[key] = created_var_map[newname] - elif var.name in pserver_vars: + elif op.output(key)[0] in pserver_vars: op_on_pserver = True - new_outputs[key] = pserver_vars[var.name] + new_outputs[key] = pserver_vars[op.output(key)[0]] + + # most startup program ops have no inputs + new_inputs = self._get_input_map_from_op(pserver_vars, op) if op_on_pserver: if op.type in [ @@ -548,7 +600,7 @@ class DistributeTranspiler: op.attrs["shape"] = new_outputs["Out"].shape s_prog.global_block().append_op( type=op.type, - inputs=op.inputs, + inputs=new_inputs, outputs=new_outputs, attrs=op.attrs) return s_prog diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py index 68cf8673cd46677065588f652482cd0df08b3450..4460ffaf9c46966178497419a35ef4044464ac9f 100644 --- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py @@ -20,6 +20,8 @@ import paddle.v2.fluid.core as core from paddle.v2.fluid.op import Operator from paddle.v2.fluid.framework import grad_var_name +np.random.random(123) + def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): x_shape = x.shape @@ -62,9 +64,9 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1): grad_x = dx_end + d_mean + d_std - grad_y.shape = x_shape - x.shape = x_shape + grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape scale.shape = scale_shape + var.shape, mean.shape = [N, ], [N, ] return grad_x, d_scale, d_bias @@ -112,10 +114,7 @@ def set_output_grad(scope, outputs, place, feed_dict=None): class TestLayerNormdOp(OpTest): def __assert_close(self, tensor, np_array, msg, atol=1e-4): - self.assertTrue( - np.allclose( - np.array(tensor).reshape(np_array.shape), np_array, atol=atol), - msg) + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) def __assert_grad_close(self, tensor, @@ -123,7 +122,7 @@ class TestLayerNormdOp(OpTest): name, place, max_relative_error=0.02): - a = np.array(tensor).reshape(np_array.shape) + a = np.array(tensor) b = np_array abs_a = np.abs(a) abs_a[abs_a < 1e-5] = 1 @@ -151,7 +150,7 @@ class TestLayerNormdOp(OpTest): x_shape = shape D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) scale_shape = [D] - np.random.random(123) + x_val = np.random.random_sample(x_shape).astype(np.float32) scale_val = np.random.random_sample(scale_shape).astype(np.float32) bias_val = np.random.random_sample(scale_shape).astype(np.float32)