diff --git a/paddle/fluid/operators/ngraph/ops/layer_norm_op.h b/paddle/fluid/operators/ngraph/ops/layer_norm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f56110f969747553ee10e43d91cf4cc5107fadab
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/layer_norm_op.h
@@ -0,0 +1,195 @@
+/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+// Reshape a reduced tensor so that only the leading (batch) axes remain.
+std::shared_ptr<ngraph::Node> reshape_reduction(
+    std::shared_ptr<ngraph::Node> node, const ngraph::Shape shape,
+    int begin_norm_axis) {
+  ngraph::Shape keepdims_shape(shape.begin(), shape.begin() + begin_norm_axis);
+  return paddle::platform::NgReshaper(node, keepdims_shape);
+}
+
+// Broadcast a reduced tensor (mean/stddev) back to the full input shape.
+std::shared_ptr<ngraph::Node> broadcast_reduction(
+    std::shared_ptr<ngraph::Node> node, const ngraph::Shape shape,
+    int begin_norm_axis) {
+  ngraph::AxisSet axis_set;
+  for (size_t i = begin_norm_axis; i < shape.size(); ++i) axis_set.insert(i);
+  auto reshape = reshape_reduction(node, shape, begin_norm_axis);
+  return std::make_shared<ngraph::op::Broadcast>(reshape, shape, axis_set);
+}
+
+// Reshape Scale/Bias so that only the normalized (trailing) axes remain.
+std::shared_ptr<ngraph::Node> reshape_bias_scale(
+    std::shared_ptr<ngraph::Node> node, const ngraph::Shape shape,
+    int begin_norm_axis) {
+  ngraph::Shape keepdims_shape(shape.begin() + begin_norm_axis, shape.end());
+  return paddle::platform::NgReshaper(node, keepdims_shape);
+}
+
+// Broadcast Scale/Bias over the leading (batch) axes of the input.
+std::shared_ptr<ngraph::Node> broadcast_bias_scale(
+    std::shared_ptr<ngraph::Node> node, const ngraph::Shape shape,
+    int begin_norm_axis) {
+  auto reshape = reshape_bias_scale(node, shape, begin_norm_axis);
+  ngraph::AxisSet axis_set;
+  for (int i = 0; i < begin_norm_axis; ++i) axis_set.insert(i);
+  return std::make_shared<ngraph::op::Broadcast>(reshape, shape, axis_set);
+}
+
+// Collapse a node into a rank-1 tensor (optionally with a leading 1).
+std::shared_ptr<ngraph::Node> flatten(const std::shared_ptr<ngraph::Node>& node,
+                                      bool insert_leading_one = false) {
+  size_t out = 1;
+  for (auto s : node->get_shape()) out *= s;
+  if (insert_leading_one) {
+    return paddle::platform::NgReshaper(node, ngraph::Shape{1, out});
+  } else {
+    return paddle::platform::NgReshaper(node, ngraph::Shape{out});
+  }
+}
+
+static void BuildLayerNormNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  const auto begin_norm_axis = op_attrs.Get<int>("begin_norm_axis");
+  const auto epsilon = op_attrs.Get<float>("epsilon");
+
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
+  auto bias = paddle::platform::GetInputNode(op, "Bias", ngb_node_map);
+
+  auto shape = x->get_shape();
+  std::vector<size_t> reduction_axes(shape.size() - begin_norm_axis);
+  std::iota(reduction_axes.begin(), reduction_axes.end(), begin_norm_axis);
+
+  auto mean = ngraph::builder::mean(x, reduction_axes);
+  auto broadcast_mean = broadcast_reduction(mean, shape, begin_norm_axis);
+
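+  // Layer normalization: Y = Scale * (X - mean) / sqrt(variance + epsilon)
+  // + Bias, where mean and variance are reduced over the axes at and after
+  // begin_norm_axis, and Scale/Bias span exactly those normalized axes.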
+  auto delta = x - broadcast_mean;
+  auto variance = ngraph::builder::mean(delta * delta, reduction_axes);
+
+  auto eps = paddle::platform::CreateConstant(variance->get_element_type(),
+                                              variance->get_shape(), {epsilon});
+
+  auto stddev = std::make_shared<ngraph::op::Sqrt>(variance + eps);
+  auto broadcast_stddev = broadcast_reduction(stddev, shape, begin_norm_axis);
+
+  auto norm = delta / broadcast_stddev;
+
+  if (scale) {
+    auto broadcast_scale = broadcast_bias_scale(scale, shape, begin_norm_axis);
+    norm = norm * broadcast_scale;
+  }
+  if (bias) {
+    auto broadcast_bias = broadcast_bias_scale(bias, shape, begin_norm_axis);
+    norm = norm + broadcast_bias;
+  }
+  mean = flatten(mean);
+  variance = flatten(variance);
+  paddle::platform::SetOutputNode(op, "Y", norm, ngb_node_map);
+  paddle::platform::SetOutputNode(op, "Mean", mean, ngb_node_map);
+  paddle::platform::SetOutputNode(op, "Variance", variance, ngb_node_map);
+}
+
+static void BuildLayerNormGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  const auto begin_norm_axis = op_attrs.Get<int>("begin_norm_axis");
+  const auto epsilon = op_attrs.Get<float>("epsilon");
+
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto mean = paddle::platform::GetInputNode(op, "Mean", ngb_node_map);
+  auto variance = paddle::platform::GetInputNode(op, "Variance", ngb_node_map);
+  auto scale = paddle::platform::GetInputNode(op, "Scale", ngb_node_map);
+  auto dy = paddle::platform::GetInputNode(op, framework::GradVarName("Y"),
+                                           ngb_node_map);
+
+  auto dx = paddle::platform::GetOutputNode(op, framework::GradVarName("X"),
+                                            ngb_node_map);
+  auto dscale = paddle::platform::GetOutputNode(
+      op, framework::GradVarName("Scale"), ngb_node_map);
+  auto dbias = paddle::platform::GetOutputNode(
+      op, framework::GradVarName("Bias"), ngb_node_map);
+
+  auto shape = x->get_shape();
+
+  auto broadcast_mean = broadcast_reduction(mean, shape, begin_norm_axis);
+
+  auto delta = x - broadcast_mean;
+  auto eps = paddle::platform::CreateConstant(variance->get_element_type(),
+                                              variance->get_shape(), {epsilon});
+
+  auto stddev = std::make_shared<ngraph::op::Sqrt>(variance + eps);
+  auto broadcast_stddev = broadcast_reduction(stddev, shape, begin_norm_axis);
+
+  auto norm = delta / broadcast_stddev;
+
+  // Bias@GRAD: sum of dY over the leading (batch) axes.
+  if (dbias) {
+    std::vector<size_t> reduction_axes(begin_norm_axis);
+    std::iota(reduction_axes.begin(), reduction_axes.end(), 0);
+    auto sum_dy = std::make_shared<ngraph::op::Sum>(dy, reduction_axes);
+    paddle::platform::SetOutputNode(op, framework::GradVarName("Bias"),
+                                    flatten(sum_dy), ngb_node_map);
+  }
+  // Scale@GRAD: sum of dY * normalized(X) over the leading (batch) axes.
+  if (dscale) {
+    std::vector<size_t> reduction_axes(begin_norm_axis);
+    std::iota(reduction_axes.begin(), reduction_axes.end(), 0);
+    auto sum_dy = std::make_shared<ngraph::op::Sum>(dy * norm, reduction_axes);
+    paddle::platform::SetOutputNode(op, framework::GradVarName("Scale"),
+                                    flatten(sum_dy), ngb_node_map);
+  }
+
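+  // X@GRAD = dX_partial - mean(dX_partial) - norm * mean(dX_partial * norm),
+  // where dX_partial is dY / stddev (multiplied by Scale when present) and
+  // both means are taken over, then broadcast back to, the normalized axes.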
+  if (dx) {
+    std::shared_ptr<ngraph::Node> dx_end = dy / broadcast_stddev;
+    if (dscale)
+      dx_end = dx_end * broadcast_bias_scale(scale, shape, begin_norm_axis);
+
+    std::vector<size_t> reduction_axes(shape.size() - begin_norm_axis);
+    std::iota(reduction_axes.begin(), reduction_axes.end(), begin_norm_axis);
+
+    auto dx_mean = broadcast_reduction(
+        ngraph::builder::mean(-dx_end, reduction_axes), shape,
+        begin_norm_axis);
+
+    auto dx_std =
+        norm * broadcast_reduction(
+                   ngraph::builder::mean(-dx_end * norm, reduction_axes),
+                   shape, begin_norm_axis);
+
+    paddle::platform::SetOutputNode(op, framework::GradVarName("X"),
+                                    dx_end + dx_mean + dx_std, ngb_node_map);
+  }
+}
+
+REGISTER_NG_OP(layer_norm, BuildLayerNormNode);
+REGISTER_NG_OP(layer_norm_grad, BuildLayerNormGradNode);
+
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_layer_norm_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_layer_norm_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..a59eaade1bbb8f14765aea5d3c9b00b95b7078b1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_layer_norm_ngraph_op.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest, sys
+sys.path.append("../")
+from test_layer_norm_op import TestLayerNormdOp
+
+
+class TestLayerNormNGRAPHOp(TestLayerNormdOp):
+    def setUp(self):
+        super(TestLayerNormNGRAPHOp, self).setUp()
+        self.use_cudnn = False
+
+
+del TestLayerNormdOp
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
index fb6c43136ff82af55d1fcc2969cf4a07ae081204..fdc5d3679e71036cf1e1d813e654815eb03dd45c 100644
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
@@ -72,6 +72,9 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
 
 
 class TestLayerNormdOp(unittest.TestCase):
+    def setUp(self):
+        self.use_cudnn = True
+
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
 
@@ -160,7 +163,8 @@ class TestLayerNormdOp(unittest.TestCase):
         self.__assert_close(bias_grad, out[5], "bias_grad")
 
         places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+                "layer_norm") and self.use_cudnn:
             places.append(core.CUDAPlace(0))
 
         for place in places: