diff --git a/paddle/fluid/operators/clip_by_norm_op_npu.cc b/paddle/fluid/operators/clip_by_norm_op_npu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e6b46421afa7268e026cfb79438f59a2c5457cdf
--- /dev/null
+++ b/paddle/fluid/operators/clip_by_norm_op_npu.cc
@@ -0,0 +1,94 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/clip_by_norm_op.h"
+#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class NPUClipByNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max_norm = context.Attr<float>("max_norm");
+    auto in_var = context.InputVar("X");
+
+    if (!(in_var->IsType<framework::LoDTensor>())) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Invalid input variable type, only support LoDTensor "
+          "type, but got type is %s.",
+          framework::ToTypeName(in_var->Type())));
+    }
+
+    auto place = context.GetPlace();
+    auto& dev_ctx =
+        context.template device_context<paddle::platform::NPUDeviceContext>();
+    auto stream = dev_ctx.stream();
+
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(place);
+
+    PADDLE_ENFORCE_NOT_NULL(input,
+                            platform::errors::InvalidArgument(
+                                "Input(X) of ClipByNormOp should not be null. "
+                                "Please check if it is created correctly."));
+
+    Tensor square_sum(input->type());
+    square_sum.mutable_data<T>(framework::DDim({1}), place);
+    const auto& x_dims = input->dims();
+    std::vector<int> axis;
+    for (int i = 0; i < x_dims.size(); ++i) {
+      axis.push_back(i);
+    }
+    const auto& square_sum_runner =
+        NpuOpRunner("SquareSumV1", {*input}, {square_sum},
+                    {{"axis", axis}, {"keep_dims", false}});
+    square_sum_runner.Run(stream);
+
+    Tensor x_norm(input->type());
+    x_norm.mutable_data<T>(framework::DDim({1}), place);
+    const auto& x_norm_runner = NpuOpRunner("Sqrt", {square_sum}, {x_norm}, {});
+    x_norm_runner.Run(stream);
+
+    Tensor x_norm_t;
+    framework::TensorCopySync(x_norm, platform::CPUPlace(), &x_norm_t);
+    auto x_norm_v = static_cast<float>(*x_norm_t.data<T>());
+    if (x_norm_v <= max_norm) {
+      framework::TensorCopy(*input, place, dev_ctx, output);
+    } else {
+      auto epsilon = x_norm_v <= static_cast<float>(1e-30)
+                         ? static_cast<float>(1e-6)
+                         : static_cast<float>(0);
+      float scaling = max_norm / (x_norm_v + epsilon);
+      const auto& muls_runner =
+          NpuOpRunner("Muls", {*input}, {*output}, {{"value", scaling}});
+      muls_runner.Run(stream);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_NPU_KERNEL(
+    clip_by_norm,
+    ops::NPUClipByNormKernel<plat::NPUDeviceContext, float>,
+    ops::NPUClipByNormKernel<plat::NPUDeviceContext, plat::float16>);
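
Note on the kernel above: it computes a single global L2 norm (SquareSumV1 over all axes, then Sqrt), copies that scalar back to the host, and then either passes the input through unchanged (norm <= max_norm) or scales it by max_norm / (norm + epsilon) via Muls. A minimal NumPy sketch of those semantics, for illustration only (the helper name clip_by_norm_ref is hypothetical, not part of the patch):

import numpy as np

def clip_by_norm_ref(x, max_norm):
    # Global L2 norm over all axes, mirroring SquareSumV1 + Sqrt.
    norm = np.sqrt(np.sum(np.square(x)))
    if norm <= max_norm:
        return x
    # Same epsilon guard as the kernel's near-zero-norm branch.
    epsilon = 1e-6 if norm <= 1e-30 else 0.0
    return x * (max_norm / (norm + epsilon))

The unit test below checks the kernel against this same reference computation.
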
diff --git a/python/paddle/fluid/tests/unittests/npu/test_clip_by_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_clip_by_norm_op_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..d71fc142ade3a6f0d05996a778daf8793873634a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_clip_by_norm_op_npu.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import sys
+sys.path.append("..")
+from op_test import OpTest
+
+paddle.enable_static()
+
+
+class TestClipByNormOp(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.max_relative_error = 0.006
+        self.init_dtype()
+        self.initTestCase()
+        input = np.random.random(self.shape).astype(self.dtype)
+        input[np.abs(input) < self.max_relative_error] = 0.5
+        self.op_type = "clip_by_norm"
+        self.inputs = {'X': input, }
+        self.attrs = {}
+        self.attrs['max_norm'] = self.max_norm
+        norm = np.sqrt(np.sum(np.square(input)))
+        if norm > self.max_norm:
+            output = self.max_norm * input / norm
+        else:
+            output = input
+        self.outputs = {'Out': output}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.place = paddle.NPUPlace(0)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def initTestCase(self):
+        self.shape = (100, )
+        self.max_norm = 1.0
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+
+class TestCase1(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (100, )
+        self.max_norm = 1e20
+
+
+class TestCase2(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (16, 16)
+        self.max_norm = 0.1
+
+
+class TestCase3(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (4, 8, 16)
+        self.max_norm = 1.0
+
+
+class TestClipByNormOpFp16(TestClipByNormOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, atol=1e-3)
+
+
+class TestClipByNormOpFp16Case1(TestClipByNormOpFp16):
+    def initTestCase(self):
+        self.shape = (100, )
+        self.max_norm = 1e20
+
+
+class TestClipByNormOpFp16Case2(TestClipByNormOpFp16):
+    def initTestCase(self):
+        self.shape = (16, 16)
+        self.max_norm = 0.1
+
+
+class TestClipByNormOpFp16Case3(TestClipByNormOpFp16):
+    def initTestCase(self):
+        self.shape = (4, 8, 16)
+        self.max_norm = 1.0
+
+
+if __name__ == '__main__':
+    unittest.main()
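
Usage sketch (not part of the patch): under static graph mode the op registered above is reachable through the existing fluid.layers.clip_by_norm wrapper; the shape, max_norm value, and NPUPlace(0) device index below are illustrative assumptions.

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name='x', shape=[16, 16], dtype='float32')
    # Dispatches to NPUClipByNormKernel when run on an NPUPlace.
    out = fluid.layers.clip_by_norm(x=x, max_norm=0.1)

place = paddle.NPUPlace(0)  # assumes NPU device 0 is available
exe = fluid.Executor(place)
exe.run(startup_prog)
out_np, = exe.run(main_prog,
                  feed={'x': np.random.random((16, 16)).astype('float32')},
                  fetch_list=[out])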