From 7aa4d879ca913d4da2d9a1155aa8fc33d20701e3 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Wed, 8 Sep 2021 13:35:12 +0800
Subject: [PATCH] add clip_by_norm fp16 kernel (#35446)

* add clip_by_norm fp16 kernel

* add ut
---
 paddle/fluid/operators/clip_by_norm_op.cu    | 117 +++++++++++++++++-
 .../tests/unittests/test_clip_by_norm_op.py  |  35 +++++-
 2 files changed, 150 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu
index 788eab7cb2b..5997e467693 100644
--- a/paddle/fluid/operators/clip_by_norm_op.cu
+++ b/paddle/fluid/operators/clip_by_norm_op.cu
@@ -13,8 +13,123 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/clip_by_norm_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+template <typename Tx, typename Ty = Tx>
+struct SquareTransformer {
+  HOSTDEVICE explicit inline SquareTransformer(int n) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx& x) const {
+    return static_cast<Ty>(x) * static_cast<Ty>(x);
+  }
+
+  HOSTDEVICE inline Ty operator()(const Tx* x) const {
+    return static_cast<Ty>(x[0]) * static_cast<Ty>(x[0]);
+  }
+};
+
+template <typename Tx, typename Ty = Tx>
+struct SquareSum {
+  using Transformer = SquareTransformer<Tx, Ty>;
+
+  inline Ty initial() { return static_cast<Ty>(0.0f); }
+
+  __device__ __forceinline__ Ty operator()(const Ty& a, const Ty& b) const {
+    return b + a;
+  }
+};
+
+template <>
+class ClipByNormKernel<platform::CUDADeviceContext, platform::float16>
+    : public framework::OpKernel<platform::float16> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max_norm = context.Attr<float>("max_norm");
+    auto in_var = context.InputVar("X");
+    auto& dev_ctx =
+        context.template device_context<platform::CUDADeviceContext>();
+
+    Tensor* output = nullptr;
+    const Tensor* input = nullptr;
+    if (in_var->IsType<framework::LoDTensor>()) {
+      input = context.Input<Tensor>("X");
+
+      output = context.Output<Tensor>("Out");
+      output->mutable_data<platform::float16>(context.GetPlace());
+    } else if (in_var->IsType<SelectedRows>()) {
+      auto* x = context.Input<SelectedRows>("X");
+
+      // merge ids in selected rows first
+      math::scatter::MergeAdd<platform::CUDADeviceContext, platform::float16>
+          merge_func;
+      SelectedRows* merged_input =
+          const_cast<framework::Scope&>(context.scope())
+              .Var()
+              ->GetMutable<SelectedRows>();
+      merge_func(context.template device_context<platform::CUDADeviceContext>(),
+                 *x, merged_input);
+      input = &(merged_input->value());
+
+      SelectedRows* output_selected_rows = context.Output<SelectedRows>("Out");
+      output_selected_rows->set_rows(merged_input->rows());
+      output_selected_rows->set_height(merged_input->height());
+      output = output_selected_rows->mutable_value();
+      output->Resize(merged_input->value().dims());
+      output->mutable_data<platform::float16>(context.GetPlace());
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Invalid input variable type, only support LodTensor and "
+          "SelectedRows types, but got type is %s.",
+          framework::ToTypeName(in_var->Type())));
+    }
+
+    PADDLE_ENFORCE_NOT_NULL(input,
+                            platform::errors::InvalidArgument(
+                                "Input(X) of ClipByNormOp should not be null. "
+                                "Please check if it is created correctly."));
+    std::vector<int> reduce_dims;
+    reduce_dims.resize(input->dims().size());
+    for (int i = 0; i < reduce_dims.size(); ++i) {
+      reduce_dims[i] = i;
+    }
+    Tensor tmp = context.AllocateTmpTensor<float, platform::CUDADeviceContext>(
+        {1}, dev_ctx);
+    TensorReduceFunctorImpl<platform::float16, float, SquareSum>(
+        *input, &tmp, reduce_dims, dev_ctx.stream());
+    auto tmp_eigen = EigenVector<float>::Flatten(tmp);
+    auto x_norm = tmp_eigen.sqrt();
+
+    auto x = EigenVector<platform::float16>::Flatten(*input);
+    auto out = EigenVector<platform::float16>::Flatten(*output);
+
+    auto& place =
+        *context.template device_context<platform::CUDADeviceContext>()
+             .eigen_device();
+
+    auto temp = (x_norm <= max_norm).template cast<float>();
+    auto epsilon =
+        ((x_norm <= static_cast<float>(1e-30)).all().template cast<float>()) *
+        static_cast<float>(1e-6);
+
+    auto scaling =
+        (temp + (static_cast<float>(1) - temp) * max_norm / (x_norm + epsilon))
+            .template cast<platform::float16>();
+    Eigen::array<int, 1> one_dim{{1}};
+    Eigen::DSizes<int, 1> m_dsize(input->numel());
+
+    out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     clip_by_norm,
-    ops::ClipByNormKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::ClipByNormKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ClipByNormKernel<paddle::platform::CUDADeviceContext, plat::float16>);
diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
index d78cb9b0321..de7aaf94790 100644
--- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
@@ -25,8 +25,9 @@ import paddle.fluid.core as core
 class TestClipByNormOp(OpTest):
     def setUp(self):
         self.max_relative_error = 0.006
+        self.init_dtype()
         self.initTestCase()
-        input = np.random.random(self.shape).astype("float32")
+        input = np.random.random(self.shape).astype(self.dtype)
         input[np.abs(input) < self.max_relative_error] = 0.5
         self.op_type = "clip_by_norm"
         self.inputs = {'X': input, }
@@ -46,6 +47,9 @@
         self.shape = (100, )
         self.max_norm = 1.0
 
+    def init_dtype(self):
+        self.dtype = np.float32
+
 
 class TestCase1(TestClipByNormOp):
     def initTestCase(self):
@@ -65,6 +69,35 @@ class TestCase3(TestClipByNormOp):
         self.max_norm = 1.0
 
 
+class TestClipByNormOpFp16(TestClipByNormOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=0.001)
+
+
+class TestClipByNormOpFp16Case1(TestClipByNormOpFp16):
+    def initTestCase(self):
+        self.shape = (100, )
+        self.max_norm = 1e20
+
+
+class TestClipByNormOpFp16Case2(TestClipByNormOpFp16):
+    def initTestCase(self):
+        self.shape = (16, 16)
+        self.max_norm = 0.1
+
+
+class TestClipByNormOpFp16Case3(TestClipByNormOpFp16):
+    def initTestCase(self):
+        self.shape = (4, 8, 16)
+        self.max_norm = 1.0
+
+
 class TestClipByNormOpWithSelectedRows(unittest.TestCase):
     def check_with_place(self, place):
         self.config_test_case()
-- 
GitLab