From cfa691337a866b2a77321253bf859a2f2cf2e6bf Mon Sep 17 00:00:00 2001
From: wuhuachaocoding <77733235+wuhuachaocoding@users.noreply.github.com>
Date: Thu, 12 Aug 2021 10:52:31 +0800
Subject: [PATCH] [NPU] Support npu kernel for smooth_l1_loss op (#34674)

---
 .../fluid/operators/smooth_l1_loss_op_npu.cc       | 203 ++++++++++++++++++
 .../npu/test_smooth_l1_loss_op_npu.py              | 147 +++++++++++++
 2 files changed, 350 insertions(+)
 create mode 100644 paddle/fluid/operators/smooth_l1_loss_op_npu.cc
 create mode 100644 python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py

diff --git a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc
new file mode 100644
index 00000000000..b5a04ce2cab
--- /dev/null
+++ b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc
@@ -0,0 +1,203 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/smooth_l1_loss_op.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SmoothL1LossNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in_x = context.Input<Tensor>("X");
+    auto* in_y = context.Input<Tensor>("Y");
+    auto* inside_weight = context.Input<Tensor>("InsideWeight");
+    auto* outside_weight = context.Input<Tensor>("OutsideWeight");
+    auto* out_diff = context.Output<Tensor>("Diff");
+    auto* out_loss = context.Output<Tensor>("Out");
+    out_diff->mutable_data<T>(context.GetPlace());
+    out_loss->mutable_data<T>(context.GetPlace());
+
+    auto sigma = context.Attr<float>("sigma");
+    T sigma2 = 1.0 / (sigma * sigma);
+    bool has_weight = (inside_weight != nullptr) && (outside_weight != nullptr);
+    // out_diff = in_x - in_y
+    auto stream =
+        context.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+    const auto& runner1 = NpuOpRunner("Sub", {*in_x, *in_y}, {*out_diff}, {});
+    runner1.Run(stream);
+
+    Tensor no_reduce_loss(in_x->type());
+    no_reduce_loss.Resize(in_x->dims());
+    no_reduce_loss.mutable_data<T>(context.GetPlace());
+    // multiply by the inside weight before computing the loss
+    if (has_weight) {
+      Tensor tmp_diff(out_diff->type());
+      tmp_diff.Resize(out_diff->dims());
+      tmp_diff.mutable_data<T>(context.GetPlace());
+      const auto& runner2 =
+          NpuOpRunner("Mul", {*out_diff, *inside_weight}, {tmp_diff}, {});
+      runner2.Run(stream);
+      framework::TensorCopy(
+          tmp_diff, context.GetPlace(),
+          context.template device_context<platform::DeviceContext>(),
+          out_diff);
+
+      Tensor tmp_x(in_x->type());
+      tmp_x.Resize(in_x->dims());
+      tmp_x.mutable_data<T>(context.GetPlace());
+
+      Tensor tmp_y(in_y->type());
+      tmp_y.Resize(in_y->dims());
+      tmp_y.mutable_data<T>(context.GetPlace());
+
+      // multiply the inputs by inside_weight
+      const auto& runner_x =
+          NpuOpRunner("Mul", {*in_x, *inside_weight}, {tmp_x}, {});
+      runner_x.Run(stream);
+      const auto& runner_y =
+          NpuOpRunner("Mul", {*in_y, *inside_weight}, {tmp_y}, {});
+      runner_y.Run(stream);
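+      // "SmoothL1Loss" computes the element-wise smooth-L1 (Huber) loss of
+      // the difference d between its two inputs, with sigma2 = 1 / (sigma *
+      // sigma) as the threshold; the expected element-wise result (see
+      // smooth_l1_loss_forward in the accompanying unit test) is:
+      //   0.5 * d * d / sigma2,  if |d| < sigma2
+      //   |d| - 0.5 * sigma2,    otherwise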
+      const auto& runner3 =
+          NpuOpRunner("SmoothL1Loss", {tmp_x, tmp_y}, {no_reduce_loss},
+                      {{"sigma", sigma2}});
+      runner3.Run(stream);
+    } else {
+      const auto& runner3 =
+          NpuOpRunner("SmoothL1Loss", {*in_x, *in_y}, {no_reduce_loss},
+                      {{"sigma", sigma2}});
+      runner3.Run(stream);
+    }
+
+    // multiply the outside weight and the loss,
+    // then reduce-sum because the output's shape must be [B, 1]
+    if (has_weight) {
+      Tensor tmp_loss(no_reduce_loss.type());
+      tmp_loss.Resize(no_reduce_loss.dims());
+      tmp_loss.mutable_data<T>(context.GetPlace());
+      const auto& runner4 =
+          NpuOpRunner("Mul", {no_reduce_loss, *outside_weight}, {tmp_loss}, {});
+      runner4.Run(stream);
+      const auto& runner5 =
+          NpuOpRunner("ReduceSumD", {tmp_loss}, {*out_loss},
+                      {{"axes", std::vector<int>{1}}, {"keep_dims", true}});
+      runner5.Run(stream);
+    } else {
+      const auto& runner5 =
+          NpuOpRunner("ReduceSumD", {no_reduce_loss}, {*out_loss},
+                      {{"axes", std::vector<int>{1}}, {"keep_dims", true}});
+      runner5.Run(stream);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SmoothL1LossGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* inside_weight = context.Input<Tensor>("InsideWeight");
+    auto* outside_weight = context.Input<Tensor>("OutsideWeight");
+    auto* diff = context.Input<Tensor>("Diff");
+    auto* og = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* outx_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* outy_grad = context.Output<Tensor>(framework::GradVarName("Y"));
+    auto sigma = context.Attr<float>("sigma");
+    T sigma2 = 1.0 / (sigma * sigma);
+    bool has_weight = (inside_weight != nullptr) && (outside_weight != nullptr);
+
+    auto stream =
+        context.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+
+    // diff == in_x - in_y == diff - 0
+    Tensor tmp_zero(diff->type());
+    tmp_zero.Resize(diff->dims());
+    tmp_zero.mutable_data<T>(context.GetPlace());
+    const auto& runner_zero = NpuOpRunner("ZerosLike", {*diff}, {tmp_zero}, {});
+    runner_zero.Run(stream);
+
+    Tensor grad(diff->type());
+    grad.Resize(diff->dims());
+    grad.mutable_data<T>(context.GetPlace());
+    // broadcast og (the output gradient) to match the NPU interface
+    const auto& runner_broad =
+        NpuOpRunner("BroadcastToD", {*og}, {grad},
+                    {{"shape", framework::vectorize(diff->dims())}});
+    runner_broad.Run(stream);
+
+    Tensor gradient(diff->type());
+    gradient.Resize(diff->dims());
+    gradient.mutable_data<T>(context.GetPlace());
+    // diff == diff - 0 == in_x - in_y, so SmoothL1LossGrad can take
+    // (diff, zeros, grad) instead of (x, y, grad)
+    const auto& runner_grad =
+        NpuOpRunner("SmoothL1LossGrad", {*diff, tmp_zero, grad}, {gradient},
+                    {{"sigma", sigma2}});
+    runner_grad.Run(stream);
+
+    // multiply the weights and the gradient
+    if (has_weight) {
+      Tensor weight(inside_weight->type());
+      weight.Resize(inside_weight->dims());
+      weight.mutable_data<T>(context.GetPlace());
+      const auto& runner_weight =
+          NpuOpRunner("Mul", {*inside_weight, *outside_weight}, {weight}, {});
+      runner_weight.Run(stream);
+
+      Tensor tmp_grad(gradient.type());
+      tmp_grad.Resize(gradient.dims());
+      tmp_grad.mutable_data<T>(context.GetPlace());
+      const auto& runner_weight_grad =
+          NpuOpRunner("Mul", {gradient, weight}, {tmp_grad}, {});
+      runner_weight_grad.Run(stream);
+
+      framework::TensorCopy(
+          tmp_grad, context.GetPlace(),
+          context.template device_context<platform::DeviceContext>(),
+          &gradient);
+    }
+    // outx_grad = gradient
+    if (outx_grad) {
+      outx_grad->mutable_data<T>(context.GetPlace());
+      framework::TensorCopy(
+          gradient, context.GetPlace(),
+          context.template device_context<platform::DeviceContext>(),
+          outx_grad);
+    }
+
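+    // Since Diff = X - Y, the gradient w.r.t. Y is the negation of the
+    // gradient w.r.t. X, so it is obtained by multiplying by a scalar -1.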
+    // outy_grad = - gradient
+    if (outy_grad) {
+      outy_grad->mutable_data<T>(context.GetPlace());
+      Tensor coeff(framework::proto::VarType::FP32);
+      coeff.mutable_data<float>({1}, context.GetPlace());
+      FillNpuTensorWithConstant<float>(&coeff, -1);
+      const auto& runner_y_grad =
+          NpuOpRunner("Mul", {coeff, gradient}, {*outy_grad}, {});
+      runner_y_grad.Run(stream);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_NPU_KERNEL(
+    smooth_l1_loss,
+    ops::SmoothL1LossNPUKernel<paddle::platform::NPUDeviceContext, float>);
+
+REGISTER_OP_NPU_KERNEL(
+    smooth_l1_loss_grad,
+    ops::SmoothL1LossGradNPUKernel<paddle::platform::NPUDeviceContext, float>);
diff --git a/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py
new file mode 100644
index 00000000000..8c20f25061b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+
+paddle.enable_static()
+
+
+def smooth_l1_loss_forward(val, sigma2):
+    abs_val = abs(val)
+    if abs_val < 1.0 / sigma2:
+        return 0.5 * val * val * sigma2
+    else:
+        return abs_val - 0.5 / sigma2
+
+
+class TestSmoothL1LossOp1(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "smooth_l1_loss"
+        dims = (5, 20)
+        self.inputs = {
+            'X': np.random.random(dims).astype("float32"),
+            'Y': np.random.random(dims).astype("float32")
+        }
+        sigma = 3.0
+        self.attrs = {'sigma': sigma}
+        sigma2 = sigma * sigma
+        diff = self.inputs['X'] - self.inputs['Y']
+        loss = np.vectorize(smooth_l1_loss_forward)(diff, sigma2).sum(1)
+        loss = loss.reshape((dims[0], 1))
+        self.outputs = {
+            'Diff': diff.astype('float32'),
+            'Out': loss.astype('float32')
+        }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(
+            self.place, ['X', 'Y'], 'Out', max_relative_error=0.02)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad_with_place(
+            self.place, ['Y'],
+            'Out',
+            max_relative_error=0.03,
+            no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad_with_place(
+            self.place, ['X'],
+            'Out',
+            max_relative_error=0.03,
+            no_grad_set=set('Y'))
+
+
+class TestSmoothL1LossOp2(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "smooth_l1_loss"
+        dims = (5, 20)
+        self.inputs = {
+            'X': np.random.random(dims).astype("float32"),
+            'Y': np.random.random(dims).astype("float32"),
+            'InsideWeight': np.random.random(dims).astype("float32"),
+            'OutsideWeight': np.random.random(dims).astype("float32")
+        }
+        sigma = 3.0
+        self.attrs = {'sigma': sigma}
+        sigma2 = sigma * sigma
+        diff = self.inputs['X'] - self.inputs['Y']
+        diff = diff * self.inputs['InsideWeight']
+        loss = np.vectorize(smooth_l1_loss_forward)(diff, sigma2)
+        loss = loss * self.inputs['OutsideWeight']
+        loss = loss.sum(1).reshape((dims[0], 1))
+        self.outputs = {
+            'Diff': diff.astype('float32'),
+            'Out': loss.astype('float32')
+        }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(
+            self.place, ['X', 'Y'], 'Out', max_relative_error=0.03)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad_with_place(
+            self.place, ['Y'],
+            'Out',
+            max_relative_error=0.03,
+            no_grad_set=set(['X', 'InsideWeight', 'OutsideWeight']))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad_with_place(
+            self.place, ['X'],
+            'Out',
+            max_relative_error=0.03,
+            no_grad_set=set(['Y', 'InsideWeight', 'OutsideWeight']))
+
+
+class TestSmoothL1LossOpError(unittest.TestCase):
+    def test_errors(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            # The input type of smooth_l1 must be Variable.
+            x1 = fluid.create_lod_tensor(
+                np.array([[-1]]), [[1]], fluid.NPUPlace(0))
+            y1 = fluid.create_lod_tensor(
+                np.array([[-1]]), [[1]], fluid.NPUPlace(0))
+            self.assertRaises(TypeError, fluid.layers.smooth_l1, x1, y1)
+            # The input dtype of smooth_l1 must be float32 or float64.
+            x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
+            y2 = fluid.layers.data(name='y2', shape=[4], dtype="int32")
+            self.assertRaises(TypeError, fluid.layers.smooth_l1, x2, y2)
+
+
+if __name__ == '__main__':
+    unittest.main()
--
GitLab
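For reference, a minimal usage sketch of the new kernel through the existing
fluid.layers.smooth_l1 API (the same API the error test above calls). It is a
sketch only, assuming a PaddlePaddle build with NPU support and at least one
visible NPU device:

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    main_prog, startup_prog = fluid.Program(), fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        x = fluid.layers.data(name='x', shape=[20], dtype='float32')
        y = fluid.layers.data(name='y', shape=[20], dtype='float32')
        # dispatches to the smooth_l1_loss NPU kernel registered above
        loss = fluid.layers.smooth_l1(x, y, sigma=3.0)

    exe = fluid.Executor(paddle.NPUPlace(0))
    exe.run(startup_prog)
    out, = exe.run(main_prog,
                   feed={'x': np.random.random((5, 20)).astype('float32'),
                         'y': np.random.random((5, 20)).astype('float32')},
                   fetch_list=[loss])
    print(out.shape)  # (5, 1): per-sample loss reduced over the feature axis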