From fecbc9584ebbf88b80c097504ca034b688fefd6e Mon Sep 17 00:00:00 2001 From: QingshuChen Date: Fri, 29 Jul 2022 16:46:12 +0800 Subject: [PATCH] add some fp16 op for kunlun resnet50 model (#44672) * add some fp16 op for kunlun resnet50 model *test=kunlun * tmp *test=kunlun --- .../operators/fused/resnet_unit_op_xpu.cc | 121 +++-- .../optimizers/lars_momentum_op_xpu.cc | 29 +- .../fluid/platform/device/xpu/xpu2_op_list.h | 19 +- .../phi/kernels/xpu/elementwise_add_kernel.cc | 15 +- .../kernels/xpu/log_softmax_grad_kernel.cc | 36 +- paddle/phi/kernels/xpu/log_softmax_kernel.cc | 14 +- .../xpu/test_update_loss_scaling_op_xpu.py | 451 +++++++++--------- 7 files changed, 375 insertions(+), 310 deletions(-) diff --git a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc index cce506c67a..e9ad179960 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc @@ -23,6 +23,8 @@ using Tensor = framework::Tensor; template class ResNetUnitXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { auto place = ctx.GetPlace(); @@ -63,9 +65,12 @@ class ResNetUnitXPUKernel : public framework::OpKernel { std::string act_type = ctx.Attr("act_type"); auto &dev_ctx = ctx.template device_context(); - std::vector x_list = {input_x->data()}; - std::vector w_list = {filter_x->data()}; - std::vector conv_y_list = {conv_out_x->mutable_data(place)}; + std::vector x_list = { + reinterpret_cast(input_x->data())}; + std::vector w_list = { + reinterpret_cast(filter_x->data())}; + std::vector conv_y_list = { + reinterpret_cast(conv_out_x->mutable_data(place))}; std::vector> x_shape_list = { phi::vectorize(input_x->dims())}; @@ -107,9 +112,10 @@ class ResNetUnitXPUKernel : public framework::OpKernel { Tensor *running_mean_z = ctx.Output("RunningMeanZ"); Tensor *running_var_z = ctx.Output("RunningVarZ"); - x_list.push_back(input_z->data()); - w_list.push_back(filter_z->data()); - conv_y_list.push_back(conv_out_z->mutable_data(place)); + x_list.push_back(reinterpret_cast(input_z->data())); + w_list.push_back(reinterpret_cast(filter_z->data())); + conv_y_list.push_back( + reinterpret_cast(conv_out_z->mutable_data(place))); x_shape_list.push_back(phi::vectorize(input_z->dims())); @@ -133,17 +139,17 @@ class ResNetUnitXPUKernel : public framework::OpKernel { if (fuse_add) { const Tensor *input_z = ctx.Input("Z"); auto input_z_shape = phi::vectorize(input_z->dims()); - x_list.push_back(input_z->data()); + x_list.push_back(reinterpret_cast(input_z->data())); x_shape_list.push_back(input_z_shape); x_maxlist.push_back(nullptr); } } - int r = xpu::resnet_unit_fusion( + int r = xpu::resnet_unit_fusion( dev_ctx.x_context(), x_list, w_list, conv_y_list, - output->mutable_data(place), + reinterpret_cast(output->mutable_data(place)), x_shape_list, filter_x_shape[0], ksize_list, @@ -172,6 +178,8 @@ class ResNetUnitXPUKernel : public framework::OpKernel { template class ResNetUnitGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { auto place = ctx.GetPlace(); @@ -208,11 +216,16 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); - std::vector x_list = {x->data()}; - std::vector w_list = {filter_x->data()}; - std::vector conv_y_list = 
{conv_out_x->data()}; - std::vector dx_list = {x_grad->mutable_data(place)}; - std::vector dw_list = {filter_x_grad->mutable_data(place)}; + std::vector x_list = { + reinterpret_cast(x->data())}; + std::vector w_list = { + reinterpret_cast(filter_x->data())}; + std::vector conv_y_list = { + reinterpret_cast(conv_out_x->data())}; + std::vector dx_list = { + reinterpret_cast(x_grad->mutable_data(place))}; + std::vector dw_list = { + reinterpret_cast(filter_x_grad->mutable_data(place))}; std::vector> x_shape_list = { phi::vectorize(x->dims())}; @@ -262,11 +275,14 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { Tensor *scale_z_grad = ctx.Output(framework::GradVarName("ScaleZ")); Tensor *bias_z_grad = ctx.Output(framework::GradVarName("BiasZ")); - x_list.push_back(z->data()); - w_list.push_back(filter_z->data()); - conv_y_list.push_back(conv_out_z->data()); - dx_list.push_back(z_grad->mutable_data(place)); - dw_list.push_back(filter_z_grad->mutable_data(place)); + x_list.push_back(reinterpret_cast(z->data())); + w_list.push_back(reinterpret_cast(filter_z->data())); + conv_y_list.push_back( + reinterpret_cast(conv_out_z->data())); + dx_list.push_back( + reinterpret_cast(z_grad->mutable_data(place))); + dw_list.push_back( + reinterpret_cast(filter_z_grad->mutable_data(place))); x_shape_list.push_back(phi::vectorize(z->dims())); auto filter_z_shape = phi::vectorize(filter_z->dims()); @@ -288,38 +304,39 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { } else { if (fuse_add) { auto z_grad = ctx.Output(framework::GradVarName("Z")); - dx_list.push_back(z_grad->mutable_data(place)); + dx_list.push_back( + reinterpret_cast(z_grad->mutable_data(place))); } } - int r = - xpu::resnet_unit_grad_fusion(dev_ctx.x_context(), - x_list, - w_list, - y_grad->data(), - output->data(), - conv_y_list, - dx_list, - dw_list, - x_shape_list, - filter_x_shape[0], - ksize_list, - stride_list, - paddings, - dilations, - group, - x_maxlist, - w_maxlist, - scale_list, - batch_mean_list, - batch_invstd_list, - dscale_list, - dbias_list, - xpu::Activation_t::RELU, - eps, - is_nchw, - has_shortcut, - fuse_add); + int r = xpu::resnet_unit_grad_fusion( + dev_ctx.x_context(), + x_list, + w_list, + reinterpret_cast(y_grad->data()), + reinterpret_cast(output->data()), + conv_y_list, + dx_list, + dw_list, + x_shape_list, + filter_x_shape[0], + ksize_list, + stride_list, + paddings, + dilations, + group, + x_maxlist, + w_maxlist, + scale_list, + batch_mean_list, + batch_invstd_list, + dscale_list, + dbias_list, + xpu::Activation_t::RELU, + eps, + is_nchw, + has_shortcut, + fuse_add); PADDLE_ENFORCE_XDNN_SUCCESS(r, "resnet_unit_grad_fusion"); } }; @@ -329,5 +346,9 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(resnet_unit, ops::ResNetUnitXPUKernel); -REGISTER_OP_XPU_KERNEL(resnet_unit_grad, ops::ResNetUnitGradXPUKernel); +REGISTER_OP_XPU_KERNEL(resnet_unit, + ops::ResNetUnitXPUKernel, + ops::ResNetUnitXPUKernel); +REGISTER_OP_XPU_KERNEL(resnet_unit_grad, + ops::ResNetUnitGradXPUKernel, + ops::ResNetUnitGradXPUKernel); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc b/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc index 626e071c20..1f9a9eb251 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc @@ -22,6 +22,8 @@ namespace operators { template class LarsMomentumOpXPUKernel : public 
framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { bool multi_precision = ctx.Attr("multi_precision"); @@ -35,14 +37,14 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel { auto master_param = ctx.MultiInput("MasterParam"); auto master_param_out = ctx.MultiOutput("MasterParamOut"); - T mu = static_cast(ctx.Attr("mu")); - T lars_coeff = ctx.Attr("lars_coeff"); - T epsilon = ctx.Attr("epsilon"); - T rescale_grad = ctx.Attr("rescale_grad"); + float mu = static_cast(ctx.Attr("mu")); + float lars_coeff = ctx.Attr("lars_coeff"); + float epsilon = ctx.Attr("epsilon"); + float rescale_grad = ctx.Attr("rescale_grad"); - std::vector param_list; - std::vector grad_list; - std::vector param_out_list; + std::vector param_list; + std::vector grad_list; + std::vector param_out_list; std::vector velocity_list; std::vector velocity_out_list; std::vector lrs; @@ -52,9 +54,12 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel { std::vector master_param_out_list; int op_num = param.size(); for (int i = 0; i < op_num; ++i) { - param_list.push_back(const_cast(param[i]->data())); - grad_list.push_back(const_cast(grad[i]->data())); - param_out_list.push_back(param_out[i]->mutable_data(ctx.GetPlace())); + param_list.push_back( + reinterpret_cast(const_cast((param[i]->data())))); + grad_list.push_back( + reinterpret_cast(const_cast(grad[i]->data()))); + param_out_list.push_back(reinterpret_cast( + param_out[i]->mutable_data(ctx.GetPlace()))); velocity_list.push_back(const_cast(velocity[i]->data())); velocity_out_list.push_back( velocity_out[i]->mutable_data(ctx.GetPlace())); @@ -111,5 +116,7 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(lars_momentum, ops::LarsMomentumOpXPUKernel); +REGISTER_OP_XPU_KERNEL(lars_momentum, + ops::LarsMomentumOpXPUKernel, + ops::LarsMomentumOpXPUKernel); #endif diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 28ff2bfba5..e3c46ae5b7 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -231,7 +231,9 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"generate_proposals_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"grad_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"grad_add", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"greater_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -254,9 +256,8 @@ XPUOpMap& get_kl2_ops() { {"label_smooth", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"lars_momentum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"layer_norm_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"layer_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, @@ -380,9 +381,12 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"resnet_unit", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"resnet_unit", + 
XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"resnet_unit_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -502,6 +506,9 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"top_k_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"update_loss_scaling", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"unsqueeze2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), diff --git a/paddle/phi/kernels/xpu/elementwise_add_kernel.cc b/paddle/phi/kernels/xpu/elementwise_add_kernel.cc index 34d39b0a83..9c5b521849 100644 --- a/paddle/phi/kernels/xpu/elementwise_add_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_add_kernel.cc @@ -24,13 +24,15 @@ void GradAddXPUKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, DenseTensor* out) { + using XPUType = typename XPUTypeTrait::Type; + dev_ctx.template Alloc(out); auto x_shape = phi::vectorize(x.dims()); auto y_shape = phi::vectorize(y.dims()); int r = xpu::broadcast_add(dev_ctx.x_context(), - x.data(), - y.data(), - out->data(), + reinterpret_cast(x.data()), + reinterpret_cast(y.data()), + reinterpret_cast(out->data()), x_shape, y_shape); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); @@ -38,4 +40,9 @@ void GradAddXPUKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(grad_add, XPU, ALL_LAYOUT, phi::GradAddXPUKernel, float) {} +PD_REGISTER_KERNEL(grad_add, + XPU, + ALL_LAYOUT, + phi::GradAddXPUKernel, + phi::dtype::float16, + float) {} diff --git a/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc index c9165f3ef7..26f532f17b 100644 --- a/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc @@ -26,6 +26,7 @@ void LogSoftmaxGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, int axis, DenseTensor* x_grad) { + using XPUType = typename XPUTypeTrait::Type; const int rank = out.dims().size(); axis = funcs::CanonicalAxis(axis, rank); @@ -40,24 +41,29 @@ void LogSoftmaxGradKernel(const Context& dev_ctx, PADDLE_ENFORCE_NE( tmp2_ptr, nullptr, phi::errors::External("no enough memory in xpu")); - int r = - xpu::exp(dev_ctx.x_context(), out.data(), tmp_ptr, out_grad.numel()); + int r = xpu::exp(dev_ctx.x_context(), + reinterpret_cast(out.data()), + reinterpret_cast(tmp_ptr), + out_grad.numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "exp"); - r = xpu::reciprocal( - dev_ctx.x_context(), tmp_ptr, tmp2_ptr, out_grad.numel()); + r = xpu::reciprocal(dev_ctx.x_context(), + reinterpret_cast(tmp_ptr), + reinterpret_cast(tmp2_ptr), + out_grad.numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "reciprocal"); - r = xpu::mul(dev_ctx.x_context(), - tmp2_ptr, - out_grad.data(), - tmp2_ptr, - out_grad.numel()); + r = xpu::mul(dev_ctx.x_context(), + reinterpret_cast(tmp2_ptr), + reinterpret_cast(out_grad.data()), + reinterpret_cast(tmp2_ptr), + out_grad.numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul"); - r = xpu::softmax_grad(dev_ctx.x_context(), - 
tmp_ptr, - tmp2_ptr, - x_grad->data(), - out_shape, - axis); + r = xpu::softmax_grad( + dev_ctx.x_context(), + reinterpret_cast(tmp_ptr), + reinterpret_cast(tmp2_ptr), + reinterpret_cast(x_grad->data()), + out_shape, + axis); PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax_grad"); } } diff --git a/paddle/phi/kernels/xpu/log_softmax_kernel.cc b/paddle/phi/kernels/xpu/log_softmax_kernel.cc index 1f084d0e6c..0250b08e50 100644 --- a/paddle/phi/kernels/xpu/log_softmax_kernel.cc +++ b/paddle/phi/kernels/xpu/log_softmax_kernel.cc @@ -25,6 +25,7 @@ void LogSoftmaxKernel(const Context& dev_ctx, const DenseTensor& x, int axis, DenseTensor* out) { + using XPUType = typename XPUTypeTrait::Type; const int rank = x.dims().size(); axis = funcs::CanonicalAxis(axis, rank); @@ -32,11 +33,16 @@ void LogSoftmaxKernel(const Context& dev_ctx, auto x_shape = phi::vectorize(x.dims()); dev_ctx.template Alloc(out); if (axis < 0) axis += rank; - int r = xpu::softmax( - dev_ctx.x_context(), x.data(), out->data(), x_shape, axis); + int r = xpu::softmax(dev_ctx.x_context(), + reinterpret_cast(x.data()), + reinterpret_cast(out->data()), + x_shape, + axis); PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax"); - r = xpu::log( - dev_ctx.x_context(), out->data(), out->data(), out->numel()); + r = xpu::log(dev_ctx.x_context(), + reinterpret_cast(out->data()), + reinterpret_cast(out->data()), + out->numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "log"); } } diff --git a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py index 5ed10d159a..41e277d7a3 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py @@ -23,231 +23,242 @@ import paddle import paddle.fluid as fluid import paddle.fluid.contrib.mixed_precision.amp_nn as amp_nn +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + paddle.enable_static() -class TestUpdateLossScalingOp(XPUOpTest): - - def setUp(self): - self.op_type = "update_loss_scaling" - self.init() - found_inf = np.array([False], dtype=np.bool_) - x = np.random.random((1024, 1024)).astype(self.dtype) - - self.inputs = { - 'X': [('x0', x)], - 'FoundInfinite': found_inf, - 'PrevLossScaling': self.prev_loss_scaling, - 'InGoodSteps': self.num_good_steps, - 'InBadSteps': self.num_bad_steps - } - - self.outputs = { - 'Out': [('out0', x)], - 'LossScaling': self.prev_loss_scaling * self.incr_ratio, - 'OutGoodSteps': self.zero_steps, - 'OutBadSteps': self.zero_steps - } - - def init(self): - self.incr_ratio = 2.0 - self.decr_ratio = 0.8 - self.dtype = np.float32 - self.prev_loss_scaling = np.array([2048]).astype(self.dtype) - self.num_good_steps = np.array([999], dtype=np.int32) - self.num_bad_steps = np.array([1], dtype=np.int32) - self.zero_steps = np.array([0], dtype=np.int32) - self.attrs = { - 'incr_every_n_steps': 1000, - 'decr_every_n_nan_or_inf': 2, - 'incr_ratio': self.incr_ratio, - 'decr_ratio': self.decr_ratio, - } - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, no_check_set=['Out']) - - -class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): - - def setUp(self): - self.op_type = "update_loss_scaling" - self.init() - found_inf = np.array([True], dtype=np.bool_) - x = np.random.random((1024, 1024)).astype(self.dtype) - i = np.random.randint(0, 1024, 
1) - j = np.random.randint(0, 1024, 1) - x[i[0]][j[0]] = np.inf - - self.inputs = { - 'X': [('x0', x)], - 'FoundInfinite': found_inf, - 'PrevLossScaling': self.prev_loss_scaling, - 'InGoodSteps': self.num_good_steps, - 'InBadSteps': self.num_bad_steps - } - - self.outputs = { - 'Out': [('out0', np.zeros_like(x))], - 'LossScaling': self.prev_loss_scaling * self.decr_ratio, - 'OutGoodSteps': self.zero_steps, - 'OutBadSteps': self.zero_steps - } - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - #self.check_output() - - -class TestUpdateLossScalingLayer(unittest.TestCase): - - def loss_scaling_check(self, scope=fluid.Scope()): - a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') - b = fluid.data(name="b", shape=[512, 128], dtype='float32') - x = [a, b] - found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') - prev_loss_scaling = fluid.data(name="prev_loss_scaling", +class XPUTestUpdateLossScalingOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = "update_loss_scaling" + self.use_dynamic_create_class = False + + class TestUpdateLossScalingOp(XPUOpTest): + + def setUp(self): + self.op_type = "update_loss_scaling" + self.init() + found_inf = np.array([False], dtype=np.bool_) + x = np.random.random((1024, 1024)).astype(self.dtype) + + self.inputs = { + 'X': [('x0', x)], + 'FoundInfinite': found_inf, + 'PrevLossScaling': self.prev_loss_scaling, + 'InGoodSteps': self.num_good_steps, + 'InBadSteps': self.num_bad_steps + } + + self.outputs = { + 'Out': [('out0', x)], + 'LossScaling': self.prev_loss_scaling * self.incr_ratio, + 'OutGoodSteps': self.zero_steps, + 'OutBadSteps': self.zero_steps + } + + def init(self): + self.incr_ratio = 2.0 + self.decr_ratio = 0.8 + self.dtype = np.float32 + self.prev_loss_scaling = np.array([2048]).astype(self.dtype) + self.num_good_steps = np.array([999], dtype=np.int32) + self.num_bad_steps = np.array([1], dtype=np.int32) + self.zero_steps = np.array([0], dtype=np.int32) + self.attrs = { + 'incr_every_n_steps': 1000, + 'decr_every_n_nan_or_inf': 2, + 'incr_ratio': self.incr_ratio, + 'decr_ratio': self.decr_ratio, + } + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['Out']) + + class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): + + def setUp(self): + self.op_type = "update_loss_scaling" + self.init() + found_inf = np.array([True], dtype=np.bool_) + x = np.random.random((1024, 1024)).astype(self.dtype) + i = np.random.randint(0, 1024, 1) + j = np.random.randint(0, 1024, 1) + x[i[0]][j[0]] = np.inf + + self.inputs = { + 'X': [('x0', x)], + 'FoundInfinite': found_inf, + 'PrevLossScaling': self.prev_loss_scaling, + 'InGoodSteps': self.num_good_steps, + 'InBadSteps': self.num_bad_steps + } + + self.outputs = { + 'Out': [('out0', np.zeros_like(x))], + 'LossScaling': self.prev_loss_scaling * self.decr_ratio, + 'OutGoodSteps': self.zero_steps, + 'OutBadSteps': self.zero_steps + } + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + #self.check_output() + + class TestUpdateLossScalingLayer(unittest.TestCase): + + def loss_scaling_check(self, scope=fluid.Scope()): + a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') + b = fluid.data(name="b", shape=[512, 128], dtype='float32') + x = [a, b] + found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') 
+ prev_loss_scaling = fluid.data(name="prev_loss_scaling", + shape=[1], + dtype='float32') + num_good_steps = fluid.data(name="num_good_steps", + shape=[1], + dtype='int32') + num_bad_steps = fluid.data(name="num_bad_steps", shape=[1], - dtype='float32') - num_good_steps = fluid.data(name="num_good_steps", - shape=[1], - dtype='int32') - num_bad_steps = fluid.data(name="num_bad_steps", - shape=[1], - dtype='int32') - - a_v = np.random.random([1024, 1024]).astype('float32') - b_v = np.random.random([512, 128]).astype('float32') - found_inf_v = np.array([False]).astype('bool') - prev_loss_scaling_v = np.array([2048]).astype('float32') - num_good_steps_v = np.array([999], dtype=np.int32) - num_bad_steps_v = np.array([1], dtype=np.int32) - - incr_every_n_steps = 1000 - decr_every_n_nan_or_inf = 2 - incr_ratio = 2 - decr_ratio = 0.8 - - result = amp_nn.update_loss_scaling(x, - found_inf, - prev_loss_scaling, - num_good_steps, - num_bad_steps, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - name="update_loss_scaling") - - place = fluid.XPUPlace(0) - exe = fluid.Executor(place) - with fluid.scope_guard(scope): - exe.run(fluid.default_startup_program()) - result_v = exe.run(feed={ - 'a': a_v, - 'b': b_v, - 'found_inf': found_inf_v, - 'prev_loss_scaling': prev_loss_scaling_v, - 'num_good_steps': num_good_steps_v, - 'num_bad_steps': num_bad_steps_v - }, - fetch_list=[ - result, x, found_inf, prev_loss_scaling, - num_good_steps, num_bad_steps - ]) - assert np.array_equal(result_v[0], a_v) - assert np.array_equal(result_v[1], b_v) - assert np.array_equal(result_v[0], result_v[2]) - assert np.array_equal(result_v[1], result_v[3]) - assert np.array_equal(result_v[4], found_inf_v) - assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio) - assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) - assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) - - def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()): - a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') - b = fluid.data(name="b", shape=[512, 128], dtype='float32') - x = [a, b] - found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') - prev_loss_scaling = fluid.data(name="prev_loss_scaling", + dtype='int32') + + a_v = np.random.random([1024, 1024]).astype('float32') + b_v = np.random.random([512, 128]).astype('float32') + found_inf_v = np.array([False]).astype('bool') + prev_loss_scaling_v = np.array([2048]).astype('float32') + num_good_steps_v = np.array([999], dtype=np.int32) + num_bad_steps_v = np.array([1], dtype=np.int32) + + incr_every_n_steps = 1000 + decr_every_n_nan_or_inf = 2 + incr_ratio = 2 + decr_ratio = 0.8 + + result = amp_nn.update_loss_scaling(x, + found_inf, + prev_loss_scaling, + num_good_steps, + num_bad_steps, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + name="update_loss_scaling") + + place = fluid.XPUPlace(0) + exe = fluid.Executor(place) + with fluid.scope_guard(scope): + exe.run(fluid.default_startup_program()) + result_v = exe.run(feed={ + 'a': a_v, + 'b': b_v, + 'found_inf': found_inf_v, + 'prev_loss_scaling': prev_loss_scaling_v, + 'num_good_steps': num_good_steps_v, + 'num_bad_steps': num_bad_steps_v + }, + fetch_list=[ + result, x, found_inf, prev_loss_scaling, + num_good_steps, num_bad_steps + ]) + assert np.array_equal(result_v[0], a_v) + assert np.array_equal(result_v[1], b_v) + assert np.array_equal(result_v[0], result_v[2]) + assert np.array_equal(result_v[1], result_v[3]) + 
assert np.array_equal(result_v[4], found_inf_v) + assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio) + assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) + assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) + + def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()): + a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') + b = fluid.data(name="b", shape=[512, 128], dtype='float32') + x = [a, b] + found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') + prev_loss_scaling = fluid.data(name="prev_loss_scaling", + shape=[1], + dtype='float32') + num_good_steps = fluid.data(name="num_good_steps", + shape=[1], + dtype='int32') + num_bad_steps = fluid.data(name="num_bad_steps", shape=[1], - dtype='float32') - num_good_steps = fluid.data(name="num_good_steps", - shape=[1], - dtype='int32') - num_bad_steps = fluid.data(name="num_bad_steps", - shape=[1], - dtype='int32') - - a_v = np.random.random([1024, 1024]).astype('float32') - b_v = np.random.random([512, 128]).astype('float32') - i = np.random.randint(0, 1024, 1) - j = np.random.randint(0, 1024, 1) - a_v[i[0]][j[0]] = np.inf - found_inf_v = np.array([True]).astype('bool') - prev_loss_scaling_v = np.array([2048]).astype('float32') - num_good_steps_v = np.array([999], dtype=np.int32) - num_bad_steps_v = np.array([1], dtype=np.int32) - - incr_every_n_steps = 1000 - decr_every_n_nan_or_inf = 2 - incr_ratio = 2 - decr_ratio = 0.8 - - result = amp_nn.update_loss_scaling(x, - found_inf, - prev_loss_scaling, - num_good_steps, - num_bad_steps, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - name="update_loss_scaling") - - place = fluid.XPUPlace(0) - exe = fluid.Executor(place) - with fluid.scope_guard(scope): - exe.run(fluid.default_startup_program()) - result_v = exe.run(feed={ - 'a': a_v, - 'b': b_v, - 'found_inf': found_inf_v, - 'prev_loss_scaling': prev_loss_scaling_v, - 'num_good_steps': num_good_steps_v, - 'num_bad_steps': num_bad_steps_v - }, - fetch_list=[ - result, x, found_inf, prev_loss_scaling, - num_good_steps, num_bad_steps - ]) - assert np.array_equal(result_v[0], np.zeros_like(a_v)) - assert np.array_equal(result_v[1], np.zeros_like(b_v)) - assert np.array_equal(result_v[2], np.zeros_like(a_v)) - assert np.array_equal(result_v[3], np.zeros_like(b_v)) - assert np.array_equal(result_v[4], found_inf_v) - assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio) - assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) - assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) - - def test_loss_scaling(self): - main = fluid.Program() - startup = fluid.Program() - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - self.loss_scaling_check() - - def test_loss_scaling_inf(self): - main = fluid.Program() - startup = fluid.Program() - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - self.loss_scaling_check_inf() - + dtype='int32') + + a_v = np.random.random([1024, 1024]).astype('float32') + b_v = np.random.random([512, 128]).astype('float32') + i = np.random.randint(0, 1024, 1) + j = np.random.randint(0, 1024, 1) + a_v[i[0]][j[0]] = np.inf + found_inf_v = np.array([True]).astype('bool') + prev_loss_scaling_v = np.array([2048]).astype('float32') + num_good_steps_v = np.array([999], dtype=np.int32) + num_bad_steps_v = np.array([1], dtype=np.int32) + + incr_every_n_steps = 1000 + decr_every_n_nan_or_inf = 2 + incr_ratio = 2 + decr_ratio = 0.8 + + 
result = amp_nn.update_loss_scaling(x, + found_inf, + prev_loss_scaling, + num_good_steps, + num_bad_steps, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + name="update_loss_scaling") + + place = fluid.XPUPlace(0) + exe = fluid.Executor(place) + with fluid.scope_guard(scope): + exe.run(fluid.default_startup_program()) + result_v = exe.run(feed={ + 'a': a_v, + 'b': b_v, + 'found_inf': found_inf_v, + 'prev_loss_scaling': prev_loss_scaling_v, + 'num_good_steps': num_good_steps_v, + 'num_bad_steps': num_bad_steps_v + }, + fetch_list=[ + result, x, found_inf, prev_loss_scaling, + num_good_steps, num_bad_steps + ]) + assert np.array_equal(result_v[0], np.zeros_like(a_v)) + assert np.array_equal(result_v[1], np.zeros_like(b_v)) + assert np.array_equal(result_v[2], np.zeros_like(a_v)) + assert np.array_equal(result_v[3], np.zeros_like(b_v)) + assert np.array_equal(result_v[4], found_inf_v) + assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio) + assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) + assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) + + def test_loss_scaling(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check() + + def test_loss_scaling_inf(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check_inf() + + +support_types = get_xpu_op_support_types('update_loss_scaling') +for stype in support_types: + create_test_class(globals(), XPUTestUpdateLossScalingOp, stype) if __name__ == '__main__': unittest.main() -- GitLab
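
Note on the pattern used throughout this patch: each kernel stays templated on the framework element type T, maps T to the XPU runtime type via XPUTypeTrait<T>::Type, reinterpret_casts the tensor buffers to that type before the xdnn call, and is then registered a second time for float16 (plat::float16 / vartype::FP16). The sketch below shows that cast-through-a-trait pattern in isolation; it is a minimal, self-contained illustration, not Paddle code, and HostHalf, DeviceHalf, XPUTypeTraitSketch and device_add are hypothetical stand-ins rather than Paddle or XDNN symbols.

    // Minimal sketch of the fp16-enablement pattern: a trait maps the
    // framework type to the device-side type, and pointers are
    // reinterpret_cast to the mapped type before the device call.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct HostHalf  { uint16_t bits; };  // framework-side fp16 (stand-in for platform::float16)
    struct DeviceHalf { uint16_t bits; }; // runtime-side fp16 (stand-in for the XPU half type)

    template <typename T> struct XPUTypeTraitSketch { using Type = T; };
    template <> struct XPUTypeTraitSketch<HostHalf> { using Type = DeviceHalf; };

    // Stand-in for an XDNN call that only understands float / DeviceHalf.
    template <typename XPUType>
    int device_add(const XPUType* x, const XPUType* y, XPUType* out, int n) {
      for (int i = 0; i < n; ++i) out[i] = x[i];  // placeholder arithmetic
      return 0;  // 0 == success, mirroring XDNN's error-code convention
    }

    // Kernel templated on T; works unchanged for float and HostHalf.
    template <typename T>
    int AddKernel(const std::vector<T>& x, const std::vector<T>& y,
                  std::vector<T>* out) {
      using XPUType = typename XPUTypeTraitSketch<T>::Type;
      out->resize(x.size());
      // Same memory layout, different nominal type: reinterpret_cast bridges
      // the framework view (T) and the runtime view (XPUType).
      return device_add(reinterpret_cast<const XPUType*>(x.data()),
                        reinterpret_cast<const XPUType*>(y.data()),
                        reinterpret_cast<XPUType*>(out->data()),
                        static_cast<int>(x.size()));
    }

    int main() {
      std::vector<float> a{1.f, 2.f}, b{3.f, 4.f}, c;
      std::vector<HostHalf> ha(2), hb(2), hc;
      std::printf("float path: %d, fp16 path: %d\n",
                  AddKernel(a, b, &c), AddKernel(ha, hb, &hc));
      return 0;
    }

Registering the same templated kernel once per element type (as the REGISTER_OP_XPU_KERNEL / PD_REGISTER_KERNEL changes above do for float and float16) is what lets the single implementation serve both precisions; the cast is safe because the framework half type and the XPU runtime half type share the same 16-bit layout.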