From f17ba93bde4365cd90291353639eb710dac3781c Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 28 Jul 2021 16:04:23 +0800 Subject: [PATCH] [NPU] Support ScaleTensor for scale npu kernel (#34418) * support ScaleTensor for scale npu kernel * add more tests for adam npu * fix compile * fix unittest * refine adam optimizer --- paddle/fluid/operators/scale_op.h | 3 +- paddle/fluid/operators/scale_op_npu.cc | 14 +- python/paddle/fluid/optimizer.py | 44 ++-- .../tests/unittests/npu/test_adam_op_npu.py | 209 ++++++++++++------ 4 files changed, 179 insertions(+), 91 deletions(-) diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index 544f0a91668..e7a07810c62 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -25,7 +25,8 @@ template static inline T GetAttrFromTensor(const framework::Tensor* tensor) { const auto* tensor_data = tensor->data(); framework::Tensor cpu_tensor; - if (platform::is_gpu_place(tensor->place())) { + if (platform::is_gpu_place(tensor->place()) || + platform::is_npu_place(tensor->place())) { TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); tensor_data = cpu_tensor.data(); } diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc index 6fb0e6d3727..3892fcb7584 100644 --- a/paddle/fluid/operators/scale_op_npu.cc +++ b/paddle/fluid/operators/scale_op_npu.cc @@ -27,20 +27,24 @@ class ScaleNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); - auto scale = static_cast(ctx.Attr("scale")); - auto bias = static_cast(ctx.Attr("bias")); + auto scale = ctx.Attr("scale"); + auto bias = ctx.Attr("bias"); auto bias_after_scale = ctx.Attr("bias_after_scale"); auto stream = ctx.template device_context() .stream(); - float _power = 1.0; + float power = 1.0; VLOG(4) << "scale:" << scale << ", bias:" << bias << " ,bias_after_scale:" << bias_after_scale; + if (ctx.HasInput("ScaleTensor")) { + auto* scale_tensor = ctx.Input("ScaleTensor"); + scale = static_cast(GetAttrFromTensor(scale_tensor)); + } if (bias_after_scale) { out->mutable_data(ctx.GetPlace()); const auto& runner = NpuOpRunner("Power", {*x}, {*out}, - {{"power", _power}, {"scale", scale}, {"shift", bias}}); + {{"power", power}, {"scale", scale}, {"shift", bias}}); runner.Run(stream); } else { @@ -55,7 +59,7 @@ class ScaleNPUKernel : public framework::OpKernel { float _bias = 0.0; const auto& runner = NpuOpRunner("Power", {tmp_x}, {*out}, - {{"power", _power}, {"scale", scale}, {"shift", _bias}}); + {{"power", power}, {"scale", scale}, {"shift", _bias}}); runner.Run(stream); } } diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 5193339a2f8..ef168d2d921 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -2535,30 +2535,46 @@ class AdamOptimizer(Optimizer): with block.program._optimized_guard([]): inputs = {"X": beta1_pow_acc} + outputs = {"Out": beta1_pow_acc} attrs = {} if isinstance(self._beta1, Variable): - inputs['ScaleTensor'] = self._beta1 + inputs["Y"] = self._beta1 + # use elementwise_mul for better performance + block.append_op( + type="elementwise_mul", + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) else: attrs['scale'] = self._beta1 - block.append_op( - type="scale", - inputs=inputs, - outputs={"Out": beta1_pow_acc}, - attrs=attrs, - stop_gradient=True) + block.append_op( + type="scale", + 
inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) inputs = {"X": beta2_pow_acc} + outputs = {"Out": beta2_pow_acc} attrs = {} if isinstance(self._beta2, Variable): - inputs['ScaleTensor'] = self._beta2 + inputs["Y"] = self._beta2 + # use elementwise_mul for better performance + block.append_op( + type="elementwise_mul", + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) else: attrs['scale'] = self._beta2 - block.append_op( - type="scale", - inputs=inputs, - outputs={"Out": beta2_pow_acc}, - attrs=attrs, - stop_gradient=True) + block.append_op( + type="scale", + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) class AdamaxOptimizer(Optimizer): diff --git a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py index 02d4002f72c..4899938766f 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py @@ -19,6 +19,7 @@ sys.path.append("..") from op_test import OpTest import paddle import paddle.fluid as fluid +import paddle.fluid.core as core from test_adam_op import adam_step paddle.enable_static() @@ -300,85 +301,151 @@ class TestNet(unittest.TestCase): class TestNetWithEpsilonTensor(unittest.TestCase): - def _test(self, run_npu=True): + def _test(self, + place, + use_tensor=True, + use_fluid_api=True, + use_global_beta_pow=False, + flatten_param_grads=False): + paddle.enable_static() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + SEED = 2021 + paddle.seed(SEED) np.random.seed(SEED) - a_np = np.random.random(size=(32, 32)).astype('float32') - b_np = np.random.random(size=(32, 32)).astype('float32') - label_np = np.random.randint(2, size=(32, 1)).astype('int64') + a_np = np.random.random(size=(2, 2)).astype('float32') + b_np = np.random.random(size=(2, 2)).astype('float32') + label_np = np.random.randint(2, size=(2, 1)).astype('int64') + weight_attr1 = paddle.ParamAttr( + name="weight1", + initializer=fluid.initializer.Constant(value=1.0), + trainable=True) + weight_attr2 = paddle.ParamAttr( + name="weight2", + initializer=fluid.initializer.Constant(value=2.0), + trainable=True) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) with paddle.static.program_guard(main_prog, startup_prog): - a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') - b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') - label = paddle.static.data( - name="label", shape=[32, 1], dtype='int64') - - sum = paddle.add(a, b) - z = paddle.pow(sum, 2.0) - - fc_1 = fluid.layers.fc(input=z, size=128) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - beta1_init = 0.9 - beta2_init = 0.999 - epsilon_init = 1e-8 - beta1 = fluid.layers.create_global_var( - shape=[1], - value=float(beta1_init), - dtype='float32', - persistable=True, - name="beta1") - beta2 = fluid.layers.create_global_var( - shape=[1], - value=float(beta2_init), - dtype='float32', - persistable=True, - name="beta2") - epsilon = fluid.layers.create_global_var( - shape=[1], - value=float(epsilon_init), - dtype='float32', - persistable=True, - name="epsilon") - adam = fluid.optimizer.Adam( - learning_rate=0.01, beta1=beta1, beta2=beta2, epsilon=epsilon) - adam.minimize(loss) - - if run_npu: - place = 
paddle.NPUPlace(0) - else: - place = paddle.CPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(100): - - pred_res, loss_res = exe.run( - main_prog, - feed={"a": a_np, - "b": b_np, - "label": label_np}, - fetch_list=[prediction, loss]) - if epoch % 10 == 0: + with paddle.utils.unique_name.guard(): + a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') + b = paddle.static.data(name="b", shape=[2, 2], dtype='float32') + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1) + prediction = fluid.layers.fc(input=fc_1, + size=2, + param_attr=weight_attr2, + act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + beta1_init = 0.9 + beta2_init = 0.999 + epsilon_init = 1e-8 + if use_tensor: + beta1 = fluid.layers.create_global_var( + shape=[1], + value=float(beta1_init), + dtype='float32', + persistable=True, + name="beta1") + beta2 = fluid.layers.create_global_var( + shape=[1], + value=float(beta2_init), + dtype='float32', + persistable=True, + name="beta2") + epsilon = fluid.layers.create_global_var( + shape=[1], + value=float(epsilon_init), + dtype='float32', + persistable=True, + name="epsilon") + if use_fluid_api: + adam = fluid.optimizer.Adam( + learning_rate=0.01, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + use_global_beta_pow=use_global_beta_pow, + flatten_param_grads=flatten_param_grads, + align_size=256, + grad_clip=clip) + else: + adam = paddle.optimizer.Adam( + learning_rate=0.01, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + grad_clip=clip) + else: + if use_fluid_api: + adam = fluid.optimizer.Adam( + learning_rate=0.01, + beta1=beta1_init, + beta2=beta2_init, + epsilon=epsilon_init, + use_global_beta_pow=use_global_beta_pow, + flatten_param_grads=flatten_param_grads, + align_size=256, + grad_clip=clip) + else: + adam = fluid.optimizer.Adam( + learning_rate=0.01, + beta1=beta1_init, + beta2=beta2_init, + epsilon=epsilon_init, + grad_clip=clip) + + adam.minimize(loss) + + scope = fluid.Scope() + with fluid.scope_guard(scope): + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(10): + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) print("Epoch {} | Prediction[0]: {}, Loss: {}".format( epoch, pred_res[0], loss_res)) - - return pred_res, loss_res - - def test_npu(self): - cpu_pred, cpu_loss = self._test(False) - npu_pred, npu_loss = self._test(True) - - self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3)) + paddle.disable_static() + return pred_res, loss_res + + def _test_with_place(self, place): + preds = [] + losses = [] + + for use_tensor in [True, False]: + for use_fluid_api in [True, False]: + for use_global_beta_pow in [True, False]: + for flatten_param_grads in [True, False]: + pred, loss = self._test( + place, use_tensor, use_fluid_api, + use_global_beta_pow, flatten_param_grads) + preds.append(pred) + losses.append(loss) + for pred in preds: + self.assertTrue(np.allclose(pred, preds[0])) + for loss in losses: + self.assertTrue(np.allclose(loss, losses[0])) + + def test_adam_api(self): + # NOTE(zhiqiu): cpu and gpu has different seed, so should 
compare separately.
+        self._test_with_place(paddle.CPUPlace())
+        if core.is_compiled_with_npu():
+            self._test_with_place(paddle.NPUPlace(0))
 
 
 if __name__ == '__main__':
-- 
GitLab
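
Usage sketch (a minimal example, not part of the patch): the snippet below feeds the scale op a Tensor-valued scale so the NPU kernel takes the new ScaleTensor branch added above. It assumes a PaddlePaddle build of roughly this vintage, that paddle.scale accepts a Tensor `scale` in static mode and wires it to the op's ScaleTensor input (as documented), and an NPU device; it falls back to CPUPlace when the build has no NPU support. The names "x" and "scale" and the shapes are illustrative only.

# Minimal static-graph sketch exercising the ScaleTensor input of the scale op.
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name="x", shape=[2, 2], dtype='float32')
    # When `scale` is a Variable, the scale op is fed through its
    # ScaleTensor input instead of the float `scale` attribute.
    scale = fluid.layers.create_global_var(
        shape=[1], value=0.5, dtype='float32', persistable=True, name="scale")
    out = paddle.scale(x, scale=scale, bias=1.0)

place = paddle.NPUPlace(0) if core.is_compiled_with_npu() else paddle.CPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup_prog)
res, = exe.run(main_prog,
               feed={"x": np.ones((2, 2), dtype='float32')},
               fetch_list=[out])
print(res)  # expected: all elements 1.5 (0.5 * 1.0 + 1.0)

If the ScaleTensor input were ignored, the kernel would fall back to the default `scale` attribute of 1.0 and print 2.0 instead of 1.5, so the printed value distinguishes the two code paths.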
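
A note on the optimizer-side change: when beta1/beta2 are Variables, the beta_pow accumulators are now updated with elementwise_mul (beta_pow * beta) rather than a scale op fed through ScaleTensor. The in-code comment only says this is "for better performance"; a plausible reading (an assumption, not stated in the patch) is that the scale kernel reads ScaleTensor via GetAttrFromTensor, which performs a synchronous TensorCopySync to CPU on GPU/NPU places, whereas elementwise_mul consumes the beta tensor directly on the device. The float-beta path is unchanged and still uses the scale op with the plain `scale` attribute.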