Unverified commit f17ba93b, authored by Leo Chen, committed by GitHub

[NPU] Support ScaleTensor for scale npu kernel (#34418)

* support ScaleTensor for scale npu kernel

* add more tests for adam npu

* fix compile

* fix unittest

* refine adam optimizer
Parent 9f604928
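As background for the diffs below, here is a minimal sketch (not part of this commit) of how the new code path is reached from the Python API, assuming a Paddle build that contains this change: when the `scale` argument of `fluid.layers.scale` is a Variable, it is passed to the op as the `ScaleTensor` input, which the NPU kernel can now read.

import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core

paddle.enable_static()
main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
    # hold the scale factor in a persistable tensor instead of a compile-time attribute
    scale = fluid.layers.create_global_var(
        shape=[1], value=2.0, dtype="float32", persistable=True, name="scale_t")
    y = fluid.layers.scale(x, scale=scale, bias=0.5)

place = paddle.NPUPlace(0) if core.is_compiled_with_npu() else paddle.CPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup_prog)
out, = exe.run(main_prog,
               feed={"x": np.ones([2, 3], dtype="float32")},
               fetch_list=[y])
print(out)  # expect 2.5 everywhere: 2.0 * 1.0 + 0.5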
@@ -25,7 +25,8 @@ template <typename T>
 static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
   const auto* tensor_data = tensor->data<T>();
   framework::Tensor cpu_tensor;
-  if (platform::is_gpu_place(tensor->place())) {
+  if (platform::is_gpu_place(tensor->place()) ||
+      platform::is_npu_place(tensor->place())) {
     TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
     tensor_data = cpu_tensor.data<T>();
   }
......
@@ -27,20 +27,24 @@ class ScaleNPUKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* x = ctx.Input<framework::Tensor>("X");
     auto* out = ctx.Output<framework::Tensor>("Out");
-    auto scale = static_cast<float>(ctx.Attr<float>("scale"));
-    auto bias = static_cast<float>(ctx.Attr<float>("bias"));
+    auto scale = ctx.Attr<float>("scale");
+    auto bias = ctx.Attr<float>("bias");
     auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
-    float _power = 1.0;
+    float power = 1.0;
     VLOG(4) << "scale:" << scale << ", bias:" << bias
             << " ,bias_after_scale:" << bias_after_scale;
+    if (ctx.HasInput("ScaleTensor")) {
+      auto* scale_tensor = ctx.Input<framework::Tensor>("ScaleTensor");
+      scale = static_cast<float>(GetAttrFromTensor<T>(scale_tensor));
+    }
     if (bias_after_scale) {
       out->mutable_data<T>(ctx.GetPlace());
       const auto& runner =
           NpuOpRunner("Power", {*x}, {*out},
-                      {{"power", _power}, {"scale", scale}, {"shift", bias}});
+                      {{"power", power}, {"scale", scale}, {"shift", bias}});
       runner.Run(stream);
     } else {
@@ -55,7 +59,7 @@ class ScaleNPUKernel : public framework::OpKernel<T> {
       float _bias = 0.0;
       const auto& runner =
           NpuOpRunner("Power", {tmp_x}, {*out},
-                      {{"power", _power}, {"scale", scale}, {"shift", _bias}});
+                      {{"power", power}, {"scale", scale}, {"shift", _bias}});
       runner.Run(stream);
     }
   }
......
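The kernel implements `scale` in terms of the NPU "Power" operator, which evaluates (scale * x + shift) ** power; with `power` fixed at 1.0 this is exactly scale * x + bias, and the `else` branch adds the bias to x first and then calls Power with shift = 0. A quick NumPy sketch of the two branches (illustration only, not framework code):

import numpy as np

def scale_ref(x, scale, bias, bias_after_scale):
    # bias_after_scale=True:  one Power call, out = scale * x + bias
    # bias_after_scale=False: bias added to x first, then Power with shift=0
    return scale * x + bias if bias_after_scale else scale * (x + bias)

x = np.arange(4, dtype=np.float32)
print(scale_ref(x, 2.0, 0.5, True))   # [0.5 2.5 4.5 6.5]
print(scale_ref(x, 2.0, 0.5, False))  # [1. 3. 5. 7.]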
@@ -2535,30 +2535,46 @@ class AdamOptimizer(Optimizer):
             with block.program._optimized_guard([]):
                 inputs = {"X": beta1_pow_acc}
+                outputs = {"Out": beta1_pow_acc}
                 attrs = {}
                 if isinstance(self._beta1, Variable):
-                    inputs['ScaleTensor'] = self._beta1
+                    inputs["Y"] = self._beta1
+                    # use elementwise_mul for better performance
+                    block.append_op(
+                        type="elementwise_mul",
+                        inputs=inputs,
+                        outputs=outputs,
+                        attrs=attrs,
+                        stop_gradient=True)
                 else:
                     attrs['scale'] = self._beta1
-                block.append_op(
-                    type="scale",
-                    inputs=inputs,
-                    outputs={"Out": beta1_pow_acc},
-                    attrs=attrs,
-                    stop_gradient=True)
+                    block.append_op(
+                        type="scale",
+                        inputs=inputs,
+                        outputs=outputs,
+                        attrs=attrs,
+                        stop_gradient=True)

                 inputs = {"X": beta2_pow_acc}
+                outputs = {"Out": beta2_pow_acc}
                 attrs = {}
                 if isinstance(self._beta2, Variable):
-                    inputs['ScaleTensor'] = self._beta2
+                    inputs["Y"] = self._beta2
+                    # use elementwise_mul for better performance
+                    block.append_op(
+                        type="elementwise_mul",
+                        inputs=inputs,
+                        outputs=outputs,
+                        attrs=attrs,
+                        stop_gradient=True)
                 else:
                     attrs['scale'] = self._beta2
-                block.append_op(
-                    type="scale",
-                    inputs=inputs,
-                    outputs={"Out": beta2_pow_acc},
-                    attrs=attrs,
-                    stop_gradient=True)
+                    block.append_op(
+                        type="scale",
+                        inputs=inputs,
+                        outputs=outputs,
+                        attrs=attrs,
+                        stop_gradient=True)

 class AdamaxOptimizer(Optimizer):
......
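Both branches above perform the same accumulator update, beta_pow *= beta, once per optimizer step; the difference is only whether beta arrives as a compile-time attribute (the `scale` op) or as a tensor (`elementwise_mul` with input `Y`). A NumPy sketch of the update (illustration only):

import numpy as np

beta1 = 0.9
beta1_pow_acc = np.array([1.0], dtype=np.float32)
for step in range(3):
    # attribute path: scale(X=pow_acc, scale=beta1)
    # tensor path:    elementwise_mul(X=pow_acc, Y=beta1_tensor)
    beta1_pow_acc = beta1_pow_acc * beta1
    print(step, beta1_pow_acc)  # [0.9], [0.81], [0.729]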
@@ -19,6 +19,7 @@ sys.path.append("..")
 from op_test import OpTest
 import paddle
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 from test_adam_op import adam_step

 paddle.enable_static()
......
@@ -300,85 +301,151 @@ class TestNet(unittest.TestCase):

 class TestNetWithEpsilonTensor(unittest.TestCase):
-    def _test(self, run_npu=True):
-        main_prog = paddle.static.Program()
-        startup_prog = paddle.static.Program()
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
-        np.random.seed(SEED)
-
-        a_np = np.random.random(size=(32, 32)).astype('float32')
-        b_np = np.random.random(size=(32, 32)).astype('float32')
-        label_np = np.random.randint(2, size=(32, 1)).astype('int64')
-
-        with paddle.static.program_guard(main_prog, startup_prog):
-            a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
-            b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
-
-            sum = paddle.add(a, b)
-            z = paddle.pow(sum, 2.0)
-
-            fc_1 = fluid.layers.fc(input=z, size=128)
-            prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
-
-            cost = fluid.layers.cross_entropy(input=prediction, label=label)
-            loss = fluid.layers.reduce_mean(cost)
-            beta1_init = 0.9
-            beta2_init = 0.999
-            epsilon_init = 1e-8
-            beta1 = fluid.layers.create_global_var(
-                shape=[1],
-                value=float(beta1_init),
-                dtype='float32',
-                persistable=True,
-                name="beta1")
-            beta2 = fluid.layers.create_global_var(
-                shape=[1],
-                value=float(beta2_init),
-                dtype='float32',
-                persistable=True,
-                name="beta2")
-            epsilon = fluid.layers.create_global_var(
-                shape=[1],
-                value=float(epsilon_init),
-                dtype='float32',
-                persistable=True,
-                name="epsilon")
-            adam = fluid.optimizer.Adam(
-                learning_rate=0.01, beta1=beta1, beta2=beta2, epsilon=epsilon)
-            adam.minimize(loss)
-
-        if run_npu:
-            place = paddle.NPUPlace(0)
-        else:
-            place = paddle.CPUPlace()
-
-        exe = paddle.static.Executor(place)
-        exe.run(startup_prog)
-
-        print("Start run on {}".format(place))
-        for epoch in range(100):
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
-            if epoch % 10 == 0:
-                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
-                    epoch, pred_res[0], loss_res))
-
-        return pred_res, loss_res
-
-    def test_npu(self):
-        cpu_pred, cpu_loss = self._test(False)
-        npu_pred, npu_loss = self._test(True)
-
-        self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3))
-        self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3))
+    def _test(self,
+              place,
+              use_tensor=True,
+              use_fluid_api=True,
+              use_global_beta_pow=False,
+              flatten_param_grads=False):
+        paddle.enable_static()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        SEED = 2021
+        paddle.seed(SEED)
+        np.random.seed(SEED)
+
+        a_np = np.random.random(size=(2, 2)).astype('float32')
+        b_np = np.random.random(size=(2, 2)).astype('float32')
+        label_np = np.random.randint(2, size=(2, 1)).astype('int64')
+        weight_attr1 = paddle.ParamAttr(
+            name="weight1",
+            initializer=fluid.initializer.Constant(value=1.0),
+            trainable=True)
+        weight_attr2 = paddle.ParamAttr(
+            name="weight2",
+            initializer=fluid.initializer.Constant(value=2.0),
+            trainable=True)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
+
+        with paddle.static.program_guard(main_prog, startup_prog):
+            with paddle.utils.unique_name.guard():
+                a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
+                b = paddle.static.data(name="b", shape=[2, 2], dtype='float32')
+                label = paddle.static.data(
+                    name="label", shape=[2, 1], dtype='int64')
+
+                sum = paddle.add(a, b)
+                z = paddle.pow(sum, 2.0)
+
+                fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1)
+                prediction = fluid.layers.fc(input=fc_1,
+                                             size=2,
+                                             param_attr=weight_attr2,
+                                             act='softmax')
+
+                cost = fluid.layers.cross_entropy(input=prediction, label=label)
+                loss = fluid.layers.reduce_mean(cost)
+                beta1_init = 0.9
+                beta2_init = 0.999
+                epsilon_init = 1e-8
+                if use_tensor:
+                    beta1 = fluid.layers.create_global_var(
+                        shape=[1],
+                        value=float(beta1_init),
+                        dtype='float32',
+                        persistable=True,
+                        name="beta1")
+                    beta2 = fluid.layers.create_global_var(
+                        shape=[1],
+                        value=float(beta2_init),
+                        dtype='float32',
+                        persistable=True,
+                        name="beta2")
+                    epsilon = fluid.layers.create_global_var(
+                        shape=[1],
+                        value=float(epsilon_init),
+                        dtype='float32',
+                        persistable=True,
+                        name="epsilon")
+                    if use_fluid_api:
+                        adam = fluid.optimizer.Adam(
+                            learning_rate=0.01,
+                            beta1=beta1,
+                            beta2=beta2,
+                            epsilon=epsilon,
+                            use_global_beta_pow=use_global_beta_pow,
+                            flatten_param_grads=flatten_param_grads,
+                            align_size=256,
+                            grad_clip=clip)
+                    else:
+                        adam = paddle.optimizer.Adam(
+                            learning_rate=0.01,
+                            beta1=beta1,
+                            beta2=beta2,
+                            epsilon=epsilon,
+                            grad_clip=clip)
+                else:
+                    if use_fluid_api:
+                        adam = fluid.optimizer.Adam(
+                            learning_rate=0.01,
+                            beta1=beta1_init,
+                            beta2=beta2_init,
+                            epsilon=epsilon_init,
+                            use_global_beta_pow=use_global_beta_pow,
+                            flatten_param_grads=flatten_param_grads,
+                            align_size=256,
+                            grad_clip=clip)
+                    else:
+                        adam = fluid.optimizer.Adam(
+                            learning_rate=0.01,
+                            beta1=beta1_init,
+                            beta2=beta2_init,
+                            epsilon=epsilon_init,
+                            grad_clip=clip)
+
+                adam.minimize(loss)
+
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            print("Start run on {}".format(place))
+            for epoch in range(10):
+                pred_res, loss_res = exe.run(
+                    main_prog,
+                    feed={"a": a_np,
+                          "b": b_np,
+                          "label": label_np},
+                    fetch_list=[prediction, loss])
+
+                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
+                    epoch, pred_res[0], loss_res))
+        paddle.disable_static()
+        return pred_res, loss_res
+
+    def _test_with_place(self, place):
+        preds = []
+        losses = []
+
+        for use_tensor in [True, False]:
+            for use_fluid_api in [True, False]:
+                for use_global_beta_pow in [True, False]:
+                    for flatten_param_grads in [True, False]:
+                        pred, loss = self._test(
+                            place, use_tensor, use_fluid_api,
+                            use_global_beta_pow, flatten_param_grads)
+                        preds.append(pred)
+                        losses.append(loss)
+
+        for pred in preds:
+            self.assertTrue(np.allclose(pred, preds[0]))
+        for loss in losses:
+            self.assertTrue(np.allclose(loss, losses[0]))
+
+    def test_adam_api(self):
+        # NOTE(zhiqiu): cpu and gpu has different seed, so should compare separatly.
+        self._test_with_place(paddle.CPUPlace())
+        if core.is_compiled_with_npu():
+            self._test_with_place(paddle.NPUPlace(0))

 if __name__ == '__main__':
......