From 234c21ac8b0ddef8cc22b441b35660fffc01a7a7 Mon Sep 17 00:00:00 2001
From: pangyoki
Date: Wed, 11 Aug 2021 19:02:11 +0800
Subject: [PATCH] [NPU] add while, read_from_array and write_to_array npu op
 (#34755)

* add while read_from_array write_to_array npu op

* optimize unittest
---
 paddle/fluid/operators/array_operator.h       |   3 +-
 .../operators/controlflow/while_op_helper.cc  |  11 +-
 paddle/fluid/operators/sum_op_npu.cc          |  97 +++++++++----
 .../tests/unittests/npu/test_while_op_npu.py  | 130 ++++++++++++++++++
 4 files changed, 211 insertions(+), 30 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py

diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h
index 44063f233c..af44a77c81 100644
--- a/paddle/fluid/operators/array_operator.h
+++ b/paddle/fluid/operators/array_operator.h
@@ -47,7 +47,8 @@ class ArrayOp : public framework::OperatorBase {
     size_t offset;
     if (platform::is_gpu_place(i_tensor.place()) ||
-        platform::is_xpu_place(i_tensor.place())) {
+        platform::is_xpu_place(i_tensor.place()) ||
+        platform::is_npu_place(i_tensor.place())) {
       // FIXME: Avoid copy from GPU to CPU
       framework::Tensor t;
       framework::TensorCopy(i_tensor, platform::CPUPlace(), dev_ctx, &t);
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc
index 5c94c08271..63b273fdbb 100644
--- a/paddle/fluid/operators/controlflow/while_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -212,14 +212,17 @@ bool GetCondData(const framework::LoDTensor &cond) {
   if (platform::is_cpu_place(cond.place())) {
     return cond.data<bool>()[0];
   }
-  // when platform::is_gpu_place(cond.place()) is true
+  // when platform::is_gpu_place(cond.place()) or
+  // platform::is_npu_place(cond.place()) is true
   std::unique_ptr<framework::LoDTensor> cpu_cond{new framework::LoDTensor()};
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_ASCEND_CL)
   framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get());
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet(
-      "This version of PaddlePaddle does NOT support GPU but got GPU tensor "
-      "Cond in WhileOp. Please compile WITH_GPU option."));
+      "This version of PaddlePaddle does NOT support GPU/NPU but got GPU/NPU "
+      "tensor "
+      "Cond in WhileOp. Please compile WITH_GPU or WITH_ASCEND_CL option."));
 #endif
   return cpu_cond->data<bool>()[0];
 }
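Both C++ changes above address the same constraint: write_to_array/read_from_array must dereference their index tensor on the host, and While must read its boolean condition on the host, so NPU-resident tensors are first copied to CPUPlace. For context, a minimal Python sketch (not part of the patch; names are illustrative) of a static-graph program that exercises both copy paths, assuming a PADDLE_WITH_ASCEND_CL build with an attached device:

    import paddle
    import paddle.fluid as fluid
    import paddle.fluid.layers as layers

    paddle.enable_static()

    # Index tensor: the array ops read it on the host (array_operator.h change).
    i = layers.zeros(shape=[1], dtype='int32')
    i = layers.cast(i, 'int64')  # fill/zeros NPU kernels lack int64, so cast
    mem = layers.array_write(x=layers.zeros(shape=[10], dtype='float32'), i=i)
    limit = layers.fill_constant(shape=[1], dtype='int32', value=5)
    limit = layers.cast(limit, 'int64')
    # Loop condition: While reads it on the host (while_op_helper.cc change).
    cond = layers.less_than(x=i, y=limit)
    while_op = layers.While(cond=cond)
    with while_op.block():
        prev = layers.array_read(array=mem, i=i)   # read_from_array
        i = layers.increment(x=i, in_place=True)
        layers.array_write(prev, i=i, array=mem)   # write_to_array
        layers.less_than(x=i, y=limit, cond=cond)

    exe = fluid.Executor(paddle.NPUPlace(0))  # fluid.CPUPlace() also works
    exe.run(fluid.default_main_program())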
diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc
index a6032236c0..a4f75e369e 100644
--- a/paddle/fluid/operators/sum_op_npu.cc
+++ b/paddle/fluid/operators/sum_op_npu.cc
@@ -27,36 +27,83 @@ using Tensor = framework::Tensor;
 template <typename DeviceContext, typename T>
 class SumNPUKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto x = ctx.MultiInput<Tensor>("X");
-    auto* out = ctx.Output<Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto out_var = ctx.OutputVar("Out");
+    if (out_var->IsType<framework::LoDTensor>()) {
+      auto *out = out_var->GetMutable<framework::LoDTensor>();
+      auto x = ctx.MultiInput<Tensor>("X");
+      out->mutable_data<T>(ctx.GetPlace());
 
-    auto place = ctx.GetPlace();
+      auto place = ctx.GetPlace();
 
-    int n = static_cast<int>(x.size());
-    if (n == 1) {
-      TensorCopy(*x[0], place, out);
-      return;
-    }
+      int n = static_cast<int>(x.size());
+      if (n == 1) {
+        TensorCopy(*x[0], place, out);
+        return;
+      }
 
-    std::vector<Tensor> inputs;
-    std::vector<std::string> names;
-    for (int i = 0; i < n; ++i) {
-      if (x[i] && x[i]->numel() > 0) {
-        inputs.push_back(*x[i]);
-        names.push_back("x" + std::to_string(i));
-      } else {
-        continue;
+      std::vector<Tensor> inputs;
+      std::vector<std::string> names;
+      for (int i = 0; i < n; ++i) {
+        if (x[i] && x[i]->numel() > 0) {
+          inputs.push_back(*x[i]);
+          names.push_back("x" + std::to_string(i));
+        } else {
+          continue;
+        }
       }
-    }
 
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}};
-    runner.AddInputNames(names);
-    runner.Run(stream);
+      auto stream =
+          ctx.template device_context<paddle::platform::NPUDeviceContext>()
+              .stream();
+      NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}};
+      runner.AddInputNames(names);
+      runner.Run(stream);
+    } else if (out_var->IsType<framework::LoDTensorArray>()) {
+      auto in_vars = ctx.MultiInputVar("X");
+      bool in_place = out_var == in_vars[0];
+      auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
+      for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
+        PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensorArray>(), true,
+                          platform::errors::InvalidArgument(
+                              "Only support all inputs are TensorArray, "
+                              "but inputs[%d] is not TensorArray.",
+                              i));
+        auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
+
+        for (size_t i = 0; i < in_array.size(); ++i) {
+          if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) {
+            if (i >= out_array.size()) {
+              out_array.resize(i + 1);
+            }
+            if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) {
+              framework::TensorCopy(in_array[i], in_array[i].place(),
+                                    ctx.device_context(), &out_array[i]);
+              out_array[i].set_lod(in_array[i].lod());
+            } else {
+              PADDLE_ENFORCE_EQ(
+                  out_array[i].lod(), in_array[i].lod(),
+                  platform::errors::InvalidArgument(
+                      "The lod message between inputs[%d] and"
+                      " outputs[%d] must be same, but now is not same.",
+                      i, i));
+              auto stream = ctx.template device_context<
+                                   paddle::platform::NPUDeviceContext>()
+                                .stream();
+              NpuOpRunner runner{
+                  "Add", {out_array[i], in_array[i]}, {out_array[i]}, {}};
+              runner.Run(stream);
+            }
+          }
+        }
+      }
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Expected type of Output(out) must be Tensor or "
+          "LoDTensorArray. But got "
+          "unsupport type: %s.",
+          framework::ToTypeName(out_var->Type())));
+    }
   }
 };
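When "Out" is a LoDTensorArray, the kernel above merges the input arrays slot by slot: a missing slot grows the output array, an uninitialized or empty output slot takes a plain TensorCopy, and an already-populated slot is accumulated with an NPU "Add" launch after checking that the LoD metadata matches. When the op runs in place, input 0 is the output itself and is skipped. A small numpy model of just that merge rule (a hypothetical helper for illustration, not part of the patch):

    import numpy

    def sum_tensor_arrays(arrays):
        # None models an uninitialized LoDTensorArray slot.
        out = []
        for arr in arrays:
            for i, t in enumerate(arr):
                if t is None or t.size == 0:      # skip empty slots
                    continue
                if i >= len(out):                 # out_array.resize(i + 1)
                    out.extend([None] * (i + 1 - len(out)))
                if out[i] is None or out[i].size == 0:
                    out[i] = t.copy()             # TensorCopy branch
                else:
                    out[i] = out[i] + t           # NPU "Add" branch
        return out

    a = [numpy.ones(2), None, numpy.full(3, 2.0)]
    b = [numpy.ones(2), numpy.ones(4)]
    print(sum_tensor_arrays([a, b]))
    # [array([2., 2.]), array([1., 1., 1., 1.]), array([2., 2., 2.])]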
diff --git a/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py
new file mode 100644
index 0000000000..a388761d5e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle
+import paddle.fluid.layers as layers
+from paddle.fluid.executor import Executor
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid.backward import append_backward
+import numpy
+from paddle.fluid import compiler, Program, program_guard
+
+paddle.enable_static()
+
+
+class TestWhileOp(unittest.TestCase):
+    def simple_net(self):
+        d0 = layers.data(
+            "d0", shape=[10], append_batch_size=False, dtype='float32')
+        d1 = layers.data(
+            "d1", shape=[10], append_batch_size=False, dtype='float32')
+        d2 = layers.data(
+            "d2", shape=[10], append_batch_size=False, dtype='float32')
+        # fill_constant npu op doesn't support int64
+        i = layers.zeros(shape=[1], dtype='int32')
+        i = layers.cast(i, 'int64')
+        i.stop_gradient = True
+        init = layers.zeros(shape=[10], dtype='float32')
+        mem_array = layers.array_write(x=init, i=i)
+        data_array = layers.array_write(x=d0, i=i)
+        i = layers.increment(i)
+        layers.array_write(d1, i, array=data_array)
+        i = layers.increment(i)
+        layers.array_write(d2, i, array=data_array)
+        i = layers.zeros(shape=[1], dtype='int32')
+        i = layers.cast(i, 'int64')
+        i.stop_gradient = True
+        array_len = layers.fill_constant(shape=[1], dtype='int32', value=5)
+        array_len = layers.cast(array_len, 'int64')
+        array_len.stop_gradient = True
+        cond = layers.ones(shape=[1], dtype='int32')
+        cond = layers.cast(cond, 'bool')
+        j = layers.fill_constant(shape=[1], dtype='int32', value=1)
+        j = layers.cast(j, 'int64')
+        j.stop_gradient = True
+        array_len2 = layers.fill_constant(shape=[1], dtype='int32', value=3)
+        array_len2 = layers.cast(array_len2, 'int64')
+        array_len2.stop_gradient = True
+        cond2 = layers.logical_or(x=j, y=array_len2)
+        cond2 = layers.ones(shape=[1], dtype='int32')
+        cond2 = layers.cast(cond2, 'bool')
+        while_op = layers.While(cond=cond)
+        while_op2 = layers.While(cond=cond2)
+        with while_op.block():
+            d = layers.array_read(array=data_array, i=i)
+            prev = layers.array_read(array=mem_array, i=i)
+            result = layers.sums(input=[d, prev])
+
+            i = layers.increment(x=i, in_place=True)
+            layers.array_write(result, i=i, array=mem_array)
+            layers.less_than(x=i, y=array_len, cond=cond)
+
+            with while_op2.block():
+                d2 = layers.array_read(array=data_array, i=j)
+                prev2 = layers.array_read(array=mem_array, i=j)
+                result2 = layers.sums(input=[d2, prev2])
+
+                j = layers.increment(x=j, in_place=True)
+                layers.array_write(result2, i=j, array=mem_array)
+                layers.less_than(x=j, y=array_len2, cond=cond2)
+
+        sum_result = layers.array_read(array=mem_array, i=j)
+        loss = layers.mean(sum_result)
+        return loss, sum_result
+
+    def test_simple_net(self):
+        paddle.enable_static()
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            loss, sum_result = self.simple_net()
+
+            append_backward(loss)
+
+            npu_place = paddle.NPUPlace(0)
+            exe = Executor(npu_place)
+            d = []
+
+            for i in range(3):
+                d.append(numpy.random.random(size=[10]).astype('float32'))
+
+            outs = exe.run(feed={'d0': d[0],
+                                 'd1': d[1],
+                                 'd2': d[2]},
+                           fetch_list=[sum_result])
+            self.assertAlmostEqual(
+                numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
+
+    def test_simple_net_forward(self):
+        paddle.enable_static()
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            self.simple_net()
+
+        npu_place = paddle.NPUPlace(0)
+        exe = Executor(npu_place)
+        d = []
+
+        for i in range(3):
+            d.append(numpy.random.random(size=[10]).astype('float32'))
+
+        for _ in range(2):
+            exe.run(main_program, feed={'d0': d[0], 'd1': d[1], 'd2': d[2]})
+
+
+if __name__ == '__main__':
+    unittest.main()
--
GitLab
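Assuming a build configured with WITH_ASCEND_CL and an available Ascend device, the new test file is self-contained and can be run directly:

    python python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py

Both test methods construct paddle.NPUPlace(0), so they cannot be exercised on a CPU-only build.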