未验证 提交 234c21ac 编写于 作者: P pangyoki 提交者: GitHub

[NPU] add while, read_from_array and write_to_array npu op (#34755)

* add while read_from_array write_to_array npu op

* optimize unittest
上级 d45d3112
...@@ -47,7 +47,8 @@ class ArrayOp : public framework::OperatorBase { ...@@ -47,7 +47,8 @@ class ArrayOp : public framework::OperatorBase {
size_t offset; size_t offset;
if (platform::is_gpu_place(i_tensor.place()) || if (platform::is_gpu_place(i_tensor.place()) ||
platform::is_xpu_place(i_tensor.place())) { platform::is_xpu_place(i_tensor.place()) ||
platform::is_npu_place(i_tensor.place())) {
// FIXME: Avoid copy from GPU to CPU // FIXME: Avoid copy from GPU to CPU
framework::Tensor t; framework::Tensor t;
framework::TensorCopy(i_tensor, platform::CPUPlace(), dev_ctx, &t); framework::TensorCopy(i_tensor, platform::CPUPlace(), dev_ctx, &t);
......
...@@ -212,14 +212,17 @@ bool GetCondData(const framework::LoDTensor &cond) { ...@@ -212,14 +212,17 @@ bool GetCondData(const framework::LoDTensor &cond) {
if (platform::is_cpu_place(cond.place())) { if (platform::is_cpu_place(cond.place())) {
return cond.data<bool>()[0]; return cond.data<bool>()[0];
} }
// when platform::is_gpu_place(cond.place()) is true // when platform::is_gpu_place(cond.place()) or
// platform::is_npu_place(cond.place()) is true
std::unique_ptr<framework::LoDTensor> cpu_cond{new framework::LoDTensor()}; std::unique_ptr<framework::LoDTensor> cpu_cond{new framework::LoDTensor()};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_ASCEND_CL)
framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get());
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"This version of PaddlePaddle does NOT support GPU but got GPU tensor " "This version of PaddlePaddle does NOT support GPU/NPU but got GPU/NPU "
"Cond in WhileOp. Please compile WITH_GPU option.")); "tensor "
"Cond in WhileOp. Please compile WITH_GPU or WITH_ASCEND_CL option."));
#endif #endif
return cpu_cond->data<bool>()[0]; return cpu_cond->data<bool>()[0];
} }
......
...@@ -27,9 +27,11 @@ using Tensor = framework::Tensor; ...@@ -27,9 +27,11 @@ using Tensor = framework::Tensor;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class SumNPUKernel : public framework::OpKernel<T> { class SumNPUKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
auto out_var = ctx.OutputVar("Out");
if (out_var->IsType<framework::LoDTensor>()) {
auto *out = out_var->GetMutable<framework::LoDTensor>();
auto x = ctx.MultiInput<Tensor>("X"); auto x = ctx.MultiInput<Tensor>("X");
auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
...@@ -57,6 +59,51 @@ class SumNPUKernel : public framework::OpKernel<T> { ...@@ -57,6 +59,51 @@ class SumNPUKernel : public framework::OpKernel<T> {
NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}}; NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}};
runner.AddInputNames(names); runner.AddInputNames(names);
runner.Run(stream); runner.Run(stream);
} else if (out_var->IsType<framework::LoDTensorArray>()) {
auto in_vars = ctx.MultiInputVar("X");
bool in_place = out_var == in_vars[0];
auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensorArray>(), true,
platform::errors::InvalidArgument(
"Only support all inputs are TensorArray, "
"but inputs[%d] is not TensorArray.",
i));
auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
for (size_t i = 0; i < in_array.size(); ++i) {
if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) {
if (i >= out_array.size()) {
out_array.resize(i + 1);
}
if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) {
framework::TensorCopy(in_array[i], in_array[i].place(),
ctx.device_context(), &out_array[i]);
out_array[i].set_lod(in_array[i].lod());
} else {
PADDLE_ENFORCE_EQ(
out_array[i].lod(), in_array[i].lod(),
platform::errors::InvalidArgument(
"The lod message between inputs[%d] and"
" outputs[%d] must be same, but now is not same.",
i, i));
auto stream = ctx.template device_context<
paddle::platform::NPUDeviceContext>()
.stream();
NpuOpRunner runner{
"Add", {out_array[i], in_array[i]}, {out_array[i]}, {}};
runner.Run(stream);
}
}
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Output(out) must be Tensor or "
"LoDTensorArray. But got "
"unsupport type: %s.",
framework::ToTypeName(out_var->Type())));
}
} }
}; };
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle
import paddle.fluid.layers as layers
from paddle.fluid.executor import Executor
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid.backward import append_backward
import numpy
from paddle.fluid import compiler, Program, program_guard
paddle.enable_static()
class TestWhileOp(unittest.TestCase):
def simple_net(self):
d0 = layers.data(
"d0", shape=[10], append_batch_size=False, dtype='float32')
d1 = layers.data(
"d1", shape=[10], append_batch_size=False, dtype='float32')
d2 = layers.data(
"d2", shape=[10], append_batch_size=False, dtype='float32')
# fill_constant npu op doesn't support int64
i = layers.zeros(shape=[1], dtype='int32')
i = layers.cast(i, 'int64')
i.stop_gradient = True
init = layers.zeros(shape=[10], dtype='float32')
mem_array = layers.array_write(x=init, i=i)
data_array = layers.array_write(x=d0, i=i)
i = layers.increment(i)
layers.array_write(d1, i, array=data_array)
i = layers.increment(i)
layers.array_write(d2, i, array=data_array)
i = layers.zeros(shape=[1], dtype='int32')
i = layers.cast(i, 'int64')
i.stop_gradient = True
array_len = layers.fill_constant(shape=[1], dtype='int32', value=5)
array_len = layers.cast(array_len, 'int64')
array_len.stop_gradient = True
cond = layers.ones(shape=[1], dtype='int32')
cond = layers.cast(cond, 'bool')
j = layers.fill_constant(shape=[1], dtype='int32', value=1)
j = layers.cast(j, 'int64')
j.stop_gradient = True
array_len2 = layers.fill_constant(shape=[1], dtype='int32', value=3)
array_len2 = layers.cast(array_len2, 'int64')
array_len2.stop_gradient = True
cond2 = layers.logical_or(x=j, y=array_len2)
cond2 = layers.ones(shape=[1], dtype='int32')
cond2 = layers.cast(cond2, 'bool')
while_op = layers.While(cond=cond)
while_op2 = layers.While(cond=cond2)
with while_op.block():
d = layers.array_read(array=data_array, i=i)
prev = layers.array_read(array=mem_array, i=i)
result = layers.sums(input=[d, prev])
i = layers.increment(x=i, in_place=True)
layers.array_write(result, i=i, array=mem_array)
layers.less_than(x=i, y=array_len, cond=cond)
with while_op2.block():
d2 = layers.array_read(array=data_array, i=j)
prev2 = layers.array_read(array=mem_array, i=j)
result2 = layers.sums(input=[d2, prev2])
j = layers.increment(x=j, in_place=True)
layers.array_write(result2, i=j, array=mem_array)
layers.less_than(x=j, y=array_len2, cond=cond2)
sum_result = layers.array_read(array=mem_array, i=j)
loss = layers.mean(sum_result)
return loss, sum_result
def test_simple_net(self):
paddle.enable_static()
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
loss, sum_result = self.simple_net()
append_backward(loss)
npu_place = paddle.NPUPlace(0)
exe = Executor(npu_place)
d = []
for i in range(3):
d.append(numpy.random.random(size=[10]).astype('float32'))
outs = exe.run(feed={'d0': d[0],
'd1': d[1],
'd2': d[2]},
fetch_list=[sum_result])
self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
def test_simple_net_forward(self):
paddle.enable_static()
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
self.simple_net()
npu_place = paddle.NPUPlace(0)
exe = Executor(npu_place)
d = []
for i in range(3):
d.append(numpy.random.random(size=[10]).astype('float32'))
for _ in range(2):
exe.run(main_program, feed={'d0': d[0], 'd1': d[1], 'd2': d[2]})
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册