From 3c66b8721a1e80c2c1713bcca1e37f27512bff20 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Thu, 25 Mar 2021 19:55:01 +0800
Subject: [PATCH] [NPU] add npu kernel for truncated_gaussian_random op (#31654)

* init
* add todo
* add npu kernel for truncated_gaussian_random
* add sync
* fix concat_grad
* fix typo
---
 paddle/fluid/operators/concat_op_npu.cc       |   7 +-
 .../truncated_gaussian_random_op_npu.cc       | 113 ++++++++++++++++++
 .../test_truncated_gaussian_random_op_npu.py  |  71 +++++++++++
 3 files changed, 186 insertions(+), 5 deletions(-)
 create mode 100644 paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
 create mode 100644 python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py

diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc
index 9b979dede0..87bb3397ca 100644
--- a/paddle/fluid/operators/concat_op_npu.cc
+++ b/paddle/fluid/operators/concat_op_npu.cc
@@ -80,7 +80,6 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> {
     axis = ComputeAxis(static_cast<int64_t>(axis),
                        static_cast<int64_t>(ins[0]->dims().size()));
-    std::vector<int> sizes;
     int offset = 0;
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
@@ -91,7 +90,6 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> {
       if (out_var_names[j] != framework::kEmptyVarName &&
           outs[j]->numel() != 0UL) {
         outs[j]->mutable_data<T>(ctx.GetPlace());
-        sizes.push_back(outs[j]->dims()[axis]);
         std::vector<int> offsets;
         std::vector<int> sizes;
         for (int dim = 0; dim < ins[j]->dims().size(); ++dim) {
@@ -103,9 +101,8 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> {
             sizes.push_back(ins[j]->dims()[dim]);
           }
         }
-        auto runner =
-            NpuOpRunner("SliceD", {*out_grad}, {*outs[j]},
-                        {{"offsets", offset}, {"size", ins[j]->dims()[axis]}});
+        auto runner = NpuOpRunner("SliceD", {*out_grad}, {*outs[j]},
+                                  {{"offsets", offsets}, {"size", sizes}});
         runner.Run(stream);
       }
       if (ins[j]->numel() != 0UL) {
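For reference, the gradient slicing that the fixed ConcatGradNPUKernel performs can be sketched in NumPy: "SliceD" expects per-dimension "offsets"/"size" vectors, not the scalar offset and axis extent the old call passed. The sketch below is illustrative only and not part of the patch.

    import numpy as np

    def concat_grad_slices(out_grad, in_dims, axis):
        # Each input's gradient is a slice of out_grad. The offset along
        # `axis` accumulates the extents of the preceding inputs; every
        # other dimension is taken in full, hence one offset/size entry
        # per dimension, mirroring the vectors now handed to SliceD.
        grads, offset = [], 0
        for dims in in_dims:
            index = [slice(None)] * out_grad.ndim
            index[axis] = slice(offset, offset + dims[axis])
            grads.append(out_grad[tuple(index)])
            offset += dims[axis]
        return grads

    # e.g. the gradient of concatenating (2, 3) and (2, 5) along axis=1:
    g = np.arange(16.0).reshape(2, 8)
    g0, g1 = concat_grad_slices(g, [(2, 3), (2, 5)], axis=1)
    assert g0.shape == (2, 3) and g1.shape == (2, 5)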
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
new file mode 100644
index 0000000000..4253187fdd
--- /dev/null
+++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
@@ -0,0 +1,113 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
+#include <memory>
+#include <string>
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class TruncatedGaussianRandomNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // TODO(zhiqiu): support dynamic shape and call ParameterizedTruncatedNormal
+    std::vector<int> shape = ctx.Attr<std::vector<int>>("shape");
+    Tensor shape_tensor(framework::proto::VarType::INT32);
+    shape_tensor.mutable_data<int32_t>({static_cast<int64_t>(shape.size())},
+                                       ctx.GetPlace());
+    TensorFromVector(shape, ctx.device_context(), &shape_tensor);
+    float mean = ctx.Attr<float>("mean");
+    Tensor mean_tensor(framework::proto::VarType::FP32);
+    mean_tensor.mutable_data<float>({1}, ctx.GetPlace());
+    TensorFromVector(std::vector<float>{mean}, ctx.device_context(),
+                     &mean_tensor);
+
+    float std = ctx.Attr<float>("std");
+    Tensor std_tensor(framework::proto::VarType::FP32);
+    std_tensor.mutable_data<float>({1}, ctx.GetPlace());
+    TensorFromVector(std::vector<float>{std}, ctx.device_context(),
+                     &std_tensor);
+
+    int32_t seed_var = ctx.Attr<int32_t>("seed");
+
+    Tensor min_tensor(framework::proto::VarType::FP32);
+    min_tensor.mutable_data<float>({1}, ctx.GetPlace());
+    float min_value = mean - std * 2.0;
+    TensorFromVector(std::vector<float>{min_value}, ctx.device_context(),
+                     &min_tensor);
+
+    Tensor max_tensor(framework::proto::VarType::FP32);
+    max_tensor.mutable_data<float>({1}, ctx.GetPlace());
+    float max_value = mean + std * 2.0;
+    TensorFromVector(std::vector<float>{max_value}, ctx.device_context(),
+                     &max_tensor);
+
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+    auto runner = NpuOpRunner(
+        "ParameterizedTruncatedNormal",
+        {shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor},
+        {*out}, {{"seed", seed_var}});
+    runner.Run(stream);
+  }
+};
+
+// NOTE(zhiqiu): actually, this is the CPU version of the kernel, and we need
+// to make the NPU version above work in the future.
+template <typename T>
+class NPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    float mean = context.Attr<float>("mean");
+    float std = context.Attr<float>("std");
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    tensor->mutable_data<T>(context.GetPlace());
+
+    Tensor cpu_tensor(tensor->type());
+    cpu_tensor.Resize(tensor->dims());
+    T* cpu_data = cpu_tensor.mutable_data<T>(platform::CPUPlace());
+    std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
+                                           1.0);
+    TruncatedNormal<T> truncated_normal(mean, std);
+    int64_t size = tensor->numel();
+
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    auto engine = framework::GetCPURandomEngine(seed);
+    for (int64_t i = 0; i < size; ++i) {
+      cpu_data[i] = truncated_normal(dist(*engine));
+    }
+    framework::TensorCopy(
+        cpu_tensor, context.GetPlace(),
+        context.template device_context<platform::DeviceContext>(), tensor);
+    context.template device_context<paddle::platform::NPUDeviceContext>()
+        .Wait();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_NPU_KERNEL(truncated_gaussian_random,
+                       ops::NPUTruncatedGaussianRandomKernel<float>);
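Until the ParameterizedTruncatedNormal path is enabled, the registered kernel draws on the CPU and copies the result to the NPU. The draw maps a uniform sample through the inverse CDF of a normal truncated at two standard deviations, which is consistent with the min/max tensors built above (mean - 2*std, mean + 2*std). Below is a NumPy sketch of that transform, assuming SciPy for erfinv; the actual functor is TruncatedNormal in truncated_gaussian_random_op.h and may differ in detail.

    import numpy as np
    from math import erf, sqrt
    from scipy.special import erfinv

    def truncated_normal(u, mean=0.0, std=1.0):
        # Map uniform samples u in (0, 1) onto a normal truncated at
        # mean +/- 2*std via inverse-CDF sampling.
        a_cdf = (1.0 + erf(-2.0 / sqrt(2.0))) / 2.0  # standard normal CDF at -2
        b_cdf = (1.0 + erf(2.0 / sqrt(2.0))) / 2.0   # standard normal CDF at +2
        p = a_cdf + (b_cdf - a_cdf) * u              # rescale into the kept mass
        return sqrt(2.0) * erfinv(2.0 * p - 1.0) * std + mean

    samples = truncated_normal(np.random.uniform(1e-7, 1.0, 10000), std=2.0)
    assert samples.min() >= -4.0 and samples.max() <= 4.0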
diff --git a/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py
new file mode 100644
index 0000000000..ff89508d19
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from paddle.fluid.executor import Executor
+
+paddle.enable_static()
+SEED = 2021
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestTruncatedNormal(unittest.TestCase):
+    def _test(self, run_npu=True):
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        scope = paddle.fluid.core.Scope()
+
+        main_prog.random_seed = SEED
+        startup_prog.random_seed = SEED
+        np.random.seed(SEED)
+        paddle.seed(SEED)
+
+        with fluid.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                weight_attr = paddle.framework.ParamAttr(
+                    name="linear_weight",
+                    initializer=paddle.nn.initializer.TruncatedNormal(
+                        mean=0.0, std=2.0))
+                linear = paddle.nn.Linear(
+                    2, 2, weight_attr=weight_attr, bias_attr=False)
+
+        if run_npu:
+            place = paddle.NPUPlace(0)
+        else:
+            place = paddle.CPUPlace()
+
+        exe = paddle.static.Executor(place)
+        w = exe.run(startup_prog, fetch_list=['linear_weight'])
+        return w
+
+    def test_npu(self):
+        cpu_w = self._test(False)
+        npu_w = self._test(True)
+
+        self.assertTrue(np.allclose(npu_w, cpu_w))
+
+
+if __name__ == '__main__':
+    unittest.main()
--
GitLab
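Beyond comparing CPU and NPU draws with allclose, the truncation bound itself is easy to assert. A hypothetical helper for the test above (illustrative only, not part of the patch):

    import numpy as np

    def check_truncation(w, mean=0.0, std=2.0, tol=1e-6):
        # Every draw from the truncated distribution must lie within
        # mean +/- 2*std, the bounds the NPU kernel encodes in its
        # min/max tensors.
        w = np.asarray(w)
        assert np.all(w >= mean - 2.0 * std - tol)
        assert np.all(w <= mean + 2.0 * std + tol)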