未验证 提交 a6343afc 编写于 作者: L Leo Chen 提交者: GitHub

[NPU] support npu for memcpy op (#31808)

* support npu for memcpy op

* add ut

* fix ut

* fix typo
上级 3ab39705
...@@ -343,6 +343,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, ...@@ -343,6 +343,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__) __VA_ARGS__)
#define REGISTER_OP_NPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, NPU, ::paddle::platform::NPUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
/** /**
* Macro to mark what Operator and Kernel * Macro to mark what Operator and Kernel
* we will use and tell the compiler to * we will use and tell the compiler to
......
...@@ -103,16 +103,18 @@ class MemcpyOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -103,16 +103,18 @@ class MemcpyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
"is the same as input X."); "is the same as input X.");
AddAttr<int>("dst_place_type", AddAttr<int>("dst_place_type",
"Determine the dst place of tensor copy. " "Determine the dst place of tensor copy. "
"By Now it ONLY support CUDAPlace and CUDAPinnedPlace. Other " "By Now it ONLY support CUDAPlace <-> CUDAPinnedPlace or "
"place type is Unimplemented and will cause ERROR." "NPUPlace <-> CPUPlace. "
"Other place type is Unimplemented and will cause ERROR."
"0: dst is on CPUPlace. " "0: dst is on CPUPlace. "
"1: dst is on CUDAPlace. " "1: dst is on CUDAPlace. "
"2: dst is on CUDAPinnedPlace. " "2: dst is on CUDAPinnedPlace. "
"3: dst is on XPUPlace. "); "3: dst is on XPUPlace. "
"4: dst is on NPUPlace. ");
AddComment(R"DOC( AddComment(R"DOC(
Memcpy Operator. Memcpy Operator.
By now, it ONLY supports the memcopy between CUDAPinnedPlace and CUDAPlace, By now, it ONLY supports the memcopy between CUDAPinnedPlace <-> CUDAPlace or
and used as an internal op by Recompute-Offload. NPUPlace <-> CPUPlace, and used as an internal op by Recompute-Offload.
You would have to update it if you want other more capacities. You would have to update it if you want other more capacities.
Out = X, when type in [LoDTensor] Out = X, when type in [LoDTensor]
...@@ -144,3 +146,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double, ...@@ -144,3 +146,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double,
ops::MemcpyKernel, plat::float16, ops::MemcpyKernel, plat::float16,
ops::MemcpyKernel); ops::MemcpyKernel);
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double,
ops::MemcpyKernel, int, ops::MemcpyKernel,
int64_t, ops::MemcpyKernel, bool,
ops::MemcpyKernel, plat::float16,
ops::MemcpyKernel);
#endif
...@@ -44,7 +44,17 @@ class MemcpyFunctor { ...@@ -44,7 +44,17 @@ class MemcpyFunctor {
} else if (dst_place_type_ == 2) { } else if (dst_place_type_ == 2) {
framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
&out_tensor); &out_tensor);
} else { }
#ifdef PADDLE_WITH_ASCEND_CL
else if (dst_place_type_ == 0) { // NOLINT
framework::TensorCopy(lod_tensor, platform::CPUPlace(), dev_ctx_,
&out_tensor);
} else if (dst_place_type_ == 4) {
framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
&out_tensor);
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"memcpy dst_place_type: %d is not supported yet.", dst_place_type_)); "memcpy dst_place_type: %d is not supported yet.", dst_place_type_));
} }
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import compiler, Program, program_guard
# Static-graph mode is required: the tests below construct Programs by hand
# and run them through fluid.Executor.
paddle.enable_static()
# NOTE(review): SEED is never used in this file — either seed the RNG with it
# or drop it; kept as-is here.
SEED = 2021
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestMemcpy_FillConstant(unittest.TestCase):
    """Exercises the `memcpy` op for copies between NPUPlace and CPUPlace.

    A static Program is built with two 10x10 float32 variables: one filled
    with ones (the NPU-side tensor) and one filled with zeros (the CPU-side
    tensor).  Each test appends a `memcpy` op in one direction and checks
    that both fetched tensors agree afterwards.

    NOTE(review): the method names spell "cpoy" instead of "copy"; they are
    kept verbatim because unittest discovers tests by name.
    """

    def get_prog(self):
        """Build the shared Program.

        Returns:
            (main_program, npu_var, cpu_var) where npu_var is filled with
            ones and cpu_var with zeros.
        """
        paddle.enable_static()
        prog = Program()
        with program_guard(prog):
            block = prog.global_block()

            def make_var(name):
                # Plain non-persistable float32 variable of shape [10, 10].
                return block.create_var(
                    name=name,
                    shape=[10, 10],
                    dtype='float32',
                    persistable=False,
                    stop_gradient=True)

            cpu_var = make_var("tensor@Cpu")
            npu_var = make_var("tensor@Npu")

            # Fill the NPU-side tensor with ones.
            # NOTE(review): place_type=1 presumably selects the device place
            # for fill_constant — confirm against that op's attribute docs.
            block.append_op(
                type="fill_constant",
                outputs={"Out": npu_var.name},
                attrs={
                    "shape": [10, 10],
                    "dtype": npu_var.dtype,
                    "value": 1.0,
                    "place_type": 1
                })
            # Fill the CPU-side tensor with zeros.
            block.append_op(
                type="fill_constant",
                outputs={"Out": cpu_var.name},
                attrs={
                    "shape": [10, 10],
                    "dtype": cpu_var.dtype,
                    "value": 0.0,
                    "place_type": 2
                })
        return prog, npu_var, cpu_var

    def test_npu_cpoy_to_cpu(self):
        """memcpy with dst_place_type=0 copies the NPU tensor to CPUPlace."""
        prog, npu_var, cpu_var = self.get_prog()
        prog.global_block().append_op(
            type='memcpy',
            inputs={'X': npu_var},
            outputs={'Out': cpu_var},
            attrs={'dst_place_type': 0})
        exe = fluid.Executor(fluid.NPUPlace(0))
        npu_out, cpu_out = exe.run(prog,
                                   feed={},
                                   fetch_list=[npu_var.name, cpu_var.name])
        # The CPU tensor must now hold the ones that lived on the NPU.
        self.assertTrue(np.allclose(npu_out, cpu_out))
        self.assertTrue(np.allclose(cpu_out, np.ones((10, 10))))

    def test_cpu_cpoy_npu(self):
        """memcpy with dst_place_type=4 copies the CPU tensor to NPUPlace."""
        prog, npu_var, cpu_var = self.get_prog()
        prog.global_block().append_op(
            type='memcpy',
            inputs={'X': cpu_var},
            outputs={'Out': npu_var},
            attrs={'dst_place_type': 4})
        exe = fluid.Executor(fluid.NPUPlace(0))
        npu_out, cpu_out = exe.run(prog,
                                   feed={},
                                   fetch_list=[npu_var.name, cpu_var.name])
        # The NPU tensor must now hold the zeros that lived on the CPU.
        self.assertTrue(np.allclose(npu_out, cpu_out))
        self.assertTrue(np.allclose(npu_out, np.zeros((10, 10))))
# Standard unittest entry point: discovers and runs the TestMemcpy_* cases
# when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册