[NPU] support npu for memcpy op (#31808)

* support npu for memcpy op
* add ut
* fix ut
* fix typo

[NPU] support npu for memcpy op (#31808)
* support npu for memcpy op
* add ut
* fix ut
* fix typo
a6343afc · Leo Chen · GitHub · 3ab39705 · a6343afc · a6343afc
4 changed files
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -343,6 +343,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
      __VA_ARGS__)

+// Registers NPU kernel functors for `op_type`. The variadic arguments are
+// forwarded unchanged to REGISTER_OP_KERNEL_EX together with the `NPU` tag,
+// ::paddle::platform::NPUPlace, DEFAULT_TYPE and
+// OpKernelType::kDefaultCustomizedTypeValue.
+#define REGISTER_OP_NPU_KERNEL_FUNCTOR(op_type, ...)                  \
+  REGISTER_OP_KERNEL_EX(                                              \
+      op_type, NPU, ::paddle::platform::NPUPlace, DEFAULT_TYPE,       \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)
+
 /**
 * Macro to mark what Operator and Kernel
 * we will use and tell the compiler to

--- a/paddle/fluid/operators/memcpy_op.cc
+++ b/paddle/fluid/operators/memcpy_op.cc
@@ -103,16 +103,18 @@ class MemcpyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
              "is the same as input X.");
    AddAttr<int>("dst_place_type",
                 "Determine the dst place of tensor copy. "
-                 "By Now it ONLY support CUDAPlace and CUDAPinnedPlace. Other "
-                 "place type is Unimplemented and will cause ERROR."
+                 "By Now it ONLY support CUDAPlace <-> CUDAPinnedPlace or "
+                 "NPUPlace <-> CPUPlace. "
+                 "Other place type is Unimplemented and will cause ERROR. "
                 "0: dst is on CPUPlace. "
                 "1: dst is on CUDAPlace. "
                 "2: dst is on CUDAPinnedPlace. "
-                 "3: dst is on XPUPlace. ");
+                 "3: dst is on XPUPlace. "
+                 "4: dst is on NPUPlace. ");
    AddComment(R"DOC(
    Memcpy Operator.
-    By now, it ONLY supports the memcopy between CUDAPinnedPlace and CUDAPlace,
-    and used as an internal op by Recompute-Offload.
+    By now, it ONLY supports the memcopy between CUDAPinnedPlace <-> CUDAPlace or 
+    NPUPlace <-> CPUPlace, and used as an internal op by Recompute-Offload.
    You would have to update it if you want other more capacities.

 Out = X,  when type in [LoDTensor]
@@ -144,3 +146,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double,
                                ops::MemcpyKernel, plat::float16,
                                ops::MemcpyKernel);
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+// Register ops::MemcpyKernel as the NPU kernel functor of the `memcpy` op for
+// float, double, int, int64_t, bool and plat::float16. Compiled only when
+// PADDLE_WITH_ASCEND_CL is defined.
+REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double,
+                               ops::MemcpyKernel, int, ops::MemcpyKernel,
+                               int64_t, ops::MemcpyKernel, bool,
+                               ops::MemcpyKernel, plat::float16,
+                               ops::MemcpyKernel);
+#endif
--- a/paddle/fluid/operators/memcpy_op.h
+++ b/paddle/fluid/operators/memcpy_op.h
@@ -44,7 +44,17 @@ class MemcpyFunctor {
    } else if (dst_place_type_ == 2) {
      framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
                            &out_tensor);
-    } else {
+    }
+#ifdef PADDLE_WITH_ASCEND_CL
+    else if (dst_place_type_ == 0) {  // NOLINT
+      framework::TensorCopy(lod_tensor, platform::CPUPlace(), dev_ctx_,
+                            &out_tensor);
+    } else if (dst_place_type_ == 4) {
+      framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
+                            &out_tensor);
+    }
+#endif
+    else {  // NOLINT
      PADDLE_THROW(platform::errors::Unimplemented(
          "memcpy dst_place_type: %d is not supported yet.", dst_place_type_));
    }

--- a/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py
+#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid import compiler, Program, program_guard
+
+paddle.enable_static()
+SEED = 2021
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestMemcpy_FillConstant(unittest.TestCase):
+    """Exercise the `memcpy` op in both directions between two variables.
+
+    Each test builds a static program in which "tensor@Npu" is filled with
+    1.0 and "tensor@Cpu" with 0.0, appends a single `memcpy` op, runs the
+    program on NPUPlace(0) and compares the two fetched arrays.
+    """
+
+    def get_prog(self):
+        """Build the program shared by both tests.
+
+        Returns:
+            tuple: (main_program, npu_var, cpu_var). Both variables are
+            float32 with shape [10, 10]; npu_var is filled with 1.0 and
+            cpu_var with 0.0 by `fill_constant` ops.
+        """
+        paddle.enable_static()
+        main_program = Program()
+        with program_guard(main_program):
+            cpu_var_name = "tensor@Cpu"
+            npu_var_name = "tensor@Npu"
+            cpu_var = main_program.global_block().create_var(
+                name=cpu_var_name,
+                shape=[10, 10],
+                dtype='float32',
+                persistable=False,
+                stop_gradient=True)
+            npu_var = main_program.global_block().create_var(
+                name=npu_var_name,
+                shape=[10, 10],
+                dtype='float32',
+                persistable=False,
+                stop_gradient=True)
+            # NOTE(review): the meaning of fill_constant's "place_type" values
+            # (1 here, 2 below) on an NPU build is not visible in this file;
+            # confirm they place the tensors where the variable names imply.
+            main_program.global_block().append_op(
+                type="fill_constant",
+                outputs={"Out": npu_var_name},
+                attrs={
+                    "shape": [10, 10],
+                    "dtype": npu_var.dtype,
+                    "value": 1.0,
+                    "place_type": 1
+                })
+            main_program.global_block().append_op(
+                type="fill_constant",
+                outputs={"Out": cpu_var_name},
+                attrs={
+                    "shape": [10, 10],
+                    "dtype": cpu_var.dtype,
+                    "value": 0.0,
+                    "place_type": 2
+                })
+        return main_program, npu_var, cpu_var
+
+    def test_npu_copy_to_cpu(self):
+        """memcpy with dst_place_type=0: the CPU variable receives the ones."""
+        main_program, npu_var, cpu_var = self.get_prog()
+        main_program.global_block().append_op(
+            type='memcpy',
+            inputs={'X': npu_var},
+            outputs={'Out': cpu_var},
+            attrs={'dst_place_type': 0})
+        place = fluid.NPUPlace(0)
+        exe = fluid.Executor(place)
+        npu_, cpu_ = exe.run(main_program,
+                             feed={},
+                             fetch_list=[npu_var.name, cpu_var.name])
+        # Source and destination must match, and both must hold the source
+        # fill value (1.0).
+        self.assertTrue(np.allclose(npu_, cpu_))
+        self.assertTrue(np.allclose(cpu_, np.ones((10, 10))))
+
+    def test_cpu_copy_to_npu(self):
+        """memcpy with dst_place_type=4: the NPU variable receives the zeros."""
+        main_program, npu_var, cpu_var = self.get_prog()
+        main_program.global_block().append_op(
+            type='memcpy',
+            inputs={'X': cpu_var},
+            outputs={'Out': npu_var},
+            attrs={'dst_place_type': 4})
+        place = fluid.NPUPlace(0)
+        exe = fluid.Executor(place)
+        npu_, cpu_ = exe.run(main_program,
+                             feed={},
+                             fetch_list=[npu_var.name, cpu_var.name])
+        # Source and destination must match, and both must hold the source
+        # fill value (0.0).
+        self.assertTrue(np.allclose(npu_, cpu_))
+        self.assertTrue(np.allclose(npu_, np.zeros((10, 10))))
+
+
+if __name__ == '__main__':
+    unittest.main()