diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
index b6e55051a45efc4bc292f44e52355938d606bf73..7f5bcff4281955222d608ad504ca8a34a7e73dfb 100644
--- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
@@ -476,6 +476,56 @@ void ApplyDeviceGuard(const OperatorBase* op_base,
   }
 }
 
+platform::DeviceContext* ConstructDeviceContext(const OperatorBase* op,
+                                                const platform::Place& place) {
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* default_dev_ctx = pool.Get(place);
+
+  // Replace the default_dev_ctx according to dst_place_type for memcpy op if
+  // needed.
+
+  // NOTE(liudongxue01):
+  // Please apply the following logic in other Executor/Interpreter modules
+  // likewise.
+  //
+
+  // NOTE(liudongxue01):
+  // The following code works around a rare case that the memcpy kernel does
+  // not handle. The case is:
+  //   1. The default place in the current execution context is not CUDAPlace
+  //      but, for example, CPUPlace,
+  //   2. The dst_place_type is 1, which means CUDAPlace,
+  //   3. The expected result place is CUDAPlace, but the actual result is
+  //      CPUPlace.
+  // When the default place is CPUPlace, calling tensor.cuda() hits exactly
+  // this case.
+  //
+  // Q: Why do we not add such logic in the memcpy kernel?
+  // A: (1) To fix the memcpy kernel, we would need to construct a CUDAPlace()
+  //    and a corresponding DeviceContext instance, which is used by the
+  //    phi::Copy(...) api to perform the real memcpy action. (2) We should
+  //    not access the singleton of the DeviceContextPool object in the PHI
+  //    framework, which is designed as a standalone module; all context data
+  //    should be passed into the kernel API through arguments. (3) So we have
+  //    no way to construct a CUDAPlace() in the memcpy kernel and then pass
+  //    it to the phi::Copy(...) api.
+  if (!platform::is_gpu_place(place)) {
+    const auto& op_type = op->Type();
+    if (op_type == "memcpy") {
+      int dst_place_type = op->Attr<int>("dst_place_type");
+      if (dst_place_type == 1) {  // 1 : CUDAPlace
+        auto dev_ctx = pool.Get(paddle::DefaultGPUPlace());
+        VLOG(4) << "Change the device context for memcpy OP: ("
+                << default_dev_ctx->type_info().name() << ") -> ("
+                << dev_ctx->type_info().name() << ")";
+        return dev_ctx;
+      }
+    }
+  }
+
+  return default_dev_ctx;
+}
+
 void HandleOperatorBase(const platform::Place& place,
                         std::shared_ptr<OperatorBase> op,
                         OpFuncNode* op_func_node,
@@ -537,6 +587,8 @@ void BuildOpFuncList(const platform::Place& place,
   }
   auto unused_var_map = GetUnusedVars(block, ops);
 
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+
   bool flag_log_is_printed = false;
   for (size_t i = 0; i < ops.size(); ++i) {
     auto op = ops[i].get();
@@ -655,8 +707,9 @@ void BuildOpFuncList(const platform::Place& place,
           runtime_scope = local_scope;
         }
 
-        auto& pool = platform::DeviceContextPool::Instance();
-        auto* dev_ctx = pool.Get(place);
+        // Construct the device context.
+        auto* dev_ctx = ConstructDeviceContext(op, place);
+
         SetDeviceCommContext(op, dev_ctx);
         auto exec_ctx = ExecutionContext(
             *op_with_kernel, *runtime_scope, *dev_ctx, runtime_context);
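For context, a minimal Python sketch of the scenario described in the NOTE above; it mirrors the new tests later in this patch, assumes a CUDA-enabled build, and uses only names that already appear elsewhere in this diff:

    # Sketch only: mirrors the new tests in this patch; assumes a CUDA build.
    import paddle


    @paddle.jit.to_static
    def copy_to_cuda(x):
        x = paddle.to_tensor(x)
        # In static mode this is translated into a 'memcpy' op with
        # dst_place_type = 1 (CUDAPlace).
        return x.cuda()


    if paddle.fluid.is_compiled_with_cuda():
        paddle.jit.enable_to_static(True)
        # Force the default (expected) place to CPU, so the memcpy op would
        # otherwise be handed a CPU device context.
        paddle.fluid.framework._set_expected_place(paddle.CPUPlace())
        out = copy_to_cuda(paddle.ones([1, 2, 3]))
        # With ConstructDeviceContext() in place, the result should live on a
        # GPU place.
        print(out.place)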
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index 684404d79422a7e0745215764cfb6ea33b88c64c..e95fbacf2f66a180c1eba22dd0e3e2978991b86e 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -146,6 +146,7 @@ def monkey_patch_variable():
             In Static Graph Mode:
 
             .. code-block:: python
+
                 import paddle
                 paddle.enable_static()
 
@@ -162,7 +163,7 @@ def monkey_patch_variable():
             persistable=False,
             stop_gradient=True,
         )
-        # 0 means cpu place, see paddle/fluid/operators/memcpy_op.h
+        # 0 means cpu place, see paddle/phi/kernels/memcpy_kernel.cc
         attrs = {'dst_place_type': 0}
         block.append_op(
             type='memcpy',
@@ -173,13 +174,57 @@ def monkey_patch_variable():
         return output
 
     @static_only
-    def cuda(self):
+    def cuda(self, device_id=None, blocking=True):
         """
-        Variable should not have cpu() and cuda() interface.
-        But this interface can greatly facilitate dy2static.
-        We do nothing here.
+        In dy2static, Variable also needs the cpu() and cuda() interfaces.
+        However, the underlying operator has only a forward op and no
+        backward one.
+
+        Args:
+            self(Variable): The variable itself.
+            device_id(int, optional): The destination GPU device id. Default: None,
+                which means the current device. This argument exists only for
+                dy2static translation; please do not use it.
+            blocking(bool, optional): Whether the copy is blocking or not. Default: True.
+                This argument exists only for dy2static translation; please do not use it.
+
+        Returns:
+            The tensor which has been copied to the CUDA place.
+
+        Examples:
+            In Static Graph Mode:
+
+            .. code-block:: python
+
+                import paddle
+                paddle.enable_static()
+
+                x = paddle.static.data(name="x", shape=[2, 2], dtype='float32')
+                y = x.cpu()
+                z = y.cuda()
         """
-        return self
+        if device_id is not None:
+            warnings.warn("device_id is not supported, and it will be ignored.")
+        if blocking is not True:
+            warnings.warn("blocking is not supported, and it will be ignored.")
+
+        block = current_block(self)
+        tmp_name = unique_tmp_name()
+        output = block.create_var(
+            name=tmp_name,
+            dtype=self.dtype,
+            shape=self.shape,
+            type=self.type,
+            persistable=False,
+            stop_gradient=True,
+        )
+        # 1 means cuda place, see paddle/phi/kernels/memcpy_kernel.cc
+        attrs = {'dst_place_type': 1}
+        block.append_op(
+            type='memcpy',
+            inputs={'X': [self]},
+            outputs={'Out': [output]},
+            attrs=attrs,
+        )
+        return output
 
     @static_only
     def place(self):
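As a rough illustration of what the patched cpu()/cuda() methods append to the static program (the op type and the dst_place_type attribute come from the code above; the inspection loop itself is only a sketch):

    # Sketch only: prints the memcpy ops appended by cpu()/cuda() in static mode.
    import paddle

    paddle.enable_static()

    x = paddle.static.data(name="x", shape=[2, 2], dtype='float32')
    y = x.cpu()   # appends a memcpy op with dst_place_type = 0 (CPUPlace)
    z = y.cuda()  # appends a memcpy op with dst_place_type = 1 (CUDAPlace)

    for op in paddle.static.default_main_program().global_block().ops:
        if op.type == 'memcpy':
            print(op.type, op.attr('dst_place_type'))
    # Expected: two memcpy ops, with dst_place_type 0 and 1 respectively.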
diff --git a/test/dygraph_to_static/test_tensor_memcpy_on_cpu.py b/test/dygraph_to_static/test_tensor_memcpy_on_cpu.py
index a9ea152f7a85ba2ad93f488d3a355c3e3f818051..45f3a0a5550180f6392c2750e57146aae0833362 100644
--- a/test/dygraph_to_static/test_tensor_memcpy_on_cpu.py
+++ b/test/dygraph_to_static/test_tensor_memcpy_on_cpu.py
@@ -26,6 +26,20 @@ def tensor_copy_to_cpu(x):
     return y
 
 
+@paddle.jit.to_static
+def tensor_copy_to_cuda(x):
+    x = paddle.to_tensor(x)
+    y = x.cuda()
+    return y
+
+
+@paddle.jit.to_static
+def tensor_copy_to_cuda_with_warning(x, device_id=None, blocking=True):
+    x = paddle.to_tensor(x)
+    y = x.cuda(device_id, blocking)
+    return y
+
+
 class TestTensorCopyToCpuOnDefaultCPU(unittest.TestCase):
     def _run(self, to_static):
         paddle.jit.enable_to_static(to_static)
@@ -46,5 +60,67 @@ class TestTensorCopyToCpuOnDefaultCPU(unittest.TestCase):
         self.assertTrue(static_place.is_cpu_place())
 
 
+class TestTensorCopyToCUDAOnDefaultCPU(unittest.TestCase):
+    def _run(self, to_static):
+        paddle.jit.enable_to_static(to_static)
+        x1 = paddle.ones([1, 2, 3])
+        x2 = tensor_copy_to_cuda(x1)
+        return x1.place, x2.place, x2.numpy()
+
+    def test_tensor_cuda_on_default_cpu(self):
+        if not paddle.fluid.is_compiled_with_cuda():
+            return
+
+        """
+        Note(liudongxue01): If the following asserts fail, please check
+        whether the workaround logic for the memcpy OP is still taking effect.
+        See ConstructDeviceContext() in interpreter_util.cc.
+        """
+        paddle.fluid.framework._set_expected_place(paddle.CPUPlace())
+        dygraph_x1_place, dygraph_place, dygraph_res = self._run(
+            to_static=False
+        )
+        static_x1_place, static_place, static_res = self._run(to_static=True)
+        np.testing.assert_allclose(dygraph_res, static_res, rtol=1e-05)
+        self.assertTrue(dygraph_x1_place.is_cpu_place())
+        self.assertTrue(static_x1_place.is_cpu_place())
+        self.assertTrue(dygraph_place.is_gpu_place())
+        self.assertTrue(static_place.is_gpu_place())
+
+
+class TestTensorCopyToCUDAWithWarningOnCPU(unittest.TestCase):
+    def _run(self, to_static):
+        paddle.jit.enable_to_static(to_static)
+        x1 = paddle.ones([1, 2, 3])
+        x2 = tensor_copy_to_cuda_with_warning(x1, device_id=1, blocking=False)
+        return x1.place, x2.place, x2.numpy()
+
+    def test_with_warning_on_cpu(self):
+        if not paddle.fluid.is_compiled_with_cuda():
+            return
+
+        paddle.fluid.framework._set_expected_place(paddle.CPUPlace())
+
+        x1 = paddle.ones([1, 2, 3])
+        with self.assertWarns(UserWarning, msg="ignored") as cm:
+            x2 = tensor_copy_to_cuda_with_warning(
+                x1, device_id=1, blocking=True
+            )
+        self.assertIn('math_op_patch.py', cm.filename)
+
+        with self.assertWarns(UserWarning, msg="ignored") as cm:
+            x2 = tensor_copy_to_cuda_with_warning(
+                x1, device_id=None, blocking=False
+            )
+        self.assertIn('math_op_patch.py', cm.filename)
+
+        with self.assertWarns(UserWarning, msg="ignored") as cm:
+            x2 = tensor_copy_to_cuda_with_warning(
+                x1, device_id=2, blocking=False
+            )
+        self.assertIn('math_op_patch.py', cm.filename)
+
+
 if __name__ == '__main__':
     unittest.main()
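The warning-related tests above only assert that device_id and blocking are ignored in static mode; a minimal stand-alone sketch of the same check (graph construction alone should be enough to trigger the warnings, no GPU needed until the program is run) might look like:

    # Sketch only: checks the device_id/blocking warnings outside of unittest.
    import warnings

    import paddle

    paddle.enable_static()

    x = paddle.static.data(name="x", shape=[2, 2], dtype='float32')
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        y = x.cuda(device_id=0, blocking=False)  # both arguments are ignored
    print([str(w.message) for w in caught])
    # Expected: two UserWarnings stating that the arguments are not supported.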
+ """ + paddle.fluid.framework._set_expected_place(paddle.CPUPlace()) + dygraph_x1_place, dygraph_place, dygraph_res = self._run( + to_static=False + ) + static_x1_place, static_place, static_res = self._run(to_static=True) + np.testing.assert_allclose(dygraph_res, static_res, rtol=1e-05) + self.assertTrue(dygraph_x1_place.is_cpu_place()) + self.assertTrue(static_x1_place.is_cpu_place()) + self.assertTrue(dygraph_place.is_gpu_place()) + self.assertTrue(static_place.is_gpu_place()) + + +class TestTensorCopyToCUDAWithWarningOnCPU(unittest.TestCase): + def _run(self, to_static): + paddle.jit.enable_to_static(to_static) + x1 = paddle.ones([1, 2, 3]) + x2 = tensor_copy_to_cuda_with_warning(x1, device_id=1, blocking=False) + return x1.place, x2.place, x2.numpy() + + def test_with_warning_on_cpu(self): + if not paddle.fluid.is_compiled_with_cuda(): + return + + paddle.fluid.framework._set_expected_place(paddle.CPUPlace()) + + x1 = paddle.ones([1, 2, 3]) + with self.assertWarns(UserWarning, msg="ignored") as cm: + x2 = tensor_copy_to_cuda_with_warning( + x1, device_id=1, blocking=True + ) + self.assertIn('math_op_patch.py', cm.filename) + + with self.assertWarns(UserWarning, msg="ignored") as cm: + x2 = tensor_copy_to_cuda_with_warning( + x1, device_id=None, blocking=False + ) + self.assertIn('math_op_patch.py', cm.filename) + + with self.assertWarns(UserWarning, msg="ignored") as cm: + x2 = tensor_copy_to_cuda_with_warning( + x1, device_id=2, blocking=False + ) + self.assertIn('math_op_patch.py', cm.filename) + + if __name__ == '__main__': unittest.main() diff --git a/test/dygraph_to_static/test_tensor_memcpy_on_gpu.py b/test/dygraph_to_static/test_tensor_memcpy_on_gpu.py index 4f7479d680bb433e4046549fb3eb42b645955020..de642dd4087e8130802317227db0c10f2d626237 100644 --- a/test/dygraph_to_static/test_tensor_memcpy_on_gpu.py +++ b/test/dygraph_to_static/test_tensor_memcpy_on_gpu.py @@ -27,6 +27,20 @@ def tensor_copy_to_cpu(x): return y +@paddle.jit.to_static +def tensor_copy_to_cuda(x): + x = paddle.to_tensor(x) + y = x.cuda() + return y + + +@paddle.jit.to_static +def tensor_copy_to_cuda_with_warning(x, device_id=None, blocking=True): + x = paddle.to_tensor(x) + y = x.cuda(device_id, blocking) + return y + + class TestTensorCopyToCpuOnDefaultGPU(unittest.TestCase): def _run(self, to_static): paddle.jit.enable_to_static(to_static) @@ -53,5 +67,67 @@ class TestTensorCopyToCpuOnDefaultGPU(unittest.TestCase): self.assertTrue(static_place.is_cpu_place()) +class TestTensorCopyToCUDAOnDefaultGPU(unittest.TestCase): + def _run(self, to_static): + paddle.jit.enable_to_static(to_static) + x1 = paddle.ones([1, 2, 3]) + x2 = tensor_copy_to_cuda(x1) + return x1.place, x2.place, x2.numpy() + + def test_tensor_cuda_on_default_gpu(self): + if paddle.fluid.is_compiled_with_cuda(): + place = paddle.CUDAPlace( + int(os.environ.get('FLAGS_selected_gpus', 0)) + ) + else: + return + paddle.fluid.framework._set_expected_place(place) + dygraph_x1_place, dygraph_place, dygraph_res = self._run( + to_static=False + ) + static_x1_place, static_place, static_res = self._run(to_static=True) + np.testing.assert_allclose(dygraph_res, static_res, rtol=1e-05) + self.assertTrue(dygraph_x1_place.is_gpu_place()) + self.assertTrue(static_x1_place.is_gpu_place()) + self.assertTrue(dygraph_place.is_gpu_place()) + self.assertTrue(static_place.is_gpu_place()) + + +class TestTensorCopyToCUDAWithWarningOnGPU(unittest.TestCase): + def _run(self, to_static): + paddle.jit.enable_to_static(to_static) + x1 = paddle.ones([1, 2, 3]) + x2 = 
+        return x1.place, x2.place, x2.numpy()
+
+    def test_with_warning_on_gpu(self):
+        if paddle.fluid.is_compiled_with_cuda():
+            place = paddle.CUDAPlace(
+                int(os.environ.get('FLAGS_selected_gpus', 0))
+            )
+        else:
+            return
+        paddle.fluid.framework._set_expected_place(place)
+
+        x1 = paddle.ones([1, 2, 3])
+        with self.assertWarns(UserWarning, msg="ignored") as cm:
+            x2 = tensor_copy_to_cuda_with_warning(
+                x1, device_id=1, blocking=True
+            )
+        self.assertIn('math_op_patch.py', cm.filename)
+
+        with self.assertWarns(UserWarning, msg="ignored") as cm:
+            x2 = tensor_copy_to_cuda_with_warning(
+                x1, device_id=None, blocking=False
+            )
+        self.assertIn('math_op_patch.py', cm.filename)
+
+        with self.assertWarns(UserWarning, msg="ignored") as cm:
+            x2 = tensor_copy_to_cuda_with_warning(
+                x1, device_id=2, blocking=False
+            )
+        self.assertIn('math_op_patch.py', cm.filename)
+
+
 if __name__ == '__main__':
     unittest.main()