diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 407da71c8c66337ea7613588a2e9a23f1243f1db..4703bbb3939c465badcdb07121acc238269076d8 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -320,6 +320,11 @@ OpFuncType AnalyseOpFuncType(const OpFuncNode& op_func_node, return OpFuncType::kGpuSync; } + // for memcpy explicitly called by user + if (platform::is_gpu_place(place) && op->Type() == interpreter::kMemcpyD2H) { + return OpFuncType::kGpuSync; + } + if (op->Type() == "shape") { return OpFuncType::kGpuSync; } diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 0e834343e2800bacd2ba043c1aa4eab3d3b5fb0c..293ca8da772c7110966c92bedbdb3ae34543263e 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -663,8 +663,6 @@ def _insert_memcopy(block, idx, src_var, dist_context, direction="D2H"): # TODO to support CUDAPinned/NPU/XPU Places if direction == "D2H": dst_place_type = 0 - elif direction == "D2H": - dst_place_type = 1 else: raise NotImplementedError( "direction [{}] is not supported yet.".format(direction) @@ -673,7 +671,7 @@ def _insert_memcopy(block, idx, src_var, dist_context, direction="D2H"): attrs = {'dst_place_type': dst_place_type} new_op = block._insert_op_without_sync( index=idx, - type='memcpy', + type='memcpy_d2h', inputs={'X': [src_var]}, outputs={'Out': [output_var]}, attrs=attrs,