From 2259ced150cc4cd96654e14dfa81ead28890b922 Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Fri, 23 Dec 2022 19:18:43 +0800 Subject: [PATCH] [AutoParallel-Performance] AMP Flag Memcpy support newexe Overlap (#49219) * memcpy overlap * memcpy newexe --- .../framework/new_executor/interpreter/interpreter_util.cc | 5 +++++ python/paddle/distributed/passes/auto_parallel_fp16.py | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 407da71c8c..4703bbb393 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -320,6 +320,11 @@ OpFuncType AnalyseOpFuncType(const OpFuncNode& op_func_node, return OpFuncType::kGpuSync; } + // for memcpy explicitly called by user + if (platform::is_gpu_place(place) && op->Type() == interpreter::kMemcpyD2H) { + return OpFuncType::kGpuSync; + } + if (op->Type() == "shape") { return OpFuncType::kGpuSync; } diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 0e834343e2..293ca8da77 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -663,8 +663,6 @@ def _insert_memcopy(block, idx, src_var, dist_context, direction="D2H"): # TODO to support CUDAPinned/NPU/XPU Places if direction == "D2H": dst_place_type = 0 - elif direction == "D2H": - dst_place_type = 1 else: raise NotImplementedError( "direction [{}] is not supported yet.".format(direction) @@ -673,7 +671,7 @@ def _insert_memcopy(block, idx, src_var, dist_context, direction="D2H"): attrs = {'dst_place_type': dst_place_type} new_op = block._insert_op_without_sync( index=idx, - type='memcpy', + type='memcpy_d2h', inputs={'X': [src_var]}, outputs={'Out': [output_var]}, attrs=attrs, -- GitLab