Commit 5866a7a5 (unverified)
Authored Sep 11, 2019 by chengduo; committed via GitHub on Sep 11, 2019
Enable fused_all_reduce_op_handle support GPU and CPU Gradients (#19418)
* Enable fused_all_reduce_op_handle support GPU and CPU Gradients
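The change funnels both the plain and the fused all-reduce handle through one entry point that checks where the gradient buffers live and then either hands them to NCCL or sums them on the CPU and copies the result back to every place. The following is a minimal, self-contained sketch of that dispatch idea only; GradientBuffer, reduce_on_cpu, and all_reduce are hypothetical names, not Paddle's API.

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-ins; Paddle's real handles operate on LoDTensor buffers.
struct GradientBuffer {
  bool on_gpu;              // where this rank's gradient lives
  std::vector<float> data;  // the gradient values
};

// CPU fallback: reduce every buffer into buffers[0], then copy the sum back
// to every rank (this mirrors the ReduceBufferData + memory::Copy path).
void reduce_on_cpu(std::vector<GradientBuffer>* buffers) {
  auto& trg = (*buffers)[0].data;
  for (size_t i = 1; i < buffers->size(); ++i)
    for (size_t j = 0; j < trg.size(); ++j) trg[j] += (*buffers)[i].data[j];
  for (size_t i = 1; i < buffers->size(); ++i) (*buffers)[i].data = trg;
}

void all_reduce(std::vector<GradientBuffer>* buffers) {
  bool any_on_cpu = false;
  for (const auto& b : *buffers) any_on_cpu |= !b.on_gpu;
  if (any_on_cpu) {
    // Gradients of CPU-only ops (e.g. CRF) take the CPU reduction path.
    reduce_on_cpu(buffers);
  } else {
    // The real code would enqueue one ncclAllReduce per place here; the CPU
    // routine stands in so this sketch stays runnable without CUDA.
    reduce_on_cpu(buffers);
  }
}

int main() {
  std::vector<GradientBuffer> grads = {{false, {1, 2}}, {false, {3, 4}}};
  all_reduce(&grads);
  std::cout << grads[0].data[0] << " " << grads[1].data[1] << "\n";  // 4 6
}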
Parent: 3e5fb636
Showing 17 changed files with 377 additions and 240 deletions (+377 −240)
Changed files:

paddle/fluid/framework/details/all_reduce_op_handle.cc                          +115 −81
paddle/fluid/framework/details/all_reduce_op_handle.h                             +9  −1
paddle/fluid/framework/details/build_strategy.cc                                 +14  −7
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc                     +56 −73
paddle/fluid/framework/details/fused_all_reduce_op_handle.h                       +9 −14
paddle/fluid/framework/details/multi_devices_helper.h                             +5  −0
paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc                     +1  −1
paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc                           +34 −22
paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc      +30 −20
paddle/fluid/framework/ir/graph.h                                                +20 −12
paddle/fluid/framework/ir/graph_test.cc                                          +46  −0
paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc              +12  −2
paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h                +3  −0
paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc           +10  −1
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc    +10  −2
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc    +3  −3
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h     +0  −1

paddle/fluid/framework/details/all_reduce_op_handle.cc

@@ -40,43 +40,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places)
-    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
-#endif
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-void AllReduceOpHandle::RunAllReduceFuncs(
-    const std::vector<std::function<void()>> &all_reduce_calls) {
-  this->RunAndRecordEvent([&] {
-    if (all_reduce_calls.size() == 1UL) {
-      // Do not use NCCLGroup when manage NCCL by per thread per device
-      all_reduce_calls[0]();
-    } else {
-      platform::NCCLGroupGuard guard;
-      for (auto &call : all_reduce_calls) {
-        call();
-      }
-    }
-  });
-
-  if (FLAGS_sync_nccl_allreduce) {
-    for (auto &p : places_) {
-      int dev_id = boost::get<platform::CUDAPlace>(p).device;
-      auto *nccl_ctxs =
-          nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_);
-      auto &nccl_ctx = nccl_ctxs->at(dev_id);
-      auto stream = nccl_ctx.stream();
-      cudaError_t e_sync = cudaStreamSynchronize(stream);
-      if (e_sync != 0) {
-        LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync);
-      }
-
-      cudaError_t e_get = cudaGetLastError();
-      if (e_get != 0) {
-        LOG(FATAL) << "cudaGetLastError " << cudaGetErrorString(e_get)
-                   << " errno:" << e_get;
-      }
-    }
-  }
-}
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+}
 #endif

@@ -84,81 +49,150 @@ void AllReduceOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name());
   WaitInputVarGenerated();
+  std::vector<VarHandleBase *> inputs = this->Inputs();
+  std::vector<VarHandleBase *> outputs = this->Outputs();
+  auto in_var_handles = DynamicCast<VarHandle>(inputs);
+  auto out_var_handles = DynamicCast<VarHandle>(outputs);
+  AllReduceImpl(in_var_handles, out_var_handles);
+}
-  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());

+void AllReduceOpHandle::AllReduceImpl(
+    const std::vector<VarHandle *> &in_var_handles,
+    const std::vector<VarHandle *> &out_var_handles) {
+  size_t num_places = places_.size();
   PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), places_.size(),
+      in_var_handles.size(), num_places,
       "The NoDummyInputSize should be equal to the number of places.");
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
       "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+  PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places);

-  std::vector<const LoDTensor *> lod_tensors;
-  for (size_t i = 0; i < local_scopes_.size(); ++i) {
+  std::vector<const void *> lod_tensor_data;
+  std::vector<platform::Place> places;
+  lod_tensor_data.reserve(num_places);
+  places.reserve(num_places);
+  int64_t numel = -1;
+  bool is_gpu_place = false;
+  auto dtype = static_cast<framework::proto::VarType::Type>(0);
+  for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
     auto &local_scope = local_exec_scopes_[i];
-    auto &lod_tensor =
-        local_scope->FindVar(in_var_handles[i]->name())->Get<LoDTensor>();
-    lod_tensors.emplace_back(&lod_tensor);
+    auto var = local_scope->FindVar(in_var_handles[i]->name());
+    PADDLE_ENFORCE_NOT_NULL(var, "%s is not found int scope.",
+                            in_var_handles[i]->name());
+    auto &lod_tensor = var->Get<LoDTensor>();
+
+    if (i == 0) {
+      numel = static_cast<int64_t>(lod_tensor.numel());
+      dtype = lod_tensor.type();
+      is_gpu_place = platform::is_gpu_place(lod_tensor.place());
+    }
+    PADDLE_ENFORCE_EQ(numel, static_cast<int64_t>(lod_tensor.numel()));
+    PADDLE_ENFORCE_EQ(dtype, lod_tensor.type());
+    PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()));
+
+    lod_tensor_data.emplace_back(lod_tensor.data<void>());
+    places.emplace_back(lod_tensor.place());
+
     VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
              << ", out_name:" << out_var_handles[i]->name();
+
     PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
                       "The name of input and output should be equal.");
   }

+  std::vector<std::string> grad_var_names;
+  grad_var_names.reserve(num_places);
+  for (auto &out_var : out_var_handles) {
+    grad_var_names.emplace_back(out_var->Name());
+  }
+
+  AllReduceFunc(lod_tensor_data, dtype, numel, places, grad_var_names);
+}
+
+void AllReduceOpHandle::AllReduceFunc(
+    std::vector<const void *> lod_tensor_data,
+    const framework::proto::VarType::Type &dtype, int64_t numel,
+    const std::vector<platform::Place> &places,
+    const std::vector<std::string> &out_var_names) {
-  if (platform::is_gpu_place(lod_tensors[0]->place())) {
+  if (is_gpu_place(places[0])) {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
-    int dtype = -1;
-    size_t numel = 0;
+    PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+    ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
     std::vector<std::function<void()>> all_reduce_calls;
-    for (size_t i = 0; i < local_scopes_.size(); ++i) {
-      auto &p = places_[i];
-      auto &lod_tensor = *lod_tensors[i];
-      void *buffer = const_cast<void *>(lod_tensor.data<void>());
-      if (dtype == -1) {
-        dtype = platform::ToNCCLDataType(lod_tensor.type());
-      }
-      if (numel == 0) {
-        numel = static_cast<size_t>(lod_tensor.numel());
-      }
+    for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
+      auto &p = places[i];
+      void *buffer = const_cast<void *>(lod_tensor_data.at(i));
       all_reduce_calls.emplace_back([=] {
-        NCCLAllReduce(p, buffer, buffer, numel,
-                      static_cast<ncclDataType_t>(dtype), ncclSum);
+        NCCLAllReduce(p, buffer, buffer, numel, nccl_dtype, ncclSum);
       });
     }
-    VLOG(10) << "allreduce size:" << numel * SizeOfType(lod_tensors[0]->type());
-    RunAllReduceFuncs(all_reduce_calls);
+    NCCLAllReduceFunc(all_reduce_calls);
 #else
-    PADDLE_THROW("Not compiled with CUDA");
+    PADDLE_THROW("Not compiled with CUDA.");
 #endif
   } else {  // Special handle CPU only Operator's gradient. Like CRF
-    auto &trg = *this->local_exec_scopes_[0]
-                     ->FindVar(out_var_handles[0]->name())
-                     ->GetMutable<framework::LoDTensor>();
-
-    // Reduce All Tensor to trg in CPU
-    ReduceLoDTensor func(lod_tensors, &trg);
-    VisitDataType(lod_tensors[0]->type(), func);
-
-    for (size_t i = 1; i < local_scopes_.size(); ++i) {
+    auto &trg = *local_exec_scopes_[0]
+                     ->FindVar(out_var_names[0])
+                     ->GetMutable<LoDTensor>();
+
+    // Reduce All Tensor to trg in CPU
+    ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
+    VisitDataType(trg.type(), func);
+
+    for (size_t i = 1; i < local_exec_scopes_.size(); ++i) {
       auto &scope = local_exec_scopes_[i];
-      auto &p = places_[i];
-      auto *var = scope->FindVar(out_var_handles[i]->name());
-      auto *dev_ctx = dev_ctxes_.at(p);
-      RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
-        auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
-        auto &tensor_cpu = trg;
-        TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
+      auto &p = places[i];
+      auto *var = scope->FindVar(out_var_names[i]);
+      size_t size = numel * SizeOfType(trg.type());
+      RunAndRecordEvent(p, [&trg, var, p, size] {
+        auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
+        platform::CPUPlace cpu_place;
+        memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
       });
     }
   }
+  VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype);
 }

+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+void AllReduceOpHandle::NCCLAllReduceFunc(
+    const std::vector<std::function<void()>> &all_reduce_calls) {
+  this->RunAndRecordEvent([&] {
+    if (all_reduce_calls.size() == 1UL) {
+      // Do not use NCCLGroup when manage NCCL by per thread per device
+      all_reduce_calls[0]();
+    } else {
+      platform::NCCLGroupGuard guard;
+      for (auto &call : all_reduce_calls) {
+        call();
+      }
+    }
+  });
+
+  if (FLAGS_sync_nccl_allreduce) {
+    for (auto &p : places_) {
+      int dev_id = boost::get<platform::CUDAPlace>(p).device;
+      auto *nccl_ctxs =
+          nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_);
+      auto &nccl_ctx = nccl_ctxs->at(dev_id);
+      auto stream = nccl_ctx.stream();
+      cudaError_t e_sync = cudaStreamSynchronize(stream);
+      if (e_sync != 0) {
+        LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync);
+      }
+
+      cudaError_t e_get = cudaGetLastError();
+      if (e_get != 0) {
+        LOG(FATAL) << "cudaGetLastError " << cudaGetErrorString(e_get)
+                   << " errno:" << e_get;
+      }
+    }
+  }
+}
+#endif

 std::string AllReduceOpHandle::Name() const { return "all_reduce"; }
 }  // namespace details
 }  // namespace framework

paddle/fluid/framework/details/all_reduce_op_handle.h

@@ -61,9 +61,17 @@ class AllReduceOpHandle : public OpHandleBase {
 #endif

 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  void RunAllReduceFuncs(
+  void NCCLAllReduceFunc(
       const std::vector<std::function<void()>> &all_reduce_calls);
 #endif
+
+  void AllReduceImpl(const std::vector<VarHandle *> &in_var_handles,
+                     const std::vector<VarHandle *> &out_var_handles);
+
+  void AllReduceFunc(std::vector<const void *> lod_tensor_data,
+                     const framework::proto::VarType::Type &dtype,
+                     int64_t numel, const std::vector<platform::Place> &places,
+                     const std::vector<std::string> &out_var_handles);
 };

 }  // namespace details

paddle/fluid/framework/details/build_strategy.cc

@@ -83,12 +83,20 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
           << "Currently, fuse_all_optimizer_ops doesn't work under "
              "parallel_graph.";
       strategy_.fuse_all_optimizer_ops_ = false;
+      VLOG_IF(3, strategy_.fuse_all_reduce_ops_)
+          << "fuse_all_reduce_ops doesn't work under "
+             "parallel_graph.";
+      strategy_.fuse_all_reduce_ops_ = false;
     }
     if (strategy_.is_distribution_) {
       VLOG_IF(3, strategy_.fuse_all_optimizer_ops_)
          << "Currently, fuse_all_optimizer_ops only works under "
             "Non-distributed mode.";
       strategy_.fuse_all_optimizer_ops_ = false;
+      VLOG_IF(3, strategy_.fuse_all_reduce_ops_)
+          << "Currently, fuse_all_reduce_ops_ only works under "
+             "Non-distributed mode.";
+      strategy_.fuse_all_reduce_ops_ = false;
     }
     if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
       VLOG_IF(3, strategy_.fuse_all_optimizer_ops_)

@@ -284,8 +292,8 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->Erase(kLocalScopes);
       pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
                                                     &local_scopes);
-      pass->Erase(ir::kNRanks);
-      pass->Set<size_t>(ir::kNRanks, new size_t(nranks));
+      pass->Erase(kNRanks);
+      pass->Set<size_t>(kNRanks, new size_t(nranks));

 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
       platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;

@@ -293,6 +301,8 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
 #endif
     } else if (pass->Type() == "fuse_all_reduce_op_pass") {
+      pass->Erase(kNRanks);
+      pass->Set<size_t>(kNRanks, new size_t(nranks));
       pass->Erase(kPlaces);
       pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
       pass->Erase(kLocalScopes);

@@ -307,11 +317,8 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                      new bool(use_hierarchical_allreduce_));
 #endif
     } else if (pass->Type() == "coalesce_grad_tensor_pass") {
-      pass->Erase(kPlaces);
-      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
-      pass->Erase(kLocalScopes);
-      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
-                                                    &local_scopes);
+      pass->Erase(kNRanks);
+      pass->Set<size_t>(kNRanks, new size_t(nranks));
     } else if (pass->Type() == "sequential_execution_pass") {
       LOG(INFO) << "set enable_sequential_execution:"
                 << enable_sequential_execution_;

paddle/fluid/framework/details/fused_all_reduce_op_handle.cc

@@ -33,28 +33,18 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle(
     ir::Node *node, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
     const platform::NCCLCommunicator *ctxs)
-    : NCCLOpHandleBase(node, places, ctxs),
-      local_scopes_(local_scopes),
-      num_of_all_reduce_(num_of_all_reduce) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
-}
+    : AllReduceOpHandle(node, local_scopes, places, ctxs),
+      num_of_all_reduce_(num_of_all_reduce) {}
 #else
 FusedAllReduceOpHandle::FusedAllReduceOpHandle(
     ir::Node *node, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places, const size_t num_of_all_reduce)
-    : OpHandleBase(node),
-      local_scopes_(local_scopes),
-      places_(places),
-      num_of_all_reduce_(num_of_all_reduce) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
-}
+    : AllReduceOpHandle(node, local_scopes, places),
+      num_of_all_reduce_(num_of_all_reduce) {}
 #endif

 void FusedAllReduceOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name());
   VLOG(4) << this->DebugString();
   WaitInputVarGenerated();

@@ -71,6 +61,30 @@ void FusedAllReduceOpHandle::RunImpl() {
       in_var_handles.size(), out_var_handles.size(),
       "The NoDummyInputSize and NoDummyOutputSize should be equal.");

+  // Note: some gradient op doesn't have CUDAKernel, so the gradients of
+  // those op are in CPUPlace, in this case, the all reduce should not be fused.
+  if (InputIsInDifferentPlace(in_var_handles)) {
+    for (size_t j = 0; j < num_of_all_reduce_; ++j) {
+      std::vector<VarHandle *> dev_inputs;
+      std::vector<VarHandle *> dev_outputs;
+      dev_inputs.reserve(place_num);
+      dev_outputs.reserve(place_num);
+      for (size_t idx = 0; idx < place_num; ++idx) {
+        dev_inputs.emplace_back(in_var_handles.at(j * place_num + idx));
+        dev_outputs.emplace_back(out_var_handles.at(j * place_num + idx));
+      }
+      AllReduceImpl(dev_inputs, dev_outputs);
+    }
+  } else {
+    FusedAllReduceFunc(in_var_handles, out_var_handles);
+  }
+}
+
+void FusedAllReduceOpHandle::FusedAllReduceFunc(
+    const std::vector<VarHandle *> &in_var_handles,
+    const std::vector<VarHandle *> &out_var_handles) {
+  size_t place_num = places_.size();
+
   GradientAndLoDTensor grads_tensor;
   grads_tensor.resize(place_num);

@@ -87,14 +101,11 @@ void FusedAllReduceOpHandle::RunImpl() {
         static_cast<framework::proto::VarType::Type>(0);
     GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num);

-    if (numel == -1) {
+    if (scope_idx == 0) {
       numel = element_num;
-    }
-    if (dtype == static_cast<framework::proto::VarType::Type>(0)) {
       dtype = ele_dtype;
       PADDLE_ENFORCE_NE(ele_dtype,
                         static_cast<framework::proto::VarType::Type>(0));
     }
     PADDLE_ENFORCE_EQ(ele_dtype, dtype);

     // Check whether the address space is contiguous.

@@ -134,66 +145,36 @@ void FusedAllReduceOpHandle::RunImpl() {
   }

   std::vector<const void *> lod_tensor_data;
+  lod_tensor_data.reserve(place_num);
   for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
     auto data = grads_tensor.at(scope_idx).at(0).second->data<void>();
     lod_tensor_data.emplace_back(data);
   }

-  if (platform::is_gpu_place(places_[0])) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
-    int nccl_dtype = platform::ToNCCLDataType(dtype);
-    std::vector<std::function<void()>> all_reduce_calls;
-    for (size_t i = 0; i < local_scopes_.size(); ++i) {
-      auto &p = places_[i];
-      void *buffer = const_cast<void *>(lod_tensor_data.at(i));
-      all_reduce_calls.emplace_back([=] {
-        NCCLAllReduce(p, buffer, buffer, numel,
-                      static_cast<ncclDataType_t>(nccl_dtype), ncclSum);
-      });
-    }
-    this->RunAndRecordEvent([&] {
-      if (all_reduce_calls.size() == 1UL) {
-        // Do not use NCCLGroup when manage NCCL by per thread per device
-        all_reduce_calls[0]();
-      } else {
-        platform::NCCLGroupGuard guard;
-        for (auto &call : all_reduce_calls) {
-          call();
-        }
-      }
-    });
-#else
-    PADDLE_THROW("Not compiled with CUDA");
-#endif
-  } else {
-    // Special handle CPU only Operator's gradient. Like CRF
-    auto grad_name = grads_tensor.at(0).at(0).first;
-    auto &trg = *this->local_exec_scopes_[0]
-                     ->FindVar(grad_name)
-                     ->GetMutable<framework::LoDTensor>();
-
-    // Reduce All data to trg in CPU
-    ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
-    VisitDataType(trg.type(), func);
-
-    for (size_t i = 1; i < local_exec_scopes_.size(); ++i) {
-      auto &scope = *local_exec_scopes_[i];
-      auto &p = places_[i];
-      auto *var = scope.FindVar(grad_name);
-      auto *dev_ctx = dev_ctxes_.at(p);
-      size_t size = numel * SizeOfType(trg.type());
-      RunAndRecordEvent(p, [&trg, var, dev_ctx, p, size] {
-        auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
-        platform::CPUPlace cpu_place;
-        memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
-      });
-    }
+  std::vector<std::string> grad_var_names;
+  grad_var_names.reserve(place_num);
+  for (auto &grad_t : grads_tensor) {
+    grad_var_names.emplace_back(grad_t.at(0).first);
+  }
+  VLOG(10) << "fusedallreduce size:" << numel * SizeOfType(dtype);
+
+  AllReduceFunc(lod_tensor_data, dtype, numel, this->places_, grad_var_names);
+}
+
+bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
+    const std::vector<VarHandle *> &in_var_handles) const {
+  for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
+    auto *local_scope = local_exec_scopes_[scope_idx];
+    size_t place_num = places_.size();
+    for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
+      auto var_name = in_var_handles[j]->name();
+      auto var = local_scope->FindVar(var_name);
+      PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
+      auto &lod_tensor = var->Get<LoDTensor>();
+      if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
+        return true;
+      }
+    }
   }
+  return false;
 }

 void FusedAllReduceOpHandle::GetGradLoDTensor(

@@ -202,12 +183,14 @@ void FusedAllReduceOpHandle::GetGradLoDTensor(
     std::vector<std::pair<std::string, const LoDTensor *>> *grad_tensor) const {
   auto *local_scope = local_exec_scopes_[scope_idx];
   size_t place_num = places_.size();

   for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
     auto var_name = in_var_handles[j]->name();
     PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
-    auto &lod_tensor = local_scope->FindVar(var_name)->Get<LoDTensor>();
-    PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx));
+    auto var = local_scope->FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
+    auto &lod_tensor = var->Get<LoDTensor>();
+    PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx),
+                      "%s(%d) is not in the right place.", var_name, scope_idx);
     grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
   }
 }

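For context on the fallback above: the fused handle's inputs are laid out with place_num consecutive handles per gradient, so gradient j on device idx sits at index j * place_num + idx, and "un-fusing" simply regroups that flat list per gradient before calling the ordinary AllReduceImpl. A small hypothetical illustration of the index arithmetic (standalone example, not Paddle code):

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main() {
  const size_t place_num = 2;          // two devices
  const size_t num_of_all_reduce = 3;  // three fused gradients
  // Flat handle list in the same order the fused op handle sees it.
  std::vector<std::string> handles;
  for (size_t j = 0; j < num_of_all_reduce; ++j)
    for (size_t idx = 0; idx < place_num; ++idx)
      handles.push_back("grad" + std::to_string(j) + "@dev" +
                        std::to_string(idx));

  // Regroup per gradient, mirroring the j * place_num + idx loop above.
  for (size_t j = 0; j < num_of_all_reduce; ++j) {
    std::cout << "all_reduce group " << j << ":";
    for (size_t idx = 0; idx < place_num; ++idx)
      std::cout << " " << handles.at(j * place_num + idx);
    std::cout << "\n";
  }
}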
paddle/fluid/framework/details/fused_all_reduce_op_handle.h

@@ -17,6 +17,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"

@@ -30,14 +31,14 @@ namespace framework {
 namespace details {

 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-struct FusedAllReduceOpHandle : public NCCLOpHandleBase {
+struct FusedAllReduceOpHandle : public AllReduceOpHandle {
   FusedAllReduceOpHandle(ir::Node *node,
                          const std::vector<Scope *> &local_scopes,
                          const std::vector<platform::Place> &places,
                          const size_t num_of_all_reduce,
                          const platform::NCCLCommunicator *ctxs);
 #else
-struct FusedAllReduceOpHandle : public OpHandleBase {
+struct FusedAllReduceOpHandle : public AllReduceOpHandle {
   FusedAllReduceOpHandle(ir::Node *node,
                          const std::vector<Scope *> &local_scopes,
                          const std::vector<platform::Place> &places,

@@ -45,22 +46,10 @@ struct FusedAllReduceOpHandle : public OpHandleBase {
 #endif
   std::string Name() const override;

-  // Delay and buffer nccl_all_reduce together can significantly increase
-  // performance. Disable this feature by returning false.
-  bool IsMultiDeviceTransfer() override { return true; };
-
  protected:
   void RunImpl() override;
-  std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }

  private:
-  std::vector<Scope *> local_scopes_;
-#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
-  // NCCLOpHandleBase already have these attributes.
-  // Will polish it by class inheritance framework.
-  std::vector<platform::Place> places_;
-#endif
   size_t num_of_all_reduce_;

   // Check the dtype of the input

@@ -74,6 +63,12 @@ struct FusedAllReduceOpHandle : public OpHandleBase {
                         const std::vector<VarHandle *> &out_var_handles,
                         std::vector<std::pair<std::string, const LoDTensor *>>
                             *grad_tensor) const;
+
+  bool InputIsInDifferentPlace(
+      const std::vector<VarHandle *> &in_var_handles) const;
+
+  void FusedAllReduceFunc(const std::vector<VarHandle *> &in_var_handles,
+                          const std::vector<VarHandle *> &out_var_handles);
 };

 }  // namespace details

paddle/fluid/framework/details/multi_devices_helper.h

@@ -42,6 +42,8 @@ typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
     GraphVars;
 constexpr char kGraphVars[] = "vars";

+constexpr char kNRanks[] = "nranks";
+
 constexpr char kPlaces[] = "places";
 constexpr char kLocalScopes[] = "local_scopes";
 constexpr char kNCCLCtxs[] = "nccl_ctxs";

@@ -68,6 +70,9 @@ constexpr char kParamsAndSparseGrads[] = "params_and_sparse_grads";
 typedef std::vector<ProgramDesc> ProgramDescs;
 constexpr char kProgramDescs[] = "program_descs";

+typedef std::unordered_set<std::string> PinnedVars;
+constexpr char kPinnedVars[] = "pinned_vars";
+
 typedef std::vector<std::vector<std::pair<std::string, std::string>>>
     GroupParamsAndGrads;
 constexpr char kGroupParamsAndDenseGrads[] = "group_params_dense_grads";

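The new PinnedVars attribute above is the hinge for several passes in this commit: coalesce_grad_tensor_pass and fuse_optimizer_op_pass record gradient names into the set, while memory_reuse_pass and reference_count_pass treat a variable as non-reusable when it is persistable or pinned. Below is a minimal sketch of that handshake using a hypothetical MiniGraph stand-in rather than Paddle's ir::Graph attribute machinery:

#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>

using PinnedVars = std::unordered_set<std::string>;

struct MiniGraph {
  std::unordered_map<std::string, PinnedVars> attrs;
  // Rough analogue of Graph::GetOrInit<PinnedVars>(kPinnedVars).
  PinnedVars& GetOrInit(const std::string& name) { return attrs[name]; }
  bool Has(const std::string& name) const { return attrs.count(name) > 0; }
};

int main() {
  MiniGraph g;
  // Producer pass: record gradients whose buffers must not be reused.
  g.GetOrInit("pinned_vars").insert("conv2d_0.w_0@GRAD");

  // Consumer pass: a variable is "pinned" if persistable or recorded above.
  auto is_pinned = [&](const std::string& name, bool persistable) {
    return persistable ||
           (g.Has("pinned_vars") && g.GetOrInit("pinned_vars").count(name) > 0);
  };
  std::cout << is_pinned("conv2d_0.w_0@GRAD", false) << "\n";  // 1
  std::cout << is_pinned("tmp_3", false) << "\n";              // 0
}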
paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc

@@ -126,7 +126,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
     });
   }

-  RunAllReduceFuncs(all_reduce_calls);
+  NCCLAllReduceFunc(all_reduce_calls);
 }

 int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {

paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc

@@ -65,28 +65,33 @@ double GetFuseParameterMemorySize() { return FLAGS_fuse_parameter_memory_size; }
 class CoalesceGradTensorPass : public ir::Pass {
  protected:
   void ApplyImpl(ir::Graph *graph) const {
+    if (Get<size_t>(details::kNRanks) <= 1) {
+      VLOG(6) << "The number of place is" << Get<size_t>(details::kNRanks)
+              << ", there doesn't need apply FuseAllReduceOpPass.";
+      return;
+    }
     ir::Graph &result = *graph;
     details::ParamsAndGrads params_grads;
     RecordParamsAndGrads(result, &params_grads);
+    VLOG(10) << "The number of params and grads is:" << params_grads.size();
+    if (params_grads.size() == 0) {
+      return;
+    }
+    auto vars_info = GetVarInfo(result);

     ResetAttribute<details::ParamsAndGrads>(details::kParamsAndDenseGrads,
                                             &result);
     ResetAttribute<details::ParamsAndGrads>(details::kParamsAndSparseGrads,
                                             &result);
     ResetAttribute<details::GroupParamsAndGrads>(
         details::kGroupParamsAndDenseGrads, &result);
-    VLOG(10) << "The number of params and grads is:" << params_grads.size();
-    if (params_grads.size() == 0) {
-      return;
-    }

     auto &p_g_dense_grad =
         result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads);
     auto &p_g_sparse_grad =
         result.Get<details::ParamsAndGrads>(details::kParamsAndSparseGrads);

-    auto vars_info = GetVarInfo(result);
     for (auto &param_grad : params_grads) {
       if (IsLoDTensorType(GetTypeOfVar(vars_info, param_grad.second))) {
         p_g_dense_grad.emplace_back(param_grad);

@@ -118,33 +123,37 @@ class CoalesceGradTensorPass : public ir::Pass {
         p_g_dense_grad.size(), num_of_p_g_dense_grad,
         "The number of p_g_dense_grad is not consistent with before.");

+    auto &pinned_var_set =
+        graph->GetOrInit<details::PinnedVars>(details::kPinnedVars);
+
     if (IsUnifiedDtype(p_g_dense_grad, vars_info)) {
-      SetGradientPersistable(p_g_dense_grad, vars_info);
+      RecordGradients(p_g_dense_grad, vars_info, &pinned_var_set);
       CoalesceTensors(vars_info, p_g_dense_grad, &result);
     } else {
       for (auto &sub_param_grad : group_params_grads) {
-        SetGradientPersistable(p_g_dense_grad, vars_info);
-        PADDLE_ENFORCE(IsUnifiedDtype(sub_param_grad, vars_info),
-                       "The data type of the same group is not consistent.");
+        RecordGradients(p_g_dense_grad, vars_info, &pinned_var_set);
+        PADDLE_ENFORCE_EQ(IsUnifiedDtype(sub_param_grad, vars_info), true,
+                          "The data type of the same group is not consistent.");
         CoalesceTensors(vars_info, sub_param_grad, &result);
       }
     }
   }

-  void SetGradientPersistable(
+  void RecordGradients(
       const std::vector<std::pair<std::string, std::string>> &sub_param_grad,
-      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info)
-      const {
+      const std::unordered_map<std::string, std::vector<ir::Node *>> &vars_info,
+      std::unordered_set<std::string> *pinned_var_set) const {
+    // The Gradients should not be reused during memory optimization.
     for (auto &p_g : sub_param_grad) {
       auto iter = vars_info.find(p_g.second);
-      PADDLE_ENFORCE(iter != vars_info.end(), "%s is not found.", p_g.second);
-      PADDLE_ENFORCE(!iter->second.empty());
-      // Set persistable
+      PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, "%s is not found.",
+                        p_g.second);
+      PADDLE_ENFORCE_EQ(!iter->second.empty(), true);
       for (auto it : iter->second) {
         PADDLE_ENFORCE_NOT_NULL(it->Var());
-        it->Var()->SetPersistable(true);
+        pinned_var_set->insert(it->Var()->Name());
       }
-      PADDLE_ENFORCE(IsLoDTensorType(GetTypeOfVar(vars_info, p_g.second)));
+      PADDLE_ENFORCE_EQ(IsLoDTensorType(GetTypeOfVar(vars_info, p_g.second)),
+                        true);
     }
   }

@@ -411,8 +420,10 @@ class CoalesceGradTensorPass : public ir::Pass {
       const std::unordered_map<std::string, std::vector<Node *>> &vars_info,
       const std::string &var_name) const {
     auto grad_iter = vars_info.find(var_name);
-    PADDLE_ENFORCE(grad_iter != vars_info.end(), "%s is not found.", var_name);
-    PADDLE_ENFORCE(!grad_iter->second.empty());
+    PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true, "%s is not found.",
+                      var_name);
+    PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, "%s is not found.",
+                      var_name);
+    PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var());
     return grad_iter->second.front()->Var();
   }

@@ -483,4 +494,5 @@ class CoalesceGradTensorPass : public ir::Pass {
 }  // namespace paddle

-REGISTER_PASS(coalesce_grad_tensor_pass,
-              paddle::framework::ir::CoalesceGradTensorPass);
+REGISTER_PASS(coalesce_grad_tensor_pass,
+              paddle::framework::ir::CoalesceGradTensorPass)
+    .RequirePassAttr(paddle::framework::details::kNRanks);

paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc

@@ -106,6 +106,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   PADDLE_ENFORCE_LE(
       params_and_dense_grads.size(), aux_var_set.at(kGrad).size(),
       "The number of dense gradients should be little than optimizer ops.");
+
   std::unordered_set<std::string> opt_grad_set(aux_var_set.at(kGrad).size());
   for (auto &p_g : params_and_dense_grads) {
     opt_grad_set.insert(p_g.second);

@@ -138,7 +139,8 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
     auto &fused_vars = result.Get<details::FusedVars>(details::kFusedVars);
     auto iter =
         std::find(fused_vars.begin(), fused_vars.end(), fused_grad.front());
-    PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad.");
+    PADDLE_ENFORCE_EQ(iter != fused_vars.end(), true,
+                      "Not find the fused_grad.");
     fused_vars_name[kGrad] = fused_grad.front();

     // Sort the parameters and auxiliary variables according

@@ -246,18 +248,24 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
     const std::vector<std::string> &params,
     const std::vector<std::string> &grads, const std::string &fused_grad_name,
     ir::Graph *result) const {
+  auto &pinned_var_set =
+      result->GetOrInit<details::PinnedVars>(details::kPinnedVars);
+
   auto vars_info = GetVarInfo(*result);
-  // Set Gradients as Persistable to prevent this var becoming reusable.
+  // The Gradients should not be reused during memory optimization.
   for (auto &grad_var_name : grads) {
     auto iter = vars_info.find(grad_var_name);
-    PADDLE_ENFORCE(iter != vars_info.end());
-    PADDLE_ENFORCE(!iter->second.empty());
+    PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, "%s is not found.",
+                      grad_var_name);
+    PADDLE_ENFORCE_EQ(!iter->second.empty(), true, "%s is not found.",
+                      grad_var_name);
     PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var());
-    PADDLE_ENFORCE(IsLoDTensorType(iter->second.front()->Var()->GetType()),
-                   "Currently the gradient type only should be LoDTensor when "
-                   "fusing optimizer ops.");
+    PADDLE_ENFORCE_EQ(IsLoDTensorType(iter->second.front()->Var()->GetType()),
+                      true,
+                      "Currently the gradient type only should be LoDTensor when "
+                      "fusing optimizer ops.");
     for (auto var : iter->second) {
-      var->Var()->SetPersistable(true);
+      pinned_var_set.insert(var->Var()->Name());
     }
   }

@@ -293,8 +301,9 @@ proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar(
     const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
     const std::string &name) const {
   auto grad_iter = var_nodes.find(name);
-  PADDLE_ENFORCE(grad_iter != var_nodes.end());
-  PADDLE_ENFORCE(grad_iter->second.size() > 0);
+  PADDLE_ENFORCE_EQ(grad_iter != var_nodes.end(), true, "%s is not found.",
+                    name);
+  PADDLE_ENFORCE_GT(grad_iter->second.size(), 0);
   PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var());
   return grad_iter->second.front()->Var()->GetType();
 }

@@ -321,24 +330,25 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars(
     const std::vector<std::pair<std::string, std::string>> &params_grads,
     std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
     std::vector<ir::Node *> *ops) const {
-  PADDLE_ENFORCE_NE(aux_vars_set->count(kParam), static_cast<size_t>(0));
-  auto &param_vec = aux_vars_set->at(kParam);
+  PADDLE_ENFORCE_NE(aux_vars_set->count(kGrad), static_cast<size_t>(0));
+  auto &grad_vec = aux_vars_set->at(kGrad);

-  std::vector<size_t> param_sort_idx;
-  param_sort_idx.reserve(param_vec.size());
+  std::vector<size_t> grad_sort_idx;
+  grad_sort_idx.reserve(grad_vec.size());

   for (auto &p_g : params_grads) {
-    auto iter = std::find(param_vec.begin(), param_vec.end(), p_g.first);
-    PADDLE_ENFORCE(iter != param_vec.end());
-    auto idx = std::distance(param_vec.begin(), iter);
-    param_sort_idx.emplace_back(idx);
+    auto iter = std::find(grad_vec.begin(), grad_vec.end(), p_g.second);
+    PADDLE_ENFORCE_EQ(iter != grad_vec.end(), true,
+                      "%s is not found in grad_vec", p_g.second);
+    auto idx = std::distance(grad_vec.begin(), iter);
+    grad_sort_idx.emplace_back(idx);
   }

   for (auto &aux_vars : *aux_vars_set) {
     std::vector<std::string> sorted_vars;
     sorted_vars.reserve(aux_vars.second.size());
     for (size_t i = 0; i < aux_vars.second.size(); ++i) {
-      sorted_vars.emplace_back(aux_vars.second.at(param_sort_idx[i]));
+      sorted_vars.emplace_back(aux_vars.second.at(grad_sort_idx[i]));
     }
     std::swap(aux_vars.second, sorted_vars);

@@ -354,7 +364,7 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars(
   std::vector<ir::Node *> sorted_ops;
   sorted_ops.reserve(ops->size());
   for (size_t i = 0; i < ops->size(); ++i) {
-    sorted_ops.emplace_back(ops->at(param_sort_idx[i]));
+    sorted_ops.emplace_back(ops->at(grad_sort_idx[i]));
   }
   std::swap(*ops, sorted_ops);
 }

paddle/fluid/framework/ir/graph.h

@@ -85,9 +85,17 @@ class Graph {
     return attrs_.count(attr_name) > 0;
   }

+  template <typename AttrType>
+  AttrType &GetOrInit(const std::string &attr_name) {
+    if (!Has(attr_name)) {
+      Set(attr_name, new AttrType);
+    }
+    return Get<AttrType>(attr_name);
+  }
+
   template <typename AttrType>
   AttrType &Get(const std::string &attr_name) const {
-    PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.",
-                   attr_name);
+    PADDLE_ENFORCE_EQ(Has(attr_name), true, "%s attr not registered for graph.",
+                      attr_name);
     try {
       return *boost::any_cast<AttrType *>(attrs_.at(attr_name));

@@ -101,7 +109,7 @@ class Graph {
   template <typename AttrType>
   void Set(const std::string &attr_name, AttrType *attr) {
-    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph",
-                   attr_name);
+    PADDLE_ENFORCE_EQ(attrs_.count(attr_name), 0, "%s already set in the graph",
+                      attr_name);
     attrs_[attr_name] = attr;
     attr_dels_[attr_name] = [attr, attr_name]() {

@@ -112,14 +120,14 @@ class Graph {
   template <typename AttrType>
   void SetNotOwned(const std::string &attr_name, AttrType *attr) {
-    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph",
-                   attr_name);
+    PADDLE_ENFORCE_EQ(attrs_.count(attr_name), 0, "%s already set in the graph",
+                      attr_name);
     attrs_[attr_name] = attr;
     attr_dels_[attr_name] = []() {};
   }

   void Erase(const std::string &attr_name) {
-    PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph",
-                   attr_name);
+    PADDLE_ENFORCE_NE(attrs_.count(attr_name), 0, "%s not set in the graph",
+                      attr_name);
     attr_dels_[attr_name]();
     attrs_.erase(attr_name);

@@ -130,7 +138,7 @@ class Graph {
   // Create a normal variable with non-null VarDesc.
   ir::Node *CreateVarNode(VarDesc *var_desc) {
-    PADDLE_ENFORCE(var_desc);
+    PADDLE_ENFORCE_NOT_NULL(var_desc);
     auto *x = AddNode(new ir::Node(var_desc));
     x->SetId(num_node_created_++);
     return x;

@@ -138,7 +146,7 @@ class Graph {
   // Create a normal runnable operator with OpDesc.
   ir::Node *CreateOpNode(OpDesc *op_desc) {
-    PADDLE_ENFORCE(op_desc);
+    PADDLE_ENFORCE_NOT_NULL(op_desc);
     auto *x = AddNode(new ir::Node(op_desc));
     x->SetId(num_node_created_++);
     return x;

@@ -178,7 +186,7 @@ class Graph {
   }

   std::unique_ptr<ir::Node> RemoveNode(ir::Node *node) {
-    PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
+    PADDLE_ENFORCE_EQ(node_set_.find(node) != node_set_.end(), true);
     std::unique_ptr<ir::Node> ret;
     ret.reset(nodes_.at(node).release());
     nodes_.erase(node);

@@ -204,7 +212,7 @@ class Graph {
   // This method takes ownership of `node`.
   ir::Node *AddNode(ir::Node *node) {
-    PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
+    PADDLE_ENFORCE_EQ(node_set_.find(node) == node_set_.end(), true);
     nodes_[node].reset(node);
     node_set_.insert(node);
     return node;

paddle/fluid/framework/ir/graph_test.cc

@@ -206,5 +206,51 @@ TEST(GraphTest, WriteAfterWrite) {
   ASSERT_NE(control_dep2, nullptr);
   ASSERT_EQ(control_dep1, control_dep2);
 }
+
+TEST(GraphTest, TestException) {
+  ProgramDesc prog;
+  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
+
+  bool not_met_exception = false;
+  try {
+    g->Erase("no_attr");
+  } catch (const platform::EnforceNotMet &e) {
+    not_met_exception = true;
+  }
+  ASSERT_TRUE(not_met_exception);
+
+  not_met_exception = false;
+  try {
+    g->CreateVarNode(nullptr);
+  } catch (const platform::EnforceNotMet &e) {
+    not_met_exception = true;
+  }
+  ASSERT_TRUE(not_met_exception);
+
+  not_met_exception = false;
+  try {
+    g->CreateOpNode(nullptr);
+  } catch (const platform::EnforceNotMet &e) {
+    not_met_exception = true;
+  }
+  ASSERT_TRUE(not_met_exception);
+
+  not_met_exception = false;
+  try {
+    g->RemoveNode(nullptr);
+  } catch (const platform::EnforceNotMet &e) {
+    not_met_exception = true;
+  }
+  ASSERT_TRUE(not_met_exception);
+
+  not_met_exception = false;
+  try {
+    g->AddNode(nullptr);
+    g->AddNode(nullptr);
+  } catch (const platform::EnforceNotMet &e) {
+    not_met_exception = true;
+  }
+  ASSERT_TRUE(not_met_exception);
+}
 }  // namespace framework
 }  // namespace paddle

paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc

@@ -36,6 +36,11 @@ void MemoryReusePass::ApplyImpl(Graph *graph) const {
   reused_out_var_names_.resize(all_vars_->size());
   var_descs_.resize(all_vars_->size());

+  pinned_var_set_ = nullptr;
+  if (graph->Has(details::kPinnedVars)) {
+    pinned_var_set_ = &graph->Get<details::PinnedVars>(details::kPinnedVars);
+  }
+
   // Collect the existing ShareTensorBufferOpHandles.
   // This is because (1) we want to reuse the existing
   // ShareTensorBufferOpHandles to avoid inserting too many ops;

@@ -195,7 +200,7 @@ bool MemoryReusePass::IsInVarReusable(const details::VarHandle &in_var) const {
   const VarDesc *in_var_desc = GetVarDesc(in_var);

-  if (in_var_desc->Persistable()) {
+  if (IsPinnedVar(*in_var_desc)) {
     return false;
   }

@@ -244,7 +249,7 @@ bool MemoryReusePass::IsOutVarReusable(
   }
   const VarDesc *out_var_desc = GetVarDesc(out_var);
-  if (out_var_desc->Persistable()) {
+  if (IsPinnedVar(*out_var_desc)) {
     return false;
   }

@@ -261,6 +266,11 @@ bool MemoryReusePass::IsOutVarReusable(
   return true;
 }

+bool MemoryReusePass::IsPinnedVar(const VarDesc &var_desc) const {
+  return var_desc.Persistable() ||
+         (pinned_var_set_ && pinned_var_set_->count(var_desc.Name()));
+}
+
 /**
  * Input-Output pair can be reused only when:
  *  - they are not the same var.

paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h

@@ -133,6 +133,9 @@ class MemoryReusePass : public Pass {
   mutable std::vector<std::unordered_set<std::string>> reused_out_var_names_;

   mutable std::vector<std::unordered_map<std::string, VarDesc *>> var_descs_;
+
+  mutable details::PinnedVars *pinned_var_set_;
+  bool IsPinnedVar(const VarDesc &out_var_desc) const;
 };

 }  // namespace ir

paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc

@@ -312,13 +312,22 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const {
   ShrinkDepsOpFunctor shrink_func(
       ir::FilterByNodeWrapper<details::OpHandleBase>(*graph));

+  details::PinnedVars *pinned_var_set = nullptr;
+  if (graph->Has(details::kPinnedVars)) {
+    pinned_var_set = &graph->Get<details::PinnedVars>(details::kPinnedVars);
+  }
+  auto is_pinned_var = [&pinned_var_set](const VarDesc &var_desc) {
+    return var_desc.Persistable() ||
+           (pinned_var_set && pinned_var_set->count(var_desc.Name()));
+  };
+
   VLOG(1) << "Place number: " << vars.size();
   for (size_t i = 0; i < vars.size(); ++i) {
     for (auto &name_var_pair : vars[i]) {
       // Whether this variable can be reused or deleted? If not, we do not
       // compute reference counts and dependencies.
       VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second);

-      if (var_desc == nullptr || var_desc->Persistable()) {
+      if (var_desc == nullptr || is_pinned_var(*var_desc)) {
         continue;
       }

paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc

@@ -29,14 +29,21 @@ namespace ir {
 class FuseAllReduceOpPass : public ir::Pass {
  protected:
   void ApplyImpl(ir::Graph *graph) const override {
-    ir::Graph &result = *graph;
+    if (Get<size_t>(details::kNRanks) <= 1) {
+      VLOG(6) << "The number of place is" << Get<size_t>(details::kNRanks)
+              << ", there doesn't need apply FuseAllReduceOpPass.";
+      return;
+    }

     auto &places = Get<const std::vector<platform::Place>>(details::kPlaces);
     auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     auto *multi_nccl_ctxs =
         &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
 #endif

+    ir::Graph &result = *graph;
     auto &params_grads = result.Get<details::ParamsAndGrads>(
         details::kParamsAndDenseGrads);
     size_t num_of_all_reduce = params_grads.size();

@@ -203,4 +210,5 @@ class FuseAllReduceOpPass : public ir::Pass {
 }  // namespace paddle

-REGISTER_PASS(fuse_all_reduce_op_pass,
-              paddle::framework::ir::FuseAllReduceOpPass);
+REGISTER_PASS(fuse_all_reduce_op_pass,
+              paddle::framework::ir::FuseAllReduceOpPass)
+    .RequirePassAttr(paddle::framework::details::kNRanks);

paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc

@@ -205,7 +205,7 @@ void MultiDevSSAGraphBuilderBase::ApplyImpl(ir::Graph *graph) const {
     }

     // Insert collective ops if nranks > 1
-    if (!is_forwarding && Get<size_t>(kNRanks) > 1) {
+    if (!is_forwarding && Get<size_t>(details::kNRanks) > 1) {
       try {
         bool is_bk_op =
             static_cast<bool>(boost::get<int>(node->Op()->GetAttr(

@@ -273,7 +273,7 @@ void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp(
       loss_scale = 1;
       break;
     case details::BuildStrategy::GradientScaleStrategy::kCoeffNumDevice:
-      loss_scale = Get<size_t>(kNRanks);
+      loss_scale = Get<size_t>(details::kNRanks);
       break;
     case details::BuildStrategy::GradientScaleStrategy::kCustomized:
       loss_scale = 0;

@@ -1106,7 +1106,7 @@ static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) {
       .RequirePassAttr(paddle::framework::details::kPlaces)                   \
       .RequirePassAttr(paddle::framework::details::kLocalScopes)              \
       .RequirePassAttr(paddle::framework::ir::kStrategy)                      \
-      .RequirePassAttr(paddle::framework::ir::kNRanks)
+      .RequirePassAttr(paddle::framework::details::kNRanks)

 REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass,
                             paddle::framework::ir::ReduceSSAGraphBuilder);

paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h

@@ -35,7 +35,6 @@ namespace ir {
 constexpr char kLossVarName[] = "loss_var_name";
 constexpr char kStrategy[] = "strategy";
-constexpr char kNRanks[] = "nranks";

 class MultiDevSSAGraphBuilderBase : public ir::Pass {
  protected: