Commit 93c7f058
Authored on Nov 26, 2019 by WangXi; committed by gongweibao on Nov 26, 2019
[Cherry-pick 1.6] Fix dgc buffer illegal & reuse velocity & fix fuse (#21281)
Parent: 3423f0b6
Showing 11 changed files with 168 additions and 114 deletions (+168 -114)
paddle/fluid/framework/details/dgc_const_values.h  +2 -2
paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc  +24 -24
paddle/fluid/framework/details/sparse_all_reduce_op_handle.h  +0 -2
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc  +6 -0
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc  +3 -3
paddle/fluid/operators/dgc_op.cc  +15 -20
paddle/fluid/operators/dgc_op.h  +16 -1
paddle/fluid/pybind/const_value.cc  +2 -2
python/paddle/fluid/optimizer.py  +77 -49
python/paddle/fluid/tests/unittests/test_dgc_op.py  +17 -5
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py  +6 -6
paddle/fluid/framework/details/dgc_const_values.h
@@ -22,10 +22,10 @@ namespace details {
constexpr char g_dgc_counter_name[] = "__g_dgc_counter__";
constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__";
constexpr char g_dgc_u[] = "__dgc_u__";
constexpr char g_dgc_v[] = "__dgc_v__";
constexpr char g_dgc_nranks[] = "__g_nranks__";
constexpr char g_dgc_k[] = "__dgc_k__";
constexpr char g_dgc_encoded[] = "__dgc_encoded__";
constexpr char g_dgc_gather[] = "__dgc_gather__";
}  // namespace details
}  // namespace framework
paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
@@ -38,30 +38,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle(
      is_encoded_(is_encoded), nranks_(nranks) {
  // TODO(gongwb) :polish them!
  if (is_encoded) {
    VLOG(1) << "Use dgc allreduce mode";
  }
}
  PADDLE_ENFORCE_EQ(is_encoded, true);
  VLOG(1) << "Use dgc allreduce mode"
          << ", nranks:" << nranks_;

void SparseAllReduceOpHandle::WaitInputVarGenerated() {
#ifdef PADDLE_WITH_CUDA
  for (auto &p : dev_ctxes_) {
    if (platform::is_gpu_place(p.first)) {
      int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
      auto *compute_dev_ctx =
          platform::DeviceContextPool::Instance().GetByPlace(platform::CUDAPlace(dev_id));
      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(p.second);
      if (compute_dev_ctx->stream() != dev_ctx->stream()) {
        auto &event = events_.at(dev_id);
        PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, compute_dev_ctx->stream()));
        PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(dev_ctx->stream(), event, 0));
      }
  PADDLE_ENFORCE_GT(local_scopes_.size(), 0);
  auto nranks_name = g_dgc_nranks;
  for (size_t i = 0; i < local_scopes_.size(); ++i) {
    auto *local_scope = local_scopes_[i];
    auto nranks_var = local_scope->FindVar(nranks_name);
    if (nranks_var == nullptr) {
      PADDLE_THROW("not find nranks_var:%s", nranks_name);
    }
    float *dgc_nranks = nranks_var->GetMutable<LoDTensor>()->data<float>();
    *dgc_nranks = nranks;
    VLOG(10) << "dgc_nranks=" << *dgc_nranks;
  }
#endif
}

void SparseAllReduceOpHandle::RunImplEncoded() {
@@ -77,18 +70,27 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
      "The NoDummyInputSize and NoDummyOutputSize should be equal.");

  std::vector<const LoDTensor *> ins;
  std::vector<LoDTensor *> gathers;
  std::vector<LoDTensor *> outs;
  int k = -1;
  for (size_t i = 0; i < local_scopes_.size(); ++i) {
    auto *local_scope = local_exec_scopes_[i];
    auto original_name =
        paddle::framework::GradOriginalVarName(in_var_handles[i]->name());

    auto encode_var_name = original_name + g_dgc_encoded;
    auto *in_var = local_scope->FindVar(encode_var_name);
    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
    auto &in = in_var->Get<LoDTensor>();
    ins.emplace_back(&in);

    auto gather_var_name = original_name + g_dgc_gather;
    auto *gather_var = local_scope->FindVar(gather_var_name);
    PADDLE_ENFORCE_NOT_NULL(gather_var, "%s should not be null", gather_var_name);
    auto *gather = gather_var->GetMutable<LoDTensor>();
    gathers.emplace_back(gather);

    auto *out = local_scope->FindVar(out_var_handles[i]->name())->GetMutable<LoDTensor>();
    outs.emplace_back(out);
@@ -135,9 +137,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
    // dgc use ncclAllGather to get all the encoded data
    // so the buffer need nranks.
    int buf_size = nranks_ * encode_size;
    auto tmp_ious_data = memory::Alloc(place, buf_size);
    void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
    allocations.emplace_back(std::move(tmp_ious_data));
    void *gather_buff = gathers[i]->data<void>();

    VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
             << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size
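The last hunk above switches RunImplEncoded from a per-call temporary allocation to the preallocated GatherBuff tensor. For a rough sense of the sizes involved, here is a hedged Python sketch of the arithmetic only (the function name is made up; the 2 * k and 2 * k * nranks element counts mirror the DDim shapes used in dgc_op.h later in this diff):

```python
def dgc_buffer_elements(k, nranks):
    """Illustrative only: element counts matching the shapes in this commit."""
    encode_elements = 2 * k           # EncodeGrad: DDim{2 * k}
    gather_elements = 2 * k * nranks  # GatherBuff: DDim{2 * k * nranks}
    return encode_elements, gather_elements

print(dgc_buffer_elements(k=1024, nranks=8))  # (2048, 16384)
```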
paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
@@ -36,8 +36,6 @@ class SparseAllReduceOpHandle : public AllReduceOpHandle {
                          bool is_encoded = false, int nranks = -1);
  std::string Name() const override;

  void WaitInputVarGenerated() override;

 protected:
  void RunImpl() override;
  int GetKValue(const std::string &grad_name);
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
@@ -105,6 +105,12 @@ class FuseAllReduceOpPass : public ir::Pass {
      auto *all_reduce_op_handle = dynamic_cast<details::AllReduceOpHandle *>(
          &node->Wrapper<details::OpHandleBase>());
      if (all_reduce_op_handle) {
#if defined(PADDLE_WITH_DGC)
        PADDLE_ENFORCE_NE(
            all_reduce_op_handle->Name(), "sparse_all_reduce",
            "DGC doesn't support fuse for now, if you want to use DGC "
            "you need set strategy.fuse_all_reduce_ops = False.");
#endif
        auto inputs = details::DynamicCast<details::VarHandle>(
            all_reduce_op_handle->Inputs());
        PADDLE_ENFORCE_EQ(inputs.size(), num_place);
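The new check turns a silent misconfiguration into an explicit error. A minimal sketch of the setting the message asks for, assuming the fluid 1.6 BuildStrategy API:

```python
import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
# DGC's sparse_all_reduce cannot be fused yet, so disable all-reduce fusion.
build_strategy.fuse_all_reduce_ops = False

# Typically passed to a compiled program, e.g.:
# compiled_prog = fluid.CompiledProgram(main_program).with_data_parallel(
#     loss_name=loss.name, build_strategy=build_strategy)
```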
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
@@ -1011,10 +1011,10 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
#if defined(PADDLE_WITH_DGC)
bool AllReduceSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
  auto u_name = p_name + details::g_dgc_u;
  auto it = all_vars_.find(u_name);
  auto k_name = p_name + details::g_dgc_k;
  auto it = all_vars_.find(k_name);
  if (it == all_vars_.end()) {
    VLOG(10) << "can't find u_name, so it's not encoded:" << u_name;
    VLOG(10) << "can't find k_name, so it's not encoded:" << k_name;
    return false;
  }
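IsEncoded now keys off the per-parameter k variable rather than the u variable. A one-line sketch of that lookup, using the literal suffix value from dgc_const_values.h (illustrative only, not the Pass API):

```python
def is_encoded(all_vars, p_name):
    """Illustrative: a variable takes the DGC path if its __dgc_k__ var exists."""
    return (p_name + "__dgc_k__") in all_vars

print(is_encoded({"fc_0.w_0__dgc_k__": object()}, "fc_0.w_0"))  # True
print(is_encoded({}, "fc_0.w_0"))                               # False
```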
paddle/fluid/operators/dgc_op.cc
@@ -31,6 +31,8 @@ class DGCOp : public framework::OperatorWithKernel {
                   "Input(Grad) of DGCop should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("current_step"),
                   "Input(current_step) of DGCop should not be null.");
    PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
                      "Input(nranks) of DGCop should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("U_out"),
                   "Output(U_out) of DGCop should not be null.");
@@ -40,14 +42,15 @@ class DGCOp : public framework::OperatorWithKernel {
                   "Output(k) of DGCop should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("EncodeGrad"),
                   "Output(EncodeGrad) of DGCop should not be null.");
    PADDLE_ENFORCE_EQ(ctx->HasOutput("GatherBuff"), true,
                      "Output(EncodeGrad) of DGCop should not be null.");
  }

 protected:
  framework::OpKernelType GetKernelTypeForVar(
      const std::string& var_name, const framework::Tensor& tensor,
      const framework::OpKernelType& expected_kernel_type) const override {
    if (var_name == "current_step" || var_name == "rampup_step" ||
        var_name == "k") {
    if (var_name == "current_step" || var_name == "k" || var_name == "nranks") {
      VLOG(10) << "var_name:" << var_name << " need not to transform";
      return expected_kernel_type;
    }
@@ -60,26 +63,18 @@ class DGCOp : public framework::OperatorWithKernel {
class DGCOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("U", "(Tensor) Middle tensor of DGC");
    AddInput("V", "(Tensor) Middle tensor of DGC");
    AddInput("U", "(Tensor) U velocity tensor of DGC");
    AddInput("V", "(Tensor) V velocity tensor of DGC");
    AddInput("Grad", "(Tensor) Input gradient");
    AddInput("current_step", "(Tensor) Current step.");
    AddOutput("U_out", "(Tensor) Output encoded gradient");
    AddOutput("V_out", "(Tensor) Output encoded gradient");
    AddOutput("EncodeGrad", "(Tensor) Output encoded gradient");
    AddOutput("Grad_out", "(Tensor) Output grad gradient");
    AddOutput("k", "(Tensor) Output top-k value");
    AddInput("nranks", "(Tensor) nranks.");
    AddOutput("U_out", "(Tensor) Output U velocity of DGC");
    AddOutput("V_out", "(Tensor) Output V velocity of DGC");
    AddOutput("EncodeGrad", "(Tensor) Output encoded gradient");
    AddOutput("Grad_out", "(Tensor) Output grad gradient");
    AddOutput("k", "(Tensor) Output top-k value");
    AddOutput("GatherBuff", "(Tensor) Gather buffer");
    AddAttr<float>("m",
                   "(float, 0.9) "
paddle/fluid/operators/dgc_op.h
@@ -50,6 +50,11 @@ class DGCOpKernel : public framework::OpKernel<T> {
    auto rampup_begin_step = ctx.Attr<float>("rampup_begin_step");
    auto rampup_step = ctx.Attr<float>("rampup_step");

    // nranks
    auto nranks_tensor = ctx.Input<framework::Tensor>("nranks");
    const int nranks = static_cast<const int>(*nranks_tensor->data<float>());
    PADDLE_ENFORCE_GT(nranks, 1, "DGC is not useful when num_trainers <= 1");

    // current step
    auto current_step_tensor = ctx.Input<framework::Tensor>("current_step");
    const float* current_step = current_step_tensor->data<float>();
@@ -72,7 +77,7 @@ class DGCOpKernel : public framework::OpKernel<T> {
            << ", rampup_begin_step:" << rampup_begin_step
            << ", rampup_step:" << rampup_step
            << ", current_step:" << *current_step << ", ratio:" << ratio
            << ", k:" << k;
            << ", k:" << k << ", nranks:" << nranks;

    auto k_out = ctx.Output<framework::Tensor>("k");
    T* k_out_data = k_out->data<T>();
@@ -81,6 +86,7 @@ class DGCOpKernel : public framework::OpKernel<T> {
    auto u_out = ctx.Output<framework::Tensor>("U_out");
    auto v_out = ctx.Output<framework::Tensor>("V_out");
    auto encode_grad_out = ctx.Output<framework::Tensor>("EncodeGrad");
    auto gather_buff = ctx.Output<framework::Tensor>("GatherBuff");

    // FIXME(gongwb): use cublas.
    auto u_out_e = framework::EigenVector<T>::Flatten(*u_out);
@@ -88,6 +94,13 @@ class DGCOpKernel : public framework::OpKernel<T> {
    auto g_e = framework::EigenVector<T>::Flatten(*g);
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto& eigen_ctx = *dev_ctx.eigen_device();

    if (static_cast<int>(*current_step) == static_cast<int>(rampup_begin_step)) {
      // calc local momentum from global momentum
      u_out_e.device(eigen_ctx) = (1.0 / nranks) * u_e;
    }

    if (use_nesterov) {
      // u = m * (u + g)
      u_out_e.device(eigen_ctx) = m * (u_e + g_e);
@@ -111,6 +124,8 @@ class DGCOpKernel : public framework::OpKernel<T> {
    T* u_out_data = u_out->mutable_data<T>(ctx.GetPlace());
    T* encode_grad_out_data = encode_grad_out->mutable_data<T>(
        framework::DDim{2 * k}, ctx.GetPlace());
    gather_buff->mutable_data<T>(framework::DDim{2 * k * nranks},
                                 ctx.GetPlace());

    int buf_size = paddle::communication::dgc::get_buffer_size(k);
    auto tmp_ious_data = memory::Alloc(dev_ctx, buf_size);
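The kernel now reads nranks from an input tensor and rescales the accumulated velocity by 1/nranks once current_step reaches rampup_begin_step, before the usual momentum accumulation. A small NumPy sketch of just the U update shown above (illustrative, not Paddle code; dgc_u_update is a made-up name):

```python
import numpy as np

def dgc_u_update(u, grad, m, nranks, current_step, rampup_begin_step):
    """Element-wise sketch of the U_out update in DGCOpKernel (Nesterov branch)."""
    u = np.asarray(u, dtype=np.float32)
    grad = np.asarray(grad, dtype=np.float32)
    if int(current_step) == int(rampup_begin_step):
        # calc local momentum from global momentum
        u = (1.0 / nranks) * u
    # u = m * (u + g)
    return m * (u + grad)
```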
paddle/fluid/pybind/const_value.cc
@@ -59,14 +59,14 @@ void BindConstValue(pybind11::module* m) {
                      framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
#if defined(PADDLE_WITH_DGC)
  auto dgc = m->def_submodule("dgc");
  dgc.def("kDGCUName", [] { return framework::details::g_dgc_u; });
  dgc.def("kDGCVName", [] { return framework::details::g_dgc_v; });
  dgc.def("kDGCKName", [] { return framework::details::g_dgc_k; });
  dgc.def("kDGCEncodedName", [] { return framework::details::g_dgc_encoded; });
  dgc.def("kDGCGatherName", [] { return framework::details::g_dgc_gather; });
  dgc.def("kDGCCounterName",
          [] { return framework::details::g_dgc_counter_name; });
  dgc.def("kDGCRampUpBeginStepName",
          [] { return framework::details::g_dgc_rampup_begin_step; });
  dgc.def("kDGCNRanksName", [] { return framework::details::g_dgc_nranks; });
#endif
}
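These bindings expose the C++ name suffixes to Python; optimizer.py below concatenates them with a parameter name to locate the per-parameter DGC variables. A hedged sketch of that naming scheme, using the literal suffix values from dgc_const_values.h so it runs without the compiled core module:

```python
# In Paddle these suffixes are read via core.dgc.kDGCUName(), kDGCKName(), etc.
DGC_SUFFIXES = {
    "u": "__dgc_u__",
    "v": "__dgc_v__",
    "k": "__dgc_k__",
    "encoded": "__dgc_encoded__",
    "gather": "__dgc_gather__",
}

def dgc_var_names(param_name):
    """Illustrative helper: per-parameter DGC variable names."""
    return {kind: param_name + suffix for kind, suffix in DGC_SUFFIXES.items()}

print(dgc_var_names("fc_0.w_0"))
# {'u': 'fc_0.w_0__dgc_u__', 'v': 'fc_0.w_0__dgc_v__', ...}
```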
python/paddle/fluid/optimizer.py
@@ -867,7 +867,7 @@ class MomentumOptimizer(Optimizer):
        return momentum_op


class DGCMomentumOptimizer(MomentumOptimizer):
class DGCMomentumOptimizer(Optimizer):
    """
    DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887
@@ -923,6 +923,8 @@ class DGCMomentumOptimizer(MomentumOptimizer):
            sparsity=[0.999, 0.999])
    """
    _u_velocity_acc_str = "_dgc_u_"
    _v_velocity_acc_str = "_dgc_v_"

    def __init__(self,
                 learning_rate,
@@ -935,17 +937,25 @@ class DGCMomentumOptimizer(MomentumOptimizer):
                 num_trainers=None,
                 regularization=None,
                 name=None):
        self._sparsity = sparsity
        self._rampup_step = rampup_step
        self._rampup_step_var = None
        assert learning_rate is not None
        assert momentum is not None
        super(DGCMomentumOptimizer, self).__init__(
            learning_rate=learning_rate,
            regularization=regularization,
            name=name)
        self.type = "dgc_momentum"
        self._momentum = momentum
        self._use_nesterov = bool(use_nesterov)
        self._rampup_begin_step = rampup_begin_step
        self._rampup_begin_step_var = None
        self._rampup_step = rampup_step
        self._sparsity = sparsity
        self._rampup_begin_step_var = None
        self._global_step_var = None
        self._local_grad_clip_norm = None
        self._clip_norm = None
        if local_grad_clip_norm is not None:
            assert isinstance(num_trainers, int)
            assert isinstance(local_grad_clip_norm, float)
@@ -956,9 +966,6 @@ class DGCMomentumOptimizer(MomentumOptimizer):
            self._clip_norm = local_grad_clip_norm / (num_trainers *
                                                      num_trainers)

        super(DGCMomentumOptimizer, self).__init__(
            learning_rate, momentum, use_nesterov, regularization, name)

    def _is_use_dgc(self, param_var, grad_var):
        var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
        if var_numel < 16384 or \
@@ -970,34 +977,36 @@ class DGCMomentumOptimizer(MomentumOptimizer):
    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)
        velocity_acc = self._get_accumulator(self._u_velocity_acc_str,
                                             param_and_grad[0])
        assert velocity_acc is not None

        inputs = {
            "Param": param_and_grad[0],
            "Grad": param_and_grad[1],
            "Velocity": velocity_acc,
            "LearningRate": self._create_param_lr(param_and_grad),
        }
        outputs = {
            "ParamOut": param_and_grad[0],
            "VelocityOut": velocity_acc,
        }
        attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov}

        if not self._is_use_dgc(param_and_grad[0], param_and_grad[1]):
            return super(DGCMomentumOptimizer, self)._append_optimize_op(
                block, param_and_grad)
            type = "momentum"
        else:
            type = "dgc_momentum"
            inputs.update({"current_step": self._global_step_var})
            attrs.update({"rampup_begin_step": float(self._rampup_begin_step)})

        velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                             param_and_grad[0])
        # create the dgc momentum optimize op
        dgc_momentum_op = block.append_op(
            type="dgc_momentum",
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "Velocity": velocity_acc,
                "LearningRate": self._create_param_lr(param_and_grad),
                "current_step": self._global_step_var,
            },
            outputs={"ParamOut": param_and_grad[0],
                     "VelocityOut": velocity_acc},
            attrs={
                "mu": self._momentum,
                "use_nesterov": self._use_nesterov,
                "rampup_begin_step": float(self._rampup_begin_step)
            },
            type=type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=True)
        return dgc_momentum_op

    def _add_auto_increment_var(self, counter_name, begin, step=1):
@@ -1019,8 +1028,20 @@ class DGCMomentumOptimizer(MomentumOptimizer):
        return counter

    def _add_nranks_var(self, name, value=-1):
        helper = LayerHelper('global_step_counter')
        counter, is_new_var = helper.create_or_get_global_variable(
            name=name, dtype='float32', shape=[1], persistable=True)
        if is_new_var:
            helper.set_variable_initializer(
                counter,
                initializer=Constant(value=float(value), force_cpu=True))
            counter.stop_gradient = True

        return counter

    def _append_dgc_ops(self, param_and_grads):
        start_program = default_startup_program()
        main_program = default_main_program()
        main_program._enable_dgc = True
@@ -1028,6 +1049,9 @@ class DGCMomentumOptimizer(MomentumOptimizer):
        self._global_step_var = self._add_auto_increment_var(
            counter_name=core.dgc.kDGCCounterName(), begin=0)
        self._nranks_var = self._add_nranks_var(
            name=core.dgc.kDGCNRanksName(), value=-1)

        # rampup begin step var for all_reduce_op_handle
        self._rampup_begin_step_var = tensor.create_global_var(
            shape=[1],
@@ -1037,22 +1061,16 @@ class DGCMomentumOptimizer(MomentumOptimizer):
            value=self._rampup_begin_step * 1.0,
            force_cpu=True)

        self.helper = LayerHelper(self.__class__.__name__)

        for param_var, grad_var in param_and_grads:
            # reuse velocity in dgc_op and dgc_momentum_op
            u_var = self._add_accumulator(self._u_velocity_acc_str, param_var)

            if not self._is_use_dgc(param_var, grad_var):
                continue

            u_var = tensor.create_global_var(
                shape=param_var.shape,
                dtype=param_var.dtype,
                persistable=True,
                name=param_var.name + core.dgc.kDGCUName(),
                value=0.0)
            v_var = tensor.create_global_var(
                shape=param_var.shape,
                dtype=param_var.dtype,
                persistable=True,
                name=param_var.name + core.dgc.kDGCVName(),
                value=0.0)
            v_var = self._add_accumulator(self._v_velocity_acc_str, param_var)

            k_var = tensor.create_global_var(
                shape=[1],
@@ -1070,6 +1088,14 @@ class DGCMomentumOptimizer(MomentumOptimizer):
                value=0.0,
                force_cpu=False)

            gather_var = tensor.create_global_var(
                shape=[1],
                dtype=param_var.dtype,
                persistable=True,
                name=param_var.name + core.dgc.kDGCGatherName(),
                value=0.0,
                force_cpu=False)

            # del back oprolevarname
            op_maker = core.op_proto_and_checker_maker
            backward = core.op_proto_and_checker_maker.OpRole.Backward
@@ -1092,7 +1118,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
            if self._local_grad_clip_norm is not None:
                clip_var = self._append_clip_norm(grad_var, self._clip_norm)
            self._dgc_op(param_var, clip_var, grad_var, u_var, v_var, k_var,
                         encoded_var)
                         encoded_var, gather_var)

    def _is_the_backward_op(self, op):
        op_maker = core.op_proto_and_checker_maker
@@ -1131,7 +1157,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
            x=grad_var, max_norm=clip_norm, name=grad_var.name)

    def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var,
                encoded_var):
                encoded_var, gather_var):
        block = framework.default_main_program().global_block()
        op_maker = core.op_proto_and_checker_maker
        dgc_op = block.append_op(
@@ -1140,21 +1166,23 @@ class DGCMomentumOptimizer(MomentumOptimizer):
                "U": u_var,
                "V": v_var,
                "Grad": clip_var,
                "current_step": self._global_step_var
                "current_step": self._global_step_var,
                "nranks": self._nranks_var,
            },
            outputs={
                "U_out": u_var,
                "V_out": v_var,
                "EncodeGrad": encoded_var,
                "k": k_var,
                "Grad_out": grad_var
                "Grad_out": grad_var,
                "GatherBuff": gather_var,
            },
            attrs={
                "m": self._momentum,
                "sparsity": self._sparsity,
                "use_nesterov": self._use_nesterov,
                "rampup_begin_step": float(self._rampup_begin_step),
                "rampup_step": float(self._rampup_step)
                "rampup_step": float(self._rampup_step),
            },
            stop_gradient=True)
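Taken together, the optimizer changes reuse the momentum accumulators as the DGC U/V buffers and thread the new nranks and gather variables through to the dgc op. For context, a hedged usage sketch matching the constructor signature and the docstring excerpt shown above (argument values here are illustrative):

```python
import paddle.fluid as fluid

optimizer = fluid.optimizer.DGCMomentumOptimizer(
    learning_rate=0.0001,
    momentum=0.9,
    rampup_begin_step=1252,   # compression starts after this many steps
    rampup_step=100,
    sparsity=[0.999, 0.999])  # same value as the docstring example above
# optimizer.minimize(loss)
```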
python/paddle/fluid/tests/unittests/test_dgc_op.py
@@ -34,7 +34,7 @@ class TestDGCOp(unittest.TestCase):
        print("place:", place)

        # numpy data
        # inputs: U, V, Grad, current_step
        # inputs: U, V, Grad, current_step, nranks
        self.u_name = "U"
        self.u = np.random.random(size).astype("float32")
@@ -47,10 +47,14 @@ class TestDGCOp(unittest.TestCase):
        self.current_step_name = "current_step"
        self.current_step = np.full((1), 0.0).astype("float32")

        # output: U_out, V_out, EncodeGrad, GradLocal_out
        self.nranks_name = "nranks"
        self.nranks = np.full((1), 2.0).astype("float32")

        # output: U_out, V_out, EncodeGrad, GradLocal_out, k, GatherBuff
        self.encode_grad_name = "EncodeGrad"
        self.k_name = "k"
        self.k = np.full((1), 0.0).astype("float32")
        self.gather_buff_name = "GatherBuff"

        # scope data
        self.u_tensor = self.scope.var(self.u_name).get_tensor()
@@ -62,16 +66,22 @@ class TestDGCOp(unittest.TestCase):
        self.grad_tensor = self.scope.var(self.grad_name).get_tensor()
        self.grad_tensor.set(self.grad, place)

        self.encode_grad_tensor = self.scope.var(
            self.encode_grad_name).get_tensor()

        self.current_step_tensor = self.scope.var(
            self.current_step_name).get_tensor()
        self.current_step_tensor.set(self.current_step, core.CPUPlace())

        self.nranks_tensor = self.scope.var(self.nranks_name).get_tensor()
        self.nranks_tensor.set(self.nranks, core.CPUPlace())

        self.encode_grad_tensor = self.scope.var(
            self.encode_grad_name).get_tensor()

        self.k_tensor = self.scope.var(self.k_name).get_tensor()
        self.k_tensor.set(self.k, core.CPUPlace())

        self.gather_buff_tensor = self.scope.var(
            self.gather_buff_name).get_tensor()

    def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
        self.assertTrue(np.allclose(
@@ -87,6 +97,7 @@ class TestDGCOp(unittest.TestCase):
            'V': self.v_name,
            'Grad': self.grad_name,
            'current_step': self.current_step_name,
            'nranks': self.nranks_name,

            # outputs
            'U_out': self.u_name,
@@ -94,6 +105,7 @@ class TestDGCOp(unittest.TestCase):
            'EncodeGrad': self.encode_grad_name,
            'Grad_out': self.grad_name,
            'k': self.k_name,
            'GatherBuff': self.gather_buff_name,

            # attrs
            'm': 0.9,
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
@@ -29,7 +29,7 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
            return self._accumulators

        def get_velocity_str(self):
            return self._velocity_acc_str
            return self._u_velocity_acc_str

    def check_dgc_momentum_optimizer(self, dims=[5, 10, 8], name="momentum"):
        init_program = framework.Program()
@@ -66,8 +66,10 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
        # params_grads = append_backward(mean_out)
        params_grads = dgc_momentum_optimizer.backward(mean_out)

        accumulator_count = 1 if name == "momentum" else 2
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(dgc_momentum_optimizer.get_accumulators()), 0)
        self.assertEqual(
            len(dgc_momentum_optimizer.get_accumulators()), accumulator_count)
        with framework.program_guard(program, init_program):
            opts = dgc_momentum_optimizer.apply_gradients(params_grads)
        self.assertEqual(len(opts), 2)
@@ -77,7 +79,7 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
        # Check accumulators
        accumulators = dgc_momentum_optimizer.get_accumulators()
        self.assertEqual(len(accumulators), 1)
        self.assertEqual(len(accumulators), accumulator_count)
        self.assertTrue(
            dgc_momentum_optimizer.get_velocity_str() in accumulators)
        velocity_acc = accumulators[dgc_momentum_optimizer.get_velocity_str()]
@@ -86,11 +88,9 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
        # Check init_program
        init_ops = init_program.global_block().ops
        self.assertEqual(len(init_ops), 2)
        self.assertEqual(len(init_ops), 1)
        self.assertEqual(init_ops[0].type, "fill_constant")
        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
        self.assertEqual(init_ops[1].type, "fill_constant")
        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)

        with open("test_dgc_optimizer_" + name + ".log", "w") as f:
            program_to_code(program, fout=f)