Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Commit 93c7f058
Authored Nov 26, 2019 by WangXi
Committed by gongweibao, Nov 26, 2019
[Cherry-pick 1.6] Fix dgc buffer illegal & reuse velocity & fix fuse (#21281)
Parent: 3423f0b6

Showing 11 changed files with 168 additions and 114 deletions (+168 -114)
paddle/fluid/framework/details/dgc_const_values.h                               +2   -2
paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc                   +24  -24
paddle/fluid/framework/details/sparse_all_reduce_op_handle.h                    +0   -2
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc   +6   -0
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc  +3   -3
paddle/fluid/operators/dgc_op.cc                                                +15  -20
paddle/fluid/operators/dgc_op.h                                                 +16  -1
paddle/fluid/pybind/const_value.cc                                              +2   -2
python/paddle/fluid/optimizer.py                                                +77  -49
python/paddle/fluid/tests/unittests/test_dgc_op.py                              +17  -5
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py                       +6   -6
paddle/fluid/framework/details/dgc_const_values.h

@@ -22,10 +22,10 @@ namespace details {
constexpr char g_dgc_counter_name[] = "__g_dgc_counter__";
constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__";
constexpr char g_dgc_u[] = "__dgc_u__";
constexpr char g_dgc_v[] = "__dgc_v__";
constexpr char g_dgc_nranks[] = "__g_nranks__";
constexpr char g_dgc_k[] = "__dgc_k__";
constexpr char g_dgc_encoded[] = "__dgc_encoded__";
constexpr char g_dgc_gather[] = "__dgc_gather__";

}  // namespace details
}  // namespace framework
paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc

@@ -38,30 +38,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle(
      is_encoded_(is_encoded), nranks_(nranks) {
  // TODO(gongwb) :polish them!
  if (is_encoded) {
    VLOG(1) << "Use dgc allreduce mode";
  }
}
  PADDLE_ENFORCE_EQ(is_encoded, true);
  VLOG(1) << "Use dgc allreduce mode"
          << ", nranks:" << nranks_;

void SparseAllReduceOpHandle::WaitInputVarGenerated() {
#ifdef PADDLE_WITH_CUDA
  for (auto &p : dev_ctxes_) {
    if (platform::is_gpu_place(p.first)) {
      int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
      auto *compute_dev_ctx =
          platform::DeviceContextPool::Instance().GetByPlace(
              platform::CUDAPlace(dev_id));
      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(p.second);
      if (compute_dev_ctx->stream() != dev_ctx->stream()) {
        auto &event = events_.at(dev_id);
        PADDLE_ENFORCE_CUDA_SUCCESS(
            cudaEventRecord(event, compute_dev_ctx->stream()));
        PADDLE_ENFORCE_CUDA_SUCCESS(
            cudaStreamWaitEvent(dev_ctx->stream(), event, 0));
      }
  PADDLE_ENFORCE_GT(local_scopes_.size(), 0);
  auto nranks_name = g_dgc_nranks;
  for (size_t i = 0; i < local_scopes_.size(); ++i) {
    auto *local_scope = local_scopes_[i];
    auto nranks_var = local_scope->FindVar(nranks_name);
    if (nranks_var == nullptr) {
      PADDLE_THROW("not find nranks_var:%s", nranks_name);
    }
    float *dgc_nranks = nranks_var->GetMutable<LoDTensor>()->data<float>();
    *dgc_nranks = nranks;
    VLOG(10) << "dgc_nranks=" << *dgc_nranks;
  }
#endif
}

void SparseAllReduceOpHandle::RunImplEncoded() {

@@ -77,18 +70,27 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
                    "The NoDummyInputSize and NoDummyOutputSize should be equal.");

  std::vector<const LoDTensor *> ins;
  std::vector<LoDTensor *> gathers;
  std::vector<LoDTensor *> outs;
  int k = -1;
  for (size_t i = 0; i < local_scopes_.size(); ++i) {
    auto *local_scope = local_exec_scopes_[i];
    auto original_name =
        paddle::framework::GradOriginalVarName(in_var_handles[i]->name());

    auto encode_var_name = original_name + g_dgc_encoded;
    auto *in_var = local_scope->FindVar(encode_var_name);
    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
    auto &in = in_var->Get<LoDTensor>();
    ins.emplace_back(&in);

    auto gather_var_name = original_name + g_dgc_gather;
    auto *gather_var = local_scope->FindVar(gather_var_name);
    PADDLE_ENFORCE_NOT_NULL(gather_var, "%s should not be null",
                            gather_var_name);
    auto *gather = gather_var->GetMutable<LoDTensor>();
    gathers.emplace_back(gather);

    auto *out = local_scope->FindVar(out_var_handles[i]->name())
                    ->GetMutable<LoDTensor>();
    outs.emplace_back(out);

@@ -135,9 +137,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
    // dgc use ncclAllGather to get all the encoded data
    // so the buffer need nranks.
    int buf_size = nranks_ * encode_size;
    auto tmp_ious_data = memory::Alloc(place, buf_size);
    void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
    allocations.emplace_back(std::move(tmp_ious_data));
    void *gather_buff = gathers[i]->data<void>();

    VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
             << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size
paddle/fluid/framework/details/sparse_all_reduce_op_handle.h

@@ -36,8 +36,6 @@ class SparseAllReduceOpHandle : public AllReduceOpHandle {
                          bool is_encoded = false, int nranks = -1);
  std::string Name() const override;

  void WaitInputVarGenerated() override;

 protected:
  void RunImpl() override;
  int GetKValue(const std::string &grad_name);
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc

@@ -105,6 +105,12 @@ class FuseAllReduceOpPass : public ir::Pass {
      auto *all_reduce_op_handle = dynamic_cast<details::AllReduceOpHandle *>(
          &node->Wrapper<details::OpHandleBase>());
      if (all_reduce_op_handle) {
#if defined(PADDLE_WITH_DGC)
        PADDLE_ENFORCE_NE(
            all_reduce_op_handle->Name(), "sparse_all_reduce",
            "DGC doesn't support fuse for now, if you want to use DGC "
            "you need set strategy.fuse_all_reduce_ops = False.");
#endif
        auto inputs = details::DynamicCast<details::VarHandle>(
            all_reduce_op_handle->Inputs());
        PADDLE_ENFORCE_EQ(inputs.size(), num_place);
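The pass now rejects fusing sparse (DGC) all-reduce handles outright. A minimal user-side sketch of the configuration the error message asks for, assuming the fluid BuildStrategy / CompiledProgram API of this release; the loss name is hypothetical:

import paddle.fluid as fluid

# Minimal sketch: when training with DGCMomentumOptimizer, keep all-reduce
# fusion disabled so FuseAllReduceOpPass does not hit the check above.
build_strategy = fluid.BuildStrategy()
build_strategy.fuse_all_reduce_ops = False  # DGC does not support fused all-reduce

# "loss" is a hypothetical loss variable built elsewhere in the program.
compiled_prog = fluid.CompiledProgram(
    fluid.default_main_program()).with_data_parallel(
        loss_name="loss", build_strategy=build_strategy)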
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc

@@ -1011,10 +1011,10 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
#if defined(PADDLE_WITH_DGC)
bool AllReduceSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
  auto u_name = p_name + details::g_dgc_u;
  auto it = all_vars_.find(u_name);
  auto k_name = p_name + details::g_dgc_k;
  auto it = all_vars_.find(k_name);
  if (it == all_vars_.end()) {
    VLOG(10) << "can't find u_name, so it's not encoded:" << u_name;
    VLOG(10) << "can't find k_name, so it's not encoded:" << k_name;
    return false;
  }
paddle/fluid/operators/dgc_op.cc

@@ -31,6 +31,8 @@ class DGCOp : public framework::OperatorWithKernel {
                   "Input(Grad) of DGCop should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("current_step"),
                   "Input(current_step) of DGCop should not be null.");
    PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
                      "Input(nranks) of DGCop should not be null.");

    PADDLE_ENFORCE(ctx->HasOutput("U_out"),
                   "Output(U_out) of DGCop should not be null.");

@@ -40,14 +42,15 @@ class DGCOp : public framework::OperatorWithKernel {
                   "Output(k) of DGCop should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("EncodeGrad"),
                   "Output(EncodeGrad) of DGCop should not be null.");
    PADDLE_ENFORCE_EQ(ctx->HasOutput("GatherBuff"), true,
                      "Output(EncodeGrad) of DGCop should not be null.");
  }

 protected:
  framework::OpKernelType GetKernelTypeForVar(
      const std::string &var_name, const framework::Tensor &tensor,
      const framework::OpKernelType &expected_kernel_type) const override {
    if (var_name == "current_step" || var_name == "rampup_step" ||
        var_name == "k") {
    if (var_name == "current_step" || var_name == "k" ||
        var_name == "nranks") {
      VLOG(10) << "var_name:" << var_name << " need not to transform";
      return expected_kernel_type;
    }

@@ -60,26 +63,18 @@ class DGCOp : public framework::OperatorWithKernel {
class DGCOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("U", "(Tensor) Middle tensor of DGC");
    AddInput("V", "(Tensor) Middle tensor of DGC");
    AddInput("U", "(Tensor) U velocity tensor of DGC");
    AddInput("V", "(Tensor) V velocity tensor of DGC");
    AddInput("Grad", "(Tensor) Input gradient");
    AddInput("current_step", "(Tensor) Current step.");
    AddOutput("U_out",
              "(Tensor) "
              "Output encoded gradient");
    AddOutput("V_out",
              "(Tensor) "
              "Output encoded gradient");
    AddOutput("EncodeGrad",
              "(Tensor) "
              "Output encoded gradient");
    AddOutput("Grad_out",
              "(Tensor) "
              "Output grad gradient");
    AddOutput("k",
              "(Tensor) "
              "Output top-k value");
    AddInput("nranks", "(Tensor) nranks.");

    AddOutput("U_out", "(Tensor) Output U velocity of DGC");
    AddOutput("V_out", "(Tensor) Output V velocity of DGC");
    AddOutput("EncodeGrad", "(Tensor) Output encoded gradient");
    AddOutput("Grad_out", "(Tensor) Output grad gradient");
    AddOutput("k", "(Tensor) Output top-k value");
    AddOutput("GatherBuff", "(Tensor) Gather buffer");

    AddAttr<float>("m",
                   "(float, 0.9) "
paddle/fluid/operators/dgc_op.h

@@ -50,6 +50,11 @@ class DGCOpKernel : public framework::OpKernel<T> {
    auto rampup_begin_step = ctx.Attr<float>("rampup_begin_step");
    auto rampup_step = ctx.Attr<float>("rampup_step");

    // nranks
    auto nranks_tensor = ctx.Input<framework::Tensor>("nranks");
    const int nranks = static_cast<const int>(*nranks_tensor->data<float>());
    PADDLE_ENFORCE_GT(nranks, 1, "DGC is not useful when num_trainers <= 1");

    // current step
    auto current_step_tensor = ctx.Input<framework::Tensor>("current_step");
    const float *current_step = current_step_tensor->data<float>();

@@ -72,7 +77,7 @@ class DGCOpKernel : public framework::OpKernel<T> {
             << ", rampup_begin_step:" << rampup_begin_step
             << ", rampup_step:" << rampup_step
             << ", current_step:" << *current_step << ", ratio:" << ratio
             << ", k:" << k;
             << ", k:" << k << ", nranks:" << nranks;

    auto k_out = ctx.Output<framework::Tensor>("k");
    T *k_out_data = k_out->data<T>();

@@ -81,6 +86,7 @@ class DGCOpKernel : public framework::OpKernel<T> {
    auto u_out = ctx.Output<framework::Tensor>("U_out");
    auto v_out = ctx.Output<framework::Tensor>("V_out");
    auto encode_grad_out = ctx.Output<framework::Tensor>("EncodeGrad");
    auto gather_buff = ctx.Output<framework::Tensor>("GatherBuff");

    // FIXME(gongwb): use cublas.
    auto u_out_e = framework::EigenVector<T>::Flatten(*u_out);

@@ -88,6 +94,13 @@ class DGCOpKernel : public framework::OpKernel<T> {
    auto g_e = framework::EigenVector<T>::Flatten(*g);
    auto &dev_ctx = ctx.template device_context<DeviceContext>();
    auto &eigen_ctx = *dev_ctx.eigen_device();

    if (static_cast<int>(*current_step) ==
        static_cast<int>(rampup_begin_step)) {
      // calc local momentum from global momentum
      u_out_e.device(eigen_ctx) = (1.0 / nranks) * u_e;
    }

    if (use_nesterov) {
      // u = m * (u + g)
      u_out_e.device(eigen_ctx) = m * (u_e + g_e);

@@ -111,6 +124,8 @@ class DGCOpKernel : public framework::OpKernel<T> {
    T *u_out_data = u_out->mutable_data<T>(ctx.GetPlace());
    T *encode_grad_out_data = encode_grad_out->mutable_data<T>(
        framework::DDim{2 * k}, ctx.GetPlace());
    gather_buff->mutable_data<T>(framework::DDim{2 * k * nranks},
                                 ctx.GetPlace());

    int buf_size = paddle::communication::dgc::get_buffer_size(k);
    auto tmp_ious_data = memory::Alloc(dev_ctx, buf_size);
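The kernel now sizes the GatherBuff output itself: each rank's encoded gradient holds 2 * k entries (an index and a value for every element kept by top-k), and the all-gather needs one such slice per rank, hence DDim{2 * k * nranks}. A small sketch of that sizing arithmetic; the concrete k and nranks values are made up for illustration:

# Illustrative only: k and nranks are example values, not taken from the commit.
k = 128       # elements kept per gradient after top-k sparsification
nranks = 8    # trainers taking part in the all-gather

encoded_elems_per_rank = 2 * k                        # matches DDim{2 * k}
gather_buff_elems = encoded_elems_per_rank * nranks   # matches DDim{2 * k * nranks}

print(encoded_elems_per_rank, gather_buff_elems)      # 256 2048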
paddle/fluid/pybind/const_value.cc

@@ -59,14 +59,14 @@ void BindConstValue(pybind11::module* m) {
      framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
#if defined(PADDLE_WITH_DGC)
  auto dgc = m->def_submodule("dgc");
  dgc.def("kDGCUName", [] { return framework::details::g_dgc_u; });
  dgc.def("kDGCVName", [] { return framework::details::g_dgc_v; });
  dgc.def("kDGCKName", [] { return framework::details::g_dgc_k; });
  dgc.def("kDGCEncodedName", [] { return framework::details::g_dgc_encoded; });
  dgc.def("kDGCGatherName", [] { return framework::details::g_dgc_gather; });
  dgc.def("kDGCCounterName",
          [] { return framework::details::g_dgc_counter_name; });
  dgc.def("kDGCRampUpBeginStepName",
          [] { return framework::details::g_dgc_rampup_begin_step; });
  dgc.def("kDGCNRanksName", [] { return framework::details::g_dgc_nranks; });
#endif
}
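These bindings are what python/paddle/fluid/optimizer.py uses (core.dgc.kDGCUName() and friends) to name the DGC auxiliary variables. A minimal sketch of reading them from Python, assuming a build compiled with PADDLE_WITH_DGC; the parameter name below is hypothetical:

from paddle.fluid import core

# Suffixes defined in dgc_const_values.h and exposed by the bindings above.
print(core.dgc.kDGCUName())       # "__dgc_u__"
print(core.dgc.kDGCNRanksName())  # "__g_nranks__"
print(core.dgc.kDGCGatherName())  # "__dgc_gather__"

param_name = "fc_0.w_0"           # hypothetical parameter name
gather_var_name = param_name + core.dgc.kDGCGatherName()
print(gather_var_name)            # "fc_0.w_0__dgc_gather__"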
python/paddle/fluid/optimizer.py

@@ -867,7 +867,7 @@ class MomentumOptimizer(Optimizer):
        return momentum_op


class DGCMomentumOptimizer(MomentumOptimizer):
class DGCMomentumOptimizer(Optimizer):
    """
    DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887

@@ -923,6 +923,8 @@ class DGCMomentumOptimizer(MomentumOptimizer):
                sparsity=[0.999, 0.999])
    """
    _u_velocity_acc_str = "_dgc_u_"
    _v_velocity_acc_str = "_dgc_v_"

    def __init__(self,
                 learning_rate,

@@ -935,17 +937,25 @@ class DGCMomentumOptimizer(MomentumOptimizer):
                 num_trainers=None,
                 regularization=None,
                 name=None):
        self._sparsity = sparsity
        self._rampup_step = rampup_step
        self._rampup_step_var = None
        assert learning_rate is not None
        assert momentum is not None
        super(DGCMomentumOptimizer, self).__init__(
            learning_rate=learning_rate,
            regularization=regularization,
            name=name)
        self.type = "dgc_momentum"
        self._momentum = momentum
        self._use_nesterov = bool(use_nesterov)

        self._rampup_begin_step = rampup_begin_step
        self._rampup_begin_step_var = None

        self._rampup_step = rampup_step
        self._sparsity = sparsity

        self._rampup_begin_step_var = None
        self._global_step_var = None

        self._local_grad_clip_norm = None
        self._clip_norm = None
        if local_grad_clip_norm is not None:
            assert isinstance(num_trainers, int)
            assert isinstance(local_grad_clip_norm, float)

@@ -956,9 +966,6 @@ class DGCMomentumOptimizer(MomentumOptimizer):
            self._clip_norm = local_grad_clip_norm / (num_trainers *
                                                      num_trainers)

        super(DGCMomentumOptimizer, self).__init__(
            learning_rate, momentum, use_nesterov, regularization, name)

    def _is_use_dgc(self, param_var, grad_var):
        var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
        if var_numel < 16384 or \

@@ -970,34 +977,36 @@ class DGCMomentumOptimizer(MomentumOptimizer):
    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)
        velocity_acc = self._get_accumulator(self._u_velocity_acc_str,
                                             param_and_grad[0])
        assert velocity_acc is not None

        inputs = {
            "Param": param_and_grad[0],
            "Grad": param_and_grad[1],
            "Velocity": velocity_acc,
            "LearningRate": self._create_param_lr(param_and_grad),
        }
        outputs = {
            "ParamOut": param_and_grad[0],
            "VelocityOut": velocity_acc,
        }
        attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov}

        if not self._is_use_dgc(param_and_grad[0], param_and_grad[1]):
            return super(DGCMomentumOptimizer, self)._append_optimize_op(
                block, param_and_grad)
            type = "momentum"
        else:
            type = "dgc_momentum"
            inputs.update({"current_step": self._global_step_var})
            attrs.update({"rampup_begin_step": float(self._rampup_begin_step)})

        velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                             param_and_grad[0])
        # create the dgc momentum optimize op
        dgc_momentum_op = block.append_op(
            type="dgc_momentum",
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "Velocity": velocity_acc,
                "LearningRate": self._create_param_lr(param_and_grad),
                "current_step": self._global_step_var,
            },
            outputs={
                "ParamOut": param_and_grad[0],
                "VelocityOut": velocity_acc
            },
            attrs={
                "mu": self._momentum,
                "use_nesterov": self._use_nesterov,
                "rampup_begin_step": float(self._rampup_begin_step)
            },
            type=type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=True)
        return dgc_momentum_op

    def _add_auto_increment_var(self, counter_name, begin, step=1):

@@ -1019,8 +1028,20 @@ class DGCMomentumOptimizer(MomentumOptimizer):
        return counter

    def _add_nranks_var(self, name, value=-1):
        helper = LayerHelper('global_step_counter')
        counter, is_new_var = helper.create_or_get_global_variable(
            name=name, dtype='float32', shape=[1], persistable=True)
        if is_new_var:
            helper.set_variable_initializer(
                counter,
                initializer=Constant(value=float(value), force_cpu=True))
            counter.stop_gradient = True

        return counter

    def _append_dgc_ops(self, param_and_grads):
        start_program = default_startup_program()
        main_program = default_main_program()
        main_program._enable_dgc = True

@@ -1028,6 +1049,9 @@ class DGCMomentumOptimizer(MomentumOptimizer):
        self._global_step_var = self._add_auto_increment_var(
            counter_name=core.dgc.kDGCCounterName(), begin=0)

        self._nranks_var = self._add_nranks_var(
            name=core.dgc.kDGCNRanksName(), value=-1)

        # rampup begin step var for all_reduce_op_handle
        self._rampup_begin_step_var = tensor.create_global_var(
            shape=[1],

@@ -1037,22 +1061,16 @@ class DGCMomentumOptimizer(MomentumOptimizer):
            value=self._rampup_begin_step * 1.0,
            force_cpu=True)

        self.helper = LayerHelper(self.__class__.__name__)

        for param_var, grad_var in param_and_grads:
            # reuse velocity in dgc_op and dgc_momentum_op
            u_var = self._add_accumulator(self._u_velocity_acc_str, param_var)

            if not self._is_use_dgc(param_var, grad_var):
                continue

            u_var = tensor.create_global_var(
                shape=param_var.shape,
                dtype=param_var.dtype,
                persistable=True,
                name=param_var.name + core.dgc.kDGCUName(),
                value=0.0)
            v_var = tensor.create_global_var(
                shape=param_var.shape,
                dtype=param_var.dtype,
                persistable=True,
                name=param_var.name + core.dgc.kDGCVName(),
                value=0.0)
            v_var = self._add_accumulator(self._v_velocity_acc_str, param_var)

            k_var = tensor.create_global_var(
                shape=[1],

@@ -1070,6 +1088,14 @@ class DGCMomentumOptimizer(MomentumOptimizer):
                value=0.0,
                force_cpu=False)

            gather_var = tensor.create_global_var(
                shape=[1],
                dtype=param_var.dtype,
                persistable=True,
                name=param_var.name + core.dgc.kDGCGatherName(),
                value=0.0,
                force_cpu=False)

            # del back oprolevarname
            op_maker = core.op_proto_and_checker_maker
            backward = core.op_proto_and_checker_maker.OpRole.Backward

@@ -1092,7 +1118,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
            if self._local_grad_clip_norm is not None:
                clip_var = self._append_clip_norm(grad_var, self._clip_norm)
            self._dgc_op(param_var, clip_var, grad_var, u_var, v_var, k_var,
                         encoded_var)
                         encoded_var, gather_var)

    def _is_the_backward_op(self, op):
        op_maker = core.op_proto_and_checker_maker

@@ -1131,7 +1157,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
            x=grad_var, max_norm=clip_norm, name=grad_var.name)

    def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var,
                encoded_var):
                encoded_var, gather_var):
        block = framework.default_main_program().global_block()
        op_maker = core.op_proto_and_checker_maker
        dgc_op = block.append_op(

@@ -1140,21 +1166,23 @@ class DGCMomentumOptimizer(MomentumOptimizer):
                "U": u_var,
                "V": v_var,
                "Grad": clip_var,
                "current_step": self._global_step_var
                "current_step": self._global_step_var,
                "nranks": self._nranks_var,
            },
            outputs={
                "U_out": u_var,
                "V_out": v_var,
                "EncodeGrad": encoded_var,
                "k": k_var,
                "Grad_out": grad_var
                "Grad_out": grad_var,
                "GatherBuff": gather_var,
            },
            attrs={
                "m": self._momentum,
                "sparsity": self._sparsity,
                "use_nesterov": self._use_nesterov,
                "rampup_begin_step": float(self._rampup_begin_step),
                "rampup_step": float(self._rampup_step)
                "rampup_step": float(self._rampup_step),
            },
            stop_gradient=True)
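Taken together, the optimizer changes reuse the momentum velocity accumulators (_dgc_u_ and _dgc_v_), register the nranks variable, and pass the new gather buffer to the dgc op. A minimal sketch of constructing the optimizer from Python, with a made-up network and illustrative hyper-parameter values; note DGC only takes effect in multi-trainer runs, since the kernel enforces nranks > 1:

import paddle.fluid as fluid

# Hypothetical network; layer and variable names are illustrative only.
image = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
fc = fluid.layers.fc(input=image, size=10, act='softmax')
loss = fluid.layers.mean(fluid.layers.cross_entropy(input=fc, label=label))

optimizer = fluid.optimizer.DGCMomentumOptimizer(
    learning_rate=0.0001,
    momentum=0.9,
    rampup_step=1000,
    rampup_begin_step=1252,
    sparsity=[0.999, 0.999])
optimizer.minimize(loss)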
python/paddle/fluid/tests/unittests/test_dgc_op.py

@@ -34,7 +34,7 @@ class TestDGCOp(unittest.TestCase):
        print("place:", place)

        # numpy data
        # inputs: U, V, Grad, current_step
        # inputs: U, V, Grad, current_step, nranks
        self.u_name = "U"
        self.u = np.random.random(size).astype("float32")

@@ -47,10 +47,14 @@ class TestDGCOp(unittest.TestCase):
        self.current_step_name = "current_step"
        self.current_step = np.full((1), 0.0).astype("float32")

        # output: U_out, V_out, EncodeGrad, GradLocal_out
        self.nranks_name = "nranks"
        self.nranks = np.full((1), 2.0).astype("float32")

        # output: U_out, V_out, EncodeGrad, GradLocal_out, k, GatherBuff
        self.encode_grad_name = "EncodeGrad"
        self.k_name = "k"
        self.k = np.full((1), 0.0).astype("float32")
        self.gather_buff_name = "GatherBuff"

        # scope data
        self.u_tensor = self.scope.var(self.u_name).get_tensor()

@@ -62,16 +66,22 @@ class TestDGCOp(unittest.TestCase):
        self.grad_tensor = self.scope.var(self.grad_name).get_tensor()
        self.grad_tensor.set(self.grad, place)

        self.encode_grad_tensor = self.scope.var(
            self.encode_grad_name).get_tensor()

        self.current_step_tensor = self.scope.var(
            self.current_step_name).get_tensor()
        self.current_step_tensor.set(self.current_step, core.CPUPlace())

        self.nranks_tensor = self.scope.var(self.nranks_name).get_tensor()
        self.nranks_tensor.set(self.nranks, core.CPUPlace())

        self.encode_grad_tensor = self.scope.var(
            self.encode_grad_name).get_tensor()

        self.k_tensor = self.scope.var(self.k_name).get_tensor()
        self.k_tensor.set(self.k, core.CPUPlace())

        self.gather_buff_tensor = self.scope.var(
            self.gather_buff_name).get_tensor()

    def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
        self.assertTrue(np.allclose(

@@ -87,6 +97,7 @@ class TestDGCOp(unittest.TestCase):
            'V': self.v_name,
            'Grad': self.grad_name,
            'current_step': self.current_step_name,
            'nranks': self.nranks_name,

            # outputs
            'U_out': self.u_name,

@@ -94,6 +105,7 @@ class TestDGCOp(unittest.TestCase):
            'EncodeGrad': self.encode_grad_name,
            'Grad_out': self.grad_name,
            'k': self.k_name,
            'GatherBuff': self.gather_buff_name,

            # attrs
            'm': 0.9,
python/paddle/fluid/tests/unittests/test_dgc_optimizer.py

@@ -29,7 +29,7 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
            return self._accumulators

        def get_velocity_str(self):
            return self._velocity_acc_str
            return self._u_velocity_acc_str

    def check_dgc_momentum_optimizer(self, dims=[5, 10, 8], name="momentum"):
        init_program = framework.Program()

@@ -66,8 +66,10 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
        # params_grads = append_backward(mean_out)
        params_grads = dgc_momentum_optimizer.backward(mean_out)

        accumulator_count = 1 if name == "momentum" else 2
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(dgc_momentum_optimizer.get_accumulators()), 0)
        self.assertEqual(
            len(dgc_momentum_optimizer.get_accumulators()), accumulator_count)
        with framework.program_guard(program, init_program):
            opts = dgc_momentum_optimizer.apply_gradients(params_grads)
        self.assertEqual(len(opts), 2)

@@ -77,7 +79,7 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
        # Check accumulators
        accumulators = dgc_momentum_optimizer.get_accumulators()
        self.assertEqual(len(accumulators), 1)
        self.assertEqual(len(accumulators), accumulator_count)
        self.assertTrue(
            dgc_momentum_optimizer.get_velocity_str() in accumulators)
        velocity_acc = accumulators[dgc_momentum_optimizer.get_velocity_str()]

@@ -86,11 +88,9 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
        # Check init_program
        init_ops = init_program.global_block().ops
        self.assertEqual(len(init_ops), 2)
        self.assertEqual(len(init_ops), 1)
        self.assertEqual(init_ops[0].type, "fill_constant")
        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
        self.assertEqual(init_ops[1].type, "fill_constant")
        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)

        with open("test_dgc_optimizer_" + name + ".log", "w") as f:
            program_to_code(program, fout=f)