Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
427c5529
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
427c5529
编写于
8月 01, 2020
作者:
Y
Yi Liu
提交者:
GitHub
8月 01, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add localsgd meta optimizer (#25758)
* add localsgd meta optimizer
上级
2d24f56a
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
387 addition
and
5 deletion
+387
-5
python/paddle/fleet/base/meta_optimizer_factory.py
python/paddle/fleet/base/meta_optimizer_factory.py
+2
-0
python/paddle/fleet/meta_optimizers/__init__.py
python/paddle/fleet/meta_optimizers/__init__.py
+2
-0
python/paddle/fleet/meta_optimizers/common.py
python/paddle/fleet/meta_optimizers/common.py
+126
-0
python/paddle/fleet/meta_optimizers/localsgd_optimizer.py
python/paddle/fleet/meta_optimizers/localsgd_optimizer.py
+193
-0
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+9
-5
python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
...uid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
+55
-0
未找到文件。
python/paddle/fleet/base/meta_optimizer_factory.py
浏览文件 @
427c5529
...
@@ -16,6 +16,7 @@ from ..meta_optimizers import RecomputeOptimizer
...
@@ -16,6 +16,7 @@ from ..meta_optimizers import RecomputeOptimizer
from
..meta_optimizers
import
GradientMergeOptimizer
from
..meta_optimizers
import
GradientMergeOptimizer
from
..meta_optimizers
import
GraphExecutionOptimizer
from
..meta_optimizers
import
GraphExecutionOptimizer
from
..meta_optimizers
import
PipelineOptimizer
from
..meta_optimizers
import
PipelineOptimizer
from
..meta_optimizers
import
LocalSGDOptimizer
__all__
=
[
"MetaOptimizerFactory"
]
__all__
=
[
"MetaOptimizerFactory"
]
...
@@ -24,6 +25,7 @@ meta_optimizer_names = [
...
@@ -24,6 +25,7 @@ meta_optimizer_names = [
"GradientMergeOptimizer"
,
"GradientMergeOptimizer"
,
"GraphExecutionOptimizer"
,
"GraphExecutionOptimizer"
,
"PipelineOptimizer"
,
"PipelineOptimizer"
,
"LocalSGDOptimizer"
,
]
]
...
...
python/paddle/fleet/meta_optimizers/__init__.py
浏览文件 @
427c5529
...
@@ -15,9 +15,11 @@ from .recompute_optimizer import RecomputeOptimizer
...
@@ -15,9 +15,11 @@ from .recompute_optimizer import RecomputeOptimizer
from
.gradient_merge_optimizer
import
GradientMergeOptimizer
from
.gradient_merge_optimizer
import
GradientMergeOptimizer
from
.graph_execution_optimizer
import
GraphExecutionOptimizer
from
.graph_execution_optimizer
import
GraphExecutionOptimizer
from
.pipeline_optimizer
import
PipelineOptimizer
from
.pipeline_optimizer
import
PipelineOptimizer
from
.localsgd_optimizer
import
LocalSGDOptimizer
__all__
=
[
__all__
=
[
'RecomputeOptimizer'
,
'RecomputeOptimizer'
,
'GradientMergeOptimizer'
,
'GradientMergeOptimizer'
,
'PipelineOptimizer'
,
'PipelineOptimizer'
,
'LocalSGDOptimizer'
,
]
]
python/paddle/fleet/meta_optimizers/common.py
0 → 100644
浏览文件 @
427c5529
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
,
unique_name
from
..base.private_helper_function
import
wait_server_ready
OpRole
=
core
.
op_proto_and_checker_maker
.
OpRole
OP_ROLE_KEY
=
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
()
OP_ROLE_VAR_KEY
=
core
.
op_proto_and_checker_maker
.
kOpRoleVarAttrName
()
def
is_update_op
(
op
):
return
'Param'
in
op
.
input_names
and
'Grad'
in
op
.
input_names
and
\
"LearningRate"
in
op
.
input_names
def
is_loss_grad_op
(
op
):
if
OP_ROLE_KEY
not
in
op
.
attr_names
:
return
False
op_role
=
int
(
op
.
all_attrs
()[
OP_ROLE_KEY
])
return
op_role
&
int
(
OpRole
.
Backward
)
and
op_role
&
int
(
OpRole
.
Loss
)
def
is_backward_op
(
op
):
return
OP_ROLE_KEY
in
op
.
attr_names
and
\
int
(
op
.
all_attrs
()[
OP_ROLE_KEY
])
&
int
(
OpRole
.
Backward
)
def
is_optimizer_op
(
op
):
return
OP_ROLE_KEY
in
op
.
attr_names
and
\
int
(
op
.
all_attrs
()[
OP_ROLE_KEY
])
&
int
(
OpRole
.
Optimize
)
class
CollectiveHelper
(
object
):
def
__init__
(
self
,
role_maker
,
nrings
=
1
,
wait_port
=
'6174'
):
self
.
nrings
=
nrings
self
.
wait_port
=
wait_port
self
.
role_maker
=
role_maker
def
update_startup_program
(
self
,
startup_program
=
None
):
self
.
startup_program
=
startup_program
if
startup_program
is
None
:
self
.
startup_program
=
fluid
.
default_startup_program
()
endpoints
=
self
.
role_maker
.
get_trainer_endpoints
()
current_endpoint
=
endpoints
[
self
.
role_maker
.
worker_index
()]
for
ring_id
in
range
(
self
.
nrings
):
self
.
_init_communicator
(
self
.
startup_program
,
current_endpoint
,
endpoints
,
self
.
role_maker
.
worker_index
(),
ring_id
,
self
.
wait_port
)
self
.
_broadcast_params
()
def
_init_communicator
(
self
,
program
,
current_endpoint
,
endpoints
,
rank
,
ring_id
,
wait_port
):
nranks
=
len
(
endpoints
)
other_endpoints
=
endpoints
[:]
other_endpoints
.
remove
(
current_endpoint
)
if
rank
==
0
and
wait_port
:
wait_server_ready
(
other_endpoints
)
block
=
program
.
global_block
()
nccl_id_var
=
block
.
create_var
(
name
=
unique_name
.
generate
(
'nccl_id'
),
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
block
.
append_op
(
type
=
'c_gen_nccl_id'
,
inputs
=
{},
outputs
=
{
'Out'
:
nccl_id_var
},
attrs
=
{
'rank'
:
rank
,
'endpoint'
:
current_endpoint
,
'other_endpoints'
:
other_endpoints
,
OP_ROLE_KEY
:
OpRole
.
Forward
})
block
.
append_op
(
type
=
'c_comm_init'
,
inputs
=
{
'X'
:
nccl_id_var
},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
rank
,
'ring_id'
:
ring_id
,
OP_ROLE_KEY
:
OpRole
.
Forward
})
def
_broadcast_params
(
self
):
block
=
self
.
startup_program
.
global_block
()
ring_id
=
-
1
for
param
in
block
.
iter_parameters
():
if
param
.
is_distributed
:
continue
ring_id
=
(
ring_id
+
1
)
%
self
.
nrings
block
.
append_op
(
type
=
'c_broadcast'
,
inputs
=
{
'X'
:
param
},
outputs
=
{
'Out'
:
param
},
attrs
=
{
'ring_id'
:
ring_id
,
'root'
:
0
,
OP_ROLE_KEY
:
OpRole
.
Forward
})
for
ring_id
in
range
(
self
.
nrings
):
block
.
append_op
(
type
=
'c_sync_comm_stream'
,
inputs
=
{
'X'
:
param
},
outputs
=
{
'Out'
:
param
},
attrs
=
{
'ring_id'
:
ring_id
,
OP_ROLE_KEY
:
OpRole
.
Forward
})
python/paddle/fleet/meta_optimizers/localsgd_optimizer.py
0 → 100644
浏览文件 @
427c5529
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
from
paddle.fluid
import
program_guard
,
layers
from
paddle.fluid.optimizer
import
Momentum
,
SGD
from
.meta_optimizer_base
import
MetaOptimizerBase
from
.common
import
OpRole
,
OP_ROLE_KEY
,
CollectiveHelper
,
is_update_op
class
LocalSGDOptimizer
(
MetaOptimizerBase
):
def
__init__
(
self
,
optimizer
):
super
(
LocalSGDOptimizer
,
self
).
__init__
(
optimizer
)
self
.
inner_opt
=
optimizer
self
.
meta_optimizers_white_list
=
[]
self
.
snapshot_key
=
'@SNAPSHOT'
def
_can_apply
(
self
):
if
not
self
.
user_defined_strategy
.
localsgd
:
return
False
if
self
.
role_maker
.
worker_num
()
<=
1
:
return
False
return
isinstance
(
self
.
inner_opt
,
Momentum
)
\
or
isinstance
(
self
.
inner_opt
,
SGD
)
def
_disable_strategy
(
self
,
dist_strategy
):
dist_strategy
.
localsgd
=
False
dist_strategy
.
localsgd_configs
=
{
'k_steps'
:
1
}
def
snapshot_name
(
self
,
param_name
):
return
param_name
+
self
.
snapshot_key
def
minimize_impl
(
self
,
loss
,
startup_program
=
None
,
parameter_list
=
None
,
no_grad_set
=
None
):
minimized
=
self
.
inner_opt
.
minimize
(
loss
,
startup_program
=
startup_program
)
init_k_steps
=
self
.
user_defined_strategy
.
localsgd_configs
[
'k_steps'
]
auto_steps
=
self
.
user_defined_strategy
.
auto
if
startup_program
is
None
:
startup_program
=
default_startup_program
()
main_block
=
loss
.
block
self
.
nrings
=
2
collective_helper
=
CollectiveHelper
(
self
.
role_maker
,
self
.
nrings
)
collective_helper
.
update_startup_program
(
startup_program
)
with
program_guard
(
main_block
.
program
):
step
=
layers
.
autoincreased_step_counter
(
begin
=
0
)
k_steps
=
layers
.
create_global_var
(
name
=
"k_steps"
,
shape
=
[
1
],
value
=
init_k_steps
,
dtype
=
'int64'
,
persistable
=
True
)
last_step
=
layers
.
create_global_var
(
name
=
"last_step"
,
shape
=
[
1
],
value
=
int
(
0
),
dtype
=
'int64'
,
persistable
=
True
)
if
auto_steps
:
lr_0
=
layers
.
create_global_var
(
name
=
"lr_0"
,
shape
=
[
1
],
value
=
float
(
0
),
dtype
=
'float32'
,
persistable
=
True
)
loss_0
=
layers
.
create_global_var
(
name
=
"loss_0"
,
shape
=
[
1
],
value
=
float
(
0
),
dtype
=
'float32'
,
persistable
=
True
)
global_lr
=
self
.
inner_opt
.
_global_learning_rate
()
def
initialize
():
layers
.
assign
(
loss
,
loss_0
)
layers
.
assign
(
global_lr
,
lr_0
)
layers
.
cond
(
step
==
0
,
initialize
)
def
communicate
():
ordered_param_snapshot
=
[]
ring_id
=
-
1
for
idx
,
op
in
reversed
(
list
(
enumerate
(
main_block
.
ops
))):
if
is_update_op
(
op
):
param
=
main_block
.
vars
[
op
.
input
(
'Param'
)[
0
]]
if
param
.
is_distributed
:
continue
snapshot
=
main_block
.
create_var
(
name
=
self
.
snapshot_name
(
param
.
name
),
shape
=
param
.
shape
,
persistable
=
True
,
stop_gradient
=
True
,
dtype
=
param
.
dtype
)
main_block
.
_insert_op
(
idx
+
1
,
type
=
'elementwise_sub'
,
inputs
=
{
'X'
:
[
snapshot
],
'Y'
:
[
param
]},
outputs
=
{
'Out'
:
[
param
]},
attrs
=
{
OP_ROLE_KEY
:
OpRole
.
Optimize
})
main_block
.
_insert_op
(
idx
+
2
,
type
=
'c_sync_calc_stream'
,
inputs
=
{
'X'
:
param
},
outputs
=
{
'Out'
:
param
},
attrs
=
{
OP_ROLE_KEY
:
OpRole
.
Optimize
})
ring_id
=
(
ring_id
+
1
)
%
self
.
nrings
main_block
.
_insert_op
(
idx
+
3
,
type
=
'c_allreduce_sum'
,
inputs
=
{
'X'
:
[
param
]},
outputs
=
{
'Out'
:
[
param
]},
attrs
=
{
'ring_id'
:
ring_id
,
OP_ROLE_KEY
:
OpRole
.
Optimize
})
ordered_param_snapshot
.
append
((
param
,
snapshot
))
for
ring_id
in
range
(
self
.
nrings
):
main_block
.
append_op
(
type
=
'c_sync_comm_stream'
,
inputs
=
{
'X'
:
param
},
outputs
=
{
'Out'
:
param
},
attrs
=
{
'ring_id'
:
ring_id
,
OP_ROLE_KEY
:
OpRole
.
Optimize
})
for
param_snapshot
in
reversed
(
ordered_param_snapshot
):
param
=
param_snapshot
[
0
]
snapshot
=
param_snapshot
[
1
]
main_block
.
append_op
(
type
=
'scale'
,
inputs
=
{
'X'
:
[
param
]},
outputs
=
{
'Out'
:
[
param
]},
attrs
=
{
'scale'
:
1.0
/
self
.
role_maker
.
worker_num
(),
OP_ROLE_KEY
:
OpRole
.
Optimize
})
main_block
.
append_op
(
type
=
'elementwise_sub'
,
inputs
=
{
'X'
:
[
snapshot
],
'Y'
:
[
param
]},
outputs
=
{
'Out'
:
[
param
]},
attrs
=
{
OP_ROLE_KEY
:
OpRole
.
Optimize
})
main_block
.
append_op
(
type
=
'assign'
,
inputs
=
{
'X'
:
[
param
]},
outputs
=
{
'Out'
:
[
snapshot
]},
attrs
=
{
OP_ROLE_KEY
:
OpRole
.
Optimize
})
if
auto_steps
:
next_local_steps
=
layers
.
cast
(
layers
.
ceil
(
layers
.
sqrt
(
lr_0
*
loss
/
(
global_lr
*
loss_0
)
*
float
(
init_k_steps
))),
dtype
=
'int64'
)
max_local_steps
=
layers
.
fill_constant
(
shape
=
[
1
],
dtype
=
'int64'
,
value
=
16
)
next_local_steps
=
layers
.
elementwise_min
(
next_local_steps
,
max_local_steps
)
layers
.
assign
(
next_local_steps
,
k_steps
)
layers
.
assign
(
step
,
last_step
)
layers
.
cond
(
step
-
last_step
==
k_steps
,
communicate
)
return
minimized
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
427c5529
...
@@ -34,6 +34,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_base)
...
@@ -34,6 +34,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_base)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_private_function
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_private_function
)
foreach
(
TEST_OP
${
MIXED_DIST_TEST_OPS
}
)
foreach
(
TEST_OP
${
MIXED_DIST_TEST_OPS
}
)
list
(
REMOVE_ITEM TEST_OPS
${
TEST_OP
}
)
list
(
REMOVE_ITEM TEST_OPS
${
TEST_OP
}
)
...
@@ -363,11 +364,14 @@ if(WITH_DISTRIBUTE)
...
@@ -363,11 +364,14 @@ if(WITH_DISTRIBUTE)
py_test_modules
(
test_communicator_sync MODULES test_communicator_sync ENVS
${
dist_ENVS
}
FLAGS_communicator_send_queue_size=1 FLAGS_communicator_max_merge_var_num=1
)
py_test_modules
(
test_communicator_sync MODULES test_communicator_sync ENVS
${
dist_ENVS
}
FLAGS_communicator_send_queue_size=1 FLAGS_communicator_max_merge_var_num=1
)
py_test_modules
(
test_collective_optimizer MODULES test_collective_optimizer
)
py_test_modules
(
test_collective_optimizer MODULES test_collective_optimizer
)
if
(
NOT APPLE
)
if
(
NOT APPLE
)
py_test_modules
(
test_fleet_base MODULES test_fleet_base ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_base MODULES test_fleet_base ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_meta_optimizer MODULES test_fleet_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_meta_optimizer MODULES test_fleet_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_private_function MODULES test_fleet_private_function ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_private_function MODULES test_fleet_private_function ENVS
${
dist_ENVS
}
)
if
(
NOT WIN32
)
py_test_modules
(
test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS
${
dist_ENVS
}
)
endif
(
NOT WIN32
)
endif
(
NOT APPLE
)
endif
(
NOT APPLE
)
if
(
WITH_DGC
)
if
(
WITH_DGC
)
# if with dgc, test all dgc tests.
# if with dgc, test all dgc tests.
...
...
python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
0 → 100644
浏览文件 @
427c5529
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
paddle
import
os
import
paddle.fleet
as
fleet
import
paddle.fluid.incubate.fleet.base.role_maker
as
role_maker
class
TestFleetLocalSGDMetaOptimizer
(
unittest
.
TestCase
):
def
setUp
(
self
):
os
.
environ
[
"PADDLE_TRAINER_ID"
]
=
"1"
os
.
environ
[
"PADDLE_TRAINER_ENDPOINTS"
]
=
"127.0.0.1:36001,127.0.0.1:36002"
def
test_localsgd_optimizer
(
self
):
role
=
role_maker
.
PaddleCloudRoleMaker
(
is_collective
=
True
)
fleet
.
init
(
role
)
input_x
=
paddle
.
fluid
.
layers
.
data
(
name
=
"x"
,
shape
=
[
32
],
dtype
=
'float32'
)
input_y
=
paddle
.
fluid
.
layers
.
data
(
name
=
"y"
,
shape
=
[
1
],
dtype
=
'int64'
)
fc
=
paddle
.
fluid
.
layers
.
fc
(
input
=
input_x
,
size
=
64
,
act
=
'tanh'
)
prediction
=
paddle
.
fluid
.
layers
.
fc
(
input
=
[
fc
],
size
=
2
,
act
=
'softmax'
)
cost
=
paddle
.
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
input_y
)
avg_cost
=
paddle
.
fluid
.
layers
.
mean
(
x
=
cost
)
strategy
=
paddle
.
fleet
.
DistributedStrategy
()
strategy
.
localsgd
=
True
strategy
.
auto
=
True
config
=
strategy
.
localsgd_configs
config
[
'k_steps'
]
=
1
strategy
.
localsgd_configs
=
config
optimizer
=
paddle
.
optimizer
.
SGD
(
learning_rate
=
0.01
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
strategy
)
optimizer
.
minimize
(
avg_cost
)
if
__name__
==
"__main__"
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录