机器未来 / Paddle · forked from PaddlePaddle / Paddle
Commit 32ad4f90 (unverified)
Authored Sep 24, 2020 by 123malin · Committed by GitHub on Sep 24, 2020

【paddle.fleet】 Usages Change: from fleet.util() to fleet.util (#27468)

* test=develop, bug fix

Parent: df7fabee
Showing 16 changed files with 120 additions and 144 deletions (+120 −144)
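The change is purely at the call sites: `fleet.util` stops being a method and becomes a plain attribute backed by a module-level UtilBase singleton, so every `fleet.util()` becomes `fleet.util` and the standalone `fleet_util` helper goes away. A minimal before/after sketch of the user-facing API (assuming a distributed job launched with fleetrun):

    import paddle.distributed.fleet as fleet

    fleet.init()  # attaches the role maker to fleet.util internally

    # before this commit:
    #   util = fleet.util()   # method call; could return None before minimize()
    # after this commit:
    util = fleet.util          # plain attribute; a ready UtilBase instance

    files = util.get_file_shard(["1.log", "2.log", "3.log", "4.log"])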
python/paddle/distributed/fleet/__init__.py                              +3   −3
python/paddle/distributed/fleet/base/fleet_base.py                       +5   −27
python/paddle/distributed/fleet/base/util_factory.py                     +33  −30
python/paddle/distributed/fleet/launch.py                                +4   −6
python/paddle/distributed/fleet/utils/__init__.py                        +2   −0
python/paddle/distributed/fleet/utils/fs.py                              +26  −26
python/paddle/fluid/tests/unittests/dist_fleet_ctr.py                    +3   −4
python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py             +3   −4
python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py              +1   −2
python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py             +2   −3
python/paddle/fluid/tests/unittests/test_dist_fleet_base.py              +2   −5
python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py        +1   −4
python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py     +0   −1
python/paddle/fluid/tests/unittests/test_fleet_base.py                   +1   −1
python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py          +6   −6
python/paddle/fluid/tests/unittests/test_fleet_util.py                   +28  −22
python/paddle/distributed/fleet/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.

 # TODO: define distributed api under this directory,
-from .base.role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker
+from .base.role_maker import Role, UserDefinedRoleMaker, PaddleCloudRoleMaker
 from .base.distributed_strategy import DistributedStrategy
 from .base.fleet_base import Fleet
 from .base.util_factory import UtilBase
@@ -26,6 +26,7 @@ __all__ = [
     "UserDefinedRoleMaker",
     "PaddleCloudRoleMaker",
     "Fleet",
+    "Role",
 ]

 fleet = Fleet()
@@ -39,8 +40,7 @@ server_num = fleet.server_num
 server_index = fleet.server_index
 server_endpoints = fleet.server_endpoints
 is_server = fleet.is_server
-set_util = fleet.set_util
-util = fleet.util
+util = UtilBase()
 barrier_worker = fleet.barrier_worker
 init_worker = fleet.init_worker
 init_server = fleet.init_server
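Because `util` is now instantiated at import time, it exists before `fleet.init()` is called; `init()` merely wires the role maker into it. A small check, using only names exported above:

    import paddle.distributed.fleet as fleet
    from paddle.distributed.fleet.base.util_factory import UtilBase

    # the module-level singleton replaces the old per-Fleet _util field
    assert isinstance(fleet.util, UtilBase)

    # Role is newly re-exported from the package root
    from paddle.distributed.fleet import Role
    assert Role.WORKER is not None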
python/paddle/distributed/fleet/base/fleet_base.py
@@ -23,7 +23,6 @@ from .strategy_compiler import StrategyCompiler
 from .distributed_strategy import DistributedStrategy
 from .meta_optimizer_factory import MetaOptimizerFactory
 from .runtime_factory import RuntimeFactory
-from .util_factory import UtilFactory
 from paddle.fluid.wrapped_decorator import wrap_decorator
 from paddle.fluid.dygraph import parallel_helper
@@ -120,7 +119,6 @@ class Fleet(object):
         self.strategy_compiler = None
         self._is_collective = False
         self._runtime_handle = None
-        self._util = None

     def init(self, role_maker=None, is_collective=False):
         """
@@ -182,6 +180,9 @@ class Fleet(object):
                     format(type(role_maker)))
         self._role_maker._generate_role()

+        import paddle.distributed.fleet as fleet
+        fleet.util._set_role_maker(self._role_maker)
+
         self.strategy_compiler = StrategyCompiler()
         if paddle.fluid.framework.in_dygraph_mode():
             if parallel_helper._is_parallel_ctx_initialized():
@@ -353,29 +354,6 @@ class Fleet(object):
         return self._role_maker._is_server(
         ) or self._role_maker._is_heter_worker()

-    def set_util(self, util):
-        self._util = util
-
-    def util(self):
-        """
-        Utility functions that can be used under certain runtime
-        return util
-
-        Returns:
-            UtilBase: instance of UtilBase, can use distributed ops/tools easily.
-
-        Examples:
-            .. code-block:: python
-                import paddle.distributed.fleet as fleet
-                fleet.init()
-                util = fleet.util
-                files = ["1.log", "2.log", "3.log", "4.log"]
-                files = util.get_file_shard()
-        """
-        return self._util
-
     def barrier_worker(self):
         """
         barrier all workers
@@ -1102,7 +1080,7 @@ class Fleet(object):
         if self._runtime_handle is None:
             self._runtime_handle = RuntimeFactory()._create_runtime(context)

-        if self._util is None:
-            self._util = UtilFactory()._create_util(context)
+        import paddle.distributed.fleet as fleet
+        fleet.util._set_strategy(context["valid_strategy"])

         return optimize_ops, params_grads
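With `set_util()` and the `util()` accessor removed from Fleet, initialization alone is enough to make the utility object usable; `minimize()` later feeds it the validated strategy. A sketch of the resulting flow (assuming a PaddleCloud-style environment):

    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)   # runs fleet.util._set_role_maker(role) internally

    # no fleet.set_util(...) and no fleet.util() call anymore
    assert fleet.util is not None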
python/paddle/distributed/fleet/base/util_factory.py
@@ -73,11 +73,13 @@ class UtilBase(object):
             .. code-block:: python

                 # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
-                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet as fleet
                 from paddle.distributed.fleet import PaddleCloudRoleMaker
                 import sys
                 import numpy as np
                 import os
+
+                os.environ["PADDLE_WITH_GLOO"] = "2"

                 def train():
                     role = PaddleCloudRoleMaker(
@@ -85,19 +87,18 @@ class UtilBase(object):
                         init_gloo=True,
                         path="./tmp_gloo")
                     fleet.init(role)
-                    fleet_util._set_role_maker(role)

                     if fleet.is_server():
                         input = [1, 2]
-                        output = fleet_util.all_reduce(input, "sum", "server")
+                        output = fleet.util.all_reduce(input, "sum", "server")
                         print(output)
                         # [2, 4]
                     elif fleet.is_worker():
                         input = np.array([3, 4])
-                        output = fleet_util.all_reduce(input, "sum", "worker")
+                        output = fleet.util.all_reduce(input, "sum", "worker")
                         print(output)
                         # [6, 8]
-                    output = fleet_util.all_reduce(input, "sum", "all")
+                    output = fleet.util.all_reduce(input, "sum", "all")
                     print(output)
                     # [8, 12]

                 if __name__ == "__main__":
@@ -117,10 +118,12 @@ class UtilBase(object):
             .. code-block:: python

                 # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
-                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet as fleet
                 from paddle.distributed.fleet import PaddleCloudRoleMaker
                 import sys
                 import os
+
+                os.environ["PADDLE_WITH_GLOO"] = "2"

                 def train():
                     role = PaddleCloudRoleMaker(
@@ -128,15 +131,14 @@ class UtilBase(object):
                         init_gloo=True,
                         path="./tmp_gloo")
                     fleet.init(role)
-                    fleet_util._set_role_maker(role)

                     if fleet.is_server():
-                        fleet_util.barrier("server")
+                        fleet.util.barrier("server")
                         print("all server arrive here")
                     elif fleet.is_worker():
-                        fleet_util.barrier("worker")
+                        fleet.util.barrier("worker")
                         print("all server arrive here")
-                    fleet_util.barrier("all")
+                    fleet.util.barrier("all")
                     print("all servers and workers arrive here")

                 if __name__ == "__main__":
@@ -160,10 +162,12 @@ class UtilBase(object):
             .. code-block:: python

                 # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
-                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet as fleet
                 from paddle.distributed.fleet import PaddleCloudRoleMaker
                 import sys
                 import os
+
+                os.environ["PADDLE_WITH_GLOO"] = "2"

                 def train():
                     role = PaddleCloudRoleMaker(
@@ -171,19 +175,18 @@ class UtilBase(object):
                         init_gloo=True,
                         path="./tmp_gloo")
                     fleet.init(role)
-                    fleet_util._set_role_maker(role)

                     if fleet.is_server():
                         input = fleet.server_index()
-                        output = fleet_util.all_gather(input, "server")
+                        output = fleet.util.all_gather(input, "server")
                         print(output)
                         # output = [0, 1]
                     elif fleet.is_worker():
                         input = fleet.worker_index()
-                        output = fleet_util.all_gather(input, "worker")
+                        output = fleet.util.all_gather(input, "worker")
                         # output = [0, 1]
                         print(output)
-                    output = fleet_util.all_gather(input, "all")
+                    output = fleet.util.all_gather(input, "all")
                     print(output)
                     # output = [0, 1, 0, 1]
@@ -220,18 +223,20 @@ class UtilBase(object):
             .. code-block:: python

-                from paddle.distributed.fleet.base.util_factory import fleet_util
-                import paddle.distributed.fleet.base.role_maker as role_maker
+                import paddle.distributed.fleet as fleet
+                from paddle.distributed.fleet import UserDefinedRoleMaker

-                role = role_maker.UserDefinedRoleMaker(
+                role = UserDefinedRoleMaker(
                     is_collective=False,
                     init_gloo=False,
                     current_id=0,
-                    role=role_maker.Role.WORKER,
+                    role=fleet.Role.WORKER,
                     worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
                     server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])

-                fleet_util._set_role_maker(role)
-                files = fleet_util.get_file_shard(["file1", "file2", "file3"])
+                fleet.init(role)
+                files = fleet.util.get_file_shard(["file1", "file2", "file3"])
                 print(files)
                 # files = ["file1", "file2"]
         """
         if not isinstance(files, list):
@@ -267,18 +272,19 @@ class UtilBase(object):
             .. code-block:: python

-                from paddle.distributed.fleet.base.util_factory import fleet_util
-                import paddle.distributed.fleet.base.role_maker as role_maker
+                import paddle.distributed.fleet as fleet
+                from paddle.distributed.fleet import UserDefinedRoleMaker

-                role = role_maker.UserDefinedRoleMaker(
+                role = UserDefinedRoleMaker(
                     is_collective=False,
                     init_gloo=False,
                     current_id=0,
-                    role=role_maker.Role.WORKER,
+                    role=fleet.Role.WORKER,
                     worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
                     server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])

-                fleet_util._set_role_maker(role)
-                fleet_util.print_on_rank("I'm worker 0", 0)
+                fleet.init(role)
+                fleet.util.print_on_rank("I'm worker 0", 0)
         """
         if self.role_maker._worker_index() != rank_id:
             return
@@ -577,6 +583,3 @@ class UtilBase(object):
                     print("fetch_targets name: %s" % v.name)
                     print("fetch_targets: {}".format(results[i]))
             return results
-
-
-fleet_util = UtilFactory()._create_util(None)
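The get_file_shard docstring above implies even sharding by trainer index: with two workers and three files, worker 0 receives two files and worker 1 the remaining one. The same example as a standalone script (values match the docstring; no gloo is needed since init_gloo=False):

    import paddle.distributed.fleet as fleet
    from paddle.distributed.fleet import UserDefinedRoleMaker

    role = UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        current_id=0,  # this process plays worker 0
        role=fleet.Role.WORKER,
        worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(role)

    # 3 files over 2 workers: worker 0 gets 2, worker 1 gets 1
    files = fleet.util.get_file_shard(["file1", "file2", "file3"])
    print(files)  # ["file1", "file2"]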
python/paddle/distributed/fleet/launch.py
@@ -181,8 +181,8 @@ def get_gpus(gpus):
             cuda_visible_devices_list = cuda_visible_devices.split(',')
             for x in gpus.split(','):
                 assert x in cuda_visible_devices_list, "Can't find "\
-                "your gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
-                % (x, cuda_visible_devices)
+                    "your gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
+                    % (x, cuda_visible_devices)
             res_gpus = [cuda_visible_devices_list.index(x.strip())
                         for x in gpus.split(',')]
@@ -348,8 +348,7 @@ def launch_ps(args):
             "PADDLE_PORT": cur_server.endpoint.split(":")[1],
             "TRAINING_ROLE": "PSERVER",
             "PADDLE_TRAINERS_NUM": str(worker_num),
-            "POD_IP": cur_server.endpoint.split(":")[0],
-            "PADDLE_WITH_GLOO": "1"
+            "POD_IP": cur_server.endpoint.split(":")[0]
         }
         current_env.update(proc_env)
@@ -388,8 +387,7 @@ def launch_ps(args):
             "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
             "PADDLE_TRAINERS_NUM": str(worker_num),
             "TRAINING_ROLE": "TRAINER",
-            "PADDLE_TRAINER_ID": str(cur_worker.rank),
-            "PADDLE_WITH_GLOO": "1"
+            "PADDLE_TRAINER_ID": str(cur_worker.rank)
         }
         current_env.update(proc_env)
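Note that launch_ps no longer exports PADDLE_WITH_GLOO=1 to the server and trainer processes; the updated docstrings in util_factory.py instead set the variable inside the training script itself. A sketch of that pattern (the value "2" follows the docstrings above):

    import os

    # opt in to gloo explicitly, before fleet.init(),
    # since the launcher no longer sets this for you
    os.environ["PADDLE_WITH_GLOO"] = "2"

    import paddle.distributed.fleet as fleet
    from paddle.distributed.fleet import PaddleCloudRoleMaker

    role = PaddleCloudRoleMaker(
        is_collective=False, init_gloo=True, path="./tmp_gloo")
    fleet.init(role)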
python/paddle/distributed/fleet/utils/__init__.py
@@ -11,3 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from .fs import LocalFS, HDFSClient
python/paddle/distributed/fleet/utils/fs.py
@@ -120,7 +120,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python

-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS

                 client = LocalFS()
                 subdirs, files = client.ls_dir("./")
@@ -140,7 +140,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python

-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS

                 client = LocalFS()
                 subdirs, files = client.ls_dir("./")
@@ -160,7 +160,7 @@ class LocalFS(FS):
     def mkdirs(self, fs_path):
         """
-        Create a remote HDFS directory.
+        Create a local directory.

         Args:
             fs_path(str): The local directory path.
@@ -168,7 +168,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python

-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS

                 client = LocalFS()
                 client.mkdirs("test_mkdirs")
@@ -189,7 +189,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python

-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS

                 client = LocalFS()
                 client.touch("test_rename_src")
@@ -217,7 +217,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python

-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS

                 client = LocalFS()
                 client.mkdirs("test_localFS_mkdirs")
@@ -247,7 +247,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python

-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS

                 client = LocalFS()
                 client.touch("test_is_file")
@@ -269,7 +269,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python

-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS

                 client = LocalFS()
                 client.mkdirs("test_is_dir")
@@ -292,7 +292,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python

-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS

                 client = LocalFS()
                 ret = local_fs.is_exist("test_is_exist")
@@ -311,7 +311,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python

-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS

                 client = LocalFS()
                 client.touch("test_touch")
@@ -332,13 +332,11 @@ class LocalFS(FS):
             src_path(str): Name of the file or directory, that's needed to be moved.
             dst_path(str): Name of the file or directory to which to move to.
             overwrite(bool): Whether to re-write `dst_path` if that exists. Default is False.
-            test_exists(bool): Check the existence of `src_path` and `dst_path` .
-                When `test_exists` is set true, if `src_path` doesn't exist or `dst_path` exists, program will throw an Excetption.

         Examples:
             .. code-block:: python

-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS

                 client = LocalFS()
                 client.touch("test_mv_src")
@@ -369,7 +367,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python

-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS

                 client = LocalFS()
                 subdirs = client.list_dirs("./")
@@ -432,7 +430,7 @@ class HDFSClient(FS):
             .. code-block:: text

-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient

                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -493,7 +491,7 @@ class HDFSClient(FS):
             .. code-block:: text

-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient

                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -526,7 +524,7 @@ class HDFSClient(FS):
             .. code-block:: text

-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient

                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -587,7 +585,7 @@ class HDFSClient(FS):
             .. code-block:: text

-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient

                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -629,7 +627,7 @@ class HDFSClient(FS):
             .. code-block:: text

-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient

                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -661,7 +659,7 @@ class HDFSClient(FS):
             .. code-block:: text

-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient

                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -695,7 +693,7 @@ class HDFSClient(FS):
             .. code-block:: text

-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient

                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -740,7 +738,7 @@ class HDFSClient(FS):
             .. code-block:: text

-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient

                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -784,7 +782,7 @@ class HDFSClient(FS):
             .. code-block:: text

-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient

                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -830,7 +828,7 @@ class HDFSClient(FS):
             .. code-block:: text

-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient

                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -893,7 +891,7 @@ class HDFSClient(FS):
             .. code-block:: text

-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient

                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -919,12 +917,14 @@ class HDFSClient(FS):
         Args:
             fs_path(str): The HDFS file path.
+            exist_ok(bool): When `fs_path` exists, if `exist_ok` is set false,
+                program will throw an Exception. Default is true.

         Examples:
             .. code-block:: text

-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient

                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
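All filesystem examples now import from the package root (paddle.distributed.fleet.utils) instead of the .fs submodule, which the new re-export in utils/__init__.py makes possible. A short LocalFS sketch using only operations shown in the docstrings above:

    from paddle.distributed.fleet.utils import LocalFS

    client = LocalFS()
    client.mkdirs("test_mkdirs")          # create a local directory
    client.touch("test_touch")            # create an empty file
    subdirs, files = client.ls_dir("./")  # list the working directory
    print(subdirs, files)
    print(client.need_upload_download())  # False for a local filesystem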
python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
@@ -28,7 +28,6 @@ import numpy as np

 import ctr_dataset_reader
 from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
-from paddle.distributed.fleet.base.util_factory import fleet_util

 paddle.enable_static()
@@ -180,13 +179,13 @@ class TestDistCTR2x2(FleetDistRunnerBase):
                     fetch_list=[self.avg_cost.name])
                 loss_val = np.mean(loss_val)
                 # TODO(randomly fail)
-                # reduce_output = fleet_util.all_reduce(
+                # reduce_output = fleet.util.all_reduce(
                 #     np.array(loss_val), mode="sum")
-                # loss_all_trainer = fleet_util.all_gather(float(loss_val))
+                # loss_all_trainer = fleet.util.all_gather(float(loss_val))
                 # loss_val = float(reduce_output) / len(loss_all_trainer)
                 message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
                                                                   loss_val)
-                fleet_util.print_on_rank(message, 0)
+                fleet.util.print_on_rank(message, 0)

                 pass_time = time.time() - pass_start
         except fluid.core.EOFException:
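print_on_rank is what keeps these multi-worker test logs readable: the message is printed only by the worker whose index matches the given rank. A minimal sketch (assuming a fleetrun-launched job):

    import paddle.distributed.fleet as fleet

    fleet.init()

    message = "TRAIN ---> pass: {} loss: {}\n".format(0, 0.123)
    fleet.util.print_on_rank(message, 0)  # only worker 0 prints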
python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
@@ -29,7 +29,6 @@ import numpy as np

 import ctr_dataset_reader
 from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
 from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader
-from paddle.distributed.fleet.base.util_factory import fleet_util

 # Fix seed for test
 fluid.default_startup_program().random_seed = 1
@@ -76,13 +75,13 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2):
                 loss_val = exe.run(program=fleet.main_program,
                                    fetch_list=[self.avg_cost.name])
                 loss_val = np.mean(loss_val)
-                reduce_output = fleet_util.all_reduce(
+                reduce_output = fleet.util.all_reduce(
                     np.array(loss_val), mode="sum")
-                loss_all_trainer = fleet_util.all_gather(float(loss_val))
+                loss_all_trainer = fleet.util.all_gather(float(loss_val))
                 loss_val = float(reduce_output) / len(loss_all_trainer)
                 message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
                                                                   loss_val)
-                fleet_util.print_on_rank(message, 0)
+                fleet.util.print_on_rank(message, 0)

                 pass_time = time.time() - pass_start
         except fluid.core.EOFException:
python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
@@ -29,7 +29,6 @@ import numpy as np

 import ctr_dataset_reader
 from test_dist_fleet_heter_base import runtime_main, FleetDistHeterRunnerBase
 from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader
-from paddle.distributed.fleet.base.util_factory import fleet_util

 paddle.enable_static()
@@ -182,7 +181,7 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
         thread_num = int(os.getenv("CPU_NUM", 2))
         batch_size = 128
-        filelist = fleet_util.get_file_shard(train_file_list)
+        filelist = fleet.util.get_file_shard(train_file_list)
         print("filelist: {}".format(filelist))

         # config dataset
python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
@@ -32,7 +32,6 @@ import os
 import signal
 from functools import reduce
 from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
-from paddle.distributed.fleet.base.util_factory import fleet_util

 paddle.enable_static()
@@ -198,7 +197,7 @@ class TestDistSimnetBow2x2(FleetDistRunnerBase):
     def net(self, args, batch_size=4, lr=0.01):
         avg_cost, _, predict, self.reader = \
             train_network(batch_size=batch_size, is_distributed=False,
-                is_sparse=True, is_self_contained_lr=False,
-                is_pyreader=(args.reader == "pyreader"))
+                          is_sparse=True, is_self_contained_lr=False,
+                          is_pyreader=(args.reader == "pyreader"))

         self.avg_cost = avg_cost
         self.predict = predict
@@ -238,7 +237,7 @@ class TestDistSimnetBow2x2(FleetDistRunnerBase):
                 loss_val = np.mean(loss_val)
                 message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
                                                                   loss_val)
-                fleet_util.print_on_rank(message, 0)
+                fleet.util.print_on_rank(message, 0)

                 pass_time = time.time() - pass_start
         except fluid.core.EOFException:
python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
@@ -34,8 +34,7 @@ import unittest

 import paddle
 import paddle.fluid as fluid
 import paddle.distributed.fleet.base.role_maker as role_maker
-from paddle.distributed.fleet.base.util_factory import fleet_util
-from paddle.distributed.fleet import fleet
+import paddle.distributed.fleet as fleet
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory

 __all__ = ['FleetDistRunnerBase', 'TestFleetBase', 'runtime_main']
@@ -97,7 +96,7 @@ class FleetDistRunnerBase(object):
         self.dump_fields_path = os.getenv("dump_fields_path", "")
         debug = int(os.getenv("Debug", "0"))
         # TODO(update strategy to support dump params)
-        if False:  #debug:
+        if False:  # debug:
             self.strategy.set_debug_opt({
                 "dump_param": self.dump_param,
                 "dump_fields": self.dump_fields,
@@ -372,8 +371,6 @@ def runtime_main(test_class):
     strategy = model.build_strategy(args)
     avg_cost = model.net(args)
     model.build_optimizer(avg_cost, strategy)
-    fleet_util._set_strategy(strategy)
-    fleet_util._set_role_maker(role)

     if args.role == "pserver":
         model.run_pserver(args)
     else:
python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
@@ -34,8 +34,7 @@ import unittest

 import paddle
 import paddle.fluid as fluid
 import paddle.distributed.fleet.base.role_maker as role_maker
-from paddle.distributed.fleet.base.util_factory import fleet_util
-from paddle.distributed.fleet import fleet
+import paddle.distributed.fleet as fleet

 __all__ = ['FleetDistHeterRunnerBase', 'TestFleetHeterBase', 'runtime_main']
@@ -376,8 +375,6 @@ def runtime_main(test_class):
     strategy = model.build_strategy(args)
     avg_cost = model.net(args)
     model.build_optimizer(avg_cost, strategy)
-    fleet_util._set_strategy(strategy)
-    fleet_util._set_role_maker(role)

     if args.role == "pserver" or args.role == "heter_trainer":
         model.run_pserver(args)
python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
@@ -19,7 +19,6 @@ import os
 import math
 import paddle.fluid as fluid
 import paddle.distributed.fleet.base.role_maker as role_maker
-from paddle.distributed.fleet.base.util_factory import fleet_util
 from paddle.distributed.fleet import fleet
 import paddle
python/paddle/fluid/tests/unittests/test_fleet_base.py
@@ -107,7 +107,7 @@ class TestFleetBase(unittest.TestCase):
     def test_util(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        self.assertEqual(fleet.util(), None)
+        self.assertNotEqual(fleet.util, None)

     def test_barrier_worker(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
@@ -436,12 +436,12 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
             optimizer.minimize(avg_cost)

             comm_world = "server"
-            fleet.util().barrier(comm_world)
+            fleet.util.barrier(comm_world)

-            gather = fleet.util().all_gather(1, comm_world)
+            gather = fleet.util.all_gather(1, comm_world)
             self.assertEqual(gather[0], 1)

-            all_reduce = fleet.util().all_reduce(1, "sum", comm_world)
+            all_reduce = fleet.util.all_reduce(1, "sum", comm_world)
             self.assertEqual(1, all_reduce)
             self.clean(tmp)
@@ -752,12 +752,12 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
             optimizer.minimize(avg_cost)

             comm_world = "server"
-            fleet.util().barrier(comm_world)
+            fleet.util.barrier(comm_world)

-            gather = fleet.util().all_gather(1, comm_world)
+            gather = fleet.util.all_gather(1, comm_world)
             self.assertEqual(gather[0], 1)

-            all_reduce = fleet.util().all_reduce(1, "sum", comm_world)
+            all_reduce = fleet.util.all_reduce(1, "sum", comm_world)
             self.assertEqual(1, all_reduce)
             self.clean(tmp)
python/paddle/fluid/tests/unittests/test_fleet_util.py
@@ -22,7 +22,6 @@ import tempfile
 import os
 import sys
 from paddle.dataset.common import download, DATA_HOME
-from paddle.distributed.fleet.base.util_factory import fleet_util
 import paddle.distributed.fleet.base.role_maker as role_maker
@@ -59,8 +58,7 @@ class TestFleetUtil(unittest.TestCase):
         import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        default_util = fleet.util()
-        self.assertEqual(default_util, None)
+        self.assertNotEqual(fleet.util, None)

     def test_set_user_defined_util(self):
         import paddle.distributed.fleet as fleet
@@ -76,17 +74,19 @@ class TestFleetUtil(unittest.TestCase):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         my_util = UserDefinedUtil()
-        fleet.set_util(my_util)
-        user_id = fleet.util().get_user_id()
+        fleet.util = my_util
+        user_id = fleet.util.get_user_id()
         self.assertEqual(user_id, 10)

     def test_fs(self):
-        from paddle.distributed.fleet.utils.fs import LocalFS
+        import paddle.distributed.fleet as fleet
+        from paddle.distributed.fleet.utils import LocalFS
         fs = LocalFS()
         dirs, files = fs.ls_dir("test_tmp")
         dirs, files = fs.ls_dir("./")
         self.assertFalse(fs.need_upload_download())
-        fleet_util._set_file_system(fs)
+        fleet.util._set_file_system(fs)

     def download_files(self):
         path = download(self.proto_data_url, self.module_name,
@@ -98,7 +98,8 @@ class TestFleetUtil(unittest.TestCase):
         return unzip_folder

     def test_get_file_shard(self):
-        self.assertRaises(Exception, fleet_util.get_file_shard, "files")
+        import paddle.distributed.fleet as fleet
+        self.assertRaises(Exception, fleet.util.get_file_shard, "files")
         try:
             import netifaces
         except:
@@ -112,18 +113,20 @@ class TestFleetUtil(unittest.TestCase):
             role=role_maker.Role.WORKER,
             worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
             server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
-        fleet_util._set_role_maker(role)
-        files = fleet_util.get_file_shard(["1", "2", "3"])
+        fleet.init(role)
+        files = fleet.util.get_file_shard(["1", "2", "3"])
         self.assertTrue(len(files) == 2 and "1" in files and "2" in files)

     def test_program_type_trans(self):
+        import paddle.distributed.fleet as fleet
         data_dir = self.download_files()
         program_dir = os.path.join(data_dir, self.pruned_dir)
         text_program = "pruned_main_program.pbtxt"
         binary_program = "pruned_main_program.bin"
-        text_to_binary = fleet_util._program_type_trans(program_dir,
+        text_to_binary = fleet.util._program_type_trans(program_dir,
                                                         text_program, True)
-        binary_to_text = fleet_util._program_type_trans(program_dir,
+        binary_to_text = fleet.util._program_type_trans(program_dir,
                                                         binary_program, False)
         self.assertTrue(
             os.path.exists(os.path.join(program_dir, text_to_binary)))
@@ -131,6 +134,7 @@ class TestFleetUtil(unittest.TestCase):
             os.path.exists(os.path.join(program_dir, binary_to_text)))

     def test_prams_check(self):
+        import paddle.distributed.fleet as fleet
         data_dir = self.download_files()

         class config:
@@ -160,11 +164,11 @@ class TestFleetUtil(unittest.TestCase):
         # test saved var's shape
         conf.dump_program_filename = "pruned_main_program.save_var_shape_not_match"
-        self.assertRaises(Exception, fleet_util._params_check)
+        self.assertRaises(Exception, fleet.util._params_check)

         # test program.proto without feed_op and fetch_op
         conf.dump_program_filename = "pruned_main_program.no_feed_fetch"
-        results = fleet_util._params_check(conf)
+        results = fleet.util._params_check(conf)
         self.assertTrue(len(results) == 1)
         np.testing.assert_array_almost_equal(results[0], np.array(
@@ -172,11 +176,11 @@ class TestFleetUtil(unittest.TestCase):
         # test feed_var's shape
         conf.dump_program_filename = "pruned_main_program.feed_var_shape_not_match"
-        self.assertRaises(Exception, fleet_util._params_check)
+        self.assertRaises(Exception, fleet.util._params_check)

         # test correct case with feed_vars_filelist
         conf.dump_program_filename = "pruned_main_program.pbtxt"
-        results = fleet_util._params_check(conf)
+        results = fleet.util._params_check(conf)
         self.assertTrue(len(results) == 1)
         np.testing.assert_array_almost_equal(results[0], np.array(
@@ -186,13 +190,14 @@ class TestFleetUtil(unittest.TestCase):
         conf.feed_config.feeded_vars_filelist = None
         # test feed var with lod_level >= 2
         conf.dump_program_filename = "pruned_main_program.feed_lod2"
-        self.assertRaises(Exception, fleet_util._params_check)
+        self.assertRaises(Exception, fleet.util._params_check)

         conf.dump_program_filename = "pruned_main_program.pbtxt"
-        results = fleet_util._params_check(conf)
+        results = fleet.util._params_check(conf)
         self.assertTrue(len(results) == 1)

     def test_proto_check(self):
+        import paddle.distributed.fleet as fleet
         data_dir = self.download_files()

         class config:
@@ -210,7 +215,7 @@ class TestFleetUtil(unittest.TestCase):
                 "pruned_main_program.save_var_shape_not_match"))
         conf.is_text_pruned_program = True
         conf.draw = False
-        res = fleet_util._proto_check(conf)
+        res = fleet.util._proto_check(conf)
         self.assertFalse(res)

         # test match
@@ -222,10 +227,11 @@ class TestFleetUtil(unittest.TestCase):
         else:
             conf.draw = True
             conf.draw_out_name = "pruned_check"
-        res = fleet_util._proto_check(conf)
+        res = fleet.util._proto_check(conf)
         self.assertTrue(res)

     def test_visualize(self):
+        import paddle.distributed.fleet as fleet
         if sys.platform == 'win32' or sys.platform == 'sys.platform':
             pass
         else:
@@ -234,10 +240,10 @@ class TestFleetUtil(unittest.TestCase):
                 data_dir,
                 os.path.join(self.train_dir, "join_main_program.pbtxt"))
             is_text = True
-            program = fleet_util._load_program(program_path, is_text)
+            program = fleet.util._load_program(program_path, is_text)
             output_dir = os.path.join(data_dir, self.train_dir)
             output_filename = "draw_prog"
-            fleet_util._visualize_graphviz(program, output_dir, output_filename)
+            fleet.util._visualize_graphviz(program, output_dir, output_filename)
             self.assertTrue(
                 os.path.exists(
                     os.path.join(output_dir, output_filename + ".dot")))
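As the updated test_set_user_defined_util shows, a custom utility object is now installed by assigning to the module attribute rather than calling fleet.set_util(). A minimal sketch; the UserDefinedUtil body here is an assumption (the test only shows it being constructed and returning user id 10):

    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker
    from paddle.distributed.fleet.base.util_factory import UtilBase

    class UserDefinedUtil(UtilBase):
        # hypothetical helper, mirroring the test's expectation
        def get_user_id(self):
            return 10

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    my_util = UserDefinedUtil()
    fleet.util = my_util               # was: fleet.set_util(my_util)
    print(fleet.util.get_user_id())    # 10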