Unverified commit ab18644c, authored by wangguanqun, committed by GitHub

remove fluid (#47959)

* remove fluid

* update public

* core

* public

* public1

* ci
Parent fd689106
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Communicator is used for asynchronous distributed training in distribute_transpiler mode.
It is a wrapper of the C++ Communicator class and should be used inside the fleet API.
"""
import paddle
from paddle.framework import core
from paddle.distributed.ps.utils.public import DistributedMode
__all__ = ['Communicator', 'FLCommunicator', 'LargeScaleKV']
class Communicator:
def __init__(self, mode, kwargs=None, envs=None):
"""
Communicator is used for async distribute training in distribute_transpiler mode.
It's a wrapper of a cpp class Communicator and should be used inside fleet API.
Args:
program(Program): the trainers program after transpile of distribute_transpiler.
It's used by communicator to extract the information to do communication.
Returns:
None
Examples:
.. code-block:: python
import paddle
prog = paddle.static.Program()
comm = paddle.distributed.communicator.Communicator(prog)
comm.start()
comm.stop()
"""
# set all recv op to not_run mode
if kwargs is None:
if envs is None:
envs = {}
else:
if mode == DistributedMode.SYNC:
envs["pserver_endpoints"] = ','.join(
kwargs["pserver_endpoints"]
)
envs["trainers"] = str(kwargs["trainers"])
envs["trainer_id"] = str(kwargs["trainer_id"])
envs["need_global_step"] = str(kwargs["need_global_step"])
envs["barrier_table_id"] = str(kwargs["barrier_table_id"])
mode_str = None
if mode == DistributedMode.SYNC:
mode_str = "SYNC"
elif mode == DistributedMode.ASYNC:
mode_str = "ASYNC"
elif mode == DistributedMode.HALF_ASYNC:
mode_str = "HALF_ASYNC"
elif mode == DistributedMode.GEO:
mode_str = "GEO"
self.mode = mode_str
self.envs = envs
self.communicator_ = None
self.send_ctx_ = None
self.recv_ctx_ = None
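# init_with_ctx binds this wrapper to the C++ DistCommunicator. Roughly: send_ctx and
# recv_ctx describe the variables exchanged with the parameter servers, proto_txt is the
# serialized ps descriptor, and unit64_hosts holds the serialized PSHost strings of the
# server endpoints.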
def init_with_ctx(
self, send_ctx, recv_ctx, proto_txt, unit64_hosts, scope=None
):
if scope is None:
scope = paddle.static.global_scope()
self.communicator_ = core.DistCommunicator(
self.mode,
proto_txt,
unit64_hosts,
send_ctx,
recv_ctx,
scope,
self.envs,
)
self.send_ctx_ = send_ctx
self.recv_ctx_ = recv_ctx
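# Direct client-to-client (worker-to-worker) connections; as the parameter names
# indicate, both timeout values are given in milliseconds.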
def create_client_to_client_connection(
self,
pserver_timeout_ms=500000,
pserver_connect_timeout_ms=10000,
max_retry=3,
):
self.communicator_.create_client_to_client_connection(
pserver_timeout_ms, pserver_connect_timeout_ms, max_retry
)
def get_client_info(self):
return self.communicator_.get_client_info()
def set_clients(self, host_list):
self.communicator_.set_clients(host_list)
def start(self):
"""
Start communicator. Should call before training process.
Returns:
None
Examples:
.. code-block:: python
import paddle
prog = paddle.static.Program()
comm = paddle.distributed.communicator.Communicator(prog)
comm.start()
comm.stop()
"""
if self.communicator_ is None:
print('Communicator is not initialized; call init_with_ctx() before start().')
return
self.communicator_.start()
def stop(self):
"""
Stop communicator. Should call after training process.
Returns:
None
Examples:
.. code-block:: python
import paddle
prog = paddle.static.Program()
comm = paddle.distributed.communicator.Communicator(prog)
comm.start()
comm.stop()
"""
if self.communicator_ is None:
print('Communicator is not initialized; call init_with_ctx() before stop().')
return
self.communicator_.stop()
def is_running(self):
"""
Get communicator is running or stop.
Returns:
bool
Examples:
.. code-block:: python
import paddle
prog = paddle.static.Program()
comm = paddle.distributed.communicator.Communicator(prog)
comm.is_running()
"""
if self.communicator_ is None:
print('Communicator is not initialized; call init_with_ctx() before is_running().')
return
return self.communicator_.is_running()
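# The methods below delegate directly to the C++ communicator; judging by their names,
# recv() pulls updated parameters, while init_params() and pull_dense() initialize and
# fetch the dense tables described by the given context.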
def recv(self):
self.communicator_.recv()
def init_params(self, context):
self.communicator_.init_params(context)
def pull_dense(self, context):
self.communicator_.pull_dense(context)
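# push_sparse_param pushes a sparse parameter to its server table. With the default
# table_id of -1, the table is looked up from send_ctx_ by variable name, so
# init_with_ctx() must have been called and the communicator must be running.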
def push_sparse_param(self, var_name, table_id=-1, scope=None):
if scope is None:
scope = paddle.static.global_scope()
if not self.is_running():
raise ValueError(
"Communicator should init first. Using fleet.init_worker() before push_sparse_param()"
)
assert isinstance(var_name, str)
assert isinstance(table_id, int)
if table_id == -1:
table_id = self.send_ctx_[var_name].table_id()
self.communicator_.push_sparse_param(var_name, table_id, scope)
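# FLCommunicator is the coordinator-side communicator for federated-learning jobs: it
# starts the coordinator service, pushes FL strategies to the clients (save_fl_strategy)
# and queries client information (query_fl_clients_info).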
class FLCommunicator(Communicator): # only for coordinator
def __init__(self, ps_hosts, kwargs=None):
mode = None
super().__init__(mode, kwargs)
send_ctx = {}
dense_map = {}
prototxt = ""
self.mode = "WITH_COORDINATOR"
self.init_with_ctx(send_ctx, dense_map, prototxt, ps_hosts)
def start_coordinator(self, self_endpoint, trainer_endpoints):
if self.communicator_ is not None:
self.communicator_.start_coordinator(
self_endpoint, trainer_endpoints
)
return
def save_fl_strategy(self, mp):
if self.communicator_ is not None:
self.communicator_.save_fl_strategy(mp)
else:
raise ValueError("self.communicator_ is null")
return
def query_fl_clients_info(self):
info_mp = {}
if self.communicator_ is not None:
info_mp = self.communicator_.query_fl_clients_info()
return info_mp
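# LargeScaleKV is a thin wrapper around core.LargeScaleKV: it saves or loads a sparse
# variable by name under a given directory and reports its size.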
class LargeScaleKV:
def __init__(self):
self.scale_kv = core.LargeScaleKV()
def save(self, varname, dirname):
self.scale_kv.save(varname, dirname)
def load(self, varname, dirname):
self.scale_kv.load(varname, dirname)
def size(self, varname):
return self.scale_kv.size(varname)
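# HeterClient wraps core.HeterClient for heterogeneous (heter) parameter-server
# training; endpoint and previous_endpoint presumably identify the current and the
# upstream heter stage for the given trainer_id.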
class HeterClient:
def __init__(self, endpoint, previous_endpoint, trainer_id):
self.heter_client_ = core.HeterClient(
endpoint, previous_endpoint, trainer_id
)
def stop(self):
self.heter_client_.stop()
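# A minimal usage sketch, assuming the fleet runtime has already built the send/recv
# contexts, the serialized ps descriptor and the serialized host list (the names
# send_ctx, dense_map, proto_txt and string_hosts below are placeholders):
#
#     from paddle.distributed.communicator import Communicator
#     from paddle.distributed.ps.utils.public import DistributedMode
#
#     comm = Communicator(DistributedMode.ASYNC)
#     comm.init_with_ctx(send_ctx, dense_map, proto_txt, string_hosts)
#     comm.start()   # call before the training loop
#     ...            # run training
#     comm.stop()    # call after training finishes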
......@@ -14,7 +14,7 @@
import logging
import paddle.fluid as fluid
import paddle
from ..ps.utils.public import (
get_optimize_ops,
get_ps_endpoint,
......@@ -76,12 +76,14 @@ class AddLrDecayTablePass(PassBase):
'ExponentialDecay',
]
decay_main_program = fluid.framework.Program()
decay_startup_program = fluid.framework.Program()
decay_main_program = paddle.static.Program()
decay_startup_program = paddle.static.Program()
lr_name = ""
if isinstance(lr_sheduler, ExponentialDecay):
with fluid.program_guard(decay_main_program, decay_startup_program):
with paddle.static.program_guard(
decay_main_program, decay_startup_program
):
lr = exponential_decay(
1.0, lr_decay_steps, lr_sheduler.gamma, True
)
......@@ -94,7 +96,9 @@ class AddLrDecayTablePass(PassBase):
% lr_decay_steps
)
elif isinstance(lr_sheduler, NoamDecay):
with fluid.program_guard(decay_main_program, decay_startup_program):
with paddle.static.program_guard(
decay_main_program, decay_startup_program
):
lr = noam_decay(
lr_sheduler.d_model, lr_sheduler.warmup_steps, 1.0
)
......@@ -104,7 +108,9 @@ class AddLrDecayTablePass(PassBase):
% lr_sheduler.warmup_steps
)
elif isinstance(lr_sheduler, NaturalExpDecay):
with fluid.program_guard(decay_main_program, decay_startup_program):
with paddle.static.program_guard(
decay_main_program, decay_startup_program
):
lr = natural_exp_decay(
1.0, lr_decay_steps, lr_sheduler.gamma, True
)
......@@ -117,7 +123,9 @@ class AddLrDecayTablePass(PassBase):
% lr_decay_steps
)
elif isinstance(lr_sheduler, InverseTimeDecay):
with fluid.program_guard(decay_main_program, decay_startup_program):
with paddle.static.program_guard(
decay_main_program, decay_startup_program
):
lr = inverse_time_decay(
1.0, lr_decay_steps, lr_sheduler.gamma, True
)
......
......@@ -17,10 +17,10 @@ import paddle
from ..ps.utils.public import * # noqa: F403
from paddle.framework import core
from paddle.distributed.passes.pass_base import PassBase, register_pass
from paddle.fluid.transpiler.details.program_utils import delete_ops
from paddle.fluid.transpiler.collective import SingleProcessMultiThread
from ..ps.utils.collective_transpiler import SingleProcessMultiThread
from collections import defaultdict
from paddle.fluid.framework import Program, Parameter
from paddle.static import Program
from paddle.fluid.framework import Parameter
@register_pass("append_send_ops_pass")
......
......@@ -13,7 +13,7 @@
# limitations under the License.
import paddle
from paddle.fluid.communicator import FLCommunicator
from paddle.distributed.communicator import FLCommunicator
from paddle.distributed.fleet.proto import the_one_ps_pb2
from google.protobuf import text_format
from paddle.distributed.ps.utils.public import is_distributed_env
......
......@@ -15,20 +15,17 @@
import warnings
import os
import paddle.fluid as fluid
import paddle
from paddle.distributed import fleet
from paddle.fluid import core
from paddle.framework import core
from paddle.distributed.ps.utils.public import * # noqa: F403
from paddle.fluid.framework import Program
from paddle.fluid.compiler import CompiledProgram
from paddle.fluid.executor import Executor
from paddle.fluid.parallel_executor import ParallelExecutor
from paddle.static import Program, CompiledProgram, Executor, ParallelExecutor
from paddle.distributed.fleet.runtime.runtime_base import RuntimeBase
from paddle.distributed.fleet.base.private_helper_function import (
wait_server_ready,
)
from paddle.distributed.fleet.proto import the_one_ps_pb2
from paddle.fluid.communicator import Communicator, HeterClient
from paddle.distributed.communicator import Communicator, HeterClient
from google.protobuf import text_format
from paddle.distributed.ps.coordinator import Coordinator
......@@ -1035,7 +1032,7 @@ class TheOnePSRuntime(RuntimeBase):
super().__init__()
self._communicator = None
self._server = None
self._worker = fluid.core.DistFleetWrapper()
self._worker = core.DistFleetWrapper()
self._coordinator = None
self._server_sub_program = []
self._heter_client = None
......@@ -1092,7 +1089,7 @@ class TheOnePSRuntime(RuntimeBase):
self.string_hosts = []
for idx, ep in enumerate(self.endpoints):
host, port = ep.split(":")
pshost = fluid.core.PSHost(host, int(port), idx)
pshost = core.PSHost(host, int(port), idx)
self.string_hosts.append(pshost.serialize_to_string())
self.with_coordinator = self.role_maker._with_coordinator
......@@ -1102,7 +1099,7 @@ class TheOnePSRuntime(RuntimeBase):
coordinator_endpoints = self.role_maker._get_coordinator_endpoints()
for idx, ep in enumerate(coordinator_endpoints):
ip, port = ep.split(":")
pshost = fluid.core.PSHost(ip, int(port), idx)
pshost = core.PSHost(ip, int(port), idx)
self.coordinator_hosts.append(pshost.serialize_to_string())
self.ps_desc_builder = PsDescBuilder(self.context)
......@@ -1173,7 +1170,7 @@ class TheOnePSRuntime(RuntimeBase):
gpus_env = os.getenv("FLAGS_selected_gpus")
gpus_env = [int(s) for s in gpus_env.split(",")]
main_program._fleet_opt["worker_places"] = gpus_env
PSGPU = fluid.core.PSGPU()
PSGPU = core.PSGPU()
PSGPU.init_gpu_ps(gpus_env)
def sync_strategy_envs():
......@@ -1241,7 +1238,7 @@ class TheOnePSRuntime(RuntimeBase):
dense_map,
worker_desc,
self.string_hosts,
fluid.global_scope(),
paddle.static.global_scope(),
)
fleet.util.barrier()
......@@ -1273,7 +1270,7 @@ class TheOnePSRuntime(RuntimeBase):
raise ValueError(
"You must set the scope list when you have Multiple programs"
)
scopes = [fluid.global_scope()]
scopes = [paddle.static.global_scope()]
if len(self.origin_main_programs) != len(scopes):
raise ValueError("len(programs) != len(scopes)")
......@@ -1350,7 +1347,7 @@ class TheOnePSRuntime(RuntimeBase):
if self.debug:
print("server_desc: \n{}".format(server_desc))
self._server = fluid.core.DistFleetWrapper()
self._server = core.DistFleetWrapper()
self._server.init_server(
server_desc,
self.string_hosts,
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
from .public import * # noqa: F403
from paddle.distributed.fleet.base.private_helper_function import (
wait_server_ready,
......@@ -77,8 +79,8 @@ class PsProgramBuilder:
self._build_trainer_programs()
fluid.framework.switch_startup_program(self.cloned_startup)
print(
"fluid.default_startup_program: {}".format(
fluid.default_startup_program
"paddle.static.default_startup_program: {}".format(
paddle.static.default_startup_program
)
)
# print("ps_program_build before =", id(self.loss.block.program))
......@@ -471,8 +473,8 @@ class FlPsProgramBuilder(HeterAsyncPsProgramBuilder):
fluid.framework.switch_startup_program(self.cloned_startup)
fluid.framework.switch_main_program(self.cloned_main)
print(
"fluid.default_startup_program: {}".format(
fluid.default_startup_program()._heter_pipeline_opt
"paddle.static.default_startup_program: {}".format(
paddle.static.default_startup_program()._heter_pipeline_opt
)
)
else:
......
......@@ -19,7 +19,7 @@ import os
import warnings
import logging
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.framework import core
import paddle.fluid.framework as framework
# logging.basicConfig(
......@@ -896,7 +896,7 @@ def find_heter_ops(program, default_device="cpu"):
if len(heter_ops) == 0:
warnings.warn(
"No heterogeneous OP was found in your program , "
" please using fluid.device_guard() to run OPs on different device."
" please using static.device_guard() to run OPs on different device."
)
total_heter_ops = 0
......