BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Unverified commit ab18644c
Authored by wangguanqun on Nov 28, 2022; committed via GitHub on Nov 28, 2022

remove fluid (#47959)

* remove fluid
* update public
* core
* public
* public1
* ci

Parent: fd689106
Showing 8 changed files with 1,126 additions and 31 deletions (+1126 −31)
Files changed:

python/paddle/distributed/communicator.py                     +269  −0
python/paddle/distributed/passes/ps_server_pass.py            +15   −7
python/paddle/distributed/passes/ps_trainer_pass.py           +3    −3
python/paddle/distributed/ps/coordinator.py                   +1    −1
python/paddle/distributed/ps/the_one_ps.py                    +11   −14
python/paddle/distributed/ps/utils/collective_transpiler.py   +819  −0
python/paddle/distributed/ps/utils/ps_program_builder.py      +6    −4
python/paddle/distributed/ps/utils/public.py                  +2    −2
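The common thread across the diffs below is replacing private `paddle.fluid` entry points with their public counterparts. A minimal before/after sketch of the substitution pattern, assuming Paddle's static-graph mode (the concrete names are taken from the hunks; the surrounding usage is illustrative):

```python
import paddle
from paddle.framework import core  # public home of the C++ bindings used in these files

paddle.enable_static()

# Formerly: fluid.framework.Program(), fluid.global_scope(), fluid.core.<X>
prog = paddle.static.Program()        # replaces fluid.framework.Program()
scope = paddle.static.global_scope()  # replaces fluid.global_scope()
print(prog, scope, core is not None)
```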
python/paddle/distributed/communicator.py (new file, mode 100755)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Communicator is used for async distribute training in distribute_transpiler mode.
It's a wrapper of a cpp class Communicator and should be used inside fleet API.
"""
import paddle
from paddle.framework import core
from paddle.distributed.ps.utils.public import DistributedMode

__all__ = ['Communicator', 'FLCommunicator', 'LargeScaleKV']


class Communicator:
    def __init__(self, mode, kwargs=None, envs=None):
        """
        Communicator is used for async distribute training in distribute_transpiler mode.
        It's a wrapper of a cpp class Communicator and should be used inside fleet API.

        Args:
            program(Program): the trainers program after transpile of distribute_transpiler.
                It's used by communicator to extract the information to do communication.

        Returns:
            None

        Examples:
            .. code-block:: python

                import paddle

                prog = paddle.static.Program()
                comm = paddle.distributed.communicator.Communicator(prog)
                comm.start()
                comm.stop()
        """
        # set all recv op to not_run mode
        if kwargs is None:
            if envs is None:
                envs = {}
        else:
            if mode == DistributedMode.SYNC:
                envs["pserver_endpoints"] = ','.join(kwargs["pserver_endpoints"])

            envs["trainers"] = str(kwargs["trainers"])
            envs["trainer_id"] = str(kwargs["trainer_id"])
            envs["need_global_step"] = str(kwargs["need_global_step"])
            envs["barrier_table_id"] = str(kwargs["barrier_table_id"])

        mode_str = None

        if mode == DistributedMode.SYNC:
            mode_str = "SYNC"
        elif mode == DistributedMode.ASYNC:
            mode_str = "ASYNC"
        elif mode == DistributedMode.HALF_ASYNC:
            mode_str = "HALF_ASYNC"
        elif mode == DistributedMode.GEO:
            mode_str = "GEO"

        self.mode = mode_str
        self.envs = envs
        self.communicator_ = None
        self.send_ctx_ = None
        self.recv_ctx_ = None

    def init_with_ctx(
        self, send_ctx, recv_ctx, proto_txt, unit64_hosts, scope=None
    ):
        if scope is None:
            scope = paddle.static.global_scope()
        self.communicator_ = core.DistCommunicator(
            self.mode,
            proto_txt,
            unit64_hosts,
            send_ctx,
            recv_ctx,
            scope,
            self.envs,
        )
        self.send_ctx_ = send_ctx
        self.recv_ctx_ = recv_ctx

    def create_client_to_client_connection(
        self,
        pserver_timeout_ms=500000,
        pserver_connect_timeout_ms=10000,
        max_retry=3,
    ):
        self.communicator_.create_client_to_client_connection(
            pserver_timeout_ms, pserver_connect_timeout_ms, max_retry
        )

    def get_client_info(self):
        return self.communicator_.get_client_info()

    def set_clients(self, host_list):
        self.communicator_.set_clients(host_list)

    def start(self):
        """
        Start communicator. Should call before training process.

        Returns:
            None

        Examples:
            .. code-block:: python

                import paddle

                prog = paddle.static.Program()
                comm = paddle.distributed.communicator.Communicator(prog)
                comm.start()
                comm.stop()
        """
        if self.communicator_ is None:
            print('you must call init_with_ctx first to init comm before start')
            return
        self.communicator_.start()

    def stop(self):
        """
        Stop communicator. Should call after training process.

        Returns:
            None

        Examples:
            .. code-block:: python

                import paddle

                prog = paddle.static.Program()
                comm = paddle.distributed.communicator.Communicator(prog)
                comm.start()
                comm.stop()
        """
        if self.communicator_ is None:
            print('you must call init_with_ctx first to init comm before stop')
            return
        self.communicator_.stop()

    def is_running(self):
        """
        Get communicator is running or stop.

        Returns:
            bool

        Examples:
            .. code-block:: python

                import paddle

                prog = paddle.static.Program()
                comm = paddle.distributed.communicator.Communicator(prog)
                comm.is_running()
        """
        if self.communicator_ is None:
            print('you must call init_with_ctx first to init comm before stop')
            return
        self.communicator_.is_running()

    def recv(self):
        self.communicator_.recv()

    def init_params(self, context):
        self.communicator_.init_params(context)

    def pull_dense(self, context):
        self.communicator_.pull_dense(context)

    def push_sparse_param(self, var_name, table_id=-1, scope=None):
        if scope is None:
            scope = paddle.static.global_scope()
        if not self.is_running():
            raise ValueError(
                "Communicator should init first. Using fleet.init_worker() before push_sparse_param()"
            )
        assert isinstance(var_name, str)
        assert isinstance(table_id, int)
        if table_id == -1:
            table_id = self.send_ctx_[var_name].table_id()
        self.communicator_.push_sparse_param(var_name, table_id, scope)


class FLCommunicator(Communicator):  # only for coordinator
    def __init__(self, ps_hosts, kwargs=None):
        mode = None
        super().__init__(mode, kwargs)
        send_ctx = {}
        dense_map = {}
        prototxt = ""
        self.mode = "WITH_COORDINATOR"
        self.init_with_ctx(send_ctx, dense_map, prototxt, ps_hosts)

    def start_coordinator(self, self_endpoint, trainer_endpoints):
        if self.communicator_ is not None:
            self.communicator_.start_coordinator(
                self_endpoint, trainer_endpoints
            )
        return

    def save_fl_strategy(self, mp):
        if self.communicator_ is not None:
            self.communicator_.save_fl_strategy(mp)
        else:
            raise ValueError("self.communicator_ is null")
        return

    def query_fl_clients_info(self):
        info_mp = {}
        if self.communicator_ is not None:
            info_mp = self.communicator_.query_fl_clients_info()
        return info_mp


class LargeScaleKV:
    def __init__(self):
        self.scale_kv = core.LargeScaleKV()

    def save(self, varname, dirname):
        self.scale_kv.save(varname, dirname)

    def load(self, varname, dirname):
        self.scale_kv.load(varname, dirname)

    def size(self, varname):
        return self.scale_kv.size(varname)


class HeterClient:
    def __init__(self, endpoint, previous_endpoint, trainer_id):
        self.heter_client_ = core.HeterClient(
            endpoint, previous_endpoint, trainer_id
        )

    def stop(self):
        self.heter_client_.stop()
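As a quick orientation for the new module, here is a minimal sketch of how `Communicator.__init__` maps a `DistributedMode` plus `kwargs` into the string-valued envs later handed to the C++ `DistCommunicator`; the values are illustrative and no server is contacted until `init_with_ctx`/`start` are called:

```python
from paddle.distributed.communicator import Communicator
from paddle.distributed.ps.utils.public import DistributedMode

# Illustrative settings; in real training fleet fills these in.
kwargs = {
    "trainers": 2,
    "trainer_id": 0,
    "need_global_step": 0,
    "barrier_table_id": 0,
}
comm = Communicator(DistributedMode.ASYNC, kwargs=kwargs, envs={})

print(comm.mode)  # "ASYNC"
print(comm.envs)  # every value stringified, e.g. {"trainers": "2", ...}
```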
python/paddle/distributed/passes/ps_server_pass.py
@@ -14,7 +14,7 @@
 import logging

-import paddle.fluid as fluid
+import paddle
 from ..ps.utils.public import (
     get_optimize_ops,
     get_ps_endpoint,
...
@@ -76,12 +76,14 @@ class AddLrDecayTablePass(PassBase):
             'ExponentialDecay',
         ]
-        decay_main_program = fluid.framework.Program()
-        decay_startup_program = fluid.framework.Program()
+        decay_main_program = paddle.static.Program()
+        decay_startup_program = paddle.static.Program()
         lr_name = ""
         if isinstance(lr_sheduler, ExponentialDecay):
-            with fluid.program_guard(decay_main_program, decay_startup_program):
+            with paddle.static.program_guard(
+                decay_main_program, decay_startup_program
+            ):
                 lr = exponential_decay(1.0, lr_decay_steps, lr_sheduler.gamma, True)
...
@@ -94,7 +96,9 @@ class AddLrDecayTablePass(PassBase):
                 % lr_decay_steps
             )
         elif isinstance(lr_sheduler, NoamDecay):
-            with fluid.program_guard(decay_main_program, decay_startup_program):
+            with paddle.static.program_guard(
+                decay_main_program, decay_startup_program
+            ):
                 lr = noam_decay(lr_sheduler.d_model, lr_sheduler.warmup_steps, 1.0)
...
@@ -104,7 +108,9 @@ class AddLrDecayTablePass(PassBase):
                 % lr_sheduler.warmup_steps
             )
         elif isinstance(lr_sheduler, NaturalExpDecay):
-            with fluid.program_guard(decay_main_program, decay_startup_program):
+            with paddle.static.program_guard(
+                decay_main_program, decay_startup_program
+            ):
                 lr = natural_exp_decay(1.0, lr_decay_steps, lr_sheduler.gamma, True)
...
@@ -117,7 +123,9 @@ class AddLrDecayTablePass(PassBase):
                 % lr_decay_steps
             )
         elif isinstance(lr_sheduler, InverseTimeDecay):
-            with fluid.program_guard(decay_main_program, decay_startup_program):
+            with paddle.static.program_guard(
+                decay_main_program, decay_startup_program
+            ):
                 lr = inverse_time_decay(1.0, lr_decay_steps, lr_sheduler.gamma, True)
...
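Every hunk above follows the same pattern: build programs with `paddle.static.Program()` and enter them with `paddle.static.program_guard` instead of the `fluid` equivalents. A small standalone sketch of that public API (the ops built inside the guard are illustrative):

```python
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()

# Ops created under the guard are recorded in main_prog/startup_prog
# rather than in the process-wide default programs.
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name="x", shape=[None, 1], dtype="float32")
    y = paddle.mean(x)

print(len(main_prog.global_block().ops))  # > 0: the mean op lives in main_prog
```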
python/paddle/distributed/passes/ps_trainer_pass.py
@@ -17,10 +17,10 @@ import paddle
 from ..ps.utils.public import *  # noqa: F403
 from paddle.framework import core
 from paddle.distributed.passes.pass_base import PassBase, register_pass
-from paddle.fluid.transpiler.details.program_utils import delete_ops
-from paddle.fluid.transpiler.collective import SingleProcessMultiThread
+from ..ps.utils.collective_transpiler import SingleProcessMultiThread
 from _collections import defaultdict
-from paddle.fluid.framework import Program, Parameter
+from paddle.static import Program
+from paddle.fluid.framework import Parameter


 @register_pass("append_send_ops_pass")
...
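For reference, a sketch of the import locations this hunk settles on; the paths are exactly those in the diff, while the surrounding code is illustrative:

```python
# SingleProcessMultiThread now ships with the ps utils package
# (the new collective_transpiler.py added by this commit) instead of
# paddle.fluid.transpiler.collective.
from paddle.distributed.ps.utils.collective_transpiler import (
    SingleProcessMultiThread,
)

# Program moves to the public namespace; Parameter still comes from fluid here.
from paddle.static import Program
from paddle.fluid.framework import Parameter

print(SingleProcessMultiThread, Program, Parameter)
```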
python/paddle/distributed/ps/coordinator.py
@@ -13,7 +13,7 @@
 # limitations under the License.

 import paddle
-from paddle.fluid.communicator import FLCommunicator
+from paddle.distributed.communicator import FLCommunicator
 from paddle.distributed.fleet.proto import the_one_ps_pb2
 from google.protobuf import text_format
 from paddle.distributed.ps.utils.public import is_distributed_env
...
python/paddle/distributed/ps/the_one_ps.py
@@ -15,20 +15,17 @@
 import warnings
 import os
-import paddle.fluid as fluid
+import paddle
 from paddle.distributed import fleet
-from paddle.fluid import core
+from paddle.framework import core
 from paddle.distributed.ps.utils.public import *  # noqa: F403
-from paddle.fluid.framework import Program
-from paddle.fluid.compiler import CompiledProgram
-from paddle.fluid.executor import Executor
-from paddle.fluid.parallel_executor import ParallelExecutor
+from paddle.static import Program, CompiledProgram, Executor, ParallelExecutor
 from paddle.distributed.fleet.runtime.runtime_base import RuntimeBase
 from paddle.distributed.fleet.base.private_helper_function import (
     wait_server_ready,
 )
 from paddle.distributed.fleet.proto import the_one_ps_pb2
-from paddle.fluid.communicator import Communicator, HeterClient
+from paddle.distributed.communicator import Communicator, HeterClient
 from google.protobuf import text_format
 from paddle.distributed.ps.coordinator import Coordinator
...
@@ -1035,7 +1032,7 @@ class TheOnePSRuntime(RuntimeBase):
         super().__init__()
         self._communicator = None
         self._server = None
-        self._worker = fluid.core.DistFleetWrapper()
+        self._worker = core.DistFleetWrapper()
         self._coordinator = None
         self._server_sub_program = []
         self._heter_client = None
...
@@ -1092,7 +1089,7 @@ class TheOnePSRuntime(RuntimeBase):
         self.string_hosts = []
         for idx, ep in enumerate(self.endpoints):
             host, port = ep.split(":")
-            pshost = fluid.core.PSHost(host, int(port), idx)
+            pshost = core.PSHost(host, int(port), idx)
             self.string_hosts.append(pshost.serialize_to_string())

         self.with_coordinator = self.role_maker._with_coordinator
...
@@ -1102,7 +1099,7 @@ class TheOnePSRuntime(RuntimeBase):
             coordinator_endpoints = self.role_maker._get_coordinator_endpoints()
             for idx, ep in enumerate(coordinator_endpoints):
                 ip, port = ep.split(":")
-                pshost = fluid.core.PSHost(ip, int(port), idx)
+                pshost = core.PSHost(ip, int(port), idx)
                 self.coordinator_hosts.append(pshost.serialize_to_string())

         self.ps_desc_builder = PsDescBuilder(self.context)
...
@@ -1173,7 +1170,7 @@ class TheOnePSRuntime(RuntimeBase):
                 gpus_env = os.getenv("FLAGS_selected_gpus")
                 gpus_env = [int(s) for s in gpus_env.split(",")]
                 main_program._fleet_opt["worker_places"] = gpus_env
-                PSGPU = fluid.core.PSGPU()
+                PSGPU = core.PSGPU()
                 PSGPU.init_gpu_ps(gpus_env)

         def sync_strategy_envs():
...
@@ -1241,7 +1238,7 @@ class TheOnePSRuntime(RuntimeBase):
             dense_map,
             worker_desc,
             self.string_hosts,
-            fluid.global_scope(),
+            paddle.static.global_scope(),
         )

         fleet.util.barrier()
...
@@ -1273,7 +1270,7 @@ class TheOnePSRuntime(RuntimeBase):
                 raise ValueError(
                     "You must set the scope list when you have Multiple programs"
                 )
-            scopes = [fluid.global_scope()]
+            scopes = [paddle.static.global_scope()]

         if len(self.origin_main_programs) != len(scopes):
             raise VauleError("len(programs) != len(scopes)")
...
@@ -1350,7 +1347,7 @@ class TheOnePSRuntime(RuntimeBase):
         if self.debug:
             print("server_desc: \n{}".format(server_desc))
-        self._server = fluid.core.DistFleetWrapper()
+        self._server = core.DistFleetWrapper()
         self._server.init_server(
             server_desc,
             self.string_hosts,
...
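Most hunks in this file swap `fluid.core.*` for the same bindings re-exported from `paddle.framework`. A minimal sketch of that equivalence; whether the individual classes are present depends on how Paddle was built (parameter-server support):

```python
from paddle.framework import core

# The runtime above constructs these C++ wrappers from the public
# framework module instead of fluid.core.
for name in ("DistFleetWrapper", "PSHost", "PSGPU"):
    print(name, hasattr(core, name))
```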
python/paddle/distributed/ps/utils/collective_transpiler.py (new file, mode 100644)
This diff is collapsed (819 additions).
python/paddle/distributed/ps/utils/ps_program_builder.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import paddle
+import paddle.fluid as fluid
 from .public import *  # noqa: F403
 from paddle.distributed.fleet.base.private_helper_function import (
     wait_server_ready,
...
@@ -77,8 +79,8 @@ class PsProgramBuilder:
             self._build_trainer_programs()
             fluid.framework.switch_startup_program(self.cloned_startup)
             print(
-                "fluid.default_startup_program: {}".format(
-                    fluid.default_startup_program
+                "paddle.static.default_startup_program: {}".format(
+                    paddle.static.default_startup_program
                 )
             )
             # print("ps_program_build before =", id(self.loss.block.program))
...
@@ -471,8 +473,8 @@ class FlPsProgramBuilder(HeterAsyncPsProgramBuilder):
             fluid.framework.switch_startup_program(self.cloned_startup)
             fluid.framework.switch_main_program(self.cloned_main)
             print(
-                "fluid.default_startup_program: {}".format(
-                    fluid.default_startup_program()._heter_pipeline_opt
+                "paddle.static.default_startup_program: {}".format(
+                    paddle.static.default_startup_program()._heter_pipeline_opt
                 )
             )
         else:
...
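The replacement used in these print statements is the public accessor for the default startup program. A quick sketch, assuming static mode:

```python
import paddle

paddle.enable_static()

# Public accessor replacing fluid.default_startup_program in the hunks above.
startup = paddle.static.default_startup_program()
print(startup is paddle.static.default_startup_program())  # True: same global Program
```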
python/paddle/distributed/ps/utils/public.py
@@ -19,7 +19,7 @@ import os
 import warnings
 import logging
 import paddle.fluid as fluid
-from paddle.fluid import core
+from paddle.framework import core
 import paddle.fluid.framework as framework

 # logging.basicConfig(
...
@@ -896,7 +896,7 @@ def find_heter_ops(program, default_device="cpu"):
     if len(heter_ops) == 0:
         warnings.warn(
             "No heterogeneous OP was found in your program , "
-            " please using fluid.device_guard() to run OPs on different device."
+            " please using static.device_guard() to run OPs on different device."
         )

     total_heter_ops = 0
...
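The reworded warning points users at the public `static.device_guard` context manager. A small sketch of how it pins ops to a device inside a static program (this device hint is what `find_heter_ops` inspects):

```python
import paddle

paddle.enable_static()

main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
    x = paddle.static.data(name="x", shape=[None, 8], dtype="float32")
    # Ops created under device_guard carry an explicit device assignment.
    with paddle.static.device_guard("cpu"):
        y = paddle.nn.functional.relu(x)
```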