From 0f9d40816eafff20317f6f4db514dc8f48ed9392 Mon Sep 17 00:00:00 2001
From: 123malin
Date: Tue, 3 Mar 2020 16:51:29 +0800
Subject: [PATCH] test=develop, optimize distributedstrategy (#22677)

* test=develop, optimize distributedstrategy
---
 .../distributed_strategy.py                   | 239 +++++++++++++-----
 .../unittests/test_distributed_strategy.py    |  27 ++
 2 files changed, 200 insertions(+), 66 deletions(-)

diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
index 0d868f6109e..92d07c97da4 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
@@ -19,12 +19,32 @@ __all__ = [
 
 import os
 import paddle.fluid as fluid
-from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig, ServerRuntimeConfig
+from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig, ServerRuntimeConfig, DistributedMode
 
 
 class TrainerRuntimeConfig(object):
     def __init__(self):
+        self.mode = None
+        num_threads = os.getenv("CPU_NUM", "1")
+
         self.runtime_configs = {}
+        self.runtime_configs['communicator_max_merge_var_num'] = os.getenv(
+            "FLAGS_communicator_max_merge_var_num", num_threads)
+        self.runtime_configs['communicator_send_queue_size'] = os.getenv(
+            "FLAGS_communicator_send_queue_size", num_threads)
+        self.runtime_configs[
+            'communicator_independent_recv_thread'] = os.getenv(
+                "FLAGS_communicator_independent_recv_thread", "1")
+        self.runtime_configs[
+            'communicator_min_send_grad_num_before_recv'] = os.getenv(
+                "FLAGS_communicator_min_send_grad_num_before_recv", num_threads)
+        self.runtime_configs['communicator_thread_pool_size'] = os.getenv(
+            "FLAGS_communicator_thread_pool_size", "5")
+        self.runtime_configs['communicator_send_wait_times'] = os.getenv(
+            "FLAGS_communicator_send_wait_times", "5")
+        self.runtime_configs['communicator_is_sgd_optimizer'] = os.getenv(
+            "FLAGS_communicator_is_sgd_optimizer", "1")
+        # not used
         self.runtime_configs['rpc_deadline'] = os.getenv("FLAGS_rpc_deadline",
                                                          "180000")
@@ -32,9 +52,54 @@ class TrainerRuntimeConfig(object):
             "FLAGS_rpc_retry_times", "3")
 
     def get_communicator_flags(self):
-        return self.runtime_configs
-
-    def __repr__(self):
+        need_keys = []
+        num_threads = os.getenv("CPU_NUM", "1")
+        mode_str = ""
+        if self.mode is None or self.mode == DistributedMode.ASYNC:
+            need_keys = self.runtime_configs.keys()
+            mode_str = "async"
+        elif self.mode == DistributedMode.SYNC or self.mode == DistributedMode.HALF_ASYNC:
+            mode_str = "sync or half_async"
+            need_keys = [
+                'communicator_max_merge_var_num',
+                'communicator_send_wait_times', 'communicator_thread_pool_size',
+                'communicator_send_queue_size'
+            ]
+        elif self.mode == DistributedMode.GEO:
+            mode_str = "GEO"
+            need_keys = [
+                'communicator_thread_pool_size', 'communicator_send_wait_times'
+            ]
+        else:
+            raise ValueError("Unsupported Mode")
+
+        if self.mode == DistributedMode.SYNC or self.mode == DistributedMode.HALF_ASYNC:
+            max_merge_var_num = self.runtime_configs[
+                'communicator_max_merge_var_num']
+            send_queue_size = self.runtime_configs[
+                'communicator_send_queue_size']
+            if max_merge_var_num != num_threads:
+                print('WARNING: In {} mode, communicator_max_merge_var_num '
+                      'must be equal to CPU_NUM. But received, '
+                      'communicator_max_merge_var_num = {}, CPU_NUM = '
+                      '{}. communicator_max_merge_var_num will be forced to {}.'
+                      .format(mode_str, max_merge_var_num, num_threads,
+                              num_threads))
+                self.runtime_configs[
+                    'communicator_max_merge_var_num'] = num_threads
+            if send_queue_size != num_threads:
+                print('WARNING: In {} mode, communicator_send_queue_size '
+                      'must be equal to CPU_NUM. But received, '
+                      'communicator_send_queue_size = {}, CPU_NUM = '
+                      '{}. communicator_send_queue_size will be forced to {}.'
+                      .format(mode_str, send_queue_size, num_threads,
+                              num_threads))
+                self.runtime_configs[
+                    'communicator_send_queue_size'] = num_threads
+
+        return dict((key, str(self.runtime_configs[key])) for key in need_keys)
+
+    def display(self, configs):
         raw0, raw1, length = 45, 5, 50
         h_format = "{:^45s}{:<5s}\n"
         l_format = "{:<45s}{:<5s}\n"
@@ -47,7 +112,7 @@ class TrainerRuntimeConfig(object):
         draws += h_format.format("TrainerRuntimeConfig Overview", "Value")
         draws += line + "\n"
 
-        for k, v in self.get_communicator_flags().items():
+        for k, v in configs.items():
             draws += l_format.format(k, v)
 
         draws += border
@@ -55,6 +120,9 @@ class TrainerRuntimeConfig(object):
         _str = "\n{}\n".format(draws)
         return _str
 
+    def __repr__(self):
+        return self.display(self.get_communicator_flags())
+
 
 class DistributedStrategy(object):
     def __init__(self):
@@ -105,6 +173,12 @@ class DistributedStrategy(object):
             raise TypeError(
                 "program_config only accept input type: dict or DistributeTranspilerConfig"
             )
+        self.check_program_config()
+
+    def check_program_config(self):
+        raise NotImplementedError(
+            "check_program_config must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy."
+        )
 
     def get_trainer_runtime_config(self):
         return self._trainer_runtime_config
@@ -123,6 +197,12 @@ class DistributedStrategy(object):
             raise TypeError(
                 "trainer_runtime_config only accept input type: dict or TrainerRuntimeConfig"
             )
+        self.check_trainer_runtime_config()
+
+    def check_trainer_runtime_config(self):
+        raise NotImplementedError(
+            "check_trainer_runtime_config must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy."
+        )
 
     def get_server_runtime_config(self):
         return self._server_runtime_config
@@ -141,6 +221,12 @@ class DistributedStrategy(object):
             raise TypeError(
                 "server_runtime_config only accept input type: dict or ServerRuntimeConfig"
            )
+        self.check_server_runtime_config()
+
+    def check_server_runtime_config(self):
+        raise NotImplementedError(
+            "check_server_runtime_config must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy."
+        )
 
     def get_execute_strategy(self):
         return self._execute_strategy
@@ -159,6 +245,12 @@ class DistributedStrategy(object):
             raise TypeError(
                 "execute_strategy only accept input type: dict or ExecutionStrategy"
             )
+        self.check_execute_strategy()
+
+    def check_execute_strategy(self):
+        raise NotImplementedError(
+            "check_execute_strategy must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy."
+        )
 
     def get_build_strategy(self):
         return self._build_strategy
@@ -176,106 +268,121 @@ class DistributedStrategy(object):
         else:
             raise TypeError(
                 "build_strategy only accept input type: dict or BuildStrategy")
+        self.check_build_strategy()
+
+    def check_build_strategy(self):
+        raise NotImplementedError(
+            "check_build_strategy must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy."
+        )
 
 
 class SyncStrategy(DistributedStrategy):
     def __init__(self):
         super(SyncStrategy, self).__init__()
+        self.check_program_config()
+        self.check_trainer_runtime_config()
+        self.check_server_runtime_config()
+        self.check_build_strategy()
+        self.check_execute_strategy()
+
+    def check_trainer_runtime_config(self):
+        self._trainer_runtime_config.mode = DistributedMode.SYNC
+
+    def check_program_config(self):
         self._program_config.sync_mode = False
         self._program_config.runtime_split_send_recv = True
-        self._build_strategy.async_mode = True
         self._program_config.half_async = True
         self._program_config.completely_not_async = True
-        self._execute_strategy.use_thread_barrier = True
 
-        num_threads = os.getenv("CPU_NUM", "1")
+    def check_server_runtime_config(self):
+        pass
 
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_max_merge_var_num'] = os.getenv(
-                "FLAGS_communicator_max_merge_var_num", num_threads)
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_send_wait_times'] = os.getenv(
-                "FLAGS_communicator_send_wait_times", "5")
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_thread_pool_size'] = os.getenv(
-                "FLAGS_communicator_thread_pool_size", "10")
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_send_queue_size'] = os.getenv(
-                "FLAGS_communicator_send_queue_size", num_threads)
+    def check_execute_strategy(self):
+        self._execute_strategy.use_thread_barrier = True
+
+    def check_build_strategy(self):
+        self._build_strategy.async_mode = True
 
 
 class AsyncStrategy(DistributedStrategy):
     def __init__(self):
         super(AsyncStrategy, self).__init__()
+        self.check_program_config()
+        self.check_trainer_runtime_config()
+        self.check_server_runtime_config()
+        self.check_build_strategy()
+        self.check_execute_strategy()
+
+    def check_trainer_runtime_config(self):
+        self._trainer_runtime_config.mode = DistributedMode.ASYNC
+
+    def check_program_config(self):
         self._program_config.sync_mode = False
         self._program_config.runtime_split_send_recv = True
-        self._build_strategy.async_mode = True
 
-        num_threads = os.getenv("CPU_NUM", "1")
+    def check_server_runtime_config(self):
+        pass
 
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_max_merge_var_num'] = os.getenv(
-                "FLAGS_communicator_max_merge_var_num", num_threads)
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_independent_recv_thread'] = os.getenv(
-                "FLAGS_communicator_independent_recv_thread", "0")
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_min_send_grad_num_before_recv'] = os.getenv(
-                "FLAGS_communicator_min_send_grad_num_before_recv", num_threads)
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_thread_pool_size'] = os.getenv(
-                "FLAGS_communicator_thread_pool_size", "10")
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_send_wait_times'] = os.getenv(
-                "FLAGS_communicator_send_wait_times", "5")
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_is_sgd_optimizer'] = os.getenv(
-                "FLAGS_communicator_is_sgd_optimizer", "1")
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_send_queue_size'] = os.getenv(
-                "FLAGS_communicator_send_queue_size", num_threads)
+    def check_execute_strategy(self):
+        pass
+
+    def check_build_strategy(self):
+        self._build_strategy.async_mode = True
 
 
 class HalfAsyncStrategy(DistributedStrategy):
     def __init__(self):
         super(HalfAsyncStrategy, self).__init__()
+        self.check_program_config()
+        self.check_trainer_runtime_config()
+        self.check_server_runtime_config()
+        self.check_build_strategy()
+        self.check_execute_strategy()
+
+    def check_trainer_runtime_config(self):
+        self._trainer_runtime_config.mode = DistributedMode.HALF_ASYNC
+
+    def check_program_config(self):
         self._program_config.sync_mode = False
         self._program_config.runtime_split_send_recv = True
         self._program_config.half_async = True
-        self._build_strategy.async_mode = True
-        self._execute_strategy.use_thread_barrier = True
 
-        num_threads = os.getenv("CPU_NUM", "1")
+    def check_server_runtime_config(self):
+        pass
+
+    def check_execute_strategy(self):
+        self._execute_strategy.use_thread_barrier = True
 
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_max_merge_var_num'] = os.getenv(
-                "FLAGS_communicator_max_merge_var_num", num_threads)
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_send_wait_times'] = os.getenv(
-                "FLAGS_communicator_send_wait_times", "5")
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_thread_pool_size'] = os.getenv(
-                "FLAGS_communicator_thread_pool_size", "10")
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_send_queue_size'] = os.getenv(
-                "FLAGS_communicator_send_queue_size", num_threads)
+    def check_build_strategy(self):
+        self._build_strategy.async_mode = True
 
 
 class GeoStrategy(DistributedStrategy):
     def __init__(self, update_frequency=100):
         super(GeoStrategy, self).__init__()
+        self._program_config.geo_sgd_need_push_nums = update_frequency
+        self.check_program_config()
+        self.check_trainer_runtime_config()
+        self.check_server_runtime_config()
+        self.check_build_strategy()
+        self.check_execute_strategy()
+
+    def check_program_config(self):
         self._program_config.sync_mode = False
         self._program_config.runtime_split_send_recv = True
         self._program_config.geo_sgd_mode = True
-        self._program_config.geo_sgd_need_push_nums = update_frequency
-        self._build_strategy.async_mode = True
 
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_thread_pool_size'] = os.getenv(
-                "FLAGS_communicator_thread_pool_size", "10")
-        self._trainer_runtime_config.runtime_configs[
-            'communicator_send_wait_times'] = os.getenv(
-                "FLAGS_communicator_send_wait_times", "5")
+    def check_trainer_runtime_config(self):
+        self._trainer_runtime_config.mode = DistributedMode.GEO
+
+    def check_server_runtime_config(self):
+        pass
+
+    def check_execute_strategy(self):
+        pass
+
+    def check_build_strategy(self):
+        self._build_strategy.async_mode = True
 
 
 class StrategyFactory(object):
diff --git a/python/paddle/fluid/tests/unittests/test_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_distributed_strategy.py
index 25940022b8b..8dbe2f398f2 100644
--- a/python/paddle/fluid/tests/unittests/test_distributed_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_distributed_strategy.py
@@ -52,6 +52,15 @@ class TestStrategyFactor(unittest.TestCase):
         self.assertRaises(Exception, strategy.set_program_config,
                           program_config_illegal)
 
+        trainer_runtime_config = strategy.get_trainer_runtime_config()
+        trainer_runtime_config.runtime_configs[
+            'communicator_send_queue_size'] = '50'
+        runtime_configs = trainer_runtime_config.get_communicator_flags()
+        self.assertIn('communicator_send_queue_size', runtime_configs)
+        self.assertNotIn('communicator_independent_recv_thread',
+                         runtime_configs)
+        self.assertEqual(runtime_configs['communicator_send_queue_size'], '2')
+
     def test_geo_strategy(self):
         strategy = StrategyFactory.create_geo_strategy(5)
         self.assertEqual(strategy._program_config.sync_mode, False)
@@ -82,6 +91,14 @@ class TestStrategyFactor(unittest.TestCase):
         self.assertRaises(Exception, strategy.set_build_strategy,
                           build_strategy_illegal)
 
+        os.environ["CPU_NUM"] = '100'
+        trainer_runtime_config = strategy.get_trainer_runtime_config()
+        runtime_configs = trainer_runtime_config.get_communicator_flags()
+        self.assertIn('communicator_thread_pool_size', runtime_configs)
+        self.assertIn('communicator_send_wait_times', runtime_configs)
+        self.assertNotIn('communicator_independent_recv_thread',
+                         runtime_configs)
+
     def test_async_strategy(self):
         os.environ["CPU_NUM"] = '100'
 
@@ -164,6 +181,16 @@ class TestStrategyFactor(unittest.TestCase):
         self.assertRaises(Exception, strategy.set_server_runtime_config,
                           server_runtime_config_illegal)
 
+        os.environ["CPU_NUM"] = '100'
+        trainer_runtime_config = strategy.get_trainer_runtime_config()
+        trainer_runtime_config.runtime_configs[
+            'communicator_send_queue_size'] = '50'
+        runtime_configs = trainer_runtime_config.get_communicator_flags()
+        self.assertIn('communicator_send_queue_size', runtime_configs)
+        self.assertNotIn('communicator_independent_recv_thread',
+                         runtime_configs)
+        self.assertEqual(runtime_configs['communicator_send_queue_size'],
+                         '100')
+
 
 class TestCreateDefaultStrategy(unittest.TestCase):
     def test_default_strategy(self):
-- 
GitLab
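
A quick usage sketch of the refactored API follows. It is illustrative only: it assumes a Paddle 1.x build where the module changed above is importable, and it assumes StrategyFactory exposes a create_sync_strategy helper following the same naming scheme as the create_geo_strategy method exercised in the tests.

import os

from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory

os.environ["CPU_NUM"] = "2"

# GEO mode: get_communicator_flags() now filters the flag dict down to
# the two flags the communicator consumes in that mode.
geo = StrategyFactory.create_geo_strategy(update_frequency=100)
geo_flags = geo.get_trainer_runtime_config().get_communicator_flags()
assert sorted(geo_flags) == [
    'communicator_send_wait_times', 'communicator_thread_pool_size'
]

# SYNC mode: an override of communicator_send_queue_size that disagrees
# with CPU_NUM triggers a warning and is forced back to CPU_NUM, which
# is what the updated test_sync_strategy asserts.
sync = StrategyFactory.create_sync_strategy()  # assumed helper name
trainer_config = sync.get_trainer_runtime_config()
trainer_config.runtime_configs['communicator_send_queue_size'] = '50'
flags = trainer_config.get_communicator_flags()
assert flags['communicator_send_queue_size'] == '2'
assert 'communicator_independent_recv_thread' not in flags

The design point worth noting: validation moved out of the strategy __init__ bodies into per-aspect check_*() hooks, so a setter such as set_trainer_runtime_config() re-runs the matching check after every user override instead of trusting values captured at construction time.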