diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index c803864e61220ceb954bd1c23b2f8f367e2510c1..3b81d59ad965b7532ca729682e7aeb8eb96194a8 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -825,7 +825,7 @@ All parameter, weight, gradient are variables in Paddle.
             If :math:`num\_threads=1`, all the operators will execute one by one,
             but the order maybe difference between iterations.
             If it is not set, it will be set in ParallelExecutor according to the
-            device type and device count, for GPU, :math:`num\_threads=device\_count`, for CPU,
+            device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
             :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor.
             if it is not set, ParallelExecutor will get the cpu count by calling
             `multiprocessing.cpu_count()`. Default 0.)DOC")
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 999065f8aa37708e26ef68a26c1568bed9264a54..a8643bc542c809d7ffa86dc96d1e0a120653cd1e 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -135,7 +135,8 @@ def __bootstrap__():
         'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir'
+        'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
+        'enable_parallel_graph'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 970996128686a70ae7204ec3eba4583fc8c02cd3..c97a93ec36d4f4a7ff6a9f097551e2d21022d5b1 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -117,7 +117,7 @@ class ParallelExecutor(object):
         if use_cuda:
             # Experiments on se-resnext shows that too many threads hurt
             # performance. Worth tunning for other models in the future.
-            exec_strategy.num_threads = len(self._places)
+            exec_strategy.num_threads = len(self._places) * 4
         else:
             cpu_num = int(
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
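
Usage sketch (not part of the patch): the snippet below shows how the changed default would be observed from user code with the fluid 1.x ParallelExecutor API. The tiny fc/mean program, the learning rate, and the choice of CUDAPlace(0) are placeholders assumed only so the example is self-contained; they are not taken from this PR. The newly registered enable_parallel_graph flag is read through __bootstrap__'s env-flag list, so it would be toggled via the FLAGS_enable_parallel_graph environment variable before Python starts, following Paddle's usual FLAGS_<name> convention.

import paddle.fluid as fluid

# Placeholder program so ParallelExecutor has something to run.
x = fluid.layers.data(name='x', shape=[4], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

place = fluid.CUDAPlace(0)
fluid.Executor(place).run(fluid.default_startup_program())

exec_strategy = fluid.ExecutionStrategy()
# num_threads left at 0: with this patch ParallelExecutor derives
# device_count * 4 threads on GPU (CPU_NUM * 4 on CPU).
# exec_strategy.num_threads = 8   # an explicit value still overrides the default

pe = fluid.ParallelExecutor(use_cuda=True,
                            loss_name=loss.name,
                            exec_strategy=exec_strategy)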