diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index c803864e61220ceb954bd1c23b2f8f367e2510c1..3b81d59ad965b7532ca729682e7aeb8eb96194a8 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -825,7 +825,7 @@ All parameter, weight, gradient are variables in Paddle.
             If :math:`num\_threads=1`, all the operators will execute one by one,
             but the order maybe difference between iterations.
             If it is not set, it will be set in ParallelExecutor according to the
-            device type and device count, for GPU, :math:`num\_threads=device\_count`, for CPU,
+            device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
             :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor.
             if it is not set, ParallelExecutor will get the cpu count by calling
             `multiprocessing.cpu_count()`. Default 0.)DOC")
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 999065f8aa37708e26ef68a26c1568bed9264a54..a8643bc542c809d7ffa86dc96d1e0a120653cd1e 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -135,7 +135,8 @@ def __bootstrap__():
         'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir'
+        'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
+        'enable_parallel_graph'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 970996128686a70ae7204ec3eba4583fc8c02cd3..c97a93ec36d4f4a7ff6a9f097551e2d21022d5b1 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -117,7 +117,7 @@ class ParallelExecutor(object):
         if use_cuda:
             # Experiments on se-resnext shows that too many threads hurt
             # performance. Worth tunning for other models in the future.
-            exec_strategy.num_threads = len(self._places)
+            exec_strategy.num_threads = len(self._places) * 4
         else:
             cpu_num = int(
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
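
Usage sketch (not part of the patch): the snippet below shows how the changed default would be observed from user code with the fluid 1.x ParallelExecutor API. The tiny fc/mean program, the learning rate, and the choice of CUDAPlace(0) are placeholders assumed only so the example is self-contained; they are not taken from this PR. The newly registered enable_parallel_graph flag is read through __bootstrap__'s env-flag list, so it would be toggled via the FLAGS_enable_parallel_graph environment variable before Python starts, following Paddle's usual FLAGS_<name> convention.

import paddle.fluid as fluid

# Placeholder program so ParallelExecutor has something to run.
x = fluid.layers.data(name='x', shape=[4], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

place = fluid.CUDAPlace(0)
fluid.Executor(place).run(fluid.default_startup_program())

exec_strategy = fluid.ExecutionStrategy()
# num_threads left at 0: with this patch ParallelExecutor derives
# device_count * 4 threads on GPU (CPU_NUM * 4 on CPU).
# exec_strategy.num_threads = 8   # an explicit value still overrides the default

pe = fluid.ParallelExecutor(use_cuda=True,
                            loss_name=loss.name,
                            exec_strategy=exec_strategy)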