diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index f5ab5d6ee5dc800632febc38184850b1fbb52284..ce37aa4855c8b370d00ab1897c41d84a20fd0bcc 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -328,6 +328,12 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
                    "the number of places must be greater than 1.");
   }
 
+  LOG(WARNING) << string::Sprintf(
+      "The number of %s, which is used in ParallelExecutor, is %lu. And "
+      "the Program will be copied %lu copies",
+      (member_->use_cuda_ ? "CUDAPlace" : "CPUPlace"), places.size(),
+      places.size());
+
   // Step 1. Bcast the bcast_vars to devs.
   // Create local scopes
   if (local_scopes.empty()) {
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index e048639ae1e9e627ab7ae92f2492eed8f5c0b5da..969ad3c922f9c15b2e39f71ae4359cd3d2fcdcce 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -138,8 +138,7 @@ def reader_creator(data_file,
                 break
 
     if use_xmap:
-        cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
-        return xmap_readers(mapper, reader, cpu_num, buffered_size)
+        return xmap_readers(mapper, reader, min(4, cpu_count()), buffered_size)
     else:
         return map_readers(mapper, reader)
 
diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
index 0ab8052d7ab16743bb6589dbb44203e70fa907d0..69080cf50ecaf8a290984f2792ec697a1edf3234 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
@@ -19,6 +19,8 @@ import six
 import numpy as np
 from paddle.fluid.contrib.slim.graph import GraphWrapper
 from paddle.fluid import core
+import os
+os.environ['CPU_NUM'] = str(4)
 
 
 def residual_block(num):
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index 32b2c8014ca562d368842208ebf7737cd900341e..1090c781422045a2005ae1fb536a17d15005a8ad 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -22,7 +22,7 @@ from six.moves import zip, range, xrange
 import multiprocessing
 
 from .framework import Variable, default_main_program, _current_expected_place
-
+from .framework import _cpu_num, _cuda_ids
 __all__ = ['DataFeeder']
 
 
@@ -359,11 +359,9 @@ class DataFeeder(object):
         if num_places is not None:
             return int(num_places)
         elif isinstance(self.place, core.CUDAPlace):
-            return core.get_cuda_device_count()
+            return len(_cuda_ids())
         else:
-            cpu_num = int(
-                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            return cpu_num
+            return _cpu_num()
 
     def decorate_reader(self,
                         reader,
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 012d15f45a4a04d0262b113ffed77d3e4d810274..f7d487b640e52f04eb81b22d4ddd3dd1d16a8b91 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -82,7 +82,24 @@ def _current_expected_place():
 
 
 def _cpu_num():
-    return int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+    cpu_num = os.environ.get('CPU_NUM', None)
+    if cpu_num is None:
+        sys.stderr.write(
+            'The CPU_NUM is not specified, you should set CPU_NUM in '
+            'the environment variable list, i.e export CPU_NUM=1. CPU_NUM '
+            'indicates that how many CPUPlace are used in the current task.\n'
+            '!!! The default number of CPUPlaces is 1.')
+        os.environ['CPU_NUM'] = str(1)
+    return int(cpu_num)
+
+
+def _cuda_ids():
+    gpus_env = os.getenv("FLAGS_selected_gpus")
+    if gpus_env:
+        device_ids = [int(s) for s in gpus_env.split(",")]
+    else:
+        device_ids = six.moves.range(core.get_cuda_device_count())
+    return device_ids
 
 
 def cuda_places(device_ids=None):
@@ -116,11 +133,7 @@ def cuda_places(device_ids=None):
     assert core.is_compiled_with_cuda(), \
         "Not compiled with CUDA"
     if device_ids is None:
-        gpus_env = os.getenv("FLAGS_selected_gpus")
-        if gpus_env:
-            device_ids = [int(s) for s in gpus_env.split(",")]
-        else:
-            device_ids = six.moves.range(core.get_cuda_device_count())
+        device_ids = _cuda_ids()
     elif not isinstance(device_ids, (list, tuple)):
         device_ids = [device_ids]
     return [core.CUDAPlace(dev_id) for dev_id in device_ids]
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
index d0eca7d6dfbdf03828125508c798a9bd31f8bbd6..328b3a4813eec261d39985ef80c47d0c827380ca 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
@@ -17,6 +17,8 @@ from paddle.fluid import compiler
 import unittest
 import logging
 import six
+import os
+os.environ['CPU_NUM'] = str(4)
 
 
 class TestBase(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
index a3701f0808b98b80b62866ffe1250d065361025c..e4fb9b1970a8da4bfec5d48f1182e9552aa77ca8 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -22,6 +22,7 @@ import numpy as np
 import threading
 import multiprocessing
 import os
+os.environ['CPU_NUM'] = str(4)
 
 
 def as_tensor(np_array_or_tensor, place=None):