diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index f082e33be3357fbe405ab1a1ef5e0e601108a363..0d1eaeed0971e514fc4368e2f58ba844bd8118ae 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -119,7 +119,7 @@ def reader_creator(data_file,
                 yield sample, int(label) - 1
 
     if use_xmap:
-        return xmap_readers(mapper, reader, cpu_count(), buffered_size)
+        return xmap_readers(mapper, reader, min(4, cpu_count()), buffered_size)
     else:
         return map_readers(mapper, reader)
 
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index 7940dabcfb03cc9eb46f678365685a6e99bcceec..0fd696510e5dcf57b95e92e430feb27a8aedd3f8 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -150,7 +150,7 @@ class DataFeeder(object):
         elif isinstance(self.place, core.CUDAPlace):
             return core.get_cuda_device_count()
         else:
-            return multiprocessing.cpu_count()
+            return min(4, multiprocessing.cpu_count())
 
     def decorate_reader(self,
                         reader,
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 3117dfe00c7a3df1035c439dc31b81e67781d0cc..70437399d72c25043d1ed929c1dad1190d55cb83 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -101,7 +101,7 @@ class ParallelExecutor(object):
                 p.set_place(self._act_places[-1])
                 self._places.append(p)
         else:
-            for i in xrange(multiprocessing.cpu_count()):
+            for i in xrange(min(4, multiprocessing.cpu_count())):
                 p = core.Place()
                 self._act_places.append(core.CPUPlace())
                 p.set_place(self._act_places[-1])
@@ -110,10 +110,7 @@ class ParallelExecutor(object):
 
         if exec_strategy is None:
             exec_strategy = ExecutionStrategy()
-            if use_cuda:
-                exec_strategy.use_event = True
-            else:
-                exec_strategy.use_event = False
+        exec_strategy.use_event = use_cuda
 
         if exec_strategy.num_threads == 0:
             if use_cuda:
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index c9c3c648717814c28c39a401487925824e885946..566b676777cc329dce02f1875abf0d72176c1c00 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -23,6 +23,7 @@ __all__ = ['TestParallelExecutorBase']
 class TestParallelExecutorBase(unittest.TestCase):
     def check_network_convergence(self,
                                   method,
+                                  use_cuda=True,
                                   memory_opt=True,
                                   iter=50,
                                   batch_size=None,
@@ -53,7 +54,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             adam.minimize(loss)
             if memory_opt:
                 fluid.memory_optimize(main)
-            place = fluid.CUDAPlace(0)
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
             startup_exe = fluid.Executor(place)
             startup_exe.run(startup)
             exec_strategy = fluid.ExecutionStrategy()
@@ -64,7 +65,7 @@ class TestParallelExecutorBase(unittest.TestCase):
 
             if use_parallel_executor:
                 exe = fluid.ParallelExecutor(
-                    True,
+                    use_cuda,
                     loss_name=loss.name,
                     exec_strategy=exec_strategy,
                     build_strategy=build_strategy)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index 015703c3e25f4e11e64ab6a7de99da12bee608f6..52dfb9620f87306c18eb5149d49ab534e4db410c 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -99,7 +99,11 @@ class TestMNIST(TestParallelExecutorBase):
             fluid.recordio_writer.convert_reader_to_recordio_file(
                 MNIST_RECORDIO_FILE, reader, feeder)
 
-    def check_simple_fc_convergence(self, balance_parameter_opt_between_cards):
-        self.check_network_convergence(simple_fc_net)
-        self.check_network_convergence(simple_fc_net, allow_op_delay=True)
+    def check_simple_fc_convergence(self,
+                                    balance_parameter_opt_between_cards,
+                                    use_cuda=True):
+        """Run the simple_fc_net convergence checks with the given use_cuda."""
+        self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
+        self.check_network_convergence(
+            simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)
 
@@ -109,17 +111,19 @@ class TestMNIST(TestParallelExecutorBase):
             simple_fc_net,
             feed_dict={"image": img,
                        "label": label},
+            use_cuda=use_cuda,
             balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
         )
 
     def test_simple_fc(self):
-        self.check_simple_fc_convergence(False)
+        self.check_simple_fc_convergence(False, use_cuda=True)
 
     def test_simple_fc_with_new_strategy(self):
-        self.check_simple_fc_convergence(True)
+        self.check_simple_fc_convergence(True, use_cuda=True)
 
     def check_simple_fc_parallel_accuracy(self,
-                                          balance_parameter_opt_between_cards):
+                                          balance_parameter_opt_between_cards,
+                                          use_cuda=True):
         img = np.zeros(shape=[32, 784], dtype='float32')
         label = np.ones(shape=[32, 1], dtype='int64')
         single_first_loss, single_last_loss = self.check_network_convergence(
@@ -127,6 +131,7 @@ class TestMNIST(TestParallelExecutorBase):
             seed=1000,
             feed_dict={"image": img,
                        "label": label},
+            use_cuda=use_cuda,
             use_parallel_executor=False)
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
             method=simple_fc_net,
@@ -143,13 +148,15 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(False)
+        self.check_simple_fc_parallel_accuracy(False, use_cuda=True)
+        self.check_simple_fc_parallel_accuracy(False, use_cuda=False)
 
     def test_simple_fc_parallel_accuracy_with_new_strategy(self):
-        self.check_simple_fc_parallel_accuracy(True)
+        self.check_simple_fc_parallel_accuracy(True, use_cuda=True)
+        self.check_simple_fc_parallel_accuracy(True, use_cuda=False)
 
-    def check_batchnorm_fc_convergence(self,
-                                       balance_parameter_opt_between_cards):
-        self.check_network_convergence(fc_with_batchnorm)
+    def check_batchnorm_fc_convergence(
+            self, balance_parameter_opt_between_cards, use_cuda=True):
+        self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda)
         img = np.zeros(shape=[32, 784], dtype='float32')
         label = np.ones(shape=[32, 1], dtype='int64')
@@ -157,14 +164,17 @@ class TestMNIST(TestParallelExecutorBase):
             fc_with_batchnorm,
             feed_dict={"image": img,
                        "label": label},
+            use_cuda=use_cuda,
             balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
         )
 
     def test_batchnorm_fc(self):
-        self.check_batchnorm_fc_convergence(False)
+        self.check_batchnorm_fc_convergence(False, use_cuda=True)
+        self.check_batchnorm_fc_convergence(False, use_cuda=False)
 
     def test_batchnorm_fc_with_new_strategy(self):
-        self.check_batchnorm_fc_convergence(True)
+        self.check_batchnorm_fc_convergence(True, use_cuda=True)
+        self.check_batchnorm_fc_convergence(True, use_cuda=False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index 7bdddeaabec733ef26b3f766c6437f5c53d65044..6c2c0d5cc50dad04123bf864488457fc91036ffc 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -119,7 +119,7 @@ def reader_creator(data_file,
                 yield sample, int(label) - 1
 
     if use_xmap:
-        return xmap_readers(mapper, reader, cpu_count(), buffered_size)
+        return xmap_readers(mapper, reader, min(4, cpu_count()), buffered_size)
     else:
         return map_readers(mapper, reader)