diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index cdd67fdc929851979fe0a38afe1af74ec7321b8a..67d355d10d3c9e11b59c9ce9d208826523095459 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -19,10 +19,14 @@ function (inference_analysis_test TARGET)
         set(multiValueArgs SRCS)
         cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
+        set(mem_opt "")
+        if(WITH_GPU)
+            set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
+        endif()
         cc_test(${TARGET}
                 SRCS "${analysis_test_SRCS}"
                 DEPS analysis
-                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5)
+                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
         set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
     endif(WITH_TESTING)
 endfunction(inference_analysis_test)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 216c4666c0a311f93f29692b2ca1d17bf9dafab8..3e13e7b1ffebf92301df69084b058ca55783e578 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -248,15 +248,11 @@ PYBIND11_PLUGIN(core) {
 #endif
            })
       .def("rows", [](SelectedRows &self) {
-#ifndef PADDLE_WITH_CUDA
-        return self.rows();
-#else
-         auto rows = self.rows();
-         std::vector<int64_t> new_rows;
-         new_rows.reserve(rows.size());
-         std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
-         return new_rows;
-#endif
+        auto rows = self.rows();
+        std::vector<int64_t> new_rows;
+        new_rows.reserve(rows.size());
+        std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
+        return new_rows;
       });
 
   py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
diff --git a/paddle/fluid/pybind/recordio.cc b/paddle/fluid/pybind/recordio.cc
index 330d104e0a774d905e463566f85bd2e64a080190..f83b026d4d50772b969c4316964b70a68b27442b 100644
--- a/paddle/fluid/pybind/recordio.cc
+++ b/paddle/fluid/pybind/recordio.cc
@@ -30,7 +30,9 @@ class RecordIOWriter {
  public:
   RecordIOWriter(const std::string& filename, recordio::Compressor compressor,
                  size_t max_num_record)
-      : stream_(filename), writer_(&stream_, compressor, max_num_record) {}
+      : closed_(false),
+        stream_(filename),
+        writer_(&stream_, compressor, max_num_record) {}
 
   void AppendTensor(const framework::LoDTensor& tensor) {
     tensors_.push_back(tensor);
@@ -47,9 +49,17 @@ class RecordIOWriter {
     PADDLE_ENFORCE(tensors_.empty());
     writer_.Flush();
     stream_.close();
+    closed_ = true;
+  }
+
+  ~RecordIOWriter() {
+    if (!closed_) {
+      Close();
+    }
   }
 
  private:
+  bool closed_;
   std::vector<framework::LoDTensor> tensors_;
   std::ofstream stream_;
   recordio::Writer writer_;
diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py
index 9d05aeeb95c4f936cb773ece20407ecb32cbbf21..6259cc35b4f7bb781886bb5da9d16924831d7246 100644
--- a/python/paddle/dataset/mnist.py
+++ b/python/paddle/dataset/mnist.py
@@ -68,8 +68,14 @@ def reader_creator(image_filename, label_filename, buffer_size):
                 for i in xrange(buffer_size):
                     yield images[i, :], int(labels[i])
         finally:
-            m.terminate()
-            l.terminate()
+            try:
+                m.terminate()
+            except:
+                pass
+            try:
+                l.terminate()
+            except:
+                pass
 
     return reader
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
index 9a09db25dc0e2c71772aa06e6d0cf993321612e4..fd278f45f1c1b71a1653c3b28ace8bca8e4b1545 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -14,6 +14,7 @@
 from __future__ import print_function
 import argparse
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import paddle
 import sys
 import numpy
@@ -134,4 +135,4 @@ def main(use_cuda):
 
 if __name__ == '__main__':
     # for use_cuda in (False, True):
-    main(use_cuda=True)
+    main(use_cuda=core.is_compiled_with_cuda())
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index 49f549fa184037a64aa846f0d1d0e1b57db1f2ef..c471863920999a28cbede93a7965f07ee784f96d 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from __future__ import print_function
 
+import paddle.fluid.core as core
 import math
 import os
 import sys
@@ -257,6 +258,8 @@ def inject_test_method(use_cuda, parallel, nn_type, combine):
 
 def inject_all_tests():
     for use_cuda in (False, True):
+        if use_cuda and not core.is_compiled_with_cuda():
+            continue
         for parallel in (False, True):
             for nn_type in ('mlp', 'conv'):
                 inject_test_method(use_cuda, parallel, nn_type, True)
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index 80e0692bc640efc280c43bd5b929847ad29207c4..3b957508ca1f11fea3bbc182dca7eaa938594cb6 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -245,7 +245,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
                     is_sparse=is_sparse,
                     is_parallel=is_parallel)
 
-    if use_cuda and is_sparse:
+    if (not fluid.core.is_compiled_with_cuda() or use_cuda) and is_sparse:
         fn = __impl__
     else:
         # skip the other test when on CI server
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index f6c8dcabcbc592024188f4742e6c532a704d2289..8c564abf986d351b31e993b62bfe1f17c52a4e10 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -12,6 +12,11 @@ endif(NOT WITH_MKLDNN)
 
 if(NOT WITH_DISTRIBUTE)
     list(REMOVE_ITEM TEST_OPS test_recv_op)
+    list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
+    list(REMOVE_ITEM TEST_OPS test_simple_dist_transpiler)
+    list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec)
 endif(NOT WITH_DISTRIBUTE)
 
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
@@ -47,9 +52,11 @@ foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
-py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
+if(WITH_DISTRIBUTE)
+    py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
+    set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
+    set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
+    set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180)
+endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
-set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
-set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
-set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180)
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
index db5771f7b0ad74c73b81d502209c17dce3ce8457..4a3ac2a31e072eb1a15af31f558cf9f626a7ac53 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
@@ -100,6 +100,8 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
             np.array_equal(np.array(sentence_scores), expected_data))
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestBeamSearchDecodeOpGPU(TestBeamSearchDecodeOp):
     def setUp(self):
         self.scope = core.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index 07545e7feb46c85a4b80f9b846be27d36cbfb59a..af6cd99b0d7e6b0a2dfd4fc1d33e8390017a5906 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -191,12 +191,16 @@ class TestWithDilation(TestConv2dTransposeOp):
 
 
 # ------------ test_cudnn ------------
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNN(TestConv2dTransposeOp):
     def init_op_type(self):
         self.use_cudnn = True
         self.op_type = "conv2d_transpose"
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithPad(TestWithPad):
     def init_test_case(self):
         self.pad = [1, 1]
@@ -212,6 +216,8 @@ class TestCUDNNWithPad(TestWithPad):
         self.op_type = "conv2d_transpose"
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithStride(TestWithStride):
     def init_test_case(self):
         self.pad = [1, 1]
@@ -227,6 +233,8 @@ class TestCUDNNWithStride(TestWithStride):
         self.op_type = "conv2d_transpose"
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithGroups(TestWithGroups):
     def init_test_case(self):
         self.pad = [1, 1]
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
index c9f26d10df8ff39d6bd77b1597336600f676d362..300fa5e8bde001e0f66c5f924a81c30add99aead 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
@@ -197,12 +197,16 @@ class TestWithDilation(TestConv3dTransposeOp):
 
 
 # ------------ test_cudnn ------------
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNN(TestConv3dTransposeOp):
     def init_op_type(self):
         self.use_cudnn = True
         self.op_type = "conv3d_transpose"
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithPad(TestWithPad):
     def init_test_case(self):
         self.pad = [1, 1, 1]
@@ -218,6 +222,8 @@ class TestCUDNNWithPad(TestWithPad):
         self.op_type = "conv3d_transpose"
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithStride(TestWithStride):
     def init_test_case(self):
         self.pad = [1, 1, 1]
@@ -233,6 +239,8 @@ class TestCUDNNWithStride(TestWithStride):
         self.op_type = "conv3d_transpose"
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithGroups(TestWithGroups):
     def init_test_case(self):
         self.pad = [1, 1, 1]
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
index 1f5d2f16773efb7537de85abec88344f8e0daa9f..60d63364d5f403f04519363db5ad3ad136f8a975 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -15,6 +15,7 @@
 import paddle.dataset.flowers as flowers
 import math
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import unittest
 import numpy as np
 import paddle
@@ -92,7 +93,8 @@ class TestFetchOp(unittest.TestCase):
             train_inputs.append(tst_reader_iter.next())
 
         os.environ['CPU_NUM'] = str(4)
-        self.parallel_exe(train_inputs, seed=1, use_cuda=True)
+        if core.is_compiled_with_cuda():
+            self.parallel_exe(train_inputs, seed=1, use_cuda=True)
         self.parallel_exe(train_inputs, seed=1, use_cuda=False)
 
 
@@ -137,7 +139,8 @@ class TestFeedParallel(unittest.TestCase):
 
     def test_feed_op(self):
         os.environ['CPU_NUM'] = str(4)
-        self.parallel_exe(use_cuda=True, seed=1)
+        if core.is_compiled_with_cuda():
+            self.parallel_exe(use_cuda=True, seed=1)
         self.parallel_exe(use_cuda=False, seed=1)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index f098dc7a3fb670e23471c2aa897011a3cf882c33..4580ab85e235e9f73d138058ac6a9aa3beb49a54 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -14,6 +14,7 @@
 
 from parallel_executor_test_base import TestParallelExecutorBase
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import numpy as np
 import paddle
 import paddle.dataset.mnist as mnist
@@ -102,6 +103,8 @@ class TestMNIST(TestParallelExecutorBase):
                 MNIST_RECORDIO_FILE, reader, feeder)
 
     def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
         self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
         self.check_network_convergence(
             simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)
@@ -126,6 +129,8 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_simple_fc_convergence(False, True)
 
     def check_simple_fc_parallel_accuracy(self, use_cuda, use_reduce=False):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
         img = np.zeros(shape=[32, 784], dtype='float32')
         label = np.ones(shape=[32, 1], dtype='int64')
         single_first_loss, single_last_loss = self.check_network_convergence(
@@ -159,6 +164,8 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_simple_fc_parallel_accuracy(False, True)
 
     def check_batchnorm_fc_convergence(self, use_cuda, use_reduce=False):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
         self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda)
         img = np.zeros(shape=[32, 784], dtype='float32')
         label = np.ones(shape=[32, 1], dtype='int64')
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
index 4d39505b66abf44249e0ea160b82aaf7be0638cb..834e920845f29b153909a971eb5afc4f8a33346e 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -16,6 +16,7 @@ import paddle.fluid as fluid
 import paddle.fluid.layers.ops as ops
 from paddle.fluid.initializer import init_on_cpu
 from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
+import paddle.fluid.core as core
 from parallel_executor_test_base import TestParallelExecutorBase
 import unittest
 import math
@@ -140,6 +141,9 @@ class TestResnet(TestParallelExecutorBase):
                                                           use_reduce=False,
                                                           iter=20):
 
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
         os.environ['CPU_NUM'] = str(4)
 
         def _cosine_decay(learning_rate, step_each_epoch, epochs=120):
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index 9a2733927d38f1a2b1af92fcc12f036158b4d06f..7688b8495d7f7c60e80f62dae2edd72be9f839d4 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import numpy as np
 import unittest
 import os
@@ -92,16 +93,18 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
     def test_parallel_testing(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
-        self.check_network_convergence(
-            use_cuda=True, build_strategy=build_strategy)
+        if core.is_compiled_with_cuda():
+            self.check_network_convergence(
+                use_cuda=True, build_strategy=build_strategy)
         self.check_network_convergence(
             use_cuda=False, build_strategy=build_strategy)
 
     def test_parallel_testing_with_new_strategy(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        self.check_network_convergence(
-            use_cuda=True, build_strategy=build_strategy)
+        if core.is_compiled_with_cuda():
+            self.check_network_convergence(
+                use_cuda=True, build_strategy=build_strategy)
         self.check_network_convergence(
             use_cuda=False, build_strategy=build_strategy)
 
diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py
index e01af42a58b86042fd0282928d1a78d9c3239fe3..b461c5c9401d74ef8dcf4afc84dc0ea6920a2419 100644
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
@@ -56,6 +56,8 @@ class TestPrintOpCPU(unittest.TestCase):
                        return_numpy=False)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestPrintOpGPU(TestPrintOpCPU):
     def setUp(self):
         self.place = core.CUDAPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
index cf6fe14a86aa1ab6ea3f60ad15f33d708e9b803a..9f8d33f9bbfc78b6f1a0c089b34b2f41d561c640 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -79,12 +79,18 @@ class TestProfiler(unittest.TestCase):
                 pass_acc_calculator.add(value=acc, weight=b_size)
                 pass_acc = pass_acc_calculator.eval()
 
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "profiler is enabled only with GPU")
     def test_cpu_profiler(self):
         self.net_profiler('CPU')
 
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "profiler is enabled only with GPU")
     def test_cuda_profiler(self):
         self.net_profiler('GPU')
 
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "profiler is enabled only with GPU")
     def test_all_profiler(self):
         self.net_profiler('All', '/tmp/profile_out')
         with open('/tmp/profile_out', 'r') as f:
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
index e91a69a0f8039651225039beb2a42e8dffeb62d3..c4fc8b74cf80c3596b0af9f7f0434864591195bd 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
@@ -61,6 +61,8 @@ class TestSequenceSoftmaxOp(OpTest):
 
 
 # ----------------cudnn Sequencesoftmax----------------
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestSequenceSoftmaxCUDNNOp(TestSequenceSoftmaxOp):
     def init_op_type(self):
         self.use_cudnn = True
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index 279f3073f73d1c36f54bb901d92441a7403ac23f..0ab581cfb0ea0ff2205450b8e62edb8bf3c51707 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -63,11 +63,15 @@ class TestSoftmaxOp(OpTest):
             self.check_grad(["X"], "Out", max_relative_error=0.01)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp(TestSoftmaxOp):
     def init_kernel_type(self):
         self.use_cudnn = True
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestSoftmaxFP16Op(TestSoftmaxOp):
     def init_kernel_type(self):
         self.dtype = np.float16
@@ -79,6 +83,8 @@ class TestSoftmaxFP16Op(TestSoftmaxOp):
                 self.check_output_with_place(place, atol=1e-3)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
     def init_kernel_type(self):
         self.use_cudnn = True
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
index 2b959c48e4bc62e08f6f57981b61b7c5fe3a1d06..026cf501cfb35ab3fe35d24f52d3c271565482ef 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -68,8 +68,14 @@ def reader_creator(image_filename, label_filename, buffer_size):
                 for i in xrange(buffer_size):
                     yield images[i, :], int(labels[i])
         finally:
-            m.terminate()
-            l.terminate()
+            try:
+                m.terminate()
+            except:
+                pass
+            try:
+                l.terminate()
+            except:
+                pass
 
     return reader