diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index ee0df039acd446bc7952186d0870688f9b2dfb46..c18ba049c8f107f17afd7a8e08af6b3657cfd56d 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -30,7 +30,7 @@ Executor::Executor(const std::vector<platform::Place>& places) {
       device_contexts_[i] = new platform::CPUDeviceContext(
           boost::get<platform::CPUPlace>(places[i]));
     } else if (platform::is_gpu_place(places[i])) {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
       device_contexts_[i] = new platform::CUDADeviceContext(
           boost::get<platform::GPUPlace>(places[i]));
 #else
diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc
index 5e327cc893e3a393b758a06f6a6f33173d0e86db..55e209628b394c4fd43ce6c185918b8492e2f024 100644
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@@ -293,7 +293,7 @@ TEST_F(ExecutorTesterFeed, CPU) {
   delete executor;
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST_F(ExecutorTesterRandom, GPU) {
   std::vector<Place> places;
   GPUPlace gpu_place(0);
@@ -315,10 +315,20 @@ TEST_F(ExecutorTesterFeed, GPU) {
 
   Executor* executor = new Executor(places);
 
-  // need to set feed variable before Executor::Run
-  set_feed_variable(inputs_);
-  executor->Run(pdesc_, GetScope());
-
+  // 3 mini-batch
+  for (int i = 0; i < 3; i++) {
+    // need to set feed variable before Executor::Run
+    std::cout << "start mini-batch " << i << std::endl;
+    set_feed_variable(inputs_);
+    executor->Run(pdesc_, GetScope());
+    std::vector<std::vector<float>> result = get_fetch_variable();
+    for (auto& vec : result) {
+      for (auto& num : vec) {
+        std::cout << num << " ";
+      }
+      std::cout << std::endl;
+    }
+  }
   delete executor;
 }
 #endif
diff --git a/paddle/operators/fetch_op.cu b/paddle/operators/fetch_op.cu
index 2e24d3a8adcdce7e3443f23367a8f06bf6ec86b2..ca39d24c791ded71149777acc53e3b5cc240329f 100644
--- a/paddle/operators/fetch_op.cu
+++ b/paddle/operators/fetch_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/feed_op.h"
+#include "paddle/operators/fetch_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(fetch, ops::FetchKernel);
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index 486dcd623a66910fff06214937e0ae65f7b6be6e..aa76bb209d55a456c58e59591b60db1ad9436629 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -43,7 +43,8 @@ int GetCurrentDeviceId() {
 }
 
 void SetDeviceId(int id) {
-  PADDLE_ENFORCE(id < GetDeviceCount(), "id must less than GPU count");
+  // TODO(qijun): find a better way to cache the cuda device count
+  PADDLE_ENFORCE(id < GetCUDADeviceCount(), "id must less than GPU count");
   PADDLE_ENFORCE(cudaSetDevice(id),
                  "cudaSetDevice failed in paddle::platform::SetDeviceId");
 }