diff --git a/paddle/fluid/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc
index 86f7046058c7001fcaa588727b1cdc0f3f20c35f..9a139ab27ec53395a8d1ab1347dbce93ea68fd8e 100644
--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
@@ -29,6 +29,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Y", string::Sprintf(
                       "(LoDTensor) the right hand operand of %s operator",
                       comment.type));
+    AddAttr<bool>("force_cpu",
+                  "(bool, default false) Force fill output variable to cpu "
+                  "memory. Otherwise, fill output variable to the running "
+                  "device")
+        .SetDefault(false);
     AddOutput("Out", string::Sprintf(
                          "(LoDTensor) n-dim bool tensor. Each element is %s",
                          comment.equation));
@@ -75,7 +80,9 @@ class CompareOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
     // CompareOp kernel's device type is decided by input tensor place
-    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+    bool force_cpu = ctx.Attr<bool>("force_cpu");
+    kt.place_ = force_cpu ? platform::CPUPlace()
+                          : ctx.Input<framework::LoDTensor>("X")->place();
     return kt;
   }
 };
diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc
index 8c1a2549e0390dd6e0603dd1bed429ff25ad3220..8b62b242cf8745378eb216db10605388b294ca75 100644
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -54,6 +54,8 @@ class WhileOp : public framework::OperatorBase {
     auto step_scopes =
         scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
 
+    PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
+                   "Condition of while op must in CPU memory.");
     while (cond.data<bool>()[0]) {
       auto &current_scope = scope.NewScope();
       step_scopes->push_back(&current_scope);
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index af55ef49beaeab612f17369c766385d661aa4ab7..fbfc383d118acca42e16d27a8dc962b0ba0a539b 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -18,6 +18,7 @@ from tensor import assign, fill_constant
 from .. import core
 from ..framework import Program, Variable, Operator
 from ..layer_helper import LayerHelper, unique_name
+from ..initializer import force_init_on_cpu
 from ops import logical_and, logical_not, logical_or
 
 __all__ = [
@@ -949,7 +950,7 @@ def create_array(dtype):
         dtype=dtype)
 
 
-def less_than(x, y, cond=None, **ignored):
+def less_than(x, y, force_cpu=True, cond=None, **ignored):
     """
     **Less than**
 
@@ -958,6 +959,7 @@ def less_than(x, y, cond=None, **ignored):
     Args:
         x(Variable): First operand of *less_than*
         y(Variable): Second operand of *less_than*
+        force_cpu(bool|True): If True, the output data is forced into CPU memory.
         cond(Variable|None): Optional output variable to store the result of *less_than*
 
     Returns:
@@ -974,8 +976,11 @@ def less_than(x, y, cond=None, **ignored):
         cond.stop_gradient = True
 
     helper.append_op(
-        type='less_than', inputs={'X': [x],
-                                  'Y': [y]}, outputs={'Out': [cond]})
+        type='less_than',
+        inputs={'X': [x],
+                'Y': [y]},
+        outputs={'Out': [cond]},
+        attrs={'force_cpu': force_cpu or force_init_on_cpu()})
     return cond
 
 
@@ -1395,7 +1400,8 @@ class DynamicRNN(object):
                 type='less_than',
                 inputs={'X': self.step_idx,
                         'Y': self.max_seq_len},
-                outputs={'Out': self.cond})
+                outputs={'Out': self.cond},
+                attrs={'force_cpu': True})
 
         input_array = parent_block.create_var(
             name=unique_name.generate('dynamic_rnn_input_array'),
@@ -1443,7 +1449,11 @@ class DynamicRNN(object):
             for new_mem, mem_array in self.mem_link:
                 array_write(x=new_mem, i=self.step_idx, array=mem_array)
 
-            less_than(x=self.step_idx, y=self.max_seq_len, cond=self.cond)
+            less_than(
+                x=self.step_idx,
+                y=self.max_seq_len,
+                force_cpu=True,
+                cond=self.cond)
 
         self.status = DynamicRNN.AFTER_RNN
         for each_array in self.output_array: