diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu
index c527bc74eee93fe1a69ae82d8c3fc674406f35e5..f2c78e0f70b321814b890d3a0b6e6dffb7cc689c 100644
--- a/paddle/fluid/operators/range_op.cu
+++ b/paddle/fluid/operators/range_op.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/range_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@@ -33,13 +34,26 @@ class CUDARangeKernel : public framework::OpKernel<T> {
     auto* step_t = context.Input<framework::Tensor>("Step");
     auto* out = context.Output<framework::Tensor>("Out");
 
+    T start, end, step;
     framework::Tensor n;
-    framework::TensorCopy(*start_t, platform::CPUPlace(), &n);
-    T start = n.data<T>()[0];
-    framework::TensorCopy(*end_t, platform::CPUPlace(), &n);
-    T end = n.data<T>()[0];
-    framework::TensorCopy(*step_t, platform::CPUPlace(), &n);
-    T step = n.data<T>()[0];
+    if (::paddle::platform::is_cpu_place(start_t->place())) {
+      start = start_t->data<T>()[0];
+    } else {
+      framework::TensorCopy(*start_t, platform::CPUPlace(), &n);
+      start = n.data<T>()[0];
+    }
+    if (::paddle::platform::is_cpu_place(end_t->place())) {
+      end = end_t->data<T>()[0];
+    } else {
+      framework::TensorCopy(*end_t, platform::CPUPlace(), &n);
+      end = n.data<T>()[0];
+    }
+    if (::paddle::platform::is_cpu_place(step_t->place())) {
+      step = step_t->data<T>()[0];
+    } else {
+      framework::TensorCopy(*step_t, platform::CPUPlace(), &n);
+      step = n.data<T>()[0];
+    }
 
     int64_t size = 0;
     GetSize(start, end, step, &size);
@@ -47,7 +61,7 @@ class CUDARangeKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
 
     auto stream = context.cuda_device_context().stream();
-    int block = 512;
+    int block = std::min(size, static_cast<int64_t>(256));
     int grid = (size + block - 1) / block;
     RangeKernel<T><<<grid, block, 0, stream>>>(start, step, size, out_data);
   }
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index cd0d652af849507804ae98d2e26e28b2268fe6ff..84f99962e843072b41be57d329900729659d71c7 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -1374,19 +1374,19 @@ def range(start, end, step, dtype, name=None):
 
     if not isinstance(start, Variable):
         with device_guard("cpu"):
-            start = fill_constant([1], dtype, start)
+            start = fill_constant([1], dtype, start, force_cpu=True)
     elif start.dtype != dtype:
         start = cast(start, dtype)
 
     if not isinstance(end, Variable):
         with device_guard("cpu"):
-            end = fill_constant([1], dtype, end)
+            end = fill_constant([1], dtype, end, force_cpu=True)
     elif end.dtype != dtype:
         end = cast(end, dtype)
 
     if not isinstance(step, Variable):
         with device_guard("cpu"):
-            step = fill_constant([1], dtype, step)
+            step = fill_constant([1], dtype, step, force_cpu=True)
     elif step.dtype != dtype:
         step = cast(step, dtype)