diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 49248edd322d29e015d4b2e4f8ec20cc592c4a22..6af07caaf88b2a907807b84e63d4ed5499ca98d1 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -2176,6 +2176,16 @@ Scope* OperatorWithKernel::PreparePhiData(
       if (!new_scope) {
        new_scope = &scope.NewScope();
       }
+      // For inference, if a gpu model has an op which could only run on CPU,
+      // each result of different input will be the same with the first one.
+      // The reason is that if a gpu tensor is the input of a cpu kernel,
+      // we will create a new cpu tensor in new scope.
+      // However, if enable_cache_runtime_context_, we get the cpu tensor each
+      // time, not the gpu tensor. Thus, we set pre_scope_ = nullptr
+      // to trigger `new RuntimeContext()` in RunImpl().
+      if (enable_cache_runtime_context_) {
+        pre_scope_ = nullptr;
+      }
 
       // Create new var with the same name in transfer scopes
       auto* trans_var = new_scope->Var(name_vec[offset]);
diff --git a/paddle/phi/kernels/gpu/arange_kernel.cu b/paddle/phi/kernels/gpu/arange_kernel.cu
index 916f6aa5537a6fd0d3e6a95d7a2ab40dd2115186..9ea0d7c5393c37cf51bd37be86a45c4b3432cc64 100644
--- a/paddle/phi/kernels/gpu/arange_kernel.cu
+++ b/paddle/phi/kernels/gpu/arange_kernel.cu
@@ -64,7 +64,7 @@ void ArangeKernel(const Context& dev_ctx,
 
 PD_REGISTER_KERNEL(
     arange, GPU, ALL_LAYOUT, phi::ArangeKernel, float, double, int64_t, int) {
-  kernel->InputAt(0).SetBackend(phi::Backend::CPU);
-  kernel->InputAt(1).SetBackend(phi::Backend::CPU);
-  kernel->InputAt(2).SetBackend(phi::Backend::CPU);
+  kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
+  kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND);
+  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
 }
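
The first hunk, in plain terms: when `PreparePhiData` creates a transfer scope (a GPU tensor feeding a CPU-only kernel), a cached `RuntimeContext` keeps pointing at the CPU copy made on the first run, so every later inference run reproduces the first result; resetting `pre_scope_` forces `RunImpl()` to rebuild the context. The standalone sketch below is only an illustration of that failure mode and the fix, not Paddle code: `Tensor`, `Scope`, `Op`, and `Run` are assumed stand-ins for the real framework classes.

```cpp
// Minimal sketch of the caching pitfall: a cached "runtime context" that
// points at the first run's transferred CPU copy keeps serving stale data.
#include <iostream>
#include <string>
#include <unordered_map>

struct Tensor { int value = 0; };

// A scope maps variable names to tensors (loosely modeled on framework::Scope).
struct Scope {
  std::unordered_map<std::string, Tensor> vars;
};

struct Op {
  // Cached "runtime context": here just a pointer to the CPU copy resolved on
  // the first run.
  Tensor* cached_input = nullptr;
  bool enable_cache_runtime_context = true;

  int Run(Scope* scope, const Tensor& gpu_input, bool invalidate_cache) {
    // Mimics `pre_scope_ = nullptr`: force the context to be rebuilt.
    if (invalidate_cache) cached_input = nullptr;
    if (!(enable_cache_runtime_context && cached_input != nullptr)) {
      // "Data transfer": copy the GPU input into a CPU tensor in the transfer
      // scope and cache a pointer to it.
      scope->vars["x@cpu"] = gpu_input;
      cached_input = &scope->vars["x@cpu"];
    }
    // The kernel reads the (possibly stale) CPU copy.
    return cached_input->value;
  }
};

int main() {
  Scope scope_buggy, scope_fixed;
  Op op_buggy, op_fixed;

  int inputs[] = {1, 2, 3};
  for (int v : inputs) {
    Tensor gpu_input{v};
    std::cout << "input=" << v
              << "  cached context (buggy): "
              << op_buggy.Run(&scope_buggy, gpu_input, /*invalidate_cache=*/false)
              << "  rebuilt context (fixed): "
              << op_fixed.Run(&scope_fixed, gpu_input, /*invalidate_cache=*/true)
              << "\n";
  }
  // Expected: the buggy column prints 1, 1, 1 (first result repeated),
  // while the fixed column tracks the actual inputs 1, 2, 3.
  return 0;
}
```

The second hunk is complementary: registering the GPU `arange` kernel's start/end/step inputs as `ALL_BACKEND` instead of pinning them to `CPU` means the framework no longer has to move GPU-resident start/end/step tensors into a host-side transfer scope before the kernel runs, which is the very path that triggered the stale-cache problem above.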