diff --git a/CMakeLists.txt b/CMakeLists.txt index 996a79fbbc3005680205e9fc0442b6bc6199bebb..9cfec8e70b4a3d166e3b45048408d7f5e45ce6e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -315,7 +315,6 @@ endif() if (ON_INFER) message(STATUS "On inference mode, will take place some specific optimization.") - add_definitions(-DPADDLE_ON_INFERENCE) else() #TODO(luotao), combine this warning with `make inference_lib_dist` command. message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 7f5771e561f6cc419fc9b3094174645ac432546e..4e17ddee73958106d5e2c8c8ea5661acc758518a 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -218,3 +218,7 @@ endif(WITH_GRPC) if(WITH_BRPC_RDMA) add_definitions(-DPADDLE_WITH_BRPC_RDMA) endif(WITH_BRPC_RDMA) + +if(ON_INFER) + add_definitions(-DPADDLE_ON_INFERENCE) +endif(ON_INFER) diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index c384456b648d4497bf4bd003b183b773186e0f15..e8e53f988f92685cd4854b21202bcf7f9b1a4383 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -70,6 +70,16 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, } void NaiveExecutor::Run() { +#ifndef PADDLE_ON_INFERENCE + LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the " + "cmake flag ON_INFER is not set."; + LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and " + "variables will be reused to save the allocation " + "overhead."; + LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by " + "setting the cmake flag ON_INFER=ON if you are " + "running Paddle Inference"; +#endif // PADDLE_ON_INFERENCE for (auto &op : ops_) { VLOG(3) << std::this_thread::get_id() << " run " << op->Type() << " on scope " << scope_; diff --git 
a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index c59b232191c49ccb47bb9f51dcaf2fd9280fae19..ac0330218973123771367ed5ba9477c90143a043 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -63,6 +63,8 @@ struct OpKernelType { place_(dev_ctx.GetPlace()), library_type_(library_type) {} + size_t hash_key() const { return Hash()(*this); } + bool operator==(const OpKernelType& o) const { return platform::places_are_same_class(place_, o.place_) && data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6bd744edc22e6a90ce64e9d699e7f3c5c60d4908..2b35943d092518c7f45a8ed3b708532666a23353 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false, namespace paddle { namespace framework { +// Combine two hash values to a single hash. +inline size_t CombineHash(size_t seed, size_t a) { + return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = { std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), @@ -794,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack( Scope* OperatorWithKernel::TryTransferData( const Scope& scope, const OpKernelType& expected_kernel_key, std::vector<std::string>* transfered_inplace_vars) const { +// In the inference scenario, the scopes will be reused across the batches, so +// the `new_scope` here will result in GPU memory explosion over the running of +// operators. +// We use a thread_local cache to fix that issue, the key in the cache is the +// combination of the `scope` argument, from_kernel_type, target_kernel_type. +// Have a discussion with @Superjomn or the inference developers if some changes +// on this logic for this macro might not be tested on the other scenarios. 
+#ifdef PADDLE_ON_INFERENCE + thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache; +#endif + Scope* new_scope = nullptr; for (auto& var_name_item : Inputs()) { for (auto& var_name : var_name_item.second) { @@ -824,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData( VLOG(30) << "Transform Variable " << var_name << " from " << kernel_type_for_var << " to " << expected_kernel_key; +#ifdef PADDLE_ON_INFERENCE + size_t infer_cache_key = + CombineHash(OpKernelType::Hash()(kernel_type_for_var), + OpKernelType::Hash()(expected_kernel_key)); + infer_cache_key = + CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope)); + + auto it = infer_transfer_scope_cache.find(infer_cache_key); + if (it != infer_transfer_scope_cache.end()) { + new_scope = infer_transfer_scope_cache[infer_cache_key]; + } else { + new_scope = &scope.NewScope(); + infer_transfer_scope_cache[infer_cache_key] = new_scope; + } +#endif + if (new_scope == nullptr) { new_scope = &scope.NewScope(); } auto* trans_var = new_scope->Var(var_name); + Tensor out; TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); SetTensorToVariable(*var, out, trans_var); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index bbeef150254f8f7a1f382a5b81055a6a5589eee1..26cb7d51a88afac15322eecad965912097d19a45 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -42,7 +42,7 @@ DEFINE_double( // a mean time, but a scope may be read by multiple threads concurrently, and // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. -#ifdef ON_INFER +#ifdef PADDLE_ON_INFERENCE #define SCOPE_LOCK_GUARD #else #define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);