diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h
index 67054eccb3397ea40f0fb3e2ff2530ee1ea64736..aa452ac220ea63bbf7a79c09b90aadfd2764856b 100644
--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
@@ -133,6 +133,32 @@ struct ExtractAttribute<std::vector<int64_t>> {
   const std::string& attr_name_;
 };
 
+template <>
+struct ExtractAttribute<float> {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  float* operator()(Attribute& attr) const {
+    if (attr.type() == typeid(int)) {  // NOLINT
+      int val = boost::get<int>(attr);
+      attr = static_cast<float>(val);
+    } else if (attr.type() == typeid(int64_t)) {  // NOLINT
+      int64_t val = boost::get<int64_t>(attr);
+      attr = static_cast<float>(val);
+    }
+    float* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<float>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type float, its type is %s",
+                   attr_name_, paddle::platform::demangle(attr.type().name()));
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
 template <typename T>
 inline proto::AttrType AttrTypeID() {
   Attribute tmp = T();
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index fcd98fb52d1f0236ec58c6ecd1a4269de7f804b5..2f7476aa38c363bee015ecf502ce68f10fbab9f6 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -885,12 +885,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   // To reduce the elapsed time of HasAttr, we use bool variable to record the
   // result of HasAttr.
-  if (!enable_cache_runtime_context && HasAttr(kEnableCacheRuntimeContext))
-    enable_cache_runtime_context = true;
-  if (!all_kernels_must_compute_runtime_shape &&
+  if (!enable_cache_runtime_context_ && HasAttr(kEnableCacheRuntimeContext))
+    enable_cache_runtime_context_ = true;
+  if (!all_kernels_must_compute_runtime_shape_ &&
       HasAttr(kAllKernelsMustComputeRuntimeShape))
-    all_kernels_must_compute_runtime_shape = true;
-  if (!enable_cache_runtime_context) {
+    all_kernels_must_compute_runtime_shape_ = true;
+  if (!enable_cache_runtime_context_) {
     RuntimeContext ctx(Inputs(), Outputs(), scope);
     RunImpl(scope, place, &ctx);
   } else {
@@ -931,7 +931,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     dev_ctx = pool.Get(kernel_type_->place_);
   }
 
-  if (!all_kernels_must_compute_runtime_shape) {
+  if (!all_kernels_must_compute_runtime_shape_) {
     RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx);
     this->InferShape(&infer_shape_ctx);
   }
@@ -981,6 +981,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
       }
     }
   }
+
+  // To solve issue #15032, after discussion with @Luotao: for cpu inference
+  // the transfer scope is not cached, so in that case delete the transfer
+  // scope after the run to avoid a memory leak.
+  if (transfer_scope && !run_by_executor_ && !enable_cache_transfer_scope_) {
+    scope.DeleteScope(transfer_scope);
+  }
 }
 
 void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
@@ -1114,9 +1121,18 @@ Scope* OperatorWithKernel::PrepareData(
       // If this op is not called by an Executor or ParallelExecutor, it should
       // called by a NaiveExecutor, the NaiveExecutor will cache the scopes and
       // variables, that behavior a lot different.
-      if (!run_by_executor_) {
+      //
+      // To solve issue #15032, after discussion with @Luotao: for cpu
+      // inference, when only cpu kernels are involved and no GPU takes part,
+      // do not cache the transfer scope; tests show cpu inference performance
+      // is not affected by this change.
+      enable_cache_transfer_scope_ = false;
+      if (!run_by_executor_ &&
+          (platform::is_gpu_place(kernel_type_for_var.place_) ||
+           platform::is_gpu_place(expected_kernel_key.place_))) {
         new_scope = TryCreateTransferScope(kernel_type_for_var,
                                            expected_kernel_key, &scope);
+        enable_cache_transfer_scope_ = true;
       }
       if (!new_scope) {
         new_scope = &scope.NewScope();
@@ -1125,11 +1141,11 @@ Scope* OperatorWithKernel::PrepareData(
       // each result of different input will be the same with the first one.
       // The reason is that if a gpu tensor is the input of a cpu kernel,
       // we will create a new cpu tensor in new scope.
-      // However, if enable_cache_runtime_context, we get the cpu tensor each
+      // However, if enable_cache_runtime_context_, we get the cpu tensor each
       // time, not the gpu tensor.
       // Thus, we set pre_scope_ = nullptr to trigger `new RuntimeContext()` in
       // RunImpl().
-      if (enable_cache_runtime_context) {
+      if (enable_cache_runtime_context_) {
         pre_scope_ = nullptr;
       }
 
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 8e158e93063cb7620440b0af8433c0baa02eab22..07e7abd5b29abde1473d26e5aea2719658b65838 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -499,9 +499,10 @@ class OperatorWithKernel : public OperatorBase {
   mutable std::unique_ptr<OpKernelFunc> kernel_func_;
   mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
   mutable const Scope* pre_scope_ = nullptr;
-  mutable bool enable_cache_runtime_context = false;
-  mutable bool all_kernels_must_compute_runtime_shape = false;
+  mutable bool enable_cache_runtime_context_ = false;
+  mutable bool all_kernels_must_compute_runtime_shape_ = false;
   mutable std::mutex cache_update_mutex_;
+  mutable bool enable_cache_transfer_scope_ = false;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 440b2475f1631ce5b0a1018ccd13849cc2568cd5..7de32094a07b95c3ec656892fa6adeacfbab8d0f 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -295,7 +295,9 @@ class AllocatorFacadePrivate {
 
 // Pimpl. Make interface clean.
 AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
-AllocatorFacade::~AllocatorFacade() { delete m_; }
+// Deleting m_ may cause a core dump when the Python destructor conflicts
+// with the C++ one.
+AllocatorFacade::~AllocatorFacade() {}
 
 AllocatorFacade& AllocatorFacade::Instance() {
   static AllocatorFacade instance;
diff --git a/paddle/fluid/pybind/pybind_boost_headers.h b/paddle/fluid/pybind/pybind_boost_headers.h
index 70c3136d095fbdcf27d6fec0b0b17140a3ee82ee..3eb4db175a745c8ea7a3afaff919e4f21d430a8b 100644
--- a/paddle/fluid/pybind/pybind_boost_headers.h
+++ b/paddle/fluid/pybind/pybind_boost_headers.h
@@ -77,6 +77,15 @@ struct paddle_variant_caster<V<Ts...>> {
       }
     }
 
+    if (std::is_same<T, float>::value) {
+      auto caster_int64 = make_caster<int64_t>();
+      if (caster_int64.load(src, convert)) {
+        VLOG(4) << "this value satisfies float and int64 simultaneously";
+        value = cast_op<int64_t>(caster_int64);
+        return true;
+      }
+    }
+
     value = cast_op<T>(caster);
     return true;
   }
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
index fd59c5bb7cff5dd33fae284ba3efe04e667ed75a..e22bd09ed06a5dc2385006498a7794a70c776de8 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
@@ -50,6 +50,34 @@ class TestFillConstantOp2(OpTest):
         self.check_output()
 
 
+class TestFillConstantOp3(OpTest):
+    def setUp(self):
+        '''Test fill_constant op with specified int64 value
+        '''
+        self.op_type = "fill_constant"
+
+        self.inputs = {}
+        self.attrs = {'shape': [123, 92], 'value': 10000000000}
+        self.outputs = {'Out': np.full((123, 92), 10000000000)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFillConstantOp4(OpTest):
+    def setUp(self):
+        '''Test fill_constant op with specified int value
+        '''
+        self.op_type = "fill_constant"
+
+        self.inputs = {}
+        self.attrs = {'shape': [123, 92], 'value': 3}
+        self.outputs = {'Out': np.full((123, 92), 3)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestFillConstantOpWithSelectedRows(OpTest):
     def check_with_place(self, place):
         scope = core.Scope()
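
Taken together, these changes let a Python integer that does not fit in int32 be carried through the attribute machinery as an int64 and converted to float only where a kernel asks for a float attribute. The sketch below is not part of the patch; it is a minimal usage example mirroring TestFillConstantOp3 above, assuming the fluid 1.x Python API (fluid.layers.fill_constant, fluid.Executor) and CPU execution.

import numpy as np
import paddle.fluid as fluid

# Build a program that fills a [123, 92] tensor with a value larger than
# INT32_MAX, which previously could not be passed through the op's float
# 'value' attribute from Python.
main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    out = fluid.layers.fill_constant(
        shape=[123, 92], dtype='int64', value=10000000000)

# Run on CPU and check that the large constant survives the attribute path.
exe = fluid.Executor(fluid.CPUPlace())
result, = exe.run(main_prog, fetch_list=[out])
assert np.all(result == 10000000000)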