diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 9a3a639579bd9d44f257c3f0f1aa63e0ae27e8e2..5b612677da3554f17ab3ac29ddc241eee5f7c768 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -33,6 +33,7 @@ limitations under the License. */
 #include <curand.h>
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
+
 #include "paddle/fluid/platform/cuda_error.pb.h"
 #endif  // PADDLE_WITH_CUDA
 
@@ -69,6 +70,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
+DECLARE_int32(call_stack_level);
+
 namespace paddle {
 namespace platform {
 
@@ -226,9 +229,7 @@ inline std::string SimplifyDemangleStr(std::string str) {
   return str;
 }
 
-template <typename StrType>
-inline std::string GetTraceBackString(StrType&& what, const char* file,
-                                      int line) {
+inline std::string GetCurrentTraceBackString() {
   static constexpr int TRACE_STACK_LIMIT = 100;
   std::ostringstream sout;
 
@@ -256,6 +257,13 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
 #else
   sout << "Windows not support stack backtrace yet.\n";
 #endif
+  return sout.str();
+}
+
+template <typename StrType>
+inline std::string GetErrorSumaryString(StrType&& what, const char* file,
+                                        int line) {
+  std::ostringstream sout;
   sout << "\n----------------------\nError Message "
           "Summary:\n----------------------\n";
   sout << string::Sprintf("%s at (%s:%d)", std::forward<StrType>(what), file,
@@ -264,6 +272,17 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
   return sout.str();
 }
 
+// Builds the error text: the error message summary, preceded by the C++
+// call stack only when FLAGS_call_stack_level > 1.
+template <typename StrType>
+inline std::string GetTraceBackString(StrType&& what, const char* file,
+                                      int line) {
+  std::string trace;
+  if (FLAGS_call_stack_level > 1) trace = GetCurrentTraceBackString();
+  // Forward `what` so its value category reaches the summary formatter.
+  return trace + GetErrorSumaryString(std::forward<StrType>(what), file, line);
+}
+
 inline bool is_error(bool stat) { return !stat; }
 
 inline void throw_on_error(bool stat, const std::string& msg) {
@@ -427,7 +446,7 @@ struct EnforceNotMet : public std::exception {
  *
  * Examples:
  *    GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
-*/
+ */
 #define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE)                   \
   (([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type {          \
     auto* __ptr = (__PTR);                                                  \
@@ -463,7 +482,7 @@ struct EnforceNotMet : public std::exception {
  *
  * Examples:
  *    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul");
-*/
+ */
 #define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE)                   \
   do {                                                                      \
     PADDLE_ENFORCE_EQ(__EXPR, true, paddle::platform::errors::NotFound(     \
@@ -491,7 +510,7 @@ struct EnforceNotMet : public std::exception {
  * Note: GCC 4.8 cannot select right overloaded function here, so need
  *    to define different functions and macros here, after we upgreade
  *    CI gcc version, we can only define one BOOST_GET macro.
-*/
+ */
 namespace details {
 
 #define DEFINE_SAFE_BOOST_GET(__InputType, __OutputType, __OutputTypePtr,      \
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 98bdf1f8c675da4e3a272945d605563e35016f8d..8667375c6f2726f1099c6e57c6e793252b01d454 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -483,3 +483,28 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0,  // MBytes
  * Note:
  */
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
+
+/**
+ * Debug related FLAG
+ * Name: FLAGS_call_stack_level
+ * Since Version: 2.0.0
+ * Value Range: int, default=2
+ * Example:
+ * Note: Used to debug. Determine the call stack to print when error or
+ * exception happens.
+ * If FLAGS_call_stack_level == 0, only the error message summary will be shown.
+ * If FLAGS_call_stack_level == 1, the python stack and error message summary
+ * will be shown.
+ * If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
+ * message summary will be shown.
+ */
+DEFINE_int32(
+    call_stack_level, 2,
+    "Determine the call stack to print when error or exception happens. "
+    // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
+    // "If FLAGS_call_stack_level == 0, only the error message summary will be "
+    // "shown. "
+    "If FLAGS_call_stack_level == 1, the python stack and error message "
+    "summary will be shown. "
+    "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
+    "error message summary will be shown.");
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index 5178b5f89adf3b8a39b303228d1e674b22e7dc2d..deca9625e63d05625c407a1282b396398bb78ccc 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/pybind/global_value_getter_setter.h"
+
 #include <cctype>
 #include <functional>
 #include <string>
@@ -20,6 +21,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/python_headers.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -35,6 +37,7 @@ DECLARE_bool(cpu_deterministic);
 DECLARE_bool(enable_rpc_profiler);
 DECLARE_int32(multiple_of_cupti_buffer_size);
 DECLARE_bool(reader_queue_speed_test_mode);
+DECLARE_int32(call_stack_level);
 // device management
 DECLARE_int32(paddle_num_threads);
 // executor
@@ -337,14 +340,15 @@ static void RegisterGlobalVarGetterSetter() {
   REGISTER_PUBLIC_GLOBAL_VAR(
       FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
       FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
-      FLAGS_cpu_deterministic, FLAGS_enable_rpc_profiler,
-      FLAGS_multiple_of_cupti_buffer_size, FLAGS_reader_queue_speed_test_mode,
-      FLAGS_pe_profile_fname, FLAGS_print_sub_graph_dir,
-      FLAGS_fraction_of_cpu_memory_to_use, FLAGS_fuse_parameter_groups_size,
-      FLAGS_fuse_parameter_memory_size, FLAGS_init_allocated_mem,
-      FLAGS_initial_cpu_memory_in_mb, FLAGS_memory_fraction_of_eager_deletion,
-      FLAGS_use_pinned_memory, FLAGS_benchmark, FLAGS_inner_op_parallelism,
-      FLAGS_tracer_profile_fname, FLAGS_paddle_num_threads);
+      FLAGS_call_stack_level, FLAGS_cpu_deterministic,
+      FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
+      FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
+      FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
+      FLAGS_fuse_parameter_groups_size, FLAGS_fuse_parameter_memory_size,
+      FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
+      FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
+      FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
+      FLAGS_paddle_num_threads);
 
 #ifdef PADDLE_WITH_CUDA
   REGISTER_PUBLIC_GLOBAL_VAR(
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 776a52b300fe0c7c582b59947e13e5ca98daf4e4..88dd815d937a4778b0d24a90d448a262689907f3 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -166,17 +166,34 @@ def __bootstrap__():
     os.environ['OMP_NUM_THREADS'] = str(num_threads)
     sysstr = platform.system()
     read_env_flags = [
-        'check_nan_inf', 'fast_check_nan_inf', 'benchmark',
-        'eager_delete_scope', 'fraction_of_cpu_memory_to_use',
-        'initial_cpu_memory_in_mb', 'init_allocated_mem', 'paddle_num_threads',
-        'dist_threadpool_size', 'eager_delete_tensor_gb',
-        'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
-        'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism',
-        'enable_parallel_graph', 'fuse_parameter_groups_size',
-        'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size',
-        'tracer_profile_fname', 'dygraph_debug', 'use_system_allocator',
-        'enable_unused_var_check', 'free_idle_chunk', 'free_when_no_cache_hit'
+        'check_nan_inf',
+        'fast_check_nan_inf',
+        'benchmark',
+        'eager_delete_scope',
+        'fraction_of_cpu_memory_to_use',
+        'initial_cpu_memory_in_mb',
+        'init_allocated_mem',
+        'paddle_num_threads',
+        'dist_threadpool_size',
+        'eager_delete_tensor_gb',
+        'fast_eager_deletion_mode',
+        'memory_fraction_of_eager_deletion',
+        'allocator_strategy',
+        'reader_queue_speed_test_mode',
+        'print_sub_graph_dir',
+        'pe_profile_fname',
+        'inner_op_parallelism',
+        'enable_parallel_graph',
+        'fuse_parameter_groups_size',
+        'multiple_of_cupti_buffer_size',
+        'fuse_parameter_memory_size',
+        'tracer_profile_fname',
+        'dygraph_debug',
+        'use_system_allocator',
+        'enable_unused_var_check',
+        'free_idle_chunk',
+        'free_when_no_cache_hit',
+        'call_stack_level',
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
@@ -208,12 +225,19 @@ def __bootstrap__():
 
     if core.is_compiled_with_cuda():
         read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb',
-            'reallocate_gpu_memory_in_mb', 'cudnn_deterministic',
-            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
-            'cudnn_exhaustive_search', 'selected_gpus', 'sync_nccl_allreduce',
-            'cudnn_batchnorm_spatial_persistent', 'gpu_allocator_retry_time',
-            'local_exe_sub_scope_limit', 'gpu_memory_limit_mb'
+            'fraction_of_gpu_memory_to_use',
+            'initial_gpu_memory_in_mb',
+            'reallocate_gpu_memory_in_mb',
+            'cudnn_deterministic',
+            'enable_cublas_tensor_op_math',
+            'conv_workspace_size_limit',
+            'cudnn_exhaustive_search',
+            'selected_gpus',
+            'sync_nccl_allreduce',
+            'cudnn_batchnorm_spatial_persistent',
+            'gpu_allocator_retry_time',
+            'local_exe_sub_scope_limit',
+            'gpu_memory_limit_mb',
         ]
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])