diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index 595aba887303d0b8f50a1f95750311bb4ca27959..5953f2443044f355352c9febc02eda05d957d8ec 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -11,7 +11,7 @@ cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
 cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal)
 cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
 cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
-cc_library(imperative_profiler SRCS profiler.cc)
+cc_library(imperative_profiler SRCS profiler.cc DEPS flags)
 if(NOT WIN32)
 if(WITH_NCCL OR WITH_RCCL)
 cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows tensor)
diff --git a/paddle/fluid/imperative/flags.cc b/paddle/fluid/imperative/flags.cc
index 57656d64ab78868a5c1c5eb73520523cd0f5d0b5..c2d668eccdaf9377d1ec2a62e50648bf48c67c00 100644
--- a/paddle/fluid/imperative/flags.cc
+++ b/paddle/fluid/imperative/flags.cc
@@ -13,11 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/flags.h"
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
 
-DEFINE_uint64(dygraph_debug, 0,
-              "Debug level of dygraph. This flag is not "
-              "open to users");
+PADDLE_DEFINE_EXPORTED_uint64(dygraph_debug, 0,
+                              "Debug level of dygraph. This flag is not "
+                              "open to users");
 
 namespace paddle {
 namespace imperative {
diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc
index 6d0f6a12f522977564c9dc36029670d4f3c9d752..48af63056c5e361736046aa47d83735190d85c20 100644
--- a/paddle/fluid/imperative/profiler.cc
+++ b/paddle/fluid/imperative/profiler.cc
@@ -19,9 +19,9 @@
 #endif
 #include
 #include  // NOLINT
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
 
-DEFINE_string(
+PADDLE_DEFINE_EXPORTED_string(
     tracer_profile_fname, "xxgperf",
     "Profiler filename for imperative tracer, which generated by gperftools."
     "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 9a0637453f03f08a50bb1af958b1ba5e584869b4..6b4afae9f8c7527060004ae0b342d6a508a1b4d5 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -99,7 +99,7 @@ cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc
 cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
 
-cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator)
+cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator flags)
 cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
 cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator)
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index bfc4a1d598200ed296bdb17e29c48bed2bca1e16..78bce53b6f4ffb2f2a77b5ec2a9c645f32651de2 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -37,14 +37,15 @@
 #endif
 #include "paddle/fluid/platform/npu_info.h"
 
-DEFINE_int64(
+PADDLE_DEFINE_EXPORTED_int64(
     gpu_allocator_retry_time, 10000,
     "The retry time (milliseconds) when allocator fails "
     "to allocate memory. No retry if this value is not greater than 0");
 
-DEFINE_bool(use_system_allocator, false,
-            "Whether to use system allocator to allocate CPU and GPU memory. "
-            "Only used for unittests.");
+PADDLE_DEFINE_EXPORTED_bool(
+    use_system_allocator, false,
+    "Whether to use system allocator to allocate CPU and GPU memory. "
+    "Only used for unittests.");
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index cca29797bb68c27514e27e3ced291202915acc64..a35d8a73f7edae601ac5ab4c01f38e8772e724b3 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -17,18 +17,21 @@
 #include
 #include  // NOLINT
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
-
-DEFINE_bool(free_idle_chunk, false,
-            "Whether to free idle chunk when each allocation is freed. "
-            "If false, all freed allocation would be cached to speed up next "
-            "allocation request. If true, no allocation would be cached. This "
-            "flag only works when FLAGS_allocator_strategy=auto_growth.");
-
-DEFINE_bool(free_when_no_cache_hit, false,
-            "Whether to free idle chunks when no cache hit. If true, idle "
-            "chunk would be freed when no cache hit; if false, idle "
-            "chunk would be freed when out of memory occurs. This flag "
-            "only works when FLAGS_allocator_strategy=auto_growth.");
+#include "paddle/fluid/platform/flags.h"
+
+PADDLE_DEFINE_EXPORTED_READONLY_bool(
+    free_idle_chunk, false,
+    "Whether to free idle chunk when each allocation is freed. "
+    "If false, all freed allocation would be cached to speed up next "
+    "allocation request. If true, no allocation would be cached. This "
+    "flag only works when FLAGS_allocator_strategy=auto_growth.");
+
+PADDLE_DEFINE_EXPORTED_READONLY_bool(
+    free_when_no_cache_hit, false,
+    "Whether to free idle chunks when no cache hit. If true, idle "
+    "chunk would be freed when no cache hit; if false, idle "
+    "chunk would be freed when out of memory occurs. This flag "
+    "only works when FLAGS_allocator_strategy=auto_growth.");
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index 9cd35ad8ad9da959606d895063fe1981c5ade18f..2c00b34dd1353b583a680fd12416fa4d545566ad 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -34,12 +34,13 @@
 #include "paddle/fluid/platform/xpu/xpu_header.h"
 #endif
 
-DEFINE_bool(init_allocated_mem, false,
-            "It is a mistake that the values of the memory allocated by "
-            "BuddyAllocator are always zeroed in some op's implementation. "
-            "To find this error in time, we use init_allocated_mem to indicate "
-            "that initializing the allocated memory with a small value "
-            "during unit testing.");
+PADDLE_DEFINE_EXPORTED_bool(
+    init_allocated_mem, false,
+    "It is a mistake that the values of the memory allocated by "
+    "BuddyAllocator are always zeroed in some op's implementation. "
+    "To find this error in time, we use init_allocated_mem to indicate "
+    "that initializing the allocated memory with a small value "
+    "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
index 8e249e72db514a790c977558135ed5ec3a3ab35a..cd1bdc4d60c7496878d2d2a36021fc6efd6f4443 100644
--- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
+++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
@@ -15,7 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 
-DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
+PADDLE_DEFINE_EXPORTED_int32(rpc_send_thread_num, 12,
+                             "number of threads for rpc send");
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index d99f991911e9ca9ebeca7a6d7fbf31ada33bf811..d0e701b0235574f78a612f351fdd082cae4e8a23 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -37,13 +37,13 @@ if (WITH_PYTHON)
 endif(NOT WIN32)
 endif()
 
-cc_library(flags SRCS flags.cc DEPS gflags)
+cc_library(flags SRCS flags.cc DEPS gflags boost)
 cc_library(denormal SRCS denormal.cc DEPS)
 
 cc_library(errors SRCS errors.cc DEPS error_codes_proto)
 cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
 
-set(enforce_deps flags errors boost)
+set(enforce_deps flags errors boost flags)
 if(WITH_GPU)
 set(enforce_deps ${enforce_deps} external_error_proto)
 endif()
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index 6405b556217660bc0efb52eef33c83a3aceafc80..e4860444865719ede3eb7fb388fbf734ecf284e6 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -31,7 +31,7 @@ limitations under the License. */
 #endif  // _WIN32
 
 #include
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
 
 DECLARE_double(fraction_of_cpu_memory_to_use);
 DECLARE_uint64(initial_cpu_memory_in_mb);
@@ -42,7 +42,8 @@ DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
 // between host and device. Allocates too much would reduce the amount
 // of memory available to the system for paging. So, by default, we
 // should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+PADDLE_DEFINE_EXPORTED_bool(use_pinned_memory, true,
+                            "If set, allocate cpu pinned memory.");
 
 namespace paddle {
 namespace platform {
@@ -54,7 +55,9 @@ size_t CpuTotalPhysicalMemory() {
   mib[1] = HW_MEMSIZE;
   int64_t size = 0;
   size_t len = sizeof(size);
-  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
+  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) {
+    return static_cast<size_t>(size);
+  }
   return 0L;
 #elif defined(_WIN32)
   MEMORYSTATUSEX sMeminfo;
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 52be0c805bbd2a68481a6cdbfef0de42b1a946f5..c420a5a64be068fdb6cc58531bacd0cbb92928af 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -101,6 +101,7 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/type_defs.h"
 #endif
+#include "paddle/fluid/platform/flags.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index ed465c9ea2eb8ad78543d9a42733f9228fe2ba31..b97c3106439bede55a6faafee980e7226702733f 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -12,11 +12,26 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
 #endif
 
+namespace paddle {
+namespace platform {
+
+const ExportedFlagInfoMap &GetExportedFlagInfoMap() {
+  return *GetMutableExportedFlagInfoMap();
+}
+
+ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() {
+  static ExportedFlagInfoMap g_exported_flag_info_map;
+  return &g_exported_flag_info_map;
+}
+
+}  // namespace platform
+}  // namespace paddle
+
 /**
  * NOTE(paddle-dev): This file is designed to define all public FLAGS.
  */
@@ -30,8 +45,8 @@
  * instance to 2
  * Note:
  */
-DEFINE_int32(paddle_num_threads, 1,
-             "Number of threads for each paddle instance.");
+PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads, 1,
+                             "Number of threads for each paddle instance.");
 
 /**
  * Operator related FLAG
 * Name: FLAGS_check_nan_inf
 * Since Version:
 * Value Range:
 * Example:
 * Note: Used to debug. Checking whether operator produce NAN/INF or not.
 */
-DEFINE_bool(check_nan_inf, false,
-            "Checking whether operator produce NAN/INF or not. It will be "
-            "extremely slow so please use this flag wisely.");
+PADDLE_DEFINE_EXPORTED_bool(
+    check_nan_inf, false,
+    "Checking whether operator produce NAN/INF or not. It will be "
+    "extremely slow so please use this flag wisely.");
 
 // NOTE(zhiqiu): better to share the flags, otherwise we will have too many
 // flags.
@@ -58,7 +74,7 @@ DEFINE_bool(check_nan_inf, false,
 * Example:
 * Note: whether to use Tensor Core, faster but it may loss precision.
*/ -DEFINE_bool( +PADDLE_DEFINE_EXPORTED_bool( enable_cublas_tensor_op_math, false, "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " "but it may loss precision. Currently, There are two CUDA libraries that" @@ -77,30 +93,34 @@ DEFINE_bool( * cards * Note: A list of device ids separated by comma, like: 0,1,2,3 */ -DEFINE_string(selected_gpus, "", - "A list of device ids separated by comma, like: 0,1,2,3. " - "This option is useful when doing multi process training and " - "each process have only one device (GPU). If you want to use " - "all visible devices, set this to empty string. NOTE: the " - "reason of doing this is that we want to use P2P communication" - "between GPU devices, use CUDA_VISIBLE_DEVICES can only use" - "share-memory only."); +PADDLE_DEFINE_EXPORTED_string( + selected_gpus, "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (GPU). If you want to use " + "all visible devices, set this to empty string. NOTE: the " + "reason of doing this is that we want to use P2P communication" + "between GPU devices, use CUDA_VISIBLE_DEVICES can only use" + "share-memory only."); #endif #if defined(PADDLE_WITH_ASCEND_CL) -DEFINE_string(selected_npus, "", - "A list of device ids separated by comma, like: 0,1,2,3. " - "This option is useful when doing multi process training and " - "each process have only one device (NPU). If you want to use " - "all visible devices, set this to empty string."); -DEFINE_bool(hccl_check_nan, true, - "Check Nan in tensor before hccl_allreduce_sum otherwise it'll " - "core when meets Nan value"); -DEFINE_string( +PADDLE_DEFINE_EXPORTED_string( + selected_npus, "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (NPU). If you want to use " + "all visible devices, set this to empty string."); +PADDLE_DEFINE_EXPORTED_bool( + hccl_check_nan, true, + "Check Nan in tensor before hccl_allreduce_sum otherwise it'll " + "core when meets Nan value"); +PADDLE_DEFINE_EXPORTED_string( npu_config_path, "", "The absolute path of configuration json file, like: /tmp/config.json. " "If proveided, it will be passed to aclInit()."); -DEFINE_int32(min_loss_scaling, 1, "set minmum loss scaling value!"); +PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling, 1, + "set minmum loss scaling value!"); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -113,10 +133,11 @@ DEFINE_int32(min_loss_scaling, 1, "set minmum loss scaling value!"); * Note: whether to use deterministic algorithm in cudnn. * If true, it will slow down some operators such as conv and pooling. */ -DEFINE_bool(cudnn_deterministic, false, - "Whether allow using an autotuning algorithm for convolution " - "operator. The autotuning algorithm may be non-deterministic. If " - "true, the algorithm is deterministic."); +PADDLE_DEFINE_EXPORTED_bool( + cudnn_deterministic, false, + "Whether allow using an autotuning algorithm for convolution " + "operator. The autotuning algorithm may be non-deterministic. If " + "true, the algorithm is deterministic."); /** * CUDNN related FLAG @@ -130,9 +151,10 @@ DEFINE_bool(cudnn_deterministic, false, * increased. * Users need to balance memory and speed. 
*/ -DEFINE_uint64(conv_workspace_size_limit, - paddle::platform::kDefaultConvWorkspaceSizeLimitMB, - "cuDNN convolution workspace limit in MB unit."); +PADDLE_DEFINE_EXPORTED_uint64( + conv_workspace_size_limit, + paddle::platform::kDefaultConvWorkspaceSizeLimitMB, + "cuDNN convolution workspace limit in MB unit."); /** * CUDNN related FLAG @@ -148,9 +170,10 @@ DEFINE_uint64(conv_workspace_size_limit, * layer specification. Once you change the layer specifications * (such as batch size, feature map size), it will search again. */ -DEFINE_bool(cudnn_exhaustive_search, false, - "Whether enable exhaustive search for cuDNN convolution or " - "not, default is False."); +PADDLE_DEFINE_EXPORTED_bool( + cudnn_exhaustive_search, false, + "Whether enable exhaustive search for cuDNN convolution or " + "not, default is False."); /** * CUDNN related FLAG @@ -160,9 +183,9 @@ DEFINE_bool(cudnn_exhaustive_search, false, * Example: * Note: only used to predict for advanced developer */ -DEFINE_int64(cudnn_exhaustive_search_times, -1, - "Exhaustive search times for cuDNN convolution, " - "default is -1, not exhaustive search"); +PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times, -1, + "Exhaustive search times for cuDNN convolution, " + "default is -1, not exhaustive search"); /** * CUDNN related FLAG @@ -180,9 +203,10 @@ DEFINE_int64(cudnn_exhaustive_search_times, -1, * certain * input data range. */ -DEFINE_bool(cudnn_batchnorm_spatial_persistent, false, - "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn " - "batch_norm, default is False."); +PADDLE_DEFINE_EXPORTED_bool( + cudnn_batchnorm_spatial_persistent, false, + "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn " + "batch_norm, default is False."); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -197,7 +221,7 @@ DEFINE_bool(cudnn_batchnorm_spatial_persistent, false, * https://github.com/PaddlePaddle/Paddle/issues/15049 * If you want to change this default value, why?(gongwb) */ -DEFINE_bool( +PADDLE_DEFINE_EXPORTED_bool( sync_nccl_allreduce, true, "If set true, will call `cudaStreamSynchronize(nccl_stream)`" "after allreduce, this mode can get better performance in some scenarios."); @@ -215,11 +239,12 @@ DEFINE_bool( * into the queue, and then the communicator takes the gradients out * of the queue and sends them after merging. */ -DEFINE_int32(communicator_max_merge_var_num, 20, - "max var num to merge and send"); -DEFINE_bool(communicator_is_sgd_optimizer, true, - "gradient sent to the server is the sum of the gradients " - "calculated by each thread if optimizer is sgd"); +PADDLE_DEFINE_EXPORTED_int32(communicator_max_merge_var_num, 20, + "max var num to merge and send"); +PADDLE_DEFINE_EXPORTED_bool( + communicator_is_sgd_optimizer, true, + "gradient sent to the server is the sum of the gradients " + "calculated by each thread if optimizer is sgd"); /** * Distributed related FLAG * Name: FLAGS_communicator_send_queue_size @@ -233,8 +258,8 @@ DEFINE_bool(communicator_is_sgd_optimizer, true, * space. It is used to avoid training much faster than communication, * so that too many gradients are not sent out in time. */ -DEFINE_int32(communicator_send_queue_size, 20, - "queue size to recv gradient before send"); +PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size, 20, + "queue size to recv gradient before send"); #endif /** @@ -246,8 +271,9 @@ DEFINE_int32(communicator_send_queue_size, 20, * Note: Control the number of threads used for distributed modules. 
* If it is not set, it is set to a hard thread. */ -DEFINE_int32(dist_threadpool_size, 0, - "number of threads used for distributed executed."); +PADDLE_DEFINE_EXPORTED_int32( + dist_threadpool_size, 0, + "number of threads used for distributed executed."); /** * Garbage collector related FLAG @@ -272,7 +298,7 @@ static const double kDefaultEagerDeleteTensorGB = -1; static const double kDefaultEagerDeleteTensorGB = 0; #endif -DEFINE_double( +PADDLE_DEFINE_EXPORTED_double( eager_delete_tensor_gb, kDefaultEagerDeleteTensorGB, "Memory size threshold (GB) when the garbage collector clear tensors." "Disabled when this value is less than 0"); @@ -289,9 +315,10 @@ DEFINE_double( * has finished, which will make the garbage collection strategy faster. * Only works when garbage collection strategy is enabled. */ -DEFINE_bool(fast_eager_deletion_mode, true, - "Fast eager deletion mode. If enabled, memory would release " - "immediately without waiting GPU kernel ends."); +PADDLE_DEFINE_EXPORTED_bool( + fast_eager_deletion_mode, true, + "Fast eager deletion mode. If enabled, memory would release " + "immediately without waiting GPU kernel ends."); /** * Memory related FLAG @@ -311,11 +338,12 @@ DEFINE_bool(fast_eager_deletion_mode, true, * largest FLAGS_memory_fraction_of_eager_deletion ratio will be released. * The flag is only valid when running parallel data compilers. */ -DEFINE_double(memory_fraction_of_eager_deletion, 1.0, - "Fraction of eager deletion. If less than 1.0, all variables in " - "the program would be sorted according to its memory size, and " - "only the FLAGS_memory_fraction_of_eager_deletion of the largest " - "variables would be deleted."); +PADDLE_DEFINE_EXPORTED_double( + memory_fraction_of_eager_deletion, 1.0, + "Fraction of eager deletion. If less than 1.0, all variables in " + "the program would be sorted according to its memory size, and " + "only the FLAGS_memory_fraction_of_eager_deletion of the largest " + "variables would be deleted."); /** * Allocator related FLAG @@ -331,7 +359,7 @@ static constexpr char kDefaultAllocatorStrategy[] = "naive_best_fit"; #else static constexpr char kDefaultAllocatorStrategy[] = "auto_growth"; #endif -DEFINE_string( +PADDLE_DEFINE_EXPORTED_string( allocator_strategy, kDefaultAllocatorStrategy, "The allocation strategy, enum in [naive_best_fit, auto_growth]. " "naive_best_fit means the original pre-allocated allocator of Paddle. " @@ -358,9 +386,9 @@ DEFINE_string( * size as the memory block will be allocated from the CUDA pinned * request util the CPU does not have enough memory. */ -DEFINE_double(fraction_of_cpu_memory_to_use, 1, - "Default use 100% of CPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); +PADDLE_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use, 1, + "Default use 100% of CPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); /** * Memory related FLAG @@ -374,8 +402,9 @@ DEFINE_double(fraction_of_cpu_memory_to_use, 1, * FLAGS_fraction_of_cpu_memory_to_use*(total physical memory) * as memory block sizes. */ -DEFINE_uint64(initial_cpu_memory_in_mb, 500ul, - "Initial CPU memory for PaddlePaddle, in MD unit."); +PADDLE_DEFINE_EXPORTED_uint64( + initial_cpu_memory_in_mb, 500ul, + "Initial CPU memory for PaddlePaddle, in MD unit."); /** * Memory related FLAG @@ -390,7 +419,7 @@ DEFINE_uint64(initial_cpu_memory_in_mb, 500ul, * size as the memory block will be allocated from the CPU * request util the CPU does not have enough memory. 
*/ -DEFINE_double( +PADDLE_DEFINE_EXPORTED_double( fraction_of_cuda_pinned_memory_to_use, 0.5, "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," "reserve the rest for page tables, etc"); @@ -425,12 +454,13 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f; // which may lead to insufficient memory left for paddle constexpr static float fraction_of_gpu_memory_to_use = 0.5f; #endif -DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, - "Allocate a trunk of gpu memory that is this fraction of the " - "total gpu memory size. Future memory usage will be allocated " - "from the trunk. If the trunk doesn't have enough gpu memory, " - "additional trunks of the same size will be requested from gpu " - "until the gpu has no memory left for another trunk."); +PADDLE_DEFINE_EXPORTED_double( + fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, + "Allocate a trunk of gpu memory that is this fraction of the " + "total gpu memory size. Future memory usage will be allocated " + "from the trunk. If the trunk doesn't have enough gpu memory, " + "additional trunks of the same size will be requested from gpu " + "until the gpu has no memory left for another trunk."); /** * Memory related FLAG @@ -444,7 +474,7 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, * FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until * the GPU has no remaining memory. */ -DEFINE_uint64( +PADDLE_DEFINE_EXPORTED_uint64( initial_gpu_memory_in_mb, 0ul, "Allocate a trunk of gpu memory whose byte size is specified by " "the flag. Future memory usage will be allocated from the " @@ -466,18 +496,20 @@ DEFINE_uint64( * Note: If the allocated GPU memory blocks are exhausted, * additional GPU memory blocks are reallocated */ -DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul, - "If this flag is set, Paddle will reallocate the gpu memory with " - "size specified by this flag. Else Paddle will reallocate by " - "FLAGS_fraction_of_gpu_memory_to_use"); - -DEFINE_uint64(gpu_memory_limit_mb, 0UL, - "The maximum gpu memory limit that the process can allocate. " - "If it is equal to 0, there would be no limit and all gpu memory " - "would be available to the process. If it is larger than 0, " - "the process would raise out of memory error if the allocated " - "memory exceeds the limit even though there is available " - "memory on the gpu card. The unit is MB and default value is 0."); +PADDLE_DEFINE_EXPORTED_uint64( + reallocate_gpu_memory_in_mb, 0ul, + "If this flag is set, Paddle will reallocate the gpu memory with " + "size specified by this flag. Else Paddle will reallocate by " + "FLAGS_fraction_of_gpu_memory_to_use"); + +PADDLE_DEFINE_EXPORTED_uint64( + gpu_memory_limit_mb, 0UL, + "The maximum gpu memory limit that the process can allocate. " + "If it is equal to 0, there would be no limit and all gpu memory " + "would be available to the process. If it is larger than 0, " + "the process would raise out of memory error if the allocated " + "memory exceeds the limit even though there is available " + "memory on the gpu card. The unit is MB and default value is 0."); #endif @@ -489,11 +521,12 @@ DEFINE_uint64(gpu_memory_limit_mb, 0UL, * Example: * Note: */ -DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes - "The memory up limit of sub-scopes of local execution scope for " - "each CUDAPlace. If you don't need to limit the memory, " - "you should set FLAGS_local_exe_sub_scope_limit=-1. 
" - "The default value is 256 MBytes."); +PADDLE_DEFINE_EXPORTED_double( + local_exe_sub_scope_limit, 256.0, // MBytes + "The memory up limit of sub-scopes of local execution scope for " + "each CUDAPlace. If you don't need to limit the memory, " + "you should set FLAGS_local_exe_sub_scope_limit=-1. " + "The default value is 256 MBytes."); /** * MKLDNN related FLAG @@ -503,7 +536,7 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes * Example: * Note: */ -DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); +PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run"); /** * Debug related FLAG @@ -525,7 +558,7 @@ static const int32_t kDefaultCallStackLevel = 2; static const int32_t kDefaultCallStackLevel = 1; #endif -DEFINE_int32( +PADDLE_DEFINE_EXPORTED_int32( call_stack_level, kDefaultCallStackLevel, "Determine the call stack to print when error or exeception happens." // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0 @@ -545,9 +578,9 @@ DEFINE_int32( * Note: If True, gradients are summed by the reverse order of * the forward execution sequence. */ -DEFINE_bool(sort_sum_gradient, false, - "Sum gradients by the reverse order of " - "the forward execution sequence."); +PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient, false, + "Sum gradients by the reverse order of " + "the forward execution sequence."); /** * Performance related FLAG @@ -557,7 +590,7 @@ DEFINE_bool(sort_sum_gradient, false, * Example: * Note: The maximum number of inplace grad_add. */ -DEFINE_int32( +PADDLE_DEFINE_EXPORTED_int32( max_inplace_grad_add, 0, "The maximum number of inplace grad_add. When doing " "gradient accumulation, if the number of gradients need to that " @@ -572,8 +605,8 @@ DEFINE_int32( * Example: * Note: Holds list of operation types with OneDNN kernels to be enabled. */ -DEFINE_string(tracer_mkldnn_ops_on, "", - "List of OneDNN operation types to be turned on"); +PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on, "", + "List of OneDNN operation types to be turned on"); /** * Debug related FLAG @@ -583,8 +616,9 @@ DEFINE_string(tracer_mkldnn_ops_on, "", * Example: * Note: Holds list of operation types with OneDNN kernels to be disabled. */ -DEFINE_string(tracer_mkldnn_ops_off, "", - "List of OneDNN operation types to be turned off"); +PADDLE_DEFINE_EXPORTED_string( + tracer_mkldnn_ops_off, "", + "List of OneDNN operation types to be turned off"); /** * Debug related FLAG @@ -595,8 +629,9 @@ DEFINE_string(tracer_mkldnn_ops_off, "", * Note: Check kernel launch status after every kernel compute. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DEFINE_bool(check_kernel_launch, false, - "Check kernel launch status after every kernel compute"); +PADDLE_DEFINE_EXPORTED_bool( + check_kernel_launch, false, + "Check kernel launch status after every kernel compute"); #endif /** @@ -608,7 +643,8 @@ DEFINE_bool(check_kernel_launch, false, * Note: Disable cudnn in conv2d. 
*/ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); +PADDLE_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false, + "Disable cudnn in conv2d"); #endif /** @@ -621,8 +657,8 @@ DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \ defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP) -DEFINE_int32(get_host_by_name_time, 120, - "The maximum time for get host by name time"); +PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time, 120, + "The maximum time for get host by name time"); #endif /** @@ -634,6 +670,6 @@ DEFINE_int32(get_host_by_name_time, 120, * program when using Fleet APIs. * Note: Apply IR pass to program. Be only useful when using Fleet APIs. */ -DEFINE_bool( +PADDLE_DEFINE_EXPORTED_bool( apply_pass_to_program, false, "It controls whether to apply IR pass to program when using Fleet APIs"); diff --git a/paddle/fluid/platform/flags.h b/paddle/fluid/platform/flags.h new file mode 100644 index 0000000000000000000000000000000000000000..b9d78c2e9dc3951d3d79bf60496aa04c15afbf5d --- /dev/null +++ b/paddle/fluid/platform/flags.h @@ -0,0 +1,98 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <string>
+#include <type_traits>
+#include "gflags/gflags.h"
+#include "paddle/fluid/platform/macros.h"
+#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace platform {
+
+struct FlagInfo {
+  using ValueType =
+      boost::variant<bool, int32_t, int64_t, uint64_t, double, std::string>;
+  std::string name;
+  mutable void *value_ptr;
+  ValueType default_value;
+  std::string doc;
+  bool is_writable;
+};
+
+using ExportedFlagInfoMap = std::map<std::string, FlagInfo>;
+const ExportedFlagInfoMap &GetExportedFlagInfoMap();
+ExportedFlagInfoMap *GetMutableExportedFlagInfoMap();
+
+#define __PADDLE_DEFINE_EXPORTED_FLAG(__name, __is_writable, __cpp_type,      \
+                                      __gflag_type, __default_value, __doc)   \
+  DEFINE_##__gflag_type(__name, __default_value, __doc);                      \
+  struct __PaddleRegisterFlag_##__name {                                      \
+    __PaddleRegisterFlag_##__name() {                                         \
+      using FlagDeclaredType =                                                \
+          typename std::remove_reference<decltype(FLAGS_##__name)>::type;     \
+      static_assert(std::is_same<FlagDeclaredType, ::std::string>::value ||   \
+                        std::is_arithmetic<FlagDeclaredType>::value,          \
+                    "FLAGS should be std::string or arithmetic type");        \
+      auto *instance = ::paddle::platform::GetMutableExportedFlagInfoMap();   \
+      auto &info = (*instance)[#__name];                                      \
+      info.name = #__name;                                                    \
+      info.value_ptr = &(FLAGS_##__name);                                     \
+      info.default_value = static_cast<__cpp_type>(__default_value);          \
+      info.doc = __doc;                                                       \
+      info.is_writable = __is_writable;                                       \
+    }                                                                         \
+    int Touch() const { return 0; }                                           \
+  };                                                                          \
+  static __PaddleRegisterFlag_##__name __PaddleRegisterFlag_instance##__name; \
+  int TouchPaddleFlagRegister_##__name() {                                    \
+    return __PaddleRegisterFlag_instance##__name.Touch();                     \
+  }                                                                           \
+  static_assert(std::is_same<__PaddleRegisterFlag_##__name,                   \
+                             ::__PaddleRegisterFlag_##__name>::value,         \
+                "FLAGS should define in global namespace")
+
+#define PADDLE_FORCE_LINK_FLAG(__name)           \
+  extern int TouchPaddleFlagRegister_##__name(); \
+  UNUSED static int __paddle_use_flag_##__name = \
+      TouchPaddleFlagRegister_##__name()
+
+#define PADDLE_DEFINE_EXPORTED_bool(name, default_value, doc) \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, bool, bool, default_value, doc)
+#define PADDLE_DEFINE_EXPORTED_READONLY_bool(name, default_value, doc) \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, false, bool, bool, default_value, doc)
+
+#define PADDLE_DEFINE_EXPORTED_int32(name, default_value, doc) \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, int32_t, int32, default_value, doc)
+
+#define PADDLE_DEFINE_EXPORTED_int64(name, default_value, doc) \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, int64_t, int64, default_value, doc)
+
+#define PADDLE_DEFINE_EXPORTED_uint64(name, default_value, doc)              \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, uint64_t, uint64, default_value, \
+                                doc)
+
+#define PADDLE_DEFINE_EXPORTED_double(name, default_value, doc) \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, double, double, default_value, doc)
+
+#define PADDLE_DEFINE_EXPORTED_string(name, default_value, doc)    \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, ::std::string, string, \
+                                default_value, doc)
+
+}  // namespace platform
+}  // namespace paddle
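The heart of the patch is the `__PADDLE_DEFINE_EXPORTED_FLAG` macro in this new header: it still delegates to the stock gflags `DEFINE_*` macro, but it also plants a registrar struct whose constructor runs during static initialization and records the flag in the process-wide `ExportedFlagInfoMap`. Because `GetMutableExportedFlagInfoMap()` hands out a function-local static (see the flags.cc hunk above), the map is constructed on first use and is safe to touch from initializers in any translation unit. Roughly, with the `static_assert`s elided, `PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run")` expands to:

```cpp
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");  // plain gflags definition

struct __PaddleRegisterFlag_use_mkldnn {
  __PaddleRegisterFlag_use_mkldnn() {
    auto *map = ::paddle::platform::GetMutableExportedFlagInfoMap();
    auto &info = (*map)["use_mkldnn"];
    info.name = "use_mkldnn";
    info.value_ptr = &FLAGS_use_mkldnn;              // pybind reads/writes through this
    info.default_value = static_cast<bool>(false);   // typed via the boost::variant
    info.doc = "Use MKLDNN to run";
    info.is_writable = true;
  }
  int Touch() const { return 0; }
};
static __PaddleRegisterFlag_use_mkldnn __PaddleRegisterFlag_instanceuse_mkldnn;
int TouchPaddleFlagRegister_use_mkldnn() {
  return __PaddleRegisterFlag_instanceuse_mkldnn.Touch();
}
```

The companion `PADDLE_FORCE_LINK_FLAG(use_mkldnn)` emits a reference to `TouchPaddleFlagRegister_use_mkldnn()` from whatever file invokes it, which keeps the linker from discarding the defining object file, and with it the registrar, when nothing else pulls it in.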
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index afae046531143305679f73c2892c55cb89cfc699..290b3353ae54ccbad7ff549a318edb83b75fe7b8 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -43,9 +43,10 @@ limitations under the License. */
 #endif
 
 DECLARE_int32(paddle_num_threads);
-DEFINE_int32(multiple_of_cupti_buffer_size, 1,
-             "Multiple of the CUPTI device buffer size. If the timestamps have "
-             "been dropped when you are profiling, try increasing this value.");
+PADDLE_DEFINE_EXPORTED_int32(
+    multiple_of_cupti_buffer_size, 1,
+    "Multiple of the CUPTI device buffer size. If the timestamps have "
+    "been dropped when you are profiling, try increasing this value.");
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index 14c772d88897f4fa28e7c37a9452b78b637419a2..415babc9cb85e6c93c17dd9fdbf7ef61fc424d4c 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -14,11 +14,12 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/place.h"
 
-DEFINE_bool(benchmark, false,
-            "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs."
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
+PADDLE_DEFINE_EXPORTED_bool(
+    benchmark, false,
+    "Doing memory benchmark. It will make deleting scope synchronized, "
+    "and add some memory usage logs. "
+    "Default cuda is asynchronous device, set to True will "
+    "force op run in synchronous mode.");
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 9c33233e1f79ac799d5acc2a711119d279a9613d..2c8f918414de4687b0be5354fc634587ec22f169 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -24,7 +24,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/nvtx.h"
 #endif
 
-DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
+PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false,
+                            "Enable rpc profiler or not.");
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/xpu/xpu_info.cc b/paddle/fluid/platform/xpu/xpu_info.cc
index 6b8ab16b47d68c3d1cd8fb961aaf3bc6caa5b9b8..3f45286d8f20209a28a12e98b0643a11d72afca6 100644
--- a/paddle/fluid/platform/xpu/xpu_info.cc
+++ b/paddle/fluid/platform/xpu/xpu_info.cc
@@ -18,14 +18,15 @@ limitations under the License. */
 #include "paddle/fluid/platform/xpu/xpu_header.h"
 #include "paddle/fluid/string/split.h"
 
-DEFINE_string(selected_xpus, "",
-              "A list of device ids separated by comma, like: 0,1,2,3. "
-              "This option is useful when doing multi process training and "
-              "each process have only one device (XPU). If you want to use "
-              "all visible devices, set this to empty string. NOTE: the "
-              "reason of doing this is that we want to use P2P communication"
-              "between XPU devices, use XPU_VISIBLE_DEVICES can only use"
-              "share-memory only.");
+PADDLE_DEFINE_EXPORTED_string(
+    selected_xpus, "",
+    "A list of device ids separated by comma, like: 0,1,2,3. "
+    "This option is useful when doing multi process training and "
+    "each process have only one device (XPU). If you want to use "
+    "all visible devices, set this to empty string. NOTE: the "
+    "reason of doing this is that we want to use P2P communication "
+    "between XPU devices, use XPU_VISIBLE_DEVICES can only use "
+    "share-memory only.");
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index 59c7628447479da5a49ae22dd2daf647896a6544..b01e40750f3358eae9e8c4c38332d5f0d7f0dce2 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -29,82 +29,38 @@
 #include "paddle/fluid/platform/macros.h"
 #include "pybind11/stl.h"
 
-// data processing
-DECLARE_bool(use_mkldnn);
-DECLARE_string(tracer_mkldnn_ops_on);
-DECLARE_string(tracer_mkldnn_ops_off);
+// FIXME(zengjinle): these 2 flags may be removed by the linker when compiling
+// CPU-only Paddle. It is because they are only used in
+// AutoGrowthBestFitAllocator, but AutoGrowthBestFitAllocator is not used
+// (in the translation unit level) when compiling CPU-only Paddle. I do not
+// want to add PADDLE_FORCE_LINK_FLAG, but I have not found any other methods
+// to solve this problem.
+PADDLE_FORCE_LINK_FLAG(free_idle_chunk);
+PADDLE_FORCE_LINK_FLAG(free_when_no_cache_hit);
+
 // debug
-DECLARE_bool(check_nan_inf);
 DECLARE_bool(cpu_deterministic);
-DECLARE_bool(enable_rpc_profiler);
-DECLARE_int32(multiple_of_cupti_buffer_size);
-DECLARE_bool(reader_queue_speed_test_mode);
-DECLARE_int32(call_stack_level);
-DECLARE_bool(sort_sum_gradient);
-DECLARE_bool(check_kernel_launch);
-// device management
-DECLARE_int32(paddle_num_threads);
+
+// IR
+DECLARE_bool(convert_all_blocks);
+
 // executor
 DECLARE_bool(enable_parallel_graph);
 DECLARE_string(pe_profile_fname);
 DECLARE_string(print_sub_graph_dir);
-DECLARE_bool(use_ngraph);
+DECLARE_bool(new_executor_use_inplace);
+
 // memory management
-DECLARE_string(allocator_strategy);
-DECLARE_double(eager_delete_tensor_gb);
-DECLARE_double(fraction_of_cpu_memory_to_use);
-DECLARE_bool(free_idle_chunk);
-DECLARE_bool(free_when_no_cache_hit);
+DECLARE_bool(eager_delete_scope);
 DECLARE_int32(fuse_parameter_groups_size);
 DECLARE_double(fuse_parameter_memory_size);
-DECLARE_bool(init_allocated_mem);
-DECLARE_uint64(initial_cpu_memory_in_mb);
-DECLARE_double(memory_fraction_of_eager_deletion);
-DECLARE_bool(use_pinned_memory);
-DECLARE_bool(use_system_allocator);
+
 // others
-DECLARE_bool(benchmark);
 DECLARE_int32(inner_op_parallelism);
-DECLARE_int32(max_inplace_grad_add);
-DECLARE_string(tracer_profile_fname);
-DECLARE_bool(apply_pass_to_program);
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-// cudnn
-DECLARE_uint64(conv_workspace_size_limit);
-DECLARE_bool(cudnn_batchnorm_spatial_persistent);
-DECLARE_bool(cudnn_deterministic);
-DECLARE_bool(cudnn_exhaustive_search);
-DECLARE_bool(conv2d_disable_cudnn);
-// data processing
-DECLARE_bool(enable_cublas_tensor_op_math);
-// device management
-DECLARE_string(selected_gpus);
-// memory management
-DECLARE_bool(eager_delete_scope);
-DECLARE_bool(fast_eager_deletion_mode);
-DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_uint64(gpu_memory_limit_mb);
-DECLARE_uint64(initial_gpu_memory_in_mb);
-DECLARE_uint64(reallocate_gpu_memory_in_mb);
-// others
-DECLARE_bool(sync_nccl_allreduce);
-#endif
-
-#ifdef PADDLE_WITH_XPU
-// device management
-DECLARE_string(selected_xpus);
-#endif
-
-#ifdef PADDLE_WITH_ASCEND_CL
-// device management
-DECLARE_string(selected_npus);
-// set minmum loss scaling value
-DECLARE_int32(min_loss_scaling);
-#endif
+DECLARE_bool(enable_unused_var_check);
+// NOTE: where are these 2 flags from?
 #ifdef PADDLE_WITH_DISTRIBUTE
-DECLARE_int32(rpc_send_thread_num);
 DECLARE_int32(rpc_get_thread_num);
 DECLARE_int32(rpc_prefetch_thread_num);
 #endif
@@ -181,7 +137,6 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry {
     PADDLE_ENFORCE_NOT_NULL(setter,
                             platform::errors::InvalidArgument(
                                 "Setter of %s should not be null", name));
-
     var_infos_.insert({name, VarInfo(is_public, getter, setter)});
   }
@@ -243,81 +198,6 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry {
 
 GlobalVarGetterSetterRegistry GlobalVarGetterSetterRegistry::instance_;
 
-class GlobalVarGetterSetterRegistryHelper {
- public:
-  GlobalVarGetterSetterRegistryHelper(bool is_public, bool is_writable,
-                                      const std::string &var_names)
-      : is_public_(is_public),
-        is_writable_(is_writable),
-        var_names_(SplitVarNames(var_names)) {}
-
-  template <typename... Args>
-  void Register(Args &&... args) const {
-    Impl<0, sizeof...(args) == 1, Args...>::Register(
-        is_public_, is_writable_, var_names_, std::forward<Args>(args)...);
-  }
-
- private:
-  static std::vector<std::string> SplitVarNames(const std::string &names) {
-    auto valid_char = [](char ch) { return !std::isspace(ch) && ch != ','; };
-
-    std::vector<std::string> ret;
-    size_t i = 0, j = 0, n = names.size();
-    while (i < n) {
-      for (; i < n && !valid_char(names[i]); ++i) {
-      }
-      for (j = i + 1; j < n && valid_char(names[j]); ++j) {
-      }
-
-      if (i < n && j <= n) {
-        auto substring = names.substr(i, j - i);
-        VLOG(10) << "Get substring: \"" << substring << "\"";
-        ret.emplace_back(substring);
-      }
-      i = j + 1;
-    }
-    return ret;
-  }
-
- private:
-  template <size_t kIdx, bool kIsStop, typename T, typename... Args>
-  struct Impl {
-    static void Register(bool is_public, bool is_writable,
-                         const std::vector<std::string> &var_names, T &&var,
-                         Args &&... args) {
-      PADDLE_ENFORCE_EQ(kIdx + 1 + sizeof...(args), var_names.size(),
-                        platform::errors::InvalidArgument(
-                            "Argument number not match name number"));
-      Impl<kIdx, true, T>::Register(is_public, is_writable, var_names, var);
-      Impl<kIdx + 1, sizeof...(Args) == 1, Args...>::Register(
-          is_public, is_writable, var_names, std::forward<Args>(args)...);
-    }
-  };
-
-  template <size_t kIdx, typename T>
-  struct Impl<kIdx, true, T> {
-    static void Register(bool is_public, bool is_writable,
-                         const std::vector<std::string> &var_names, T &&var) {
-      auto *instance = GlobalVarGetterSetterRegistry::MutableInstance();
-      if (is_writable) {
-        instance->Register(
-            var_names[kIdx], is_public,
-            GlobalVarGetterSetterRegistry::CreateGetter(std::forward<T>(var)),
-            GlobalVarGetterSetterRegistry::CreateSetter(&var));
-      } else {
-        instance->Register(
-            var_names[kIdx], is_public,
-            GlobalVarGetterSetterRegistry::CreateGetter(std::forward<T>(var)));
-      }
-    }
-  };
-
- private:
-  const bool is_public_;
-  const bool is_writable_;
-  const std::vector<std::string> var_names_;
-};
-
 static void RegisterGlobalVarGetterSetter();
 
 void BindGlobalValueGetterSetter(pybind11::module *module) {
@@ -338,65 +218,69 @@ void BindGlobalValueGetterSetter(pybind11::module *module) {
 }
 
 /* Public vars are designed to be writable. */
-#define REGISTER_PUBLIC_GLOBAL_VAR(...)                                        \
-  do {                                                                         \
-    GlobalVarGetterSetterRegistryHelper(/*is_public=*/true,                    \
-                                        /*is_writable=*/true, "" #__VA_ARGS__) \
-        .Register(__VA_ARGS__);                                                \
+#define REGISTER_PUBLIC_GLOBAL_VAR(var)                                    \
+  do {                                                                     \
+    auto *instance = GlobalVarGetterSetterRegistry::MutableInstance();     \
+    instance->Register(#var, /*is_public=*/true,                           \
+                       GlobalVarGetterSetterRegistry::CreateGetter(var),   \
+                       GlobalVarGetterSetterRegistry::CreateSetter(&var)); \
   } while (0)
 
-#define REGISTER_PRIVATE_GLOBAL_VAR(is_writable, ...)                      \
-  do {                                                                     \
-    GlobalVarGetterSetterRegistryHelper(/*is_public=*/false, is_writable,  \
-                                        "" #__VA_ARGS__)                   \
-        .Register(__VA_ARGS__);                                            \
-  } while (0)
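The deleted `GlobalVarGetterSetterRegistryHelper` had to recover flag names by splitting the stringified `__VA_ARGS__` and then walk the argument pack with a recursive `Impl` template. The replacement below needs none of that: each exported flag arrives from the `ExportedFlagInfoMap` carrying its name, a `void *` to its storage, and a `boost::variant` default whose currently held type tells the visitor which concrete type to cast the pointer back to. A minimal, self-contained model of that variant dispatch (illustrative only, not Paddle code):

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include "boost/variant.hpp"

// boost::apply_visitor calls the operator() instantiation matching the type
// currently held by the variant; this is how the registry recovers the
// concrete C++ type hidden behind each flag's void* storage.
struct PrintVisitor : public boost::static_visitor<void> {
  template <typename T>
  void operator()(const T &v) const {
    std::cout << v << "\n";
  }
};

int main() {
  boost::variant<bool, int64_t, std::string> v = std::string("auto_growth");
  PrintVisitor visitor;
  boost::apply_visitor(visitor, v);  // prints "auto_growth"
  v = static_cast<int64_t>(42);
  boost::apply_visitor(visitor, v);  // prints "42"
}
```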
+struct RegisterGetterSetterVisitor : public boost::static_visitor<void> {
+  RegisterGetterSetterVisitor(const std::string &name, bool is_writable,
+                              void *value_ptr)
+      : name_(name), is_writable_(is_writable), value_ptr_(value_ptr) {}
 
-static void RegisterGlobalVarGetterSetter() {
-  REGISTER_PRIVATE_GLOBAL_VAR(/*is_writable=*/false, FLAGS_free_idle_chunk,
-                              FLAGS_free_when_no_cache_hit);
-
-  REGISTER_PUBLIC_GLOBAL_VAR(
-      FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
-      FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
-      FLAGS_call_stack_level, FLAGS_sort_sum_gradient, FLAGS_cpu_deterministic,
-      FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
-      FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
-      FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
-      FLAGS_fuse_parameter_groups_size, FLAGS_fuse_parameter_memory_size,
-      FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
-      FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
-      FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
-      FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add,
-      FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off,
-      FLAGS_apply_pass_to_program);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  REGISTER_PUBLIC_GLOBAL_VAR(
-      FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic,
-      FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent,
-      FLAGS_cudnn_exhaustive_search, FLAGS_eager_delete_scope,
-      FLAGS_fast_eager_deletion_mode,
-      FLAGS_fraction_of_cuda_pinned_memory_to_use,
-      FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb,
-      FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math,
-      FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce,
-      FLAGS_conv2d_disable_cudnn, FLAGS_check_kernel_launch);
-#endif
-#ifdef PADDLE_WITH_XPU
-  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus);
-#endif
+  template <typename T>
+  void operator()(const T &) const {
+    auto &value = *static_cast<T *>(value_ptr_);
+    auto *instance = GlobalVarGetterSetterRegistry::MutableInstance();
+    bool is_public = is_writable_;  // currently, all writable vars are public
+    if (is_writable_) {
+      instance->Register(name_, is_public,
+                         GlobalVarGetterSetterRegistry::CreateGetter(value),
+                         GlobalVarGetterSetterRegistry::CreateSetter(&value));
+    } else {
+      instance->Register(name_, is_public,
+                         GlobalVarGetterSetterRegistry::CreateGetter(value));
+    }
+  }
 
-#ifdef PADDLE_WITH_ASCEND_CL
-  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus);
-  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_min_loss_scaling);
-#endif
+ private:
+  std::string name_;
+  bool is_writable_;
+  void *value_ptr_;
+};
+
+static void RegisterGlobalVarGetterSetter() {
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_cpu_deterministic);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_convert_all_blocks);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_enable_parallel_graph);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_pe_profile_fname);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_print_sub_graph_dir);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_new_executor_use_inplace);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_eager_delete_scope);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_fuse_parameter_groups_size);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_fuse_parameter_memory_size);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_inner_op_parallelism);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_enable_unused_var_check);
 #ifdef PADDLE_WITH_DITRIBUTE
-  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num,
-                             FLAGS_rpc_get_thread_num,
-                             FLAGS_rpc_prefetch_thread_num);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_get_thread_num);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_prefetch_thread_num);
 #endif
+
+  const auto &flag_map = platform::GetExportedFlagInfoMap();
+  for (const auto &pair : flag_map) {
+    const std::string &name = pair.second.name;
+    bool is_writable = pair.second.is_writable;
+    void *value_ptr = pair.second.value_ptr;
+    const auto &default_value = pair.second.default_value;
+    RegisterGetterSetterVisitor visitor("FLAGS_" + name, is_writable,
+                                        value_ptr);
+    boost::apply_visitor(visitor, default_value);
+  }
 }
+
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc
index 9ed1ed30324b28c740039d6e48a28209f1b90dfa..60b99a964a57fe90454ce4618ee2799aedd697ec 100644
--- a/paddle/fluid/pybind/reader_py.cc
+++ b/paddle/fluid/pybind/reader_py.cc
@@ -32,9 +32,10 @@
 #include "paddle/fluid/platform/place.h"
 #include "pybind11/stl.h"
 
-DEFINE_bool(reader_queue_speed_test_mode, false,
-            "If set true, the queue.pop will only get data from queue but not "
-            "remove the data from queue for speed testing");
+PADDLE_DEFINE_EXPORTED_bool(
+    reader_queue_speed_test_mode, false,
+    "If set true, the queue.pop will only get data from queue but not "
+    "remove the data from queue for speed testing");
 
 namespace paddle {
 namespace pybind {
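The net effect: a flag defined once with `PADDLE_DEFINE_EXPORTED_*` is now visible to the pybind getter/setter layer automatically, which is what allowed most of the hand-maintained `DECLARE_*` lists to be deleted; flags defined through the `READONLY` variant are registered with a getter but no setter. A hypothetical diagnostic, not part of this patch, sketches how any C++ caller could enumerate what was registered:

```cpp
#include <iostream>
#include "paddle/fluid/platform/flags.h"

// Hypothetical helper: list every flag registered through the
// PADDLE_DEFINE_EXPORTED_* macros in this process, with its doc string.
void DumpExportedFlags() {
  for (const auto &pair : paddle::platform::GetExportedFlagInfoMap()) {
    const auto &info = pair.second;
    std::cout << "FLAGS_" << info.name
              << (info.is_writable ? "" : "  [read-only]") << " : " << info.doc
              << "\n";
  }
}
```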