diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index e2a0097cb1c20ef6f7987a162cbf233628fd03d6..46fc7a5496555f7fa0642d9910721b711c81d3b8 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -20,13 +20,9 @@
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"
 
-// asynchronous nccl allreduce or synchronous issue:
-// https://github.com/PaddlePaddle/Paddle/issues/15049
-// If you want to change this default value, why?(gongwb)
-DEFINE_bool(
-    sync_nccl_allreduce, true,
-    "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
-    "after allreduce, this mode can get better performance in some scenarios.");
+#ifdef PADDLE_WITH_CUDA
+DECLARE_bool(sync_nccl_allreduce);
+#endif
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc
index abab2fdb8773e27c725890ce8ca7fcb321019c6c..f100dc6349f58260ed6c501da6148efe50437fee 100644
--- a/paddle/fluid/framework/garbage_collector.cc
+++ b/paddle/fluid/framework/garbage_collector.cc
@@ -25,31 +25,13 @@
 #include "glog/logging.h"
 #include "paddle/fluid/framework/garbage_collector.h"
 
+DECLARE_double(eager_delete_tensor_gb);
+DECLARE_double(memory_fraction_of_eager_deletion);
+DECLARE_bool(fast_eager_deletion_mode);
+
 namespace paddle {
 namespace framework {
 
-// Disable gc by default when inference library is built
-#ifdef PADDLE_ON_INFERENCE
-static const double kDefaultEagerDeleteTensorGB = -1;
-#else
-static const double kDefaultEagerDeleteTensorGB = 0;
-#endif
-
-DEFINE_double(
-    eager_delete_tensor_gb, kDefaultEagerDeleteTensorGB,
-    "Memory size threshold (GB) when the garbage collector clear tensors."
-    "Disabled when this value is less than 0");
-
-DEFINE_bool(fast_eager_deletion_mode, true,
-            "Fast eager deletion mode. If enabled, memory would release "
-            "immediately without waiting GPU kernel ends.");
-
-DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
-              "Fraction of eager deletion. If less than 1.0, all variables in "
-              "the program would be sorted according to its memory size, and "
-              "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
-              "variables would be deleted.");
-
 GarbageCollector::GarbageCollector(const platform::Place &place,
                                    size_t max_memory_size)
     : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 3a06b5222297d033c71c6d4fe9155699c015f3ed..02a9eb20264a5de028f29c8a86f459ae5461ba9e 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -32,9 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 
 DECLARE_bool(benchmark);
-DEFINE_bool(check_nan_inf, false,
-            "Checking whether operator produce NAN/INF or not. It will be "
-            "extremely slow so please use this flag wisely.");
+DECLARE_bool(check_nan_inf);
 DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
 DEFINE_bool(fast_check_nan_inf, false,
             "Fast checking NAN/INF after each operation. It will be a little"
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
index d34f826c1abb99198fd4dbe9537495edff7b63af..7f7f426d0e28224932fc96a3fefa0df1279e6475 100644
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -13,6 +13,8 @@
    limitations under the License. */
 
 #include "paddle/fluid/framework/threadpool.h"
+#include <memory>
+#include <utility>
 
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -20,8 +22,7 @@
 DEFINE_int32(io_threadpool_size, 100,
              "number of threads used for doing IO, default 100");
 
-DEFINE_int32(dist_threadpool_size, 0,
-             "number of threads used for distributed executed.");
+DECLARE_int32(dist_threadpool_size);
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
index 4e45cc4d13b0d5abcb10bd9e34993bc0b8c17485..19b1380612b6de2387771e633ee0604bdc30046f 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -17,11 +17,7 @@
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 
-DEFINE_string(allocator_strategy, "naive_best_fit",
-              "The allocation strategy. naive_best_fit means the original best "
-              "fit allocator of Fluid. "
-              "auto_growth means the experimental auto-growth allocator. "
-              "Enum in [naive_best_fit, auto_growth].");
+DECLARE_string(allocator_strategy);
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu
index a78a6726bc5a59cc84494656dc53e31e40eb82b3..9b50a4a61a87a61088d8c34ebcc06a2a281a01c0 100644
--- a/paddle/fluid/operators/batch_norm_op.cu
+++ b/paddle/fluid/operators/batch_norm_op.cu
@@ -23,15 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
 
-// CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in
-// some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT
-// and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The
-// reason we set it to false by default is that this mode may use scaled
-// atomic integer reduction that may cause a numerical overflow for certain
-// input data range.
-DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
-            "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
-            "batch_norm, default is False.");
+DECLARE_bool(cudnn_batchnorm_spatial_persistent);
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index ec0278e5a230ec9c5cbb38855d0c2a07912f332c..7aa1419126d31ec89fc46bbaa3b23b7516f3ab27 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -24,16 +24,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler.h"
 
-DEFINE_bool(cudnn_deterministic, false,
-            "Whether allow using an autotuning algorithm for convolution "
-            "operator. The autotuning algorithm may be non-deterministic. If "
-            "true, the algorithm is deterministic.");
-DEFINE_uint64(conv_workspace_size_limit,
-              paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
-              "cuDNN convolution workspace limit in MB unit.");
-DEFINE_bool(cudnn_exhaustive_search, false,
-            "Whether enable exhaustive search for cuDNN convolution or "
-            "not, default is False.");
+DECLARE_bool(cudnn_deterministic);
+DECLARE_uint64(conv_workspace_size_limit);
+DECLARE_bool(cudnn_exhaustive_search);
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc
index d1fa7b9d5bd81b164e51cb7a5353ed1d06f221b1..9b9b3e1d8bd6e3196d34e2b0efb2e1433f3a6016 100644
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
@@ -16,9 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
-DEFINE_int64(cudnn_exhaustive_search_times, -1,
-             "Exhaustive search times for cuDNN convolution, "
-             "default is -1, not exhaustive search");
+DECLARE_int64(cudnn_exhaustive_search_times);
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc
index af277d69c18670e31cb8fd9991b33b915261778e..a7a761fa39a7390f78b5b9b2209d12ea5ac24c30 100644
--- a/paddle/fluid/operators/distributed/communicator.cc
+++ b/paddle/fluid/operators/distributed/communicator.cc
@@ -26,18 +26,17 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/parameter_recv.h"
 #include "paddle/fluid/operators/distributed/parameter_send.h"
 
+DECLARE_int32(communicator_max_merge_var_num);
+DECLARE_int32(communicator_send_queue_size);
+
 DEFINE_bool(communicator_independent_recv_thread, true,
             "use an independent to recv vars from parameter server");
-DEFINE_int32(communicator_send_queue_size, 20,
-             "queue size to recv gradient before send");
 DEFINE_int32(communicator_min_send_grad_num_before_recv, 20,
              "max grad num to send before recv parameters");
 DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv");
 DEFINE_int32(communicator_send_wait_times, 5,
              "times that send thread will wait if merge num does not reach "
              "max_merge_var_num");
-DEFINE_int32(communicator_max_merge_var_num, 20,
-             "max var num to merge and send");
 DEFINE_bool(communicator_fake_rpc, false,
             "fake mode does not really send any thing");
 DEFINE_bool(communicator_merge_sparse_grad, true,
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 575eed355df3e07e2f13a3a3656a325caff0f9ff..69435793a75a203533806a567c718e0af4d2e20c 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -20,10 +20,12 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif(NOT WIN32)
 
+cc_library(flags SRCS flags.cc DEPS gflags)
+
 if(WITH_GPU)
-  nv_library(enforce SRCS enforce.cc)
+  nv_library(enforce SRCS enforce.cc DEPS flags)
 else()
-  cc_library(enforce SRCS enforce.cc)
+  cc_library(enforce SRCS enforce.cc DEPS flags)
 endif()
 cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
 
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index bdfe260793b638881a46a8d663876eeda4ed932f..b7ed66bd36369b0b31df3afbbd18e49fba8e23e1 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -32,16 +32,9 @@ limitations under the License. */
 #include <algorithm>
 #include "gflags/gflags.h"
 
-DEFINE_double(fraction_of_cpu_memory_to_use, 1,
-              "Default use 100% of CPU memory for PaddlePaddle,"
-              "reserve the rest for page tables, etc");
-DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
-              "Initial CPU memory for PaddlePaddle, in MD unit.");
-
-DEFINE_double(
-    fraction_of_cuda_pinned_memory_to_use, 0.5,
-    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
-    "reserve the rest for page tables, etc");
+DECLARE_double(fraction_of_cpu_memory_to_use);
+DECLARE_uint64(initial_cpu_memory_in_mb);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
 
 // If use_pinned_memory is true, CPUAllocator calls mlock, which
 // returns pinned and locked memory as staging areas for data exchange
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2224b05bef04d793cc40a4a4d30f51704b75da1
--- /dev/null
+++ b/paddle/fluid/platform/flags.cc
@@ -0,0 +1,182 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gflags/gflags.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_workspace_helper.h"
+#endif
+
+/**
+ * NOTE(paddle-dev): This file is designed to define all public FLAGS.
+ */
+
+/* Paddle initialization related */
+DEFINE_int32(paddle_num_threads, 1,
+             "Number of threads for each paddle instance.");
+
+/* Operator related */
+DEFINE_bool(check_nan_inf, false,
+            "Checking whether operator produce NAN/INF or not. It will be "
+            "extremely slow so please use this flag wisely.");
+
+/* CUDA related */
+#ifdef PADDLE_WITH_CUDA
+DEFINE_bool(
+    enable_cublas_tensor_op_math, false,
+    "The enable_cublas_tensor_op_math indicates whether to use Tensor Core, "
+    "but it may lose precision. Currently, there are two CUDA libraries that"
+    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
+    " GEMM computations(the matrices must be either half precision or single "
+    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
+    "input and output must be half precision) and recurrent neural networks "
+    "(RNNs).");
+
+DEFINE_string(selected_gpus, "",
+              "A list of device ids separated by comma, like: 0,1,2,3. "
+              "This option is useful when doing multi process training and "
+              "each process have only one device (GPU). If you want to use "
+              "all visible devices, set this to empty string. NOTE: the "
+              "reason of doing this is that we want to use P2P communication "
+              "between GPU devices, use CUDA_VISIBLE_DEVICES can only use "
+              "share-memory only.");
+#endif
+
+/* CUDNN related */
+#ifdef PADDLE_WITH_CUDA
+DEFINE_bool(cudnn_deterministic, false,
+            "Whether allow using an autotuning algorithm for convolution "
+            "operator. The autotuning algorithm may be non-deterministic. If "
+            "true, the algorithm is deterministic.");
+
+DEFINE_uint64(conv_workspace_size_limit,
+              paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
+              "cuDNN convolution workspace limit in MB unit.");
+
+DEFINE_bool(cudnn_exhaustive_search, false,
+            "Whether enable exhaustive search for cuDNN convolution or "
+            "not, default is False.");
+
+DEFINE_int64(cudnn_exhaustive_search_times, -1,
+             "Exhaustive search times for cuDNN convolution, "
+             "default is -1, not exhaustive search");
+
+// CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in
+// some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT
+// and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The
+// reason we set it to false by default is that this mode may use scaled
+// atomic integer reduction that may cause a numerical overflow for certain
+// input data range.
+DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
+            "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
+            "batch_norm, default is False.");
+#endif
+
+/* NCCL related */
+#ifdef PADDLE_WITH_CUDA
+// asynchronous nccl allreduce or synchronous issue:
+// https://github.com/PaddlePaddle/Paddle/issues/15049
+// If you want to change this default value, why?(gongwb)
+DEFINE_bool(
+    sync_nccl_allreduce, true,
+    "If set true, will call `cudaStreamSynchronize(nccl_stream)` "
+    "after allreduce, this mode can get better performance in some scenarios.");
+#endif
+
+/* Distributed related */
+#ifdef PADDLE_WITH_DISTRIBUTE
+DEFINE_int32(communicator_max_merge_var_num, 20,
+             "max var num to merge and send");
+DEFINE_int32(communicator_send_queue_size, 20,
+             "queue size to recv gradient before send");
+#endif
+
+DEFINE_int32(dist_threadpool_size, 0,
+             "number of threads used for distributed executed.");
+
+/* Garbage collector related */
+// Disable gc by default when inference library is built
+#ifdef PADDLE_ON_INFERENCE
+static const double kDefaultEagerDeleteTensorGB = -1;
+#else
+static const double kDefaultEagerDeleteTensorGB = 0;
+#endif
+
+DEFINE_double(
+    eager_delete_tensor_gb, kDefaultEagerDeleteTensorGB,
+    "Memory size threshold (GB) when the garbage collector clear tensors. "
+    "Disabled when this value is less than 0");
+
+DEFINE_bool(fast_eager_deletion_mode, true,
+            "Fast eager deletion mode. If enabled, memory would release "
+            "immediately without waiting GPU kernel ends.");
+
+DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
+              "Fraction of eager deletion. If less than 1.0, all variables in "
+              "the program would be sorted according to its memory size, and "
+              "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
+              "variables would be deleted.");
+
+/* Allocator related */
+DEFINE_string(allocator_strategy, "naive_best_fit",
+              "The allocation strategy. naive_best_fit means the original best "
+              "fit allocator of Fluid. "
+              "auto_growth means the experimental auto-growth allocator. "
+              "Enum in [naive_best_fit, auto_growth].");
+
+DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+              "Default use 100% of CPU memory for PaddlePaddle, "
+              "reserve the rest for page tables, etc");
+DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
+              "Initial CPU memory for PaddlePaddle, in MB unit.");
+
+DEFINE_double(
+    fraction_of_cuda_pinned_memory_to_use, 0.5,
+    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle, "
+    "reserve the rest for page tables, etc");
+
+#ifdef PADDLE_WITH_CUDA
+#ifndef _WIN32
+constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
+#else
+// fraction_of_gpu_memory_to_use cannot be too high on windows,
+// since the win32 graphic sub-system can occupy some GPU memory
+// which may lead to insufficient memory left for paddle
+constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
+#endif
+
+DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
+              "Allocate a trunk of gpu memory that is this fraction of the "
+              "total gpu memory size. Future memory usage will be allocated "
+              "from the trunk. If the trunk doesn't have enough gpu memory, "
+              "additional trunks of the same size will be requested from gpu "
+              "until the gpu has no memory left for another trunk.");
+
+DEFINE_uint64(
+    initial_gpu_memory_in_mb, 0ul,
+    "Allocate a trunk of gpu memory whose byte size is specified by "
+    "the flag. Future memory usage will be allocated from the "
+    "trunk. If the trunk doesn't have enough gpu memory, additional "
+    "trunks of the gpu memory will be requested from gpu with size "
+    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
+    "no memory left for the additional trunk. Note: if you set this "
+    "flag, the memory size set by "
+    "FLAGS_fraction_of_gpu_memory_to_use will be overridden by this "
+    "flag. If you don't set this flag, PaddlePaddle will use "
+    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");
+
+DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
+              "If this flag is set, Paddle will reallocate the gpu memory with "
+              "size specified by this flag. Else Paddle will reallocate by "
+              "FLAGS_fraction_of_gpu_memory_to_use");
+#endif
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 5fce95d63f990db091ce5f8072654f6e346b5c1c..8191d688472a3eb0f297936f3387e77809a20e2f 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -21,61 +21,14 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/string/split.h"
 
-#ifndef _WIN32
-constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
-#else
-// fraction_of_gpu_memory_to_use cannot be too high on windows,
-// since the win32 graphic sub-system can occupy some GPU memory
-// which may lead to insufficient memory left for paddle
-constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
-#endif
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+DECLARE_bool(enable_cublas_tensor_op_math);
+DECLARE_string(selected_gpus);
 
 constexpr static float fraction_reserve_gpu_memory = 0.05f;
 
-DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
-              "Allocate a trunk of gpu memory that is this fraction of the "
-              "total gpu memory size. Future memory usage will be allocated "
-              "from the trunk. If the trunk doesn't have enough gpu memory, "
-              "additional trunks of the same size will be requested from gpu "
-              "until the gpu has no memory left for another trunk.");
-
-DEFINE_uint64(
-    initial_gpu_memory_in_mb, 0ul,
-    "Allocate a trunk of gpu memory whose byte size is specified by "
-    "the flag. Future memory usage will be allocated from the "
-    "trunk. If the trunk doesn't have enough gpu memory, additional "
-    "trunks of the gpu memory will be requested from gpu with size "
-    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
-    "no memory left for the additional trunk. Note: if you set this "
-    "flag, the memory size set by "
-    "FLAGS_fraction_of_gpu_memory_to_use will be overrided by this "
-    "flag. If you don't set this flag, PaddlePaddle will use "
-    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");
-
-DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
-              "If this flag is set, Paddle will reallocate the gpu memory with "
-              "size specified by this flag. Else Paddle will reallocate by "
-              "FLAGS_fraction_of_gpu_memory_to_use");
-
-DEFINE_bool(
-    enable_cublas_tensor_op_math, false,
-    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
-    "but it may loss precision. Currently, There are two CUDA libraries that"
-    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
-    " GEMM computations(the matrices must be either half precision or single "
-    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
-    "input and output must be half precision) and recurrent neural networks "
-    "(RNNs).");
-
-DEFINE_string(selected_gpus, "",
-              "A list of device ids separated by comma, like: 0,1,2,3. "
-              "This option is useful when doing multi process training and "
-              "each process have only one device (GPU). If you want to use "
-              "all visible devices, set this to empty string. NOTE: the "
-              "reason of doing this is that we want to use P2P communication"
-              "between GPU devices, use CUDA_VISIBLE_DEVICES can only use"
-              "share-memory only.");
-
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 9b7b21208eb51691963ac15b90e3182f3afcf81d..feb6b1e7dc1b03189efe89e734537eb4101e68bf 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -36,8 +36,7 @@ limitations under the License. */
 #include "dgc/dgc.h"
 #endif
 
-DEFINE_int32(paddle_num_threads, 1,
-             "Number of threads for each paddle instance.");
+DECLARE_int32(paddle_num_threads);
 DEFINE_int32(multiple_of_cupti_buffer_size, 1,
              "Multiple of the CUPTI device buffer size. If the timestamps have "
              "been dropped when you are profiling, try increasing this value.");
diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt
index 49a8fb82dbf67357c1c3f2658538789af51b7cdc..a465f5909a7c6ee83211b8e03f1c3e7d3103022c 100644
--- a/paddle/fluid/string/CMakeLists.txt
+++ b/paddle/fluid/string/CMakeLists.txt
@@ -1,6 +1,6 @@
-cc_library(stringpiece SRCS piece.cc)
-cc_library(pretty_log SRCS pretty_log.cc)
-cc_library(string_helper SRCS string_helper.cc DEPS boost)
+cc_library(stringpiece SRCS piece.cc DEPS flags)
+cc_library(pretty_log SRCS pretty_log.cc DEPS flags)
+cc_library(string_helper SRCS string_helper.cc DEPS boost flags)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
 cc_test(to_string_test SRCS to_string_test.cc)