diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 9cb2b5ee71d75c275937698ce8791a4eb9753645..afece8e3d28cdc3e9dce558a1dc2973241e0c3d5 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -30,7 +30,6 @@ DEFINE_bool(check_nan_inf, false,
             "Checking whether operator produce NAN/INF or not. It will be "
             "extremely slow so please use this flag wisely.");
 DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
-DEFINE_int32(min_row_size_to_use_multithread, 0, "");
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 2962dff122d59b220986f4b4b2b1c3496ad9da72..dd672c47955a0281ebf8cdea0867b51add771ba5 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -35,7 +35,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/variant.h"
 
 DECLARE_int32(inner_op_parallelism);
-DECLARE_int32(min_row_size_to_use_multithread);
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index e9c395a9314180960da2b9b0f996fce5d62b14ba..955f9f455f0b6be3883118ec9df9a125cb13e3ff 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -114,6 +114,13 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, default false) "
                   "only update the parameter that has gradient in sparse update")
         .SetDefault(false);
+    AddAttr<int64_t>("min_row_size_to_use_multithread",
+                     "(int64_t, default 0) "
+                     "when not zero, if param row size is larger then "
+                     "min_row_size_to_use_multithread and "
+                     "inner_op_parallelism is larger then 0, sparse update "
+                     "will run in multithread mode")
+        .SetDefault(0);
     AddComment(R"DOC(
 Adam Optimizer.
 
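As a quick sanity check outside the patch itself, the attribute registered above should be visible from Python through the adam op proto; a minimal sketch, assuming fluid's OpProtoHolder API behaves as in the current codebase:

import paddle.fluid as fluid

# Sketch: list the attributes registered on the adam op proto; after this change
# 'min_row_size_to_use_multithread' should appear alongside 'lazy_mode'.
proto = fluid.framework.OpProtoHolder.instance().get_op_proto("adam")
print([attr.name for attr in proto.attrs])
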
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 9cd7906877f13b5b5f963d791be3e6a6ac8361e8..2c16a02f6a3bc728aaf1d78fa12fd065c2da2f34 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -354,6 +354,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
     using paddle::framework::LoDTensor;
     using paddle::operators::detail::Ref;
 
+    int64_t min_row_size_to_use_multithread =
+        ctx.Attr<int64_t>("min_row_size_to_use_multithread");
     bool lazy_mode = ctx.Attr<bool>("lazy_mode");
     T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
     T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
@@ -478,12 +480,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
           }
         }
       } else if (FLAGS_inner_op_parallelism > 1 &&
-                 FLAGS_min_row_size_to_use_multithread > 0 &&
-                 param.dims()[0] > FLAGS_min_row_size_to_use_multithread) {
+                 min_row_size_to_use_multithread > 0 &&
+                 param.dims()[0] > min_row_size_to_use_multithread) {
         VLOG(3) << "use multi thread, inner_op_parallelism="
                 << FLAGS_inner_op_parallelism
                 << " min_row_size_to_use_multithread="
-                << FLAGS_min_row_size_to_use_multithread;
+                << min_row_size_to_use_multithread;
         if (FLAGS_inner_op_parallelism > 10) {
           LOG(WARNING) << "FLAGS_inner_op_parallelism "
                        << FLAGS_inner_op_parallelism << " is two large!";
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index b577dfc3e185391adf09a963c14f01eafaacdaf4..812694d99a1eae4040782fa9ec98b8e91161843e 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -129,8 +129,7 @@ def __bootstrap__():
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
         'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
-        'inner_op_parallelism', 'min_row_size_to_use_multithread',
-        'enable_parallel_graph'
+        'inner_op_parallelism', 'enable_parallel_graph'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 779cb5f961639aa919827a1c1726e974fdf1cbe1..64d7fd082270f6057fa2562fbea836151bb9ff7b 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -674,6 +674,8 @@ class AdamOptimizer(Optimizer):
         may be very slow. The lazy mode only update the element that has gradient is the current
         mini-batch, so it will be much more faster. But this mode has different semantics with the
         original Adam algorithm and may lead to different result.
+        min_row_size_to_use_multithread: if adam use sparse update and the param rows is very large,
+            you can use FLAGS_inner_op_parallelism and this flag to enable multi thread optimize.
 
     Examples:
         .. code-block:: python
@@ -694,7 +696,8 @@ class AdamOptimizer(Optimizer):
                  epsilon=1e-8,
                  regularization=None,
                  name=None,
-                 lazy_mode=False):
+                 lazy_mode=False,
+                 min_row_size_to_use_multithread=0):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -708,6 +711,7 @@ class AdamOptimizer(Optimizer):
         self._beta2 = beta2
         self._epsilon = epsilon
         self._lazy_mode = lazy_mode
+        self._min_row_size_to_use_multithread = min_row_size_to_use_multithread
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -762,7 +766,9 @@ class AdamOptimizer(Optimizer):
                 "beta1": self._beta1,
                 "beta2": self._beta2,
                 "epsilon": self._epsilon,
-                "lazy_mode": self._lazy_mode
+                "lazy_mode": self._lazy_mode,
+                "min_row_size_to_use_multithread":
+                self._min_row_size_to_use_multithread
             },
             stop_gradient=True)
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index ac092e19b4d32f38e4bc0b2a64760836892f6029..4f7111df44e9dc86521d1dad5083851be65af90d 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -87,7 +87,7 @@ list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
-py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4 FLAGS_min_row_size_to_use_multithread=2)
+py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
 py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL)
 py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL)
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 463a0655a8a3a311fb14b361416b7ef1cd4c7a70..2f4fc5772416dcc13602f4bc049cc1ffa20f4a48 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -261,7 +261,12 @@ class TestSparseAdamOp(unittest.TestCase):
             "LearningRate": np.full((1), 2.0).astype("float32")
         }
         self.init_output = np.full((height, row_numel), 0.0).astype("float32")
-        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
+        self.attrs = {
+            'epsilon': epsilon,
+            'beta1': beta1,
+            'beta2': beta2,
+            'min_row_size_to_use_multithread': 2
+        }
 
         grad_selected_rows = scope.var('Grad').get_selected_rows()
         grad_selected_rows.set_height(height)
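
For readers of this change, a minimal usage sketch (not part of the patch): the new attribute only takes effect on the sparse-update path and only together with the FLAGS_inner_op_parallelism environment flag; the threshold value and script name below are illustrative only.

# FLAGS_inner_op_parallelism must be set before paddle.fluid is imported,
# e.g. by launching the script as:
#   FLAGS_inner_op_parallelism=4 python train.py
import paddle.fluid as fluid

adam = fluid.optimizer.AdamOptimizer(
    learning_rate=0.001,
    # Multi-threaded sparse update is used only when this is > 0,
    # FLAGS_inner_op_parallelism > 1, and the parameter has more rows than
    # this threshold (param.dims()[0] > min_row_size_to_use_multithread).
    min_row_size_to_use_multithread=10000)
# adam.minimize(loss)  # 'loss' comes from the surrounding model definition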