diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 9052675a9dee98ac4cfb977f6fb7bd8dba9dc9aa..f6da8860fe9236ebdb08135219a232cf52b2b5d0 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -16,9 +16,9 @@ paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, ke
 paddle.fluid.in_dygraph_mode (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'eddb7a1f0083dcc70e9f6c71ee003cb9'))
 paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3a584496aa1343f36eebf3c46b323a74'))
-paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '9c7decb955b9c4f718114179c8985581'))
+paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', 'bedc29ad01c1b911e99032ee1e19ac59'))
 paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', '4cfcd9c15b766a51b584cc46d38f1ad8'))
-paddle.fluid.Executor.train_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', 'd521011d79e71080fe9b5bb179b43518'))
+paddle.fluid.Executor.train_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '28f50904a0213f110947a30e0438529c'))
 paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f65788d9ead293ada47551339df12203'))
 paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', '6e19f92e2f185320a3a86b77e85eb3b3'))
 paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index 0291b6f66a9e8cb6a3c16530084d3e3e7a6c39c1..02e467e853e9c3e7a4d581043e6a8f7b70519521 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -455,6 +455,8 @@ void MultiSlotDataFeed::Init(
   all_slots_.resize(all_slot_num);
   all_slots_type_.resize(all_slot_num);
   use_slots_index_.resize(all_slot_num);
+  total_dims_without_inductive_.resize(all_slot_num);
+  inductive_shape_index_.resize(all_slot_num);
   use_slots_.clear();
   use_slots_is_dense_.clear();
   for (size_t i = 0; i < all_slot_num; ++i) {
@@ -462,14 +464,20 @@ void MultiSlotDataFeed::Init(
     all_slots_[i] = slot.name();
     all_slots_type_[i] = slot.type();
     use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1;
+    total_dims_without_inductive_[i] = 1;
+    inductive_shape_index_[i] = -1;
     if (slot.is_used()) {
       use_slots_.push_back(all_slots_[i]);
       use_slots_is_dense_.push_back(slot.is_dense());
       std::vector<int> local_shape;
       if (slot.is_dense()) {
-        // for batch size holder if is_dense
-        if (slot.shape(0) > 0) {
-          local_shape.push_back(0);
+        for (size_t i = 0; i < slot.shape_size(); ++i) {
+          if (slot.shape(i) > 0) {
+            total_dims_without_inductive_[i] *= slot.shape(i);
+          }
+          if (slot.shape(i) == -1) {
+            inductive_shape_index_[i] = i;
+          }
         }
       }
       for (size_t i = 0; i < slot.shape_size(); ++i) {
@@ -762,7 +770,10 @@ void MultiSlotDataFeed::PutToFeedVec(
     LoD data_lod{offset};
     feed_vec_[i]->set_lod(data_lod);
     if (use_slots_is_dense_[i]) {
-      use_slots_shape_[i][0] = batch_size_;
+      if (inductive_shape_index_[i] != -1) {
+        use_slots_shape_[i][inductive_shape_index_[i]] =
+            total_instance / total_dims_without_inductive_[i];
+      }
       feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
     }
   }
@@ -785,6 +796,8 @@ void MultiSlotInMemoryDataFeed::Init(
   all_slots_.resize(all_slot_num);
   all_slots_type_.resize(all_slot_num);
   use_slots_index_.resize(all_slot_num);
+  total_dims_without_inductive_.assign(all_slot_num, 1);  // product identity
+  inductive_shape_index_.assign(all_slot_num, -1);  // -1: no inductive dim
   use_slots_.clear();
   use_slots_is_dense_.clear();
   for (size_t i = 0; i < all_slot_num; ++i) {
@@ -797,8 +810,13 @@ void MultiSlotInMemoryDataFeed::Init(
       use_slots_is_dense_.push_back(slot.is_dense());
       std::vector<int> local_shape;
       if (slot.is_dense()) {
-        if (slot.shape(0) > 0) {
-          local_shape.push_back(0);
+        for (size_t i = 0; i < slot.shape_size(); ++i) {
+          if (slot.shape(i) > 0) {
+            total_dims_without_inductive_[i] *= slot.shape(i);
+          }
+          if (slot.shape(i) == -1) {
+            inductive_shape_index_[i] = i;
+          }
         }
       }
       for (size_t i = 0; i < slot.shape_size(); ++i) {
@@ -960,7 +978,10 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
     LoD data_lod{offset};
     feed_vec_[i]->set_lod(data_lod);
     if (use_slots_is_dense_[i]) {
-      use_slots_shape_[i][0] = batch_size_;
+      if (inductive_shape_index_[i] != -1) {
+        use_slots_shape_[i][inductive_shape_index_[i]] =
+            total_instance / total_dims_without_inductive_[i];
+      }
       feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
     }
   }
diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
index d098c7858a98c644bd3cad78d3cf1e3b35ca026b..c141059a6d6b3420f02c1b6090cf67db7b7b4da8 100644
--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@@ -143,6 +143,8 @@ class DataFeed {
   std::vector<std::string> all_slots_;
   std::vector<std::string> all_slots_type_;
   std::vector<std::vector<int>> use_slots_shape_;
+  std::vector<int> inductive_shape_index_;  // shape index of the -1 dim
+  std::vector<int> total_dims_without_inductive_;  // product of dims > 0
   std::vector<int>
       use_slots_index_;  // -1: not used; >=0: the index of use_slots_
 
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index 386ffd84c57063e950cd8b0d57304c66190be4c4..8e184e5d3cbc6d73c45aef97981dda410bc0f962 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -425,6 +425,7 @@ void DownpourWorker::TrainFiles() {
       }
 
       VLOG(3) << "push dense gradient done.";
+
       // the following code should be more precise and clean
       // TODO(guru4elephant)
       int32_t tmp_push_dense_wait_times = -1;
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 0b2d4f29ed289c0346b94a6aeea7128ea2b7a027..f3988edf08f0af24a2635ad3dacda9d5f179a0f5 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -789,13 +789,15 @@ class Executor(object):
             .. code-block:: python
 
                 import paddle.fluid as fluid
-                place = fluid.CPUPlace()
+
+                place = fluid.CPUPlace() # you can set place = fluid.CUDAPlace(0) to use gpu
                 exe = fluid.Executor(place)
-                x = fluid.layers.data(name="x", type="int64")
-                y = fluid.layers.data(name="y", type="int64")
+                x = fluid.layers.data(name="x", shape=[10, 10], dtype="int64")
+                y = fluid.layers.data(name="y", shape=[1], dtype="int64", lod_level=1)
                 dataset = fluid.DatasetFactory().create_dataset()
                 dataset.set_use_var([x, y])
-                filelist = ["dataA.txt", "dataB.txt"]
+                dataset.set_thread(1)
+                filelist = [] # you should set your own filelist, e.g. filelist = ["dataA.txt"]
                 dataset.set_filelist(filelist)
                 exe.run(fluid.default_startup_program())
                 exe.infer_from_dataset(program=fluid.default_main_program(),
@@ -868,14 +870,15 @@ class Executor(object):
             .. code-block:: python
 
               import paddle.fluid as fluid
-              place = fluid.CPUPlace()
+
+              place = fluid.CPUPlace() # you can set place = fluid.CUDAPlace(0) to use gpu
               exe = fluid.Executor(place)
-              x = fluid.layers.data(name="x", type="int64")
-              y = fluid.layers.data(name="y", type="int64")
+              x = fluid.layers.data(name="x", shape=[10, 10], dtype="int64")
+              y = fluid.layers.data(name="y", shape=[1], dtype="int64", lod_level=1)
               dataset = fluid.DatasetFactory().create_dataset()
               dataset.set_use_var([x, y])
-              dataset.set_thread(2)
-              filelist = ["dataA.txt", "dataB.txt"]
+              dataset.set_thread(1)
+              filelist = [] # you should set your own filelist, e.g. filelist = ["dataA.txt"]
               dataset.set_filelist(filelist)
               exe.run(fluid.default_startup_program())
               exe.train_from_dataset(program=fluid.default_main_program(),