Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Commit 48f41a7f (unverified)
Authored by guofei on Apr 20, 2020; committed via GitHub on Apr 20, 2020.
Support LoDTensorArray in fetch (#23645) (#23968)
cherry-pick #23645
Parent: 5bcf1632

Showing 36 changed files with 599 additions and 247 deletions (+599, -247).
paddle/fluid/framework/details/async_ssa_graph_executor.cc (+20, -6)
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc (+1, -1)
paddle/fluid/framework/details/fetch_op_handle.cc (+90, -45)
paddle/fluid/framework/details/fetch_op_handle.h (+2, -2)
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc (+31, -10)
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc (+1, -1)
paddle/fluid/framework/executor.cc (+3, -3)
paddle/fluid/framework/executor.h (+2, -2)
paddle/fluid/framework/feed_fetch_method.cc (+21, -13)
paddle/fluid/framework/feed_fetch_method.h (+1, -1)
paddle/fluid/framework/feed_fetch_type.h (+23, -4)
paddle/fluid/framework/lod_tensor_array.h (+0, -1)
paddle/fluid/framework/var_type.h (+4, -0)
paddle/fluid/framework/var_type_traits.h (+3, -1)
paddle/fluid/framework/variable_helper.cc (+2, -2)
paddle/fluid/inference/api/analysis_predictor.cc (+4, -3)
paddle/fluid/inference/api/api_impl.cc (+2, -1)
paddle/fluid/inference/api/api_impl_tester.cc (+21, -17)
paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc (+7, -5)
paddle/fluid/inference/tests/book/test_inference_image_classification.cc (+12, -10)
paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc (+11, -9)
paddle/fluid/inference/tests/book/test_inference_nlp.cc (+6, -5)
paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc (+9, -7)
paddle/fluid/inference/tests/book/test_inference_recommender_system.cc (+9, -7)
paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc (+11, -9)
paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc (+11, -9)
paddle/fluid/inference/tests/book/test_inference_word2vec.cc (+11, -9)
paddle/fluid/inference/tests/test_helper.h (+3, -2)
paddle/fluid/operators/controlflow/feed_op.cc (+2, -2)
paddle/fluid/operators/controlflow/fetch_op.cc (+46, -29)
paddle/fluid/pybind/pybind.cc (+77, -9)
python/paddle/fluid/executor.py (+15, -11)
python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py (+13, -5)
python/paddle/fluid/tests/unittests/test_executor_and_mul.py (+20, -5)
python/paddle/fluid/tests/unittests/test_feed_fetch_method.py (+3, -1)
python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py (+102, -0)
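The core of this change is the fetch result type: what used to be FeedFetchList (a plain std::vector<LoDTensor>) becomes FetchList, a vector of boost::variant<LoDTensor, LoDTensorArray>, so a fetched variable may now be a whole tensor array. Below is a minimal standalone sketch of the caller-side pattern this introduces; it uses stand-in types rather than Paddle's headers, so treat it as an illustration, not Paddle code.

// Standalone sketch (stand-in types, not Paddle's headers).
#include <boost/variant.hpp>
#include <iostream>
#include <typeinfo>
#include <vector>

struct LoDTensor { int numel = 0; };            // stand-in
using LoDTensorArray = std::vector<LoDTensor>;  // stand-in

// After this commit, each fetched slot can hold either kind of value.
using FetchType = boost::variant<LoDTensor, LoDTensorArray>;
using FetchList = std::vector<FetchType>;

bool data_is_lod_tensor(const FetchType &data) {
  return data.type() == typeid(LoDTensor);
}

int main() {
  FetchList fetched;
  fetched.emplace_back(LoDTensor{6});              // a plain tensor
  fetched.emplace_back(LoDTensorArray{{2}, {4}});  // a tensor array
  for (const auto &slot : fetched) {
    if (data_is_lod_tensor(slot)) {
      std::cout << "tensor, numel=" << boost::get<LoDTensor>(slot).numel << "\n";
    } else {
      std::cout << "array, size=" << boost::get<LoDTensorArray>(slot).size() << "\n";
    }
  }
}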
paddle/fluid/framework/details/async_ssa_graph_executor.cc

@@ -197,13 +197,27 @@ FetchResultType AsyncSSAGraphExecutor::Run(
   HandleException();
 
-  FeedFetchList ret;
-  auto &val = boost::get<FeedFetchList>(fetch_data);
+  FetchList ret;
+  auto &val = boost::get<FetchList>(fetch_data);
   for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
-    std::vector<const LoDTensor *> lodtensor_ptrs;
-    lodtensor_ptrs.push_back(&val.at(fetch_idx));
-    ret.emplace_back();
-    ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+    if (data_is_lod_tensor(val.at(fetch_idx))) {
+      std::vector<const LoDTensor *> lodtensor_ptrs;
+      lodtensor_ptrs.push_back(&(boost::get<LoDTensor>(val.at(fetch_idx))));
+      LoDTensor var;
+      var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+      ret.emplace_back(var);
+    } else {
+      auto array = boost::get<LoDTensorArray>(val.at(fetch_idx));
+      LoDTensorArray item_array;
+      item_array.reserve(array.size());
+      for (size_t i = 0; i < array.size(); ++i) {
+        std::vector<const LoDTensor *> lodtensor_ptrs;
+        lodtensor_ptrs.push_back(&array[i]);
+        item_array.emplace_back();
+        item_array.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+      }
+      ret.emplace_back(item_array);
+    }
   }
   return ret;
 }
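For context: MergeLoDTensor concatenates the per-device partial results of one fetched variable into a single tensor, and for a LoDTensorArray the executors in this commit apply that merge element by element (element i of every device's array is merged into element i of the result). A standalone sketch of that element-wise merge, with a plain vector standing in for LoDTensor and simple concatenation standing in for MergeLoDTensor:

#include <iostream>
#include <vector>

// Stand-in: a "tensor" is a flat vector; "merge" concatenates the
// per-device pieces, the way MergeLoDTensor joins along the batch axis.
using Tensor = std::vector<float>;
using TensorArray = std::vector<Tensor>;

Tensor Merge(const std::vector<const Tensor *> &pieces) {
  Tensor out;
  for (const Tensor *p : pieces) out.insert(out.end(), p->begin(), p->end());
  return out;
}

int main() {
  // Two devices, each holding a 2-element tensor array.
  TensorArray dev0 = {{1, 2}, {5}};
  TensorArray dev1 = {{3, 4}, {6}};

  // Element-wise merge: merged[i] = concat(dev0[i], dev1[i]).
  TensorArray merged(dev0.size());
  for (size_t i = 0; i < dev0.size(); ++i) {
    merged[i] = Merge({&dev0[i], &dev1[i]});
  }
  std::cout << merged[0].size() << " " << merged[1].size() << "\n";  // 4 2
}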
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc

@@ -63,7 +63,7 @@ FetchResultType FastThreadedSSAGraphExecutor::Run(
   FetchResultType fetches;
   if (return_merged) {
-    fetches = FeedFetchList(fetch_tensors.size());
+    fetches = FetchList(fetch_tensors.size());
   } else {
     fetches = FetchUnmergedList(fetch_tensors.size());
   }
paddle/fluid/framework/details/fetch_op_handle.cc

@@ -39,51 +39,98 @@ void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
   PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
 }
 
-void FetchOpHandle::WaitAndMergeCPUTensors() const {
+static void CheckDims(const framework::DDim &tensor_dims,
+                      const framework::DDim &ele_dims, const size_t offset) {
+  PADDLE_ENFORCE_EQ(
+      tensor_dims.size(), ele_dims.size(),
+      platform::errors::Fatal("The dimension sizes of fetched Tensors or "
+                              "the items of fetched LoDTensorArray are "
+                              "different from each other on different "
+                              "devices. And the error is caused by the %zu "
+                              "(th) fetched variable. Please set the "
+                              "parameter `return_merged = False` when you "
+                              "call the `Executor.run()` method.",
+                              offset));
+  for (int j = 1; j < tensor_dims.size(); j++) {
+    PADDLE_ENFORCE_EQ(tensor_dims[j], ele_dims[j],
+                      platform::errors::Fatal(
+                          "The dimensions of fetched Tensors or "
+                          "the items of fetched LoDTensorArray are "
+                          "different from each other on different "
+                          "devices. And the error is caused by the "
+                          "%zu (th) fetched variable. Please set the "
+                          "parameter `return_merged = False` when "
+                          "you call the `Executor.run()` method.",
+                          offset));
+  }
+}
+
+void FetchOpHandle::WaitAndMergeCPUFetchVars() const {
   if (return_merged_) {
-    const auto &tensor_dims = tensors_[0].dims();
-    for (size_t i = 1; i < tensors_.size(); i++) {
-      const auto &ele_dims = tensors_[i].dims();
-      PADDLE_ENFORCE_EQ(
-          tensor_dims.size(), ele_dims.size(),
-          platform::errors::Fatal("The dimension sizes of fetched Tensors are "
-                                  "different from each other on different "
-                                  "devices. And the error is caused by the %zu "
-                                  "(th) fetched variable. Please set the "
-                                  "parameter `return_merged = False` when you "
-                                  "call the `Executor.run()` method.",
-                                  offset_));
-      for (int j = 1; j < tensor_dims.size(); j++) {
-        PADDLE_ENFORCE_EQ(tensor_dims[j], ele_dims[j],
-                          platform::errors::Fatal(
-                              "The dimensions of fetched Tensors are "
-                              "different from each other on different "
-                              "devices. And the error is caused by the "
-                              "%zu (th) fetched variable. Please set the "
-                              "parameter `return_merged = False` when "
-                              "you call the `Executor.run()` method.",
-                              offset_));
-      }
-    }
-    std::vector<const LoDTensor *> tensors_ptr;
-    tensors_ptr.reserve(tensors_.size());
-    for (auto &t : tensors_) {
-      tensors_ptr.emplace_back(&t);
-    }
-    auto &val = boost::get<FeedFetchList>(*data_);
-    val.at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+    if (data_is_lod_tensor(tensors_[0])) {
+      const auto &tensor_dims = boost::get<LoDTensor>(tensors_[0]).dims();
+      for (size_t i = 1; i < tensors_.size(); i++) {
+        const auto &ele_dims = boost::get<LoDTensor>(tensors_[i]).dims();
+        CheckDims(tensor_dims, ele_dims, offset_);
+      }
+      std::vector<const LoDTensor *> tensors_ptr;
+      tensors_ptr.reserve(tensors_.size());
+      for (auto &t : tensors_) {
+        tensors_ptr.emplace_back(&boost::get<LoDTensor>(t));
+      }
+      auto &val = boost::get<FetchList>(*data_);
+      LoDTensor var;
+      var.MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+      val.at(offset_) = std::move(var);
+    } else {
+      auto &array = boost::get<LoDTensorArray>(tensors_[0]);
+      LoDTensorArray tmp_array;
+      tmp_array.reserve(array.size());
+      for (size_t i = 0; i < array.size(); ++i) {
+        const auto &tensor_dims = array[i].dims();
+        std::vector<const LoDTensor *> tensors_ptr;
+        tensors_ptr.reserve(tensors_.size());
+        tensors_ptr.push_back(&array[i]);
+        for (size_t j = 1; j < tensors_.size(); ++j) {
+          auto &element = boost::get<LoDTensorArray>(tensors_[j]);
+          const auto &ele_dims = element[i].dims();
+          CheckDims(tensor_dims, ele_dims, offset_);
+          tensors_ptr.push_back(&element[i]);
+        }
+        tmp_array.emplace_back();
+        tmp_array.back().MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+      }
+      auto &val = boost::get<FetchList>(*data_);
+      val.at(offset_) = std::move(tmp_array);
+    }
   } else {
     auto &val = boost::get<FetchUnmergedList>(*data_);
     val.at(offset_) = std::move(tensors_);
   }
 }
 
+static void TransData(const framework::LoDTensor &src_item,
+                      framework::LoDTensor *dst_item) {
+  if (src_item.IsInitialized() && src_item.numel() > 0) {
+    if (platform::is_gpu_place(src_item.place())) {
+#ifdef PADDLE_WITH_CUDA
+      TensorCopy(src_item, platform::CPUPlace(), dst_item);
+#endif
+    } else {
+      dst_item->ShareDataWith(src_item);
+    }
+  } else {
+    dst_item->clear();
+    dst_item->Resize({0});
+  }
+  dst_item->set_lod(src_item.lod());
+}
+
 void FetchOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name());
   WaitInputVarGenerated(platform::CPUPlace());
 
   tensors_.resize(inputs_.size());
   platform::CPUPlace cpu;
   auto &scopes = *local_exec_scopes_;
 
   for (size_t i = 0; i < inputs_.size(); ++i) {

@@ -93,23 +140,21 @@ void FetchOpHandle::RunImpl() {
     PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
                             var_handle->name());
 
-    auto &t = var->Get<framework::LoDTensor>();
-    if (t.IsInitialized() && t.numel() > 0) {
-      if (platform::is_gpu_place(t.place())) {
-#ifdef PADDLE_WITH_CUDA
-        TensorCopy(t, cpu, &tensors_[i]);
-#endif
-      } else {
-        tensors_[i].ShareDataWith(t);
-      }
+    if (var->IsType<LoDTensor>()) {
+      auto &t = var->Get<framework::LoDTensor>();
+      auto &item = boost::get<LoDTensor>(tensors_[i]);
+      TransData(t, &item);
     } else {
-      tensors_[i].clear();
-      tensors_[i].Resize({0});
+      auto &t = var->Get<framework::LoDTensorArray>();
+      LoDTensorArray tmp(t.size());
+      tensors_[i] = tmp;
+      auto &item = boost::get<LoDTensorArray>(tensors_[i]);
+      for (size_t j = 0; j < t.size(); ++j) {
+        TransData(t[j], &item[j]);
+      }
     }
-    tensors_[i].set_lod(t.lod());
   }
 
-  this->WaitAndMergeCPUTensors();
+  this->WaitAndMergeCPUFetchVars();
 }
 
 void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
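The new CheckDims helper above guards the merge: every device must report the same rank and the same trailing dimensions, and only dimension 0 (the per-device batch) may differ, since that is the axis the merge concatenates along. A standalone sketch of that check under this reading, with std::vector<int> standing in for framework::DDim:

#include <cassert>
#include <vector>

// Stand-in for framework::DDim: a list of dimension sizes.
using Dims = std::vector<int>;

// All devices must produce the same rank and the same trailing dims;
// only dim 0 (the per-device batch) is allowed to differ.
void CheckDims(const Dims &tensor_dims, const Dims &ele_dims) {
  assert(tensor_dims.size() == ele_dims.size());
  for (size_t j = 1; j < tensor_dims.size(); ++j) {
    assert(tensor_dims[j] == ele_dims[j]);
  }
}

int main() {
  CheckDims({32, 128}, {16, 128});  // fine: only the batch dim differs
  // CheckDims({32, 128}, {32, 64});  // would trip the assert
}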
paddle/fluid/framework/details/fetch_op_handle.h

@@ -36,7 +36,7 @@ struct FetchOpHandle : public OpHandleBase {
   void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override;
 
-  void WaitAndMergeCPUTensors() const;
+  void WaitAndMergeCPUFetchVars() const;
 
   std::string Name() const override;

@@ -54,7 +54,7 @@ struct FetchOpHandle : public OpHandleBase {
   size_t offset_;
   std::vector<Scope *> *local_scopes_;
   std::vector<Scope *> *local_exec_scopes_;
-  std::vector<LoDTensor> tensors_;
+  std::vector<FetchType> tensors_;
   bool return_merged_;
 };
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc

@@ -179,7 +179,7 @@ FetchResultType ParallelSSAGraphExecutor::Run(
   }
   if (return_merged) {
-    return FeedFetchList();
+    return FetchList();
   } else {
     return FetchUnmergedList();
   }

@@ -245,22 +245,43 @@ FetchResultType ParallelSSAGraphExecutor::Run(
   }
 
   if (return_merged) {
-    FeedFetchList ret;
+    FetchList ret;
     ret.reserve(fetch_tensors.size());
     for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
       std::vector<const LoDTensor *> lodtensor_ptrs;
       lodtensor_ptrs.reserve(place_num);
+      std::vector<const LoDTensorArray *> lodtensorarray_ptrs;
+      lodtensorarray_ptrs.reserve(place_num);
       for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
         if (!is_valid[scope_idx]) {
           continue;
         }
-        const auto &fetch_list =
-            boost::get<FeedFetchList>(fetch_data[scope_idx]);
-        lodtensor_ptrs.push_back(&fetch_list[fetch_idx]);
+        const auto &fetch_list = boost::get<FetchList>(fetch_data[scope_idx]);
+        if (data_is_lod_tensor(fetch_list[fetch_idx])) {
+          lodtensor_ptrs.push_back(
+              &(boost::get<LoDTensor>(fetch_list[fetch_idx])));
+        } else {
+          lodtensorarray_ptrs.push_back(
+              &(boost::get<LoDTensorArray>(fetch_list[fetch_idx])));
+        }
       }
-      ret.emplace_back();
-      ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+      if (lodtensor_ptrs.size() != 0) {
+        LoDTensor var;
+        var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+        ret.emplace_back(var);
+      } else {
+        LoDTensorArray var_array(lodtensorarray_ptrs[0]->size());
+        for (size_t i = 0; i < lodtensorarray_ptrs[0]->size(); ++i) {
+          LoDTensor var;
+          std::vector<const LoDTensor *> ptrs;
+          for (size_t j = 0; j < lodtensorarray_ptrs.size(); ++j) {
+            ptrs.push_back(&(lodtensorarray_ptrs[j]->at(i)));
+          }
+          var.MergeLoDTensor(ptrs, platform::CPUPlace());
+          var_array[i] = std::move(var);
+        }
+        ret.emplace_back(var_array);
+      }
     }
     return ret;
   } else {

@@ -277,8 +298,8 @@ FetchResultType ParallelSSAGraphExecutor::Run(
             boost::get<FetchUnmergedList>(fetch_data[scope_idx]);
         PADDLE_ENFORCE_EQ(
             fetch_list[fetch_idx].size(), 1,
-            platform::errors::Fatal(
-                "Each place must have only one fetched LoDTensor!"));
+            platform::errors::Fatal("Each place must have only one fetched "
+                                    "LoDTensor/LoDTensorArray!"));
         ret.back().emplace_back(fetch_list[fetch_idx][0]);
       }
     }
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc

@@ -72,7 +72,7 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl(
   std::unordered_set<VarHandleBase *> fetch_dependencies;
   FetchResultType fetch_data;
   if (return_merged) {
-    fetch_data = FeedFetchList(fetch_tensors.size());
+    fetch_data = FetchList(fetch_tensors.size());
   } else {
     fetch_data = FetchUnmergedList(fetch_tensors.size());
   }
paddle/fluid/framework/executor.cc

@@ -256,7 +256,7 @@ static bool has_feed_operators(
 // Return true if the block has fetch operators and holder of matching info.
 static bool has_fetch_operators(
     const BlockDesc& block,
-    const std::map<std::string, LoDTensor*>& fetch_targets,
+    const std::map<std::string, FetchType*>& fetch_targets,
     const std::string& fetch_holder_name) {
   size_t fetch_count = 0;
   for (auto* op : block.AllOps()) {

@@ -306,7 +306,7 @@ static bool has_fetch_operators(
 void Executor::Run(const ProgramDesc& program, Scope* scope,
                    std::map<std::string, const LoDTensor*>* feed_targets,
-                   std::map<std::string, LoDTensor*>* fetch_targets,
+                   std::map<std::string, FetchType*>* fetch_targets,
                    bool create_local_scope, bool create_vars,
                    const std::string& feed_holder_name,
                    const std::string& fetch_holder_name) {

@@ -504,7 +504,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 void Executor::RunPreparedContext(
     ExecutorPrepareContext* ctx, Scope* scope,
     std::map<std::string, const LoDTensor*>* feed_targets,
-    std::map<std::string, LoDTensor*>* fetch_targets, bool create_local_scope,
+    std::map<std::string, FetchType*>* fetch_targets, bool create_local_scope,
     bool create_vars, const std::string& feed_holder_name,
     const std::string& fetch_holder_name) {
   auto& global_block = ctx->prog_.Block(ctx->block_id_);
paddle/fluid/framework/executor.h

@@ -87,7 +87,7 @@ class Executor {
   // This API is very slow.
   void Run(const ProgramDesc& program, Scope* scope,
            std::map<std::string, const LoDTensor*>* feed_targets,
-           std::map<std::string, LoDTensor*>* fetch_targets,
+           std::map<std::string, FetchType*>* fetch_targets,
            bool create_local_scope = true, bool create_vars = true,
            const std::string& feed_holder_name = "feed",
            const std::string& fetch_holder_name = "fetch");

@@ -95,7 +95,7 @@ class Executor {
   // This API is very slow.
   void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                           std::map<std::string, const LoDTensor*>* feed_targets,
-                          std::map<std::string, LoDTensor*>* fetch_targets,
+                          std::map<std::string, FetchType*>* fetch_targets,
                           bool create_local_scope = true,
                           bool create_vars = true,
                           const std::string& feed_holder_name = "feed",
paddle/fluid/framework/feed_fetch_method.cc

@@ -29,7 +29,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
   // be created.
   VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
   Variable* g_feed_value = scope->Var(var_name);
-  auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
+  auto& feed_inputs = *(g_feed_value->GetMutable<FeedList>());
   if (index >= feed_inputs.size()) {
     feed_inputs.resize(index + 1);
   }

@@ -39,27 +39,35 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
   feed_inputs[index].set_lod(input.lod());
 }
 
-LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
+FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
                             size_t index) {
-  // Since we want to fetch LodTensor from a variable, the variable must
+  // Since we want to fetch FetchType from a variable, the variable must
   // be created alreadly.
   Variable* g_fetch_value = scope.FindVar(var_name);
-  PADDLE_ENFORCE_NOT_NULL(g_fetch_value, "%s is not found.", var_name);
-  PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
-                 "Only %s can be invoked by GetFetchVariable",
-                 typeid(FeedFetchList).name());
-  auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
+  PADDLE_ENFORCE_NOT_NULL(g_fetch_value,
+                          platform::errors::NotFound(
+                              "Variable %s is not found in scope.", var_name));
+  PADDLE_ENFORCE_EQ(g_fetch_value->IsType<FetchList>(), true,
+                    platform::errors::InvalidArgument(
+                        "Only %s can be invoked by GetFetchVariable",
+                        typeid(FetchList).name()));
+  auto& fetch_outputs = *g_fetch_value->GetMutable<FetchList>();
   auto& tensor = fetch_outputs[index];
-  VLOG(3) << "Fetch " << var_name << " with index " << index
-          << " shape= " << tensor.dims();
-  PADDLE_ENFORCE_LT(index, fetch_outputs.size());
+  VLOG(3) << "Fetch " << var_name << " with index " << index;
+  PADDLE_ENFORCE_LT(index, fetch_outputs.size(),
+                    platform::errors::InvalidArgument(
+                        "index must less than fetch_outputs size."));
   return tensor;
 }
 
 LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
   Variable* var = scope.FindVar(var_name);
-  PADDLE_ENFORCE(var, "%s no in scope", var_name);
-  PADDLE_ENFORCE(var->IsType<LoDTensor>(), "Only support lod tensor now.");
+  PADDLE_ENFORCE_NOT_NULL(var,
+                          platform::errors::NotFound(
+                              "Variable %s is not found in scope.", var_name));
+  PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(), true,
+                    platform::errors::InvalidArgument(
+                        "Only support lod tensor in GetVariableTensor now."));
   return *var->GetMutable<LoDTensor>();
 }
paddle/fluid/framework/feed_fetch_method.h

@@ -24,7 +24,7 @@ namespace framework {
 void SetFeedVariable(Scope* scope, const LoDTensor& input,
                      const std::string& var_name, size_t index);
 
-LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
+FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
                             size_t index);
 
 LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);
paddle/fluid/framework/feed_fetch_type.h

@@ -15,14 +15,33 @@ limitations under the License. */
 #pragma once
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
 namespace framework {
-using FeedFetchType = LoDTensor;
-using FeedFetchList = std::vector<FeedFetchType>;
-using FetchUnmergedList = std::vector<std::vector<FeedFetchType>>;
-using FetchResultType = boost::variant<FeedFetchList, FetchUnmergedList>;
+using FeedType = LoDTensor;
+using FeedList = std::vector<FeedType>;
+
+using FetchType = boost::variant<LoDTensor, LoDTensorArray>;
+using FetchList = std::vector<FetchType>;
+using FetchUnmergedList = std::vector<std::vector<FetchType>>;
+using FetchResultType = boost::variant<FetchList, FetchUnmergedList>;
+
+inline bool data_is_lod_tensor(const FetchType &data) {
+  if (data.type() == typeid(LoDTensor)) {
+    return true;
+  }
+  return false;
+}
+
+inline bool data_is_lod_tensor_array(const FetchType &data) {
+  if (data.type() == typeid(LoDTensorArray)) {
+    return true;
+  }
+  return false;
+}
 
 static const char kFeedOpType[] = "feed";
 static const char kFetchOpType[] = "fetch";
paddle/fluid/framework/lod_tensor_array.h

@@ -20,7 +20,6 @@ namespace paddle {
 namespace framework {
 
 using LoDTensorArray = std::vector<LoDTensor>;
-using LoDTensor2DArray = std::vector<std::vector<LoDTensor>>;
 
 }  // namespace framework
 }  // namespace paddle
paddle/fluid/framework/var_type.h

@@ -36,6 +36,7 @@ inline proto::VarType::Type ToVarType(int type) {
     case proto::VarType::SELECTED_ROWS:
     case proto::VarType::LOD_RANK_TABLE:
     case proto::VarType::LOD_TENSOR_ARRAY:
+    case proto::VarType::FETCH_LIST:
    case proto::VarType::READER:
       return static_cast<proto::VarType::Type>(type);
     default:

@@ -61,6 +62,9 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
     case proto::VarType::READER:
       visitor(var.Get<ReaderHolder>());
       return;
+    case proto::VarType::FETCH_LIST:
+      visitor(var.Get<FetchList>());
+      return;
     default:
       PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
   }
paddle/fluid/framework/var_type_traits.h

@@ -19,6 +19,7 @@
 #include <tuple>
 #include <typeindex>
 #include <vector>
+#include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/platform/place.h"

@@ -139,7 +140,7 @@ struct VarTypeRegistryImpl {
 using VarTypeRegistry = detail::VarTypeRegistryImpl<
     Tensor, LoDTensor, SelectedRows, std::vector<Scope *>, LoDRankTable,
     LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
-    operators::reader::LoDTensorBlockingQueueHolder,
+    operators::reader::LoDTensorBlockingQueueHolder, FetchList,
     operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
 #ifdef PADDLE_WITH_CUDA
 #if defined(PADDLE_WITH_NCCL)

@@ -178,6 +179,7 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE);
 REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY);
 REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST);
 REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER);
+REG_PROTO_VAR_TYPE_TRAIT(FetchList, proto::VarType::FETCH_LIST);
 REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32);
 REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32);
paddle/fluid/framework/variable_helper.cc

@@ -34,9 +34,9 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
   } else if (var_type == proto::VarType::SELECTED_ROWS) {
     var->GetMutable<SelectedRows>();
   } else if (var_type == proto::VarType::FEED_MINIBATCH) {
-    var->GetMutable<FeedFetchList>();
+    var->GetMutable<FeedList>();
   } else if (var_type == proto::VarType::FETCH_LIST) {
-    var->GetMutable<FeedFetchList>();
+    var->GetMutable<FetchList>();
   } else if (var_type == proto::VarType::STEP_SCOPES) {
     var->GetMutable<std::vector<framework::Scope *>>();
   } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
paddle/fluid/inference/api/analysis_predictor.cc

@@ -383,8 +383,9 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   for (size_t i = 0; i < fetches_.size(); ++i) {
     int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
     PADDLE_ENFORCE((size_t)idx == i);
-    framework::LoDTensor &fetch =
+    framework::FetchType &fetch_var =
         framework::GetFetchVariable(*scope, "fetch", idx);
+    auto &fetch = boost::get<framework::LoDTensor>(fetch_var);
     auto type = fetch.type();
     auto output = &(outputs->at(i));
     output->name = fetches_[idx]->Input("X")[0];

@@ -583,9 +584,9 @@ void AnalysisPredictor::PrepareFeedFetch() {
 void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
   PADDLE_ENFORCE_NOT_NULL(scope);
   auto *var = scope->Var("feed");
-  var->GetMutable<framework::FeedFetchList>();
+  var->GetMutable<framework::FeedList>();
   var = scope->Var("fetch");
-  var->GetMutable<framework::FeedFetchList>();
+  var->GetMutable<framework::FetchList>();
 }
 
 std::vector<std::string> AnalysisPredictor::GetInputNames() {
paddle/fluid/inference/api/api_impl.cc

@@ -286,8 +286,9 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   for (size_t i = 0; i < fetchs_.size(); ++i) {
     int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
     PADDLE_ENFORCE((size_t)idx == i);
-    framework::LoDTensor &fetch =
+    framework::FetchType &fetch_var =
         framework::GetFetchVariable(*scope, "fetch", idx);
+    auto fetch = boost::get<framework::LoDTensor>(fetch_var);
     auto type = fetch.type();
     auto output = &(outputs->at(i));
     output->name = fetchs_[idx]->Input("X")[0];
paddle/fluid/inference/api/api_impl_tester.cc

@@ -102,14 +102,15 @@ void MainWord2Vec(bool use_gpu) {
   cpu_feeds.push_back(&third_word);
   cpu_feeds.push_back(&fourth_word);
 
-  framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   TestInference<platform::CPUPlace>(config.model_dir, cpu_feeds, cpu_fetchs1);
 
-  float* lod_data = output1.data<float>();
-  for (int i = 0; i < output1.numel(); ++i) {
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  float* lod_data = output1_tensor.data<float>();
+  for (int i = 0; i < output1_tensor.numel(); ++i) {
     EXPECT_LT(lod_data[i] - data[i], ACC_DIFF);
     EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF);
   }

@@ -137,8 +138,8 @@ void MainImageClassification(bool use_gpu) {
   std::vector<framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
-  framework::LoDTensor output1;
-  std::vector<framework::LoDTensor*> cpu_fetchs1;
+  framework::FetchType output1;
+  std::vector<framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   TestInference<platform::CPUPlace, false, true>(

@@ -153,7 +154,8 @@ void MainImageClassification(bool use_gpu) {
   ASSERT_EQ(outputs.size(), 1UL);
   size_t len = outputs[0].data.length();
   float* data = static_cast<float*>(outputs[0].data.data());
-  float* lod_data = output1.data<float>();
+  float* lod_data =
+      boost::get<paddle::framework::LoDTensor>(output1).data<float>();
   for (size_t j = 0; j < len / sizeof(float); ++j) {
     EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF);
   }

@@ -168,7 +170,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
   constexpr int num_jobs = 3;
   std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
   std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
-  std::vector<framework::LoDTensor> refs(num_jobs);
+  std::vector<framework::FetchType> refs(num_jobs);
   for (size_t i = 0; i < jobs.size(); ++i) {
     // each job has 4 words
     jobs[i].resize(4);

@@ -181,7 +183,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
     // get reference result of each job
     std::vector<paddle::framework::LoDTensor*> ref_feeds;
-    std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    std::vector<paddle::framework::FetchType*> ref_fetches(1, &refs[i]);
     for (auto& word : jobs[i]) {
       ref_feeds.push_back(&word);
     }

@@ -207,9 +209,10 @@ void MainThreadsWord2Vec(bool use_gpu) {
       }
 
       // check outputs correctness
-      float* ref_data = refs[tid].data<float>();
-      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
-      for (int i = 0; i < refs[tid].numel(); ++i) {
+      auto ref_tensor = boost::get<paddle::framework::LoDTensor>(refs[tid]);
+      float* ref_data = ref_tensor.data<float>();
+      EXPECT_EQ(ref_tensor.numel(), static_cast<int64_t>(len / sizeof(float)));
+      for (int i = 0; i < ref_tensor.numel(); ++i) {
         EXPECT_NEAR(ref_data[i], data[i], 2e-3);
       }
     });

@@ -230,7 +233,7 @@ void MainThreadsImageClassification(bool use_gpu) {
   auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
   std::vector<framework::LoDTensor> jobs(num_jobs);
   std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
-  std::vector<framework::LoDTensor> refs(num_jobs);
+  std::vector<framework::FetchType> refs(num_jobs);
   for (size_t i = 0; i < jobs.size(); ++i) {
     // prepare inputs
     std::vector<std::vector<int64_t>> feed_target_shapes =

@@ -242,7 +245,7 @@ void MainThreadsImageClassification(bool use_gpu) {
     // get reference result of each job
     std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
-    std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    std::vector<framework::FetchType*> ref_fetches(1, &refs[i]);
     TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
   }

@@ -259,9 +262,10 @@ void MainThreadsImageClassification(bool use_gpu) {
       ASSERT_EQ(local_outputs.size(), 1UL);
       const size_t len = local_outputs[0].data.length();
       float* data = static_cast<float*>(local_outputs[0].data.data());
-      float* ref_data = refs[tid].data<float>();
-      EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float));
-      for (int i = 0; i < refs[tid].numel(); ++i) {
+      auto ref_tensor = boost::get<paddle::framework::LoDTensor>(refs[tid]);
+      float* ref_data = ref_tensor.data<float>();
+      EXPECT_EQ((size_t)ref_tensor.numel(), len / sizeof(float));
+      for (int i = 0; i < ref_tensor.numel(); ++i) {
         EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
       }
     });
paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc

@@ -40,10 +40,10 @@ TEST(inference, fit_a_line) {
     cpu_feeds[i].push_back(input);
   }
 
-  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
+  std::vector<std::vector<paddle::framework::FetchType*>> cpu_fetchs1;
   cpu_fetchs1.resize(num_threads);
   for (int i = 0; i < num_threads; ++i) {
-    auto* output = new paddle::framework::LoDTensor();
+    auto* output = new paddle::framework::FetchType();
     cpu_fetchs1[i].push_back(output);
   }

@@ -58,10 +58,10 @@ TEST(inference, fit_a_line) {
   }
 
 #ifdef PADDLE_WITH_CUDA
-  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
+  std::vector<std::vector<paddle::framework::FetchType*>> cpu_fetchs2;
   cpu_fetchs2.resize(num_threads);
   for (int i = 0; i < num_threads; ++i) {
-    auto* output = new paddle::framework::LoDTensor();
+    auto* output = new paddle::framework::FetchType();
     cpu_fetchs2[i].push_back(output);
   }

@@ -76,7 +76,9 @@ TEST(inference, fit_a_line) {
   }
 
   for (int i = 0; i < num_threads; ++i) {
-    CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
+    CheckError<float>(
+        boost::get<paddle::framework::LoDTensor>(*cpu_fetchs1[i][0]),
+        boost::get<paddle::framework::LoDTensor>(*cpu_fetchs2[i][0]));
     delete cpu_fetchs2[i][0];
   }
 #endif
paddle/fluid/inference/tests/book/test_inference_image_classification.cc

@@ -50,9 +50,9 @@ TEST(inference, image_classification) {
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
-  paddle::framework::LoDTensor output1;
+  paddle::framework::FetchType output1;
   if (!FLAGS_skip_cpu) {
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+    std::vector<paddle::framework::FetchType*> cpu_fetchs1;
     cpu_fetchs1.push_back(&output1);
 
     // Run inference on CPU

@@ -60,12 +60,12 @@ TEST(inference, image_classification) {
     LOG(INFO) << "Batch size is " << FLAGS_batch_size;
     TestInference<paddle::platform::CPUPlace, false, true>(
        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
-    LOG(INFO) << output1.dims();
+    LOG(INFO) << boost::get<paddle::framework::LoDTensor>(output1).dims();
   }
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU

@@ -73,17 +73,18 @@ TEST(inference, image_classification) {
   LOG(INFO) << "Batch size is " << FLAGS_batch_size;
   TestInference<paddle::platform::CUDAPlace, false, true>(
       dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
-  LOG(INFO) << output2.dims();
+  LOG(INFO) << boost::get<paddle::framework::LoDTensor>(output2).dims();
 
   if (!FLAGS_skip_cpu) {
-    CheckError<float>(output1, output2);
+    CheckError<float>(boost::get<paddle::framework::LoDTensor>(output1),
+                      boost::get<paddle::framework::LoDTensor>(output2));
   }
 
   // float16 inference requires cuda GPUs with >= 5.3 compute capability
   if (!FLAGS_fp16_dirname.empty() &&
       paddle::platform::GetCUDAComputeCapability(0) >= 53) {
-    paddle::framework::LoDTensor output3;
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs3;
+    paddle::framework::FetchType output3;
+    std::vector<paddle::framework::FetchType*> cpu_fetchs3;
     cpu_fetchs3.push_back(&output3);
 
     LOG(INFO) << "--- GPU Runs in float16 mode: ---";

@@ -92,7 +93,8 @@ TEST(inference, image_classification) {
     TestInference<paddle::platform::CUDAPlace, false, true>(
         FLAGS_fp16_dirname, cpu_feeds, cpu_fetchs3, FLAGS_repeat);
 
-    CheckError<float>(output2, output3);
+    CheckError<float>(boost::get<paddle::framework::LoDTensor>(output2),
+                      boost::get<paddle::framework::LoDTensor>(output3));
   }
 #endif
 }
paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc

@@ -63,25 +63,27 @@ TEST(inference, label_semantic_roles) {
   cpu_feeds.push_back(&ctx_p2);
   cpu_feeds.push_back(&mark);
 
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.lod();
+  LOG(INFO) << output1_tensor.dims();
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.lod();
+  LOG(INFO) << output2_tensor.dims();
 
-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
paddle/fluid/inference/tests/book/test_inference_nlp.cc

@@ -118,8 +118,8 @@ void ThreadRunInfer(
       inference_program->GetFetchTargetNames();
   PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  paddle::framework::LoDTensor outtensor;
+  std::map<std::string, paddle::framework::FetchType*> fetch_targets;
+  paddle::framework::FetchType outtensor;
   fetch_targets[fetch_target_names[0]] = &outtensor;
 
   std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;

@@ -150,7 +150,8 @@ void ThreadRunInfer(
       std::string fetch_target_name = op->Input("X")[0];
       int idx = boost::get<int>(op->GetAttr("col"));
       *fetch_targets[fetch_target_name] =
-          paddle::framework::GetFetchVariable(*scope, "fetch", idx);
+          boost::get<paddle::framework::LoDTensor>(
+              paddle::framework::GetFetchVariable(*scope, "fetch", idx));
     }
   }

@@ -215,8 +216,8 @@ TEST(inference, nlp) {
   const std::vector<std::string>& fetch_target_names =
       inference_program->GetFetchTargetNames();
   PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  paddle::framework::LoDTensor outtensor;
+  std::map<std::string, paddle::framework::FetchType*> fetch_targets;
+  paddle::framework::FetchType outtensor;
   fetch_targets[fetch_target_names[0]] = &outtensor;
 
   // prepare feed
paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc

@@ -41,28 +41,30 @@ TEST(inference, recognize_digits) {
   cpu_feeds.push_back(&input);
 
   for (auto is_combined : {false, true}) {
-    paddle::framework::LoDTensor output1;
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+    paddle::framework::FetchType output1;
+    std::vector<paddle::framework::FetchType*> cpu_fetchs1;
     cpu_fetchs1.push_back(&output1);
 
     // Run inference on CPU
    LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
     TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
                                               FLAGS_repeat, is_combined);
-    LOG(INFO) << output1.dims();
+    auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+    LOG(INFO) << output1_tensor.dims();
 
 #ifdef PADDLE_WITH_CUDA
-    paddle::framework::LoDTensor output2;
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+    paddle::framework::FetchType output2;
+    std::vector<paddle::framework::FetchType*> cpu_fetchs2;
     cpu_fetchs2.push_back(&output2);
 
     // Run inference on CUDA GPU
     LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
     TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
                                                FLAGS_repeat, is_combined);
-    LOG(INFO) << output2.dims();
+    auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+    LOG(INFO) << output2_tensor.dims();
 
-    CheckError<float>(output1, output2);
+    CheckError<float>(output1_tensor, output2_tensor);
 #endif
   }
 }
paddle/fluid/inference/tests/book/test_inference_recommender_system.cc

@@ -65,23 +65,25 @@ TEST(inference, recommender_system) {
   cpu_feeds.push_back(&category_id);
   cpu_feeds.push_back(&movie_title);
 
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.dims();
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.dims();
 
-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc

@@ -41,25 +41,27 @@ TEST(inference, rnn_encoder_decoder) {
   cpu_feeds.push_back(&word_data);
   cpu_feeds.push_back(&trg_word);
 
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.lod();
+  LOG(INFO) << output1_tensor.dims();
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.lod();
+  LOG(INFO) << output2_tensor.dims();
 
-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc

@@ -39,25 +39,27 @@ TEST(inference, understand_sentiment) {
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&words);
 
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.lod();
+  LOG(INFO) << output1_tensor.dims();
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.lod();
+  LOG(INFO) << output2_tensor.dims();
 
-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
paddle/fluid/inference/tests/book/test_inference_word2vec.cc

@@ -44,25 +44,27 @@ TEST(inference, word2vec) {
   cpu_feeds.push_back(&third_word);
   cpu_feeds.push_back(&fourth_word);
 
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.lod();
+  LOG(INFO) << output1_tensor.dims();
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.lod();
+  LOG(INFO) << output2_tensor.dims();
 
-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
paddle/fluid/inference/tests/test_helper.h

@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <map>
 #include <memory>
+#include <random>
 #include <string>
 #include <vector>

@@ -142,7 +143,7 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
 template <typename Place, bool CreateVars = true, bool PrepareContext = false>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
+                   const std::vector<paddle::framework::FetchType*>& cpu_fetchs,
                    const int repeat = 1, const bool is_combined = false) {
   // 1. Define place, executor, scope
   auto place = Place();

@@ -194,7 +195,7 @@ void TestInference(const std::string& dirname,
   }
 
   // 5. Define Tensor to get the outputs: set up maps for fetch targets
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  std::map<std::string, paddle::framework::FetchType*> fetch_targets;
   for (size_t i = 0; i < fetch_target_names.size(); ++i) {
     fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
   }
paddle/fluid/operators/controlflow/feed_op.cc

@@ -58,7 +58,7 @@ class FeedOp : public framework::OperatorBase {
     VLOG(3) << "Feed variable " << feed_var_name << "'s " << col
             << " column to variable " << out_name;
 
-    auto &feed_list = feed_var->Get<framework::FeedFetchList>();
+    auto &feed_list = feed_var->Get<framework::FeedList>();
     PADDLE_ENFORCE_LT(
         static_cast<size_t>(col), feed_list.size(),
         platform::errors::InvalidArgument(

@@ -68,7 +68,7 @@ class FeedOp : public framework::OperatorBase {
             col, feed_list.size()));
 
     auto &feed_item = feed_list.at(static_cast<size_t>(col));
-    auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
+    auto *out_item = out_var->GetMutable<framework::FeedType>();
 
     if (platform::is_same_place(feed_item.place(), place)) {
       out_item->ShareDataWith(feed_item);
paddle/fluid/operators/controlflow/fetch_op.cc

@@ -21,6 +21,39 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+// FIXME(yuyang18): Should we assume the fetch operator always generate
+// CPU outputs?
+static void DataCopy(const framework::LoDTensor &src_item,
+                     const std::string &fetch_var_name,
+                     framework::LoDTensor *dst_item) {
+  if (src_item.IsInitialized() && src_item.numel() > 0) {
+#ifdef PADDLE_WITH_MKLDNN
+    // Conversion from MKL-DNN to Paddle
+    if (src_item.layout() == framework::DataLayout::kMKLDNN) {
+      framework::Tensor out;
+      // Convert to desired Paddle layout, apart from grads of filter
+      // as params are not a subject to paddle's data_format
+      framework::innerTransDataLayoutFromMKLDNN(
+          src_item.layout(),
+          fetch_var_name == framework::GradVarName("Filter")
+              ? framework::DataLayout::kNCHW
+              : paddle::platform::get_cur_paddle_data_layout(),
+          src_item, &out, platform::CPUPlace());
+      TensorCopySync(out, platform::CPUPlace(), dst_item);
+    } else {
+      TensorCopySync(src_item, platform::CPUPlace(), dst_item);
+    }
+#else
+    TensorCopySync(src_item, platform::CPUPlace(), dst_item);
+#endif
+  } else {
+    // Not copy, if the src tensor is empty.
+    dst_item->clear();
+    dst_item->Resize({0});
+  }
+  dst_item->set_lod(src_item.lod());
+}
+
 class FetchOp : public framework::OperatorBase {
  public:
   FetchOp(const std::string &type, const framework::VariableNameMap &inputs,

@@ -66,42 +99,26 @@ class FetchOp : public framework::OperatorBase {
     VLOG(3) << "Fetch variable " << fetch_var_name << " to variable "
             << out_name << "'s " << col << " column.";
 
-    auto *fetch_list = out_var->GetMutable<framework::FeedFetchList>();
-    auto &src_item = fetch_var->Get<framework::FeedFetchType>();
+    auto *fetch_list = out_var->GetMutable<framework::FetchList>();
 
     if (static_cast<size_t>(col) >= fetch_list->size()) {
       fetch_list->resize(col + 1);
     }
-    auto &dst_item = fetch_list->at(col);
 
-    // FIXME(yuyang18): Should we assume the fetch operator always generate
-    // CPU outputs?
-    if (src_item.IsInitialized() && src_item.numel() > 0) {
-#ifdef PADDLE_WITH_MKLDNN
-      // Conversion from MKL-DNN to Paddle
-      if (src_item.layout() == framework::DataLayout::kMKLDNN) {
-        framework::Tensor out;
-        // Convert to desired Paddle layout, apart from grads of filter
-        // as params are not a subject to paddle's data_format
-        framework::innerTransDataLayoutFromMKLDNN(
-            src_item.layout(),
-            fetch_var_name == framework::GradVarName("Filter")
-                ? framework::DataLayout::kNCHW
-                : paddle::platform::get_cur_paddle_data_layout(),
-            src_item, &out, platform::CPUPlace());
-        TensorCopySync(out, platform::CPUPlace(), &dst_item);
-      } else {
-        TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
-      }
-#else
-      TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
-#endif
+    if (fetch_var->IsType<framework::LoDTensor>()) {
+      auto &src_item = fetch_var->Get<framework::LoDTensor>();
+      auto *dst_item = &(boost::get<framework::LoDTensor>(fetch_list->at(col)));
+      DataCopy(src_item, fetch_var_name, dst_item);
     } else {
-      // Not copy, if the src tensor is empty.
-      dst_item.clear();
-      dst_item.Resize({0});
+      auto &src_item = fetch_var->Get<framework::LoDTensorArray>();
+      framework::LoDTensorArray tmp(src_item.size());
+      fetch_list->at(col) = tmp;
+      auto &dst_item =
+          boost::get<framework::LoDTensorArray>(fetch_list->at(col));
+      for (size_t i = 0; i < src_item.size(); ++i) {
+        DataCopy(src_item[i], fetch_var_name, &dst_item[i]);
+      }
     }
-    dst_item.set_lod(src_item.lod());
   }
 };
paddle/fluid/pybind/pybind.cc
浏览文件 @
48f41a7f
...
...
@@ -97,7 +97,9 @@ DECLARE_bool(use_mkldnn);
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE
(
paddle
::
framework
::
LoDTensorArray
);
PYBIND11_MAKE_OPAQUE
(
paddle
::
framework
::
LoDTensor2DArray
);
PYBIND11_MAKE_OPAQUE
(
paddle
::
framework
::
FetchUnmergedList
);
PYBIND11_MAKE_OPAQUE
(
paddle
::
framework
::
FetchList
);
PYBIND11_MAKE_OPAQUE
(
paddle
::
framework
::
FetchType
);
namespace
paddle
{
namespace
pybind
{
...
...
@@ -966,6 +968,9 @@ All parameter, weight, gradient are variables in Paddle.
.
def
(
"get_lod_tensor_array"
,
[](
Variable
&
self
)
{
return
self
.
GetMutable
<
LoDTensorArray
>
();
},
py
::
return_value_policy
::
reference
)
.
def
(
"get_fetch_list"
,
[](
Variable
&
self
)
{
return
self
.
GetMutable
<
FetchList
>
();
},
py
::
return_value_policy
::
reference
)
#if (defined(PADDLE_WITH_NCCL))
.
def
(
"get_communicator"
,
[](
Variable
&
self
)
->
platform
::
Communicator
*
{
...
...
@@ -1443,7 +1448,7 @@ All parameter, weight, gradient are variables in Paddle.
.
def
(
"run_prepared_ctx"
,
[](
Executor
&
self
,
ExecutorPrepareContext
*
ctx
,
Scope
*
scope
,
std
::
map
<
std
::
string
,
const
LoDTensor
*>
*
feed_targets
,
std
::
map
<
std
::
string
,
LoDTensor
*>
*
fetch_targets
,
std
::
map
<
std
::
string
,
FetchType
*>
*
fetch_targets
,
bool
create_local_scope
=
true
,
bool
create_vars
=
true
,
const
std
::
string
&
feed_holder_name
=
"feed"
,
const
std
::
string
&
fetch_holder_name
=
"fetch"
)
{
...
...
@@ -1503,7 +1508,16 @@ All parameter, weight, gradient are variables in Paddle.
#endif
m
.
def
(
"set_feed_variable"
,
framework
::
SetFeedVariable
);
m
.
def
(
"get_fetch_variable"
,
framework
::
GetFetchVariable
);
m
.
def
(
"get_fetch_variable"
,
[](
const
Scope
&
scope
,
const
std
::
string
&
var_name
,
size_t
index
)
->
py
::
object
{
auto
&
var
=
framework
::
GetFetchVariable
(
scope
,
var_name
,
index
);
if
(
data_is_lod_tensor
(
var
))
{
return
py
::
cast
(
boost
::
get
<
LoDTensor
>
(
var
));
}
else
{
return
py
::
cast
(
boost
::
get
<
LoDTensorArray
>
(
var
));
}
});
m
.
def
(
"get_variable_tensor"
,
framework
::
GetVariableTensor
);
m
.
def
(
"_is_program_version_supported"
,
IsProgramVersionSupported
);
...
...
@@ -1583,16 +1597,70 @@ All parameter, weight, gradient are variables in Paddle.
         },
         py::return_value_policy::take_ownership);

-  py::class_<LoDTensor2DArray>(m, "LoDTensor2DArray", R"DOC(
-    LoDTensor2DArray is 2-D array of LoDTensor.
+  py::class_<FetchList>(m, "FetchList", R"DOC( FetchList is a
+        vector of boost::variant<LoDTensor, LoDTensorArray>.
+        )DOC")
+      .def("_move_to_list",
+           [](FetchList &self) -> py::list {
+             py::list res(self.size());
+             for (size_t i = 0; i < self.size(); ++i) {
+               if (data_is_lod_tensor(self[i])) {
+                 auto &data = boost::get<LoDTensor>(self[i]);
+                 res[i] = py::cast(std::move(data));
+               } else {
+                 auto &data = boost::get<LoDTensorArray>(self[i]);
+                 py::list tmp(data.size());
+                 for (size_t j = 0; j < data.size(); ++j) {
+                   tmp[j] = py::cast(std::move(data[j]));
+                 }
+                 res[i] = std::move(tmp);
+               }
+             }
+             self.clear();
+             return res;
+           },
+           py::return_value_policy::take_ownership)
+      .def("append",
+           [](FetchList &self, const LoDTensor &t) {
+             self.emplace_back();
+             auto &lod_tensor = boost::get<LoDTensor>(self.back());
+             lod_tensor.ShareDataWith(t);
+             lod_tensor.set_lod(t.lod());
+           },
+           py::arg("var"))
+      .def("append",
+           [](FetchList &self, const LoDTensorArray &t) {
+             self.emplace_back();
+             auto &lod_tensor_array = boost::get<LoDTensorArray>(self.back());
+             for (size_t i = 0; i < t.size(); ++i) {
+               lod_tensor_array[i].ShareDataWith(t[i]);
+               lod_tensor_array[i].set_lod(t[i].lod());
+             }
+           },
+           py::arg("var"));
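A hedged sketch of how these bindings look from Python, mirroring the updated test_feed_fetch_method.py later in this diff (get_fetch_list, append, and _move_to_list are the methods bound above; the scope setup is illustrative):

    import numpy
    import paddle.fluid.core as core

    scope = core.Scope()
    fetch_list = scope.var("fetch").get_fetch_list()  # bound FetchList, by reference

    t = core.LoDTensor()
    t.set(numpy.zeros((2, 3), dtype='float32'), core.CPUPlace())
    fetch_list.append(t)  # resolves to the LoDTensor overload of append

    items = fetch_list._move_to_list()  # -> [LoDTensor]; the FetchList is cleared
    print(len(items))  # 1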
+  py::class_<FetchUnmergedList>(m, "FetchUnmergedList", R"DOC(
+        FetchUnmergedList is 2-D array of FetchType(boost::variant(LoDTensor, LoDTensorArray)).
+        )DOC")
       .def("_move_to_list",
-           [](LoDTensor2DArray &self) -> py::list {
+           [](FetchUnmergedList &self) -> py::list {
              py::list res(self.size());
              for (size_t i = 0; i < self.size(); ++i) {
                py::list tmp(self[i].size());
                for (size_t j = 0; j < self[i].size(); ++j) {
-                 tmp[j] = py::cast(std::move(self[i][j]));
+                 if (data_is_lod_tensor(self[i][j])) {
+                   auto &var = boost::get<LoDTensor>(self[i][j]);
+                   tmp[j] = py::cast(std::move(var));
+                 } else {
+                   auto &var = boost::get<LoDTensorArray>(self[i][j]);
+                   py::list tmp_array(var.size());
+                   for (size_t k = 0; k < var.size(); ++k) {
+                     tmp_array[k] = std::move(var[k]);
+                   }
+                   tmp[j] = std::move(tmp_array);
+                 }
                }
                res[i] = std::move(tmp);
                self[i].clear();
...
...
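FetchUnmergedList backs the return_merged=False path: results stay indexed per device (or execution step) rather than being merged along the batch dimension, and an array-typed fetch target becomes a nested list. A small illustrative sketch of the resulting Python structure (the shapes and data below are fabricated for illustration, not produced by Paddle):

    import numpy

    # fetched[i][j] is fetch target j from device i; an array-typed target
    # is a nested list of per-element arrays.
    fetched = [
        [numpy.zeros((16, 1)), [numpy.zeros((16, 784)), numpy.zeros((16, 1))]],
        [numpy.zeros((16, 1)), [numpy.zeros((16, 784)), numpy.zeros((16, 1))]],
    ]
    loss_per_device = [dev[0] for dev in fetched]  # target 0 on each device
    assert len(loss_per_device) == 2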
@@ -2326,8 +2394,8 @@ All parameter, weight, gradient are variables in Paddle.
               ret = self.Run(fetch_tensors, return_merged);
             }
             if (return_merged) {
-              return py::cast(std::move(
-                  boost::get<paddle::framework::FeedFetchList>(ret)));
+              return py::cast(
+                  std::move(boost::get<paddle::framework::FetchList>(ret)));
             } else {
               return py::cast(std::move(
                   boost::get<paddle::framework::FetchUnmergedList>(ret)));
...
...
python/paddle/fluid/executor.py
...
...
@@ -931,14 +931,14 @@ class Executor(object):
             return_merged(bool): This parameter indicates whether fetched variables (the variables
                 specified in the fetch list) should be merged according to the execution device dimension.
                 If :code:`return_merged` is False, the type of the return value is a two-dimensional list
-                of :code:`Tensor` ( :code:`return_numpy` is False) or a two-dimensional list of
-                :code:`numpy.ndarray` ( :code:`return_numpy` is True). If :code:`return_merged` is True,
-                the type of the return value is an one-dimensional list of :code:`Tensor` ( :code:`return_numpy`
-                is False) or an one-dimensional list of :code:`numpy.ndarray` ( :code:`return_numpy` is True).
-                Please see Examples 2 for more details. If the lengths of fetched results are variant, please
-                set :code:`return_merged` as False, which denotes that the fetched results will not be merged.
-                The default is True, but it is just for the compatibility, and may use False as default value
-                in the future version.
+                of :code:`Tensor` / :code:`LoDTensorArray` ( :code:`return_numpy` is False) or a two-dimensional
+                list of :code:`numpy.ndarray` ( :code:`return_numpy` is True). If :code:`return_merged` is True,
+                the type of the return value is a one-dimensional list of :code:`Tensor` / :code:`LoDTensorArray`
+                ( :code:`return_numpy` is False) or a one-dimensional list of :code:`numpy.ndarray`
+                ( :code:`return_numpy` is True). Please see Examples 2 for more details. If the lengths of the
+                fetched results vary, please set :code:`return_merged` to False, which means the fetched
+                results will not be merged. The default is True, but this is only for compatibility, and False
+                may become the default value in a future version.
             use_prune(bool): This parameter indicates whether the input :code:`Program` will be pruned.
                 If the parameter is True, the program will be pruned according to the given feed and fetch_list,
                 which means the operators and variables in program that generate :code:`feed` and are not
...
@@ -980,13 +980,17 @@ class Executor(object):
                 loss = fluid.layers.mean(hidden)
                 adam = fluid.optimizer.Adam()
                 adam.minimize(loss)
+                i = fluid.layers.zeros(shape=[1], dtype='int64')
+                array = fluid.layers.array_write(x=loss, i=i)

                 # Run the startup program once and only once.
                 exe.run(fluid.default_startup_program())

                 x = numpy.random.random(size=(10, 1)).astype('float32')
-                outs = exe.run(feed={'X': x},
-                               fetch_list=[loss.name])
+                loss_val, array_val = exe.run(feed={'X': x},
+                                              fetch_list=[loss.name, array.name])
+                print(array_val)
+                # [array([0.02153828], dtype=float32)]
Examples 2:
.. code-block:: python
...
...
@@ -1226,7 +1230,7 @@ class Executor(object):
         else:
             self._default_executor.run_prepared_ctx(ctx, scope, False, False,
                                                     False)
-        arr = scope.find_var(fetch_var_name).get_lod_tensor_array()
+        arr = scope.find_var(fetch_var_name).get_fetch_list()
         tensors = arr._move_to_list()
         if return_numpy:
             return as_numpy(tensors)
...
...
python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
...
...
@@ -58,8 +58,11 @@ def convolutional_neural_network(use_py_reader):
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
     avg_loss = fluid.layers.mean(loss)
     acc = fluid.layers.accuracy(input=prediction, label=label)
-    return img, label, prediction, avg_loss, acc, py_reader
+    i = fluid.layers.zeros(shape=[1], dtype='int64')
+    array = fluid.layers.array_write(x=prediction, i=i)
+    fluid.layers.increment(i)
+    fluid.layers.array_write(x=acc, i=i, array=array)
+    return array, img, label, prediction, avg_loss, acc, py_reader


 def test():
...
...
@@ -69,7 +72,7 @@ def test():
     test_reader = paddle.batch(
         paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)

-    img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
+    array, img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
         use_py_reader=False)

     feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
...
...
@@ -102,7 +105,7 @@ def train(use_cuda, thread_num, cpu_num):
         print("paddle is not compiled with cuda, exit!")
         return

-    img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
+    array, img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
         use_py_reader=True)

     print("build convolutional neural network done.")
...
...
@@ -150,7 +153,12 @@ def train(use_cuda, thread_num, cpu_num):
     py_reader.start()
     try:
         while True:
-            loss_val = pe.run(fetch_list=[avg_loss.name])
+            array_v, acc_v, prediction_v, loss_val = pe.run(
+                fetch_list=[array, acc, prediction, avg_loss.name])
+            assert numpy.allclose(array_v[0], prediction_v) == True
+            assert numpy.allclose(array_v[1], acc_v) == True
             loss_val = numpy.mean(loss_val)

             if step % 10 == 0:
                 print("Pass %d, Batch %d, Cost %f, queue size %d" %
...
...
python/paddle/fluid/tests/unittests/test_executor_and_mul.py
...
...
@@ -19,25 +19,40 @@ import unittest
 import numpy

 import paddle.fluid.core as core
 from paddle.fluid.executor import Executor
-from paddle.fluid.layers import mul, data
+from paddle.fluid.layers import mul, data, zeros, array_write, increment


 class TestExecutor(unittest.TestCase):
     def test_mul(self):
+        i = zeros(shape=[1], dtype='int64')
         a = data(name='a', shape=[784], dtype='float32')
+        array = array_write(x=a, i=i)
+        i = increment(i)
         b = data(
             name='b', shape=[784, 100], dtype='float32', append_batch_size=False)
+        array_write(x=b, i=i, array=array)
+        i = increment(i)
         out = mul(x=a, y=b)
+        array_write(x=out, i=i, array=array)

         a_np = numpy.random.random((100, 784)).astype('float32')
         b_np = numpy.random.random((784, 100)).astype('float32')

         exe = Executor()
-        outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
-        out = outs[0]
-        self.assertEqual((100, 100), out.shape)
-        self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
+        res, res_array = exe.run(feed={'a': a_np, 'b': b_np},
+                                 fetch_list=[out, array])
+        self.assertEqual((100, 100), res.shape)
+        self.assertTrue(numpy.allclose(res, numpy.dot(a_np, b_np)))
+        self.assertTrue(numpy.allclose(res_array[0], a_np))
+        self.assertTrue(numpy.allclose(res_array[1], b_np))
+        self.assertTrue(numpy.allclose(res_array[2], res))


 if __name__ == '__main__':
...
...
python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
...
...
@@ -31,7 +31,9 @@ class TestFeedFetch(unittest.TestCase):
         core.set_feed_variable(scope, input_tensor, "feed", 0)

-        output_tensor = core.get_fetch_variable(scope, "feed", 0)
+        output = scope.var("fetch").get_fetch_list()
+        output.append(input_tensor)
+        output_tensor = core.get_fetch_variable(scope, "fetch", 0)

         output_lod = output_tensor.recursive_sequence_lengths()
         self.assertEqual(2, output_lod[0][0])
...
...
python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py
0 → 100644
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import unittest
import random

import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from simple_nets import simple_fc_net_with_inputs, simple_fc_net


class TestFetchLoDTensorArray(unittest.TestCase):
    def build_program(self, main_program, startup_program):
        with fluid.unique_name.guard():
            with fluid.program_guard(main_program, startup_program):
                i = layers.zeros(shape=[1], dtype='int64')
                img = fluid.data(name='image', shape=[-1, 784], dtype='float32')
                label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
                loss = simple_fc_net_with_inputs(img, label, class_num=10)
                loss = simple_fc_net()
                opt = fluid.optimizer.SGD(learning_rate=0.001)
                opt.minimize(loss)
                array = layers.array_write(x=img, i=i)
                i = layers.increment(i)
                layers.array_write(x=label, i=i, array=array)
                i = layers.increment(i)
                layers.array_write(x=loss, i=i, array=array)
                return loss, array

    def check_network(self, use_cuda=True):
        os.environ["CPU_NUM"] = str(2)
        main_program = fluid.Program()
        startup_program = fluid.Program()
        loss, array = self.build_program(main_program, startup_program)

        batch_size = 32
        image = np.random.normal(size=(batch_size, 784)).astype('float32')
        label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_program)
        feed_dict = {'image': image, 'label': label}

        build_strategy = fluid.BuildStrategy()
        binary = fluid.CompiledProgram(main_program).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)

        device_num = fluid.core.get_cuda_device_count() if use_cuda else 2
        for _ in range(3):
            loss_v, array_v = exe.run(binary,
                                      feed=feed_dict,
                                      fetch_list=[loss, array],
                                      return_merged=False)
            self.assertEqual(np.array(loss_v).shape, (device_num, 1))
            self.assertEqual(
                np.array(array_v[0][0]).shape, (batch_size / device_num, 784))
            self.assertEqual(
                np.array(array_v[0][1]).shape, (batch_size / device_num, 1))
            self.assertEqual(np.array(array_v[0][2]).shape, (1, ))

        for _ in range(3):
            loss_v, array_v = exe.run(binary,
                                      feed=feed_dict,
                                      fetch_list=[loss, array],
                                      return_merged=True)
            self.assertEqual(np.array(loss_v).shape, (device_num, ))
            self.assertEqual(np.array(array_v[0]).shape, (batch_size, 784))
            self.assertEqual(np.array(array_v[1]).shape, (batch_size, 1))
            self.assertTrue(np.allclose(loss_v, array_v[2]))

    def test_fetch_lod_tensor_array(self):
        if fluid.core.is_compiled_with_cuda():
            self.check_network(use_cuda=True)
        self.check_network(use_cuda=False)

    def test_fetch_unmerged_parallel_graph(self):
        fluid.core.globals()['FLAGS_enable_parallel_graph'] = True
        if fluid.core.is_compiled_with_cuda():
            self.check_network(use_cuda=True)
        self.check_network(use_cuda=False)
        fluid.core.globals()['FLAGS_enable_parallel_graph'] = False


if __name__ == '__main__':
    unittest.main()