Unverified commit e39aa70e, authored by lilong12, committed by GitHub

add the support for pipeline (#24560)

* add device_worker for pipeline, test=develop
Parent 0dcb8754
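For context, the user-facing flow this patch targets, condensed from the updated test_pipeline unit test further down. This is an illustrative sketch, not reference usage: the network is collapsed to a single fc layer for brevity, and the random reader, dataset placeholder, and hyperparameter values are simply the ones the test itself uses.

    # Sketch of the intended usage, condensed from the updated unit test in this commit.
    import numpy as np
    import paddle.fluid as fluid

    main_prog, startup_prog = fluid.Program(), fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        # Section 0: data layers live on CPU.
        with fluid.device_guard("cpu"):
            image = fluid.layers.data(
                name="image", shape=[3, 224, 224], dtype="float32")
            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
            data_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image, label], capacity=64, iterable=False)
            fc = fluid.layers.fc(input=image, size=10)
        # Section 1: the compute-heavy part runs on the first GPU.
        with fluid.device_guard("gpu:0"):
            loss = fluid.layers.mean(
                fluid.layers.softmax_with_cross_entropy(logits=fc, label=label))
        opt = fluid.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
        # num_microbatches replaces the old queue_size/sync_steps knobs.
        opt = fluid.optimizer.PipelineOptimizer(opt, num_microbatches=2)
        opt.minimize(loss)

    def train_reader():
        for _ in range(4):
            yield (np.random.random([3, 224, 224]).astype('float32'),
                   np.random.random([1]).astype('int64'))

    data_loader.set_sample_generator(train_reader, batch_size=1)
    # The dataset object is only required by the train_from_dataset interface
    # and carries no actual data, exactly as in the unit test.
    dataset = fluid.DatasetFactory().create_dataset('FileInstantDataset')
    dataset.set_batch_size(1)
    dataset.set_thread(1)
    dataset.set_filelist(['/tmp/tmp_2.txt'])
    dataset.set_use_var([image, label])

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_prog)
    data_loader.start()
    exe.train_from_dataset(main_prog, dataset)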
@@ -51,10 +51,6 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
 class FleetWrapper;
 
-#define SEC_LOG                                                               \
-  VLOG(3) << "[s" << section_id_ << "p" << pipeline_id_ << "t" << thread_id_ \
-          << "]: "
-
 class PullDenseWorker {
  public:
   virtual ~PullDenseWorker() {}
@@ -311,40 +307,9 @@ class DownpourWorkerOpt : public DownpourWorker {
 };
 
 #if defined(PADDLE_WITH_NCCL)
-using ScopeQueue = operators::reader::BlockingQueue<Scope*>;
-
-class SyncFunctor {
- public:
-  SyncFunctor(int rank_id, int rank_num, int sync_steps);
-  virtual ~SyncFunctor() {}
-
-  void SetSyncParam(const std::vector<std::string>& sync_param) {
-    sync_param_ = &sync_param;
-  }
-  void SetNcclCtxMap(platform::NCCLContextMap* nccl_ctx_map) {
-    nccl_ctx_map_ = nccl_ctx_map;
-  }
-  int operator()(Scope* scope);
-  static std::vector<Scope*> pipeline_scopes_;
-  static uint64_t sync_flag_;
-
- protected:
-  const int rank_id_;
-  const int rank_num_;
-  const std::vector<std::string>* sync_param_ = nullptr;
-  platform::NCCLContextMap* nccl_ctx_map_ = nullptr;
-
-  uint64_t sync_signal_;
-  const int sync_steps_;
-  int counter_;
-
-  void Synchronize();
-};
-
 class SectionWorker : public DeviceWorker {
  public:
-  SectionWorker() {}
+  SectionWorker() { local_batch_id_ = 0; }
   ~SectionWorker() override {}
   void Initialize(const TrainerDesc& desc) override;
@@ -360,50 +325,39 @@ class SectionWorker : public DeviceWorker {
   const platform::Place& place() const { return place_; }
 
   void SetSectionIndex(int section_id) { section_id_ = section_id; }
-  void SetDeviceIndex(int tid) override { pipeline_id_ = tid; }
+  void SetDeviceIndex(int tid) override {}
   void SetThreadIndex(int thread_id) { thread_id_ = thread_id; }
-  void SetVarNames(const std::vector<std::string>& in_var_names,
-                   const std::vector<std::string>& out_var_names) {
-    in_var_names_ = &in_var_names;
-    out_var_names_ = &out_var_names;
-  }
-  void SetScopeQueue(ScopeQueue* in_scope_queue, ScopeQueue* out_scope_queue) {
-    in_scope_queue_ = in_scope_queue;
-    out_scope_queue_ = out_scope_queue;
-  }
-  void SetCountMutex(std::mutex* mutex) { worker_count_mutex_ = mutex; }
-  void SetWorkerCount(int* worker_count) { worker_count_ = worker_count; }
-  void SetSectionNum(int section_num) { section_num_ = section_num; }
-  void SetPipelineNum(int pipeline_num) { pipeline_num_ = pipeline_num; }
-  void SetNextSectionPlace(const paddle::platform::Place& place) {
-    next_section_place_ = place;
-  }
-  SyncFunctor* sync_func_ = nullptr;
-  void SetSyncFunctor(SyncFunctor* sync_func) { sync_func_ = sync_func; }
+  void SetMicrobatchNum(int num) { num_microbatches_ = num; }
+  void SetMicrobatchScopes(const std::vector<Scope*>& scope) {
+    microbatch_scopes_ = scope;
+  }
+  void SetMinibatchScope(const Scope* scope) { minibatch_scope_ = scope; }
+  void SetSkipVars(const std::vector<std::string>& skip_vars) {
+    skip_vars_ = skip_vars;
+  }
   static std::atomic<int> cpu_id_;
 
  protected:
   void AutoSetCPUAffinity(bool reuse);
   int section_id_;
-  int pipeline_id_;
-  int section_num_;
-  int pipeline_num_;
   int thread_id_;
-  // This worker will consume scope from in_scope_queue_
-  // and produce scope to out_scope_queue_
-  ScopeQueue* in_scope_queue_ = nullptr;
-  ScopeQueue* out_scope_queue_ = nullptr;
-  const std::vector<std::string>* in_var_names_ = nullptr;
-  const std::vector<std::string>* out_var_names_ = nullptr;
-  std::mutex* worker_count_mutex_ = nullptr;
-  int* worker_count_ = nullptr;
-  paddle::platform::Place next_section_place_;
+  int num_microbatches_;
+  std::vector<Scope*> microbatch_scopes_;
+  std::vector<std::string> skip_vars_;
+  const Scope* minibatch_scope_;
 
   std::vector<std::unique_ptr<OperatorBase>> ops_;
+  static std::mutex thread_mutex;
+  static std::condition_variable thread_condition;
+  static bool threads_completed;
+  std::shared_ptr<framework::ProgramDesc> program_;
+  static uint64_t batch_id_;
+  uint64_t local_batch_id_;
 
   platform::DeviceContext* dev_ctx_ = nullptr;
 };
 #endif
 
 }  // namespace framework
 }  // namespace paddle
@@ -23,8 +23,13 @@ namespace framework {
 
 void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
                                  Dataset* dataset) {
-  pipeline_num_ = trainer_desc.thread_num();
-  VLOG(3) << "pipeline num: " << pipeline_num_;
+  const auto& section_params = trainer_desc.section_param();
+  num_microbatches_ = section_params.num_microbatches();
+  VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_;
+  section_num_ = section_params.section_config_size();
+  VLOG(3) << "Number of program sections: " << section_num_;
+  trainer_desc_ = trainer_desc;
+  start_cpu_core_id_ = section_params.start_cpu_core_id();
 
   SetDataset(dataset);
   ParseDumpConfig(trainer_desc);
@@ -32,96 +37,62 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
   const std::vector<paddle::framework::DataFeed*> readers =
       dataset->GetReaders();
   VLOG(3) << "readers num: " << readers.size();
-  pipeline_config_ = trainer_desc.section_param();
-  scope_queue_size_ = pipeline_config_.queue_size();
-  sync_steps_ = pipeline_config_.sync_steps();
-  section_num_ = pipeline_config_.section_config_size();
-
-  VLOG(3) << "scope_queue_size: " << scope_queue_size_;
-  VLOG(3) << "section num: " << section_num_;
-  VLOG(3) << "sync_steps: " << sync_steps_;
+  int num_readers = readers.size();
+  PADDLE_ENFORCE_EQ(num_readers, 1,
+                    platform::errors::InvalidArgument(
+                        "Number of dataset readers for pipeline "
+                        "must be 1 now, but the value you give is %d.",
+                        num_readers));
+  auto* reader = readers[0];
+  feed_var_names_ = reader->GetUseSlotAlias();
 
   workers_.resize(section_num_);
-  in_var_names_.resize(section_num_);
-  out_var_names_.resize(section_num_);
-  worker_count_.resize(section_num_);
-  worker_count_mutex_.resize(section_num_);
-  param_need_sync_.reset(new std::vector<std::string>);
-
-  int reader_index = 0;
   for (int i = 0; i < section_num_; ++i) {
-    const auto& section_config = pipeline_config_.section_config(i);
-    int concurrency = section_config.concurrency();
-    VLOG(3) << "the thread num of each pipeline in section " << i
-            << " is: " << concurrency;
-    in_var_names_[i].reset(new std::vector<std::string>(
-        section_config.section_in_var_names().begin(),
-        section_config.section_in_var_names().end()));
-    out_var_names_[i].reset(new std::vector<std::string>(
-        section_config.section_out_var_names().begin(),
-        section_config.section_out_var_names().end()));
-    worker_count_[i].resize(pipeline_num_);
-    worker_count_mutex_[i].resize(pipeline_num_);
-    for (int j = 0; j < pipeline_num_; ++j) {
-      worker_count_[i][j] = new int(concurrency);
-      worker_count_mutex_[i][j].reset(new std::mutex);
-    }
+    const auto& section_config = section_params.section_config(i);
     platform::Place place;
-    workers_[i].resize(pipeline_num_);
-    for (int j = 0; j < pipeline_num_; ++j) {
-      workers_[i][j].resize(concurrency);
-
-      switch (section_config.place()) {
-        case SectionConfig::CPUPlace:
-          place = platform::CPUPlace();
-          break;
-        case SectionConfig::CUDAPlace:
-          // Note that one section has at most one GPU place in one pipeline
-          place = platform::CUDAPlace(j);
-          break;
-        case SectionConfig::CUDAPinnedPlace:
-          place = platform::CUDAPinnedPlace();
-          break;
-        default:
-          PADDLE_ENFORCE(false, "Unkown place type in SectionConfig: %d",
-                         section_config.place());
-      }
-
-      for (int k = 0; k < concurrency; ++k) {
-        workers_[i][j][k] = DeviceWorkerFactory::CreateDeviceWorker(
-            trainer_desc.device_worker_name());
-        auto this_worker =
-            std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                workers_[i][j][k]);
-        this_worker->SetSectionIndex(i);
-        this_worker->SetDeviceIndex(j);
-        this_worker->SetThreadIndex(k);
-        this_worker->SetSectionNum(section_num_);
-        this_worker->SetPipelineNum(pipeline_num_);
-        if (i == 0) {
-          this_worker->SetDataFeed(readers[reader_index++]);
-          this_worker->SetReaderPlace(place);
-        }
-        if (i == section_num_ - 1) {
-          this_worker->SetNeedDumpField(need_dump_field_);
-          this_worker->SetNeedDumpParam(need_dump_param_);
-          this_worker->SetDumpFieldVector(dump_fields_);
-          this_worker->SetDumpParamVector(dump_param_);
-        }
-        this_worker->SetPlace(place);
-        this_worker->Initialize(trainer_desc);
-        this_worker->InitRandomDumpConfig(trainer_desc);
-      }
-    }
-  }
-
-  param_need_sync_.reset(
-      new std::vector<std::string>(pipeline_config_.param_need_sync().begin(),
-                                   pipeline_config_.param_need_sync().end()));
-  VLOG(3) << "param_need_sync_ have: ";
-  for (const std::string& name : *param_need_sync_) {
-    VLOG(3) << name;
+    int place_id = section_config.place_id();
+    switch (section_config.place()) {
+      case SectionConfig::CPUPlace:
+        place = platform::CPUPlace();
+        break;
+      case SectionConfig::CUDAPlace:
+        // Note that one section has at most one GPU place in one pipeline
+        PADDLE_ENFORCE_GE(
+            place_id, 0,
+            platform::errors::InvalidArgument(
+                "The place_id value for CUDAPlace shoud be greater "
+                "than or equal to 0, but the value you give is %d.",
+                place_id));
+        place = platform::CUDAPlace(place_id);
+        break;
+      case SectionConfig::CUDAPinnedPlace:
+        place = platform::CUDAPinnedPlace();
+        break;
+      default:
+        PADDLE_ENFORCE_NOT_NULL(nullptr,
+                                platform::errors::InvalidArgument(
+                                    "Unkown place type in SectionConfig: %d",
+                                    section_config.place()));
+    }
+    places_.emplace_back(place);
+    VLOG(3) << "Device worker place: " << place << ", device id: " << place_id
+            << ", section: " << i;
+
+    workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
+        trainer_desc.device_worker_name());
+    auto this_worker =
+        std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
+            workers_[i]);
+    if (i == 0) {
+      // we only set reader for the first section
+      this_worker->SetDataFeed(reader);
+      this_worker->SetReaderPlace(place);
+    }
+    this_worker->SetThreadIndex(i);
+    this_worker->SetSectionIndex(i);
+    this_worker->SetPlace(place);
+    this_worker->Initialize(trainer_desc);
+    this_worker->SetMicrobatchNum(num_microbatches_);
   }
 
   // set debug here
   SetDebug(trainer_desc.debug());
@@ -140,13 +111,7 @@ std::string PipelineTrainer::GetDumpPath(int tid) {
 
 void PipelineTrainer::InitDumpEnv() {
   queue_ = paddle::framework::MakeChannel<std::string>();
-  // Only set dump channel on the last section
-  for (int j = 0; j < pipeline_num_; ++j) {
-    for (size_t k = 0; k < workers_[section_num_ - 1][j].size(); ++k) {
-      workers_[section_num_ - 1][j][k]->SetChannelWriter(queue_.get());
-    }
-  }
-  // TODO(hutuxian): should make it as a config
+  // TODO(sandyhouse): should make it as a config
   dump_thread_num_ = 1;
   for (int i = 0; i < dump_thread_num_; i++) {
     dump_thread_.push_back(
@@ -154,150 +119,105 @@ void PipelineTrainer::InitDumpEnv() {
   }
 }
 
-void PipelineTrainer::InitFirstScopeQueue(ScopeQueue* scope_queue,
-                                          int pipeline_id,
-                                          const ProgramDesc& main_program,
-                                          const Scope& root_scope) {
-  for (int i = 0; i < scope_queue_size_; ++i) {
-    Scope* scope = &pipeline_scopes_[pipeline_id]->NewScope();
-    for (auto& var : main_program.Block(0).AllVars()) {
-      if (!var->Persistable()) {
-        auto* ptr = scope->Var(var->Name());
-        InitializeVariable(ptr, var->GetType());
-      } else {
-        if (section_num_ == 1) {  // Means only one section and it must be
-                                  // CUDAPlace, so copy all persistable vars to
-                                  // pipeline scope
-          const LoDTensor& root_tensor =
-              root_scope.FindVar(var->Name())->Get<LoDTensor>();
-          LoDTensor* gpu_tensor = pipeline_scopes_[pipeline_id]
-                                      ->Var(var->Name())
-                                      ->GetMutable<LoDTensor>();
-          platform::Place place = platform::CUDAPlace(pipeline_id);
-          TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
-                     static_cast<Tensor*>(gpu_tensor));
-        }
-      }
-    }
-    scope_queue->Send(scope);
-  }
-}
-
-void PipelineTrainer::CopyParameters(const Scope& root_scope, int pipeline_id) {
-  for (const std::string& name : *param_need_sync_) {
-    const LoDTensor& root_tensor = root_scope.FindVar(name)->Get<LoDTensor>();
-    // TODO(hutxian): check a new var of the same name is created in
-    // pipeline_scope
-    LoDTensor* gpu_tensor =
-        pipeline_scopes_[pipeline_id]->Var(name)->GetMutable<LoDTensor>();
-    platform::Place place = platform::CUDAPlace(pipeline_id);
-    TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
-               static_cast<Tensor*>(gpu_tensor));
+void PipelineTrainer::CopyParameters(int section_id, int microbatch_id,
+                                     const ProgramDesc& program,
+                                     const platform::Place& place) {
+  auto& global_block = program.Block(0);
+  for (auto& var : global_block.AllVars()) {
+    int is_feed_var =
+        std::count(feed_var_names_.begin(), feed_var_names_.end(), var->Name());
+    if ((var->Persistable() || is_feed_var) && microbatch_id == 0) {
+      if (is_feed_var) {
+        auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
+        VLOG(3) << "data name: " << var->Name() << ", ptr: " << new_ptr;
+        InitializeVariable(new_ptr, var->GetType());
+      } else {
+        auto* ptr = root_scope_->FindVar(var->Name());
+        auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
+        VLOG(3) << "Create persistable var " << var->Name() << " for minibatch "
+                << section_id << ", which pointer is " << new_ptr;
+        InitializeVariable(new_ptr, var->GetType());
+        const LoDTensor& root_tensor = ptr->Get<LoDTensor>();
+        LoDTensor* minibatch_tensor = new_ptr->GetMutable<LoDTensor>();
+        TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
+                   static_cast<Tensor*>(minibatch_tensor));
+      }
+    } else if (!var->Persistable() && !is_feed_var) {
+      auto* ptr =
+          microbatch_scopes_[section_id][microbatch_id]->Var(var->Name());
+      VLOG(3) << "Create variable " << var->Name() << " for section "
+              << section_id << " microbatch " << microbatch_id
+              << ", which pointer is " << ptr;
+      InitializeVariable(ptr, var->GetType());
+    }
   }
 }
 
+void PipelineTrainer::GetSkipVars(int section_id, const ProgramDesc& program) {
+  auto& global_block = program.Block(0);
+  for (auto& op : global_block.AllOps()) {
+    if (op->Type() != "enqueue") {
+      continue;
+    }
+    auto input_arg_names = op->InputArgumentNames();
+    PADDLE_ENFORCE_EQ(input_arg_names.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "Number of input arguments for enqueue op must be 1, "
+                          "but the value is %d.",
+                          input_arg_names.size()));
+    std::string input_arg_name = input_arg_names[0];
+    if (input_arg_name.rfind("@GRAD") != input_arg_name.size() - 5) {
+      skip_vars_[section_id].emplace_back(input_arg_name);
+      VLOG(3) << "add skip var name: " << input_arg_name;
+    }
+  }
+}
+
 void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
                                      const platform::Place& place) {
-  PADDLE_ENFORCE(root_scope_, "Null root_scope pointer");
-  SectionWorker::cpu_id_.store(pipeline_config_.start_cpu_core_id());
-  scope_queues_.resize(section_num_);
-  pipeline_scopes_.resize(pipeline_num_);
-  for (auto& var : main_program.Block(0).AllVars()) {
-    if (var->Persistable()) {
-      persistable_vars_.push_back(var->Name());
-    }
-  }
+  PADDLE_ENFORCE_NOT_NULL(root_scope_,
+                          platform::errors::InvalidArgument(
+                              "root_scope pointer can not be nullptr"));
+  auto start_cpu_id = trainer_desc_.section_param().start_cpu_core_id();
+  SectionWorker::cpu_id_.store(start_cpu_id);
+  minibatch_scopes_.resize(section_num_);
+  microbatch_scopes_.resize(section_num_);
+  skip_vars_.resize(section_num_);
 
   VLOG(3) << "Init ScopeQueues and create all scopes";
   for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      scope_queues_[i].emplace_back(new ScopeQueue(scope_queue_size_));
-      if (i == 0) {
-        pipeline_scopes_[j] = &root_scope_->NewScope();
-        CopyParameters(*root_scope_, j);
-        InitFirstScopeQueue(scope_queues_[0].back().get(), j, main_program,
-                            *root_scope_);
-      }
-    }
+    minibatch_scopes_[i] = &root_scope_->NewScope();
+    std::shared_ptr<framework::ProgramDesc> program;
+    program.reset(new ProgramDesc(
+        trainer_desc_.section_param().section_config(i).program_desc()));
+    microbatch_scopes_[i].resize(num_microbatches_);
+    for (int j = 0; j < num_microbatches_; ++j) {
+      microbatch_scopes_[i][j] = &minibatch_scopes_[i]->NewScope();
+      CopyParameters(i, j, *program, places_[i]);
+    }
+    GetSkipVars(i, *program);
   }
 
   for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-        auto this_worker =
-            std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                workers_[i][j][k]);
-        this_worker->SetRootScope(root_scope_);
-        this_worker->SetCountMutex(worker_count_mutex_[i][j].get());
-        this_worker->SetWorkerCount(worker_count_[i][j]);
-        this_worker->SetScopeQueue(scope_queues_[i][j].get(),
-                                   (i == section_num_ - 1)
-                                       ? scope_queues_[0][j].get()
-                                       : scope_queues_[i + 1][j].get());
-        this_worker->SetVarNames(*in_var_names_[i], *out_var_names_[i]);
-        if (i != section_num_ - 1) {
-          // For data copy in adjacent different place
-          this_worker->SetNextSectionPlace(
-              std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                  workers_[i + 1][j][0])
-                  ->place());
-        }
-      }
-    }
-  }
-  if (pipeline_num_ > 1 && sync_steps_ != -1) {
-    construct_sync_functor();
-  }
-}
-
-void PipelineTrainer::construct_sync_functor() {
-  std::vector<platform::Place> cuda_places;
-  for (int i = 0; i < pipeline_num_; ++i) {
-    cuda_places.emplace_back(platform::CUDAPlace(i));
-  }
-  nccl_ctx_map_.reset(new platform::NCCLContextMap(cuda_places));
-  sync_functors_.resize(pipeline_num_);
-  SyncFunctor::sync_flag_ = 0;
-  SyncFunctor::pipeline_scopes_.resize(0);
-
-  for (int j = 0; j < pipeline_num_; ++j) {
-    SyncFunctor* sync_function = new SyncFunctor(j, pipeline_num_, sync_steps_);
-    sync_function->SetSyncParam(*param_need_sync_);
-    sync_function->SetNcclCtxMap(nccl_ctx_map_.get());
-    SyncFunctor::pipeline_scopes_.push_back(this->pipeline_scopes_[j]);
-    sync_functors_[j].reset(sync_function);
-  }
-  for (int i = section_num_ - 1; i >= 0; --i) {
-    if (SectionConfig::CUDAPlace ==
-        pipeline_config_.section_config(i).place()) {
-      for (int j = 0; j < pipeline_num_; ++j) {
-        for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-          auto this_worker =
-              std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                  workers_[i][j][k]);
-          this_worker->SetSyncFunctor(sync_functors_[j].get());
-        }
-      }
-      break;
-    }
+    auto this_worker =
+        std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
+            workers_[i]);
+    this_worker->SetRootScope(root_scope_);
+    this_worker->SetMinibatchScope(minibatch_scopes_[i]);
+    this_worker->SetMicrobatchScopes(microbatch_scopes_[i]);
+    this_worker->SetSkipVars(skip_vars_[i]);
   }
 }
 
 void PipelineTrainer::Run() {
   VLOG(3) << "Going to run";
   for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-        if (!debug_) {
-          section_threads_.push_back(
-              std::thread(&DeviceWorker::TrainFiles, workers_[i][j][k].get()));
-        } else {
-          section_threads_.push_back(std::thread(
-              &DeviceWorker::TrainFilesWithProfiler, workers_[i][j][k].get()));
-        }
-      }
-    }
+    if (!debug_) {
+      section_threads_.push_back(
+          std::thread(&DeviceWorker::TrainFiles, workers_[i].get()));
+    } else {
+      section_threads_.push_back(std::thread(
+          &DeviceWorker::TrainFilesWithProfiler, workers_[i].get()));
+    }
   }
 }
@@ -309,18 +229,31 @@ void PipelineTrainer::Finalize() {
   if (need_dump_field_) {
     FinalizeDumpEnv();
   }
-  for (const auto& var : persistable_vars_) {
-    auto* root_tensor = root_scope_->Var(var)->GetMutable<LoDTensor>();
-    // TODO(hutuxian): Add a final all-reduce?
-    const auto& thread_tensor =
-        pipeline_scopes_[0]->FindVar(var)->Get<LoDTensor>();
-    TensorCopySync(thread_tensor, platform::CPUPlace(), root_tensor);
+  VLOG(3) << "copying back parameters. ";
+  for (int i = 0; i < section_num_; ++i) {
+    std::shared_ptr<framework::ProgramDesc> program;
+    program.reset(new ProgramDesc(
+        trainer_desc_.section_param().section_config(i).program_desc()));
+    for (int j = 0; j < num_microbatches_; ++j) {
+      auto& global_block = program->Block(0);
+      for (auto& var : global_block.AllVars()) {
+        if (var->Persistable()) {
+          auto* ptr = root_scope_->FindVar(var->Name());
+          LoDTensor* root_tensor = ptr->GetMutable<LoDTensor>();
+          auto* minibatch_ptr = minibatch_scopes_[i]->Var(var->Name());
+          const LoDTensor& minibatch_tensor = minibatch_ptr->Get<LoDTensor>();
+          TensorCopy(*static_cast<const Tensor*>(&minibatch_tensor), places_[0],
+                     static_cast<Tensor*>(root_tensor));
+          VLOG(4) << "Copy persitable var " << var->Name() << " to root scope";
+        }
+      }
+    }
   }
   root_scope_->DropKids();
 }
 
 Scope* PipelineTrainer::GetWorkerScope(int thread_id) {
-  return pipeline_scopes_[thread_id];
+  return microbatch_scopes_[thread_id][0];
 }
 }  // end namespace framework
......
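The scope layout built by the new InitTrainerEnv and CopyParameters above, restated as a small illustrative structure: the root scope keeps the persistable parameters, each program section gets one minibatch scope holding its parameter and feed-var copies, and each minibatch scope holds num_microbatches microbatch scopes for the non-persistable intermediates. The Python class below is a plain stand-in for paddle::framework::Scope, used only to visualize the hierarchy; it is not Paddle API.

    # Illustrative only: mimics the scope hierarchy that InitTrainerEnv creates.
    class Scope:
        def __init__(self, parent=None):
            self.vars, self.kids, self.parent = {}, [], parent

        def new_scope(self):
            kid = Scope(self)
            self.kids.append(kid)
            return kid

    section_num, num_microbatches = 2, 4
    root_scope = Scope()          # persistable parameters live here
    minibatch_scopes = []         # indexed as [section_id]
    microbatch_scopes = []        # indexed as [section_id][microbatch_id]
    for i in range(section_num):
        minibatch_scopes.append(root_scope.new_scope())
        microbatch_scopes.append(
            [minibatch_scopes[i].new_scope() for _ in range(num_microbatches)])
    # GetWorkerScope(section_id) in the new code returns microbatch_scopes[section_id][0].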
@@ -137,49 +137,31 @@ class PipelineTrainer : public TrainerBase {
   virtual Scope* GetWorkerScope(int thread_id);
   void InitDumpEnv() override;
   virtual std::string GetDumpPath(int tid);
+  void GetSkipVars(int section_id, const ProgramDesc& main_program);
 
  protected:
   int section_num_;
-  int pipeline_num_;
-  int scope_queue_size_;
-  int sync_steps_;
-
-  SectionWorkerParameter pipeline_config_;
-
-  // The in/output var names for each section
-  std::vector<std::unique_ptr<std::vector<std::string>>> in_var_names_;
-  std::vector<std::unique_ptr<std::vector<std::string>>> out_var_names_;
-
-  // Counter for the running thread
-  std::vector<std::vector<int*>> worker_count_;
-  std::vector<std::vector<std::unique_ptr<std::mutex>>> worker_count_mutex_;
-
-  // worker: [section_id][pipeline_id][thread_id]
-  std::vector<std::vector<
-      std::vector<std::shared_ptr<paddle::framework::DeviceWorker>>>>
-      workers_;
+  int num_microbatches_;
+  int start_cpu_core_id_;
+  std::vector<std::string> feed_var_names_;
+  std::vector<platform::Place> places_;
+  std::vector<std::vector<std::string>> skip_vars_;
+  TrainerDesc trainer_desc_;
+
   std::vector<std::thread> section_threads_;
-
-  // We use scope to maintain context info, and scopes
-  // will be deliverd between different sections.
-  std::vector<std::vector<std::unique_ptr<ScopeQueue>>> scope_queues_;
-  std::vector<Scope*> pipeline_scopes_;
-
-  // The parameters that should be syncronized between different cards using
-  // nccl all-reduce
-  std::shared_ptr<std::vector<std::string>> param_need_sync_;
-  std::vector<std::string> persistable_vars_;
-  std::vector<std::unique_ptr<SyncFunctor>> sync_functors_;
-  std::shared_ptr<platform::NCCLContextMap> nccl_ctx_map_;
-
-  std::vector<DataFeed*> readers_;
-
-  void InitFirstScopeQueue(ScopeQueue* scope_queue, int pipeline_id,
-                           const ProgramDesc& main_program,
-                           const Scope& root_scope);
-  void CopyParameters(const Scope& root_scope, int pipeline_id);
-  void construct_sync_functor();
+  // worker: [section_id]
+  std::vector<std::shared_ptr<paddle::framework::DeviceWorker>> workers_;
+  // minibatch_scopes_: [section_id]
+  std::vector<Scope*> minibatch_scopes_;
+  // microbatch_scopes_: [section_id][microbatch_id]
+  std::vector<std::vector<Scope*>> microbatch_scopes_;
+
+  void CopyParameters(int section_id, int microbatch_id,
+                      const ProgramDesc& program, const platform::Place& place);
+  bool isPersistableVarGrad(std::string name);
+  bool isPersistable(VarDesc* var);
 };
 #endif
 
 }  // namespace framework
 }  // namespace paddle
@@ -83,6 +83,7 @@ message SectionWorkerParameter {
   optional int64 sync_steps = 3 [ default = 1 ];
   optional int32 start_cpu_core_id = 4 [ default = 1 ];
   repeated string param_need_sync = 5;
+  optional int32 num_microbatches = 6;
 }
 
 message SectionConfig {
@@ -99,6 +100,7 @@ message SectionConfig {
   optional int32 concurrency = 3 [ default = 1 ];
   repeated string section_in_var_names = 4;
   repeated string section_out_var_names = 5;
+  optional int32 place_id = 6 [ default = -1 ];
 }
 
 message FetchConfig {
......
@@ -403,11 +403,8 @@ class Section(DeviceWorker):
         trainer_desc.device_worker_name = "SectionWorker"
         pipeline_opt = self._program._pipeline_opt
         section_param = trainer_desc.section_param
-        section_param.queue_size = pipeline_opt["queue_size"]
-        section_param.sync_steps = pipeline_opt["sync_steps"]
+        section_param.num_microbatches = pipeline_opt["num_microbatches"]
         section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"]
-        for e in pipeline_opt["param_need_sync"]:
-            section_param.param_need_sync.append(e)
         for i, program in enumerate(pipeline_opt["section_program_list"]):
             cfg = section_param.section_config.add()
             cfg.program_desc.ParseFromString(program["program"]._get_desc()
@@ -415,6 +412,7 @@ class Section(DeviceWorker):
             # TODO: why does not work
             # cfg.program_desc.CopyFrom(program.program._get_desc())
             place = pipeline_opt["place_list"][i]
+            place_id = pipeline_opt["place_id_list"][i]
             if isinstance(place, core.CPUPlace):
                 cfg.place = cfg.CPUPlace
             elif isinstance(place, core.CUDAPlace):
@@ -425,12 +423,7 @@ class Section(DeviceWorker):
                 raise NotImplementedError(
                     "SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
                 )
-
-            cfg.concurrency = pipeline_opt["concurrency_list"][i]
-            for var in program["input_set"]:
-                cfg.section_in_var_names.append(var)
-            for var in program["output_set"]:
-                cfg.section_out_var_names.append(var)
+            cfg.place_id = place_id
 
 
 class DeviceWorkerFactory(object):
......
@@ -4474,7 +4474,7 @@ class PipelineOptimizer(object):
             "place_list": place_list,
             "place_id_list": place_id_list,
             "sync_steps": -1,
-            "queue_size": self._num_microbatches,
+            "num_microbatches": self._num_microbatches,
             "start_cpu_core_id": self._start_cpu_core_id,
         }
         return optimize_ops, params_grads, program_list
......
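Putting the Python-side pieces together: after this change, PipelineOptimizer.minimize() hands the trainer a _pipeline_opt dict roughly like the sketch below, and the Section device worker above copies num_microbatches, start_cpu_core_id, and each section's place and place_id into the SectionWorkerParameter/SectionConfig fields added in the proto change. Only keys visible in this diff are shown; the concrete values and the placeholder Program objects are illustrative, not taken from a real run.

    # Hypothetical snapshot of program._pipeline_opt after minimize(); values are placeholders.
    import paddle.fluid as fluid

    section_program_list = [{"program": fluid.Program()},   # per-section sub-program
                            {"program": fluid.Program()}]
    pipeline_opt = {
        "section_program_list": section_program_list,
        "place_list": [fluid.CPUPlace(), fluid.CUDAPlace(0)],  # one place per section
        "place_id_list": [-1, 0],       # -1 for CPU sections, CUDA device id otherwise
        "num_microbatches": 2,          # new key, replaces the removed "queue_size"
        "sync_steps": -1,               # still emitted, no longer read by Section
        "start_cpu_core_id": 0,
    }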
@@ -100,7 +100,7 @@ def build_network(input, layers=50, class_dim=1000):
         pool_type='max')
     if layers >= 50:
         for block in range(len(depth)):
-            with fluid.device_guard("cpu"):
+            with fluid.device_guard("gpu:0"):
                 for i in range(depth[block]):
                     conv = bottleneck_block(
                         input=conv,
@@ -118,7 +118,7 @@ def build_network(input, layers=50, class_dim=1000):
                 initializer=fluid.initializer.Uniform(-stdv, stdv)))
     else:
         for block in range(len(depth)):
-            with fluid.device_guard("cpu"):
+            with fluid.device_guard("gpu:0"):
                 for i in range(depth[block]):
                     conv = basic_block(
                         input=conv,
@@ -140,38 +140,68 @@ def build_network(input, layers=50, class_dim=1000):
 
 class TestPipeline(unittest.TestCase):
     """ TestCases for Pipeline Training. """
 
+    def _run(self, debug):
+        main_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.device_guard("cpu"):
+                image = fluid.layers.data(
+                    name="image", shape=[3, 224, 224], dtype="float32")
+                label = fluid.layers.data(
+                    name="label", shape=[1], dtype="int64")
+                data_loader = fluid.io.DataLoader.from_generator(
+                    feed_list=[image, label],
+                    capacity=64,
+                    use_double_buffer=True,
+                    iterable=False)
+                fc = build_network(image, layers=50)
+            with fluid.device_guard("gpu:0"):
+                out, prob = fluid.layers.softmax_with_cross_entropy(
+                    logits=fc, label=label, return_softmax=True)
+                loss = fluid.layers.mean(out)
+                acc_top1 = fluid.layers.accuracy(input=prob, label=label, k=1)
+                acc_top5 = fluid.layers.accuracy(input=prob, label=label, k=5)
+
+            base_lr = 0.1
+            passes = [30, 60, 80, 90]
+            total_images = 1281167
+            steps_per_pass = total_images // 128
+            bd = [steps_per_pass * p for p in passes]
+            lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+            lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+            optimizer = fluid.optimizer.MomentumOptimizer(
+                lr_val,
+                momentum=0.9,
+                regularization=fluid.regularizer.L2Decay(1e-4))
+            optimizer = fluid.optimizer.PipelineOptimizer(
+                optimizer, num_microbatches=2)
+            optimizer.minimize(loss)
+
+        def train_reader():
+            for _ in range(4):
+                img = np.random.random(size=[3, 224, 224]).astype('float32')
+                label = np.random.random(size=[1]).astype('int64')
+                yield img, label
+
+        data_loader.set_sample_generator(train_reader, batch_size=1)
+        place = fluid.CPUPlace()
+        # The following dataset is only used for the
+        # interface 'train_from_dataset'.
+        # And it has no actual meaning.
+        dataset = fluid.DatasetFactory().create_dataset('FileInstantDataset')
+        dataset.set_batch_size(1)
+        dataset.set_thread(1)
+        dataset.set_filelist(['/tmp/tmp_2.txt'])
+        dataset.set_use_var([image, label])
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        data_loader.start()
+        exe.train_from_dataset(main_prog, dataset, debug=debug)
+
     def test_pipeline(self):
-        with fluid.device_guard("cpu"):
-            image = fluid.layers.data(
-                name="image", shape=[3, 224, 224], dtype="float32")
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=[image, label],
-                capacity=64,
-                use_double_buffer=True,
-                iterable=False)
-            fc = build_network(image, layers=50)
-        with fluid.device_guard("gpu:0"):
-            out, prob = fluid.layers.softmax_with_cross_entropy(
-                logits=fc, label=label, return_softmax=True)
-            loss = fluid.layers.mean(out)
-            acc_top1 = fluid.layers.accuracy(input=prob, label=label, k=1)
-            acc_top5 = fluid.layers.accuracy(input=prob, label=label, k=5)
-
-        base_lr = 0.1
-        passes = [30, 60, 80, 90]
-        total_images = 1281167
-        steps_per_pass = total_images // 128
-        bd = [steps_per_pass * p for p in passes]
-        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-        lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
-        optimizer = fluid.optimizer.Momentum(
-            lr_val,
-            momentum=0.9,
-            regularization=fluid.regularizer.L2Decay(1e-4))
-        optimizer = fluid.optimizer.PipelineOptimizer(
-            optimizer, num_microbatches=2)
-        optimizer.minimize(loss)
+        self._run(False)
+        self._run(True)
 
     def test_pipeline_noneoptimizer(self):
         with fluid.device_guard("gpu:0"):
......