Commit f71543ee authored by S sandyhouse

Merge branch 'add_timeline' into pipeline_exe_run

......@@ -455,6 +455,7 @@ class SectionWorker : public DeviceWorker {
std::vector<std::unique_ptr<OperatorBase>> ops_;
static std::mutex thread_mutex;
static std::mutex cout_mutex;
static std::condition_variable thread_condition;
static bool threads_completed;
std::shared_ptr<framework::ProgramDesc> program_;
......
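The new static `thread_condition`, together with `thread_mutex` and `threads_completed` above, is what lets section thread 0 publish minibatches to the other section threads and announce EOF. A minimal, self-contained sketch of that handshake, with free functions standing in for the SectionWorker roles (the real code interleaves this with the forward/backward/update passes shown below):

```cpp
#include <condition_variable>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex thread_mutex;
std::condition_variable thread_condition;
bool threads_completed = false;
uint64_t batch_id = 0;

void SectionZero(int num_batches) {   // the thread_id_ == 0 role: producer
  for (int i = 0; i < num_batches; ++i) {
    std::unique_lock<std::mutex> lk(thread_mutex);
    batch_id += 1;                    // publish one more minibatch
    thread_condition.notify_all();
  }
  std::unique_lock<std::mutex> lk(thread_mutex);
  threads_completed = true;           // EOF: tell the downstream sections
  thread_condition.notify_all();
}

void SectionN() {                     // any thread_id_ != 0 role: consumer
  uint64_t local_batch_id = 0;
  while (true) {
    std::unique_lock<std::mutex> lk(thread_mutex);
    thread_condition.wait(
        lk, [&] { return local_batch_id < batch_id || threads_completed; });
    if (local_batch_id == batch_id) return;  // EOF reached and fully drained
    local_batch_id += 1;              // consume: run fwd/bwd/update here
    std::cout << "consumed batch " << local_batch_id << "\n";
  }
}

int main() {
  std::thread t1(SectionN);
  std::thread t0(SectionZero, 3);
  t0.join();
  t1.join();
}
```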
......@@ -13,6 +13,7 @@
// limitations under the License.
#if defined(PADDLE_WITH_NCCL)
#include <map>
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h"
......@@ -44,7 +45,6 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
"must be 1 now, but the value you give is %d.",
num_readers));
auto* reader = readers[0];
feed_var_names_ = reader->GetUseSlotAlias();
workers_.resize(section_num_);
for (int i = 0; i < section_num_; ++i) {
......@@ -123,26 +123,36 @@ void PipelineTrainer::CopyParameters(int section_id, int microbatch_id,
const ProgramDesc& program,
const platform::Place& place) {
auto& global_block = program.Block(0);
std::map<std::string, int> param_map;
for (auto& var : global_block.AllVars()) {
int is_feed_var =
std::count(feed_var_names_.begin(), feed_var_names_.end(), var->Name());
if ((var->Persistable() || is_feed_var) && microbatch_id == 0) {
if (is_feed_var) {
auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
VLOG(3) << "data name: " << var->Name() << ", ptr: " << new_ptr;
InitializeVariable(new_ptr, var->GetType());
} else {
if (var->Persistable()) {
param_map[var->Name()] = 1;
}
}
for (auto& var : global_block.AllVars()) {
bool is_param_grad = false;
size_t pos = 0;
if ((pos = var->Name().find(kGradVarSuffix)) != std::string::npos) {
auto prefix_name = var->Name().substr(0, pos);
if (param_map.find(prefix_name) != param_map.end()) {
is_param_grad = true;
}
}
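The feed-variable special case is gone: CopyParameters now makes two passes, first recording every persistable parameter in `param_map`, then flagging a variable as a parameter gradient when its name is a recorded parameter followed by `kGradVarSuffix`. A standalone sketch of that name test, assuming the suffix value `"@GRAD"` (which is what Paddle defines):

```cpp
#include <map>
#include <string>

const char kGradVarSuffix[] = "@GRAD";  // Paddle's gradient-name suffix

// True when `name` looks like `<param>@GRAD` for a recorded parameter.
bool IsParamGrad(const std::string& name,
                 const std::map<std::string, int>& param_map) {
  size_t pos = name.find(kGradVarSuffix);
  if (pos == std::string::npos) return false;
  return param_map.count(name.substr(0, pos)) > 0;
}
// With param_map = {{"fc_0.w_0", 1}}:
//   IsParamGrad("fc_0.w_0@GRAD", param_map) -> true
//   IsParamGrad("tmp_3@GRAD", param_map)    -> false
```

A detected gradient then gets its variable created in the minibatch scope just as a parameter does, but the `continue` below skips the tensor copy for it, since there is nothing meaningful to copy before training.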
VLOG(3) << "Var name: " << var->Name();
if ((var->Persistable() || is_param_grad) && microbatch_id == 0) {
auto* ptr = root_scope_->FindVar(var->Name());
auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
VLOG(3) << "Create persistable var " << var->Name() << " for minibatch "
<< section_id << ", which pointer is " << new_ptr;
InitializeVariable(new_ptr, var->GetType());
if (is_param_grad) {
continue;
}
const LoDTensor& root_tensor = ptr->Get<LoDTensor>();
LoDTensor* minibatch_tensor = new_ptr->GetMutable<LoDTensor>();
TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
static_cast<Tensor*>(minibatch_tensor));
}
} else if (!var->Persistable() && !is_feed_var) {
} else if (!var->Persistable() && !is_param_grad) {
auto* ptr =
microbatch_scopes_[section_id][microbatch_id]->Var(var->Name());
VLOG(3) << "Create variable " << var->Name() << " for section "
......@@ -244,7 +254,7 @@ void PipelineTrainer::Finalize() {
const LoDTensor& minibatch_tensor = minibatch_ptr->Get<LoDTensor>();
TensorCopy(*static_cast<const Tensor*>(&minibatch_tensor), places_[0],
static_cast<Tensor*>(root_tensor));
VLOG(4) << "Copy persitable var " << var->Name() << " to root scope";
VLOG(3) << "Copy persitable var " << var->Name() << " to root scope";
}
}
}
......
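Together with the Finalize hunk just above, CopyParameters forms a round trip: persistable parameters are copied root scope → minibatch scope before training and minibatch scope → root scope afterwards, so the trained values are visible wherever the root scope is consumed. A toy version of that round trip with plain containers standing in for Scope and LoDTensor (a hypothetical simplification; the real code goes through TensorCopy):

```cpp
#include <map>
#include <string>
#include <vector>

using Tensor = std::vector<float>;            // stand-in for LoDTensor
using Scope = std::map<std::string, Tensor>;  // stand-in for framework::Scope

int main() {
  Scope root = {{"fc_0.w_0", {1.f, 2.f}}};    // hypothetical parameter
  Scope minibatch;
  minibatch["fc_0.w_0"] = root["fc_0.w_0"];   // CopyParameters: copy in
  minibatch["fc_0.w_0"][0] = 3.f;             // training updates the copy
  root["fc_0.w_0"] = minibatch["fc_0.w_0"];   // Finalize: copy back out
  return root["fc_0.w_0"][0] == 3.f ? 0 : 1;  // trained value reached root
}
```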
......@@ -32,6 +32,7 @@ namespace framework {
std::atomic<int> SectionWorker::cpu_id_(0);
std::mutex SectionWorker::thread_mutex;
std::mutex SectionWorker::cout_mutex;
std::condition_variable SectionWorker::thread_condition;
bool SectionWorker::threads_completed = false;
uint64_t SectionWorker::batch_id_(0);
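These lines are the out-of-class definitions required for the static members declared in device_worker.h; the declarations alone would compile but fail at link time. In outline (pre-C++17 style; with C++17 the members could be `inline static` and live entirely in the header):

```cpp
#include <condition_variable>
#include <mutex>

// header: declarations only
struct SectionWorker {
  static std::mutex thread_mutex;
  static std::condition_variable thread_condition;
  static bool threads_completed;
};

// .cc file: exactly one definition per static data member
std::mutex SectionWorker::thread_mutex;
std::condition_variable SectionWorker::thread_condition;
bool SectionWorker::threads_completed = false;

int main() { return SectionWorker::threads_completed ? 1 : 0; }
```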
......@@ -103,9 +104,14 @@ void SectionWorker::TrainFiles() {
}
#endif
platform::Timer batch_timer;
if (thread_id_ == 0) {
while (true) {
// Start a minibatch.
// the number of microbatches actually run
int real_microbatch_num = 0;
batch_timer.Start();
for (int i = 0; i < num_microbatches_; ++i) {
try {
for (auto& op : ops_) {
......@@ -137,17 +143,21 @@ void SectionWorker::TrainFiles() {
VLOG(3) << "called notify all";
thread_condition.notify_all();
VLOG(0) << "EOF encountered";
return;
break;
}
if (i == 0) {
{
real_microbatch_num += 1;
batch_id_ += 1;
VLOG(3) << "called notify all";
std::unique_lock<std::mutex> lk(thread_mutex);
batch_id_ += 1;
thread_condition.notify_all();
}
}
dev_ctx_->Wait();
VLOG(0) << "real_microbatch_num for thread 0 " << real_microbatch_num;
// backward pass
for (int i = 0; i < num_microbatches_; ++i) {
for (int i = 0; i < real_microbatch_num; ++i) {
for (auto& op : ops_) {
int op_role = op->Attr<int>(std::string("op_role"));
if (op_role == static_cast<int>(OpRole::kBackward) ||
......@@ -163,6 +173,12 @@ void SectionWorker::TrainFiles() {
}
}
}
dev_ctx_->Wait();
if (real_microbatch_num == 0) {
batch_timer.Pause();
VLOG(0) << "batch time: " << batch_timer.ElapsedUS();
return;
}
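The switch from `num_microbatches_` to `real_microbatch_num` is the point of this hunk: when the reader hits EOF partway through a minibatch, only microbatches whose forward pass actually ran get a backward pass, and the optimizer update is skipped when none ran. The control flow, reduced to a sketch (`RunForward`, `RunBackward`, and `RunOptimize` are hypothetical stand-ins for the op loops above):

```cpp
#include <iostream>

bool RunForward(int i) { return i < 2; }  // pretend EOF after 2 microbatches
void RunBackward(int i) { std::cout << "bwd " << i << "\n"; }
void RunOptimize() { std::cout << "opt\n"; }

int main() {
  const int num_microbatches = 4;
  int real_microbatch_num = 0;  // microbatches whose forward pass ran
  for (int i = 0; i < num_microbatches; ++i) {
    if (!RunForward(i)) break;  // EOF: stop scheduling new microbatches
    ++real_microbatch_num;
  }
  for (int i = 0; i < real_microbatch_num; ++i) RunBackward(i);
  if (real_microbatch_num > 0) RunOptimize();  // no update on an empty batch
}
```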
// update pass
for (auto& op : ops_) {
int op_role = op->Attr<int>(std::string("op_role"));
......@@ -177,9 +193,21 @@ void SectionWorker::TrainFiles() {
}
}
dev_ctx_->Wait();
batch_timer.Pause();
VLOG(0) << "batch time: " << batch_timer.ElapsedUS();
{
std::unique_lock<std::mutex> lk(thread_mutex);
if (threads_completed) {
return;
}
}
}
} else {
while (true) {
// forward pass:
bool local_completed = false;
int real_microbatch_num = 0;
for (int i = 0; i < num_microbatches_; ++i) {
{
PADDLE_ENFORCE_LE(
local_batch_id_, batch_id_,
......@@ -197,13 +225,13 @@ void SectionWorker::TrainFiles() {
VLOG(3) << "thread " << thread_id_ << " completed.";
lk.unlock();
threads_completed = false;
return;
local_completed = true;
break;
}
lk.unlock();
local_batch_id_ += 1;
real_microbatch_num += 1;
}
// forward pass:
for (int i = 0; i < num_microbatches_; ++i) {
for (auto& op : ops_) {
int op_role = op->Attr<int>(std::string("op_role"));
// We run op with op_role = kLRSched only for the first microbatch
......@@ -227,8 +255,9 @@ void SectionWorker::TrainFiles() {
}
}
}
dev_ctx_->Wait();
// backward pass
for (int i = 0; i < num_microbatches_; ++i) {
for (int i = 0; i < real_microbatch_num; ++i) {
for (auto& op : ops_) {
int op_role = op->Attr<int>(std::string("op_role"));
if (op_role == static_cast<int>(OpRole::kBackward) ||
......@@ -244,7 +273,11 @@ void SectionWorker::TrainFiles() {
}
}
}
dev_ctx_->Wait();
// update pass
if (real_microbatch_num == 0) {
return;
}
for (auto& op : ops_) {
int op_role = op->Attr<int>(std::string("op_role"));
if (op_role == static_cast<int>(OpRole::kOptimize)) {
......@@ -258,6 +291,9 @@ void SectionWorker::TrainFiles() {
}
}
dev_ctx_->Wait();
if (local_completed) {
return;
}
}
}
}
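Throughout both branches, ops are dispatched by their `op_role` attribute: the forward pass runs kForward ops (possibly fused with kLoss), the backward pass kBackward ones, and the update pass kOptimize ones. A sketch of the backward-pass test, with flag values taken from Paddle's op_proto_maker.h of this era (treat the exact numbers as illustrative):

```cpp
#include <cstdio>

enum class OpRole {  // bit flags, as in paddle/fluid/framework/op_proto_maker.h
  kForward = 0x0000, kBackward = 0x0001, kOptimize = 0x0002,
  kRPC = 0x0004, kDist = 0x0008, kLRSched = 0x0010,
  kLoss = 0x0100, kNotSpecified = 0x1000,
};

// Run an op in the backward pass when it is a plain backward op or a
// backward op fused with the loss role.
bool IsBackwardOp(int op_role) {
  return op_role == static_cast<int>(OpRole::kBackward) ||
         op_role == (static_cast<int>(OpRole::kBackward) |
                     static_cast<int>(OpRole::kLoss));
}

int main() { std::printf("%d\n", IsBackwardOp(0x0101)); }  // prints 1
```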
......@@ -307,14 +343,20 @@ void SectionWorker::TrainFilesWithProfiler() {
#endif
if (thread_id_ == 0) {
struct timeval start;
struct timeval end;
struct timeval micro_start;
struct timeval micro_end;
while (true) {
// Start a minibatch.
// int batch_size = 0;
batch_timer.Start();
int real_microbatch_num = 0;
for (int i = 0; i < num_microbatches_; ++i) {
try {
int op_idx = 0;
gettimeofday(&micro_start, NULL);
for (auto& op : ops_) {
gettimeofday(&start, NULL);
int op_role = op->Attr<int>(std::string("op_role"));
// We run op with op_role = kLRSched only for the first microbatch
// to avoid increasing the @LR_DECAY_STEP@ multiple times.
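A sketch of the rule this comment describes: learning-rate scheduling ops run only on the first microbatch, so the decay step advances once per minibatch rather than once per microbatch (flag values are illustrative stand-ins for Paddle's OpRole):

```cpp
#include <cstdio>

const int kForward = 0x0000, kLRSched = 0x0010;  // illustrative values

bool ShouldRunInForwardPass(int op_role, int microbatch_idx) {
  return op_role == kForward || (op_role == kLRSched && microbatch_idx == 0);
}

int main() {
  std::printf("%d %d\n", ShouldRunInForwardPass(kLRSched, 0),   // 1
                         ShouldRunInForwardPass(kLRSched, 1));  // 0
}
```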
......@@ -335,7 +377,9 @@ void SectionWorker::TrainFilesWithProfiler() {
DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
unused_vars_, gc.get());
}
cudaDeviceSynchronize();
timeline.Pause();
gettimeofday(&end, NULL);
auto time = timeline.ElapsedUS();
op_total_time[op_idx] += time;
if (time > op_max_time[op_idx]) {
......@@ -346,9 +390,30 @@ void SectionWorker::TrainFilesWithProfiler() {
}
op_count[op_idx] += 1;
op_total_time[op_idx] += time;
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "::FWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:SCOPE[" << i << "]:OP[" << op->Type()
<< "]:START[" << start.tv_sec * 1e6 + start.tv_usec
<< "]:END[" << end.tv_sec * 1e6 + end.tv_usec << "]"
<< std::endl;
}
}
op_idx++;
}
gettimeofday(&micro_end, NULL);
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "!!FWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:START["
<< micro_start.tv_sec * 1e6 + micro_start.tv_usec
<< "]:END[" << micro_end.tv_sec * 1e6 + micro_end.tv_usec
<< "]" << std::endl;
}
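These new cout blocks emit one machine-parsable line per op (`::FWD`) and one per microbatch (`!!FWD`), keyed by batch, section, scope, and op type, with start and end wall-clock times in microseconds; `std::fixed` with precision 0 is needed because `tv_sec * 1e6 + tv_usec` is evaluated in double. The timestamp arithmetic, as a sketch:

```cpp
#include <sys/time.h>
#include <cstdint>
#include <cstdio>

// Microsecond wall-clock timestamp matching the tv_sec * 1e6 + tv_usec
// arithmetic in the log lines above (integer here, double there).
static uint64_t NowUs() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return static_cast<uint64_t>(tv.tv_sec) * 1000000ULL + tv.tv_usec;
}

int main() {
  uint64_t start = NowUs();
  // ... the op's Run() would go here ...
  uint64_t end = NowUs();
  std::printf("::FWD:B[0]:SEC[0]:SCOPE[0]:OP[demo]:START[%llu]:END[%llu]\n",
              (unsigned long long)start, (unsigned long long)end);
}
```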
} catch (platform::EOFException&) {
std::unique_lock<std::mutex> lk(thread_mutex);
threads_completed = true;
......@@ -363,19 +428,23 @@ void SectionWorker::TrainFilesWithProfiler() {
<< ", mean_time: " << op_total_time[i] / op_count[i];
}
VLOG(0) << "================================";
return;
break;
}
if (i == 0) {
{
VLOG(3) << "called notify all";
std::unique_lock<std::mutex> lk(thread_mutex);
real_microbatch_num += 1;
batch_id_ += 1;
thread_condition.notify_all();
}
}
dev_ctx_->Wait();
// backward pass
for (int i = 0; i < num_microbatches_; ++i) {
for (int i = 0; i < real_microbatch_num; ++i) {
int op_idx = 0;
gettimeofday(&micro_start, NULL);
for (auto& op : ops_) {
gettimeofday(&start, NULL);
int op_role = op->Attr<int>(std::string("op_role"));
if (op_role == static_cast<int>(OpRole::kBackward) ||
op_role == (static_cast<int>(OpRole::kBackward) |
......@@ -388,6 +457,8 @@ void SectionWorker::TrainFilesWithProfiler() {
DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
unused_vars_, gc.get());
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
timeline.Pause();
auto time = timeline.ElapsedUS();
op_total_time[op_idx] += time;
......@@ -399,13 +470,42 @@ void SectionWorker::TrainFilesWithProfiler() {
}
op_count[op_idx] += 1;
op_total_time[op_idx] += time;
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "::BWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:SCOPE[" << i << "]:OP[" << op->Type()
<< "]:START[" << start.tv_sec * 1e6 + start.tv_usec
<< "]:END[" << end.tv_sec * 1e6 + end.tv_usec << "]"
<< std::endl;
}
}
op_idx++;
}
gettimeofday(&micro_end, NULL);
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "!!BWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:START["
<< micro_start.tv_sec * 1e6 + micro_start.tv_usec
<< "]:END[" << micro_end.tv_sec * 1e6 + micro_end.tv_usec
<< "]" << std::endl;
}
}
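The `cudaDeviceSynchronize()` added before each `gettimeofday(&end, ...)` matters because kernel launches return to the host immediately: without the sync, the end timestamp would be taken while the op's GPU work is still running, and per-op times would mostly measure launch overhead. A minimal CUDA illustration (assumes a CUDA toolchain; `busy_kernel` stands in for an op's kernels):

```cpp
#include <cuda_runtime.h>
#include <sys/time.h>
#include <cstdio>

__global__ void busy_kernel(float* x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) for (int k = 0; k < 1000; ++k) x[i] += 1.0f;
}

int main() {
  const int n = 1 << 20;
  float* x;
  cudaMalloc(&x, n * sizeof(float));
  struct timeval start, end;
  gettimeofday(&start, nullptr);
  busy_kernel<<<(n + 255) / 256, 256>>>(x, n);
  cudaDeviceSynchronize();  // without this, `end` closes too early
  gettimeofday(&end, nullptr);
  std::printf("kernel us: %ld\n",
              (long)((end.tv_sec - start.tv_sec) * 1000000L +
                     (end.tv_usec - start.tv_usec)));
  cudaFree(x);
}
```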
dev_ctx_->Wait();
if (real_microbatch_num == 0) {
batch_timer.Pause();
VLOG(0) << "batch time: " << batch_timer.ElapsedUS();
return;
}
// update pass
int op_idx = 0;
gettimeofday(&micro_start, NULL);
for (auto& op : ops_) {
gettimeofday(&start, NULL);
int op_role = op->Attr<int>(std::string("op_role"));
if (op_role == static_cast<int>(OpRole::kOptimize)) {
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
......@@ -416,6 +516,8 @@ void SectionWorker::TrainFilesWithProfiler() {
DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1],
op.get(), unused_vars_, gc.get());
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
timeline.Pause();
auto time = timeline.ElapsedUS();
op_total_time[op_idx] += time;
......@@ -427,15 +529,53 @@ void SectionWorker::TrainFilesWithProfiler() {
}
op_count[op_idx] += 1;
op_total_time[op_idx] += time;
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "::UPD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:SCOPE[" << num_microbatches_ << "]:OP["
<< op->Type() << "]:START["
<< start.tv_sec * 1e6 + start.tv_usec << "]:END["
<< end.tv_sec * 1e6 + end.tv_usec << "]" << std::endl;
}
}
op_idx++;
}
gettimeofday(&micro_end, NULL);
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "!!UPD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:START["
<< micro_start.tv_sec * 1e6 + micro_start.tv_usec << "]:END["
<< micro_end.tv_sec * 1e6 + micro_end.tv_usec << "]"
<< std::endl;
}
dev_ctx_->Wait();
batch_timer.Pause();
VLOG(0) << "batch time: " << batch_timer.ElapsedUS();
{
std::unique_lock<std::mutex> lk(thread_mutex);
if (threads_completed) {
return;
}
}
}
} else {
struct timeval start;
struct timeval end;
struct timeval micro_start;
struct timeval micro_end;
cudaEvent_t cu_start, cu_stop;
cudaEventCreate(&cu_start);
cudaEventCreate(&cu_stop);
bool local_completed = false;
while (true) {
// forward pass:
int real_microbatch_num = 0;
for (int i = 0; i < num_microbatches_; ++i) {
{
PADDLE_ENFORCE_LE(
local_batch_id_, batch_id_,
......@@ -450,25 +590,27 @@ void SectionWorker::TrainFilesWithProfiler() {
VLOG(3) << "thread " << thread_id_ << " local_batch_id_ "
<< local_batch_id_ << " batch_id_ " << batch_id_;
if (threads_completed) {
local_completed = true;
VLOG(3) << "thread " << thread_id_ << " completed.";
lk.unlock();
VLOG(0) << "============timeline============";
for (size_t i = 0; i < ops_.size(); ++i) {
VLOG(0) << "op: " << op_name[i] << ", max_time: " << op_max_time[i]
VLOG(0) << "op: " << op_name[i]
<< ", max_time: " << op_max_time[i]
<< ", min_time: " << op_min_time[i]
<< ", mean_time: " << op_total_time[i] / op_count[i];
}
VLOG(0) << "================================";
threads_completed = false;
return;
break;
}
lk.unlock();
real_microbatch_num += 1;
local_batch_id_ += 1;
}
// forward pass:
for (int i = 0; i < num_microbatches_; ++i) {
int op_idx = 0;
gettimeofday(&micro_start, NULL);
for (auto& op : ops_) {
gettimeofday(&start, NULL);
int op_role = op->Attr<int>(std::string("op_role"));
// We run op with op_role = kLRSched only for the first microbatch
// to avoid increasing the @LR_DECAY_STEP@ multiple times.
......@@ -489,6 +631,8 @@ void SectionWorker::TrainFilesWithProfiler() {
DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
unused_vars_, gc.get());
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
timeline.Pause();
auto time = timeline.ElapsedUS();
op_total_time[op_idx] += time;
......@@ -500,14 +644,38 @@ void SectionWorker::TrainFilesWithProfiler() {
}
op_count[op_idx] += 1;
op_total_time[op_idx] += time;
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "::FWD:B[" << local_batch_id_ << "]:SEC["
<< thread_id_ << "]:SCOPE[" << i << "]:OP["
<< op->Type() << "]:START["
<< start.tv_sec * 1e6 + start.tv_usec << "]:END["
<< end.tv_sec * 1e6 + end.tv_usec << "]" << std::endl;
}
}
op_idx++;
}
gettimeofday(&micro_end, NULL);
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "!!FWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:START["
<< micro_start.tv_sec * 1e6 + micro_start.tv_usec
<< "]:END[" << micro_end.tv_sec * 1e6 + micro_end.tv_usec
<< "]" << std::endl;
}
}
dev_ctx_->Wait();
// backward pass
for (int i = 0; i < num_microbatches_; ++i) {
for (int i = 0; i < real_microbatch_num; ++i) {
int op_idx = 0;
gettimeofday(&micro_start, NULL);
for (auto& op : ops_) {
gettimeofday(&start, NULL);
int op_role = op->Attr<int>(std::string("op_role"));
if (op_role == static_cast<int>(OpRole::kBackward) ||
op_role == (static_cast<int>(OpRole::kBackward) |
......@@ -520,6 +688,8 @@ void SectionWorker::TrainFilesWithProfiler() {
DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
unused_vars_, gc.get());
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
timeline.Pause();
auto time = timeline.ElapsedUS();
op_total_time[op_idx] += time;
......@@ -531,13 +701,40 @@ void SectionWorker::TrainFilesWithProfiler() {
}
op_count[op_idx] += 1;
op_total_time[op_idx] += time;
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "::BWD:B[" << local_batch_id_ << "]:SEC["
<< thread_id_ << "]:SCOPE[" << i << "]:OP["
<< op->Type() << "]:START["
<< start.tv_sec * 1e6 + start.tv_usec << "]:END["
<< end.tv_sec * 1e6 + end.tv_usec << "]" << std::endl;
}
}
op_idx++;
}
gettimeofday(&micro_end, NULL);
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "!!BWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:START["
<< micro_start.tv_sec * 1e6 + micro_start.tv_usec
<< "]:END[" << micro_end.tv_sec * 1e6 + micro_end.tv_usec
<< "]" << std::endl;
}
}
dev_ctx_->Wait();
if (real_microbatch_num == 0) {
return;
}
// update pass
int op_idx = 0;
gettimeofday(&micro_start, NULL);
for (auto& op : ops_) {
gettimeofday(&start, NULL);
int op_role = op->Attr<int>(std::string("op_role"));
if (op_role == static_cast<int>(OpRole::kOptimize)) {
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
......@@ -548,6 +745,8 @@ void SectionWorker::TrainFilesWithProfiler() {
DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1],
op.get(), unused_vars_, gc.get());
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
timeline.Pause();
auto time = timeline.ElapsedUS();
op_total_time[op_idx] += time;
......@@ -559,10 +758,34 @@ void SectionWorker::TrainFilesWithProfiler() {
}
op_count[op_idx] += 1;
op_total_time[op_idx] += time;
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "::UPD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:SCOPE[" << num_microbatches_ << "]:OP["
<< op->Type() << "]:START["
<< start.tv_sec * 1e6 + start.tv_usec << "]:END["
<< end.tv_sec * 1e6 + end.tv_usec << "]" << std::endl;
}
}
op_idx++;
}
gettimeofday(&micro_end, NULL);
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "!!UPD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:START["
<< micro_start.tv_sec * 1e6 + micro_start.tv_usec << "]:END["
<< micro_end.tv_sec * 1e6 + micro_end.tv_usec << "]"
<< std::endl;
}
dev_ctx_->Wait();
if (local_completed) {
return;
}
}
}
}
......
......@@ -223,7 +223,6 @@ class PipelineTrainer : public TrainerBase {
int section_num_;
int num_microbatches_;
int start_cpu_core_id_;
std::vector<std::string> feed_var_names_;
std::vector<platform::Place> places_;
std::vector<std::vector<std::string>> skip_vars_;
TrainerDesc trainer_desc_;
......
This diff is collapsed.