Commit a6344af2 authored by: sandyhouse

update, test=develop

Parent f71543ee
...@@ -414,7 +414,8 @@ class HeterCpuWorker : public HogwildWorker {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
class SectionWorker : public DeviceWorker { class SectionWorker : public DeviceWorker {
public: public:
SectionWorker() { local_batch_id_ = 0; } // SectionWorker() { local_batch_id_ = 0; }
SectionWorker() {}
~SectionWorker() override {} ~SectionWorker() override {}
void Initialize(const TrainerDesc& desc) override; void Initialize(const TrainerDesc& desc) override;
...@@ -429,7 +430,7 @@ class SectionWorker : public DeviceWorker {
const platform::Place& place() const { return place_; } const platform::Place& place() const { return place_; }
void SetSectionIndex(int section_id) { section_id_ = section_id; } // void SetSectionIndex(int section_id) { section_id_ = section_id; }
void SetDeviceIndex(int tid) override {} void SetDeviceIndex(int tid) override {}
void SetThreadIndex(int thread_id) { thread_id_ = thread_id; } void SetThreadIndex(int thread_id) { thread_id_ = thread_id; }
void SetMicrobatchNum(int num) { num_microbatches_ = num; } void SetMicrobatchNum(int num) { num_microbatches_ = num; }
...@@ -440,7 +441,7 @@ class SectionWorker : public DeviceWorker {
void SetSkipVars(const std::vector<std::string>& skip_vars) { void SetSkipVars(const std::vector<std::string>& skip_vars) {
skip_vars_ = skip_vars; skip_vars_ = skip_vars;
} }
static void ResetBatchId() { batch_id_ = 0; } // static void ResetBatchId() { batch_id_ = 0; }
static std::atomic<int> cpu_id_; static std::atomic<int> cpu_id_;
...@@ -454,13 +455,13 @@ class SectionWorker : public DeviceWorker {
const Scope* minibatch_scope_; const Scope* minibatch_scope_;
std::vector<std::unique_ptr<OperatorBase>> ops_; std::vector<std::unique_ptr<OperatorBase>> ops_;
static std::mutex thread_mutex; // static std::mutex thread_mutex;
static std::mutex cout_mutex; // static std::mutex cout_mutex;
static std::condition_variable thread_condition; // static std::condition_variable thread_condition;
static bool threads_completed; // static bool threads_completed;
std::shared_ptr<framework::ProgramDesc> program_; std::shared_ptr<framework::ProgramDesc> program_;
static uint64_t batch_id_; static uint64_t batch_id_;
uint64_t local_batch_id_; // uint64_t local_batch_id_;
platform::DeviceContext* dev_ctx_ = nullptr; platform::DeviceContext* dev_ctx_ = nullptr;
}; };
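// A sketch (not part of this commit) of the call sequence the pipeline
// trainer below uses to drive a single SectionWorker; `place`, `trainer_desc`,
// `num_microbatches`, `root_scope`, `minibatch_scope` and `microbatch_scopes`
// stand in for the corresponding trainer members.
std::shared_ptr<DeviceWorker> worker =
    DeviceWorkerFactory::CreateDeviceWorker(trainer_desc.device_worker_name());
auto section_worker =
    std::dynamic_pointer_cast<paddle::framework::SectionWorker>(worker);
section_worker->SetPlace(place);           // the CUDAPlace of this section
section_worker->Initialize(trainer_desc);  // builds program_ and ops_
section_worker->SetMicrobatchNum(num_microbatches);
section_worker->SetRootScope(root_scope);
section_worker->SetMinibatchScope(minibatch_scope);
section_worker->SetMicrobatchScopes(microbatch_scopes);
std::thread t(&DeviceWorker::TrainFiles, worker.get());  // one minibatch
t.join();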
......
...@@ -27,73 +27,88 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
const auto& section_params = trainer_desc.section_param(); const auto& section_params = trainer_desc.section_param();
num_microbatches_ = section_params.num_microbatches(); num_microbatches_ = section_params.num_microbatches();
VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_; VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_;
section_num_ = section_params.section_config_size();
VLOG(3) << "Number of program sections: " << section_num_;
trainer_desc_ = trainer_desc; trainer_desc_ = trainer_desc;
start_cpu_core_id_ = section_params.start_cpu_core_id(); start_cpu_core_id_ = section_params.start_cpu_core_id();
SetDataset(dataset); // SetDataset(dataset);
ParseDumpConfig(trainer_desc); ParseDumpConfig(trainer_desc);
// get filelist from trainer_desc here // get filelist from trainer_desc here
const std::vector<paddle::framework::DataFeed*> readers = // const std::vector<paddle::framework::DataFeed*> readers =
dataset->GetReaders(); // VLOG(3) << "Number of program sections: " << section_num_;
VLOG(3) << "readers num: " << readers.size(); // dataset->GetReaders();
int num_readers = readers.size(); // VLOG(3) << "readers num: " << readers.size();
PADDLE_ENFORCE_EQ(num_readers, 1, // int num_readers = readers.size();
platform::errors::InvalidArgument( // PADDLE_ENFORCE_EQ(num_readers, 1,
"Number of dataset readers for pipeline " // platform::errors::InvalidArgument(
"must be 1 now, but the value you give is %d.", // "Number of dataset readers for pipeline "
num_readers)); // "must be 1 now, but the value you give is %d.",
auto* reader = readers[0]; // num_readers));
// auto* reader = readers[0];
workers_.resize(section_num_); // workers_.resize(section_num_);
for (int i = 0; i < section_num_; ++i) { // for (int i = 0; i < section_num_; ++i) {
const auto& section_config = section_params.section_config(i); // const auto& section_config = section_params.section_config(i);
platform::Place place; // platform::Place place;
int place_id = section_config.place_id(); // int place_id = section_config.place_id();
switch (section_config.place()) { // switch (section_config.place()) {
case SectionConfig::CPUPlace: // case SectionConfig::CPUPlace:
place = platform::CPUPlace(); // place = platform::CPUPlace();
break; // break;
case SectionConfig::CUDAPlace: // case SectionConfig::CUDAPlace:
// Note that one section has at most one GPU place in one pipeline // // Note that one section has at most one GPU place in one pipeline
PADDLE_ENFORCE_GE( // PADDLE_ENFORCE_GE(
place_id, 0, // place_id, 0,
platform::errors::InvalidArgument( // platform::errors::InvalidArgument(
"The place_id value for CUDAPlace shoud be greater " // "The place_id value for CUDAPlace shoud be greater "
"than or equal to 0, but the value you give is %d.", // "than or equal to 0, but the value you give is %d.",
place_id)); // place_id));
place = platform::CUDAPlace(place_id); // place = platform::CUDAPlace(place_id);
break; // break;
case SectionConfig::CUDAPinnedPlace: // case SectionConfig::CUDAPinnedPlace:
place = platform::CUDAPinnedPlace(); // place = platform::CUDAPinnedPlace();
break; // break;
default: // default:
PADDLE_ENFORCE_NOT_NULL(nullptr, // PADDLE_ENFORCE_NOT_NULL(nullptr,
platform::errors::InvalidArgument( // platform::errors::InvalidArgument(
"Unkown place type in SectionConfig: %d", // "Unkown place type in SectionConfig: %d",
section_config.place())); // section_config.place()));
} // }
places_.emplace_back(place); // places_.emplace_back(place);
VLOG(3) << "Device worker place: " << place << ", device id: " << place_id // VLOG(3) << "Device worker place: " << place << ", device id: " << place_id
<< ", section: " << i; // << ", section: " << i;
// workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
// trainer_desc.device_worker_name());
// auto this_worker =
// std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
// workers_[i]);
// if (i == 0) {
// // we only set reader for the first section
// this_worker->SetDataFeed(reader);
// this_worker->SetReaderPlace(place);
// }
// this_worker->SetThreadIndex(i);
// this_worker->SetSectionIndex(i);
// this_worker->SetPlace(place);
// this_worker->Initialize(trainer_desc);
// this_worker->SetMicrobatchNum(num_microbatches_);
//}
const auto& section_config = section_params.section_config();
int place_id = section_config.place_id();
PADDLE_ENFORCE_GE(place_id, 0,
platform::errors::InvalidArgument(
"The place_id value for CUDAPlace shoud be "
"non-negative, but the value given is %d.",
place_id));
place_ = platform::CUDAPlace(place_id);
worker_ = DeviceWorkerFactory::CreateDeviceWorker(
trainer_desc.device_worker_name());
auto this_worker =
std::dynamic_pointer_cast<paddle::framework::SectionWorker>(worker_);
this_worker->SetPlace(place_);
this_worker->Initialize(trainer_desc);
this_worker->SetMicrobatchNum(num_microbatches_);
workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
trainer_desc.device_worker_name());
auto this_worker =
std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
workers_[i]);
if (i == 0) {
// we only set reader for the first section
this_worker->SetDataFeed(reader);
this_worker->SetReaderPlace(place);
}
this_worker->SetThreadIndex(i);
this_worker->SetSectionIndex(i);
this_worker->SetPlace(place);
this_worker->Initialize(trainer_desc);
this_worker->SetMicrobatchNum(num_microbatches_);
}
// set debug here // set debug here
SetDebug(trainer_desc.debug()); SetDebug(trainer_desc.debug());
} }
...@@ -119,7 +134,52 @@ void PipelineTrainer::InitDumpEnv() {
} }
} }
void PipelineTrainer::CopyParameters(int section_id, int microbatch_id, // void PipelineTrainer::CopyParameters(int section_id, int microbatch_id,
// const ProgramDesc& program,
// const platform::Place& place) {
// auto& global_block = program.Block(0);
// std::map<std::string, int> param_map;
// for (auto& var : global_block.AllVars()) {
// if (var->Persistable()) {
// param_map[var->Name()] = 1;
// }
// }
// for (auto& var : global_block.AllVars()) {
// bool is_param_grad = false;
// size_t pos = 0;
// if ((pos = var->Name().find(kGradVarSuffix)) != std::string::npos) {
// auto prefix_name = var->Name().substr(0, pos);
// if (param_map.find(prefix_name) != param_map.end()) {
// is_param_grad = true;
// }
// }
// VLOG(3) << "Var name: " << var->Name();
// if ((var->Persistable() || is_param_grad) && microbatch_id == 0) {
// auto* ptr = root_scope_->FindVar(var->Name());
// auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
// VLOG(3) << "Create persistable var " << var->Name() << " for minibatch
// "
// << section_id << ", which pointer is " << new_ptr;
// InitializeVariable(new_ptr, var->GetType());
// if (is_param_grad) {
// continue;
// }
// const LoDTensor& root_tensor = ptr->Get<LoDTensor>();
// LoDTensor* minibatch_tensor = new_ptr->GetMutable<LoDTensor>();
// TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
// static_cast<Tensor*>(minibatch_tensor));
// } else if (!var->Persistable() && !is_param_grad) {
// auto* ptr =
// microbatch_scopes_[section_id][microbatch_id]->Var(var->Name());
// VLOG(3) << "Create variable " << var->Name() << " for section "
// << section_id << " microbatch " << microbatch_id
// << ", which pointer is " << ptr;
// InitializeVariable(ptr, var->GetType());
// }
// }
// }
void PipelineTrainer::CopyParameters(int microbatch_id,
const ProgramDesc& program, const ProgramDesc& program,
const platform::Place& place) { const platform::Place& place) {
auto& global_block = program.Block(0); auto& global_block = program.Block(0);
...@@ -139,45 +199,57 @@ void PipelineTrainer::CopyParameters(int section_id, int microbatch_id,
} }
} }
VLOG(3) << "Var name: " << var->Name(); VLOG(3) << "Var name: " << var->Name();
if ((var->Persistable() || is_param_grad) && microbatch_id == 0) { if (is_param_grad && microbatch_id == 0) {
auto* ptr = root_scope_->FindVar(var->Name()); auto* ptr = minibatch_scope_->Var(var->Name());
auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name()); InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create persistable var " << var->Name() << " for minibatch " VLOG(3) << "Create grad for persistable var: " << var->Name()
<< section_id << ", which pointer is " << new_ptr; << ", which pointer is " << ptr;
InitializeVariable(new_ptr, var->GetType());
if (is_param_grad) {
continue;
}
const LoDTensor& root_tensor = ptr->Get<LoDTensor>();
LoDTensor* minibatch_tensor = new_ptr->GetMutable<LoDTensor>();
TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
static_cast<Tensor*>(minibatch_tensor));
} else if (!var->Persistable() && !is_param_grad) { } else if (!var->Persistable() && !is_param_grad) {
auto* ptr = auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name());
microbatch_scopes_[section_id][microbatch_id]->Var(var->Name()); VLOG(3) << "Create variable " << var->Name() << " microbatch "
VLOG(3) << "Create variable " << var->Name() << " for section "
<< section_id << " microbatch " << microbatch_id
<< ", which pointer is " << ptr; << ", which pointer is " << ptr;
InitializeVariable(ptr, var->GetType()); InitializeVariable(ptr, var->GetType());
} }
} }
} }
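// A sketch (not part of this diff) of the parameter-gradient test that
// CopyParameters applies above, written as a standalone helper. It assumes
// kGradVarSuffix == "@GRAD" and that `params` holds the names of the
// persistable parameters collected from the global block (<set> and <string>
// are assumed to be available).
bool IsParamGrad(const std::string& name, const std::set<std::string>& params) {
  const std::string kGradVarSuffix = "@GRAD";
  size_t pos = name.find(kGradVarSuffix);
  if (pos == std::string::npos) return false;
  // e.g. "fc_0.w_0@GRAD" -> prefix "fc_0.w_0": it is a parameter gradient
  // only if the prefix names a known parameter.
  return params.count(name.substr(0, pos)) > 0;
}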
void PipelineTrainer::GetSkipVars(int section_id, const ProgramDesc& program) { // void PipelineTrainer::GetSkipVars(int section_id, const ProgramDesc& program)
// {
// auto& global_block = program.Block(0);
// for (auto& op : global_block.AllOps()) {
// if (op->Type() != "enqueue") {
// continue;
// }
// auto input_arg_names = op->InputArgumentNames();
// PADDLE_ENFORCE_EQ(input_arg_names.size(), 1,
// platform::errors::InvalidArgument(
// "Number of input arguments for enqueue op must be
// 1, "
// "but the value is %d.",
// input_arg_names.size()));
// std::string input_arg_name = input_arg_names[0];
// if (input_arg_name.rfind("@GRAD") != input_arg_name.size() - 5) {
// skip_vars_[section_id].emplace_back(input_arg_name);
// VLOG(3) << "add skip var name: " << input_arg_name;
// }
// }
// }
void PipelineTrainer::GetSkipVars(const ProgramDesc& program) {
auto& global_block = program.Block(0); auto& global_block = program.Block(0);
for (auto& op : global_block.AllOps()) { for (auto& op : global_block.AllOps()) {
if (op->Type() != "enqueue") { if (op->Type() != "c_send") {
continue; continue;
} }
auto input_arg_names = op->InputArgumentNames(); auto input_arg_names = op->InputArgumentNames();
PADDLE_ENFORCE_EQ(input_arg_names.size(), 1, PADDLE_ENFORCE_EQ(input_arg_names.size(), 1,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Number of input arguments for enqueue op must be 1, " "Number of input arguments for c_send op must be 1, "
"but the value is %d.", "but the value given is %d.",
input_arg_names.size())); input_arg_names.size()));
std::string input_arg_name = input_arg_names[0]; std::string input_arg_name = input_arg_names[0];
if (input_arg_name.rfind("@GRAD") != input_arg_name.size() - 5) { if (input_arg_name.rfind("@GRAD") != input_arg_name.size() - 5) {
skip_vars_[section_id].emplace_back(input_arg_name); skip_vars_.emplace_back(input_arg_name);
VLOG(3) << "add skip var name: " << input_arg_name; VLOG(3) << "add skip var name: " << input_arg_name;
} }
} }
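// A sketch (not part of this diff) that makes the suffix test above explicit:
// a c_send input is added to skip_vars_ only when its name does not end with
// "@GRAD", so only non-gradient tensors fed to c_send are recorded.
bool EndsWithGradSuffix(const std::string& name) {
  const std::string suffix = "@GRAD";
  return name.size() >= suffix.size() &&
         name.compare(name.size() - suffix.size(), suffix.size(), suffix) == 0;
}
// Equivalent use in GetSkipVars:
//   if (!EndsWithGradSuffix(input_arg_name)) skip_vars_.emplace_back(input_arg_name);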
...@@ -185,86 +257,101 @@ void PipelineTrainer::GetSkipVars(int section_id, const ProgramDesc& program) {
void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program, void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
const platform::Place& place) { const platform::Place& place) {
PADDLE_ENFORCE_NOT_NULL(root_scope_, PADDLE_ENFORCE_NOT_NULL(root_scope_, platform::errors::InvalidArgument(
platform::errors::InvalidArgument( "root_scope_ can not be nullptr"));
"root_scope pointer can not be nullptr"));
auto start_cpu_id = trainer_desc_.section_param().start_cpu_core_id(); auto start_cpu_id = trainer_desc_.section_param().start_cpu_core_id();
SectionWorker::cpu_id_.store(start_cpu_id); SectionWorker::cpu_id_.store(start_cpu_id);
minibatch_scopes_.resize(section_num_); // minibatch_scopes_.resize(section_num_);
microbatch_scopes_.resize(section_num_); // microbatch_scopes_.resize(section_num_);
skip_vars_.resize(section_num_); // minibatch_scopes_.resize(1);
microbatch_scopes_.resize(num_microbatches_);
// skip_vars_.resize(section_num_);
VLOG(3) << "Init ScopeQueues and create all scopes"; VLOG(3) << "Init ScopeQueues and create all scopes";
for (int i = 0; i < section_num_; ++i) { // for (int i = 0; i < section_num_; ++i) {
minibatch_scopes_[i] = &root_scope_->NewScope(); minibatch_scope_ = &root_scope_->NewScope();
std::shared_ptr<framework::ProgramDesc> program; std::shared_ptr<framework::ProgramDesc> program;
program.reset(new ProgramDesc( program.reset(new ProgramDesc(
trainer_desc_.section_param().section_config(i).program_desc())); trainer_desc_.section_param().section_config().program_desc()));
microbatch_scopes_[i].resize(num_microbatches_); // trainer_desc_.section_param().section_config(i).program_desc()));
for (int j = 0; j < num_microbatches_; ++j) { // microbatch_scopes_[i].resize(num_microbatches_);
microbatch_scopes_[i][j] = &minibatch_scopes_[i]->NewScope(); for (int j = 0; j < num_microbatches_; ++j) {
CopyParameters(i, j, *program, places_[i]); // microbatch_scopes_[j] = &minibatch_scopes_[i]->NewScope();
} microbatch_scopes_[j] = &minibatch_scope_->NewScope();
GetSkipVars(i, *program); // CopyParameters(i, j, *program, places_[i]);
CopyParameters(j, *program, place_);
} }
// GetSkipVars(i, *program);
GetSkipVars(*program);
// }
for (int i = 0; i < section_num_; ++i) { // for (int i = 0; i < section_num_; ++i) {
auto this_worker = auto this_worker =
std::dynamic_pointer_cast<paddle::framework::SectionWorker>( std::dynamic_pointer_cast<paddle::framework::SectionWorker>(worker_);
workers_[i]); // workers_[i]);
this_worker->SetRootScope(root_scope_); this_worker->SetRootScope(root_scope_);
this_worker->SetMinibatchScope(minibatch_scopes_[i]); this_worker->SetMinibatchScope(minibatch_scope_);
this_worker->SetMicrobatchScopes(microbatch_scopes_[i]); // this_worker->SetMicrobatchScopes(microbatch_scopes_[i]);
this_worker->SetSkipVars(skip_vars_[i]); this_worker->SetMicrobatchScopes(microbatch_scopes_);
} // this_worker->SetSkipVars(skip_vars_[i]);
//}
} }
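// A sketch (not part of this diff) of the scope tree InitTrainerEnv builds
// above; each child is created with Scope::NewScope(), so a variable lookup in
// a microbatch scope falls back to the minibatch and root scopes.
//
//   root_scope_                             persistable parameters
//     └── minibatch_scope_                  accumulated @GRAD vars (created for microbatch 0)
//           ├── microbatch_scopes_[0]       transient vars of microbatch 0
//           ├── ...
//           └── microbatch_scopes_[num_microbatches_ - 1]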
void PipelineTrainer::Run() { void PipelineTrainer::Run() {
VLOG(3) << "Going to run"; VLOG(3) << "Going to run";
for (int i = 0; i < section_num_; ++i) { // for (int i = 0; i < section_num_; ++i) {
if (!debug_) { if (!debug_) {
section_threads_.push_back( section_thread_ = std::thread(&DeviceWorker::TrainFiles, worker_.get());
std::thread(&DeviceWorker::TrainFiles, workers_[i].get())); // section_threads_.push_back(
} else { // std::thread(&DeviceWorker::TrainFiles, workers_.get()));
section_threads_.push_back(std::thread( // std::thread(&DeviceWorker::TrainFiles, workers_[i].get()));
&DeviceWorker::TrainFilesWithProfiler, workers_[i].get())); } else {
} section_thread_ =
std::thread(&DeviceWorker::TrainFilesWithProfiler, worker_.get());
// section_threads_.push_back(std::thread(
// &DeviceWorker::TrainFilesWithProfiler, workers_.get()));
// &DeviceWorker::TrainFilesWithProfiler, workers_[i].get()));
} }
//}
} }
void PipelineTrainer::Finalize() { void PipelineTrainer::Finalize() {
for (auto& th : section_threads_) { // for (auto& th : section_threads_) {
th.join(); // th.join();
} //}
section_thread_.join();
if (need_dump_field_) { if (need_dump_field_) {
FinalizeDumpEnv(); FinalizeDumpEnv();
} }
VLOG(3) << "copying back parameters. "; // VLOG(3) << "copying back parameters. ";
for (int i = 0; i < section_num_; ++i) { // for (int i = 0; i < section_num_; ++i) {
std::shared_ptr<framework::ProgramDesc> program; // std::shared_ptr<framework::ProgramDesc> program;
program.reset(new ProgramDesc( // program.reset(new ProgramDesc(
trainer_desc_.section_param().section_config(i).program_desc())); // trainer_desc_.section_param().section_config(i).program_desc()));
for (int j = 0; j < num_microbatches_; ++j) { // for (int j = 0; j < num_microbatches_; ++j) {
auto& global_block = program->Block(0); // auto& global_block = program->Block(0);
for (auto& var : global_block.AllVars()) { // for (auto& var : global_block.AllVars()) {
if (var->Persistable()) { // if (var->Persistable()) {
auto* ptr = root_scope_->FindVar(var->Name()); // auto* ptr = root_scope_->FindVar(var->Name());
LoDTensor* root_tensor = ptr->GetMutable<LoDTensor>(); // LoDTensor* root_tensor = ptr->GetMutable<LoDTensor>();
auto* minibatch_ptr = minibatch_scopes_[i]->Var(var->Name()); // auto* minibatch_ptr = minibatch_scopes_[i]->Var(var->Name());
const LoDTensor& minibatch_tensor = minibatch_ptr->Get<LoDTensor>(); // const LoDTensor& minibatch_tensor =
TensorCopy(*static_cast<const Tensor*>(&minibatch_tensor), places_[0], // minibatch_ptr->Get<LoDTensor>();
static_cast<Tensor*>(root_tensor)); // TensorCopy(*static_cast<const Tensor*>(&minibatch_tensor),
VLOG(3) << "Copy persitable var " << var->Name() << " to root scope"; // places_[0],
} // static_cast<Tensor*>(root_tensor));
} // VLOG(3) << "Copy persitable var " << var->Name() << " to root
} // scope";
} // }
// }
// }
// }
root_scope_->DropKids(); root_scope_->DropKids();
SectionWorker::ResetBatchId(); // SectionWorker::ResetBatchId();
} }
Scope* PipelineTrainer::GetWorkerScope(int thread_id) { Scope* PipelineTrainer::GetWorkerScope(int thread_id) {
return microbatch_scopes_[thread_id][0]; return microbatch_scopes_[0];
} }
} // end namespace framework } // end namespace framework
......
...@@ -31,16 +31,17 @@ namespace paddle {
namespace framework { namespace framework {
std::atomic<int> SectionWorker::cpu_id_(0); std::atomic<int> SectionWorker::cpu_id_(0);
std::mutex SectionWorker::thread_mutex; // std::mutex SectionWorker::thread_mutex;
std::mutex SectionWorker::cout_mutex; // std::mutex SectionWorker::cout_mutex;
std::condition_variable SectionWorker::thread_condition; // std::condition_variable SectionWorker::thread_condition;
bool SectionWorker::threads_completed = false; // bool SectionWorker::threads_completed = false;
uint64_t SectionWorker::batch_id_(0); uint64_t SectionWorker::batch_id_(0);
void SectionWorker::Initialize(const TrainerDesc& desc) { void SectionWorker::Initialize(const TrainerDesc& desc) {
dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_);
program_.reset(new ProgramDesc( program_.reset(
desc.section_param().section_config(section_id_).program_desc())); new ProgramDesc(desc.section_param().section_config().program_desc()));
// desc.section_param().section_config(section_id_).program_desc()));
for (auto& op_desc : program_->Block(0).AllOps()) { for (auto& op_desc : program_->Block(0).AllOps()) {
ops_.push_back(OpRegistry::CreateOp(*op_desc)); ops_.push_back(OpRegistry::CreateOp(*op_desc));
} }
...@@ -82,7 +83,7 @@ void SectionWorker::AutoSetCPUAffinity(bool reuse) {
void SectionWorker::TrainFiles() { void SectionWorker::TrainFiles() {
VLOG(3) << "begin section_worker TrainFiles"; VLOG(3) << "begin section_worker TrainFiles";
AutoSetCPUAffinity(true); // AutoSetCPUAffinity(true);
int64_t max_memory_size = 0; int64_t max_memory_size = 0;
std::unique_ptr<GarbageCollector> gc; std::unique_ptr<GarbageCollector> gc;
...@@ -106,201 +107,86 @@ void SectionWorker::TrainFiles() {
platform::Timer batch_timer; platform::Timer batch_timer;
if (thread_id_ == 0) { // if (thread_id_ == 0) {
while (true) { // while (true) {
// Start a minibatch. // Start a minibatch.
// real number of microbatches run // real number of microbatches run
int real_microbatch_num = 0; // int real_microbatch_num = 0;
batch_timer.Start(); batch_timer.Start();
for (int i = 0; i < num_microbatches_; ++i) { for (int i = 0; i < num_microbatches_; ++i) {
try { try {
for (auto& op : ops_) {
int op_role = op->Attr<int>(std::string("op_role"));
// We run op with op_role = kLRSched only for the first microbatch
// to avoid increasing the @LR_DECAY_STEP@ multiple times.
bool run_first_mbatch =
op_role == static_cast<int>(OpRole::kForward) ||
op_role == (static_cast<int>(OpRole::kForward) |
static_cast<int>(OpRole::kLoss)) ||
op_role == static_cast<int>(OpRole::kLRSched);
bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
op_role == (static_cast<int>(OpRole::kForward) |
static_cast<int>(OpRole::kLoss));
if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) {
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
<< " for scope " << i;
op->Run(*microbatch_scopes_[i], place_);
if (gc) {
DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
unused_vars_, gc.get());
}
}
}
} catch (platform::EOFException&) {
std::unique_lock<std::mutex> lk(thread_mutex);
threads_completed = true;
VLOG(3) << "thread " << thread_id_ << " completed.";
VLOG(3) << "called notify all";
thread_condition.notify_all();
VLOG(0) << "EOF encountered";
break;
}
{
real_microbatch_num += 1;
batch_id_ += 1;
VLOG(3) << "called notify all";
std::unique_lock<std::mutex> lk(thread_mutex);
thread_condition.notify_all();
}
}
dev_ctx_->Wait();
VLOG(0) << "real_microbatch_num for thread 0 " << real_microbatch_num;
// backward pass
for (int i = 0; i < real_microbatch_num; ++i) {
for (auto& op : ops_) {
int op_role = op->Attr<int>(std::string("op_role"));
if (op_role == static_cast<int>(OpRole::kBackward) ||
op_role == (static_cast<int>(OpRole::kBackward) |
static_cast<int>(OpRole::kLoss))) {
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
<< " for scope " << i;
op->Run(*microbatch_scopes_[i], place_);
if (gc) {
DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
unused_vars_, gc.get());
}
}
}
}
dev_ctx_->Wait();
if (real_microbatch_num == 0) {
batch_timer.Pause();
VLOG(0) << "batch time: " << batch_timer.ElapsedUS();
return;
}
// update pass
for (auto& op : ops_) { for (auto& op : ops_) {
int op_role = op->Attr<int>(std::string("op_role")); int op_role = op->Attr<int>(std::string("op_role"));
if (op_role == static_cast<int>(OpRole::kOptimize)) { // We run op with op_role = kLRSched only for the first microbatch
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_ // to avoid increasing the @LR_DECAY_STEP@ multiple times.
<< " for minibatch scope"; bool run_first_mbatch = op_role == static_cast<int>(OpRole::kForward) ||
op->Run(*microbatch_scopes_[0], place_); op_role == (static_cast<int>(OpRole::kForward) |
static_cast<int>(OpRole::kLoss)) ||
op_role == static_cast<int>(OpRole::kLRSched);
bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
op_role == (static_cast<int>(OpRole::kForward) |
static_cast<int>(OpRole::kLoss));
if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) {
VLOG(3) << "running an op " << op->Type() << " for scope " << i;
op->Run(*microbatch_scopes_[i], place_);
if (gc) { if (gc) {
DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1], DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_,
op.get(), unused_vars_, gc.get()); gc.get());
} }
} }
} }
dev_ctx_->Wait(); } catch (platform::EOFException&) {
batch_timer.Pause(); // std::unique_lock<std::mutex> lk(thread_mutex);
VLOG(0) << "batch time: " << batch_timer.ElapsedUS(); // threads_completed = true;
{ VLOG(3) << "thread completed.";
std::unique_lock<std::mutex> lk(thread_mutex); // VLOG(3) << "called notify all";
if (threads_completed) { // thread_condition.notify_all();
return; VLOG(0) << "EOF encountered";
} break;
}
} }
} else { }
while (true) { dev_ctx_->Wait();
// forward pass:
bool local_completed = false; // backward pass
int real_microbatch_num = 0; for (int i = 0; i < num_microbatches_; ++i) {
for (int i = 0; i < num_microbatches_; ++i) { for (auto& op : ops_) {
{ int op_role = op->Attr<int>(std::string("op_role"));
PADDLE_ENFORCE_LE( if (op_role == static_cast<int>(OpRole::kBackward) ||
local_batch_id_, batch_id_, op_role == (static_cast<int>(OpRole::kBackward) |
platform::errors::InvalidArgument( static_cast<int>(OpRole::kLoss))) {
"local_batch_id_ (%d) must be less than or equal to " VLOG(3) << "running an op " << op->Type() << " for scope " << i;
"batch_id_ (%d)", op->Run(*microbatch_scopes_[i], place_);
local_batch_id_, batch_id_)); if (gc) {
std::unique_lock<std::mutex> lk(thread_mutex); DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_,
if (local_batch_id_ == batch_id_ && !threads_completed) { gc.get());
thread_condition.wait(lk);
}
VLOG(3) << "thread " << thread_id_ << " local_batch_id_ "
<< local_batch_id_ << " batch_id_ " << batch_id_;
if (threads_completed) {
VLOG(3) << "thread " << thread_id_ << " completed.";
lk.unlock();
threads_completed = false;
local_completed = true;
break;
}
lk.unlock();
local_batch_id_ += 1;
real_microbatch_num += 1;
}
for (auto& op : ops_) {
int op_role = op->Attr<int>(std::string("op_role"));
// We run op with op_role = kLRSched only for the first microbatch
// to avoid increasing the @LR_DECAY_STEP@ multiple times.
bool run_first_mbatch =
op_role == static_cast<int>(OpRole::kForward) ||
op_role == (static_cast<int>(OpRole::kForward) |
static_cast<int>(OpRole::kLoss)) ||
op_role == static_cast<int>(OpRole::kLRSched);
bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
op_role == (static_cast<int>(OpRole::kForward) |
static_cast<int>(OpRole::kLoss));
if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) {
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
<< " for scope " << i;
op->Run(*microbatch_scopes_[i], place_);
if (gc) {
DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
unused_vars_, gc.get());
}
}
}
}
dev_ctx_->Wait();
// backward pass
for (int i = 0; i < real_microbatch_num; ++i) {
for (auto& op : ops_) {
int op_role = op->Attr<int>(std::string("op_role"));
if (op_role == static_cast<int>(OpRole::kBackward) ||
op_role == (static_cast<int>(OpRole::kBackward) |
static_cast<int>(OpRole::kLoss))) {
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
<< " for scope " << i;
op->Run(*microbatch_scopes_[i], place_);
if (gc) {
DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
unused_vars_, gc.get());
}
}
} }
} }
dev_ctx_->Wait(); }
// update pass }
if (real_microbatch_num == 0) { dev_ctx_->Wait();
return; // update pass
} for (auto& op : ops_) {
for (auto& op : ops_) { int op_role = op->Attr<int>(std::string("op_role"));
int op_role = op->Attr<int>(std::string("op_role")); if (op_role == static_cast<int>(OpRole::kOptimize)) {
if (op_role == static_cast<int>(OpRole::kOptimize)) { VLOG(3) << "running an op " << op->Type() << " for minibatch scope";
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_ op->Run(*microbatch_scopes_[0], place_);
<< " for minibatch scope"; if (gc) {
op->Run(*microbatch_scopes_[0], place_); for (int i = 0; i < num_microbatches_; ++i) {
if (gc) { DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_,
DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1], gc.get());
op.get(), unused_vars_, gc.get());
}
} }
} }
dev_ctx_->Wait();
if (local_completed) {
return;
}
} }
} }
dev_ctx_->Wait();
batch_timer.Pause();
VLOG(0) << "batch time: " << batch_timer.ElapsedUS();
++batch_id_;
} }
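// A sketch (not part of this diff) of the per-microbatch filter used in the
// forward pass above, factored into a helper. OpRole comes from the framework
// headers this file already uses; kLRSched ops run only for the first
// microbatch so that @LR_DECAY_STEP@ is advanced once per minibatch.
static bool ShouldRunInForward(int op_role, int microbatch_idx) {
  const bool fwd =
      op_role == static_cast<int>(OpRole::kForward) ||
      op_role == (static_cast<int>(OpRole::kForward) |
                  static_cast<int>(OpRole::kLoss));
  const bool lr_sched = op_role == static_cast<int>(OpRole::kLRSched);
  return microbatch_idx == 0 ? (fwd || lr_sched) : fwd;
}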
void SectionWorker::TrainFilesWithProfiler() { void SectionWorker::TrainFilesWithProfiler() {
VLOG(3) << "begin section_worker TrainFiles with profiler"; VLOG(3) << "begin section_worker TrainFiles with profiler";
AutoSetCPUAffinity(true); // AutoSetCPUAffinity(true);
platform::Timer batch_timer; platform::Timer batch_timer;
platform::Timer timeline; platform::Timer timeline;
...@@ -342,183 +228,42 @@ void SectionWorker::TrainFilesWithProfiler() {
} }
#endif #endif
if (thread_id_ == 0) { // if (thread_id_ == 0) {
struct timeval start; struct timeval start;
struct timeval end; struct timeval end;
struct timeval micro_start; struct timeval micro_start;
struct timeval micro_end; struct timeval micro_end;
while (true) { // Start a minibatch.
// Start a minibatch. batch_timer.Start();
batch_timer.Start(); int real_microbatch_num = 0;
int real_microbatch_num = 0; for (int i = 0; i < num_microbatches_; ++i) {
for (int i = 0; i < num_microbatches_; ++i) { try {
try {
int op_idx = 0;
gettimeofday(&micro_start, NULL);
for (auto& op : ops_) {
gettimeofday(&start, NULL);
int op_role = op->Attr<int>(std::string("op_role"));
// We run op with op_role = kLRSched only for the first microbatch
// to avoid increasing the @LR_DECAY_STEP@ multiple times.
bool run_first_mbatch =
op_role == static_cast<int>(OpRole::kForward) ||
op_role == (static_cast<int>(OpRole::kForward) |
static_cast<int>(OpRole::kLoss)) ||
op_role == static_cast<int>(OpRole::kLRSched);
bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
op_role == (static_cast<int>(OpRole::kForward) |
static_cast<int>(OpRole::kLoss));
if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) {
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
<< " for scope " << i;
timeline.Start();
op->Run(*microbatch_scopes_[i], place_);
if (gc) {
DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
unused_vars_, gc.get());
}
cudaDeviceSynchronize();
timeline.Pause();
gettimeofday(&end, NULL);
auto time = timeline.ElapsedUS();
op_total_time[op_idx] += time;
if (time > op_max_time[op_idx]) {
op_max_time[op_idx] = time;
}
if (time < op_min_time[op_idx]) {
op_min_time[op_idx] = time;
}
op_count[op_idx] += 1;
op_total_time[op_idx] += time;
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "::FWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:SCOPE[" << i << "]:OP[" << op->Type()
<< "]:START[" << start.tv_sec * 1e6 + start.tv_usec
<< "]:END[" << end.tv_sec * 1e6 + end.tv_usec << "]"
<< std::endl;
}
}
op_idx++;
}
gettimeofday(&micro_end, NULL);
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "!!FWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:START["
<< micro_start.tv_sec * 1e6 + micro_start.tv_usec
<< "]:END[" << micro_end.tv_sec * 1e6 + micro_end.tv_usec
<< "]" << std::endl;
}
} catch (platform::EOFException&) {
std::unique_lock<std::mutex> lk(thread_mutex);
threads_completed = true;
VLOG(3) << "thread " << thread_id_ << " completed.";
VLOG(3) << "called notify all";
thread_condition.notify_all();
VLOG(0) << "EOF encountered";
VLOG(0) << "============timeline============";
for (size_t i = 0; i < ops_.size(); ++i) {
VLOG(0) << "op: " << op_name[i] << ", max_time: " << op_max_time[i]
<< ", min_time: " << op_min_time[i]
<< ", mean_time: " << op_total_time[i] / op_count[i];
}
VLOG(0) << "================================";
break;
}
{
VLOG(3) << "called notify all";
std::unique_lock<std::mutex> lk(thread_mutex);
real_microbatch_num += 1;
batch_id_ += 1;
thread_condition.notify_all();
}
}
dev_ctx_->Wait();
// backward pass
for (int i = 0; i < real_microbatch_num; ++i) {
int op_idx = 0;
gettimeofday(&micro_start, NULL);
for (auto& op : ops_) {
gettimeofday(&start, NULL);
int op_role = op->Attr<int>(std::string("op_role"));
if (op_role == static_cast<int>(OpRole::kBackward) ||
op_role == (static_cast<int>(OpRole::kBackward) |
static_cast<int>(OpRole::kLoss))) {
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
<< " for scope " << i;
timeline.Start();
op->Run(*microbatch_scopes_[i], place_);
if (gc) {
DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
unused_vars_, gc.get());
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
timeline.Pause();
auto time = timeline.ElapsedUS();
op_total_time[op_idx] += time;
if (time > op_max_time[op_idx]) {
op_max_time[op_idx] = time;
}
if (time < op_min_time[op_idx]) {
op_min_time[op_idx] = time;
}
op_count[op_idx] += 1;
op_total_time[op_idx] += time;
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "::BWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:SCOPE[" << i << "]:OP[" << op->Type()
<< "]:START[" << start.tv_sec * 1e6 + start.tv_usec
<< "]:END[" << end.tv_sec * 1e6 + end.tv_usec << "]"
<< std::endl;
}
}
op_idx++;
}
gettimeofday(&micro_end, NULL);
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "!!BWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:START["
<< micro_start.tv_sec * 1e6 + micro_start.tv_usec
<< "]:END[" << micro_end.tv_sec * 1e6 + micro_end.tv_usec
<< "]" << std::endl;
}
}
dev_ctx_->Wait();
if (real_microbatch_num == 0) {
batch_timer.Pause();
VLOG(0) << "batch time: " << batch_timer.ElapsedUS();
return;
}
// update pass
int op_idx = 0; int op_idx = 0;
gettimeofday(&micro_start, NULL); gettimeofday(&micro_start, NULL);
for (auto& op : ops_) { for (auto& op : ops_) {
gettimeofday(&start, NULL); gettimeofday(&start, NULL);
int op_role = op->Attr<int>(std::string("op_role")); int op_role = op->Attr<int>(std::string("op_role"));
if (op_role == static_cast<int>(OpRole::kOptimize)) { // We run op with op_role = kLRSched only for the first microbatch
// to avoid increasing the @LR_DECAY_STEP@ multiple times.
bool run_first_mbatch = op_role == static_cast<int>(OpRole::kForward) ||
op_role == (static_cast<int>(OpRole::kForward) |
static_cast<int>(OpRole::kLoss)) ||
op_role == static_cast<int>(OpRole::kLRSched);
bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
op_role == (static_cast<int>(OpRole::kForward) |
static_cast<int>(OpRole::kLoss));
if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) {
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_ VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
<< " for minibatch scope"; << " for scope " << i;
timeline.Start(); timeline.Start();
op->Run(*microbatch_scopes_[0], place_); op->Run(*microbatch_scopes_[i], place_);
if (gc) { if (gc) {
DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1], DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_,
op.get(), unused_vars_, gc.get()); gc.get());
} }
cudaDeviceSynchronize(); cudaDeviceSynchronize();
gettimeofday(&end, NULL);
timeline.Pause(); timeline.Pause();
gettimeofday(&end, NULL);
auto time = timeline.ElapsedUS(); auto time = timeline.ElapsedUS();
op_total_time[op_idx] += time; op_total_time[op_idx] += time;
if (time > op_max_time[op_idx]) { if (time > op_max_time[op_idx]) {
...@@ -530,12 +275,11 @@ void SectionWorker::TrainFilesWithProfiler() {
op_count[op_idx] += 1; op_count[op_idx] += 1;
op_total_time[op_idx] += time; op_total_time[op_idx] += time;
{ {
std::unique_lock<std::mutex> lk(cout_mutex); // std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed; std::cout << std::fixed;
std::cout.precision(0); std::cout.precision(0);
std::cout << "::UPD:B[" << batch_id_ << "]:SEC[" << thread_id_ std::cout << "::FWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:SCOPE[" << num_microbatches_ << "]:OP[" << "]:SCOPE[" << i << "]:OP[" << op->Type() << "]:START["
<< op->Type() << "]:START["
<< start.tv_sec * 1e6 + start.tv_usec << "]:END[" << start.tv_sec * 1e6 + start.tv_usec << "]:END["
<< end.tv_sec * 1e6 + end.tv_usec << "]" << std::endl; << end.tv_sec * 1e6 + end.tv_usec << "]" << std::endl;
} }
...@@ -544,250 +288,142 @@ void SectionWorker::TrainFilesWithProfiler() {
} }
gettimeofday(&micro_end, NULL); gettimeofday(&micro_end, NULL);
{ {
std::unique_lock<std::mutex> lk(cout_mutex); // std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed; std::cout << std::fixed;
std::cout.precision(0); std::cout.precision(0);
std::cout << "!!UPD:B[" << batch_id_ << "]:SEC[" << thread_id_ std::cout << "!!FWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:START[" << "]:START["
<< micro_start.tv_sec * 1e6 + micro_start.tv_usec << "]:END[" << micro_start.tv_sec * 1e6 + micro_start.tv_usec << "]:END["
<< micro_end.tv_sec * 1e6 + micro_end.tv_usec << "]" << micro_end.tv_sec * 1e6 + micro_end.tv_usec << "]"
<< std::endl; << std::endl;
} }
dev_ctx_->Wait(); } catch (platform::EOFException&) {
batch_timer.Pause(); VLOG(3) << "thread completed.";
VLOG(0) << "batch time: " << batch_timer.ElapsedUS(); VLOG(0) << "EOF encountered";
{ VLOG(0) << "============timeline============";
std::unique_lock<std::mutex> lk(thread_mutex); for (size_t i = 0; i < ops_.size(); ++i) {
if (threads_completed) { VLOG(0) << "op: " << op_name[i] << ", max_time: " << op_max_time[i]
return; << ", min_time: " << op_min_time[i]
} << ", mean_time: " << op_total_time[i] / op_count[i];
} }
VLOG(0) << "================================";
break;
} }
} else { }
struct timeval start; dev_ctx_->Wait();
struct timeval end; // backward pass
struct timeval micro_start; for (int i = 0; i < num_microbatches_; ++i) {
struct timeval micro_end; int op_idx = 0;
cudaEvent_t cu_start, cu_stop; gettimeofday(&micro_start, NULL);
cudaEventCreate(&cu_start); for (auto& op : ops_) {
cudaEventCreate(&cu_stop); gettimeofday(&start, NULL);
bool local_completed = false; int op_role = op->Attr<int>(std::string("op_role"));
while (true) { if (op_role == static_cast<int>(OpRole::kBackward) ||
// forward pass: op_role == (static_cast<int>(OpRole::kBackward) |
int real_microbatch_num = 0; static_cast<int>(OpRole::kLoss))) {
for (int i = 0; i < num_microbatches_; ++i) { VLOG(3) << "running an op " << op->Type() << " for scope " << i;
{ timeline.Start();
PADDLE_ENFORCE_LE( op->Run(*microbatch_scopes_[i], place_);
local_batch_id_, batch_id_, if (gc) {
platform::errors::InvalidArgument( DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_,
"local_batch_id_ (%d) must be less than or equal to " gc.get());
"batch_id_ (%d)",
local_batch_id_, batch_id_));
std::unique_lock<std::mutex> lk(thread_mutex);
if (local_batch_id_ == batch_id_ && !threads_completed) {
thread_condition.wait(lk);
}
VLOG(3) << "thread " << thread_id_ << " local_batch_id_ "
<< local_batch_id_ << " batch_id_ " << batch_id_;
if (threads_completed) {
local_completed = true;
VLOG(3) << "thread " << thread_id_ << " completed.";
lk.unlock();
VLOG(0) << "============timeline============";
for (size_t i = 0; i < ops_.size(); ++i) {
VLOG(0) << "op: " << op_name[i]
<< ", max_time: " << op_max_time[i]
<< ", min_time: " << op_min_time[i]
<< ", mean_time: " << op_total_time[i] / op_count[i];
}
VLOG(0) << "================================";
break;
}
lk.unlock();
real_microbatch_num += 1;
local_batch_id_ += 1;
} }
int op_idx = 0; cudaDeviceSynchronize();
gettimeofday(&micro_start, NULL); gettimeofday(&end, NULL);
for (auto& op : ops_) { timeline.Pause();
gettimeofday(&start, NULL); auto time = timeline.ElapsedUS();
int op_role = op->Attr<int>(std::string("op_role")); op_total_time[op_idx] += time;
// We run op with op_role = kLRSched only for the first microbatch if (time > op_max_time[op_idx]) {
// to avoid increasing the @LR_DECAY_STEP@ multiple times. op_max_time[op_idx] = time;
bool run_first_mbatch = }
op_role == static_cast<int>(OpRole::kForward) || if (time < op_min_time[op_idx]) {
op_role == (static_cast<int>(OpRole::kForward) | op_min_time[op_idx] = time;
static_cast<int>(OpRole::kLoss)) ||
op_role == static_cast<int>(OpRole::kLRSched);
bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
op_role == (static_cast<int>(OpRole::kForward) |
static_cast<int>(OpRole::kLoss));
if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) {
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
<< " for scope " << i;
timeline.Start();
op->Run(*microbatch_scopes_[i], place_);
if (gc) {
DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
unused_vars_, gc.get());
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
timeline.Pause();
auto time = timeline.ElapsedUS();
op_total_time[op_idx] += time;
if (time > op_max_time[op_idx]) {
op_max_time[op_idx] = time;
}
if (time < op_min_time[op_idx]) {
op_min_time[op_idx] = time;
}
op_count[op_idx] += 1;
op_total_time[op_idx] += time;
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "::FWD:B[" << local_batch_id_ << "]:SEC["
<< thread_id_ << "]:SCOPE[" << i << "]:OP["
<< op->Type() << "]:START["
<< start.tv_sec * 1e6 + start.tv_usec << "]:END["
<< end.tv_sec * 1e6 + end.tv_usec << "]" << std::endl;
}
}
op_idx++;
} }
gettimeofday(&micro_end, NULL); op_count[op_idx] += 1;
op_total_time[op_idx] += time;
{ {
std::unique_lock<std::mutex> lk(cout_mutex); // std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed; std::cout << std::fixed;
std::cout.precision(0); std::cout.precision(0);
std::cout << "!!FWD:B[" << batch_id_ << "]:SEC[" << thread_id_ std::cout << "::BWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:START[" << "]:SCOPE[" << i << "]:OP[" << op->Type() << "]:START["
<< micro_start.tv_sec * 1e6 + micro_start.tv_usec << start.tv_sec * 1e6 + start.tv_usec << "]:END["
<< "]:END[" << micro_end.tv_sec * 1e6 + micro_end.tv_usec << end.tv_sec * 1e6 + end.tv_usec << "]" << std::endl;
<< "]" << std::endl;
} }
} }
dev_ctx_->Wait(); op_idx++;
// backward pass }
for (int i = 0; i < real_microbatch_num; ++i) { gettimeofday(&micro_end, NULL);
int op_idx = 0; {
gettimeofday(&micro_start, NULL); // std::unique_lock<std::mutex> lk(cout_mutex);
for (auto& op : ops_) { std::cout << std::fixed;
gettimeofday(&start, NULL); std::cout.precision(0);
int op_role = op->Attr<int>(std::string("op_role")); std::cout << "!!BWD:B[" << batch_id_ << "]:SEC[" << thread_id_
if (op_role == static_cast<int>(OpRole::kBackward) || << "]:START[" << micro_start.tv_sec * 1e6 + micro_start.tv_usec
op_role == (static_cast<int>(OpRole::kBackward) | << "]:END[" << micro_end.tv_sec * 1e6 + micro_end.tv_usec << "]"
static_cast<int>(OpRole::kLoss))) { << std::endl;
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_ }
<< " for scope " << i; }
timeline.Start(); dev_ctx_->Wait();
op->Run(*microbatch_scopes_[i], place_); if (real_microbatch_num == 0) {
if (gc) { batch_timer.Pause();
DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), VLOG(0) << "batch time: " << batch_timer.ElapsedUS();
unused_vars_, gc.get()); return;
} }
cudaDeviceSynchronize(); // update pass
gettimeofday(&end, NULL); int op_idx = 0;
timeline.Pause(); gettimeofday(&micro_start, NULL);
auto time = timeline.ElapsedUS(); for (auto& op : ops_) {
op_total_time[op_idx] += time; gettimeofday(&start, NULL);
if (time > op_max_time[op_idx]) { int op_role = op->Attr<int>(std::string("op_role"));
op_max_time[op_idx] = time; if (op_role == static_cast<int>(OpRole::kOptimize)) {
} VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
if (time < op_min_time[op_idx]) { << " for minibatch scope";
op_min_time[op_idx] = time; timeline.Start();
} op->Run(*microbatch_scopes_[0], place_);
op_count[op_idx] += 1; if (gc) {
op_total_time[op_idx] += time; for (int i = 0; i < num_microbatches_; ++i) {
{ DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_,
std::unique_lock<std::mutex> lk(cout_mutex); gc.get());
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "::BWD:B[" << local_batch_id_ << "]:SEC["
<< thread_id_ << "]:SCOPE[" << i << "]:OP["
<< op->Type() << "]:START["
<< start.tv_sec * 1e6 + start.tv_usec << "]:END["
<< end.tv_sec * 1e6 + end.tv_usec << "]" << std::endl;
}
}
op_idx++;
}
gettimeofday(&micro_end, NULL);
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "!!BWD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:START["
<< micro_start.tv_sec * 1e6 + micro_start.tv_usec
<< "]:END[" << micro_end.tv_sec * 1e6 + micro_end.tv_usec
<< "]" << std::endl;
} }
} }
dev_ctx_->Wait(); cudaDeviceSynchronize();
if (real_microbatch_num == 0) { gettimeofday(&end, NULL);
return; timeline.Pause();
auto time = timeline.ElapsedUS();
op_total_time[op_idx] += time;
if (time > op_max_time[op_idx]) {
op_max_time[op_idx] = time;
} }
// update pass if (time < op_min_time[op_idx]) {
int op_idx = 0; op_min_time[op_idx] = time;
gettimeofday(&micro_start, NULL);
for (auto& op : ops_) {
gettimeofday(&start, NULL);
int op_role = op->Attr<int>(std::string("op_role"));
if (op_role == static_cast<int>(OpRole::kOptimize)) {
VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
<< " for minibatch scope";
timeline.Start();
op->Run(*microbatch_scopes_[0], place_);
if (gc) {
DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1],
op.get(), unused_vars_, gc.get());
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
timeline.Pause();
auto time = timeline.ElapsedUS();
op_total_time[op_idx] += time;
if (time > op_max_time[op_idx]) {
op_max_time[op_idx] = time;
}
if (time < op_min_time[op_idx]) {
op_min_time[op_idx] = time;
}
op_count[op_idx] += 1;
op_total_time[op_idx] += time;
{
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "::UPD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:SCOPE[" << num_microbatches_ << "]:OP["
<< op->Type() << "]:START["
<< start.tv_sec * 1e6 + start.tv_usec << "]:END["
<< end.tv_sec * 1e6 + end.tv_usec << "]" << std::endl;
}
}
op_idx++;
} }
gettimeofday(&micro_end, NULL); op_count[op_idx] += 1;
op_total_time[op_idx] += time;
{ {
std::unique_lock<std::mutex> lk(cout_mutex);
std::cout << std::fixed; std::cout << std::fixed;
std::cout.precision(0); std::cout.precision(0);
std::cout << "!!UPD:B[" << batch_id_ << "]:SEC[" << thread_id_ std::cout << "::UPD:B[" << batch_id_ << "]:SEC[" << thread_id_
<< "]:START[" << "]:SCOPE[" << num_microbatches_ << "]:OP[" << op->Type()
<< micro_start.tv_sec * 1e6 + micro_start.tv_usec << "]:END[" << "]:START[" << start.tv_sec * 1e6 + start.tv_usec
<< micro_end.tv_sec * 1e6 + micro_end.tv_usec << "]" << "]:END[" << end.tv_sec * 1e6 + end.tv_usec << "]"
<< std::endl; << std::endl;
} }
dev_ctx_->Wait();
if (local_completed) {
return;
}
} }
op_idx++;
}
gettimeofday(&micro_end, NULL);
{
std::cout << std::fixed;
std::cout.precision(0);
std::cout << "!!UPD:B[" << batch_id_ << "]:SEC[" << thread_id_ << "]:START["
<< micro_start.tv_sec * 1e6 + micro_start.tv_usec << "]:END["
<< micro_end.tv_sec * 1e6 + micro_end.tv_usec << "]" << std::endl;
} }
dev_ctx_->Wait();
batch_timer.Pause();
VLOG(0) << "batch time: " << batch_timer.ElapsedUS();
++batch_id_;
} }
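// A sketch (not part of this diff) of the per-op timing bookkeeping the
// profiler path above keeps in op_total_time / op_max_time / op_min_time /
// op_count, gathered into one small struct (assumes <cfloat> for DBL_MAX).
struct OpTimeStat {
  double total_us = 0.0;
  double max_us = 0.0;
  double min_us = DBL_MAX;
  int64_t count = 0;
  void Record(double us) {
    total_us += us;
    if (us > max_us) max_us = us;
    if (us < min_us) min_us = us;
    ++count;
  }
  double MeanUS() const { return count > 0 ? total_us / count : 0.0; }
};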
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
......
...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <fstream> #include <fstream>
#include <map>
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <string> #include <string>
...@@ -217,28 +218,35 @@ class PipelineTrainer : public TrainerBase {
virtual Scope* GetWorkerScope(int thread_id); virtual Scope* GetWorkerScope(int thread_id);
void InitDumpEnv() override; void InitDumpEnv() override;
virtual std::string GetDumpPath(int tid); virtual std::string GetDumpPath(int tid);
void GetSkipVars(int section_id, const ProgramDesc& main_program); void GetSkipVars(const ProgramDesc& main_program);
protected: protected:
int section_num_; // int section_num_;
int num_microbatches_; int num_microbatches_;
int start_cpu_core_id_; int start_cpu_core_id_;
std::vector<platform::Place> places_; // std::vector<platform::Place> places_;
std::vector<std::vector<std::string>> skip_vars_; platform::Place place_;
// std::vector<std::vector<std::string>> skip_vars_;
std::vector<std::string> skip_vars_;
TrainerDesc trainer_desc_; TrainerDesc trainer_desc_;
std::vector<std::thread> section_threads_; // std::vector<std::thread> section_threads_;
std::thread section_thread_;
// worker: [section_id] // worker: [section_id]
std::vector<std::shared_ptr<paddle::framework::DeviceWorker>> workers_; // std::vector<std::shared_ptr<paddle::framework::DeviceWorker>> workers_;
std::shared_ptr<paddle::framework::DeviceWorker> worker_;
// minibatch_scopes_: [section_id] // minibatch_scopes_: [section_id]
std::vector<Scope*> minibatch_scopes_; // std::vector<Scope*> minibatch_scopes_;
Scope* minibatch_scope_;
// microbatch_scopes_: [section_id][microbatch_id] // microbatch_scopes_: [section_id][microbatch_id]
std::vector<std::vector<Scope*>> microbatch_scopes_; // std::vector<std::vector<Scope*>> microbatch_scopes_;
// microbatch_scopes_: [microbatch_id]
void CopyParameters(int section_id, int microbatch_id, std::vector<Scope*> microbatch_scopes_;
const ProgramDesc& program, const platform::Place& place);
bool isPersistableVarGrad(std::string name); void CopyParameters(int microbatch_id, const ProgramDesc& program,
bool isPersistable(VarDesc* var); const platform::Place& place);
// bool isPersistableVarGrad(std::string name);
// bool isPersistable(VarDesc* var);
}; };
#endif #endif
......
...@@ -84,7 +84,7 @@ message DownpourWorkerParameter {
} }
message SectionWorkerParameter { message SectionWorkerParameter {
repeated SectionConfig section_config = 1; SectionConfig section_config = 1;
optional int32 queue_size = 2 [ default = 1 ]; optional int32 queue_size = 2 [ default = 1 ];
optional int64 sync_steps = 3 [ default = 1 ]; optional int64 sync_steps = 3 [ default = 1 ];
optional int32 start_cpu_core_id = 4 [ default = 1 ]; optional int32 start_cpu_core_id = 4 [ default = 1 ];
......
...@@ -3784,6 +3784,7 @@ class PipelineOptimizer(object):
Args: Args:
main_program (Program): the main program main_program (Program): the main program
devices: all used devices
""" """
programs = [] programs = []
# Map from device to its corresponding section program info # Map from device to its corresponding section program info
...@@ -3910,10 +3911,10 @@ class PipelineOptimizer(object):
data_devices_map[var_name].append(dev_spec) data_devices_map[var_name].append(dev_spec)
return data_devices_map return data_devices_map
def _insert_enq_deq_for_data_var(self, main_block, programs, startup, def _insert_sendrecv_for_data_var(self, main_block, programs, startup,
devices): devices):
""" """
Insert enqueue and dequeue ops for data var that on other devices. Insert send and recv ops for data vars that are on other devices.
Args: Args:
main_block (Block): Global block for main program main_block (Block): Global block for main program
...@@ -3926,39 +3927,24 @@ class PipelineOptimizer(object):
first_prog = programs[0]['program'] first_prog = programs[0]['program']
first_block = first_prog.block(0) first_block = first_prog.block(0)
enqueue_index = 0 insert_index = 0
for op in first_block.ops: for op in first_block.ops:
enqueue_index += 1 insert_index += 1
if op.type == "read": if op.type == "read":
break break
first_dev_spec = devices[0] first_dev_spec = devices[0]
for var_name in data_devices_map.keys(): for var_name in data_devices_map.keys():
for device in data_devices_map[var_name]: for device in data_devices_map[var_name]:
if device == first_dev_spec: continue if device == first_dev_spec: continue
# step1: generate queue for each pair of data var and device
# that that data on
queue_name = var_name + "_blocking_queue"
queue_name = unique_name.generate(queue_name)
queue_var = startup.block(0).create_var(
name=queue_name,
persistable=True,
type=core.VarDesc.VarType.RAW)
startup.block(0).append_op(
type='queue_generator',
attrs={
'names': [queue_name],
'capacity': self._num_microbatches
})
main_var = main_block.var(var_name) main_var = main_block.var(var_name)
assert main_var.is_data assert main_var.is_data
if not var_name in first_block.vars: if not var_name in first_block.vars:
self._create_var(first_block, main_var, var_name) self._create_var(first_block, main_var, var_name)
first_block._insert_op( first_block._insert_op(
index=enqueue_index, index=insert_index,
type='enqueue', type='c_send',
inputs={'X': first_block.var(var_name)}, inputs={'X': first_block.var(var_name)},
attrs={ attrs={
'queue_name': queue_name,
self._op_device_key: first_dev_spec, self._op_device_key: first_dev_spec,
self._op_role_key: self._op_role.Forward self._op_role_key: self._op_role.Forward
}) })
...@@ -3972,12 +3958,11 @@ class PipelineOptimizer(object):
new_var = self._create_var(block, source_var, var_name) new_var = self._create_var(block, source_var, var_name)
block._insert_op( block._insert_op(
index=0, index=0,
type='dequeue', type='c_recv',
outputs={'Out': [new_var]}, outputs={'Out': [new_var]},
attrs={ attrs={
self._op_device_key: device, self._op_device_key: device,
self._op_role_key: self._op_role.Forward, self._op_role_key: self._op_role.Forward,
'queue_name': queue_name,
}) })
def _strip_grad_suffix(self, name): def _strip_grad_suffix(self, name):
...@@ -4080,23 +4065,22 @@ class PipelineOptimizer(object):
assert sorted_device_specs == device_specs assert sorted_device_specs == device_specs
return device_specs return device_specs
def _insert_enq_deq_ops_for_boundaries(self, block, origin_block, def _insert_sendrecv_ops_for_boundaries(self, block, origin_block):
startup_program):
""" """
Insert a pair of enqueue and dequeue ops for every two Insert a pair of send and recv ops for every two
consecutive ops on different devices. consecutive ops on different devices.
""" """
startup_block = startup_program.global_block()
extra_index = 0 extra_index = 0
# A map from var to device spec where op takes it as input, # A map from var to device spec where op takes it as input,
# avoiding multiple enqueue and dequeue ops. # avoiding multiple send and recv ops.
var_devspec = dict() var_devspec = dict()
for index, op in list(enumerate(origin_block.ops)): for index, op in list(enumerate(origin_block.ops)):
# skips lr-related op and vars, as we will process them later. # skips lr-related ops and vars, as we will process them later.
if int(op.attr(self._op_role_key)) & int(self._op_role.LRSched): if int(op.attr(self._op_role_key)) & int(self._op_role.LRSched):
continue continue
# skips update ops and vars, as we will process them later.
if self._is_update_op(op): continue if self._is_update_op(op): continue
cur_device_spec = op.attr(self._op_device_key) cur_device_spec = op.attr(self._op_device_key)
...@@ -4119,37 +4103,23 @@ class PipelineOptimizer(object): ...@@ -4119,37 +4103,23 @@ class PipelineOptimizer(object):
if cur_device_spec in var_devspec[var_name]: continue if cur_device_spec in var_devspec[var_name]: continue
var_devspec[var_name].append(cur_device_spec) var_devspec[var_name].append(cur_device_spec)
queue_name = var_name + "_blocking_queue"
queue_name = unique_name.generate(queue_name)
queue_var = startup_block.create_var(
name=queue_name,
persistable=True,
type=core.VarDesc.VarType.RAW)
startup_block.append_op(
type='queue_generator',
attrs={
'names': [queue_name],
'capacity': self._num_microbatches
})
op_role = op.all_attrs()[self._op_role_key] op_role = op.all_attrs()[self._op_role_key]
var = block.vars[var_name] var = block.vars[var_name]
block._insert_op( block._insert_op(
index=index + extra_index, index=index + extra_index,
type='enqueue', type='c_send',
inputs={'X': var}, inputs={'X': var},
attrs={ attrs={
'queue_name': queue_name,
self._op_device_key: prev_device_spec, self._op_device_key: prev_device_spec,
self._op_role_key: op_role self._op_role_key: op_role
}) })
extra_index += 1 extra_index += 1
block._insert_op( block._insert_op(
index=index + extra_index, index=index + extra_index,
type='dequeue', type='c_recv',
outputs={'Out': [var]}, outputs={'Out': [var]},
attrs={ attrs={
self._op_device_key: cur_device_spec, self._op_device_key: cur_device_spec,
'queue_name': queue_name,
self._op_role_key: op_role self._op_role_key: op_role
}) })
extra_index += 1 extra_index += 1
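For section boundaries, the pass now walks the original block and, whenever an op consumes a variable produced on a different device, inserts a c_send on the producing device followed by a c_recv on the consuming device, using var_devspec to avoid duplicating the pair for the same (variable, device) combination. A minimal, self-contained model of that bookkeeping is sketched below; ops are plain dicts and boundary_sendrecv is an illustrative name, not Paddle API.

# Toy sketch of the boundary pass: emit one c_send/c_recv pair the first
# time a variable crosses from its producing device to a consuming device.
def boundary_sendrecv(ops, var_device):
    inserted = []
    seen = {}  # var_name -> devices that already received it
    for op in ops:
        cur_dev = op['device']
        for var_name in op['inputs']:
            prev_dev = var_device.get(var_name)
            if prev_dev is None or prev_dev == cur_dev:
                continue  # produced locally, no cross-stage copy needed
            if cur_dev in seen.setdefault(var_name, set()):
                continue  # this device already received the variable once
            seen[var_name].add(cur_dev)
            inserted.append(('c_send', var_name, prev_dev))
            inserted.append(('c_recv', var_name, cur_dev))
    return inserted

ops = [
    {'device': 'gpu:0', 'inputs': []},            # produces fc_0.tmp on gpu:0
    {'device': 'gpu:1', 'inputs': ['fc_0.tmp']},  # consumes it on gpu:1
]
print(boundary_sendrecv(ops, {'fc_0.tmp': 'gpu:0'}))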
...@@ -4178,7 +4148,9 @@ class PipelineOptimizer(object): ...@@ -4178,7 +4148,9 @@ class PipelineOptimizer(object):
def _accumulate_gradients(self, block): def _accumulate_gradients(self, block):
""" """
Accumulate the graident generated in microbatch to the one in mini-batch. Accumulate the gradients generated in microbatch to the one in mini-batch.
We also scale the loss corresponding to number of micro-batches at
the same time.
""" """
for index, op in reversed(list(enumerate(block.ops))): for index, op in reversed(list(enumerate(block.ops))):
offset = index offset = index
...@@ -4210,12 +4182,10 @@ class PipelineOptimizer(object): ...@@ -4210,12 +4182,10 @@ class PipelineOptimizer(object):
for i in range(0, len(op_role_var), 2): for i in range(0, len(op_role_var), 2):
grad_name = op_role_var[i + 1] grad_name = op_role_var[i + 1]
grad_var = block.vars[grad_name] grad_var = block.vars[grad_name]
param_name = op_role_var[i] new_grad_var_name = unique_name.generate(grad_name)
param_var = block.vars[param_name] new_var = self._create_var(block, grad_var,
new_var_name = unique_name.generate(param_name) new_grad_var_name)
new_var_name = self._append_grad_suffix(new_var_name) self._rename_arg(op, grad_name, new_grad_var_name)
new_var = self._create_var(block, grad_var, new_var_name)
self._rename_arg(op, grad_name, new_var_name)
block._insert_op( block._insert_op(
index=offset + 1, index=offset + 1,
type='sum', type='sum',
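The gradient-accumulation hunk renames each microbatch gradient to a fresh, gradient-derived name and inserts a sum op that folds it into the persistent minibatch gradient, rather than deriving the accumulator name from the parameter as before. The net effect across microbatches is ordinary elementwise summation, illustrated below with NumPy (a toy model, not the Paddle sum op itself).

# Each microbatch writes its gradient into a freshly named tensor, and the
# inserted 'sum' op adds it onto the minibatch accumulator.
import numpy as np

def accumulate(minibatch_grad, microbatch_grads):
    for g in microbatch_grads:
        minibatch_grad = minibatch_grad + g  # what the inserted sum op computes
    return minibatch_grad

grads = [np.ones(3) * (i + 1) for i in range(4)]   # gradients of 4 microbatches
print(accumulate(np.zeros(3), grads))              # -> [10. 10. 10.]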
...@@ -4247,7 +4217,6 @@ class PipelineOptimizer(object): ...@@ -4247,7 +4217,6 @@ class PipelineOptimizer(object):
def _get_device_info(self, block): def _get_device_info(self, block):
for op in block.ops: for op in block.ops:
if not op._has_kernel(op.type): continue if not op._has_kernel(op.type): continue
op_device = op.attr(self._op_device_key) op_device = op.attr(self._op_device_key)
return op_device return op_device
...@@ -4282,7 +4251,7 @@ class PipelineOptimizer(object): ...@@ -4282,7 +4251,7 @@ class PipelineOptimizer(object):
for prog in var_info[var_name]: for prog in var_info[var_name]:
block = prog.block(0) block = prog.block(0)
for op in block.ops: for op in block.ops:
if op.type == "dequeue": continue if op.type == "c_recv": continue
# We have processed lr related vars # We have processed lr related vars
if op.attr(self._op_role_key) == int( if op.attr(self._op_role_key) == int(
self._op_role.Optimize.LRSched): self._op_role.Optimize.LRSched):
...@@ -4306,24 +4275,11 @@ class PipelineOptimizer(object): ...@@ -4306,24 +4275,11 @@ class PipelineOptimizer(object):
for prog in all_progs: for prog in all_progs:
if prog == write_prog: continue if prog == write_prog: continue
queue_name = var_name + "_blocking_queue"
queue_name = unique_name.generate(queue_name)
queue_var = startup_prog.block(0).create_var(
name=queue_name,
persistable=True,
type=core.VarDesc.VarType.RAW)
startup_prog.block(0).append_op(
type='queue_generator',
attrs={
'names': [queue_name],
'capacity': self._num_microbatches
})
write_block._insert_op( write_block._insert_op(
index=0, index=0,
type='enqueue', type='c_send',
inputs={'X': write_block.var(var_name), }, inputs={'X': write_block.var(var_name), },
attrs={ attrs={
'queue_name': queue_name,
self._op_device_key: write_device, self._op_device_key: write_device,
# A trick to make the role LRSched to avoid copy every # A trick to make the role LRSched to avoid copy every
# microbatch # microbatch
...@@ -4333,14 +4289,13 @@ class PipelineOptimizer(object): ...@@ -4333,14 +4289,13 @@ class PipelineOptimizer(object):
read_device = self._get_device_info(read_block) read_device = self._get_device_info(read_block)
read_block._insert_op( read_block._insert_op(
index=0, index=0,
type='dequeue', type='c_recv',
outputs={'Out': [read_block.var(var_name)]}, outputs={'Out': [read_block.var(var_name)]},
attrs={ attrs={
self._op_device_key: read_device, self._op_device_key: read_device,
# A trick to make the role LRSched to avoid copy every # A trick to make the role LRSched to avoid copy every
# microbatch # microbatch
self._op_role_key: self._op_role.LRSched, self._op_role_key: self._op_role.LRSched,
'queue_name': queue_name,
}) })
def minimize(self, def minimize(self,
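Persistable variables that appear in several sections are handled the same way: the section that writes the variable sends it once, tagged with the LRSched role so the copy is not repeated for every microbatch, and every reading section receives it at the top of its block. A hypothetical, framework-free outline follows, with sections identified by device strings; sync_persistable is an illustrative name only.

# Toy outline of the persistable-var pass: one c_send from the writer,
# one c_recv per reader, all tagged with the LRSched role so the copy
# happens once per minibatch rather than once per microbatch.
def sync_persistable(var_name, write_section, read_sections):
    plan = [('c_send', var_name, write_section, 'LRSched')]
    for section in read_sections:
        if section == write_section:
            continue
        plan.append(('c_recv', var_name, section, 'LRSched'))
    return plan

print(sync_persistable('learning_rate_0', 'gpu:0', ['gpu:0', 'gpu:1', 'gpu:2']))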
...@@ -4365,14 +4320,13 @@ class PipelineOptimizer(object): ...@@ -4365,14 +4320,13 @@ class PipelineOptimizer(object):
device_specs = self._check_validation(main_block) device_specs = self._check_validation(main_block)
# Step3: add enqueue and dequeue ops between section boundaries # Step3: add send and recv ops between section boundaries
origin_prog = main_block.program.clone(for_test=False) origin_prog = main_block.program.clone(for_test=False)
origin_main_block = origin_prog.global_block() origin_main_block = origin_prog.global_block()
self._insert_enq_deq_ops_for_boundaries(main_block, origin_main_block, self._insert_sendrecv_ops_for_boundaries(main_block, origin_main_block)
startup_program)
# Step4: accumulate gradients during backward # Step4: clear gradients before each mini-batch and
# and clear them after update # accumulate gradients during backward
self._clear_gradients(main_block) self._clear_gradients(main_block)
self._accumulate_gradients(main_block) self._accumulate_gradients(main_block)
...@@ -4392,14 +4346,14 @@ class PipelineOptimizer(object): ...@@ -4392,14 +4346,14 @@ class PipelineOptimizer(object):
raise ValueError("Unknown device type: %s", dev_spec) raise ValueError("Unknown device type: %s", dev_spec)
# Step5: split program into sections and add pairs of # Step5: split program into sections and add pairs of
# enqueue and dequeue ops for data var. # send and recv ops for data var.
if len(place_list) <= 1: if len(place_list) <= 1:
raise ValueError("Run on one device, do not use pipeline.") raise ValueError("Run on one device, do not use pipeline.")
program_list = self._split_program(main_program, device_specs) program_list = self._split_program(main_program, device_specs)
for p in program_list: for p in program_list:
self._create_vars(p["program"].block(0), main_program) self._create_vars(p["program"].block(0), main_program)
self._insert_enq_deq_for_data_var(main_block, program_list, self._insert_sendrecv_for_data_var(main_block, program_list,
startup_program, device_specs) startup_program, device_specs)
# Step6: Special Case: process persistable vars that exist in # Step6: Special Case: process persistable vars that exist in
# multiple sections # multiple sections
......