Unverified commit e39aa70e, authored by lilong12, committed by GitHub

add the support for pipeline (#24560)

* add device_worker for pipeline, test=develop
Parent 0dcb8754
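For context, the user-facing flow this patch targets, condensed from the updated test_pipeline unit test further down. This is an illustrative sketch, not reference usage: the network is collapsed to a single fc layer for brevity, and the random reader, dataset placeholder, and hyperparameter values are simply the ones the test itself uses.

    # Sketch of the intended usage, condensed from the updated unit test in this commit.
    import numpy as np
    import paddle.fluid as fluid

    main_prog, startup_prog = fluid.Program(), fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        # Section 0: data layers live on CPU.
        with fluid.device_guard("cpu"):
            image = fluid.layers.data(
                name="image", shape=[3, 224, 224], dtype="float32")
            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
            data_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image, label], capacity=64, iterable=False)
            fc = fluid.layers.fc(input=image, size=10)
        # Section 1: the compute-heavy part runs on the first GPU.
        with fluid.device_guard("gpu:0"):
            loss = fluid.layers.mean(
                fluid.layers.softmax_with_cross_entropy(logits=fc, label=label))
        opt = fluid.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
        # num_microbatches replaces the old queue_size/sync_steps knobs.
        opt = fluid.optimizer.PipelineOptimizer(opt, num_microbatches=2)
        opt.minimize(loss)

    def train_reader():
        for _ in range(4):
            yield (np.random.random([3, 224, 224]).astype('float32'),
                   np.random.random([1]).astype('int64'))

    data_loader.set_sample_generator(train_reader, batch_size=1)
    # The dataset object is only required by the train_from_dataset interface
    # and carries no actual data, exactly as in the unit test.
    dataset = fluid.DatasetFactory().create_dataset('FileInstantDataset')
    dataset.set_batch_size(1)
    dataset.set_thread(1)
    dataset.set_filelist(['/tmp/tmp_2.txt'])
    dataset.set_use_var([image, label])

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_prog)
    data_loader.start()
    exe.train_from_dataset(main_prog, dataset)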
@@ -51,10 +51,6 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
 class FleetWrapper;
 
-#define SEC_LOG                                                               \
-  VLOG(3) << "[s" << section_id_ << "p" << pipeline_id_ << "t" << thread_id_ \
-          << "]: "
-
 class PullDenseWorker {
  public:
   virtual ~PullDenseWorker() {}
@@ -311,40 +307,9 @@ class DownpourWorkerOpt : public DownpourWorker {
 };
 
 #if defined(PADDLE_WITH_NCCL)
-using ScopeQueue = operators::reader::BlockingQueue<Scope*>;
-
-class SyncFunctor {
- public:
-  SyncFunctor(int rank_id, int rank_num, int sync_steps);
-  virtual ~SyncFunctor() {}
-
-  void SetSyncParam(const std::vector<std::string>& sync_param) {
-    sync_param_ = &sync_param;
-  }
-  void SetNcclCtxMap(platform::NCCLContextMap* nccl_ctx_map) {
-    nccl_ctx_map_ = nccl_ctx_map;
-  }
-  int operator()(Scope* scope);
-  static std::vector<Scope*> pipeline_scopes_;
-  static uint64_t sync_flag_;
-
- protected:
-  const int rank_id_;
-  const int rank_num_;
-  const std::vector<std::string>* sync_param_ = nullptr;
-  platform::NCCLContextMap* nccl_ctx_map_ = nullptr;
-
-  uint64_t sync_signal_;
-  const int sync_steps_;
-  int counter_;
-
-  void Synchronize();
-};
-
 class SectionWorker : public DeviceWorker {
  public:
-  SectionWorker() {}
+  SectionWorker() { local_batch_id_ = 0; }
   ~SectionWorker() override {}
   void Initialize(const TrainerDesc& desc) override;
@@ -360,50 +325,39 @@ class SectionWorker : public DeviceWorker {
   const platform::Place& place() const { return place_; }
 
   void SetSectionIndex(int section_id) { section_id_ = section_id; }
-  void SetDeviceIndex(int tid) override { pipeline_id_ = tid; }
+  void SetDeviceIndex(int tid) override {}
   void SetThreadIndex(int thread_id) { thread_id_ = thread_id; }
-  void SetVarNames(const std::vector<std::string>& in_var_names,
-                   const std::vector<std::string>& out_var_names) {
-    in_var_names_ = &in_var_names;
-    out_var_names_ = &out_var_names;
-  }
-  void SetScopeQueue(ScopeQueue* in_scope_queue, ScopeQueue* out_scope_queue) {
-    in_scope_queue_ = in_scope_queue;
-    out_scope_queue_ = out_scope_queue;
-  }
-  void SetCountMutex(std::mutex* mutex) { worker_count_mutex_ = mutex; }
-  void SetWorkerCount(int* worker_count) { worker_count_ = worker_count; }
-  void SetSectionNum(int section_num) { section_num_ = section_num; }
-  void SetPipelineNum(int pipeline_num) { pipeline_num_ = pipeline_num; }
-  void SetNextSectionPlace(const paddle::platform::Place& place) {
-    next_section_place_ = place;
-  }
-  SyncFunctor* sync_func_ = nullptr;
-  void SetSyncFunctor(SyncFunctor* sync_func) { sync_func_ = sync_func; }
+  void SetMicrobatchNum(int num) { num_microbatches_ = num; }
+  void SetMicrobatchScopes(const std::vector<Scope*>& scope) {
+    microbatch_scopes_ = scope;
+  }
+  void SetMinibatchScope(const Scope* scope) { minibatch_scope_ = scope; }
+  void SetSkipVars(const std::vector<std::string>& skip_vars) {
+    skip_vars_ = skip_vars;
+  }
   static std::atomic<int> cpu_id_;
 
  protected:
   void AutoSetCPUAffinity(bool reuse);
   int section_id_;
-  int pipeline_id_;
-  int section_num_;
-  int pipeline_num_;
   int thread_id_;
-  // This worker will consume scope from in_scope_queue_
-  // and produce scope to out_scope_queue_
-  ScopeQueue* in_scope_queue_ = nullptr;
-  ScopeQueue* out_scope_queue_ = nullptr;
-  const std::vector<std::string>* in_var_names_ = nullptr;
-  const std::vector<std::string>* out_var_names_ = nullptr;
-  std::mutex* worker_count_mutex_ = nullptr;
-  int* worker_count_ = nullptr;
-  paddle::platform::Place next_section_place_;
+  int num_microbatches_;
+  std::vector<Scope*> microbatch_scopes_;
+  std::vector<std::string> skip_vars_;
+  const Scope* minibatch_scope_;
 
   std::vector<std::unique_ptr<OperatorBase>> ops_;
+  static std::mutex thread_mutex;
+  static std::condition_variable thread_condition;
+  static bool threads_completed;
+  std::shared_ptr<framework::ProgramDesc> program_;
+  static uint64_t batch_id_;
+  uint64_t local_batch_id_;
 
   platform::DeviceContext* dev_ctx_ = nullptr;
 };
 #endif
 
 }  // namespace framework
 }  // namespace paddle
@@ -23,8 +23,13 @@ namespace framework {
 
 void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
                                  Dataset* dataset) {
-  pipeline_num_ = trainer_desc.thread_num();
-  VLOG(3) << "pipeline num: " << pipeline_num_;
+  const auto& section_params = trainer_desc.section_param();
+  num_microbatches_ = section_params.num_microbatches();
+  VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_;
+  section_num_ = section_params.section_config_size();
+  VLOG(3) << "Number of program sections: " << section_num_;
+  trainer_desc_ = trainer_desc;
+  start_cpu_core_id_ = section_params.start_cpu_core_id();
 
   SetDataset(dataset);
   ParseDumpConfig(trainer_desc);
@@ -32,96 +37,62 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
   const std::vector<paddle::framework::DataFeed*> readers =
       dataset->GetReaders();
   VLOG(3) << "readers num: " << readers.size();
-  pipeline_config_ = trainer_desc.section_param();
-  scope_queue_size_ = pipeline_config_.queue_size();
-  sync_steps_ = pipeline_config_.sync_steps();
-  section_num_ = pipeline_config_.section_config_size();
-
-  VLOG(3) << "scope_queue_size: " << scope_queue_size_;
-  VLOG(3) << "section num: " << section_num_;
-  VLOG(3) << "sync_steps: " << sync_steps_;
+  int num_readers = readers.size();
+  PADDLE_ENFORCE_EQ(num_readers, 1,
+                    platform::errors::InvalidArgument(
+                        "Number of dataset readers for pipeline "
+                        "must be 1 now, but the value you give is %d.",
+                        num_readers));
+  auto* reader = readers[0];
+  feed_var_names_ = reader->GetUseSlotAlias();
 
   workers_.resize(section_num_);
-  in_var_names_.resize(section_num_);
-  out_var_names_.resize(section_num_);
-  worker_count_.resize(section_num_);
-  worker_count_mutex_.resize(section_num_);
-  param_need_sync_.reset(new std::vector<std::string>);
-
-  int reader_index = 0;
   for (int i = 0; i < section_num_; ++i) {
-    const auto& section_config = pipeline_config_.section_config(i);
-    int concurrency = section_config.concurrency();
-    VLOG(3) << "the thread num of each pipeline in section " << i
-            << " is: " << concurrency;
-    in_var_names_[i].reset(new std::vector<std::string>(
-        section_config.section_in_var_names().begin(),
-        section_config.section_in_var_names().end()));
-    out_var_names_[i].reset(new std::vector<std::string>(
-        section_config.section_out_var_names().begin(),
-        section_config.section_out_var_names().end()));
-    worker_count_[i].resize(pipeline_num_);
-    worker_count_mutex_[i].resize(pipeline_num_);
-    for (int j = 0; j < pipeline_num_; ++j) {
-      worker_count_[i][j] = new int(concurrency);
-      worker_count_mutex_[i][j].reset(new std::mutex);
-    }
+    const auto& section_config = section_params.section_config(i);
     platform::Place place;
-    workers_[i].resize(pipeline_num_);
-    for (int j = 0; j < pipeline_num_; ++j) {
-      workers_[i][j].resize(concurrency);
-
-      switch (section_config.place()) {
-        case SectionConfig::CPUPlace:
-          place = platform::CPUPlace();
-          break;
-        case SectionConfig::CUDAPlace:
-          // Note that one section has at most one GPU place in one pipeline
-          place = platform::CUDAPlace(j);
-          break;
-        case SectionConfig::CUDAPinnedPlace:
-          place = platform::CUDAPinnedPlace();
-          break;
-        default:
-          PADDLE_ENFORCE(false, "Unkown place type in SectionConfig: %d",
-                         section_config.place());
-      }
-
-      for (int k = 0; k < concurrency; ++k) {
-        workers_[i][j][k] = DeviceWorkerFactory::CreateDeviceWorker(
-            trainer_desc.device_worker_name());
-        auto this_worker =
-            std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                workers_[i][j][k]);
-        this_worker->SetSectionIndex(i);
-        this_worker->SetDeviceIndex(j);
-        this_worker->SetThreadIndex(k);
-        this_worker->SetSectionNum(section_num_);
-        this_worker->SetPipelineNum(pipeline_num_);
-        if (i == 0) {
-          this_worker->SetDataFeed(readers[reader_index++]);
-          this_worker->SetReaderPlace(place);
-        }
-        if (i == section_num_ - 1) {
-          this_worker->SetNeedDumpField(need_dump_field_);
-          this_worker->SetNeedDumpParam(need_dump_param_);
-          this_worker->SetDumpFieldVector(dump_fields_);
-          this_worker->SetDumpParamVector(dump_param_);
-        }
-        this_worker->SetPlace(place);
-        this_worker->Initialize(trainer_desc);
-        this_worker->InitRandomDumpConfig(trainer_desc);
-      }
-    }
-  }
-
-  param_need_sync_.reset(
-      new std::vector<std::string>(pipeline_config_.param_need_sync().begin(),
-                                   pipeline_config_.param_need_sync().end()));
-  VLOG(3) << "param_need_sync_ have: ";
-  for (const std::string& name : *param_need_sync_) {
-    VLOG(3) << name;
+    int place_id = section_config.place_id();
+    switch (section_config.place()) {
+      case SectionConfig::CPUPlace:
+        place = platform::CPUPlace();
+        break;
+      case SectionConfig::CUDAPlace:
+        // Note that one section has at most one GPU place in one pipeline
+        PADDLE_ENFORCE_GE(
+            place_id, 0,
+            platform::errors::InvalidArgument(
+                "The place_id value for CUDAPlace shoud be greater "
+                "than or equal to 0, but the value you give is %d.",
+                place_id));
+        place = platform::CUDAPlace(place_id);
+        break;
+      case SectionConfig::CUDAPinnedPlace:
+        place = platform::CUDAPinnedPlace();
+        break;
+      default:
+        PADDLE_ENFORCE_NOT_NULL(nullptr,
+                                platform::errors::InvalidArgument(
+                                    "Unkown place type in SectionConfig: %d",
+                                    section_config.place()));
+    }
+    places_.emplace_back(place);
+    VLOG(3) << "Device worker place: " << place << ", device id: " << place_id
+            << ", section: " << i;
+
+    workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
+        trainer_desc.device_worker_name());
+    auto this_worker =
+        std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
+            workers_[i]);
+    if (i == 0) {
+      // we only set reader for the first section
+      this_worker->SetDataFeed(reader);
+      this_worker->SetReaderPlace(place);
+    }
+    this_worker->SetThreadIndex(i);
+    this_worker->SetSectionIndex(i);
+    this_worker->SetPlace(place);
+    this_worker->Initialize(trainer_desc);
+    this_worker->SetMicrobatchNum(num_microbatches_);
   }
 
   // set debug here
   SetDebug(trainer_desc.debug());
@@ -140,13 +111,7 @@ std::string PipelineTrainer::GetDumpPath(int tid) {
 
 void PipelineTrainer::InitDumpEnv() {
   queue_ = paddle::framework::MakeChannel<std::string>();
-  // Only set dump channel on the last section
-  for (int j = 0; j < pipeline_num_; ++j) {
-    for (size_t k = 0; k < workers_[section_num_ - 1][j].size(); ++k) {
-      workers_[section_num_ - 1][j][k]->SetChannelWriter(queue_.get());
-    }
-  }
-  // TODO(hutuxian): should make it as a config
+  // TODO(sandyhouse): should make it as a config
   dump_thread_num_ = 1;
   for (int i = 0; i < dump_thread_num_; i++) {
     dump_thread_.push_back(
@@ -154,150 +119,105 @@ void PipelineTrainer::InitDumpEnv() {
   }
 }
 
-void PipelineTrainer::InitFirstScopeQueue(ScopeQueue* scope_queue,
-                                          int pipeline_id,
-                                          const ProgramDesc& main_program,
-                                          const Scope& root_scope) {
-  for (int i = 0; i < scope_queue_size_; ++i) {
-    Scope* scope = &pipeline_scopes_[pipeline_id]->NewScope();
-    for (auto& var : main_program.Block(0).AllVars()) {
-      if (!var->Persistable()) {
-        auto* ptr = scope->Var(var->Name());
-        InitializeVariable(ptr, var->GetType());
-      } else {
-        if (section_num_ == 1) {  // Means only one section and it must be
-                                  // CUDAPlace, so copy all persistable vars to
-                                  // pipeline scope
-          const LoDTensor& root_tensor =
-              root_scope.FindVar(var->Name())->Get<LoDTensor>();
-          LoDTensor* gpu_tensor = pipeline_scopes_[pipeline_id]
-                                      ->Var(var->Name())
-                                      ->GetMutable<LoDTensor>();
-          platform::Place place = platform::CUDAPlace(pipeline_id);
-          TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
-                     static_cast<Tensor*>(gpu_tensor));
-        }
-      }
-    }
-    scope_queue->Send(scope);
-  }
-}
-
-void PipelineTrainer::CopyParameters(const Scope& root_scope, int pipeline_id) {
-  for (const std::string& name : *param_need_sync_) {
-    const LoDTensor& root_tensor = root_scope.FindVar(name)->Get<LoDTensor>();
-    // TODO(hutxian): check a new var of the same name is created in
-    // pipeline_scope
-    LoDTensor* gpu_tensor =
-        pipeline_scopes_[pipeline_id]->Var(name)->GetMutable<LoDTensor>();
-    platform::Place place = platform::CUDAPlace(pipeline_id);
-    TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
-               static_cast<Tensor*>(gpu_tensor));
+void PipelineTrainer::CopyParameters(int section_id, int microbatch_id,
+                                     const ProgramDesc& program,
+                                     const platform::Place& place) {
+  auto& global_block = program.Block(0);
+  for (auto& var : global_block.AllVars()) {
+    int is_feed_var =
+        std::count(feed_var_names_.begin(), feed_var_names_.end(), var->Name());
+    if ((var->Persistable() || is_feed_var) && microbatch_id == 0) {
+      if (is_feed_var) {
+        auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
+        VLOG(3) << "data name: " << var->Name() << ", ptr: " << new_ptr;
+        InitializeVariable(new_ptr, var->GetType());
+      } else {
+        auto* ptr = root_scope_->FindVar(var->Name());
+        auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
+        VLOG(3) << "Create persistable var " << var->Name() << " for minibatch "
+                << section_id << ", which pointer is " << new_ptr;
+        InitializeVariable(new_ptr, var->GetType());
+        const LoDTensor& root_tensor = ptr->Get<LoDTensor>();
+        LoDTensor* minibatch_tensor = new_ptr->GetMutable<LoDTensor>();
+        TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
+                   static_cast<Tensor*>(minibatch_tensor));
+      }
+    } else if (!var->Persistable() && !is_feed_var) {
+      auto* ptr =
+          microbatch_scopes_[section_id][microbatch_id]->Var(var->Name());
+      VLOG(3) << "Create variable " << var->Name() << " for section "
+              << section_id << " microbatch " << microbatch_id
+              << ", which pointer is " << ptr;
+      InitializeVariable(ptr, var->GetType());
+    }
   }
 }
 
+void PipelineTrainer::GetSkipVars(int section_id, const ProgramDesc& program) {
+  auto& global_block = program.Block(0);
+  for (auto& op : global_block.AllOps()) {
+    if (op->Type() != "enqueue") {
+      continue;
+    }
+    auto input_arg_names = op->InputArgumentNames();
+    PADDLE_ENFORCE_EQ(input_arg_names.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "Number of input arguments for enqueue op must be 1, "
+                          "but the value is %d.",
+                          input_arg_names.size()));
+    std::string input_arg_name = input_arg_names[0];
+    if (input_arg_name.rfind("@GRAD") != input_arg_name.size() - 5) {
+      skip_vars_[section_id].emplace_back(input_arg_name);
+      VLOG(3) << "add skip var name: " << input_arg_name;
+    }
+  }
+}
+
 void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
                                      const platform::Place& place) {
-  PADDLE_ENFORCE(root_scope_, "Null root_scope pointer");
-  SectionWorker::cpu_id_.store(pipeline_config_.start_cpu_core_id());
-  scope_queues_.resize(section_num_);
-  pipeline_scopes_.resize(pipeline_num_);
-  for (auto& var : main_program.Block(0).AllVars()) {
-    if (var->Persistable()) {
-      persistable_vars_.push_back(var->Name());
-    }
-  }
+  PADDLE_ENFORCE_NOT_NULL(root_scope_,
+                          platform::errors::InvalidArgument(
+                              "root_scope pointer can not be nullptr"));
+  auto start_cpu_id = trainer_desc_.section_param().start_cpu_core_id();
+  SectionWorker::cpu_id_.store(start_cpu_id);
+  minibatch_scopes_.resize(section_num_);
+  microbatch_scopes_.resize(section_num_);
+  skip_vars_.resize(section_num_);
 
   VLOG(3) << "Init ScopeQueues and create all scopes";
   for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      scope_queues_[i].emplace_back(new ScopeQueue(scope_queue_size_));
-      if (i == 0) {
-        pipeline_scopes_[j] = &root_scope_->NewScope();
-        CopyParameters(*root_scope_, j);
-        InitFirstScopeQueue(scope_queues_[0].back().get(), j, main_program,
-                            *root_scope_);
-      }
-    }
+    minibatch_scopes_[i] = &root_scope_->NewScope();
+    std::shared_ptr<framework::ProgramDesc> program;
+    program.reset(new ProgramDesc(
+        trainer_desc_.section_param().section_config(i).program_desc()));
+    microbatch_scopes_[i].resize(num_microbatches_);
+    for (int j = 0; j < num_microbatches_; ++j) {
+      microbatch_scopes_[i][j] = &minibatch_scopes_[i]->NewScope();
+      CopyParameters(i, j, *program, places_[i]);
+    }
+    GetSkipVars(i, *program);
   }
 
   for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-        auto this_worker =
-            std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                workers_[i][j][k]);
-        this_worker->SetRootScope(root_scope_);
-        this_worker->SetCountMutex(worker_count_mutex_[i][j].get());
-        this_worker->SetWorkerCount(worker_count_[i][j]);
-        this_worker->SetScopeQueue(scope_queues_[i][j].get(),
-                                   (i == section_num_ - 1)
-                                       ? scope_queues_[0][j].get()
-                                       : scope_queues_[i + 1][j].get());
-        this_worker->SetVarNames(*in_var_names_[i], *out_var_names_[i]);
-        if (i != section_num_ - 1) {
-          // For data copy in adjacent different place
-          this_worker->SetNextSectionPlace(
-              std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                  workers_[i + 1][j][0])
-                  ->place());
-        }
-      }
-    }
-  }
-  if (pipeline_num_ > 1 && sync_steps_ != -1) {
-    construct_sync_functor();
-  }
-}
-
-void PipelineTrainer::construct_sync_functor() {
-  std::vector<platform::Place> cuda_places;
-  for (int i = 0; i < pipeline_num_; ++i) {
-    cuda_places.emplace_back(platform::CUDAPlace(i));
-  }
-  nccl_ctx_map_.reset(new platform::NCCLContextMap(cuda_places));
-  sync_functors_.resize(pipeline_num_);
-  SyncFunctor::sync_flag_ = 0;
-  SyncFunctor::pipeline_scopes_.resize(0);
-
-  for (int j = 0; j < pipeline_num_; ++j) {
-    SyncFunctor* sync_function = new SyncFunctor(j, pipeline_num_, sync_steps_);
-    sync_function->SetSyncParam(*param_need_sync_);
-    sync_function->SetNcclCtxMap(nccl_ctx_map_.get());
-    SyncFunctor::pipeline_scopes_.push_back(this->pipeline_scopes_[j]);
-    sync_functors_[j].reset(sync_function);
-  }
-  for (int i = section_num_ - 1; i >= 0; --i) {
-    if (SectionConfig::CUDAPlace ==
-        pipeline_config_.section_config(i).place()) {
-      for (int j = 0; j < pipeline_num_; ++j) {
-        for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-          auto this_worker =
-              std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                  workers_[i][j][k]);
-          this_worker->SetSyncFunctor(sync_functors_[j].get());
-        }
-      }
-      break;
-    }
+    auto this_worker =
+        std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
+            workers_[i]);
+    this_worker->SetRootScope(root_scope_);
+    this_worker->SetMinibatchScope(minibatch_scopes_[i]);
+    this_worker->SetMicrobatchScopes(microbatch_scopes_[i]);
+    this_worker->SetSkipVars(skip_vars_[i]);
   }
 }
 
 void PipelineTrainer::Run() {
   VLOG(3) << "Going to run";
   for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-        if (!debug_) {
-          section_threads_.push_back(
-              std::thread(&DeviceWorker::TrainFiles, workers_[i][j][k].get()));
-        } else {
-          section_threads_.push_back(std::thread(
-              &DeviceWorker::TrainFilesWithProfiler, workers_[i][j][k].get()));
-        }
-      }
-    }
+    if (!debug_) {
+      section_threads_.push_back(
+          std::thread(&DeviceWorker::TrainFiles, workers_[i].get()));
+    } else {
+      section_threads_.push_back(std::thread(
+          &DeviceWorker::TrainFilesWithProfiler, workers_[i].get()));
+    }
   }
 }
@@ -309,18 +229,31 @@ void PipelineTrainer::Finalize() {
   if (need_dump_field_) {
     FinalizeDumpEnv();
   }
-  for (const auto& var : persistable_vars_) {
-    auto* root_tensor = root_scope_->Var(var)->GetMutable<LoDTensor>();
-    // TODO(hutuxian): Add a final all-reduce?
-    const auto& thread_tensor =
-        pipeline_scopes_[0]->FindVar(var)->Get<LoDTensor>();
-    TensorCopySync(thread_tensor, platform::CPUPlace(), root_tensor);
+  VLOG(3) << "copying back parameters. ";
+  for (int i = 0; i < section_num_; ++i) {
+    std::shared_ptr<framework::ProgramDesc> program;
+    program.reset(new ProgramDesc(
+        trainer_desc_.section_param().section_config(i).program_desc()));
+    for (int j = 0; j < num_microbatches_; ++j) {
+      auto& global_block = program->Block(0);
+      for (auto& var : global_block.AllVars()) {
+        if (var->Persistable()) {
+          auto* ptr = root_scope_->FindVar(var->Name());
+          LoDTensor* root_tensor = ptr->GetMutable<LoDTensor>();
+          auto* minibatch_ptr = minibatch_scopes_[i]->Var(var->Name());
+          const LoDTensor& minibatch_tensor = minibatch_ptr->Get<LoDTensor>();
+          TensorCopy(*static_cast<const Tensor*>(&minibatch_tensor), places_[0],
+                     static_cast<Tensor*>(root_tensor));
+          VLOG(4) << "Copy persitable var " << var->Name() << " to root scope";
+        }
+      }
+    }
   }
   root_scope_->DropKids();
 }
 
 Scope* PipelineTrainer::GetWorkerScope(int thread_id) {
-  return pipeline_scopes_[thread_id];
+  return microbatch_scopes_[thread_id][0];
 }
 }  // end namespace framework
......
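The scope layout built by the new InitTrainerEnv and CopyParameters above, restated as a small illustrative structure: the root scope keeps the persistable parameters, each program section gets one minibatch scope holding its parameter and feed-var copies, and each minibatch scope holds num_microbatches microbatch scopes for the non-persistable intermediates. The Python class below is a plain stand-in for paddle::framework::Scope, used only to visualize the hierarchy; it is not Paddle API.

    # Illustrative only: mimics the scope hierarchy that InitTrainerEnv creates.
    class Scope:
        def __init__(self, parent=None):
            self.vars, self.kids, self.parent = {}, [], parent

        def new_scope(self):
            kid = Scope(self)
            self.kids.append(kid)
            return kid

    section_num, num_microbatches = 2, 4
    root_scope = Scope()          # persistable parameters live here
    minibatch_scopes = []         # indexed as [section_id]
    microbatch_scopes = []        # indexed as [section_id][microbatch_id]
    for i in range(section_num):
        minibatch_scopes.append(root_scope.new_scope())
        microbatch_scopes.append(
            [minibatch_scopes[i].new_scope() for _ in range(num_microbatches)])
    # GetWorkerScope(section_id) in the new code returns microbatch_scopes[section_id][0].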
@@ -137,49 +137,31 @@ class PipelineTrainer : public TrainerBase {
   virtual Scope* GetWorkerScope(int thread_id);
   void InitDumpEnv() override;
   virtual std::string GetDumpPath(int tid);
+  void GetSkipVars(int section_id, const ProgramDesc& main_program);
 
  protected:
   int section_num_;
-  int pipeline_num_;
-  int scope_queue_size_;
-  int sync_steps_;
-
-  SectionWorkerParameter pipeline_config_;
-
-  // The in/output var names for each section
-  std::vector<std::unique_ptr<std::vector<std::string>>> in_var_names_;
-  std::vector<std::unique_ptr<std::vector<std::string>>> out_var_names_;
-
-  // Counter for the running thread
-  std::vector<std::vector<int*>> worker_count_;
-  std::vector<std::vector<std::unique_ptr<std::mutex>>> worker_count_mutex_;
-
-  // worker: [section_id][pipeline_id][thread_id]
-  std::vector<std::vector<
-      std::vector<std::shared_ptr<paddle::framework::DeviceWorker>>>>
-      workers_;
+  int num_microbatches_;
+  int start_cpu_core_id_;
+  std::vector<std::string> feed_var_names_;
+  std::vector<platform::Place> places_;
+  std::vector<std::vector<std::string>> skip_vars_;
+  TrainerDesc trainer_desc_;
+
   std::vector<std::thread> section_threads_;
-
-  // We use scope to maintain context info, and scopes
-  // will be deliverd between different sections.
-  std::vector<std::vector<std::unique_ptr<ScopeQueue>>> scope_queues_;
-  std::vector<Scope*> pipeline_scopes_;
-
-  // The parameters that should be syncronized between different cards using
-  // nccl all-reduce
-  std::shared_ptr<std::vector<std::string>> param_need_sync_;
-  std::vector<std::string> persistable_vars_;
-  std::vector<std::unique_ptr<SyncFunctor>> sync_functors_;
-  std::shared_ptr<platform::NCCLContextMap> nccl_ctx_map_;
-
-  std::vector<DataFeed*> readers_;
-
-  void InitFirstScopeQueue(ScopeQueue* scope_queue, int pipeline_id,
-                           const ProgramDesc& main_program,
-                           const Scope& root_scope);
-  void CopyParameters(const Scope& root_scope, int pipeline_id);
-  void construct_sync_functor();
+  // worker: [section_id]
+  std::vector<std::shared_ptr<paddle::framework::DeviceWorker>> workers_;
+  // minibatch_scopes_: [section_id]
+  std::vector<Scope*> minibatch_scopes_;
+  // microbatch_scopes_: [section_id][microbatch_id]
+  std::vector<std::vector<Scope*>> microbatch_scopes_;
+
+  void CopyParameters(int section_id, int microbatch_id,
+                      const ProgramDesc& program, const platform::Place& place);
+  bool isPersistableVarGrad(std::string name);
+  bool isPersistable(VarDesc* var);
 };
 #endif
 
 }  // namespace framework
 }  // namespace paddle
@@ -83,6 +83,7 @@ message SectionWorkerParameter {
   optional int64 sync_steps = 3 [ default = 1 ];
   optional int32 start_cpu_core_id = 4 [ default = 1 ];
   repeated string param_need_sync = 5;
+  optional int32 num_microbatches = 6;
 }
 
 message SectionConfig {
@@ -99,6 +100,7 @@ message SectionConfig {
   optional int32 concurrency = 3 [ default = 1 ];
   repeated string section_in_var_names = 4;
   repeated string section_out_var_names = 5;
+  optional int32 place_id = 6 [ default = -1 ];
 }
 
 message FetchConfig {
......
@@ -403,11 +403,8 @@ class Section(DeviceWorker):
         trainer_desc.device_worker_name = "SectionWorker"
         pipeline_opt = self._program._pipeline_opt
         section_param = trainer_desc.section_param
-        section_param.queue_size = pipeline_opt["queue_size"]
-        section_param.sync_steps = pipeline_opt["sync_steps"]
+        section_param.num_microbatches = pipeline_opt["num_microbatches"]
         section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"]
-        for e in pipeline_opt["param_need_sync"]:
-            section_param.param_need_sync.append(e)
         for i, program in enumerate(pipeline_opt["section_program_list"]):
             cfg = section_param.section_config.add()
             cfg.program_desc.ParseFromString(program["program"]._get_desc()
@@ -415,6 +412,7 @@ class Section(DeviceWorker):
             # TODO: why does not work
             # cfg.program_desc.CopyFrom(program.program._get_desc())
             place = pipeline_opt["place_list"][i]
+            place_id = pipeline_opt["place_id_list"][i]
             if isinstance(place, core.CPUPlace):
                 cfg.place = cfg.CPUPlace
             elif isinstance(place, core.CUDAPlace):
@@ -425,12 +423,7 @@ class Section(DeviceWorker):
                 raise NotImplementedError(
                     "SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
                 )
-
-            cfg.concurrency = pipeline_opt["concurrency_list"][i]
-            for var in program["input_set"]:
-                cfg.section_in_var_names.append(var)
-            for var in program["output_set"]:
-                cfg.section_out_var_names.append(var)
+            cfg.place_id = place_id
 
 
 class DeviceWorkerFactory(object):
......
@@ -4474,7 +4474,7 @@ class PipelineOptimizer(object):
             "place_list": place_list,
             "place_id_list": place_id_list,
             "sync_steps": -1,
-            "queue_size": self._num_microbatches,
+            "num_microbatches": self._num_microbatches,
             "start_cpu_core_id": self._start_cpu_core_id,
         }
         return optimize_ops, params_grads, program_list
......
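Putting the Python-side pieces together: after this change, PipelineOptimizer.minimize() hands the trainer a _pipeline_opt dict roughly like the sketch below, and the Section device worker above copies num_microbatches, start_cpu_core_id, and each section's place and place_id into the SectionWorkerParameter/SectionConfig fields added in the proto change. Only keys visible in this diff are shown; the concrete values and the placeholder Program objects are illustrative, not taken from a real run.

    # Hypothetical snapshot of program._pipeline_opt after minimize(); values are placeholders.
    import paddle.fluid as fluid

    section_program_list = [{"program": fluid.Program()},   # per-section sub-program
                            {"program": fluid.Program()}]
    pipeline_opt = {
        "section_program_list": section_program_list,
        "place_list": [fluid.CPUPlace(), fluid.CUDAPlace(0)],  # one place per section
        "place_id_list": [-1, 0],       # -1 for CPU sections, CUDA device id otherwise
        "num_microbatches": 2,          # new key, replaces the removed "queue_size"
        "sync_steps": -1,               # still emitted, no longer read by Section
        "start_cpu_core_id": 0,
    }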
@@ -100,7 +100,7 @@ def build_network(input, layers=50, class_dim=1000):
         pool_type='max')
     if layers >= 50:
         for block in range(len(depth)):
-            with fluid.device_guard("cpu"):
+            with fluid.device_guard("gpu:0"):
                 for i in range(depth[block]):
                     conv = bottleneck_block(
                         input=conv,
@@ -118,7 +118,7 @@ def build_network(input, layers=50, class_dim=1000):
                 initializer=fluid.initializer.Uniform(-stdv, stdv)))
     else:
         for block in range(len(depth)):
-            with fluid.device_guard("cpu"):
+            with fluid.device_guard("gpu:0"):
                 for i in range(depth[block]):
                     conv = basic_block(
                         input=conv,
@@ -140,38 +140,68 @@ def build_network(input, layers=50, class_dim=1000):
 
 class TestPipeline(unittest.TestCase):
     """ TestCases for Pipeline Training. """
 
+    def _run(self, debug):
+        main_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.device_guard("cpu"):
+                image = fluid.layers.data(
+                    name="image", shape=[3, 224, 224], dtype="float32")
+                label = fluid.layers.data(
+                    name="label", shape=[1], dtype="int64")
+                data_loader = fluid.io.DataLoader.from_generator(
+                    feed_list=[image, label],
+                    capacity=64,
+                    use_double_buffer=True,
+                    iterable=False)
+                fc = build_network(image, layers=50)
+            with fluid.device_guard("gpu:0"):
+                out, prob = fluid.layers.softmax_with_cross_entropy(
+                    logits=fc, label=label, return_softmax=True)
+                loss = fluid.layers.mean(out)
+                acc_top1 = fluid.layers.accuracy(input=prob, label=label, k=1)
+                acc_top5 = fluid.layers.accuracy(input=prob, label=label, k=5)
+
+            base_lr = 0.1
+            passes = [30, 60, 80, 90]
+            total_images = 1281167
+            steps_per_pass = total_images // 128
+            bd = [steps_per_pass * p for p in passes]
+            lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+            lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+            optimizer = fluid.optimizer.MomentumOptimizer(
+                lr_val,
+                momentum=0.9,
+                regularization=fluid.regularizer.L2Decay(1e-4))
+            optimizer = fluid.optimizer.PipelineOptimizer(
+                optimizer, num_microbatches=2)
+            optimizer.minimize(loss)
+
+        def train_reader():
+            for _ in range(4):
+                img = np.random.random(size=[3, 224, 224]).astype('float32')
+                label = np.random.random(size=[1]).astype('int64')
+                yield img, label
+
+        data_loader.set_sample_generator(train_reader, batch_size=1)
+        place = fluid.CPUPlace()
+        # The following dataset is only used for the
+        # interface 'train_from_dataset'.
+        # And it has no actual meaning.
+        dataset = fluid.DatasetFactory().create_dataset('FileInstantDataset')
+        dataset.set_batch_size(1)
+        dataset.set_thread(1)
+        dataset.set_filelist(['/tmp/tmp_2.txt'])
+        dataset.set_use_var([image, label])
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        data_loader.start()
+        exe.train_from_dataset(main_prog, dataset, debug=debug)
+
     def test_pipeline(self):
-        with fluid.device_guard("cpu"):
-            image = fluid.layers.data(
-                name="image", shape=[3, 224, 224], dtype="float32")
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=[image, label],
-                capacity=64,
-                use_double_buffer=True,
-                iterable=False)
-            fc = build_network(image, layers=50)
-        with fluid.device_guard("gpu:0"):
-            out, prob = fluid.layers.softmax_with_cross_entropy(
-                logits=fc, label=label, return_softmax=True)
-            loss = fluid.layers.mean(out)
-            acc_top1 = fluid.layers.accuracy(input=prob, label=label, k=1)
-            acc_top5 = fluid.layers.accuracy(input=prob, label=label, k=5)
-
-        base_lr = 0.1
-        passes = [30, 60, 80, 90]
-        total_images = 1281167
-        steps_per_pass = total_images // 128
-        bd = [steps_per_pass * p for p in passes]
-        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-        lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
-        optimizer = fluid.optimizer.Momentum(
-            lr_val,
-            momentum=0.9,
-            regularization=fluid.regularizer.L2Decay(1e-4))
-        optimizer = fluid.optimizer.PipelineOptimizer(
-            optimizer, num_microbatches=2)
-        optimizer.minimize(loss)
+        self._run(False)
+        self._run(True)
 
     def test_pipeline_noneoptimizer(self):
         with fluid.device_guard("gpu:0"):
......