Unverified commit f77a78cd, authored by lilong12, committed by GitHub

enable pipeline to run with Executor.run() (#28373)

* update, test=develop
Parent 9f642ed8
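For context, this change lets a pipeline-parallel program be driven by a plain Executor.run() call. The following is a hedged, end-to-end sketch condensed from the tests added later in this diff; it assumes a CUDA build of Paddle, two GPUs, and that the script is launched as two trainer processes (for example with paddle.distributed.launch). All variable names are placeholders, not code from this commit.

import os
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)

with fluid.device_guard("gpu:0"):                      # first pipeline stage
    x = fluid.layers.data(name='x', shape=[32], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='int64')
    loader = fluid.io.DataLoader.from_generator(
        feed_list=[x, y], capacity=64, use_double_buffer=False, iterable=False)
    hidden = fluid.layers.fc(input=x, size=64, act='tanh')
with fluid.device_guard("gpu:1"):                      # second pipeline stage
    prediction = fluid.layers.fc(input=hidden, size=2, act='softmax')
    avg_cost = fluid.layers.mean(
        fluid.layers.cross_entropy(input=prediction, label=y))

strategy = fleet.DistributedStrategy()
strategy.pipeline = True
strategy.pipeline_configs = {"micro_batch": 2}
opt = fleet.distributed_optimizer(
    fluid.optimizer.SGD(learning_rate=0.01), strategy=strategy)
opt.minimize(avg_cost)

place = fluid.CUDAPlace(int(os.getenv("FLAGS_selected_gpus", "0")))
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

def sample_reader():
    for _ in range(8):
        yield [np.random.random(32).astype('float32'),
               np.random.randint(0, 2, (1, )).astype('int64')]

loader.set_sample_list_generator(paddle.batch(sample_reader, batch_size=4), place)
loader.start()
# The point of this commit: Executor.run() now drives the pipeline directly.
loss = exe.run(fluid.default_main_program(), fetch_list=[avg_cost])
# Only the trainer that owns the last stage actually fetches the loss.
loss = loss[0] if loss else None
print(loss)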
...@@ -540,7 +540,7 @@ class HeterBoxWorker : public HogwildWorker { ...@@ -540,7 +540,7 @@ class HeterBoxWorker : public HogwildWorker {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
class SectionWorker : public DeviceWorker { class SectionWorker : public DeviceWorker {
public: public:
SectionWorker() { local_batch_id_ = 0; } SectionWorker() {}
~SectionWorker() override {} ~SectionWorker() override {}
void Initialize(const TrainerDesc& desc) override; void Initialize(const TrainerDesc& desc) override;
...@@ -549,13 +549,12 @@ class SectionWorker : public DeviceWorker { ...@@ -549,13 +549,12 @@ class SectionWorker : public DeviceWorker {
void CreateDeviceResource(const ProgramDesc& main_prog) override{}; void CreateDeviceResource(const ProgramDesc& main_prog) override{};
void TrainFiles() override; void TrainFiles() override;
void TrainFilesWithProfiler() override; void TrainFilesWithProfiler() override{};
void PrintFetchVars() override {} void PrintFetchVars() override {}
const platform::Place& place() const { return place_; } const platform::Place& place() const { return place_; }
void SetSectionIndex(int section_id) { section_id_ = section_id; }
void SetDeviceIndex(int tid) override {} void SetDeviceIndex(int tid) override {}
void SetThreadIndex(int thread_id) { thread_id_ = thread_id; } void SetThreadIndex(int thread_id) { thread_id_ = thread_id; }
void SetMicrobatchNum(int num) { num_microbatches_ = num; } void SetMicrobatchNum(int num) { num_microbatches_ = num; }
...@@ -566,13 +565,8 @@ class SectionWorker : public DeviceWorker { ...@@ -566,13 +565,8 @@ class SectionWorker : public DeviceWorker {
void SetSkipVars(const std::vector<std::string>& skip_vars) { void SetSkipVars(const std::vector<std::string>& skip_vars) {
skip_vars_ = skip_vars; skip_vars_ = skip_vars;
} }
static void ResetBatchId() { batch_id_ = 0; }
static void ResetThreadCompletedFlag() { threads_completed = false; }
static std::atomic<int> cpu_id_;
protected: protected:
void AutoSetCPUAffinity(bool reuse);
int section_id_; int section_id_;
int thread_id_; int thread_id_;
int num_microbatches_; int num_microbatches_;
...@@ -581,12 +575,8 @@ class SectionWorker : public DeviceWorker { ...@@ -581,12 +575,8 @@ class SectionWorker : public DeviceWorker {
const Scope* minibatch_scope_; const Scope* minibatch_scope_;
std::vector<std::unique_ptr<OperatorBase>> ops_; std::vector<std::unique_ptr<OperatorBase>> ops_;
static std::mutex thread_mutex;
static std::condition_variable thread_condition;
static bool threads_completed;
std::shared_ptr<framework::ProgramDesc> program_; std::shared_ptr<framework::ProgramDesc> program_;
static uint64_t batch_id_; static uint64_t batch_id_;
uint64_t local_batch_id_;
platform::DeviceContext* dev_ctx_ = nullptr; platform::DeviceContext* dev_ctx_ = nullptr;
}; };
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
#include <map>
#include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h" #include "paddle/fluid/framework/trainer.h"
...@@ -26,83 +27,25 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ...@@ -26,83 +27,25 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
const auto& section_params = trainer_desc.section_param(); const auto& section_params = trainer_desc.section_param();
num_microbatches_ = section_params.num_microbatches(); num_microbatches_ = section_params.num_microbatches();
VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_; VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_;
section_num_ = section_params.section_config_size();
VLOG(3) << "Number of program sections: " << section_num_;
trainer_desc_ = trainer_desc; trainer_desc_ = trainer_desc;
start_cpu_core_id_ = section_params.start_cpu_core_id();
SetDataset(dataset);
ParseDumpConfig(trainer_desc); ParseDumpConfig(trainer_desc);
// get filelist from trainer_desc here const auto& section_config = section_params.section_config();
const std::vector<paddle::framework::DataFeed*> readers = int place_id = section_config.place_id();
dataset->GetReaders(); place_ = platform::CUDAPlace(place_id);
VLOG(3) << "readers num: " << readers.size(); worker_ = DeviceWorkerFactory::CreateDeviceWorker(
int num_readers = readers.size(); trainer_desc.device_worker_name());
PADDLE_ENFORCE_EQ(num_readers, 1, auto this_worker =
platform::errors::InvalidArgument( std::dynamic_pointer_cast<paddle::framework::SectionWorker>(worker_);
"Number of dataset readers for pipeline " this_worker->SetPlace(place_);
"must be 1 now, but the value you give is %d.", this_worker->Initialize(trainer_desc);
num_readers)); this_worker->SetMicrobatchNum(num_microbatches_);
auto* reader = readers[0];
feed_var_names_ = reader->GetUseSlotAlias();
workers_.resize(section_num_);
for (int i = 0; i < section_num_; ++i) {
const auto& section_config = section_params.section_config(i);
platform::Place place;
int place_id = section_config.place_id();
switch (section_config.place()) {
case SectionConfig::CPUPlace:
place = platform::CPUPlace();
break;
case SectionConfig::CUDAPlace:
// Note that one section has at most one GPU place in one pipeline
PADDLE_ENFORCE_GE(
place_id, 0,
platform::errors::InvalidArgument(
"The place_id value for CUDAPlace shoud be greater "
"than or equal to 0, but the value you give is %d.",
place_id));
place = platform::CUDAPlace(place_id);
break;
case SectionConfig::CUDAPinnedPlace:
place = platform::CUDAPinnedPlace();
break;
default:
PADDLE_ENFORCE_NOT_NULL(nullptr,
platform::errors::InvalidArgument(
"Unkown place type in SectionConfig: %d",
section_config.place()));
}
places_.emplace_back(place);
VLOG(3) << "Device worker place: " << place << ", device id: " << place_id
<< ", section: " << i;
workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
trainer_desc.device_worker_name());
auto this_worker =
std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
workers_[i]);
if (i == 0) {
// we only set reader for the first section
this_worker->SetDataFeed(reader);
this_worker->SetReaderPlace(place);
}
this_worker->SetThreadIndex(i);
this_worker->SetSectionIndex(i);
this_worker->SetPlace(place);
this_worker->Initialize(trainer_desc);
this_worker->SetMicrobatchNum(num_microbatches_);
}
// set debug here
SetDebug(trainer_desc.debug());
} }
void PipelineTrainer::InitOtherEnv(const ProgramDesc& main_program) { void PipelineTrainer::InitOtherEnv(const ProgramDesc& main_program) {
if (need_dump_field_) { if (need_dump_field_) {
InitDumpEnv(); InitDumpEnv();
} }
VLOG(3) << "init other env done.";
} }
std::string PipelineTrainer::GetDumpPath(int tid) { std::string PipelineTrainer::GetDumpPath(int tid) {
...@@ -119,143 +62,87 @@ void PipelineTrainer::InitDumpEnv() { ...@@ -119,143 +62,87 @@ void PipelineTrainer::InitDumpEnv() {
} }
} }
void PipelineTrainer::CopyParameters(int section_id, int microbatch_id, void PipelineTrainer::CopyParameters(int microbatch_id,
const ProgramDesc& program, const ProgramDesc& program,
const platform::Place& place) { const platform::Place& place) {
auto& global_block = program.Block(0); auto& global_block = program.Block(0);
std::map<std::string, int> param_map;
for (auto& var : global_block.AllVars()) { for (auto& var : global_block.AllVars()) {
int is_feed_var = if (var->Persistable()) {
std::count(feed_var_names_.begin(), feed_var_names_.end(), var->Name()); param_map[var->Name()] = 1;
if ((var->Persistable() || is_feed_var) && microbatch_id == 0) {
if (is_feed_var) {
auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
VLOG(3) << "data name: " << var->Name() << ", ptr: " << new_ptr;
InitializeVariable(new_ptr, var->GetType());
} else {
auto* ptr = root_scope_->FindVar(var->Name());
auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
VLOG(3) << "Create persistable var " << var->Name() << " for minibatch "
<< section_id << ", which pointer is " << new_ptr;
InitializeVariable(new_ptr, var->GetType());
const LoDTensor& root_tensor = ptr->Get<LoDTensor>();
LoDTensor* minibatch_tensor = new_ptr->GetMutable<LoDTensor>();
TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
static_cast<Tensor*>(minibatch_tensor));
}
} else if (!var->Persistable() && !is_feed_var) {
auto* ptr =
microbatch_scopes_[section_id][microbatch_id]->Var(var->Name());
VLOG(3) << "Create variable " << var->Name() << " for section "
<< section_id << " microbatch " << microbatch_id
<< ", which pointer is " << ptr;
InitializeVariable(ptr, var->GetType());
} }
} }
}
void PipelineTrainer::GetSkipVars(int section_id, const ProgramDesc& program) { for (auto& var : global_block.AllVars()) {
auto& global_block = program.Block(0); bool is_param_grad = false;
for (auto& op : global_block.AllOps()) { size_t pos = 0;
if (op->Type() != "enqueue") { if ((pos = var->Name().find(kGradVarSuffix)) != std::string::npos) {
continue; auto prefix_name = var->Name().substr(0, pos);
if (param_map.find(prefix_name) != param_map.end()) {
is_param_grad = true;
}
} }
auto input_arg_names = op->InputArgumentNames(); if (var->Persistable() && microbatch_id == 0) {
PADDLE_ENFORCE_EQ(input_arg_names.size(), 1, auto* ptr = root_scope_->Var(var->Name());
platform::errors::InvalidArgument( InitializeVariable(ptr, var->GetType());
"Number of input arguments for enqueue op must be 1, " VLOG(3) << "Create persistable var: " << var->Name()
"but the value is %d.", << ", which pointer is " << ptr;
input_arg_names.size())); } else if (is_param_grad && microbatch_id == 0) {
std::string input_arg_name = input_arg_names[0]; auto* ptr = minibatch_scope_->Var(var->Name());
if (input_arg_name.rfind("@GRAD") != input_arg_name.size() - 5) { InitializeVariable(ptr, var->GetType());
skip_vars_[section_id].emplace_back(input_arg_name); VLOG(3) << "Create grad for persistable var: " << var->Name()
VLOG(3) << "add skip var name: " << input_arg_name; << ", which pointer is " << ptr;
} else if (!var->Persistable() && !is_param_grad) {
auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name());
VLOG(3) << "Create variable " << var->Name() << " for microbatch "
<< microbatch_id << ", which pointer is " << ptr;
InitializeVariable(ptr, var->GetType());
} }
} }
} }
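To make the new scope layout concrete (persistable variables live in the root scope, gradients of persistable variables live in the single minibatch scope, and everything else gets one copy per microbatch), here is a hedged, self-contained Python sketch of the same classification; it is an illustration only, not code from this commit.

import paddle
import paddle.fluid as fluid

paddle.enable_static()

main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.layers.data(name='x', shape=[4], dtype='float32')
    y = fluid.layers.fc(input=x, size=2)
    loss = fluid.layers.reduce_mean(y)
    fluid.optimizer.SGD(learning_rate=0.1).minimize(loss)

block = main_prog.global_block()
persistables = {name for name, var in block.vars.items() if var.persistable}
for name, var in block.vars.items():
    prefix = name[:-len('@GRAD')] if name.endswith('@GRAD') else None
    if var.persistable:
        home = 'root scope (created once, shared by all microbatches)'
    elif prefix in persistables:
        home = 'minibatch scope (gradient of a persistable variable)'
    else:
        home = 'microbatch scope (one copy per microbatch)'
    print(name, '->', home)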
void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program, void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
const platform::Place& place) { const platform::Place& place) {
PADDLE_ENFORCE_NOT_NULL(root_scope_, PADDLE_ENFORCE_NOT_NULL(root_scope_, platform::errors::InvalidArgument(
platform::errors::InvalidArgument( "root_scope_ can not be nullptr"));
"root_scope pointer can not be nullptr")); microbatch_scopes_.resize(num_microbatches_);
auto start_cpu_id = trainer_desc_.section_param().start_cpu_core_id();
SectionWorker::cpu_id_.store(start_cpu_id); VLOG(3) << "Create minibatch and microbatch scopes...";
minibatch_scopes_.resize(section_num_); minibatch_scope_ = &root_scope_->NewScope();
microbatch_scopes_.resize(section_num_); std::shared_ptr<framework::ProgramDesc> program;
skip_vars_.resize(section_num_); program.reset(new ProgramDesc(
trainer_desc_.section_param().section_config().program_desc()));
VLOG(3) << "Init ScopeQueues and create all scopes"; for (int j = 0; j < num_microbatches_; ++j) {
for (int i = 0; i < section_num_; ++i) { microbatch_scopes_[j] = &minibatch_scope_->NewScope();
minibatch_scopes_[i] = &root_scope_->NewScope(); CopyParameters(j, *program, place_);
std::shared_ptr<framework::ProgramDesc> program;
program.reset(new ProgramDesc(
trainer_desc_.section_param().section_config(i).program_desc()));
microbatch_scopes_[i].resize(num_microbatches_);
for (int j = 0; j < num_microbatches_; ++j) {
microbatch_scopes_[i][j] = &minibatch_scopes_[i]->NewScope();
CopyParameters(i, j, *program, places_[i]);
}
GetSkipVars(i, *program);
} }
for (int i = 0; i < section_num_; ++i) { auto this_worker =
auto this_worker = std::dynamic_pointer_cast<paddle::framework::SectionWorker>(worker_);
std::dynamic_pointer_cast<paddle::framework::SectionWorker>( this_worker->SetRootScope(root_scope_);
workers_[i]); this_worker->SetMinibatchScope(minibatch_scope_);
this_worker->SetRootScope(root_scope_); this_worker->SetMicrobatchScopes(microbatch_scopes_);
this_worker->SetMinibatchScope(minibatch_scopes_[i]);
this_worker->SetMicrobatchScopes(microbatch_scopes_[i]);
this_worker->SetSkipVars(skip_vars_[i]);
}
} }
void PipelineTrainer::Run() { void PipelineTrainer::Run() {
VLOG(3) << "Going to run"; VLOG(5) << "Going to run PipelineTrainer::Run()";
for (int i = 0; i < section_num_; ++i) { section_thread_ = std::async(&DeviceWorker::TrainFiles, worker_.get());
if (!debug_) {
section_threads_.push_back(
std::thread(&DeviceWorker::TrainFiles, workers_[i].get()));
} else {
section_threads_.push_back(std::thread(
&DeviceWorker::TrainFilesWithProfiler, workers_[i].get()));
}
}
} }
void PipelineTrainer::Finalize() { void PipelineTrainer::Finalize() {
for (auto& th : section_threads_) { try {
th.join(); section_thread_.get();
} catch (platform::EOFException& e) {
std::rethrow_exception(std::current_exception());
} }
if (need_dump_field_) { if (need_dump_field_) {
FinalizeDumpEnv(); FinalizeDumpEnv();
} }
VLOG(3) << "copying back parameters. ";
for (int i = 0; i < section_num_; ++i) {
std::shared_ptr<framework::ProgramDesc> program;
program.reset(new ProgramDesc(
trainer_desc_.section_param().section_config(i).program_desc()));
for (int j = 0; j < num_microbatches_; ++j) {
auto& global_block = program->Block(0);
for (auto& var : global_block.AllVars()) {
if (var->Persistable()) {
auto* ptr = root_scope_->FindVar(var->Name());
LoDTensor* root_tensor = ptr->GetMutable<LoDTensor>();
auto* minibatch_ptr = minibatch_scopes_[i]->Var(var->Name());
const LoDTensor& minibatch_tensor = minibatch_ptr->Get<LoDTensor>();
TensorCopy(*static_cast<const Tensor*>(&minibatch_tensor), places_[0],
static_cast<Tensor*>(root_tensor));
VLOG(4) << "Copy persitable var " << var->Name() << " to root scope";
}
}
}
}
root_scope_->DropKids(); root_scope_->DropKids();
SectionWorker::ResetBatchId();
SectionWorker::ResetThreadCompletedFlag();
} }
Scope* PipelineTrainer::GetWorkerScope(int thread_id) { Scope* PipelineTrainer::GetWorkerScope(int thread_id) {
return microbatch_scopes_[thread_id][0]; return microbatch_scopes_[0];
} }
} // end namespace framework } // end namespace framework
......
...@@ -290,29 +290,22 @@ class PipelineTrainer : public TrainerBase { ...@@ -290,29 +290,22 @@ class PipelineTrainer : public TrainerBase {
virtual Scope* GetWorkerScope(int thread_id); virtual Scope* GetWorkerScope(int thread_id);
void InitDumpEnv() override; void InitDumpEnv() override;
virtual std::string GetDumpPath(int tid); virtual std::string GetDumpPath(int tid);
void GetSkipVars(int section_id, const ProgramDesc& main_program); void GetSkipVars(const ProgramDesc& main_program);
protected: protected:
int section_num_;
int num_microbatches_; int num_microbatches_;
int start_cpu_core_id_; platform::Place place_;
std::vector<std::string> feed_var_names_; std::vector<std::string> skip_vars_;
std::vector<platform::Place> places_;
std::vector<std::vector<std::string>> skip_vars_;
TrainerDesc trainer_desc_; TrainerDesc trainer_desc_;
std::vector<std::thread> section_threads_; std::future<void> section_thread_;
// worker: [section_id] std::shared_ptr<paddle::framework::DeviceWorker> worker_;
std::vector<std::shared_ptr<paddle::framework::DeviceWorker>> workers_; Scope* minibatch_scope_;
// minibatch_scopes_: [section_id] // microbatch_scopes_: [microbatch_id]
std::vector<Scope*> minibatch_scopes_; std::vector<Scope*> microbatch_scopes_;
// microbatch_scopes_: [section_id][microbatch_id]
std::vector<std::vector<Scope*>> microbatch_scopes_;
void CopyParameters(int section_id, int microbatch_id, void CopyParameters(int microbatch_id, const ProgramDesc& program,
const ProgramDesc& program, const platform::Place& place); const platform::Place& place);
bool isPersistableVarGrad(std::string name);
bool isPersistable(VarDesc* var);
}; };
#endif #endif
......
...@@ -86,7 +86,7 @@ message DownpourWorkerParameter { ...@@ -86,7 +86,7 @@ message DownpourWorkerParameter {
} }
message SectionWorkerParameter { message SectionWorkerParameter {
repeated SectionConfig section_config = 1; optional SectionConfig section_config = 1;
optional int32 queue_size = 2 [ default = 1 ]; optional int32 queue_size = 2 [ default = 1 ];
optional int64 sync_steps = 3 [ default = 1 ]; optional int64 sync_steps = 3 [ default = 1 ];
optional int32 start_cpu_core_id = 4 [ default = 1 ]; optional int32 start_cpu_core_id = 4 [ default = 1 ];
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
from __future__ import print_function from __future__ import print_function
from __future__ import division
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core, unique_name from paddle.fluid import core, unique_name
...@@ -21,9 +22,55 @@ from .meta_optimizer_base import MetaOptimizerBase ...@@ -21,9 +22,55 @@ from .meta_optimizer_base import MetaOptimizerBase
from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op
class PipelineHelper(CollectiveHelper): def _get_node_num(endpoints):
def __init__(self, role_maker, nrings=1, wait_port='6174'): ss = set()
super(PipelineHelper, self).__init__(role_maker, nrings, wait_port) for ep in endpoints:
ip = ep.split(":")[0].strip()
if ip not in ss:
ss.add(ip)
return len(ss)
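A quick sanity check of the helper above, assuming _get_node_num as defined in this diff and made-up endpoints (two workers on each of two nodes):

endpoints = ["192.168.0.1:6170", "192.168.0.1:6171",
             "192.168.0.2:6170", "192.168.0.2:6171"]
assert _get_node_num(endpoints) == 2  # two distinct IPs -> two nodes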
class PipelineHelper(object):
def __init__(self, role_maker, wait_port='6174'):
self.wait_port = wait_port
self.role_maker = role_maker
def update_startup_program(self,
startup_program=None,
inner_parallelism=None):
self.startup_program = startup_program
endpoints = self.role_maker._get_trainer_endpoints()
current_endpoint = endpoints[self.role_maker._worker_index()]
node_num = _get_node_num(endpoints)
assert len(endpoints) % node_num == 0
nranks = self.role_maker._worker_num()
rank = self.role_maker._worker_index()
# Create ring 0 for all gpus in a pipeline
pipeline_endpoints = []
pipeline_rank = rank % inner_parallelism
pipeline_id = rank // inner_parallelism
for idx, ep in enumerate(endpoints):
if idx // inner_parallelism == pipeline_id:
pipeline_endpoints.append(ep)
self._init_communicator(self.startup_program, current_endpoint,
pipeline_endpoints, pipeline_rank, 0,
self.wait_port)
pipeline_num = len(endpoints) // inner_parallelism
if pipeline_num == 1: return
# Create rings for gpus with the same gpu id
eps = []
local_rank = self.role_maker._worker_index() % inner_parallelism
ring_id = local_rank + 1
for i in range(pipeline_num):
eps.append(endpoints[i * inner_parallelism + local_rank])
temp_rank = self.role_maker._worker_index() // inner_parallelism
self._init_communicator(self.startup_program, current_endpoint, eps,
temp_rank, ring_id, self.wait_port)
self._broadcast_params(ring_id)
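As a concrete illustration of the two kinds of rings built above, the following hedged sketch (plain Python, made-up endpoints) prints, for every rank, ring 0 (the GPUs of its own pipeline) and ring local_rank + 1 (the GPUs holding the same stage across pipelines):

endpoints = [
    "10.0.0.1:6170", "10.0.0.1:6171",   # node 0 -> pipeline 0
    "10.0.0.2:6170", "10.0.0.2:6171",   # node 1 -> pipeline 1
]
inner_parallelism = 2                    # GPUs (stages) per pipeline

for rank, ep in enumerate(endpoints):
    pipeline_id = rank // inner_parallelism
    local_rank = rank % inner_parallelism
    ring0 = [e for i, e in enumerate(endpoints)
             if i // inner_parallelism == pipeline_id]
    cross = [endpoints[i * inner_parallelism + local_rank]
             for i in range(len(endpoints) // inner_parallelism)]
    print(ep, '| ring 0:', ring0, '| ring', local_rank + 1, ':', cross)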
def _init_communicator(self, program, current_endpoint, endpoints, rank, def _init_communicator(self, program, current_endpoint, endpoints, rank,
ring_id, wait_port): ring_id, wait_port):
...@@ -46,9 +93,8 @@ class PipelineHelper(CollectiveHelper): ...@@ -46,9 +93,8 @@ class PipelineHelper(CollectiveHelper):
'rank': rank, 'rank': rank,
'endpoint': current_endpoint, 'endpoint': current_endpoint,
'other_endpoints': other_endpoints, 'other_endpoints': other_endpoints,
OP_ROLE_KEY: OpRole.Forward OP_ROLE_KEY: OpRole.Forward,
}) })
block.append_op( block.append_op(
type='c_comm_init', type='c_comm_init',
inputs={'X': nccl_id_var}, inputs={'X': nccl_id_var},
...@@ -58,12 +104,10 @@ class PipelineHelper(CollectiveHelper): ...@@ -58,12 +104,10 @@ class PipelineHelper(CollectiveHelper):
'rank': rank, 'rank': rank,
'ring_id': ring_id, 'ring_id': ring_id,
OP_ROLE_KEY: OpRole.Forward, OP_ROLE_KEY: OpRole.Forward,
'device_id': OpRole.Forward
}) })
def _broadcast_params(self): def _broadcast_params(self, ring_id):
block = self.startup_program.global_block() block = self.startup_program.global_block()
ring_id = 0
for param in block.iter_parameters(): for param in block.iter_parameters():
if param.is_distributed: if param.is_distributed:
continue continue
...@@ -78,13 +122,12 @@ class PipelineHelper(CollectiveHelper): ...@@ -78,13 +122,12 @@ class PipelineHelper(CollectiveHelper):
OP_ROLE_KEY: OpRole.Forward OP_ROLE_KEY: OpRole.Forward
}) })
for ring_id in range(self.nrings): block.append_op(
block.append_op( type='c_sync_comm_stream',
type='c_sync_comm_stream', inputs={'X': param},
inputs={'X': param}, outputs={'Out': param},
outputs={'Out': param}, attrs={'ring_id': ring_id,
attrs={'ring_id': ring_id, OP_ROLE_KEY: OpRole.Forward})
OP_ROLE_KEY: OpRole.Forward})
class PipelineOptimizer(MetaOptimizerBase): class PipelineOptimizer(MetaOptimizerBase):
...@@ -99,8 +142,8 @@ class PipelineOptimizer(MetaOptimizerBase): ...@@ -99,8 +142,8 @@ class PipelineOptimizer(MetaOptimizerBase):
user_defined_strategy): user_defined_strategy):
super(PipelineOptimizer, self)._set_basic_info( super(PipelineOptimizer, self)._set_basic_info(
loss, role_maker, user_defined_optimizer, user_defined_strategy) loss, role_maker, user_defined_optimizer, user_defined_strategy)
num_microbatches = user_defined_strategy.pipeline_configs['micro_batch'] self.num_microbatches = user_defined_strategy.pipeline_configs[
self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches) 'micro_batch']
def _can_apply(self): def _can_apply(self):
if not self.role_maker._is_collective: if not self.role_maker._is_collective:
...@@ -115,29 +158,46 @@ class PipelineOptimizer(MetaOptimizerBase): ...@@ -115,29 +158,46 @@ class PipelineOptimizer(MetaOptimizerBase):
dist_strategy.pipeline_configs = {} dist_strategy.pipeline_configs = {}
def _enable_strategy(self, dist_strategy, context): def _enable_strategy(self, dist_strategy, context):
# we do not support enable pipeline automatically right now dist_strategy.pipeline = True
return dist_strategy.pipeline_configs = {"micro_batch": 1, }
def _get_local_rank(self, current_endpoint, endpoints):
cur_node_endpoints = []
cur_ip = current_endpoint.split(':')[0].strip()
for ep in endpoints:
if cur_ip == ep.split(':')[0].strip():
cur_node_endpoints.append(ep)
return cur_node_endpoints.index(current_endpoint)
def minimize_impl(self, def minimize_impl(self,
loss, loss,
startup_program=None, startup_program=None,
parameter_list=None, parameter_list=None,
no_grad_set=None): no_grad_set=None):
optimize_ops, params_grads, prog_list = \
self.wrapped_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
if self.role_maker._worker_num() == 1:
return optimize_ops, params_grads
endpoints = self.role_maker._get_trainer_endpoints() endpoints = self.role_maker._get_trainer_endpoints()
current_endpoint = endpoints[self.role_maker._worker_index()] current_endpoint = endpoints[self.role_maker._worker_index()]
self.local_rank = self._get_local_rank(current_endpoint, endpoints)
self.wrapped_opt = PO(self.inner_opt,
num_microbatches=self.num_microbatches,
start_cpu_core_id=self.local_rank)
node_num = _get_node_num(endpoints)
gpus_per_node = len(endpoints) // node_num
self.startup_program = startup_program self.startup_program = startup_program
self.local_rank = self._get_local_rank(current_endpoint, endpoints)
if startup_program is None: if startup_program is None:
self.startup_program = fluid.default_startup_program() self.startup_program = fluid.default_startup_program()
loss.block.program._pipeline_opt = dict()
loss.block.program._pipeline_opt['local_rank'] = self.local_rank
optimize_ops, params_grads, prog_list = \
self.wrapped_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
assert prog_list assert prog_list
self.main_program_list = prog_list self.main_program_list = prog_list
self.main_program = loss.block.program self.main_program = loss.block.program
self.inner_parallelism = loss.block.program._pipeline_opt[
'inner_parallelism']
nranks = len(endpoints) nranks = len(endpoints)
self.nranks = nranks self.nranks = nranks
self.nrings = len(self.main_program_list) self.nrings = len(self.main_program_list)
...@@ -146,24 +206,26 @@ class PipelineOptimizer(MetaOptimizerBase): ...@@ -146,24 +206,26 @@ class PipelineOptimizer(MetaOptimizerBase):
self.endpoints = endpoints self.endpoints = endpoints
self.current_endpoint = current_endpoint self.current_endpoint = current_endpoint
pipeline_helper = PipelineHelper(self.role_maker, nrings=self.nrings) pipeline_helper = PipelineHelper(self.role_maker)
pipeline_helper.update_startup_program(self.startup_program) pipeline_helper.update_startup_program(
self.startup_program._pipeline_opt["startup_program"],
self.inner_parallelism)
self._transpile_main_program() self._transpile_main_program(loss, node_num, gpus_per_node)
return optimize_ops, params_grads return optimize_ops, params_grads
def _transpile_main_program(self): def _transpile_main_program(self, loss, node_num, gpus_per_node):
self._insert_loss_grad_ops() self._insert_loss_grad_ops(loss, gpus_per_node, node_num)
for ring_id in range(self.nrings): for ring_id in range(1, gpus_per_node + 1):
self._insert_allreduce_ops(ring_id) self._insert_allreduce_ops(ring_id)
def _insert_loss_grad_ops(self): def _insert_loss_grad_ops(self, loss, gpus_per_node, node_num):
""" """
In order to keep the learning rate consistent in different numbers of In order to keep the learning rate consistent in different numbers of
training workers, we scale the loss grad by the number of workers training workers, we scale the loss grad by the number of workers
""" """
block = self.main_program_list[self.nrings - 1]['program'].global_block( block = self.main_program_list[gpus_per_node - 1][
) 'program'].global_block()
for idx, op in reversed(list(enumerate(block.ops))): for idx, op in reversed(list(enumerate(block.ops))):
if is_loss_grad_op(op): if is_loss_grad_op(op):
loss_grad_var = block.vars[op.output_arg_names[0]] loss_grad_var = block.vars[op.output_arg_names[0]]
...@@ -173,12 +235,12 @@ class PipelineOptimizer(MetaOptimizerBase): ...@@ -173,12 +235,12 @@ class PipelineOptimizer(MetaOptimizerBase):
inputs={'X': loss_grad_var}, inputs={'X': loss_grad_var},
outputs={'Out': loss_grad_var}, outputs={'Out': loss_grad_var},
attrs={ attrs={
'scale': 1.0 / self.nranks, 'scale': 1.0 / node_num,
OP_ROLE_KEY: OpRole.Backward OP_ROLE_KEY: OpRole.Backward
}) })
def _insert_allreduce_ops(self, ring_id): def _insert_allreduce_ops(self, ring_id):
block = self.main_program_list[ring_id]['program'].global_block() block = self.main_program_list[ring_id - 1]['program'].global_block()
origin_block = self.main_program.global_block() origin_block = self.main_program.global_block()
grad = None grad = None
for idx, op in reversed(list(enumerate(block.ops))): for idx, op in reversed(list(enumerate(block.ops))):
......
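A hedged arithmetic check of the scaling used in _insert_loss_grad_ops above, under the assumption (implied by the 1.0 / node_num factor) of one pipeline per node: each pipeline scales its loss gradient by 1/node_num, so the sum produced by the c_allreduce_sum ops over the per-stage ring is the mean of the per-pipeline gradients.

node_num = 2
per_pipeline_grad = [0.3, 0.5]                      # made-up gradients, one per pipeline
scaled = [g / node_num for g in per_pipeline_grad]  # effect of scaling the loss gradient
allreduced = sum(scaled)                            # effect of c_allreduce_sum
assert abs(allreduced - sum(per_pipeline_grad) / node_num) < 1e-12
print(allreduced)  # 0.4, the mean of the per-pipeline gradients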
...@@ -413,25 +413,17 @@ class Section(DeviceWorker): ...@@ -413,25 +413,17 @@ class Section(DeviceWorker):
section_param = trainer_desc.section_param section_param = trainer_desc.section_param
section_param.num_microbatches = pipeline_opt["num_microbatches"] section_param.num_microbatches = pipeline_opt["num_microbatches"]
section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"] section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"]
for i, program in enumerate(pipeline_opt["section_program_list"]): cfg = section_param.section_config
cfg = section_param.section_config.add() program = pipeline_opt["section_program"]
cfg.program_desc.ParseFromString(program["program"]._get_desc() cfg.program_desc.ParseFromString(program["program"]._get_desc()
.serialize_to_string()) .serialize_to_string())
# TODO: why does not work # TODO: why does not work
# cfg.program_desc.CopyFrom(program.program._get_desc()) # cfg.program_desc.CopyFrom(program.program._get_desc())
place = pipeline_opt["place_list"][i] place = pipeline_opt["place"]
place_id = pipeline_opt["place_id_list"][i] place_id = pipeline_opt["place_id"]
if isinstance(place, core.CPUPlace): assert isinstance(place, core.CUDAPlace)
cfg.place = cfg.CPUPlace cfg.place = cfg.CUDAPlace
elif isinstance(place, core.CUDAPlace): cfg.place_id = place_id
cfg.place = cfg.CUDAPlace
elif isinstance(place, core.CUDAPinnedPlace):
cfg.place = cfg.CUDAPinnedPlace
else:
raise NotImplementedError(
"SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
)
cfg.place_id = place_id
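For orientation, the _pipeline_opt dictionary consumed here (and by Executor.run later in this diff) roughly carries the keys below. Only the key names are taken from this commit; the values are placeholders, and CUDAPlace assumes a CUDA build of Paddle.

import paddle
import paddle.fluid as fluid

paddle.enable_static()
section_program = fluid.Program()   # placeholder for the per-rank section program
startup_program = fluid.Program()   # placeholder for the rewritten startup program

pipeline_opt = {
    "num_microbatches": 2,
    "start_cpu_core_id": 0,
    "place": fluid.CUDAPlace(0),
    "place_id": 0,
    "section_program": {"program": section_program},
    "startup_program": startup_program,   # read back by Executor.run()
    "local_rank": 0,                      # set by the fleet pipeline meta optimizer
    "inner_parallelism": 2,               # number of stages per pipeline
}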
class DeviceWorkerFactory(object): class DeviceWorkerFactory(object):
......
...@@ -561,6 +561,7 @@ class Executor(object): ...@@ -561,6 +561,7 @@ class Executor(object):
self._default_executor = core.Executor(p) self._default_executor = core.Executor(p)
self._closed = False self._closed = False
self.pruned_program_scope_caches = dict() self.pruned_program_scope_caches = dict()
self._prepare_to_run_called = False
self._auto_checkpoint_name = unique_name.generate( self._auto_checkpoint_name = unique_name.generate(
"__auto_checkpoint_executor__") "__auto_checkpoint_executor__")
...@@ -1115,6 +1116,24 @@ class Executor(object): ...@@ -1115,6 +1116,24 @@ class Executor(object):
use_default_main_program = program is None use_default_main_program = program is None
if program is None: if program is None:
program = default_main_program() program = default_main_program()
if fetch_list is not None:
if isinstance(fetch_list, Variable) or isinstance(
fetch_list, str) or isinstance(fetch_list,
six.string_types):
fetch_list = [fetch_list]
assert isinstance(fetch_list, tuple) or isinstance(fetch_list, list), \
"Currently , The fetch_list type only should be list or tuple, \n"\
"but the input type is {}. For more information please refer to \n"\
"the executor.run(...).".format(type(fetch_list))
else:
fetch_list = []
if isinstance(program, Program) and program._pipeline_opt:
if "startup_program" in program._pipeline_opt:
program = program._pipeline_opt["startup_program"]
else:
return self.train_from_dataset(program, fetch_list=fetch_list)
if isinstance(program, Program) and \ if isinstance(program, Program) and \
len(program.global_block().ops) == 0: len(program.global_block().ops) == 0:
if use_default_main_program: if use_default_main_program:
...@@ -1131,18 +1150,6 @@ class Executor(object): ...@@ -1131,18 +1150,6 @@ class Executor(object):
if scope is None: if scope is None:
scope = global_scope() scope = global_scope()
if fetch_list is not None:
if isinstance(fetch_list, Variable) or isinstance(
fetch_list, str) or isinstance(fetch_list,
six.string_types):
fetch_list = [fetch_list]
assert isinstance(fetch_list, tuple) or isinstance(fetch_list, list), \
"Currently , The fetch_list type only should be list or tuple, \n"\
"but the input type is {}. For more information please refer to \n"\
"the executor.run(...).".format(type(fetch_list))
else:
fetch_list = []
# use_prune can be overrided by putting optimize_ops in fetch_list # use_prune can be overrided by putting optimize_ops in fetch_list
_origin_fetch_list = fetch_list _origin_fetch_list = fetch_list
_origin_program = program _origin_program = program
...@@ -1449,6 +1456,25 @@ class Executor(object): ...@@ -1449,6 +1456,25 @@ class Executor(object):
raise RuntimeError("dataset is need and should be initialized") raise RuntimeError("dataset is need and should be initialized")
dataset._prepare_to_run() dataset._prepare_to_run()
real_fetch_list = []
if program._pipeline_opt:
real_program = program._pipeline_opt["section_program"]['program']
for fetch_var in fetch_list:
if isinstance(fetch_var, Variable):
fetch_var_name = fetch_var.name
else:
fetch_var_name = fetch_var
if fetch_var_name in real_program.global_block().vars:
real_fetch_list.append(fetch_var)
program._pipeline_opt["section_program"][
'program'] = self._add_feed_fetch_ops(
program=program._pipeline_opt["section_program"]['program'],
feed=[],
fetch_list=real_fetch_list,
feed_var_name='feed',
fetch_var_name='fetch')
fetch_list = None
scope, trainer = self._prepare_trainer( scope, trainer = self._prepare_trainer(
program=program, program=program,
...@@ -1483,6 +1509,10 @@ class Executor(object): ...@@ -1483,6 +1509,10 @@ class Executor(object):
dataset._dynamic_adjust_after_train() dataset._dynamic_adjust_after_train()
dataset._finish_to_run() dataset._finish_to_run()
if real_fetch_list:
arr = scope.find_var('fetch').get_fetch_list()
tensors = arr._move_to_list()
return as_numpy(tensors)
return None return None
......
This diff has been collapsed.
...@@ -10,10 +10,12 @@ if(NOT WITH_NCCL) ...@@ -10,10 +10,12 @@ if(NOT WITH_NCCL)
endif() endif()
string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}") string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
list(APPEND DIST_TEST_OPS test_pipeline)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext) list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer) list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer)
list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
list(APPEND DIST_TEST_OPS test_listen_and_serv_op) list(APPEND DIST_TEST_OPS test_listen_and_serv_op)
list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer)
set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
...@@ -146,7 +148,6 @@ if (WITH_NCCL) ...@@ -146,7 +148,6 @@ if (WITH_NCCL)
endif() endif()
if(NOT WITH_GPU OR WIN32) if(NOT WITH_GPU OR WIN32)
LIST(REMOVE_ITEM TEST_OPS test_pipeline)
LIST(REMOVE_ITEM TEST_OPS test_boxps) LIST(REMOVE_ITEM TEST_OPS test_boxps)
endif() endif()
list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
...@@ -469,7 +470,6 @@ if(WITH_DISTRIBUTE) ...@@ -469,7 +470,6 @@ if(WITH_DISTRIBUTE)
py_test_modules(test_fleet_sharding_meta_optimizer MODULES test_fleet_sharding_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_sharding_meta_optimizer MODULES test_fleet_sharding_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS}) py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS}) py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
py_test_modules(test_fleet_distributed_strategy MODULES test_fleet_distributed_strategy) py_test_modules(test_fleet_distributed_strategy MODULES test_fleet_distributed_strategy)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
from functools import reduce
from test_dist_base import TestDistRunnerBase, runtime_main
import paddle.distributed.fleet as fleet
paddle.enable_static()
DTYPE = "float32"
paddle.dataset.mnist.fetch()
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.01)))
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.01)))
SIZE = 10
input_shape = conv_pool_2.shape
param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
predict = fluid.layers.fc(
input=conv_pool_2,
size=SIZE,
act="softmax",
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)))
return predict
class TestDistMnist2x2(TestDistRunnerBase):
def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
# Input data
with fluid.device_guard("gpu:0"):
images = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if dist_strategy:
data_loader = fluid.io.DataLoader.from_generator(
feed_list=[images, label],
capacity=64,
use_double_buffer=False,
iterable=False)
# Train program
predict = cnn_model(images)
with fluid.device_guard("gpu:1"):
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
with fluid.device_guard("gpu:1"):
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
base_lr = self.lr
passes = [30, 60, 80, 90]
steps_per_pass = 10
bd = [steps_per_pass * p for p in passes]
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9)
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
if dist_strategy:
fleet.init(is_collective=True)
strategy = fleet.DistributedStrategy()
strategy.pipeline = True
dist_opt = fleet.distributed_optimizer(
optimizer=opt, strategy=strategy)
dist_opt.minimize(avg_cost)
else:
opt.minimize(avg_cost)
if dist_strategy:
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader
else:
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
if __name__ == "__main__":
runtime_main(TestDistMnist2x2)
...@@ -124,6 +124,67 @@ class TestDistRunnerBase(object): ...@@ -124,6 +124,67 @@ class TestDistRunnerBase(object):
exe.run(pserver_prog) exe.run(pserver_prog)
print_to_err(type(self).__name__, "run pserver main program done.") print_to_err(type(self).__name__, "run pserver main program done.")
def run_pipeline_trainer(self, args):
self.lr = args.lr
dist_strategy = DistributedStrategy()
test_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader = \
self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)
device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
eprint(type(self).__name__, "device_id: %d." % device_id)
place = fluid.CUDAPlace(device_id)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
eprint(type(self).__name__, "run worker startup program done.")
data_loader.set_sample_list_generator(train_reader, place)
data_loader.start()
print_to_err(type(self).__name__, "begin to train on trainer")
out_losses = []
for i in six.moves.xrange(RUN_STEP):
loss = exe.run(fluid.default_main_program(), fetch_list=[avg_cost])
loss = loss[0] if loss else None
out_losses.append(loss)
print_to_err(type(self).__name__, "run step %d finished" % i)
print_to_err(type(self).__name__, "trainer run finished")
if six.PY2:
print(pickle.dumps(out_losses))
else:
sys.stdout.buffer.write(pickle.dumps(out_losses))
if args.save_model:
model_save_dir = "/tmp"
if fleet.worker_index() == 0:
model_save_dir_fluid = os.path.join(model_save_dir,
"fluid_persistables")
model_save_dir_fleet = os.path.join(model_save_dir,
"fleet_persistables")
infer_save_dir_fluid = os.path.join(model_save_dir,
"fluid_infer")
infer_save_dir_fleet = os.path.join(model_save_dir,
"fleet_infer")
else:
model_save_dir_fluid = os.path.join(model_save_dir,
"fluid_persistables_2")
model_save_dir_fleet = os.path.join(model_save_dir,
"fleet_persistables_2")
infer_save_dir_fluid = os.path.join(model_save_dir,
"fluid_infer_2")
infer_save_dir_fleet = os.path.join(model_save_dir,
"fleet_infer_2")
fluid.io.save_persistables(exe, model_save_dir_fluid,
fleet._origin_program)
fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet)
feeded_var_names = [var.name for var in feed_var_list]
fluid.io.save_inference_model(infer_save_dir_fluid,
feeded_var_names, [avg_cost], exe,
fleet._origin_program)
fleet.save_inference_model(exe, infer_save_dir_fleet,
feeded_var_names, [avg_cost])
def run_gpu_fleet_api_trainer(self, args): def run_gpu_fleet_api_trainer(self, args):
assert args.update_method == "nccl2" assert args.update_method == "nccl2"
...@@ -532,6 +593,7 @@ def runtime_main(test_class): ...@@ -532,6 +593,7 @@ def runtime_main(test_class):
parser.add_argument('--nccl_comm_num', type=int, required=False, default=1) parser.add_argument('--nccl_comm_num', type=int, required=False, default=1)
parser.add_argument('--enable_backward_deps', action='store_true') parser.add_argument('--enable_backward_deps', action='store_true')
parser.add_argument('--use_hallreduce', action='store_true') parser.add_argument('--use_hallreduce', action='store_true')
parser.add_argument('--use_pipeline', action='store_true')
parser.add_argument('--gpu_fleet_api', action='store_true') parser.add_argument('--gpu_fleet_api', action='store_true')
parser.add_argument('--use_local_sgd', action='store_true') parser.add_argument('--use_local_sgd', action='store_true')
parser.add_argument('--ut4grad_allreduce', action='store_true') parser.add_argument('--ut4grad_allreduce', action='store_true')
...@@ -566,6 +628,8 @@ def runtime_main(test_class): ...@@ -566,6 +628,8 @@ def runtime_main(test_class):
model.run_pserver(args) model.run_pserver(args)
elif args.gpu_fleet_api: elif args.gpu_fleet_api:
model.run_gpu_fleet_api_trainer(args) model.run_gpu_fleet_api_trainer(args)
elif args.use_pipeline:
model.run_pipeline_trainer(args)
else: else:
model.run_trainer(args) model.run_trainer(args)
...@@ -607,6 +671,7 @@ class TestDistBase(unittest.TestCase): ...@@ -607,6 +671,7 @@ class TestDistBase(unittest.TestCase):
self._dc_asgd = False # must use with async mode self._dc_asgd = False # must use with async mode
self._use_reader_alloc = True self._use_reader_alloc = True
self._nccl2_mode = False self._nccl2_mode = False
self._pipeline_mode = False
self._mp_mode = False self._mp_mode = False
# FIXME(typhoonzero): I added this stupid argument to enable # FIXME(typhoonzero): I added this stupid argument to enable
# testing allreduce layers, which users can call layers.allreduce # testing allreduce layers, which users can call layers.allreduce
...@@ -892,6 +957,8 @@ class TestDistBase(unittest.TestCase): ...@@ -892,6 +957,8 @@ class TestDistBase(unittest.TestCase):
if self._use_dgc: if self._use_dgc:
tr_cmd += " --use_dgc" tr_cmd += " --use_dgc"
if self._pipeline_mode:
tr_cmd += " --use_pipeline"
if self._mp_mode: if self._mp_mode:
env = {"FLAGS_selected_gpus": "{}".format(trainer_id % 2)} env = {"FLAGS_selected_gpus": "{}".format(trainer_id % 2)}
...@@ -978,6 +1045,51 @@ class TestDistBase(unittest.TestCase): ...@@ -978,6 +1045,51 @@ class TestDistBase(unittest.TestCase):
print("outs[1]:", outs[1]) print("outs[1]:", outs[1])
return pickle.loads(outs[0]), pickle.loads(outs[1]) return pickle.loads(outs[0]), pickle.loads(outs[1])
def _run_pipeline(self, model, envs, check_error_log, log_name):
# NOTE: we reuse ps_endpoints as nccl2 worker endpoints
worker_endpoints = self._ps_endpoints.split(",")
update_method = "nccl2"
trainer_num = len(worker_endpoints)
procs = []
pipes = []
for i in range(0, trainer_num):
tr_cmd, tr_env = self._get_nccl2_trainer_cmd(
model, worker_endpoints[i], update_method, i, trainer_num)
tr_env.update(envs)
tr_env['CUDA_VISIBLE_DEVICES'] = "0,1"
tr_env['NCCL_SHM_DISABLE'] = '1'
tr_env['FLAGS_selected_gpus'] = str(i)
tr_env['FLAGS_cudnn_deterministic'] = '0'
print("tr_cmd:{}, env: {}".format(tr_cmd, tr_env))
tr_pipe = open("/tmp/" + "tr{}_err.log".format(i), "wb")
print_to_err(
type(self).__name__,
"going to start process {} with nccl2".format(i))
tr_proc = subprocess.Popen(
tr_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=tr_pipe,
env=tr_env)
procs.append(tr_proc)
pipes.append(tr_pipe)
outs = []
for i in range(0, trainer_num):
tr_out, tr_err = procs[i].communicate()
outs.append(tr_out)
pipes[i].close()
sys.stderr.write('trainer {} stderr: {}\n'.format(i, tr_err))
if check_error_log:
print("outs[0]:", outs[0])
print("outs[1]:", outs[1])
return pickle.loads(outs[0]), pickle.loads(outs[1])
def _get_required_envs(self, check_error_log=False, need_envs={}): def _get_required_envs(self, check_error_log=False, need_envs={}):
# TODO(typhoonzero): should auto adapt GPU count on the machine. # TODO(typhoonzero): should auto adapt GPU count on the machine.
required_envs = { required_envs = {
...@@ -1032,6 +1144,9 @@ class TestDistBase(unittest.TestCase): ...@@ -1032,6 +1144,9 @@ class TestDistBase(unittest.TestCase):
False, False,
check_error_log, check_error_log,
log_name=log_name) log_name=log_name)
elif self._pipeline_mode:
tr0_losses, tr1_losses = self._run_pipeline(
model_file, required_envs, check_error_log, log_name=log_name)
else: else:
tr0_losses, tr1_losses = self._run_cluster( tr0_losses, tr1_losses = self._run_cluster(
model_file, required_envs, check_error_log, log_name=log_name) model_file, required_envs, check_error_log, log_name=log_name)
...@@ -1040,7 +1155,10 @@ class TestDistBase(unittest.TestCase): ...@@ -1040,7 +1155,10 @@ class TestDistBase(unittest.TestCase):
local_loss = local_losses[step_id] local_loss = local_losses[step_id]
tr0_loss = tr0_losses[step_id] tr0_loss = tr0_losses[step_id]
tr1_loss = tr1_losses[step_id] tr1_loss = tr1_losses[step_id]
dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2 if self._pipeline_mode:
dist_loss = np.array([tr1_loss])
else:
dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2
print("=======", local_loss, ":", dist_loss[0], "=======") print("=======", local_loss, ":", dist_loss[0], "=======")
self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta) self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta)
......
...@@ -16,6 +16,8 @@ import unittest ...@@ -16,6 +16,8 @@ import unittest
import paddle import paddle
import os import os
paddle.enable_static()
class TestFleetMetaOptimizer(unittest.TestCase): class TestFleetMetaOptimizer(unittest.TestCase):
def setUp(self): def setUp(self):
...@@ -28,19 +30,14 @@ class TestFleetMetaOptimizer(unittest.TestCase): ...@@ -28,19 +30,14 @@ class TestFleetMetaOptimizer(unittest.TestCase):
import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker
role = role_maker.PaddleCloudRoleMaker(is_collective=True) role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role) fleet.init(role)
with paddle.fluid.device_guard("cpu"): with paddle.fluid.device_guard("gpu:0"):
input_x = paddle.fluid.layers.data( input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32') name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data( input_y = paddle.fluid.layers.data(
name="y", shape=[1], dtype='int64') name="y", shape=[1], dtype='int64')
data_loader = paddle.fluid.io.DataLoader.from_generator(
feed_list=[input_x, input_y],
capacity=64,
use_double_buffer=True,
iterable=False)
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
with paddle.fluid.device_guard("gpu:0"): with paddle.fluid.device_guard("gpu:1"):
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], prediction = paddle.fluid.layers.fc(input=[fc_2],
size=2, size=2,
......
...@@ -13,212 +13,32 @@ ...@@ -13,212 +13,32 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import numpy as np
import os
import shutil
import unittest import unittest
import math from test_dist_base import TestDistBase
def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(
input=conv,
act=act, )
def shortcut(input, ch_out, stride, is_first):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1 or is_first == True:
return conv_bn_layer(input, ch_out, 1, stride)
else:
return input
def bottleneck_block(input, num_filters, stride):
conv0 = conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu')
conv2 = conv_bn_layer(
input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)
short = shortcut(input, num_filters * 4, stride, is_first=False)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def basic_block(input, num_filters, stride, is_first):
conv0 = conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
stride=stride)
conv1 = conv_bn_layer(
input=conv0, num_filters=num_filters, filter_size=3, act=None)
short = shortcut(input, num_filters, stride, is_first)
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
def build_network(input, layers=50, class_dim=1000): import os
supported_layers = [18, 34, 50, 101, 152] import paddle
assert layers in supported_layers
depth = None
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
with fluid.device_guard("cpu"):
conv = conv_bn_layer(
input=input, num_filters=64, filter_size=7, stride=2, act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
if layers >= 50:
for block in range(len(depth)):
with fluid.device_guard("gpu:0"):
for i in range(depth[block]):
conv = bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1)
with fluid.device_guard("gpu:0"):
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
else:
for block in range(len(depth)):
with fluid.device_guard("gpu:0"):
for i in range(depth[block]):
conv = basic_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
is_first=block == i == 0)
with fluid.device_guard("gpu:0"):
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
return out
class TestPipeline(unittest.TestCase):
""" TestCases for Pipeline Training. """
def _run(self, debug):
main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
with fluid.device_guard("cpu"):
image = fluid.layers.data(
name="image", shape=[3, 224, 224], dtype="float32")
label = fluid.layers.data(
name="label", shape=[1], dtype="int64")
data_loader = fluid.io.DataLoader.from_generator(
feed_list=[image, label],
capacity=64,
use_double_buffer=True,
iterable=False)
fc = build_network(image, layers=50)
with fluid.device_guard("gpu:0"):
out, prob = fluid.layers.softmax_with_cross_entropy(
logits=fc, label=label, return_softmax=True)
loss = fluid.layers.mean(out)
acc_top1 = fluid.layers.accuracy(input=prob, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=prob, label=label, k=5)
base_lr = 0.1
passes = [30, 60, 80, 90]
total_images = 1281167
steps_per_pass = total_images // 128
bd = [steps_per_pass * p for p in passes]
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
optimizer = fluid.optimizer.MomentumOptimizer(
lr_val,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer = fluid.optimizer.PipelineOptimizer(
optimizer, num_microbatches=2)
optimizer.minimize(loss)
def train_reader():
for _ in range(4):
img = np.random.random(size=[3, 224, 224]).astype('float32')
label = np.random.random(size=[1]).astype('int64')
yield img, label
data_loader.set_sample_generator(train_reader, batch_size=1)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
data_loader.start()
exe.train_from_dataset(main_prog, debug=debug)
def test_pipeline(self):
self._run(False)
self._run(True)
def test_pipeline_noneoptimizer(self):
with fluid.device_guard("gpu:0"):
x = fluid.layers.data(
name='x', shape=[1], dtype='int64', lod_level=0)
y = fluid.layers.data(
name='y', shape=[1], dtype='int64', lod_level=0)
emb_x = layers.embedding(
input=x,
param_attr=fluid.ParamAttr(name="embx"),
size=[10, 2],
is_sparse=False)
fc = layers.fc(input=emb_x,
name="fc",
size=1,
num_flatten_dims=1,
bias_attr=False)
loss = layers.reduce_mean(fc)
optimizer = fluid.optimizer.SGD(learning_rate=0.5) paddle.enable_static()
with self.assertRaises(ValueError): flag_name = os.path.splitext(__file__)[0]
optimizer = fluid.optimizer.PipelineOptimizer(
dict(), num_microbatches=2)
class TestPipeline(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_reduce = False
self._use_reader_alloc = False
self._pipeline_mode = True
self._nccl_comm_num = 1
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place(
"pipeline_mnist.py",
delta=1e-5,
check_error_log=True,
log_name=flag_name)
if __name__ == '__main__': if __name__ == '__main__':
......