未验证 提交 f77a78cd 编写于 作者: L lilong12 提交者: GitHub

enable pipeline to run with Executor.run() (#28373)

* update, test=develop
上级 9f642ed8
......@@ -540,7 +540,7 @@ class HeterBoxWorker : public HogwildWorker {
#if defined(PADDLE_WITH_NCCL)
class SectionWorker : public DeviceWorker {
public:
SectionWorker() { local_batch_id_ = 0; }
SectionWorker() {}
~SectionWorker() override {}
void Initialize(const TrainerDesc& desc) override;
......@@ -549,13 +549,12 @@ class SectionWorker : public DeviceWorker {
void CreateDeviceResource(const ProgramDesc& main_prog) override{};
void TrainFiles() override;
void TrainFilesWithProfiler() override;
void TrainFilesWithProfiler() override{};
void PrintFetchVars() override {}
const platform::Place& place() const { return place_; }
void SetSectionIndex(int section_id) { section_id_ = section_id; }
void SetDeviceIndex(int tid) override {}
void SetThreadIndex(int thread_id) { thread_id_ = thread_id; }
void SetMicrobatchNum(int num) { num_microbatches_ = num; }
......@@ -566,13 +565,8 @@ class SectionWorker : public DeviceWorker {
void SetSkipVars(const std::vector<std::string>& skip_vars) {
skip_vars_ = skip_vars;
}
static void ResetBatchId() { batch_id_ = 0; }
static void ResetThreadCompletedFlag() { threads_completed = false; }
static std::atomic<int> cpu_id_;
protected:
void AutoSetCPUAffinity(bool reuse);
int section_id_;
int thread_id_;
int num_microbatches_;
......@@ -581,12 +575,8 @@ class SectionWorker : public DeviceWorker {
const Scope* minibatch_scope_;
std::vector<std::unique_ptr<OperatorBase>> ops_;
static std::mutex thread_mutex;
static std::condition_variable thread_condition;
static bool threads_completed;
std::shared_ptr<framework::ProgramDesc> program_;
static uint64_t batch_id_;
uint64_t local_batch_id_;
platform::DeviceContext* dev_ctx_ = nullptr;
};
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#if defined(PADDLE_WITH_NCCL)
#include <map>
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h"
......@@ -26,83 +27,25 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
const auto& section_params = trainer_desc.section_param();
num_microbatches_ = section_params.num_microbatches();
VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_;
section_num_ = section_params.section_config_size();
VLOG(3) << "Number of program sections: " << section_num_;
trainer_desc_ = trainer_desc;
start_cpu_core_id_ = section_params.start_cpu_core_id();
SetDataset(dataset);
ParseDumpConfig(trainer_desc);
// get filelist from trainer_desc here
const std::vector<paddle::framework::DataFeed*> readers =
dataset->GetReaders();
VLOG(3) << "readers num: " << readers.size();
int num_readers = readers.size();
PADDLE_ENFORCE_EQ(num_readers, 1,
platform::errors::InvalidArgument(
"Number of dataset readers for pipeline "
"must be 1 now, but the value you give is %d.",
num_readers));
auto* reader = readers[0];
feed_var_names_ = reader->GetUseSlotAlias();
workers_.resize(section_num_);
for (int i = 0; i < section_num_; ++i) {
const auto& section_config = section_params.section_config(i);
platform::Place place;
int place_id = section_config.place_id();
switch (section_config.place()) {
case SectionConfig::CPUPlace:
place = platform::CPUPlace();
break;
case SectionConfig::CUDAPlace:
// Note that one section has at most one GPU place in one pipeline
PADDLE_ENFORCE_GE(
place_id, 0,
platform::errors::InvalidArgument(
"The place_id value for CUDAPlace shoud be greater "
"than or equal to 0, but the value you give is %d.",
place_id));
place = platform::CUDAPlace(place_id);
break;
case SectionConfig::CUDAPinnedPlace:
place = platform::CUDAPinnedPlace();
break;
default:
PADDLE_ENFORCE_NOT_NULL(nullptr,
platform::errors::InvalidArgument(
"Unkown place type in SectionConfig: %d",
section_config.place()));
}
places_.emplace_back(place);
VLOG(3) << "Device worker place: " << place << ", device id: " << place_id
<< ", section: " << i;
workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
trainer_desc.device_worker_name());
auto this_worker =
std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
workers_[i]);
if (i == 0) {
// we only set reader for the first section
this_worker->SetDataFeed(reader);
this_worker->SetReaderPlace(place);
}
this_worker->SetThreadIndex(i);
this_worker->SetSectionIndex(i);
this_worker->SetPlace(place);
this_worker->Initialize(trainer_desc);
this_worker->SetMicrobatchNum(num_microbatches_);
}
// set debug here
SetDebug(trainer_desc.debug());
const auto& section_config = section_params.section_config();
int place_id = section_config.place_id();
place_ = platform::CUDAPlace(place_id);
worker_ = DeviceWorkerFactory::CreateDeviceWorker(
trainer_desc.device_worker_name());
auto this_worker =
std::dynamic_pointer_cast<paddle::framework::SectionWorker>(worker_);
this_worker->SetPlace(place_);
this_worker->Initialize(trainer_desc);
this_worker->SetMicrobatchNum(num_microbatches_);
}
void PipelineTrainer::InitOtherEnv(const ProgramDesc& main_program) {
if (need_dump_field_) {
InitDumpEnv();
}
VLOG(3) << "init other env done.";
}
std::string PipelineTrainer::GetDumpPath(int tid) {
......@@ -119,143 +62,87 @@ void PipelineTrainer::InitDumpEnv() {
}
}
void PipelineTrainer::CopyParameters(int section_id, int microbatch_id,
void PipelineTrainer::CopyParameters(int microbatch_id,
const ProgramDesc& program,
const platform::Place& place) {
auto& global_block = program.Block(0);
std::map<std::string, int> param_map;
for (auto& var : global_block.AllVars()) {
int is_feed_var =
std::count(feed_var_names_.begin(), feed_var_names_.end(), var->Name());
if ((var->Persistable() || is_feed_var) && microbatch_id == 0) {
if (is_feed_var) {
auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
VLOG(3) << "data name: " << var->Name() << ", ptr: " << new_ptr;
InitializeVariable(new_ptr, var->GetType());
} else {
auto* ptr = root_scope_->FindVar(var->Name());
auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
VLOG(3) << "Create persistable var " << var->Name() << " for minibatch "
<< section_id << ", which pointer is " << new_ptr;
InitializeVariable(new_ptr, var->GetType());
const LoDTensor& root_tensor = ptr->Get<LoDTensor>();
LoDTensor* minibatch_tensor = new_ptr->GetMutable<LoDTensor>();
TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
static_cast<Tensor*>(minibatch_tensor));
}
} else if (!var->Persistable() && !is_feed_var) {
auto* ptr =
microbatch_scopes_[section_id][microbatch_id]->Var(var->Name());
VLOG(3) << "Create variable " << var->Name() << " for section "
<< section_id << " microbatch " << microbatch_id
<< ", which pointer is " << ptr;
InitializeVariable(ptr, var->GetType());
if (var->Persistable()) {
param_map[var->Name()] = 1;
}
}
}
void PipelineTrainer::GetSkipVars(int section_id, const ProgramDesc& program) {
auto& global_block = program.Block(0);
for (auto& op : global_block.AllOps()) {
if (op->Type() != "enqueue") {
continue;
for (auto& var : global_block.AllVars()) {
bool is_param_grad = false;
size_t pos = 0;
if ((pos = var->Name().find(kGradVarSuffix)) != std::string::npos) {
auto prefix_name = var->Name().substr(0, pos);
if (param_map.find(prefix_name) != param_map.end()) {
is_param_grad = true;
}
}
auto input_arg_names = op->InputArgumentNames();
PADDLE_ENFORCE_EQ(input_arg_names.size(), 1,
platform::errors::InvalidArgument(
"Number of input arguments for enqueue op must be 1, "
"but the value is %d.",
input_arg_names.size()));
std::string input_arg_name = input_arg_names[0];
if (input_arg_name.rfind("@GRAD") != input_arg_name.size() - 5) {
skip_vars_[section_id].emplace_back(input_arg_name);
VLOG(3) << "add skip var name: " << input_arg_name;
if (var->Persistable() && microbatch_id == 0) {
auto* ptr = root_scope_->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create persistable var: " << var->Name()
<< ", which pointer is " << ptr;
} else if (is_param_grad && microbatch_id == 0) {
auto* ptr = minibatch_scope_->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create grad for persistable var: " << var->Name()
<< ", which pointer is " << ptr;
} else if (!var->Persistable() && !is_param_grad) {
auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name());
VLOG(3) << "Create variable " << var->Name() << " for microbatch "
<< microbatch_id << ", which pointer is " << ptr;
InitializeVariable(ptr, var->GetType());
}
}
}
void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
const platform::Place& place) {
PADDLE_ENFORCE_NOT_NULL(root_scope_,
platform::errors::InvalidArgument(
"root_scope pointer can not be nullptr"));
auto start_cpu_id = trainer_desc_.section_param().start_cpu_core_id();
SectionWorker::cpu_id_.store(start_cpu_id);
minibatch_scopes_.resize(section_num_);
microbatch_scopes_.resize(section_num_);
skip_vars_.resize(section_num_);
VLOG(3) << "Init ScopeQueues and create all scopes";
for (int i = 0; i < section_num_; ++i) {
minibatch_scopes_[i] = &root_scope_->NewScope();
std::shared_ptr<framework::ProgramDesc> program;
program.reset(new ProgramDesc(
trainer_desc_.section_param().section_config(i).program_desc()));
microbatch_scopes_[i].resize(num_microbatches_);
for (int j = 0; j < num_microbatches_; ++j) {
microbatch_scopes_[i][j] = &minibatch_scopes_[i]->NewScope();
CopyParameters(i, j, *program, places_[i]);
}
GetSkipVars(i, *program);
PADDLE_ENFORCE_NOT_NULL(root_scope_, platform::errors::InvalidArgument(
"root_scope_ can not be nullptr"));
microbatch_scopes_.resize(num_microbatches_);
VLOG(3) << "Create minibatch and microbatch scopes...";
minibatch_scope_ = &root_scope_->NewScope();
std::shared_ptr<framework::ProgramDesc> program;
program.reset(new ProgramDesc(
trainer_desc_.section_param().section_config().program_desc()));
for (int j = 0; j < num_microbatches_; ++j) {
microbatch_scopes_[j] = &minibatch_scope_->NewScope();
CopyParameters(j, *program, place_);
}
for (int i = 0; i < section_num_; ++i) {
auto this_worker =
std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
workers_[i]);
this_worker->SetRootScope(root_scope_);
this_worker->SetMinibatchScope(minibatch_scopes_[i]);
this_worker->SetMicrobatchScopes(microbatch_scopes_[i]);
this_worker->SetSkipVars(skip_vars_[i]);
}
auto this_worker =
std::dynamic_pointer_cast<paddle::framework::SectionWorker>(worker_);
this_worker->SetRootScope(root_scope_);
this_worker->SetMinibatchScope(minibatch_scope_);
this_worker->SetMicrobatchScopes(microbatch_scopes_);
}
void PipelineTrainer::Run() {
VLOG(3) << "Going to run";
for (int i = 0; i < section_num_; ++i) {
if (!debug_) {
section_threads_.push_back(
std::thread(&DeviceWorker::TrainFiles, workers_[i].get()));
} else {
section_threads_.push_back(std::thread(
&DeviceWorker::TrainFilesWithProfiler, workers_[i].get()));
}
}
VLOG(5) << "Going to run PipelineTrainer::Run()";
section_thread_ = std::async(&DeviceWorker::TrainFiles, worker_.get());
}
void PipelineTrainer::Finalize() {
for (auto& th : section_threads_) {
th.join();
try {
section_thread_.get();
} catch (platform::EOFException& e) {
std::rethrow_exception(std::current_exception());
}
if (need_dump_field_) {
FinalizeDumpEnv();
}
VLOG(3) << "copying back parameters. ";
for (int i = 0; i < section_num_; ++i) {
std::shared_ptr<framework::ProgramDesc> program;
program.reset(new ProgramDesc(
trainer_desc_.section_param().section_config(i).program_desc()));
for (int j = 0; j < num_microbatches_; ++j) {
auto& global_block = program->Block(0);
for (auto& var : global_block.AllVars()) {
if (var->Persistable()) {
auto* ptr = root_scope_->FindVar(var->Name());
LoDTensor* root_tensor = ptr->GetMutable<LoDTensor>();
auto* minibatch_ptr = minibatch_scopes_[i]->Var(var->Name());
const LoDTensor& minibatch_tensor = minibatch_ptr->Get<LoDTensor>();
TensorCopy(*static_cast<const Tensor*>(&minibatch_tensor), places_[0],
static_cast<Tensor*>(root_tensor));
VLOG(4) << "Copy persitable var " << var->Name() << " to root scope";
}
}
}
}
root_scope_->DropKids();
SectionWorker::ResetBatchId();
SectionWorker::ResetThreadCompletedFlag();
}
Scope* PipelineTrainer::GetWorkerScope(int thread_id) {
return microbatch_scopes_[thread_id][0];
return microbatch_scopes_[0];
}
} // end namespace framework
......
......@@ -290,29 +290,22 @@ class PipelineTrainer : public TrainerBase {
virtual Scope* GetWorkerScope(int thread_id);
void InitDumpEnv() override;
virtual std::string GetDumpPath(int tid);
void GetSkipVars(int section_id, const ProgramDesc& main_program);
void GetSkipVars(const ProgramDesc& main_program);
protected:
int section_num_;
int num_microbatches_;
int start_cpu_core_id_;
std::vector<std::string> feed_var_names_;
std::vector<platform::Place> places_;
std::vector<std::vector<std::string>> skip_vars_;
platform::Place place_;
std::vector<std::string> skip_vars_;
TrainerDesc trainer_desc_;
std::vector<std::thread> section_threads_;
// worker: [section_id]
std::vector<std::shared_ptr<paddle::framework::DeviceWorker>> workers_;
// minibatch_scopes_: [section_id]
std::vector<Scope*> minibatch_scopes_;
// microbatch_scopes_: [section_id][microbatch_id]
std::vector<std::vector<Scope*>> microbatch_scopes_;
std::future<void> section_thread_;
std::shared_ptr<paddle::framework::DeviceWorker> worker_;
Scope* minibatch_scope_;
// microbatch_scopes_: [microbatch_id]
std::vector<Scope*> microbatch_scopes_;
void CopyParameters(int section_id, int microbatch_id,
const ProgramDesc& program, const platform::Place& place);
bool isPersistableVarGrad(std::string name);
bool isPersistable(VarDesc* var);
void CopyParameters(int microbatch_id, const ProgramDesc& program,
const platform::Place& place);
};
#endif
......
......@@ -86,7 +86,7 @@ message DownpourWorkerParameter {
}
message SectionWorkerParameter {
repeated SectionConfig section_config = 1;
optional SectionConfig section_config = 1;
optional int32 queue_size = 2 [ default = 1 ];
optional int64 sync_steps = 3 [ default = 1 ];
optional int32 start_cpu_core_id = 4 [ default = 1 ];
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
from __future__ import print_function
from __future__ import division
import paddle.fluid as fluid
from paddle.fluid import core, unique_name
......@@ -21,9 +22,55 @@ from .meta_optimizer_base import MetaOptimizerBase
from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op
class PipelineHelper(CollectiveHelper):
def __init__(self, role_maker, nrings=1, wait_port='6174'):
super(PipelineHelper, self).__init__(role_maker, nrings, wait_port)
def _get_node_num(endpoints):
ss = set()
for ep in endpoints:
ip = ep.split(":")[0].strip()
if ip not in ss:
ss.add(ip)
return len(ss)
class PipelineHelper(object):
def __init__(self, role_maker, wait_port='6174'):
self.wait_port = wait_port
self.role_maker = role_maker
def update_startup_program(self,
startup_program=None,
inner_parallelism=None):
self.startup_program = startup_program
endpoints = self.role_maker._get_trainer_endpoints()
current_endpoint = endpoints[self.role_maker._worker_index()]
node_num = _get_node_num(endpoints)
assert len(endpoints) % node_num == 0
nranks = self.role_maker._worker_num()
rank = self.role_maker._worker_index()
# Create ring 0 for all gpus in a pipeline
pipeline_endpoints = []
pipeline_rank = rank % inner_parallelism
pipeline_id = rank // inner_parallelism
for idx, ep in enumerate(endpoints):
if idx // inner_parallelism == pipeline_id:
pipeline_endpoints.append(ep)
self._init_communicator(self.startup_program, current_endpoint,
pipeline_endpoints, pipeline_rank, 0,
self.wait_port)
pipeline_num = len(endpoints) // inner_parallelism
if pipeline_num == 1: return
# Create rings for gpus with the same gpu id
eps = []
local_rank = self.role_maker._worker_index() % inner_parallelism
ring_id = local_rank + 1
for i in range(pipeline_num):
eps.append(endpoints[i * inner_parallelism + local_rank])
temp_rank = self.role_maker._worker_index() // inner_parallelism
self._init_communicator(self.startup_program, current_endpoint, eps,
temp_rank, ring_id, self.wait_port)
self._broadcast_params(ring_id)
def _init_communicator(self, program, current_endpoint, endpoints, rank,
ring_id, wait_port):
......@@ -46,9 +93,8 @@ class PipelineHelper(CollectiveHelper):
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
OP_ROLE_KEY: OpRole.Forward
OP_ROLE_KEY: OpRole.Forward,
})
block.append_op(
type='c_comm_init',
inputs={'X': nccl_id_var},
......@@ -58,12 +104,10 @@ class PipelineHelper(CollectiveHelper):
'rank': rank,
'ring_id': ring_id,
OP_ROLE_KEY: OpRole.Forward,
'device_id': OpRole.Forward
})
def _broadcast_params(self):
def _broadcast_params(self, ring_id):
block = self.startup_program.global_block()
ring_id = 0
for param in block.iter_parameters():
if param.is_distributed:
continue
......@@ -78,13 +122,12 @@ class PipelineHelper(CollectiveHelper):
OP_ROLE_KEY: OpRole.Forward
})
for ring_id in range(self.nrings):
block.append_op(
type='c_sync_comm_stream',
inputs={'X': param},
outputs={'Out': param},
attrs={'ring_id': ring_id,
OP_ROLE_KEY: OpRole.Forward})
block.append_op(
type='c_sync_comm_stream',
inputs={'X': param},
outputs={'Out': param},
attrs={'ring_id': ring_id,
OP_ROLE_KEY: OpRole.Forward})
class PipelineOptimizer(MetaOptimizerBase):
......@@ -99,8 +142,8 @@ class PipelineOptimizer(MetaOptimizerBase):
user_defined_strategy):
super(PipelineOptimizer, self)._set_basic_info(
loss, role_maker, user_defined_optimizer, user_defined_strategy)
num_microbatches = user_defined_strategy.pipeline_configs['micro_batch']
self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches)
self.num_microbatches = user_defined_strategy.pipeline_configs[
'micro_batch']
def _can_apply(self):
if not self.role_maker._is_collective:
......@@ -115,29 +158,46 @@ class PipelineOptimizer(MetaOptimizerBase):
dist_strategy.pipeline_configs = {}
def _enable_strategy(self, dist_strategy, context):
# we do not support enable pipeline automatically right now
return
dist_strategy.pipeline = True
dist_strategy.pipeline_configs = {"micro_batch": 1, }
def _get_local_rank(self, current_endpoint, endpoints):
cur_node_endpoints = []
cur_ip = current_endpoint.split(':')[0].strip()
for ep in endpoints:
if cur_ip == ep.split(':')[0].strip():
cur_node_endpoints.append(ep)
return cur_node_endpoints.index(current_endpoint)
def minimize_impl(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None):
optimize_ops, params_grads, prog_list = \
self.wrapped_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
if self.role_maker._worker_num() == 1:
return optimize_ops, params_grads
endpoints = self.role_maker._get_trainer_endpoints()
current_endpoint = endpoints[self.role_maker._worker_index()]
self.local_rank = self._get_local_rank(current_endpoint, endpoints)
self.wrapped_opt = PO(self.inner_opt,
num_microbatches=self.num_microbatches,
start_cpu_core_id=self.local_rank)
node_num = _get_node_num(endpoints)
gpus_per_node = len(endpoints) // node_num
self.startup_program = startup_program
self.local_rank = self._get_local_rank(current_endpoint, endpoints)
if startup_program is None:
self.startup_program = fluid.default_startup_program()
loss.block.program._pipeline_opt = dict()
loss.block.program._pipeline_opt['local_rank'] = self.local_rank
optimize_ops, params_grads, prog_list = \
self.wrapped_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
assert prog_list
self.main_program_list = prog_list
self.main_program = loss.block.program
self.inner_parallelism = loss.block.program._pipeline_opt[
'inner_parallelism']
nranks = len(endpoints)
self.nranks = nranks
self.nrings = len(self.main_program_list)
......@@ -146,24 +206,26 @@ class PipelineOptimizer(MetaOptimizerBase):
self.endpoints = endpoints
self.current_endpoint = current_endpoint
pipeline_helper = PipelineHelper(self.role_maker, nrings=self.nrings)
pipeline_helper.update_startup_program(self.startup_program)
pipeline_helper = PipelineHelper(self.role_maker)
pipeline_helper.update_startup_program(
self.startup_program._pipeline_opt["startup_program"],
self.inner_parallelism)
self._transpile_main_program()
self._transpile_main_program(loss, node_num, gpus_per_node)
return optimize_ops, params_grads
def _transpile_main_program(self):
self._insert_loss_grad_ops()
for ring_id in range(self.nrings):
def _transpile_main_program(self, loss, node_num, gpus_per_node):
self._insert_loss_grad_ops(loss, gpus_per_node, node_num)
for ring_id in range(1, gpus_per_node + 1):
self._insert_allreduce_ops(ring_id)
def _insert_loss_grad_ops(self):
def _insert_loss_grad_ops(self, loss, gpus_per_node, node_num):
"""
In order to keep the learning rate consistent in different numbers of
training workers, we scale the loss grad by the number of workers
"""
block = self.main_program_list[self.nrings - 1]['program'].global_block(
)
block = self.main_program_list[gpus_per_node - 1][
'program'].global_block()
for idx, op in reversed(list(enumerate(block.ops))):
if is_loss_grad_op(op):
loss_grad_var = block.vars[op.output_arg_names[0]]
......@@ -173,12 +235,12 @@ class PipelineOptimizer(MetaOptimizerBase):
inputs={'X': loss_grad_var},
outputs={'Out': loss_grad_var},
attrs={
'scale': 1.0 / self.nranks,
'scale': 1.0 / node_num,
OP_ROLE_KEY: OpRole.Backward
})
def _insert_allreduce_ops(self, ring_id):
block = self.main_program_list[ring_id]['program'].global_block()
block = self.main_program_list[ring_id - 1]['program'].global_block()
origin_block = self.main_program.global_block()
grad = None
for idx, op in reversed(list(enumerate(block.ops))):
......
......@@ -413,25 +413,17 @@ class Section(DeviceWorker):
section_param = trainer_desc.section_param
section_param.num_microbatches = pipeline_opt["num_microbatches"]
section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"]
for i, program in enumerate(pipeline_opt["section_program_list"]):
cfg = section_param.section_config.add()
cfg.program_desc.ParseFromString(program["program"]._get_desc()
.serialize_to_string())
# TODO: why does not work
# cfg.program_desc.CopyFrom(program.program._get_desc())
place = pipeline_opt["place_list"][i]
place_id = pipeline_opt["place_id_list"][i]
if isinstance(place, core.CPUPlace):
cfg.place = cfg.CPUPlace
elif isinstance(place, core.CUDAPlace):
cfg.place = cfg.CUDAPlace
elif isinstance(place, core.CUDAPinnedPlace):
cfg.place = cfg.CUDAPinnedPlace
else:
raise NotImplementedError(
"SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
)
cfg.place_id = place_id
cfg = section_param.section_config
program = pipeline_opt["section_program"]
cfg.program_desc.ParseFromString(program["program"]._get_desc()
.serialize_to_string())
# TODO: why does not work
# cfg.program_desc.CopyFrom(program.program._get_desc())
place = pipeline_opt["place"]
place_id = pipeline_opt["place_id"]
assert isinstance(place, core.CUDAPlace)
cfg.place = cfg.CUDAPlace
cfg.place_id = place_id
class DeviceWorkerFactory(object):
......
......@@ -561,6 +561,7 @@ class Executor(object):
self._default_executor = core.Executor(p)
self._closed = False
self.pruned_program_scope_caches = dict()
self._prepare_to_run_called = False
self._auto_checkpoint_name = unique_name.generate(
"__auto_checkpoint_executor__")
......@@ -1115,6 +1116,24 @@ class Executor(object):
use_default_main_program = program is None
if program is None:
program = default_main_program()
if fetch_list is not None:
if isinstance(fetch_list, Variable) or isinstance(
fetch_list, str) or isinstance(fetch_list,
six.string_types):
fetch_list = [fetch_list]
assert isinstance(fetch_list, tuple) or isinstance(fetch_list, list), \
"Currently , The fetch_list type only should be list or tuple, \n"\
"but the input type is {}. For more information please refer to \n"\
"the executor.run(...).".format(type(fetch_list))
else:
fetch_list = []
if isinstance(program, Program) and program._pipeline_opt:
if "startup_program" in program._pipeline_opt:
program = program._pipeline_opt["startup_program"]
else:
return self.train_from_dataset(program, fetch_list=fetch_list)
if isinstance(program, Program) and \
len(program.global_block().ops) == 0:
if use_default_main_program:
......@@ -1131,18 +1150,6 @@ class Executor(object):
if scope is None:
scope = global_scope()
if fetch_list is not None:
if isinstance(fetch_list, Variable) or isinstance(
fetch_list, str) or isinstance(fetch_list,
six.string_types):
fetch_list = [fetch_list]
assert isinstance(fetch_list, tuple) or isinstance(fetch_list, list), \
"Currently , The fetch_list type only should be list or tuple, \n"\
"but the input type is {}. For more information please refer to \n"\
"the executor.run(...).".format(type(fetch_list))
else:
fetch_list = []
# use_prune can be overrided by putting optimize_ops in fetch_list
_origin_fetch_list = fetch_list
_origin_program = program
......@@ -1449,6 +1456,25 @@ class Executor(object):
raise RuntimeError("dataset is need and should be initialized")
dataset._prepare_to_run()
real_fetch_list = []
if program._pipeline_opt:
real_program = program._pipeline_opt["section_program"]['program']
for fetch_var in fetch_list:
if isinstance(fetch_var, Variable):
fetch_var_name = fetch_var.name
else:
fetch_var_name = fetch_var
if fetch_var_name in real_program.global_block().vars:
real_fetch_list.append(fetch_var)
program._pipeline_opt["section_program"][
'program'] = self._add_feed_fetch_ops(
program=program._pipeline_opt["section_program"]['program'],
feed=[],
fetch_list=real_fetch_list,
feed_var_name='feed',
fetch_var_name='fetch')
fetch_list = None
scope, trainer = self._prepare_trainer(
program=program,
......@@ -1483,6 +1509,10 @@ class Executor(object):
dataset._dynamic_adjust_after_train()
dataset._finish_to_run()
if real_fetch_list:
arr = scope.find_var('fetch').get_fetch_list()
tensors = arr._move_to_list()
return as_numpy(tensors)
return None
......
此差异已折叠。
......@@ -10,10 +10,12 @@ if(NOT WITH_NCCL)
endif()
string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
list(APPEND DIST_TEST_OPS test_pipeline)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer)
list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
list(APPEND DIST_TEST_OPS test_listen_and_serv_op)
list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer)
set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
......@@ -146,7 +148,6 @@ if (WITH_NCCL)
endif()
if(NOT WITH_GPU OR WIN32)
LIST(REMOVE_ITEM TEST_OPS test_pipeline)
LIST(REMOVE_ITEM TEST_OPS test_boxps)
endif()
list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
......@@ -469,7 +470,6 @@ if(WITH_DISTRIBUTE)
py_test_modules(test_fleet_sharding_meta_optimizer MODULES test_fleet_sharding_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
py_test_modules(test_fleet_distributed_strategy MODULES test_fleet_distributed_strategy)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
from functools import reduce
from test_dist_base import TestDistRunnerBase, runtime_main
import paddle.distributed.fleet as fleet
paddle.enable_static()
DTYPE = "float32"
paddle.dataset.mnist.fetch()
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.01)))
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.01)))
SIZE = 10
input_shape = conv_pool_2.shape
param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
predict = fluid.layers.fc(
input=conv_pool_2,
size=SIZE,
act="softmax",
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)))
return predict
class TestDistMnist2x2(TestDistRunnerBase):
def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
# Input data
with fluid.device_guard("gpu:0"):
images = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if dist_strategy:
data_loader = fluid.io.DataLoader.from_generator(
feed_list=[images, label],
capacity=64,
use_double_buffer=False,
iterable=False)
# Train program
predict = cnn_model(images)
with fluid.device_guard("gpu:1"):
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
with fluid.device_guard("gpu:1"):
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
base_lr = self.lr
passes = [30, 60, 80, 90]
steps_per_pass = 10
bd = [steps_per_pass * p for p in passes]
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9)
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
if dist_strategy:
fleet.init(is_collective=True)
strategy = fleet.DistributedStrategy()
strategy.pipeline = True
dist_opt = fleet.distributed_optimizer(
optimizer=opt, strategy=strategy)
dist_opt.minimize(avg_cost)
else:
opt.minimize(avg_cost)
if dist_strategy:
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader
else:
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
if __name__ == "__main__":
runtime_main(TestDistMnist2x2)
......@@ -124,6 +124,67 @@ class TestDistRunnerBase(object):
exe.run(pserver_prog)
print_to_err(type(self).__name__, "run pserver main program done.")
def run_pipeline_trainer(self, args):
self.lr = args.lr
dist_strategy = DistributedStrategy()
test_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader = \
self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)
device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
eprint(type(self).__name__, "device_id: %d." % device_id)
place = fluid.CUDAPlace(device_id)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
eprint(type(self).__name__, "run worker startup program done.")
data_loader.set_sample_list_generator(train_reader, place)
data_loader.start()
print_to_err(type(self).__name__, "begin to train on trainer")
out_losses = []
for i in six.moves.xrange(RUN_STEP):
loss = exe.run(fluid.default_main_program(), fetch_list=[avg_cost])
loss = loss[0] if loss else None
out_losses.append(loss)
print_to_err(type(self).__name__, "run step %d finished" % i)
print_to_err(type(self).__name__, "trainer run finished")
if six.PY2:
print(pickle.dumps(out_losses))
else:
sys.stdout.buffer.write(pickle.dumps(out_losses))
if args.save_model:
model_save_dir = "/tmp"
if fleet.worker_index() == 0:
model_save_dir_fluid = os.path.join(model_save_dir,
"fluid_persistables")
model_save_dir_fleet = os.path.join(model_save_dir,
"fleet_persistables")
infer_save_dir_fluid = os.path.join(model_save_dir,
"fluid_infer")
infer_save_dir_fleet = os.path.join(model_save_dir,
"fleet_infer")
else:
model_save_dir_fluid = os.path.join(model_save_dir,
"fluid_persistables_2")
model_save_dir_fleet = os.path.join(model_save_dir,
"fleet_persistables_2")
infer_save_dir_fluid = os.path.join(model_save_dir,
"fluid_infer_2")
infer_save_dir_fleet = os.path.join(model_save_dir,
"fleet_infer_2")
fluid.io.save_persistables(exe, model_save_dir_fluid,
fleet._origin_program)
fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet)
feeded_var_names = [var.name for var in feed_var_list]
fluid.io.save_inference_model(infer_save_dir_fluid,
feeded_var_names, [avg_cost], exe,
fleet._origin_program)
fleet.save_inference_model(exe, infer_save_dir_fleet,
feeded_var_names, [avg_cost])
def run_gpu_fleet_api_trainer(self, args):
assert args.update_method == "nccl2"
......@@ -532,6 +593,7 @@ def runtime_main(test_class):
parser.add_argument('--nccl_comm_num', type=int, required=False, default=1)
parser.add_argument('--enable_backward_deps', action='store_true')
parser.add_argument('--use_hallreduce', action='store_true')
parser.add_argument('--use_pipeline', action='store_true')
parser.add_argument('--gpu_fleet_api', action='store_true')
parser.add_argument('--use_local_sgd', action='store_true')
parser.add_argument('--ut4grad_allreduce', action='store_true')
......@@ -566,6 +628,8 @@ def runtime_main(test_class):
model.run_pserver(args)
elif args.gpu_fleet_api:
model.run_gpu_fleet_api_trainer(args)
elif args.use_pipeline:
model.run_pipeline_trainer(args)
else:
model.run_trainer(args)
......@@ -607,6 +671,7 @@ class TestDistBase(unittest.TestCase):
self._dc_asgd = False # must use with async mode
self._use_reader_alloc = True
self._nccl2_mode = False
self._pipeline_mode = False
self._mp_mode = False
# FIXME(typhoonzero): I added this stupid argument to enable
# testing allreduce layers, which users can call layers.allreduce
......@@ -892,6 +957,8 @@ class TestDistBase(unittest.TestCase):
if self._use_dgc:
tr_cmd += " --use_dgc"
if self._pipeline_mode:
tr_cmd += " --use_pipeline"
if self._mp_mode:
env = {"FLAGS_selected_gpus": "{}".format(trainer_id % 2)}
......@@ -978,6 +1045,51 @@ class TestDistBase(unittest.TestCase):
print("outs[1]:", outs[1])
return pickle.loads(outs[0]), pickle.loads(outs[1])
def _run_pipeline(self, model, envs, check_error_log, log_name):
# NOTE: we reuse ps_endpoints as nccl2 worker endpoints
worker_endpoints = self._ps_endpoints.split(",")
update_method = "nccl2"
trainer_num = len(worker_endpoints)
procs = []
pipes = []
for i in range(0, trainer_num):
tr_cmd, tr_env = self._get_nccl2_trainer_cmd(
model, worker_endpoints[i], update_method, i, trainer_num)
tr_env.update(envs)
tr_env['CUDA_VISIBLE_DEVICES'] = "0,1"
tr_env['NCCL_SHM_DISABLE'] = '1'
tr_env['FLAGS_selected_gpus'] = str(i)
tr_env['FLAGS_cudnn_deterministic'] = '0'
print("tr_cmd:{}, env: {}".format(tr_cmd, tr_env))
tr_pipe = open("/tmp/" + "tr{}_err.log".format(i), "wb")
print_to_err(
type(self).__name__,
"going to start process {} with nccl2".format(i))
tr_proc = subprocess.Popen(
tr_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=tr_pipe,
env=tr_env)
procs.append(tr_proc)
pipes.append(tr_pipe)
outs = []
for i in range(0, trainer_num):
tr_out, tr_err = procs[i].communicate()
outs.append(tr_out)
pipes[i].close()
sys.stderr.write('trainer {} stderr: {}\n'.format(i, tr_err))
if check_error_log:
print("outs[0]:", outs[0])
print("outs[1]:", outs[1])
return pickle.loads(outs[0]), pickle.loads(outs[1])
def _get_required_envs(self, check_error_log=False, need_envs={}):
# TODO(typhoonzero): should auto adapt GPU count on the machine.
required_envs = {
......@@ -1032,6 +1144,9 @@ class TestDistBase(unittest.TestCase):
False,
check_error_log,
log_name=log_name)
elif self._pipeline_mode:
tr0_losses, tr1_losses = self._run_pipeline(
model_file, required_envs, check_error_log, log_name=log_name)
else:
tr0_losses, tr1_losses = self._run_cluster(
model_file, required_envs, check_error_log, log_name=log_name)
......@@ -1040,7 +1155,10 @@ class TestDistBase(unittest.TestCase):
local_loss = local_losses[step_id]
tr0_loss = tr0_losses[step_id]
tr1_loss = tr1_losses[step_id]
dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2
if self._pipeline_mode:
dist_loss = np.array([tr1_loss])
else:
dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2
print("=======", local_loss, ":", dist_loss[0], "=======")
self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta)
......
......@@ -16,6 +16,8 @@ import unittest
import paddle
import os
paddle.enable_static()
class TestFleetMetaOptimizer(unittest.TestCase):
def setUp(self):
......@@ -28,19 +30,14 @@ class TestFleetMetaOptimizer(unittest.TestCase):
import paddle.distributed.fleet.base.role_maker as role_maker
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
with paddle.fluid.device_guard("cpu"):
with paddle.fluid.device_guard("gpu:0"):
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(
name="y", shape=[1], dtype='int64')
data_loader = paddle.fluid.io.DataLoader.from_generator(
feed_list=[input_x, input_y],
capacity=64,
use_double_buffer=True,
iterable=False)
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
with paddle.fluid.device_guard("gpu:0"):
with paddle.fluid.device_guard("gpu:1"):
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2],
size=2,
......
......@@ -13,212 +13,32 @@
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import numpy as np
import os
import shutil
import unittest
import math
def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(
input=conv,
act=act, )
def shortcut(input, ch_out, stride, is_first):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1 or is_first == True:
return conv_bn_layer(input, ch_out, 1, stride)
else:
return input
def bottleneck_block(input, num_filters, stride):
conv0 = conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu')
conv2 = conv_bn_layer(
input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)
short = shortcut(input, num_filters * 4, stride, is_first=False)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def basic_block(input, num_filters, stride, is_first):
conv0 = conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=3,
act='relu',
stride=stride)
conv1 = conv_bn_layer(
input=conv0, num_filters=num_filters, filter_size=3, act=None)
short = shortcut(input, num_filters, stride, is_first)
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
from test_dist_base import TestDistBase
def build_network(input, layers=50, class_dim=1000):
supported_layers = [18, 34, 50, 101, 152]
assert layers in supported_layers
depth = None
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
with fluid.device_guard("cpu"):
conv = conv_bn_layer(
input=input, num_filters=64, filter_size=7, stride=2, act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
if layers >= 50:
for block in range(len(depth)):
with fluid.device_guard("gpu:0"):
for i in range(depth[block]):
conv = bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1)
with fluid.device_guard("gpu:0"):
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
else:
for block in range(len(depth)):
with fluid.device_guard("gpu:0"):
for i in range(depth[block]):
conv = basic_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
is_first=block == i == 0)
with fluid.device_guard("gpu:0"):
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(
input=pool,
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
return out
class TestPipeline(unittest.TestCase):
""" TestCases for Pipeline Training. """
def _run(self, debug):
main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
with fluid.device_guard("cpu"):
image = fluid.layers.data(
name="image", shape=[3, 224, 224], dtype="float32")
label = fluid.layers.data(
name="label", shape=[1], dtype="int64")
data_loader = fluid.io.DataLoader.from_generator(
feed_list=[image, label],
capacity=64,
use_double_buffer=True,
iterable=False)
fc = build_network(image, layers=50)
with fluid.device_guard("gpu:0"):
out, prob = fluid.layers.softmax_with_cross_entropy(
logits=fc, label=label, return_softmax=True)
loss = fluid.layers.mean(out)
acc_top1 = fluid.layers.accuracy(input=prob, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=prob, label=label, k=5)
base_lr = 0.1
passes = [30, 60, 80, 90]
total_images = 1281167
steps_per_pass = total_images // 128
bd = [steps_per_pass * p for p in passes]
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
optimizer = fluid.optimizer.MomentumOptimizer(
lr_val,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer = fluid.optimizer.PipelineOptimizer(
optimizer, num_microbatches=2)
optimizer.minimize(loss)
def train_reader():
for _ in range(4):
img = np.random.random(size=[3, 224, 224]).astype('float32')
label = np.random.random(size=[1]).astype('int64')
yield img, label
data_loader.set_sample_generator(train_reader, batch_size=1)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
data_loader.start()
exe.train_from_dataset(main_prog, debug=debug)
def test_pipeline(self):
self._run(False)
self._run(True)
def test_pipeline_noneoptimizer(self):
with fluid.device_guard("gpu:0"):
x = fluid.layers.data(
name='x', shape=[1], dtype='int64', lod_level=0)
y = fluid.layers.data(
name='y', shape=[1], dtype='int64', lod_level=0)
emb_x = layers.embedding(
input=x,
param_attr=fluid.ParamAttr(name="embx"),
size=[10, 2],
is_sparse=False)
fc = layers.fc(input=emb_x,
name="fc",
size=1,
num_flatten_dims=1,
bias_attr=False)
loss = layers.reduce_mean(fc)
import os
import paddle
optimizer = fluid.optimizer.SGD(learning_rate=0.5)
with self.assertRaises(ValueError):
optimizer = fluid.optimizer.PipelineOptimizer(
dict(), num_microbatches=2)
paddle.enable_static()
flag_name = os.path.splitext(__file__)[0]
class TestPipeline(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_reduce = False
self._use_reader_alloc = False
self._pipeline_mode = True
self._nccl_comm_num = 1
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place(
"pipeline_mnist.py",
delta=1e-5,
check_error_log=True,
log_name=flag_name)
if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册