Unverified commit 0ec3a42e authored by hutuxian, committed by GitHub

Random Dump (#24477)

* Refactor the dump_field & dump_param code: abstract the common functionality into the base class.
* Support dumping instances randomly, optionally keyed by the instance line id.
* Support specifying the dump interval, which avoids printing too many logs.
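In effect, each instance in a batch is kept for dumping only when a per-instance value r satisfies r % dump_interval == 0, where r is an XXH64 hash of the line id (mode 1), a random draw (mode 2), or fixed at 0 so every instance is dumped (mode 0). Below is a minimal sketch of that decision rule, not Paddle API; the should_dump helper is hypothetical and the third-party xxhash package is assumed for the 64-bit hash:

    import random
    import xxhash  # assumed third-party package providing XXH64

    def should_dump(ins_id, dump_mode, dump_interval):
        """Sketch of the sampling rule: dump iff r % dump_interval == 0."""
        if dump_mode == 1:        # random with line-id hash: stable per instance
            r = xxhash.xxh64(ins_id, seed=0).intdigest()
        elif dump_mode == 2:      # random with a random number
            r = random.randint(0, 2**31 - 1)
        else:                     # mode 0: no random, every instance is kept
            r = 0
        return r % dump_interval == 0

With mode 1 the same line id always hashes to the same value, so reruns dump the same subset; mode 2 trades that reproducibility for uniform sampling.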
Parent 736d3acc
...@@ -66,11 +66,9 @@ else() ...@@ -66,11 +66,9 @@ else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif() endif()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
cc_library(device_worker SRCS device_worker.cc DEPS trainer_desc_proto lod_tensor)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_test(device_worker_test SRCS device_worker_test.cc DEPS device_worker)
cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog) cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog)
...@@ -87,6 +85,8 @@ endif() ...@@ -87,6 +85,8 @@ endif()
cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits) cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits)
cc_library(device_worker SRCS device_worker.cc DEPS trainer_desc_proto lod_tensor scope)
cc_test(device_worker_test SRCS device_worker_test.cc DEPS device_worker)
cc_library(scope_pool SRCS scope_pool.cc DEPS scope) cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope)
......
...@@ -881,6 +881,7 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) { ...@@ -881,6 +881,7 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
uint32_t rank; uint32_t rank;
GetMsgFromLogKey(log_key, &search_id, &cmatch, &rank); GetMsgFromLogKey(log_key, &search_id, &cmatch, &rank);
instance->ins_id_ = log_key;
instance->search_id = search_id; instance->search_id = search_id;
instance->cmatch = cmatch; instance->cmatch = cmatch;
instance->rank = rank; instance->rank = rank;
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker.h"
#include "xxhash.h" // NOLINT
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -91,5 +92,109 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size) { ...@@ -91,5 +92,109 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size) {
return true; return true;
} }
void DeviceWorker::DumpParam(const Scope& scope, const int batch_id) {
std::ostringstream os;
for (auto& param : *dump_param_) {
os.str("");
Variable* var = scope.FindVar(param);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
framework::LoDTensor cpu_tensor;
if (platform::is_gpu_place(tensor->place())) {
TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
tensor = &cpu_tensor;
}
int64_t len = tensor->numel();
os << "(" << batch_id << "," << param << ")"
<< PrintLodTensor(tensor, 0, len);
writer_ << os.str();
}
}
void DeviceWorker::InitRandomDumpConfig(const TrainerDesc& desc) {
bool enable_random_dump = desc.enable_random_dump();
if (!enable_random_dump) {
dump_mode_ = 0;
} else {
if (desc.random_with_lineid()) {
dump_mode_ = 1;
} else {
dump_mode_ = 2;
}
}
dump_interval_ = desc.dump_interval();
}
void DeviceWorker::DumpField(const Scope& scope, int dump_mode,
int dump_interval) { // dump_mode: 0: no random,
// 1: random with insid hash,
// 2: random with random
// number
size_t batch_size = device_reader_->GetCurBatchSize();
auto& ins_id_vec = device_reader_->GetInsIdVec();
auto& ins_content_vec = device_reader_->GetInsContentVec();
if (ins_id_vec.size() > 0) {
batch_size = ins_id_vec.size();
}
std::vector<std::string> ars(batch_size);
std::vector<bool> hit(batch_size, false);
std::default_random_engine engine(0);
std::uniform_int_distribution<size_t> dist(0U, INT_MAX);
for (size_t i = 0; i < batch_size; i++) {
size_t r = 0;
if (dump_mode == 1) {
r = XXH64(ins_id_vec[i].data(), ins_id_vec[i].length(), 0);
} else if (dump_mode == 2) {
r = dist(engine);
}
if (r % dump_interval != 0) {
continue;
}
hit[i] = true;
}
for (size_t i = 0; i < ins_id_vec.size(); i++) {
if (!hit[i]) {
continue;
}
ars[i] += ins_id_vec[i];
ars[i] = ars[i] + "\t" + ins_content_vec[i];
}
for (auto& field : *dump_fields_) {
Variable* var = scope.FindVar(field);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
framework::LoDTensor cpu_tensor;
if (platform::is_gpu_place(tensor->place())) {
TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
tensor = &cpu_tensor;
}
if (!CheckValidOutput(tensor, batch_size)) {
continue;
}
for (size_t i = 0; i < batch_size; ++i) {
if (!hit[i]) {
continue;
}
auto output_dim = tensor->dims()[1];
std::string output_dimstr = boost::lexical_cast<std::string>(output_dim);
ars[i] = ars[i] + "\t" + field + ":" + output_dimstr;
auto bound = GetTensorBound(tensor, i);
ars[i] += PrintLodTensor(tensor, bound.first, bound.second);
}
}
// #pragma omp parallel for
for (size_t i = 0; i < ars.size(); i++) {
if (ars[i].length() == 0) {
continue;
}
writer_ << ars[i];
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -120,6 +120,7 @@ class DeviceWorker { ...@@ -120,6 +120,7 @@ class DeviceWorker {
} }
virtual ~DeviceWorker() {} virtual ~DeviceWorker() {}
virtual void Initialize(const TrainerDesc& desc) = 0; virtual void Initialize(const TrainerDesc& desc) = 0;
virtual void InitRandomDumpConfig(const TrainerDesc& desc);
virtual void SetDeviceIndex(int tid) = 0; virtual void SetDeviceIndex(int tid) = 0;
virtual void TrainFiles() = 0; virtual void TrainFiles() = 0;
virtual void PrintFetchVars() = 0; virtual void PrintFetchVars() = 0;
...@@ -129,8 +130,21 @@ class DeviceWorker { ...@@ -129,8 +130,21 @@ class DeviceWorker {
virtual void BindingDataFeedMemory() = 0; virtual void BindingDataFeedMemory() = 0;
virtual void SetRootScope(Scope* root_scope); virtual void SetRootScope(Scope* root_scope);
virtual void SetDataFeed(DataFeed* data_feed); virtual void SetDataFeed(DataFeed* data_feed);
virtual void SetNeedDump(bool need_dump_field) {} virtual void SetNeedDumpField(bool need_dump_field) {
virtual void SetChannelWriter(ChannelObject<std::string>* queue) {} need_dump_field_ = need_dump_field;
}
virtual void SetNeedDumpParam(bool need_dump_param) {
need_dump_param_ = need_dump_param;
}
virtual void SetDumpFieldVector(const std::vector<std::string>& dump_fields) {
dump_fields_ = &dump_fields;
}
virtual void SetDumpParamVector(const std::vector<std::string>& dump_param) {
dump_param_ = &dump_param;
}
virtual void SetChannelWriter(ChannelObject<std::string>* queue) {
writer_.Reset(queue);
}
virtual void SetPlace(const paddle::platform::Place& place) { virtual void SetPlace(const paddle::platform::Place& place) {
place_ = place; place_ = place;
} }
...@@ -140,6 +154,9 @@ class DeviceWorker { ...@@ -140,6 +154,9 @@ class DeviceWorker {
virtual Scope* GetThreadScope() { return thread_scope_; } virtual Scope* GetThreadScope() { return thread_scope_; }
protected: protected:
virtual void DumpParam(const Scope& scope, const int batch_id);
virtual void DumpField(const Scope& scope, int dump_mode,
int dump_interval = 10000);
Scope* root_scope_ = nullptr; Scope* root_scope_ = nullptr;
Scope* thread_scope_; Scope* thread_scope_;
paddle::platform::Place place_; paddle::platform::Place place_;
...@@ -148,6 +165,16 @@ class DeviceWorker { ...@@ -148,6 +165,16 @@ class DeviceWorker {
FetchConfig fetch_config_; FetchConfig fetch_config_;
bool use_cvm_; bool use_cvm_;
bool no_cvm_; bool no_cvm_;
// dump params or grads for debug
bool need_dump_param_;
bool need_dump_field_;
const std::vector<std::string>* dump_param_;
const std::vector<std::string>* dump_fields_;
int dump_mode_ = 0;
int dump_interval_ = 10000;
ChannelWriter<std::string> writer_;
}; };
class CPUWorkerBase : public DeviceWorker { class CPUWorkerBase : public DeviceWorker {
...@@ -176,8 +203,6 @@ class HogwildWorker : public CPUWorkerBase { ...@@ -176,8 +203,6 @@ class HogwildWorker : public CPUWorkerBase {
virtual void Initialize(const TrainerDesc& desc); virtual void Initialize(const TrainerDesc& desc);
virtual void TrainFiles(); virtual void TrainFiles();
virtual void TrainFilesWithProfiler(); virtual void TrainFilesWithProfiler();
virtual void SetNeedDump(bool need_dump_field);
virtual void SetChannelWriter(ChannelObject<std::string>* queue);
virtual void PrintFetchVars(); virtual void PrintFetchVars();
virtual void CreateDeviceResource(const ProgramDesc& main_prog); virtual void CreateDeviceResource(const ProgramDesc& main_prog);
virtual void BindingDataFeedMemory(); virtual void BindingDataFeedMemory();
...@@ -187,7 +212,6 @@ class HogwildWorker : public CPUWorkerBase { ...@@ -187,7 +212,6 @@ class HogwildWorker : public CPUWorkerBase {
protected: protected:
void CreateThreadOperators(const ProgramDesc& program); void CreateThreadOperators(const ProgramDesc& program);
void CreateThreadScope(const ProgramDesc& program); void CreateThreadScope(const ProgramDesc& program);
virtual void DumpParam(const int batch_id);
std::vector<std::string> op_names_; std::vector<std::string> op_names_;
std::vector<OperatorBase*> ops_; std::vector<OperatorBase*> ops_;
...@@ -196,12 +220,6 @@ class HogwildWorker : public CPUWorkerBase { ...@@ -196,12 +220,6 @@ class HogwildWorker : public CPUWorkerBase {
HogwildWorkerParameter param_; HogwildWorkerParameter param_;
std::vector<std::string> skip_ops_; std::vector<std::string> skip_ops_;
std::map<std::string, int> stat_var_name_map_; std::map<std::string, int> stat_var_name_map_;
// dump params or grads for debug
bool need_dump_param_;
bool need_dump_field_;
std::vector<std::string> dump_param_;
std::vector<std::string> dump_fields_;
ChannelWriter<std::string> writer_;
}; };
class DownpourWorker : public HogwildWorker { class DownpourWorker : public HogwildWorker {
...@@ -211,8 +229,6 @@ class DownpourWorker : public HogwildWorker { ...@@ -211,8 +229,6 @@ class DownpourWorker : public HogwildWorker {
virtual void Initialize(const TrainerDesc& desc); virtual void Initialize(const TrainerDesc& desc);
virtual void TrainFiles(); virtual void TrainFiles();
virtual void TrainFilesWithProfiler(); virtual void TrainFilesWithProfiler();
virtual void SetNeedDump(bool need_dump_field);
virtual void SetChannelWriter(ChannelObject<std::string>* queue);
protected: protected:
std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_; std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
...@@ -224,7 +240,6 @@ class DownpourWorker : public HogwildWorker { ...@@ -224,7 +240,6 @@ class DownpourWorker : public HogwildWorker {
void CopySparseTable(); void CopySparseTable();
void CopyDenseTable(); void CopyDenseTable();
void CopyDenseVars(); void CopyDenseVars();
virtual void DumpParam(const int batch_id);
DownpourWorkerParameter param_; DownpourWorkerParameter param_;
// copy table // copy table
......
...@@ -29,18 +29,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc, ...@@ -29,18 +29,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
thread_num_ = trainer_desc.thread_num(); thread_num_ = trainer_desc.thread_num();
SetDataset(dataset); SetDataset(dataset);
dump_fields_path_ = trainer_desc.dump_fields_path(); ParseDumpConfig(trainer_desc);
dump_converter_ = trainer_desc.dump_converter();
need_dump_field_ = false;
if (trainer_desc.dump_fields_size() != 0 && dump_fields_path_ != "") {
need_dump_field_ = true;
}
if (need_dump_field_) {
auto &file_list = dataset->GetFileList();
if (file_list.size() == 0) {
need_dump_field_ = false;
}
}
mpi_rank_ = trainer_desc.mpi_rank(); mpi_rank_ = trainer_desc.mpi_rank();
mpi_size_ = trainer_desc.mpi_size(); mpi_size_ = trainer_desc.mpi_size();
dump_file_num_ = trainer_desc.dump_file_num(); dump_file_num_ = trainer_desc.dump_file_num();
...@@ -60,8 +49,12 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc, ...@@ -60,8 +49,12 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
trainer_desc.device_worker_name()); trainer_desc.device_worker_name());
workers_[i]->SetDeviceIndex(i); workers_[i]->SetDeviceIndex(i);
workers_[i]->SetDataFeed(readers[i]); workers_[i]->SetDataFeed(readers[i]);
workers_[i]->SetNeedDumpField(need_dump_field_);
workers_[i]->SetNeedDumpParam(need_dump_param_);
workers_[i]->SetDumpFieldVector(dump_fields_);
workers_[i]->SetDumpParamVector(dump_param_);
workers_[i]->InitRandomDumpConfig(trainer_desc);
workers_[i]->Initialize(trainer_desc); workers_[i]->Initialize(trainer_desc);
workers_[i]->SetNeedDump(need_dump_field_);
} }
VLOG(3) << "going to initialize pull dense worker"; VLOG(3) << "going to initialize pull dense worker";
...@@ -71,33 +64,6 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc, ...@@ -71,33 +64,6 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
SetDebug(trainer_desc.debug()); SetDebug(trainer_desc.debug());
} }
void DistMultiTrainer::DumpWork(int tid) {
#ifdef _LINUX
int err_no = 0;
std::string path = string::format_string(
"%s/part-%03d-%05d", dump_fields_path_.c_str(), mpi_rank_, tid);
std::shared_ptr<FILE> fp = fs_open_write(path, &err_no, dump_converter_);
while (1) {
std::string out_str;
if (!queue_->Get(out_str)) {
break;
}
size_t write_count =
fwrite_unlocked(out_str.data(), 1, out_str.length(), fp.get());
if (write_count != out_str.length()) {
VLOG(3) << "dump text failed";
continue;
}
write_count = fwrite_unlocked("\n", 1, 1, fp.get());
if (write_count != 1) {
VLOG(3) << "dump text failed";
continue;
}
}
#endif
}
void DistMultiTrainer::InitDumpEnv() { void DistMultiTrainer::InitDumpEnv() {
queue_ = paddle::framework::MakeChannel<std::string>(); queue_ = paddle::framework::MakeChannel<std::string>();
for (int i = 0; i < thread_num_; ++i) { for (int i = 0; i < thread_num_; ++i) {
...@@ -112,16 +78,8 @@ void DistMultiTrainer::InitDumpEnv() { ...@@ -112,16 +78,8 @@ void DistMultiTrainer::InitDumpEnv() {
} }
for (int i = 0; i < dump_thread_num_; i++) { for (int i = 0; i < dump_thread_num_; i++) {
dump_thread_.push_back( dump_thread_.push_back(
std::thread(std::bind(&DistMultiTrainer::DumpWork, this, i))); std::thread(std::bind(&TrainerBase::DumpWork, this, i)));
}
}
void DistMultiTrainer::FinalizeDumpEnv() {
queue_->Close();
for (auto &th : dump_thread_) {
th.join();
} }
queue_.reset();
} }
void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program, void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program,
......
...@@ -80,19 +80,7 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) { ...@@ -80,19 +80,7 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
no_cvm_ = desc.no_cvm(); no_cvm_ = desc.no_cvm();
scale_datanorm_ = desc.scale_datanorm(); scale_datanorm_ = desc.scale_datanorm();
dump_slot_ = desc.dump_slot(); dump_slot_ = desc.dump_slot();
dump_fields_.resize(desc.dump_fields_size());
for (int i = 0; i < desc.dump_fields_size(); ++i) {
dump_fields_[i] = desc.dump_fields(i);
}
adjust_ins_weight_config_ = desc.adjust_ins_weight_config(); adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
need_dump_param_ = false;
dump_param_.resize(desc.dump_param_size());
for (int i = 0; i < desc.dump_param_size(); ++i) {
dump_param_[i] = desc.dump_param(i);
}
if (desc.dump_param_size() != 0) {
need_dump_param_ = true;
}
for (int i = 0; i < desc.check_nan_var_names_size(); ++i) { for (int i = 0; i < desc.check_nan_var_names_size(); ++i) {
check_nan_var_names_.push_back(desc.check_nan_var_names(i)); check_nan_var_names_.push_back(desc.check_nan_var_names(i));
} }
...@@ -121,30 +109,6 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) { ...@@ -121,30 +109,6 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
} }
} }
void DownpourWorker::SetChannelWriter(ChannelObject<std::string>* queue) {
writer_.Reset(queue);
}
void DownpourWorker::SetNeedDump(bool need_dump_field) {
need_dump_field_ = need_dump_field;
}
void DownpourWorker::DumpParam(const int batch_id) {
std::ostringstream os;
for (auto& param : dump_param_) {
os.str("");
Variable* var = thread_scope_->FindVar(param);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int64_t len = tensor->numel();
os << "(" << batch_id << "," << param << ")"
<< PrintLodTensor(tensor, 0, len);
writer_ << os.str();
}
}
void DownpourWorker::CollectLabelInfo(size_t table_idx) { void DownpourWorker::CollectLabelInfo(size_t table_idx) {
if (no_cvm_) { if (no_cvm_) {
return; return;
...@@ -915,52 +879,17 @@ void DownpourWorker::TrainFiles() { ...@@ -915,52 +879,17 @@ void DownpourWorker::TrainFiles() {
} }
} }
if (need_dump_field_) { if (need_dump_field_) {
size_t batch_size = device_reader_->GetCurBatchSize(); DumpField(*thread_scope_, dump_mode_, dump_interval_);
std::vector<std::string> ars(batch_size); }
for (auto& ar : ars) { if (need_dump_param_ && thread_id_ == 0) {
ar.clear(); DumpParam(*thread_scope_, batch_cnt);
}
auto& ins_id_vec = device_reader_->GetInsIdVec();
auto& ins_content_vec = device_reader_->GetInsContentVec();
for (size_t i = 0; i < ins_id_vec.size(); i++) {
ars[i] += ins_id_vec[i];
ars[i] = ars[i] + "\t" + ins_content_vec[i];
}
for (auto& field : dump_fields_) {
Variable* var = thread_scope_->FindVar(field);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (!CheckValidOutput(tensor, batch_size)) {
continue;
}
for (size_t i = 0; i < batch_size; ++i) {
auto output_dim = tensor->dims()[1];
std::string output_dimstr =
boost::lexical_cast<std::string>(output_dim);
ars[i] = ars[i] + "\t" + field + ":" + output_dimstr;
auto bound = GetTensorBound(tensor, i);
ars[i] += PrintLodTensor(tensor, bound.first, bound.second);
}
}
// #pragma omp parallel for
for (size_t i = 0; i < ars.size(); i++) {
if (ars[i].length() == 0) {
continue;
}
writer_ << ars[i];
}
if (need_dump_param_ && thread_id_ == 0) {
DumpParam(batch_cnt);
}
} }
PrintFetchVars(); PrintFetchVars();
thread_scope_->DropKids(); thread_scope_->DropKids();
++batch_cnt; ++batch_cnt;
} }
if (need_dump_field_) { if (need_dump_field_ || need_dump_param_) {
writer_.Flush(); writer_.Flush();
} }
if (copy_table_config_.need_copy()) { if (copy_table_config_.need_copy()) {
......
...@@ -156,19 +156,7 @@ void DownpourWorkerOpt::Initialize(const TrainerDesc& desc) { ...@@ -156,19 +156,7 @@ void DownpourWorkerOpt::Initialize(const TrainerDesc& desc) {
no_cvm_ = desc.no_cvm(); no_cvm_ = desc.no_cvm();
scale_datanorm_ = desc.scale_datanorm(); scale_datanorm_ = desc.scale_datanorm();
dump_slot_ = desc.dump_slot(); dump_slot_ = desc.dump_slot();
dump_fields_.resize(desc.dump_fields_size());
for (int i = 0; i < desc.dump_fields_size(); ++i) {
dump_fields_[i] = desc.dump_fields(i);
}
adjust_ins_weight_config_ = desc.adjust_ins_weight_config(); adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
need_dump_param_ = false;
dump_param_.resize(desc.dump_param_size());
for (int i = 0; i < desc.dump_param_size(); ++i) {
dump_param_[i] = desc.dump_param(i);
}
if (desc.dump_param_size() != 0) {
need_dump_param_ = true;
}
for (int i = 0; i < desc.loss_names_size(); ++i) { for (int i = 0; i < desc.loss_names_size(); ++i) {
loss_names_.push_back(desc.loss_names(i)); loss_names_.push_back(desc.loss_names(i));
} }
...@@ -527,52 +515,17 @@ void DownpourWorkerOpt::TrainFiles() { ...@@ -527,52 +515,17 @@ void DownpourWorkerOpt::TrainFiles() {
} }
} }
if (need_dump_field_) { if (need_dump_field_) {
size_t batch_size = device_reader_->GetCurBatchSize(); DumpField(*thread_scope_, dump_mode_, dump_interval_);
std::vector<std::string> ars(batch_size); }
for (auto& ar : ars) { if (need_dump_param_ && thread_id_ == 0) {
ar.clear(); DumpParam(*thread_scope_, batch_cnt);
}
auto& ins_id_vec = device_reader_->GetInsIdVec();
auto& ins_content_vec = device_reader_->GetInsContentVec();
for (size_t i = 0; i < ins_id_vec.size(); i++) {
ars[i] += ins_id_vec[i];
ars[i] = ars[i] + "\t" + ins_content_vec[i];
}
for (auto& field : dump_fields_) {
Variable* var = thread_scope_->FindVar(field);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (!CheckValidOutput(tensor, batch_size)) {
continue;
}
for (size_t i = 0; i < batch_size; ++i) {
auto output_dim = tensor->dims()[1];
std::string output_dimstr =
boost::lexical_cast<std::string>(output_dim);
ars[i] = ars[i] + "\t" + field + ":" + output_dimstr;
auto bound = GetTensorBound(tensor, i);
ars[i] += PrintLodTensor(tensor, bound.first, bound.second);
}
}
// #pragma omp parallel for
for (size_t i = 0; i < ars.size(); i++) {
if (ars[i].length() == 0) {
continue;
}
writer_ << ars[i];
}
if (need_dump_param_ && thread_id_ == 0) {
DumpParam(batch_cnt);
}
} }
PrintFetchVars(); PrintFetchVars();
thread_scope_->DropKids(); thread_scope_->DropKids();
++batch_cnt; ++batch_cnt;
} }
if (need_dump_field_) { if (need_dump_field_ || need_dump_param_) {
writer_.Flush(); writer_.Flush();
} }
if (copy_table_config_.need_copy()) { if (copy_table_config_.need_copy()) {
......
...@@ -32,23 +32,9 @@ void HogwildWorker::Initialize(const TrainerDesc &desc) { ...@@ -32,23 +32,9 @@ void HogwildWorker::Initialize(const TrainerDesc &desc) {
use_cvm_ = desc.use_cvm(); use_cvm_ = desc.use_cvm();
thread_barrier_ = desc.thread_barrier(); thread_barrier_ = desc.thread_barrier();
dump_fields_.resize(desc.dump_fields_size());
for (int i = 0; i < desc.dump_fields_size(); ++i) {
dump_fields_[i] = desc.dump_fields(i);
}
for (int i = 0; i < param_.stat_var_names_size(); ++i) { for (int i = 0; i < param_.stat_var_names_size(); ++i) {
stat_var_name_map_[param_.stat_var_names(i)] = 1; stat_var_name_map_[param_.stat_var_names(i)] = 1;
} }
need_dump_param_ = false;
dump_param_.resize(desc.dump_param_size());
for (int i = 0; i < desc.dump_param_size(); ++i) {
dump_param_[i] = desc.dump_param(i);
}
if (desc.dump_param_size() != 0) {
need_dump_param_ = true;
}
} }
void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) { void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) {
...@@ -163,45 +149,10 @@ void HogwildWorker::TrainFilesWithProfiler() { ...@@ -163,45 +149,10 @@ void HogwildWorker::TrainFilesWithProfiler() {
} }
if (need_dump_field_) { if (need_dump_field_) {
size_t batch_size = device_reader_->GetCurBatchSize(); DumpField(*thread_scope_, dump_mode_, dump_interval_);
std::vector<std::string> ars(batch_size); }
for (auto &ar : ars) { if (need_dump_param_ && thread_id_ == 0) {
ar.clear(); DumpParam(*thread_scope_, batch_cnt);
}
auto &ins_id_vec = device_reader_->GetInsIdVec();
auto &ins_content_vec = device_reader_->GetInsContentVec();
for (size_t i = 0; i < ins_id_vec.size(); i++) {
ars[i] += ins_id_vec[i];
ars[i] = ars[i] + "\t" + ins_content_vec[i];
}
for (auto &field : dump_fields_) {
Variable *var = thread_scope_->FindVar(field);
if (var == nullptr) {
continue;
}
LoDTensor *tensor = var->GetMutable<LoDTensor>();
if (!CheckValidOutput(tensor, batch_size)) {
continue;
}
for (size_t i = 0; i < batch_size; ++i) {
auto output_dim = tensor->dims()[1];
std::string output_dimstr =
boost::lexical_cast<std::string>(output_dim);
ars[i] = ars[i] + "\t" + field + ":" + output_dimstr;
auto bound = GetTensorBound(tensor, i);
ars[i] += PrintLodTensor(tensor, bound.first, bound.second);
}
}
// #pragma omp parallel for
for (size_t i = 0; i < ars.size(); i++) {
if (ars[i].length() == 0) {
continue;
}
writer_ << ars[i];
}
if (need_dump_param_ && thread_id_ == 0) {
DumpParam(batch_cnt);
}
} }
total_inst += cur_batch; total_inst += cur_batch;
...@@ -222,7 +173,7 @@ void HogwildWorker::TrainFilesWithProfiler() { ...@@ -222,7 +173,7 @@ void HogwildWorker::TrainFilesWithProfiler() {
timeline.Start(); timeline.Start();
} }
if (need_dump_field_) { if (need_dump_field_ || need_dump_param_) {
writer_.Flush(); writer_.Flush();
} }
...@@ -234,10 +185,6 @@ void HogwildWorker::TrainFilesWithProfiler() { ...@@ -234,10 +185,6 @@ void HogwildWorker::TrainFilesWithProfiler() {
#endif #endif
} }
void HogwildWorker::SetChannelWriter(ChannelObject<std::string> *queue) {
writer_.Reset(queue);
}
void HogwildWorker::TrainFiles() { void HogwildWorker::TrainFiles() {
platform::SetNumThreads(1); platform::SetNumThreads(1);
...@@ -284,25 +231,5 @@ void HogwildWorker::PrintFetchVars() { ...@@ -284,25 +231,5 @@ void HogwildWorker::PrintFetchVars() {
} }
} }
void HogwildWorker::SetNeedDump(bool need_dump_field) {
need_dump_field_ = need_dump_field;
}
void HogwildWorker::DumpParam(const int batch_id) {
std::ostringstream os;
for (auto &param : dump_param_) {
os.str("");
Variable *var = thread_scope_->FindVar(param);
if (var == nullptr) {
continue;
}
LoDTensor *tensor = var->GetMutable<LoDTensor>();
int64_t len = tensor->numel();
os << "(" << batch_id << "," << param << ")"
<< PrintLodTensor(tensor, 0, len);
writer_ << os.str();
}
}
} // end namespace framework } // end namespace framework
} // end namespace paddle } // end namespace paddle
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "io/fs.h"
#include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h" #include "paddle/fluid/framework/trainer.h"
...@@ -28,18 +27,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, ...@@ -28,18 +27,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
thread_num_ = trainer_desc.thread_num(); thread_num_ = trainer_desc.thread_num();
SetDataset(dataset); SetDataset(dataset);
dump_fields_path_ = trainer_desc.dump_fields_path(); ParseDumpConfig(trainer_desc);
dump_converter_ = trainer_desc.dump_converter();
need_dump_field_ = false;
if (trainer_desc.dump_fields_size() != 0 && dump_fields_path_ != "") {
need_dump_field_ = true;
}
if (need_dump_field_) {
auto& file_list = dataset->GetFileList();
if (file_list.size() == 0) {
need_dump_field_ = false;
}
}
mpi_rank_ = trainer_desc.mpi_rank(); mpi_rank_ = trainer_desc.mpi_rank();
mpi_size_ = trainer_desc.mpi_size(); mpi_size_ = trainer_desc.mpi_size();
dump_file_num_ = trainer_desc.dump_file_num(); dump_file_num_ = trainer_desc.dump_file_num();
...@@ -68,41 +56,23 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, ...@@ -68,41 +56,23 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
for (int i = 0; i < thread_num_; ++i) { for (int i = 0; i < thread_num_; ++i) {
workers_[i] = DeviceWorkerFactory::CreateDeviceWorker( workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
trainer_desc.device_worker_name()); trainer_desc.device_worker_name());
workers_[i]->SetNeedDumpField(need_dump_field_);
workers_[i]->SetNeedDumpParam(need_dump_param_);
workers_[i]->SetDumpFieldVector(dump_fields_);
workers_[i]->SetDumpParamVector(dump_param_);
workers_[i]->InitRandomDumpConfig(trainer_desc);
workers_[i]->Initialize(trainer_desc); workers_[i]->Initialize(trainer_desc);
workers_[i]->SetDeviceIndex(i); workers_[i]->SetDeviceIndex(i);
workers_[i]->SetDataFeed(readers[i]); workers_[i]->SetDataFeed(readers[i]);
workers_[i]->SetNeedDump(need_dump_field_);
} }
// set debug here // set debug here
SetDebug(trainer_desc.debug()); SetDebug(trainer_desc.debug());
} }
void MultiTrainer::DumpWork(int tid) { std::string MultiTrainer::GetDumpPath(int tid) {
#ifdef _LINUX return string::format_string("%s/part-%03d-%05d", dump_fields_path_.c_str(),
int err_no = 0; mpi_rank_, tid);
std::string path = string::format_string(
"%s/part-%03d-%05d", dump_fields_path_.c_str(), mpi_rank_, tid);
std::shared_ptr<FILE> fp = fs_open_write(path, &err_no, dump_converter_);
while (1) {
std::string out_str;
if (!queue_->Get(out_str)) {
break;
}
size_t write_count =
fwrite_unlocked(out_str.data(), 1, out_str.length(), fp.get());
if (write_count != out_str.length()) {
VLOG(3) << "dump text failed";
continue;
}
write_count = fwrite_unlocked("\n", 1, 1, fp.get());
if (write_count != 1) {
VLOG(3) << "dump text failed";
continue;
}
}
#endif
} }
void MultiTrainer::InitDumpEnv() { void MultiTrainer::InitDumpEnv() {
...@@ -119,16 +89,8 @@ void MultiTrainer::InitDumpEnv() { ...@@ -119,16 +89,8 @@ void MultiTrainer::InitDumpEnv() {
} }
for (int i = 0; i < dump_thread_num_; i++) { for (int i = 0; i < dump_thread_num_; i++) {
dump_thread_.push_back( dump_thread_.push_back(
std::thread(std::bind(&MultiTrainer::DumpWork, this, i))); std::thread(std::bind(&TrainerBase::DumpWork, this, i)));
}
}
void MultiTrainer::FinalizeDumpEnv() {
queue_->Close();
for (auto& th : dump_thread_) {
th.join();
} }
queue_.reset();
} }
// call only after all resources are set in current trainer // call only after all resources are set in current trainer
......
...@@ -27,6 +27,7 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ...@@ -27,6 +27,7 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
VLOG(3) << "pipeline num: " << pipeline_num_; VLOG(3) << "pipeline num: " << pipeline_num_;
SetDataset(dataset); SetDataset(dataset);
ParseDumpConfig(trainer_desc);
// get filelist from trainer_desc here // get filelist from trainer_desc here
const std::vector<paddle::framework::DataFeed*> readers = const std::vector<paddle::framework::DataFeed*> readers =
dataset->GetReaders(); dataset->GetReaders();
...@@ -103,8 +104,15 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ...@@ -103,8 +104,15 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
this_worker->SetDataFeed(readers[reader_index++]); this_worker->SetDataFeed(readers[reader_index++]);
this_worker->SetReaderPlace(place); this_worker->SetReaderPlace(place);
} }
if (i == section_num_ - 1) {
this_worker->SetNeedDumpField(need_dump_field_);
this_worker->SetNeedDumpParam(need_dump_param_);
this_worker->SetDumpFieldVector(dump_fields_);
this_worker->SetDumpParamVector(dump_param_);
}
this_worker->SetPlace(place); this_worker->SetPlace(place);
this_worker->Initialize(trainer_desc); this_worker->Initialize(trainer_desc);
this_worker->InitRandomDumpConfig(trainer_desc);
} }
} }
} }
...@@ -119,6 +127,33 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ...@@ -119,6 +127,33 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
SetDebug(trainer_desc.debug()); SetDebug(trainer_desc.debug());
} }
void PipelineTrainer::InitOtherEnv(const ProgramDesc& main_program) {
if (need_dump_field_) {
InitDumpEnv();
}
VLOG(3) << "init other env done.";
}
std::string PipelineTrainer::GetDumpPath(int tid) {
return string::format_string("%s/part-%05d", dump_fields_path_.c_str(), tid);
}
void PipelineTrainer::InitDumpEnv() {
queue_ = paddle::framework::MakeChannel<std::string>();
// Only set dump channel on the last section
for (int j = 0; j < pipeline_num_; ++j) {
for (size_t k = 0; k < workers_[section_num_ - 1][j].size(); ++k) {
workers_[section_num_ - 1][j][k]->SetChannelWriter(queue_.get());
}
}
// TODO(hutuxian): should make it as a config
dump_thread_num_ = 1;
for (int i = 0; i < dump_thread_num_; i++) {
dump_thread_.push_back(
std::thread(std::bind(&TrainerBase::DumpWork, this, i)));
}
}
void PipelineTrainer::InitFirstScopeQueue(ScopeQueue* scope_queue, void PipelineTrainer::InitFirstScopeQueue(ScopeQueue* scope_queue,
int pipeline_id, int pipeline_id,
const ProgramDesc& main_program, const ProgramDesc& main_program,
...@@ -271,6 +306,9 @@ void PipelineTrainer::Finalize() { ...@@ -271,6 +306,9 @@ void PipelineTrainer::Finalize() {
for (auto& th : section_threads_) { for (auto& th : section_threads_) {
th.join(); th.join();
} }
if (need_dump_field_) {
FinalizeDumpEnv();
}
for (const auto& var : persistable_vars_) { for (const auto& var : persistable_vars_) {
auto* root_tensor = root_scope_->Var(var)->GetMutable<LoDTensor>(); auto* root_tensor = root_scope_->Var(var)->GetMutable<LoDTensor>();
// TODO(hutuxian): Add a final all-reduce? // TODO(hutuxian): Add a final all-reduce?
......
...@@ -95,11 +95,11 @@ void SyncFunctor::Synchronize() { ...@@ -95,11 +95,11 @@ void SyncFunctor::Synchronize() {
} }
std::atomic<int> SectionWorker::cpu_id_(0); std::atomic<int> SectionWorker::cpu_id_(0);
void SectionWorker::Initialize(const TrainerDesc& trainer_desc) { void SectionWorker::Initialize(const TrainerDesc& desc) {
dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_);
std::shared_ptr<framework::ProgramDesc> program; std::shared_ptr<framework::ProgramDesc> program;
program.reset(new ProgramDesc( program.reset(new ProgramDesc(
trainer_desc.section_param().section_config(section_id_).program_desc())); desc.section_param().section_config(section_id_).program_desc()));
for (auto& op_desc : program->Block(0).AllOps()) { for (auto& op_desc : program->Block(0).AllOps()) {
ops_.push_back(OpRegistry::CreateOp(*op_desc)); ops_.push_back(OpRegistry::CreateOp(*op_desc));
} }
...@@ -373,6 +373,12 @@ void SectionWorker::TrainFilesWithProfiler() { ...@@ -373,6 +373,12 @@ void SectionWorker::TrainFilesWithProfiler() {
metric_msg->add_data(exe_scope); metric_msg->add_data(exe_scope);
} }
#endif #endif
if (need_dump_field_) {
DumpField(*scope, dump_mode_, dump_interval_);
}
if (need_dump_param_ && pipeline_id_ == 0) {
DumpParam(*scope, step_cnt);
}
if (section_id_ != section_num_ - 1 && platform::is_gpu_place(place_)) { if (section_id_ != section_num_ - 1 && platform::is_gpu_place(place_)) {
// FIXME: Temporarily we assume two adjacent sections are in different // FIXME: Temporarily we assume two adjacent sections are in different
...@@ -410,6 +416,9 @@ void SectionWorker::TrainFilesWithProfiler() { ...@@ -410,6 +416,9 @@ void SectionWorker::TrainFilesWithProfiler() {
accum_num += batch_size; accum_num += batch_size;
main_timer.Pause(); main_timer.Pause();
} }
if (need_dump_field_ || need_dump_param_) {
writer_.Flush();
}
outer_timer.Pause(); outer_timer.Pause();
worker_count_mutex_->lock(); worker_count_mutex_->lock();
......
...@@ -13,11 +13,77 @@ See the License for the specific language governing permissions and ...@@ -13,11 +13,77 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/trainer.h" #include "paddle/fluid/framework/trainer.h"
#include "io/fs.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
void TrainerBase::SetScope(Scope* root_scope) { root_scope_ = root_scope; } void TrainerBase::SetScope(Scope* root_scope) { root_scope_ = root_scope; }
void TrainerBase::ParseDumpConfig(const TrainerDesc& desc) {
dump_fields_path_ = desc.dump_fields_path();
if (dump_fields_path_ == "") {
VLOG(2) << "dump_fields_path_ is empty";
return;
}
auto& file_list = dataset_ptr_->GetFileList();
if (file_list.size() == 0) {
VLOG(2) << "file_list is empty";
return;
}
dump_converter_ = desc.dump_converter();
if (desc.dump_fields_size() != 0) {
need_dump_field_ = true;
dump_fields_.resize(desc.dump_fields_size());
for (int i = 0; i < desc.dump_fields_size(); ++i) {
dump_fields_[i] = desc.dump_fields(i);
}
}
if (desc.dump_param_size() != 0) {
need_dump_param_ = true;
dump_param_.resize(desc.dump_param_size());
for (int i = 0; i < desc.dump_param_size(); ++i) {
dump_param_[i] = desc.dump_param(i);
}
}
}
void TrainerBase::DumpWork(int tid) {
#ifdef _LINUX
int err_no = 0;
// GetDumpPath is implemented in each Trainer
std::string path = GetDumpPath(tid);
std::shared_ptr<FILE> fp = fs_open_write(path, &err_no, dump_converter_);
while (1) {
std::string out_str;
if (!queue_->Get(out_str)) {
break;
}
size_t write_count =
fwrite_unlocked(out_str.data(), 1, out_str.length(), fp.get());
if (write_count != out_str.length()) {
VLOG(3) << "dump text failed";
continue;
}
write_count = fwrite_unlocked("\n", 1, 1, fp.get());
if (write_count != 1) {
VLOG(3) << "dump text failed";
continue;
}
}
#endif
}
void TrainerBase::FinalizeDumpEnv() {
queue_->Close();
for (auto& th : dump_thread_) {
th.join();
}
queue_.reset();
}
} // end namespace framework } // end namespace framework
} // end namespace paddle } // end namespace paddle
...@@ -51,11 +51,28 @@ class TrainerBase { ...@@ -51,11 +51,28 @@ class TrainerBase {
virtual void Run() = 0; virtual void Run() = 0;
virtual void Finalize() = 0; virtual void Finalize() = 0;
virtual Scope* GetWorkerScope(int thread_id) = 0; virtual Scope* GetWorkerScope(int thread_id) = 0;
virtual void InitDumpEnv() = 0;
virtual void DumpWork(int tid);
protected: protected:
virtual std::string GetDumpPath(int tid) = 0;
virtual void ParseDumpConfig(const TrainerDesc& trainer_desc);
virtual void FinalizeDumpEnv();
Scope* root_scope_; Scope* root_scope_;
bool debug_; bool debug_;
Dataset* dataset_ptr_; Dataset* dataset_ptr_;
// For dump param or field
bool need_dump_field_ = false;
bool need_dump_param_ = false;
std::string dump_fields_path_;
std::string dump_converter_;
std::vector<std::string> dump_param_;
std::vector<std::string> dump_fields_;
int dump_thread_num_;
std::vector<std::thread> dump_thread_;
std::shared_ptr<paddle::framework::ChannelObject<std::string>> queue_;
}; };
// general trainer for async execution // general trainer for async execution
...@@ -71,10 +88,9 @@ class MultiTrainer : public TrainerBase { ...@@ -71,10 +88,9 @@ class MultiTrainer : public TrainerBase {
virtual void InitOtherEnv(const ProgramDesc& main_program); virtual void InitOtherEnv(const ProgramDesc& main_program);
virtual void Run(); virtual void Run();
virtual void Finalize(); virtual void Finalize();
virtual void FinalizeDumpEnv();
virtual void InitDumpEnv(); virtual void InitDumpEnv();
virtual Scope* GetWorkerScope(int thread_id); virtual Scope* GetWorkerScope(int thread_id);
virtual void DumpWork(int tid); virtual std::string GetDumpPath(int tid);
protected: protected:
int thread_num_; int thread_num_;
...@@ -83,16 +99,9 @@ class MultiTrainer : public TrainerBase { ...@@ -83,16 +99,9 @@ class MultiTrainer : public TrainerBase {
std::vector<std::shared_ptr<DeviceWorker>> workers_; std::vector<std::shared_ptr<DeviceWorker>> workers_;
std::vector<std::string> need_merge_var_names_; std::vector<std::string> need_merge_var_names_;
bool need_dump_field_;
std::string dump_fields_path_;
std::string dump_converter_;
int mpi_rank_; int mpi_rank_;
int mpi_size_; int mpi_size_;
int dump_file_num_; int dump_file_num_;
std::vector<std::thread> dump_thread_;
int dump_thread_num_;
std::shared_ptr<paddle::framework::ChannelObject<std::string>> queue_;
}; };
class DistMultiTrainer : public MultiTrainer { class DistMultiTrainer : public MultiTrainer {
...@@ -107,10 +116,8 @@ class DistMultiTrainer : public MultiTrainer { ...@@ -107,10 +116,8 @@ class DistMultiTrainer : public MultiTrainer {
virtual void Finalize(); virtual void Finalize();
template <typename T> template <typename T>
void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
virtual void FinalizeDumpEnv();
virtual void InitDumpEnv(); virtual void InitDumpEnv();
virtual Scope* GetWorkerScope(int thread_id); virtual Scope* GetWorkerScope(int thread_id);
virtual void DumpWork(int tid);
protected: protected:
std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_; std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
...@@ -124,10 +131,12 @@ class PipelineTrainer : public TrainerBase { ...@@ -124,10 +131,12 @@ class PipelineTrainer : public TrainerBase {
void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override; void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override;
void InitTrainerEnv(const ProgramDesc& main_program, void InitTrainerEnv(const ProgramDesc& main_program,
const platform::Place& place) override; const platform::Place& place) override;
void InitOtherEnv(const ProgramDesc& main_program) override {} void InitOtherEnv(const ProgramDesc& main_program) override;
void Run() override; void Run() override;
void Finalize() override; void Finalize() override;
virtual Scope* GetWorkerScope(int thread_id); virtual Scope* GetWorkerScope(int thread_id);
void InitDumpEnv() override;
virtual std::string GetDumpPath(int tid);
protected: protected:
int section_num_; int section_num_;
......
...@@ -49,6 +49,9 @@ message TrainerDesc { ...@@ -49,6 +49,9 @@ message TrainerDesc {
optional bool no_cvm = 21 [ default = false ]; optional bool no_cvm = 21 [ default = false ];
optional bool thread_barrier = 22; optional bool thread_barrier = 22;
repeated string loss_names = 23; repeated string loss_names = 23;
optional bool enable_random_dump = 24 [ default = false ];
optional bool random_with_lineid = 25 [ default = false ];
optional int32 dump_interval = 26 [ default = 10000 ];
// device worker parameters // device worker parameters
optional HogwildWorkerParameter hogwild_param = 101; optional HogwildWorkerParameter hogwild_param = 101;
......
...@@ -16,6 +16,7 @@ import paddle.fluid as fluid ...@@ -16,6 +16,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
import numpy as np import numpy as np
import os import os
import shutil
import paddle.fluid.core as core import paddle.fluid.core as core
import unittest import unittest
from paddle.fluid.layers.nn import _pull_box_sparse from paddle.fluid.layers.nn import _pull_box_sparse
...@@ -90,87 +91,105 @@ class TestBoxPSPreload(unittest.TestCase): ...@@ -90,87 +91,105 @@ class TestBoxPSPreload(unittest.TestCase):
""" TestCases for BoxPS Preload """ """ TestCases for BoxPS Preload """
def test_boxps_cpu(self): def test_boxps_cpu(self):
self.run_boxps_preload(True) self.run_boxps_preload(True, True)
self.run_boxps_preload(True, False)
def test_boxps_gpu(self): def test_boxps_gpu(self):
self.run_boxps_preload(False) self.run_boxps_preload(False, True)
self.run_boxps_preload(False, False)
def run_boxps_preload(self, is_cpu=True):
x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0) def run_boxps_preload(self, is_cpu=True, random_with_lineid=False):
y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0) program = fluid.Program()
emb_x, emb_y = _pull_box_sparse([x, y], size=2) with fluid.program_guard(program):
emb_xp = _pull_box_sparse(x, size=2) x = fluid.layers.data(
concat = layers.concat([emb_x, emb_y], axis=1) name='x', shape=[1], dtype='int64', lod_level=0)
fc = layers.fc(input=concat, y = fluid.layers.data(
name="fc", name='y', shape=[1], dtype='int64', lod_level=0)
size=1, emb_x, emb_y = _pull_box_sparse([x, y], size=2)
num_flatten_dims=1, emb_xp = _pull_box_sparse(x, size=2)
bias_attr=False) concat = layers.concat([emb_x, emb_y], axis=1)
loss = layers.reduce_mean(fc) fc = layers.fc(input=concat,
layers.Print(loss) name="fc",
place = fluid.CPUPlace() if is_cpu or not core.is_compiled_with_cuda( size=1,
) else fluid.CUDAPlace(0) num_flatten_dims=1,
exe = fluid.Executor(place) bias_attr=False)
batch_size = 2 loss = layers.reduce_mean(fc)
layers.Print(loss)
def binary_print(slot, fout): place = fluid.CPUPlace(
fout.write(str(len(slot)) + " ") ) if is_cpu or not core.is_compiled_with_cuda(
for e in slot: ) else fluid.CUDAPlace(0)
fout.write(str(e) + " ") exe = fluid.Executor(place)
batch_size = 100
batch1 = np.ones(
(batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1) def binary_print(slot, fout):
filelist = [] fout.write(str(len(slot)) + " ")
place_str = "cpu" if is_cpu else "gpu" for e in slot:
for i in range(2): fout.write(str(e) + " ")
filelist.append("test_hdfs_" + place_str + "_" + str(i))
for f in filelist: batch1 = np.ones(
with open(f, "w") as fout: (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
for ins in batch1: filelist = []
for slot in ins: place_str = "cpu" if is_cpu else "gpu"
binary_print(slot, fout) for i in range(2):
fout.write("\n") filelist.append("test_hdfs_" + place_str + "_" + str(i))
for f in filelist:
def create_dataset(): with open(f, "w") as fout:
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") for ins in batch1:
dataset.set_date("20190930") for slot in ins:
dataset.set_use_var([x, y]) binary_print(slot, fout)
dataset.set_batch_size(2) fout.write("\n")
dataset.set_thread(1)
dataset.set_filelist(filelist) def create_dataset():
return dataset dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
dataset.set_date("20190930")
datasets = [] dataset.set_use_var([x, y])
datasets.append(create_dataset()) dataset.set_batch_size(2)
datasets.append(create_dataset()) dataset.set_thread(1)
optimizer = fluid.optimizer.SGD(learning_rate=0.5) dataset.set_filelist(filelist)
optimizer = fluid.optimizer.PipelineOptimizer( return dataset
optimizer,
cut_list=[], datasets = []
place_list=[place], datasets.append(create_dataset())
concurrency_list=[1], datasets.append(create_dataset())
queue_size=1, optimizer = fluid.optimizer.SGD(learning_rate=0.5)
sync_steps=-1) optimizer = fluid.optimizer.PipelineOptimizer(
optimizer.minimize(loss) optimizer,
exe.run(fluid.default_startup_program()) cut_list=[],
datasets[0].load_into_memory() place_list=[place],
datasets[0].begin_pass() concurrency_list=[1],
datasets[1].preload_into_memory() queue_size=1,
exe.train_from_dataset( sync_steps=-1)
program=fluid.default_main_program(), optimizer.minimize(loss)
dataset=datasets[0],
print_period=1) program._pipeline_opt[
datasets[0].end_pass(True) "dump_fields"] = ["fc.tmp_0", "fc.tmp_0@GRAD", "hehe"]
datasets[1].wait_preload_done() program._pipeline_opt["dump_fields_path"] = "./dump_log/"
datasets[1].begin_pass() program._pipeline_opt["dump_param"] = ["fc.w_0"]
exe.train_from_dataset( program._pipeline_opt["enable_random_dump"] = True
program=fluid.default_main_program(), program._pipeline_opt["dump_interval"] = 10
dataset=datasets[1], program._pipeline_opt["random_with_lineid"] = random_with_lineid
print_period=1,
debug=True) exe.run(fluid.default_startup_program())
datasets[1].end_pass(False) datasets[0].load_into_memory()
for f in filelist: datasets[0].begin_pass()
os.remove(f) datasets[1].preload_into_memory()
exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=datasets[0],
print_period=1)
datasets[0].end_pass(True)
datasets[1].wait_preload_done()
datasets[1].begin_pass()
exe.train_from_dataset(
program=fluid.default_main_program(),
dataset=datasets[1],
print_period=1,
debug=True)
datasets[1].end_pass(False)
for f in filelist:
os.remove(f)
if os.path.isdir("dump_log"):
shutil.rmtree("dump_log")
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -147,7 +147,7 @@ class TestPipeline(unittest.TestCase): ...@@ -147,7 +147,7 @@ class TestPipeline(unittest.TestCase):
for f in filelist: for f in filelist:
os.remove(f) os.remove(f)
def test_pipeline_single_section(self): def single_section(self, random_dump):
program = fluid.Program() program = fluid.Program()
with fluid.program_guard(program): with fluid.program_guard(program):
x = fluid.layers.data( x = fluid.layers.data(
...@@ -179,11 +179,20 @@ class TestPipeline(unittest.TestCase): ...@@ -179,11 +179,20 @@ class TestPipeline(unittest.TestCase):
optimizer = fluid.optimizer.PipelineOptimizer( optimizer = fluid.optimizer.PipelineOptimizer(
optimizer, optimizer,
cut_list=[], cut_list=[],
#place_list=[fluid.CPUPlace()],
place_list=[fluid.CUDAPlace(0)], place_list=[fluid.CUDAPlace(0)],
concurrency_list=[1], concurrency_list=[1],
queue_size=1, queue_size=1,
sync_steps=-1) sync_steps=-1)
optimizer.minimize(loss) optimizer.minimize(loss)
program._pipeline_opt["dump_fields"] = ["fc.tmp_0", "fc.tmp_0@GRAD"]
program._pipeline_opt["dump_fields_path"] = "./dump_log/"
program._pipeline_opt["dump_param"] = ["embx"]
program._pipeline_opt["enable_random_dump"] = random_dump
program._pipeline_opt["dump_interval"] = 10
program._pipeline_opt["random_with_lineid"] = False
#print(program._pipeline_opt)
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
...@@ -225,13 +234,19 @@ class TestPipeline(unittest.TestCase): ...@@ -225,13 +234,19 @@ class TestPipeline(unittest.TestCase):
fluid.default_main_program(), fluid.default_main_program(),
dataset, dataset,
thread=1, thread=1,
debug=False, debug=True,
fetch_list=[], fetch_list=[],
fetch_info=[], fetch_info=[],
print_period=1) print_period=1)
for f in filelist: for f in filelist:
os.remove(f) os.remove(f)
if os.path.isdir("dump_log"):
shutil.rmtree("dump_log")
def test_pipeline(self):
self.single_section(True)
self.single_section(False)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -107,6 +107,15 @@ class TrainerDesc(object): ...@@ -107,6 +107,15 @@ class TrainerDesc(object):
def _set_dump_converter(self, converter): def _set_dump_converter(self, converter):
self.proto_desc.dump_converter = converter self.proto_desc.dump_converter = converter
def _set_enable_random_dump(self, enable_random_dump):
self.proto_desc.enable_random_dump = enable_random_dump
def _set_dump_interval(self, dump_interval):
self.proto_desc.dump_interval = dump_interval
def _set_random_with_lineid(self, random_with_lineid):
self.proto_desc.random_with_lineid = random_with_lineid
def _set_dump_param(self, dump_param): def _set_dump_param(self, dump_param):
for param in dump_param: for param in dump_param:
self.proto_desc.dump_param.append(param) self.proto_desc.dump_param.append(param)
......
...@@ -72,6 +72,14 @@ class TrainerFactory(object): ...@@ -72,6 +72,14 @@ class TrainerFactory(object):
trainer._set_dump_converter(opt_info["dump_converter"]) trainer._set_dump_converter(opt_info["dump_converter"])
if opt_info.get("dump_param") is not None: if opt_info.get("dump_param") is not None:
trainer._set_dump_param(opt_info["dump_param"]) trainer._set_dump_param(opt_info["dump_param"])
if opt_info.get("enable_random_dump") is not None:
trainer._set_enable_random_dump(opt_info[
"enable_random_dump"])
if opt_info.get("dump_interval") is not None:
trainer._set_dump_interval(opt_info["dump_interval"])
if opt_info.get("random_with_lineid") is not None:
trainer._set_random_with_lineid(opt_info[
"random_with_lineid"])
if "fleet_desc" in opt_info: if "fleet_desc" in opt_info:
device_worker._set_fleet_desc(opt_info["fleet_desc"]) device_worker._set_fleet_desc(opt_info["fleet_desc"])
......
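For context, the factory changes above only forward keys that the user placed in opt_info; a hedged example of the corresponding entries (key names taken from this diff, variable names and values purely illustrative):

    opt_info = {
        "dump_fields": ["fc_0.tmp_0", "fc_0.tmp_0@GRAD"],  # variables dumped per instance
        "dump_fields_path": "./dump_log/",                  # output directory for dump files
        "dump_param": ["fc_0.w_0"],                         # parameters dumped once per batch
        "enable_random_dump": True,                         # turn on sampled dumping
        "dump_interval": 10,                                # keep roughly 1 of every 10 instances
        "random_with_lineid": False,                        # True -> hash the line id instead of RNG
    }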