From 59bcdc8a1907b7e4eee3468f9ac0130594918d0f Mon Sep 17 00:00:00 2001
From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com>
Date: Thu, 31 Oct 2019 10:59:34 +0800
Subject: [PATCH] support dump param of model into afs (#20302)

* support dump param to afs
test=develop

* code style
test=develop

* code style
test=develop

* dump param
test=develop

* dump param
test=develop

* dump param
test=develop

* dump param
test=develop
---
 paddle/fluid/framework/device_worker.h     |  3 +++
 paddle/fluid/framework/downpour_worker.cc  | 27 +++++++++++++++++++
 paddle/fluid/framework/trainer.h           |  1 -
 paddle/fluid/framework/trainer_desc.proto  |  1 +
 .../pslib/optimizer_factory.py             |  1 +
 python/paddle/fluid/trainer_desc.py        |  4 +++
 python/paddle/fluid/trainer_factory.py     |  1 +
 7 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 516d189ad8..58df49d324 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -194,8 +194,11 @@ class DownpourWorker : public HogwildWorker {
   void PushGradients();
   void CollectLabelInfo(size_t table_id);
   void AdjustInsWeight();
+  void DumpParam();
 
  private:
+  bool need_dump_param_;
+  std::vector<std::string> dump_param_;
   bool need_to_push_dense_;
   bool need_dump_field_;
   bool dump_slot_;
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index c2bfa1fa4a..248855b795 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -82,6 +82,14 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
     dump_fields_[i] = desc.dump_fields(i);
   }
   adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
+  need_dump_param_ = false;
+  dump_param_.resize(desc.dump_param_size());
+  for (int i = 0; i < desc.dump_param_size(); ++i) {
+    dump_param_[i] = desc.dump_param(i);
+  }
+  if (desc.dump_param_size() != 0) {
+    need_dump_param_ = true;
+  }
   for (int i = 0; i < desc.check_nan_var_names_size(); ++i) {
     check_nan_var_names_.push_back(desc.check_nan_var_names(i));
   }
@@ -163,6 +171,22 @@ bool CheckValidOutput(LoDTensor* tensor, int batch_size) {
   return true;
 }
 
+void DownpourWorker::DumpParam() {
+  std::string os;
+  for (auto& param : dump_param_) {
+    os.clear();
+    os = param;
+    Variable* var = thread_scope_->FindVar(param);
+    if (var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int64_t len = tensor->numel();
+    os += PrintLodTensor(tensor, 0, len);
+    writer_ << os;
+  }
+}
+
 void DownpourWorker::CollectLabelInfo(size_t table_idx) {
   uint64_t table_id = static_cast<uint64_t>(
       param_.program_config(0).pull_sparse_table_id(table_idx));
@@ -814,6 +838,9 @@ void DownpourWorker::TrainFiles() {
         }
         writer_ << ars[i];
       }
+      if (need_dump_param_ && thread_id_ == 0) {
+        DumpParam();
+      }
     }
 
     PrintFetchVars();
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index 10018439ed..2193d7b71d 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -105,7 +105,6 @@ class DistMultiTrainer : public MultiTrainer {
   bool need_dump_field_;
   std::string dump_fields_path_;
   std::string dump_converter_;
-  std::vector<std::string> dump_fields_;
   int mpi_rank_;
   int mpi_size_;
   int dump_file_num_;
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
index 11261e9e17..59f2cd9d32 100644
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -39,6 +39,7 @@ message TrainerDesc {
   optional string dump_fields_path = 12;
   repeated string dump_fields = 13;
   optional string dump_converter = 14;
+  repeated string dump_param = 15;
 
   optional int32 mpi_size = 16 [ default = -1 ];
   optional int32 dump_file_num = 17 [ default = 16 ];
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
index 107b7d2c9b..0169de22ed 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
@@ -358,6 +358,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
         opt_info["dump_fields"] = strategy.get("dump_fields", [])
         opt_info["dump_file_num"] = strategy.get("dump_file_num", 16)
         opt_info["dump_fields_path"] = strategy.get("dump_fields_path", "")
+        opt_info["dump_param"] = strategy.get("dump_param", [])
         if server._server.downpour_server_param.downpour_table_param[
                 0].accessor.accessor_class == "DownpourCtrAccessor":
             opt_info["dump_slot"] = True
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index 4ee98d8b85..a4cf033062 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -100,6 +100,10 @@ class TrainerDesc(object):
     def _set_dump_converter(self, converter):
         self.proto_desc.dump_converter = converter
 
+    def _set_dump_param(self, dump_param):
+        for param in dump_param:
+            self.proto_desc.dump_param.append(param)
+
     def _set_check_nan_var_names(self, check_nan_var_names):
         for var in check_nan_var_names:
             self.proto_desc.check_nan_var_names.append(var)
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
index f6f794a242..1469af3d18 100644
--- a/python/paddle/fluid/trainer_factory.py
+++ b/python/paddle/fluid/trainer_factory.py
@@ -53,6 +53,7 @@ class TrainerFactory(object):
                 trainer._set_dump_file_num(opt_info["dump_file_num"])
                 trainer._set_dump_converter(opt_info["dump_converter"])
                 trainer._set_adjust_ins_weight(opt_info["adjust_ins_weight"])
+                trainer._set_dump_param(opt_info["dump_param"])
                 trainer._set_check_nan_var_names(opt_info[
                     "check_nan_var_names"])
             trainer._set_device_worker(device_worker)
--
GitLab
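
Usage note (not part of the patch): the strategy.get(...) calls added to DistributedAdam above imply how a user turns this feature on. The following is a minimal sketch assuming the pslib fleet API of this Paddle release; the optimizer setup, the parameter names ("fc_0.w_0", "fc_0.b_0"), the AFS output path, and the `loss` variable are illustrative placeholders, not values taken from the patch.

    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet

    # Strategy keys match the strategy.get(...) lookups in DistributedAdam.
    strategy = {
        "dump_fields_path": "afs:/user/some_team/model_dump",  # placeholder path
        "dump_fields": ["click"],                 # per-instance fields (existing feature)
        "dump_param": ["fc_0.w_0", "fc_0.b_0"],   # parameters to dump (this patch)
    }
    adam = fluid.optimizer.Adam(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(adam, strategy)
    optimizer.minimize(loss)  # `loss` is a placeholder for the model's loss variable

Note the placement of the new call in the TrainFiles() hunk: DumpParam() runs only on thread 0 of each trainer, so one copy of the parameters is written per node, and it reuses the same channel writer as the existing field dump, so dump_fields_path must also be configured for the parameter dump to reach AFS.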