diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 516d189ad87d65e59b4512494dbf972b51748eb1..58df49d324bb23c3f8425ca556de4a9ed2b0a863 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -194,8 +194,11 @@ class DownpourWorker : public HogwildWorker {
   void PushGradients();
   void CollectLabelInfo(size_t table_id);
   void AdjustInsWeight();
+  void DumpParam();

  private:
+  bool need_dump_param_;
+  std::vector<std::string> dump_param_;
   bool need_to_push_dense_;
   bool need_dump_field_;
   bool dump_slot_;
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index c2bfa1fa4a3b02481de9fc5ad4bc4be369e00ae6..248855b795f340ce21939335eec6ffe41645b763 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -82,6 +82,14 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
     dump_fields_[i] = desc.dump_fields(i);
   }
   adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
+  need_dump_param_ = false;
+  dump_param_.resize(desc.dump_param_size());
+  for (int i = 0; i < desc.dump_param_size(); ++i) {
+    dump_param_[i] = desc.dump_param(i);
+  }
+  if (desc.dump_param_size() != 0) {
+    need_dump_param_ = true;
+  }
   for (int i = 0; i < desc.check_nan_var_names_size(); ++i) {
     check_nan_var_names_.push_back(desc.check_nan_var_names(i));
   }
@@ -163,6 +171,22 @@ bool CheckValidOutput(LoDTensor* tensor, int batch_size) {
   return true;
 }

+void DownpourWorker::DumpParam() {
+  std::string os;
+  for (auto& param : dump_param_) {
+    os.clear();
+    os = param;
+    Variable* var = thread_scope_->FindVar(param);
+    if (var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int64_t len = tensor->numel();
+    os += PrintLodTensor(tensor, 0, len);
+    writer_ << os;
+  }
+}
+
 void DownpourWorker::CollectLabelInfo(size_t table_idx) {
   uint64_t table_id = static_cast<uint64_t>(
       param_.program_config(0).pull_sparse_table_id(table_idx));
@@ -814,6 +838,9 @@ void DownpourWorker::TrainFiles() {
         }
         writer_ << ars[i];
       }
+      if (need_dump_param_ && thread_id_ == 0) {
+        DumpParam();
+      }
     }

     PrintFetchVars();
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index 10018439edfd4bea860850c6c8b672a39cf420e5..2193d7b71ddfc3b800348e4f627d113e046a2ee6 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -105,7 +105,6 @@ class DistMultiTrainer : public MultiTrainer {
   bool need_dump_field_;
   std::string dump_fields_path_;
   std::string dump_converter_;
-  std::vector<std::string> dump_fields_;
   int mpi_rank_;
   int mpi_size_;
   int dump_file_num_;
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
index 11261e9e1774825761116d9fcc951b2b56d0cdc4..59f2cd9d327f6b6af8504a0e7e4af8ce135c4233 100644
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -39,6 +39,7 @@ message TrainerDesc {
   optional string dump_fields_path = 12;
   repeated string dump_fields = 13;
   optional string dump_converter = 14;
+  repeated string dump_param = 15;

   optional int32 mpi_size = 16 [ default = -1 ];
   optional int32 dump_file_num = 17 [ default = 16 ];
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
index 107b7d2c9b7aad042db0e9198bea5985407df51f..0169de22ed1057a5f78f2375f9d25c02fa773185 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
@@ -358,6 +358,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
         opt_info["dump_fields"] = strategy.get("dump_fields", [])
         opt_info["dump_file_num"] = strategy.get("dump_file_num", 16)
         opt_info["dump_fields_path"] = strategy.get("dump_fields_path", "")
+        opt_info["dump_param"] = strategy.get("dump_param", [])
         if server._server.downpour_server_param.downpour_table_param[
                 0].accessor.accessor_class == "DownpourCtrAccessor":
             opt_info["dump_slot"] = True
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index 4ee98d8b85ad8d58e354eb1d25a0c70e7d66be1d..a4cf033062e5ca73d217b690595b9bd2b31c86b4 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -100,6 +100,10 @@ class TrainerDesc(object):
     def _set_dump_converter(self, converter):
         self.proto_desc.dump_converter = converter

+    def _set_dump_param(self, dump_param):
+        for param in dump_param:
+            self.proto_desc.dump_param.append(param)
+
     def _set_check_nan_var_names(self, check_nan_var_names):
         for var in check_nan_var_names:
             self.proto_desc.check_nan_var_names.append(var)
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
index f6f794a2428a6fc13c0a1a6a1dbb640c77390eb3..1469af3d182bd3182e8c9e811bfa11a34f29b7c1 100644
--- a/python/paddle/fluid/trainer_factory.py
+++ b/python/paddle/fluid/trainer_factory.py
@@ -53,6 +53,7 @@ class TrainerFactory(object):
                     trainer._set_dump_file_num(opt_info["dump_file_num"])
                     trainer._set_dump_converter(opt_info["dump_converter"])
                     trainer._set_adjust_ins_weight(opt_info["adjust_ins_weight"])
+                    trainer._set_dump_param(opt_info["dump_param"])
                     trainer._set_check_nan_var_names(opt_info[
                         "check_nan_var_names"])
                 trainer._set_device_worker(device_worker)
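
This patch threads a new `dump_param` list from the fleet strategy dict, through the `TrainerDesc` proto, into `DownpourWorker`, which writes each named parameter tensor to the dump channel (on thread 0 only) once per batch. As a rough illustration of how a user would enable it, here is a minimal sketch assuming the pslib fleet API touched by this patch; the optimizer setup, the parameter names (`fc_0.w_0`, `fc_0.b_0`), and the dump path are placeholders, not part of the change:

```python
# Sketch only: enabling dump_param through the pslib fleet strategy dict.
# Assumes a program whose parameters include fc_0.w_0 / fc_0.b_0 and a
# `loss` variable built elsewhere.
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet

adam = fluid.optimizer.Adam(learning_rate=0.001)
strategy = {
    "dump_fields_path": "hdfs:/path/to/dump",  # where dump files go
    "dump_fields": ["click", "ctr.tmp_0"],     # per-instance fields (existing)
    "dump_param": ["fc_0.w_0", "fc_0.b_0"],    # parameter tensors (new here)
}
optimizer = fleet.distributed_optimizer(adam, strategy)
optimizer.minimize(loss)
```

`DistributedAdam` copies `strategy["dump_param"]` into `opt_info`, `TrainerFactory` forwards it via `TrainerDesc._set_dump_param`, and `DownpourWorker::Initialize` reads it back from the proto. An empty list leaves `need_dump_param_` false, so the feature is off by default.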