From 5c3656bb5450c618b79a0b3cebb49db9e4a6c7a2 Mon Sep 17 00:00:00 2001
From: xujiaqi01 <173596896@qq.com>
Date: Fri, 1 Nov 2019 16:21:51 +0800
Subject: [PATCH] add check nan / inf in downpour worker (#20694) (#20925)

* add check nan / inf in downpour worker during training

* test=develop
---
 paddle/fluid/framework/device_worker.h    |  2 ++
 paddle/fluid/framework/downpour_worker.cc | 35 +++++++++++++++++++
 paddle/fluid/framework/trainer_desc.proto |  1 +
 .../pslib/optimizer_factory.py            |  2 ++
 python/paddle/fluid/trainer_desc.py       |  4 +++
 python/paddle/fluid/trainer_factory.py    |  2 ++
 6 files changed, 46 insertions(+)

diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 6164953083b..5b2d96e941b 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -230,6 +230,8 @@ class DownpourWorker : public HogwildWorker {
   // adjust ins weight
   AdjustInsWeightConfig adjust_ins_weight_config_;
   std::vector<float> nid_show_;
+  // check nan and inf during training
+  std::vector<std::string> check_nan_var_names_;
 };

 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index e7dbf3b1131..784f6abb449 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -81,6 +81,9 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
     dump_fields_[i] = desc.dump_fields(i);
   }
   adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
+  for (int i = 0; i < desc.check_nan_var_names_size(); ++i) {
+    check_nan_var_names_.push_back(desc.check_nan_var_names(i));
+  }
 }

 void DownpourWorker::SetChannelWriter(ChannelObject<std::string>* queue) {
@@ -468,6 +471,22 @@ void DownpourWorker::TrainFilesWithProfiler() {
     }
   }

+  // check inf and nan
+  for (std::string& var_name : check_nan_var_names_) {
+    Variable* var = thread_scope_->FindVar(var_name);
+    if (var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    if (tensor == nullptr) {
+      continue;
+    }
+    PADDLE_ENFORCE_EQ(framework::TensorContainsInf(*tensor), false,
+                      "Tensor %s contains Inf", var_name);
+    PADDLE_ENFORCE_EQ(framework::TensorContainsNAN(*tensor), false,
+                      "Tensor %s contains NAN", var_name);
+  }
+
   if (need_to_push_sparse_) {
     for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
          ++i) {
@@ -655,6 +674,22 @@ void DownpourWorker::TrainFiles() {
     }
   }

+  // check inf and nan
+  for (std::string& var_name : check_nan_var_names_) {
+    Variable* var = thread_scope_->FindVar(var_name);
+    if (var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    if (tensor == nullptr) {
+      continue;
+    }
+    PADDLE_ENFORCE_EQ(framework::TensorContainsInf(*tensor), false,
+                      "Tensor %s contains Inf", var_name);
+    PADDLE_ENFORCE_EQ(framework::TensorContainsNAN(*tensor), false,
+                      "Tensor %s contains NAN", var_name);
+  }
+
   if (need_to_push_sparse_) {
     // push gradients here
     for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
index 11bb8073ad2..11261e9e177 100644
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -42,6 +42,7 @@ message TrainerDesc {
   optional int32 mpi_size = 16 [ default = -1 ];
   optional int32 dump_file_num = 17 [ default = 16 ];
+  repeated string check_nan_var_names = 18;

   // device worker parameters
   optional HogwildWorkerParameter hogwild_param = 101;
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
index 0a15dc03358..b4798690579 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
@@ -248,6 +248,8 @@ class DistributedAdam(DistributedOptimizerImplBase):
         opt_info["use_cvm"] = strategy.get("use_cvm", False)
         opt_info["stat_var_names"] = strategy.get("stat_var_names", [])
         opt_info["scale_datanorm"] = strategy.get("scale_datanorm", -1)
+        opt_info["check_nan_var_names"] = strategy.get("check_nan_var_names",
+                                                       [])
         opt_info["dump_slot"] = False
         opt_info["dump_converter"] = ""
         opt_info["dump_fields"] = strategy.get("dump_fields", [])
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index 9df3cb327f8..4ee98d8b85a 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -100,6 +100,10 @@ class TrainerDesc(object):
     def _set_dump_converter(self, converter):
         self.proto_desc.dump_converter = converter

+    def _set_check_nan_var_names(self, check_nan_var_names):
+        for var in check_nan_var_names:
+            self.proto_desc.check_nan_var_names.append(var)
+
     def _set_adjust_ins_weight(self, config_dict):
         self.proto_desc.adjust_ins_weight_config.need_adjust = \
             config_dict.get("need_adjust", False)
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
index daea2deba69..f6f794a2428 100644
--- a/python/paddle/fluid/trainer_factory.py
+++ b/python/paddle/fluid/trainer_factory.py
@@ -53,6 +53,8 @@ class TrainerFactory(object):
                 trainer._set_dump_file_num(opt_info["dump_file_num"])
                 trainer._set_dump_converter(opt_info["dump_converter"])
                 trainer._set_adjust_ins_weight(opt_info["adjust_ins_weight"])
+                trainer._set_check_nan_var_names(opt_info[
+                    "check_nan_var_names"])
                 trainer._set_device_worker(device_worker)
         return trainer
--
GitLab
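For readers tracing the change end to end: the list of variable names travels from the
Python-side strategy dict into opt_info (optimizer_factory.py), from there into the
TrainerDesc proto (trainer_desc.py / trainer_factory.py), and finally into DownpourWorker,
which validates each named tensor after the ops of every batch run. Below is a minimal
usage sketch, assuming the standard pslib fleet setup; the variable name "fc_0.tmp_1" is
a hypothetical placeholder for a variable defined in the user's own program.

    # a minimal sketch, assuming the usual pslib fleet initialization has
    # already been done; "fc_0.tmp_1" is a hypothetical variable name
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet

    adam = fluid.optimizer.Adam(learning_rate=0.01)
    strategy = {
        # names of variables whose tensors DownpourWorker checks for
        # Inf/NaN after each batch; missing names are skipped silently
        "check_nan_var_names": ["fc_0.tmp_1"],
    }
    optimizer = fleet.distributed_optimizer(adam, strategy)
    # optimizer.minimize(cost) then copies the list into
    # opt_info["check_nan_var_names"], which TrainerFactory forwards to
    # TrainerDesc._set_check_nan_var_names and on to the C++ worker.

Note that the C++ check deliberately continues past variables that are absent from the
thread scope (FindVar returning nullptr), so listing a name that only exists in some
programs does not abort training; only an actual Inf/NaN value trips PADDLE_ENFORCE_EQ.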