diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 6164953083bf6ee1e1612829f6f89ac23dc02392..5b2d96e941ba2fbdbd63ccf13032624b0b5048d1 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -230,6 +230,8 @@ class DownpourWorker : public HogwildWorker {
   // adjust ins weight
   AdjustInsWeightConfig adjust_ins_weight_config_;
   std::vector<float> nid_show_;
+  // check nan and inf during training
+  std::vector<std::string> check_nan_var_names_;
 };
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index e7dbf3b1131740748a5258455fdc76e2a50f1fc9..784f6abb4490f38213f72756eaf39b2cf70b4b35 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -81,6 +81,9 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
     dump_fields_[i] = desc.dump_fields(i);
   }
   adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
+  for (int i = 0; i < desc.check_nan_var_names_size(); ++i) {
+    check_nan_var_names_.push_back(desc.check_nan_var_names(i));
+  }
 }
 
 void DownpourWorker::SetChannelWriter(ChannelObject<std::string>* queue) {
@@ -468,6 +471,22 @@ void DownpourWorker::TrainFilesWithProfiler() {
     }
   }
 
+  // check inf and nan
+  for (std::string& var_name : check_nan_var_names_) {
+    Variable* var = thread_scope_->FindVar(var_name);
+    if (var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    if (tensor == nullptr) {
+      continue;
+    }
+    PADDLE_ENFORCE_EQ(framework::TensorContainsInf(*tensor), false,
+                      "Tensor %s contains Inf", var_name);
+    PADDLE_ENFORCE_EQ(framework::TensorContainsNAN(*tensor), false,
+                      "Tensor %s contains NAN", var_name);
+  }
+
   if (need_to_push_sparse_) {
     for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
          ++i) {
@@ -655,6 +674,22 @@ void DownpourWorker::TrainFiles() {
     }
   }
 
+  // check inf and nan
+  for (std::string& var_name : check_nan_var_names_) {
+    Variable* var = thread_scope_->FindVar(var_name);
+    if (var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    if (tensor == nullptr) {
+      continue;
+    }
+    PADDLE_ENFORCE_EQ(framework::TensorContainsInf(*tensor), false,
+                      "Tensor %s contains Inf", var_name);
+    PADDLE_ENFORCE_EQ(framework::TensorContainsNAN(*tensor), false,
+                      "Tensor %s contains NAN", var_name);
+  }
+
   if (need_to_push_sparse_) {
     // push gradients here
     for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
index 11bb8073ad2bdc09ebc563080297c0700bb7c625..11261e9e1774825761116d9fcc951b2b56d0cdc4 100644
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -42,6 +42,7 @@ message TrainerDesc {
   optional int32 mpi_size = 16 [ default = -1 ];
   optional int32 dump_file_num = 17 [ default = 16 ];
+  repeated string check_nan_var_names = 18;
 
   // device worker parameters
   optional HogwildWorkerParameter hogwild_param = 101;
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
index 0a15dc03358d5cb8f863b7ecdf6a9a07ddf461d1..b47986905791fff741f1e0e4ceaaf4329754e524 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
@@ -248,6 +248,8 @@ class DistributedAdam(DistributedOptimizerImplBase):
         opt_info["use_cvm"] = strategy.get("use_cvm", False)
         opt_info["stat_var_names"] = strategy.get("stat_var_names", [])
         opt_info["scale_datanorm"] = strategy.get("scale_datanorm", -1)
+        opt_info["check_nan_var_names"] = strategy.get("check_nan_var_names",
+                                                       [])
         opt_info["dump_slot"] = False
         opt_info["dump_converter"] = ""
         opt_info["dump_fields"] = strategy.get("dump_fields", [])
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index 9df3cb327f81472df4baa598af17b46bd86cae77..4ee98d8b85ad8d58e354eb1d25a0c70e7d66be1d 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -100,6 +100,10 @@ class TrainerDesc(object):
     def _set_dump_converter(self, converter):
         self.proto_desc.dump_converter = converter
 
+    def _set_check_nan_var_names(self, check_nan_var_names):
+        for var in check_nan_var_names:
+            self.proto_desc.check_nan_var_names.append(var)
+
     def _set_adjust_ins_weight(self, config_dict):
         self.proto_desc.adjust_ins_weight_config.need_adjust = \
             config_dict.get("need_adjust", False)
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
index daea2deba698a51fb316adfa5ff177a58b708424..f6f794a2428a6fc13c0a1a6a1dbb640c77390eb3 100644
--- a/python/paddle/fluid/trainer_factory.py
+++ b/python/paddle/fluid/trainer_factory.py
@@ -53,6 +53,8 @@ class TrainerFactory(object):
                 trainer._set_dump_file_num(opt_info["dump_file_num"])
                 trainer._set_dump_converter(opt_info["dump_converter"])
                 trainer._set_adjust_ins_weight(opt_info["adjust_ins_weight"])
+                trainer._set_check_nan_var_names(opt_info[
+                    "check_nan_var_names"])
             trainer._set_device_worker(device_worker)
         return trainer
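
Usage sketch (not part of the patch): the new "check_nan_var_names" strategy
key flows from the fleet strategy dict through TrainerDesc into
DownpourWorker, which then asserts after every batch that none of the named
tensors contains Inf or NaN. The following is a minimal sketch, assuming the
pslib fleet entry point that this patch extends; the variable name and the
Adam optimizer are placeholders for a real training program:

    # A minimal sketch, assuming the pslib fleet API extended by this patch.
    # "loss" is a placeholder for a variable name from a real program.
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet

    strategy = {
        # Tensors named here are checked for Inf/NaN after each batch in
        # DownpourWorker::TrainFiles / TrainFilesWithProfiler; a failing
        # check aborts via PADDLE_ENFORCE_EQ, naming the offending tensor.
        "check_nan_var_names": ["loss"],
    }
    optimizer = fleet.distributed_optimizer(
        fluid.optimizer.Adam(learning_rate=0.001), strategy=strategy)

Note that the worker silently skips names it cannot resolve (FindVar
returning nullptr just continues), so listing a variable that exists in only
some programs is harmless.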