“b54435a8ab77bb8d74f05949a2ff0d7cddc112ec”上不存在“develop/doc/design/kernel_selection.html”
未验证 提交 092839d6 编写于 作者: D danleifeng 提交者: GitHub

[psgpu] Add check-NaN printing and fix trainer device (#38131)

* Fix trainer_device and add a check-NaN tool for psgpu; test=develop

* Disable show_one_table; test=develop
上级 25c1b623
...@@ -454,9 +454,9 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -454,9 +454,9 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
this->HeterPs_->build_ps(i, gpu_task->device_keys_[i].data(), this->HeterPs_->build_ps(i, gpu_task->device_keys_[i].data(),
gpu_task->device_values_[i].data(), gpu_task->device_values_[i].data(),
feature_keys_count[i], 500000, 2); feature_keys_count[i], 500000, 2);
if (feature_keys_count[i] > 0) { // if (feature_keys_count[i] > 0) {
HeterPs_->show_one_table(i); // HeterPs_->show_one_table(i);
} // }
}; };
for (size_t i = 0; i < threads.size(); i++) { for (size_t i = 0; i < threads.size(); i++) {
threads[i] = std::thread(build_func, i); threads[i] = std::thread(build_func, i);
......
...@@ -75,6 +75,8 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, ...@@ -75,6 +75,8 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc,
workers_[i]->SetDumpParamVector(dump_param_); workers_[i]->SetDumpParamVector(dump_param_);
workers_[i]->InitRandomDumpConfig(trainer_desc); workers_[i]->InitRandomDumpConfig(trainer_desc);
workers_[i]->SetDataFeed(readers[i]); workers_[i]->SetDataFeed(readers[i]);
workers_[i]->SetPlace(places_[i]);
workers_[i]->SetReaderPlace(places_[i]);
workers_[i]->Initialize(trainer_desc); workers_[i]->Initialize(trainer_desc);
workers_[i]->SetWorkerNum(place_num); workers_[i]->SetWorkerNum(place_num);
} }
...@@ -102,8 +104,6 @@ void PSGPUTrainer::RegisterHeterCallback() { ...@@ -102,8 +104,6 @@ void PSGPUTrainer::RegisterHeterCallback() {
void PSGPUTrainer::InitTrainerEnv(const ProgramDesc& main_program, void PSGPUTrainer::InitTrainerEnv(const ProgramDesc& main_program,
const platform::Place& place) { const platform::Place& place) {
for (size_t i = 0; i < places_.size(); ++i) { for (size_t i = 0; i < places_.size(); ++i) {
workers_[i]->SetPlace(places_[i]);
workers_[i]->SetReaderPlace(places_[i]);
workers_[i]->SetRootScope(root_scope_); workers_[i]->SetRootScope(root_scope_);
workers_[i]->CreateDeviceResource(main_program); // Program workers_[i]->CreateDeviceResource(main_program); // Program
workers_[i]->BindingDataFeedMemory(); workers_[i]->BindingDataFeedMemory();
...@@ -216,7 +216,9 @@ void PSGPUTrainer::Finalize() { ...@@ -216,7 +216,9 @@ void PSGPUTrainer::Finalize() {
continue; continue;
} }
LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>(); LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
if (root_tensor == nullptr || !root_tensor->IsInitialized()) {
continue;
}
for (size_t j = 0; j < places_.size(); j++) { for (size_t j = 0; j < places_.size(); j++) {
Scope* cur_thread_scope = workers_[j]->GetThreadScope(); Scope* cur_thread_scope = workers_[j]->GetThreadScope();
Variable* thread_var = Variable* thread_var =
...@@ -225,6 +227,9 @@ void PSGPUTrainer::Finalize() { ...@@ -225,6 +227,9 @@ void PSGPUTrainer::Finalize() {
continue; continue;
} }
LoDTensor* thread_tensor = thread_var->GetMutable<LoDTensor>(); LoDTensor* thread_tensor = thread_var->GetMutable<LoDTensor>();
if (thread_tensor == nullptr || !thread_tensor->IsInitialized()) {
continue;
}
#define MergeCallback(cpp_type, proto_type) \ #define MergeCallback(cpp_type, proto_type) \
do { \ do { \
if (root_tensor->type() == proto_type) { \ if (root_tensor->type() == proto_type) { \
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/lodtensor_printer.h"
#include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/string/string_helper.h"
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
...@@ -149,6 +150,38 @@ void PSGPUWorker::TrainFiles() { ...@@ -149,6 +150,38 @@ void PSGPUWorker::TrainFiles() {
DumpParam(*thread_scope_, batch_cnt); DumpParam(*thread_scope_, batch_cnt);
} }
for (std::string& var_name : check_nan_var_names_) {
Variable* var = thread_scope_->FindVar(var_name);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (tensor == nullptr || !tensor->IsInitialized()) {
continue;
}
if (framework::TensorContainsInf(*tensor) ||
framework::TensorContainsNAN(*tensor)) {
static std::mutex mutex;
{
std::lock_guard<std::mutex> lock(mutex);
VLOG(0) << "worker " << thread_id_ << ": " << var_name
<< " cantains inf or nan";
auto all_vars = thread_scope_->LocalVarNames();
std::stringstream ss;
ss << "====== worker " << thread_id_ << "======\n";
for (auto& local_var : all_vars) {
platform::PrintVar(thread_scope_, local_var, local_var, &ss);
ss << "\n";
}
std::cout << ss.str() << std::endl;
VLOG(0) << "worker " << thread_id_ << "print nan var done....";
}
sleep(600);
exit(-1);
}
}
dev_ctx_->Wait();
PrintFetchVars(); PrintFetchVars();
thread_scope_->DropKids(); thread_scope_->DropKids();
++batch_cnt; ++batch_cnt;
......
...@@ -35,6 +35,10 @@ class TensorFormatter { ...@@ -35,6 +35,10 @@ class TensorFormatter {
const std::string& tensor_name = "", const std::string& tensor_name = "",
const std::string& message = ""); const std::string& message = "");
template <typename T>
void FormatData(const framework::LoDTensor& print_tensor,
std::stringstream& log_stream);
void Print(const framework::LoDTensor& print_tensor, void Print(const framework::LoDTensor& print_tensor,
const std::string& tensor_name = "", const std::string& tensor_name = "",
const std::string& message = ""); const std::string& message = "");
...@@ -46,10 +50,6 @@ class TensorFormatter { ...@@ -46,10 +50,6 @@ class TensorFormatter {
void SetSummarize(int64_t summarize); void SetSummarize(int64_t summarize);
private: private:
template <typename T>
void FormatData(const framework::LoDTensor& print_tensor,
std::stringstream& log_stream);
int64_t summarize_ = -1; int64_t summarize_ = -1;
bool print_tensor_type_ = true; bool print_tensor_type_ = true;
bool print_tensor_shape_ = true; bool print_tensor_shape_ = true;
......
...@@ -39,15 +39,29 @@ void PrintVar(framework::Scope* scope, const std::string& var_name, ...@@ -39,15 +39,29 @@ void PrintVar(framework::Scope* scope, const std::string& var_name,
<< " does not exist in your scope"; << " does not exist in your scope";
return; return;
} }
if (!tensor->IsInitialized()) {
VLOG(0) << "tensor of variable " << var_name
<< " does not initialized in your scope";
return;
}
*sstream << print_info << ": "; *sstream << print_info;
#define PrintTensorCallback(cpp_type, proto_type) \ #define PrintTensorCallback(cpp_type, proto_type) \
do { \ do { \
if (tensor->type() == proto_type) { \ if (tensor->type() == proto_type) { \
*sstream << "["; \ *sstream << "["; \
auto* data = tensor->data<cpp_type>(); \ const cpp_type* data = nullptr; \
framework::LoDTensor cpu_tensor; \
if (is_cpu_place(tensor->place())) { \
data = tensor->data<cpp_type>(); \
} else { \
platform::CPUPlace cpu_place; \
TensorCopy(*tensor, cpu_place, &cpu_tensor); \
data = cpu_tensor.data<cpp_type>(); \
} \
auto element_num = tensor->numel(); \ auto element_num = tensor->numel(); \
*sstream << element_num << "]:["; \
if (element_num > 0) { \ if (element_num > 0) { \
*sstream << data[0]; \ *sstream << data[0]; \
for (int j = 1; j < element_num; ++j) { \ for (int j = 1; j < element_num; ++j) { \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册