提交 b9d855cb 编写于 作者: J jonyguo

fix: TDT hangs when the device is occupied

上级 9e124493
...@@ -194,17 +194,19 @@ bool MsContext::OpenTsd() { ...@@ -194,17 +194,19 @@ bool MsContext::OpenTsd() {
} }
MS_LOG(INFO) << "Device id = " << device_id << ", rank size = " << rank_size << "."; MS_LOG(INFO) << "Device id = " << device_id << ", rank size = " << rank_size << ".";
TDT_StatusT status = tdt::TsdClient::GetInstance()->Open(device_id, rank_size);
if (status != TDT_OK) {
MS_LOG(EXCEPTION) << "Device " << device_id << " is occupied, open tsd failed, status = " << status << ".";
return false;
}
int32_t initStatus = tdt::TdtHostInit(device_id); int32_t initStatus = tdt::TdtHostInit(device_id);
if (initStatus != TDT_OK_CODE) { if (initStatus != TDT_OK_CODE) {
MS_LOG(EXCEPTION) << "Init tsd failed, status = " << initStatus << "."; MS_LOG(EXCEPTION) << "Init tsd failed, status = " << initStatus << ".";
return false; return false;
} }
tdt_print_ = std::thread(TensorPrint()); tdt_print_ = std::thread(TensorPrint());
TDT_StatusT status = tdt::TsdClient::GetInstance()->Open(device_id, rank_size);
if (status != TDT_OK) {
MS_LOG(EXCEPTION) << "Device " << device_id << " is occupied, open tsd failed, status = " << status << ".";
return false;
}
tsd_ref_++; tsd_ref_++;
MS_LOG(INFO) << "Open and init tsd successful, tsd reference = " << tsd_ref_ << "."; MS_LOG(INFO) << "Open and init tsd successful, tsd reference = " << tsd_ref_ << ".";
return true; return true;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册