未验证 提交 56943dd4 编写于 作者: F Fan Zhang 提交者: GitHub

add retry in pull sparse (#33812) (#37000)

* add retry in pull sparse

* retry
Co-authored-by: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com>
上级 15cb05c8
...@@ -343,18 +343,38 @@ void FleetWrapper::PullSparseVarsSync(
for (auto& t : *fea_values) { for (auto& t : *fea_values) {
pull_result_ptr.push_back(t.data()); pull_result_ptr.push_back(t.data());
} }
int32_t cnt = 0;
while (true) {
pull_sparse_status.clear();
auto status = pslib_ptr_->_worker_ptr->pull_sparse( auto status = pslib_ptr_->_worker_ptr->pull_sparse(
pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size());
pull_sparse_status.push_back(std::move(status)); pull_sparse_status.push_back(std::move(status));
bool flag = true;
for (auto& t : pull_sparse_status) { for (auto& t : pull_sparse_status) {
t.wait(); t.wait();
auto status = t.get(); int32_t status = -1;
try {
status = t.get();
} catch (const std::future_error& e) {
VLOG(0) << "Caught a future_error with code" << e.code()
<< ", Message:" << e.what();
}
if (status != 0) { if (status != 0) {
LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; VLOG(0) << "fleet pull sparse failed, status[" << status << "]";
sleep(sleep_seconds_before_fail_exit_); sleep(sleep_seconds_before_fail_exit_);
flag = false;
cnt++;
}
if (cnt > 3) {
VLOG(0) << "fleet pull sparse failed, retry 3 times";
exit(-1); exit(-1);
} }
} }
if (flag) {
break;
}
}
#endif #endif
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册