未验证 提交 56943dd4 编写于 作者: F Fan Zhang 提交者: GitHub

add retry in pull sparse (#33812) (#37000)

* add retry in pull sparse

* retry
Co-authored-by: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com>
上级 15cb05c8
...@@ -343,16 +343,36 @@ void FleetWrapper::PullSparseVarsSync( ...@@ -343,16 +343,36 @@ void FleetWrapper::PullSparseVarsSync(
for (auto& t : *fea_values) { for (auto& t : *fea_values) {
pull_result_ptr.push_back(t.data()); pull_result_ptr.push_back(t.data());
} }
auto status = pslib_ptr_->_worker_ptr->pull_sparse(
pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); int32_t cnt = 0;
pull_sparse_status.push_back(std::move(status)); while (true) {
for (auto& t : pull_sparse_status) { pull_sparse_status.clear();
t.wait(); auto status = pslib_ptr_->_worker_ptr->pull_sparse(
auto status = t.get(); pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size());
if (status != 0) { pull_sparse_status.push_back(std::move(status));
LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; bool flag = true;
sleep(sleep_seconds_before_fail_exit_); for (auto& t : pull_sparse_status) {
exit(-1); t.wait();
int32_t status = -1;
try {
status = t.get();
} catch (const std::future_error& e) {
VLOG(0) << "Caught a future_error with code" << e.code()
<< ", Message:" << e.what();
}
if (status != 0) {
VLOG(0) << "fleet pull sparse failed, status[" << status << "]";
sleep(sleep_seconds_before_fail_exit_);
flag = false;
cnt++;
}
if (cnt > 3) {
VLOG(0) << "fleet pull sparse failed, retry 3 times";
exit(-1);
}
}
if (flag) {
break;
} }
} }
#endif #endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册