diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h index 037187ea9cf3f04cb480f459d54b982bd9917f2c..49e7c07e5d33d620777c40f98813486c312bdfba 100644 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h +++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h @@ -57,6 +57,8 @@ class ConcurrentSet { std::future GetAndClear(std::vector* result) { auto task = [this, &result] { result->clear(); + // FIXME(qiao): workaround for a bug when receiving selected rows; always put a placeholder id 0 first, ahead of the real ids (presumably skipped by the receiver; TODO confirm) + result->push_back(0); for (auto& id : set_) { result->push_back(id); } diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index a5983593c90fb2cbd87d87536155df34c189a93d..7e44bfc82eeadee320771387ab518c7345b17acc 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -108,7 +108,9 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, VLOG(3) << "recv_slr size: " << recv_slr.rows().size() << " " << sstream.str(); } - for (auto i = 0; i < recv_slr.rows().size(); ++i) { + + // FIXME(qiao): workaround for a bug when receiving selected rows; start at index 1 so the first row id is skipped (presumably a placeholder added by the sender; TODO confirm) + for (auto i = 1; i < recv_slr.rows().size(); ++i) { auto row_id = recv_slr.rows()[i] + row_offset; PADDLE_ENFORCE_LT(row_id, recv_dims[1]); memcpy(recv_tensor->data() + row_id * width,