Unverified commit aafa9820 authored by james, committed by GitHub

correct sync behavior for XPU distributed training (#47882)

* correct sync behavior for XPU distributed training

XPU supports an event mechanism similar to CUDA events, so it would be preferable to
use an event to synchronize the compute/comm streams for performance. However, this
mechanism has never been fully tested, and inconsistent loss values / ending epochs
have been reported. Therefore, this PR replaces event-based sync with stream waiting
as a temporary solution (a sketch of both approaches follows the change list below).

* remove compile warning
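
For context, here is a minimal sketch (not part of this commit) contrasting the two synchronization strategies the message describes. It uses only the runtime calls that appear in the diff below; the header path, the XPUStream/XPUEvent handle names, and the helper functions are illustrative assumptions rather than Paddle's actual declarations.

```cpp
// Minimal sketch, assuming an XPU runtime header that provides
// xpu_event_record / xpu_stream_wait_event / xpu_wait and stream/event
// handle types; names mirror the calls shown in this diff.
#include "xpu/runtime.h"

// Event-based sync (the approach being backed out): record an event on the
// producer (compute) stream and let the consumer (comm) stream wait on it
// on-device, so the host never blocks.
void SyncWithEvent(XPUStream compute_stream, XPUStream comm_stream, XPUEvent event) {
  xpu_event_record(event, compute_stream);    // mark the current position in the compute stream
  xpu_stream_wait_event(comm_stream, event);  // comm stream waits for that position on-device
}

// Stream-wait sync (the temporary workaround this PR switches to): block the
// host until the compute stream has drained. Always correct, but it serializes
// the CPU with the device and gives up compute/comm overlap.
void SyncWithStreamWait(XPUStream compute_stream) {
  xpu_wait(compute_stream);  // host blocks until all queued work finishes
}
```

The event route keeps compute and communication overlapped, which is why the commit message calls it preferable for performance; the stream-wait route trades that overlap for correctness until the event path is validated.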
Parent 1fb4d90b
......@@ -77,23 +77,11 @@ class XPUEventManager {
device_index_));
platform::XPUDeviceGuard guard(device_index_);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_event_record(event_, ctx.stream()));
// TODO(zhangxiaoci) temporary solution: xpu::event seems buggy
PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait(ctx.stream()));
}
void Block(const XPUContext& ctx) const {
if (is_created_) {
auto device_index = ctx.GetPlace().device;
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"XPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
platform::XPUDeviceGuard guard(device_index_);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_wait_event(ctx.stream(), event_));
}
}
void Block(const XPUContext& ctx) const {}
private:
bool is_created_{false};
......
......@@ -57,8 +57,14 @@ bool ProcessGroupBKCL::BKCLTask::Wait(std::chrono::milliseconds timeout) {
if (barrier_) {
// If we use the work to do barrier, we should block cpu
// TODO(zhangxiaoci) There is no such function that can sync entire device
// for xpu (for now), so all we can do is sync whatever stream that we know
// and hope for the best. Note that for correctness the communication stream
// needs to be in sync mode.
platform::XPUDeviceGuard guard(place_.GetDeviceId());
xpu_wait();
calc_ctx->Wait();
}
return true;
}
......
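
The comment in the hunk above explains the barrier fallback: since the XPU runtime exposes no whole-device synchronize, the barrier blocks the host on every stream this rank knows about. A rough sketch of that pattern follows; the helper name and the calc_ctx parameter type are illustrative assumptions, not the PR's actual code.

```cpp
// Sketch of the best-effort "sync entire device" used on the barrier path.
void BestEffortDeviceSync(int device_id, phi::XPUContext* calc_ctx) {
  platform::XPUDeviceGuard guard(device_id);  // make sure we wait on the right card
  xpu_wait();        // drain the default stream of the current device
  calc_ctx->Wait();  // drain the computation stream owned by the device context
  // The BKCL communication stream is expected to run in sync mode, so it needs
  // no separate wait here (see the comment in the diff above).
}
```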
......@@ -64,7 +64,7 @@ struct XPUContext::Impl {
// manually destroy XPUStream here until xpu::api integrates this work
// into Context dtor
xpu_wait(context_->xpu_stream);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(context_->xpu_stream));
xpu_stream_destroy(context_->xpu_stream);
context_->xpu_stream = nullptr;
xpu::destroy_context(context_);
context_ = nullptr;
......
......@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
namespace phi {
......@@ -67,14 +65,7 @@ class ConcatFunctor<XPUContext, T> {
reinterpret_cast<XPUType*>(output->data<T>()),
xdims_list,
axis);
PADDLE_ENFORCE_EQ(
r,
XPU_SUCCESS,
paddle::platform::errors::External(
"XPU API return wrong value[%d %s], please check whether "
"Baidu Kunlun Card is properly installed.",
r,
XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "concat");
}
};
......@@ -126,14 +117,7 @@ class SplitFunctor<XPUContext, T> {
xdims_list,
split_list,
axis);
PADDLE_ENFORCE_EQ(
r,
XPU_SUCCESS,
paddle::platform::errors::External(
"XPU API return wrong value[%d %s], please check whether "
"Baidu Kunlun Card is properly installed.",
r,
XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "split");
}
};
......