未验证 提交 aafa9820 编写于 作者: J james 提交者: GitHub

correct sync behavior for XPU distributed training (#47882)

* correct sync behavior for XPU distributed training

XPU supports an event mechanism similar to CUDA events, so it is advisable to
use an event to sync compute/comm streams for performance. However, this
mechanism has never been fully tested, and inconsistent loss / early-ending
epochs have been reported. Therefore, this PR replaces event sync with stream
waiting as a temporary solution.

* remove compile warning
上级 1fb4d90b
...@@ -77,23 +77,11 @@ class XPUEventManager { ...@@ -77,23 +77,11 @@ class XPUEventManager {
device_index_)); device_index_));
platform::XPUDeviceGuard guard(device_index_); platform::XPUDeviceGuard guard(device_index_);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_event_record(event_, ctx.stream())); // TODO(zhangxiaoci) temporary solution: xpu::event seems buggy
PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait(ctx.stream()));
} }
void Block(const XPUContext& ctx) const { void Block(const XPUContext& ctx) const {}
if (is_created_) {
auto device_index = ctx.GetPlace().device;
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"XPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
platform::XPUDeviceGuard guard(device_index_);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_wait_event(ctx.stream(), event_));
}
}
private: private:
bool is_created_{false}; bool is_created_{false};
......
...@@ -57,8 +57,14 @@ bool ProcessGroupBKCL::BKCLTask::Wait(std::chrono::milliseconds timeout) { ...@@ -57,8 +57,14 @@ bool ProcessGroupBKCL::BKCLTask::Wait(std::chrono::milliseconds timeout) {
if (barrier_) { if (barrier_) {
// If we use the work to do barrier, we should block cpu // If we use the work to do barrier, we should block cpu
// TODO(zhangxiaoci) There is no such function that can sync entire device
// for xpu (for now), so all we can do is sync whatever stream that we know
// and hope for the best. Note that for correctness the communication stream
// needs to be in sync mode.
platform::XPUDeviceGuard guard(place_.GetDeviceId()); platform::XPUDeviceGuard guard(place_.GetDeviceId());
xpu_wait(); xpu_wait();
calc_ctx->Wait();
} }
return true; return true;
} }
......
...@@ -64,7 +64,7 @@ struct XPUContext::Impl { ...@@ -64,7 +64,7 @@ struct XPUContext::Impl {
// manually destroy XPUStream here until xpu::api integrates this work // manually destroy XPUStream here until xpu::api integrates this work
// into Context dtor // into Context dtor
xpu_wait(context_->xpu_stream); xpu_wait(context_->xpu_stream);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(context_->xpu_stream)); xpu_stream_destroy(context_->xpu_stream);
context_->xpu_stream = nullptr; context_->xpu_stream = nullptr;
xpu::destroy_context(context_); xpu::destroy_context(context_);
context_ = nullptr; context_ = nullptr;
......
...@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
namespace phi { namespace phi {
...@@ -67,14 +65,7 @@ class ConcatFunctor<XPUContext, T> { ...@@ -67,14 +65,7 @@ class ConcatFunctor<XPUContext, T> {
reinterpret_cast<XPUType*>(output->data<T>()), reinterpret_cast<XPUType*>(output->data<T>()),
xdims_list, xdims_list,
axis); axis);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_XDNN_SUCCESS(r, "concat");
r,
XPU_SUCCESS,
paddle::platform::errors::External(
"XPU API return wrong value[%d %s], please check whether "
"Baidu Kunlun Card is properly installed.",
r,
XPUAPIErrorMsg[r]));
} }
}; };
...@@ -126,14 +117,7 @@ class SplitFunctor<XPUContext, T> { ...@@ -126,14 +117,7 @@ class SplitFunctor<XPUContext, T> {
xdims_list, xdims_list,
split_list, split_list,
axis); axis);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_XDNN_SUCCESS(r, "split");
r,
XPU_SUCCESS,
paddle::platform::errors::External(
"XPU API return wrong value[%d %s], please check whether "
"Baidu Kunlun Card is properly installed.",
r,
XPUAPIErrorMsg[r]));
} }
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册