Unverified commit 03533b0c, authored by: F fwenguang, committed by: GitHub

[MLU] support add callback to stream (#41831)

Parent commit: bb71d834
......@@ -34,14 +34,6 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
return;
}
// NOTE(hqp): Special case for CPU->MLU, avoid stream sync.
if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) {
paddle::framework::TensorCopy(
in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place),
out);
return;
}
// NOTE(yy): TransDataDevice should wait for computation of input.
if (!platform::is_cuda_pinned_place(in.place())) {
platform::DeviceContextPool::Instance().Get(in.place())->Wait();
......
......@@ -116,11 +116,16 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
framework::Tensor extra_device_tensor =
ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
{static_cast<int64_t>(extra_input_size)}, dev_ctx);
// TODO(fwg): use Async copy, and add a callback to stream that free
// host
// memory.
framework::TensorCopySync(extra_host_tensor, ctx.GetPlace(),
&extra_device_tensor);
framework::TensorCopy(extra_host_tensor, ctx.GetPlace(),
&extra_device_tensor);
// Increase extra_host_tensor holder_ reference count until copy
// complete.
auto increase_ref_count = [extra_host_tensor]() {
VLOG(4) << "Finished copying extra_host_tensor["
<< GetBasePtr(&extra_host_tensor)
<< "] in mlu pooling kernel.";
};
dev_ctx.AddStreamCallback(increase_ref_count);
MLUCnnl::PoolingForward(
ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/,
in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/,
......
......@@ -40,7 +40,6 @@ class MLUStream final {
template <typename Callback>
void AddCallback(Callback&& callback) const {
  // Register a host-side callback on this MLU stream by delegating to the
  // stream's callback manager (which, per this commit, enqueues it with
  // cnrtInvokeHostFunc so it fires after prior work on the stream).
  // NOTE(review): the old "TODO(mlu): mlu not support AddCallback" comment
  // is obsolete now that callbacks are actually registered; removed here.
  callback_manager_->AddCallback(callback);
}
......
......@@ -202,6 +202,7 @@ CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() {
REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent);
REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice);
REGISTER_RUNTIME_CBID_STR(cnCtxSync);
REGISTER_RUNTIME_CBID_STR(cnInvokeHostFunc);
#undef REGISTER_RUNTIME_CBID_STR
}
......
......@@ -80,10 +80,8 @@ void StreamCallbackManager<Stream>::AddCallback(
#endif
#if PADDLE_WITH_MLU
VLOG(3) << "MLULaunchCallback at stream: " << stream_
<< " Failed to call MLULaunchCallback, "
<< "because mlu not support StreamAddCallback yet. "
<< "function: " << func;
VLOG(3) << "MLULaunchCallback at stream: " << stream_;
cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func);
#endif
}
......
......@@ -47,6 +47,7 @@ enum class Backend : uint8_t {
GPU,
XPU, // XPU currently does not exist at the same time as CUDA
NPU, // NPU currently does not exist at the same time as CUDA
MLU, // MLU currently does not exist at the same time as CUDA
// the third library backend
MKLDNN,
......@@ -114,6 +115,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
case Backend::NPU:
os << "NPU";
break;
case Backend::MLU:
os << "MLU";
break;
case Backend::MKLDNN:
os << "MKLDNN";
break;
......@@ -154,6 +158,8 @@ inline Backend StringToBackend(const char* backend_cstr) {
return Backend::XPU;
} else if (s == std::string("NPU")) {
return Backend::NPU;
} else if (s == std::string("MLU")) {
return Backend::MLU;
} else if (s == std::string("MKLDNN")) {
return Backend::MKLDNN;
} else if (s == std::string("GPUDNN")) {
......
......@@ -40,6 +40,8 @@ Backend TransToPhiBackend(const phi::Place& place) {
return Backend::NPU;
} else if (allocation_type == phi::AllocationType::IPU) {
return Backend::IPU;
} else if (allocation_type == phi::AllocationType::MLU) {
return Backend::MLU;
} else if (allocation_type == phi::AllocationType::CUSTOM) {
return static_cast<Backend>(
static_cast<size_t>(Backend::NUM_BACKENDS) +
......
......@@ -271,7 +271,8 @@ def monkey_patch_varbase():
if _grad_scalar:
# When using amp with Fleet DistributedStrategy, we do loss scaling implicitly.
self = _grad_scalar.scale(self)
if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu():
if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu(
) or paddle.is_compiled_with_mlu():
# TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
scaled_loss = scale_loss(self)
if framework._in_eager_mode_:
......
......@@ -2,9 +2,9 @@
# Update CNTOOLKIT_VERSION, CNNL_VERSION and CNCL_VERSION if using other versions
#
# Build:
# - CNTOOLKIT_VERSION 2.6.5-1
# - CNNL_VERSION 1.8.3-1
# - CNCL_VERSION 1.0.2-1
# - CNTOOLKIT_VERSION 2.8.1-1
# - CNNL_VERSION 1.9.3-1
# - CNCL_VERSION 1.0.4-1
#
# Download three packages from FTP (need to connect cambricon AE to get FTP url)
# - cntoolkit_2.8.1-1.ubuntu18.04_amd64.deb
......@@ -21,9 +21,9 @@
# (get cncl pkg)
#
# docker build -f Dockerfile.mlu \
# --build-arg CNTOOLKIT_VERSION=2.6.5-1 \
# --build-arg CNNL_VERSION=1.8.3-1 \
# --build-arg CNCL_VERSION=1.0.2-1 \
# --build-arg CNTOOLKIT_VERSION=2.8.1-1 \
# --build-arg CNNL_VERSION=1.9.3-1 \
# --build-arg CNCL_VERSION=1.0.4-1 \
# -t paddlepaddle/paddle:latest-dev-mlu .
#
# without mlu device:
......@@ -40,9 +40,9 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ENV WITH_GPU=OFF
ARG CNTOOLKIT_VERSION=2.6.5-1
ARG CNNL_VERSION=1.8.3-1
ARG CNCL_VERSION=1.0.2-1
ARG CNTOOLKIT_VERSION=2.8.1-1
ARG CNNL_VERSION=1.9.3-1
ARG CNCL_VERSION=1.0.4-1
ARG CNTOOLKIT_PKG=cntoolkit_$CNTOOLKIT_VERSION.ubuntu18.04_amd64.deb
ARG CNNL_PKG=cnnl_$CNNL_VERSION.ubuntu18.04_amd64.deb
ARG CNCL_PKG=cncl_$CNCL_VERSION.ubuntu18.04_amd64.deb
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register