未验证 提交 03533b0c 编写于 作者: F fwenguang 提交者: GitHub

[MLU] support add callback to stream (#41831)

上级 bb71d834
...@@ -34,14 +34,6 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, ...@@ -34,14 +34,6 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
return; return;
} }
// NOTE(hqp): Special case for CPU->MLU, avoid stream sync.
if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) {
paddle::framework::TensorCopy(
in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place),
out);
return;
}
// NOTE(yy): TransDataDevice should wait for computation of input. // NOTE(yy): TransDataDevice should wait for computation of input.
if (!platform::is_cuda_pinned_place(in.place())) { if (!platform::is_cuda_pinned_place(in.place())) {
platform::DeviceContextPool::Instance().Get(in.place())->Wait(); platform::DeviceContextPool::Instance().Get(in.place())->Wait();
......
...@@ -116,11 +116,16 @@ class MLUPoolOpKernel : public framework::OpKernel<T> { ...@@ -116,11 +116,16 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
framework::Tensor extra_device_tensor = framework::Tensor extra_device_tensor =
ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>( ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
{static_cast<int64_t>(extra_input_size)}, dev_ctx); {static_cast<int64_t>(extra_input_size)}, dev_ctx);
// TODO(fwg): use Async copy, and add a callback to stream that free framework::TensorCopy(extra_host_tensor, ctx.GetPlace(),
// host &extra_device_tensor);
// memory. // Increase extra_host_tensor holder_ reference count until copy
framework::TensorCopySync(extra_host_tensor, ctx.GetPlace(), // complete.
&extra_device_tensor); auto increase_ref_count = [extra_host_tensor]() {
VLOG(4) << "Finished copying extra_host_tensor["
<< GetBasePtr(&extra_host_tensor)
<< "] in mlu pooling kernel.";
};
dev_ctx.AddStreamCallback(increase_ref_count);
MLUCnnl::PoolingForward( MLUCnnl::PoolingForward(
ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/,
in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/,
......
...@@ -40,7 +40,6 @@ class MLUStream final { ...@@ -40,7 +40,6 @@ class MLUStream final {
template <typename Callback> template <typename Callback>
void AddCallback(Callback&& callback) const { void AddCallback(Callback&& callback) const {
// TODO(mlu): mlu not support AddCallback
callback_manager_->AddCallback(callback); callback_manager_->AddCallback(callback);
} }
......
...@@ -202,6 +202,7 @@ CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() { ...@@ -202,6 +202,7 @@ CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() {
REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent); REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent);
REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice); REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice);
REGISTER_RUNTIME_CBID_STR(cnCtxSync); REGISTER_RUNTIME_CBID_STR(cnCtxSync);
REGISTER_RUNTIME_CBID_STR(cnInvokeHostFunc);
#undef REGISTER_RUNTIME_CBID_STR #undef REGISTER_RUNTIME_CBID_STR
} }
......
...@@ -80,10 +80,8 @@ void StreamCallbackManager<Stream>::AddCallback( ...@@ -80,10 +80,8 @@ void StreamCallbackManager<Stream>::AddCallback(
#endif #endif
#if PADDLE_WITH_MLU #if PADDLE_WITH_MLU
VLOG(3) << "MLULaunchCallback at stream: " << stream_ VLOG(3) << "MLULaunchCallback at stream: " << stream_;
<< " Failed to call MLULaunchCallback, " cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func);
<< "because mlu not support StreamAddCallback yet. "
<< "function: " << func;
#endif #endif
} }
......
...@@ -47,6 +47,7 @@ enum class Backend : uint8_t { ...@@ -47,6 +47,7 @@ enum class Backend : uint8_t {
GPU, GPU,
XPU, // XPU currently does not exist at the same time as CUDA XPU, // XPU currently does not exist at the same time as CUDA
NPU, // NPU currently does not exist at the same time as CUDA NPU, // NPU currently does not exist at the same time as CUDA
MLU, // MLU currently does not exist at the same time as CUDA
// the third library backend // the third library backend
MKLDNN, MKLDNN,
...@@ -114,6 +115,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { ...@@ -114,6 +115,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
case Backend::NPU: case Backend::NPU:
os << "NPU"; os << "NPU";
break; break;
case Backend::MLU:
os << "MLU";
break;
case Backend::MKLDNN: case Backend::MKLDNN:
os << "MKLDNN"; os << "MKLDNN";
break; break;
...@@ -154,6 +158,8 @@ inline Backend StringToBackend(const char* backend_cstr) { ...@@ -154,6 +158,8 @@ inline Backend StringToBackend(const char* backend_cstr) {
return Backend::XPU; return Backend::XPU;
} else if (s == std::string("NPU")) { } else if (s == std::string("NPU")) {
return Backend::NPU; return Backend::NPU;
} else if (s == std::string("MLU")) {
return Backend::MLU;
} else if (s == std::string("MKLDNN")) { } else if (s == std::string("MKLDNN")) {
return Backend::MKLDNN; return Backend::MKLDNN;
} else if (s == std::string("GPUDNN")) { } else if (s == std::string("GPUDNN")) {
......
...@@ -40,6 +40,8 @@ Backend TransToPhiBackend(const phi::Place& place) { ...@@ -40,6 +40,8 @@ Backend TransToPhiBackend(const phi::Place& place) {
return Backend::NPU; return Backend::NPU;
} else if (allocation_type == phi::AllocationType::IPU) { } else if (allocation_type == phi::AllocationType::IPU) {
return Backend::IPU; return Backend::IPU;
} else if (allocation_type == phi::AllocationType::MLU) {
return Backend::MLU;
} else if (allocation_type == phi::AllocationType::CUSTOM) { } else if (allocation_type == phi::AllocationType::CUSTOM) {
return static_cast<Backend>( return static_cast<Backend>(
static_cast<size_t>(Backend::NUM_BACKENDS) + static_cast<size_t>(Backend::NUM_BACKENDS) +
......
...@@ -271,7 +271,8 @@ def monkey_patch_varbase(): ...@@ -271,7 +271,8 @@ def monkey_patch_varbase():
if _grad_scalar: if _grad_scalar:
# When using amp with Fleet DistributedStrategy, we do loss scaling implicitly. # When using amp with Fleet DistributedStrategy, we do loss scaling implicitly.
self = _grad_scalar.scale(self) self = _grad_scalar.scale(self)
if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu(): if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu(
) or paddle.is_compiled_with_mlu():
# TODO(liuyuhui): Currently only for xpu. Will be removed in the future. # TODO(liuyuhui): Currently only for xpu, npu and mlu. Will be removed in the future.
scaled_loss = scale_loss(self) scaled_loss = scale_loss(self)
if framework._in_eager_mode_: if framework._in_eager_mode_:
......
...@@ -2,9 +2,9 @@ ...@@ -2,9 +2,9 @@
# Update CNTOOLKIT_VERSION, CNNL_VERSION and CNCL_VERSION if using other versions # Update CNTOOLKIT_VERSION, CNNL_VERSION and CNCL_VERSION if using other versions
# #
# Build: # Build:
# - CNTOOLKIT_VERSION 2.6.5-1 # - CNTOOLKIT_VERSION 2.8.1-1
# - CNNL_VERSION 1.8.3-1 # - CNNL_VERSION 1.9.3-1
# - CNCL_VERSION 1.0.2-1 # - CNCL_VERSION 1.0.4-1
# #
# Download three packages from FTP (need to contact Cambricon AE to get the FTP URL) # Download three packages from FTP (need to contact Cambricon AE to get the FTP URL)
# - cntoolkit_2.6.5-1.ubuntu18.04_amd64.deb # - cntoolkit_2.8.1-1.ubuntu18.04_amd64.deb
...@@ -21,9 +21,9 @@ ...@@ -21,9 +21,9 @@
# (get cncl pkg) # (get cncl pkg)
# #
# docker build -f Dockerfile.mlu \ # docker build -f Dockerfile.mlu \
# --build-arg CNTOOLKIT_VERSION=2.6.5-1 \ # --build-arg CNTOOLKIT_VERSION=2.8.1-1 \
# --build-arg CNNL_VERSION=1.8.3-1 \ # --build-arg CNNL_VERSION=1.9.3-1 \
# --build-arg CNCL_VERSION=1.0.2-1 \ # --build-arg CNCL_VERSION=1.0.4-1 \
# -t paddlepaddle/paddle:latest-dev-mlu . # -t paddlepaddle/paddle:latest-dev-mlu .
# #
# without mlu device: # without mlu device:
...@@ -40,9 +40,9 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com> ...@@ -40,9 +40,9 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ENV WITH_GPU=OFF ENV WITH_GPU=OFF
ARG CNTOOLKIT_VERSION=2.6.5-1 ARG CNTOOLKIT_VERSION=2.8.1-1
ARG CNNL_VERSION=1.8.3-1 ARG CNNL_VERSION=1.9.3-1
ARG CNCL_VERSION=1.0.2-1 ARG CNCL_VERSION=1.0.4-1
ARG CNTOOLKIT_PKG=cntoolkit_$CNTOOLKIT_VERSION.ubuntu18.04_amd64.deb ARG CNTOOLKIT_PKG=cntoolkit_$CNTOOLKIT_VERSION.ubuntu18.04_amd64.deb
ARG CNNL_PKG=cnnl_$CNNL_VERSION.ubuntu18.04_amd64.deb ARG CNNL_PKG=cnnl_$CNNL_VERSION.ubuntu18.04_amd64.deb
ARG CNCL_PKG=cncl_$CNCL_VERSION.ubuntu18.04_amd64.deb ARG CNCL_PKG=cncl_$CNCL_VERSION.ubuntu18.04_amd64.deb
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册