diff --git a/dnn/src/atlas/megcore/computing_context.cpp b/dnn/src/atlas/megcore/computing_context.cpp index 69b4d3b620cc546666dcd0ca63e43a13e60dac14..7bda2c91b8b8d485b603ca104977a8bd3c06f709 100644 --- a/dnn/src/atlas/megcore/computing_context.cpp +++ b/dnn/src/atlas/megcore/computing_context.cpp @@ -51,6 +51,7 @@ void AtlasComputingContext::memcpy(void* dst, const void* src, ACL_MEMCPY_HOST_TO_DEVICE)); break; case megcoreMemcpyDeviceToDevice: + // async d2d is always faster than sync d2d because of SDMA acl_check(aclrtMemcpyAsync(dst, size_in_bytes, src, size_in_bytes, ACL_MEMCPY_DEVICE_TO_DEVICE, m_ctx.stream)); break; diff --git a/src/core/impl/comp_node/atlas/comp_node.cpp b/src/core/impl/comp_node/atlas/comp_node.cpp index 1f64bf0eb9990859478a2bee6142e034cc8cb492..f0d5fb1b1941487dee5340a838a9d96cd171dc36 100644 --- a/src/core/impl/comp_node/atlas/comp_node.cpp +++ b/src/core/impl/comp_node/atlas/comp_node.cpp @@ -230,14 +230,10 @@ void AtlasCompNodeImpl::peer_copy_to(Impl* dest_impl, void* dest, auto&& src_env = m_env.atlas_env(); activate(); if (dst_env.device == src_env.device) { -#if 1 + // async d2d uses SDMA, which is faster than sync ctrl cpu d2d MGB_ATLAS_CHECK(aclrtMemcpyAsync(dest, size, src, size, ACL_MEMCPY_DEVICE_TO_DEVICE, dst_env.stream)); -#else - MGB_ATLAS_CHECK(aclrtMemcpy(dest, size, src, size, - ACL_MEMCPY_DEVICE_TO_DEVICE)); -#endif } else { mgb_throw(MegBrainError, "Atlas does not support peer copy between differents " diff --git a/src/opr/impl/atlas_runtime_op.cpp b/src/opr/impl/atlas_runtime_op.cpp index 051a2c8f2fd6692ecd8bf288f4561a531dbac744..7853545a2ec9f28fc27fd9e814637ca04ec38246 100644 --- a/src/opr/impl/atlas_runtime_op.cpp +++ b/src/opr/impl/atlas_runtime_op.cpp @@ -361,7 +361,6 @@ void AtlasRuntimeOpr::scn_do_execute() { i, output(i)->cname()); aclmdlAddDatasetBuffer(model_outputs, output_db); } - MGB_ATLAS_CHECK(aclmdlExecute(m_model_id, model_inputs, model_outputs)); for (size_t i = 0; i < nr_inputs; ++i) {