diff --git a/cmake/device/xpu.cmake b/cmake/device/xpu.cmake index 823048552f3cb5f05375e97e94cd5b5ad63e7563..145a2394986e5cf03c75dfc367e5997c3ad75731 100644 --- a/cmake/device/xpu.cmake +++ b/cmake/device/xpu.cmake @@ -39,7 +39,7 @@ else() endif() find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt - PATHS ${XPU_SDK_ROOT}/XTDK/shlib + PATHS ${XPU_SDK_ROOT}/XTDK/runtime/shlib NO_DEFAULT_PATH) if(NOT XPU_SDK_XPU_RT_FILE) diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index 731215f542567ec3ff0cc87d6990624bfa6b2bc2..214a63457c06cb5c11e9fc229e91cf66e091da39 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/core/arena/framework.h" +#include #include "lite/core/context.h" #include "lite/operators/subgraph_op.h" @@ -22,7 +23,14 @@ namespace arena { void TestCase::CreateInstruction() { std::shared_ptr op = nullptr; - if (place_.target == TARGET(kNPU) || place_.target == TARGET(kXPU)) { + static const std::set subgraph_op_supported_targets( + {TARGET(kNPU), TARGET(kXPU)}); + bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) != + subgraph_op_supported_targets.end(); +#if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL) + enable_subgraph_op = false; // Use XPU kernel directly if XTCL is disabled. +#endif + if (enable_subgraph_op) { // Create a new block desc to wrap the original op desc int sub_block_idx = 0; auto sub_block_desc = new cpp::BlockDesc(); @@ -91,7 +99,8 @@ void TestCase::PrepareInputsForInstruction() { /// alloc memory and then copy data there. if (param_type->type->IsTensor()) { const auto* shared_tensor = scope_->FindTensor(var); - auto* target_tensor = inst_scope_->NewTensor(var); + auto* target_tensor = + inst_scope_->LocalVar(var)->GetMutable(); CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet"; target_tensor->Resize(shared_tensor->dims()); TargetCopy(param_type->type->target(), @@ -103,7 +112,7 @@ void TestCase::PrepareInputsForInstruction() { const auto* shared_tensor_array = scope_->FindVar(var)->GetMutable>(); auto* target_tensor_array = - inst_scope_->Var(var)->GetMutable>(); + inst_scope_->LocalVar(var)->GetMutable>(); CHECK(!shared_tensor_array->empty()) << "shared_tensor_array is empty yet"; target_tensor_array->resize(shared_tensor_array->size()); @@ -142,12 +151,23 @@ bool TestCase::CheckTensorPrecision(const Tensor* a_tensor, b_tensor->target() == TARGET(kARM)); const T* a_data{}; + Tensor a_host_tensor; + a_host_tensor.Resize(a_tensor->dims()); switch (a_tensor->target()) { case TARGET(kX86): case TARGET(kHost): case TARGET(kARM): a_data = static_cast(a_tensor->raw_data()); break; +#ifdef LITE_WITH_XPU + case TARGET(kXPU): + CopySync(a_host_tensor.mutable_data(), + a_tensor->raw_data(), + sizeof(T) * a_tensor->dims().production(), + IoDirection::DtoH); + a_data = a_host_tensor.data(); + break; +#endif default: // Before compare, need to copy data from `target` device to host. diff --git a/lite/core/memory.cc b/lite/core/memory.cc index 1f2f7fed7d61b67a76f54a092b6d48951bc9fcbd..83e41d2c0960d87a0201b55b943529a9df4f6ab2 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -140,6 +140,11 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { dst, src, size, IoDirection::HtoD); break; #endif +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + TargetWrapperXPU::MemcpySync(dst, src, size, IoDirection::HtoD); + break; +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/memory.h b/lite/core/memory.h index a1013910019251271ddfccfbc700297c45226fe6..c80c8fb6b6e1356ebfa52920a8ee39f61ed20692 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -97,6 +97,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { case TARGET(kBM): TargetWrapper::MemcpySync(dst, src, size, dir); break; +#endif +#ifdef LITE_WITH_XPU + case TARGET(kXPU): + TargetWrapperXPU::MemcpySync(dst, src, size, dir); + break; #endif default: LOG(FATAL) diff --git a/lite/tests/api/CMakeLists.txt b/lite/tests/api/CMakeLists.txt index 0acc54c270ca73185bab79b07f4afb952c4ab754..7e5ddecb082e17a4a70a41fef0f359c354f2e97e 100644 --- a/lite/tests/api/CMakeLists.txt +++ b/lite/tests/api/CMakeLists.txt @@ -3,17 +3,19 @@ if(LITE_WITH_XPU) DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) - add_dependencies(test_resnet50_lite_xpu extern_lite_download_resnet50_tar_gz) lite_cc_test(test_ernie_lite_xpu SRCS test_ernie_lite_xpu.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/ernie) - add_dependencies(test_ernie_lite_xpu extern_lite_download_ernie_tar_gz) lite_cc_test(test_bert_lite_xpu SRCS test_bert_lite_xpu.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/bert) - add_dependencies(test_bert_lite_xpu extern_lite_download_bert_tar_gz) + if(WITH_TESTING) + add_dependencies(test_resnet50_lite_xpu extern_lite_download_resnet50_tar_gz) + add_dependencies(test_ernie_lite_xpu extern_lite_download_ernie_tar_gz) + add_dependencies(test_bert_lite_xpu extern_lite_download_bert_tar_gz) + endif() endif() if(LITE_WITH_RKNPU)