diff --git a/cmake/device/xpu.cmake b/cmake/device/xpu.cmake
index 823048552f3cb5f05375e97e94cd5b5ad63e7563..145a2394986e5cf03c75dfc367e5997c3ad75731 100644
--- a/cmake/device/xpu.cmake
+++ b/cmake/device/xpu.cmake
@@ -39,7 +39,7 @@ else()
 endif()
 
 find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt
-  PATHS ${XPU_SDK_ROOT}/XTDK/shlib
+  PATHS ${XPU_SDK_ROOT}/XTDK/runtime/shlib
   NO_DEFAULT_PATH)
 
 if(NOT XPU_SDK_XPU_RT_FILE)
diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc
index 731215f542567ec3ff0cc87d6990624bfa6b2bc2..214a63457c06cb5c11e9fc229e91cf66e091da39 100644
--- a/lite/core/arena/framework.cc
+++ b/lite/core/arena/framework.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "lite/core/arena/framework.h"
+#include <set>
 #include "lite/core/context.h"
 #include "lite/operators/subgraph_op.h"
 
@@ -22,7 +23,14 @@ namespace arena {
 
 void TestCase::CreateInstruction() {
   std::shared_ptr<lite::OpLite> op = nullptr;
-  if (place_.target == TARGET(kNPU) || place_.target == TARGET(kXPU)) {
+  static const std::set<TargetType> subgraph_op_supported_targets(
+      {TARGET(kNPU), TARGET(kXPU)});
+  bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) !=
+                            subgraph_op_supported_targets.end();
+#if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL)
+  enable_subgraph_op = false;  // Use XPU kernel directly if XTCL is disabled.
+#endif
+  if (enable_subgraph_op) {
     // Create a new block desc to wrap the original op desc
     int sub_block_idx = 0;
     auto sub_block_desc = new cpp::BlockDesc();
@@ -91,7 +99,8 @@ void TestCase::PrepareInputsForInstruction() {
         /// alloc memory and then copy data there.
         if (param_type->type->IsTensor()) {
           const auto* shared_tensor = scope_->FindTensor(var);
-          auto* target_tensor = inst_scope_->NewTensor(var);
+          auto* target_tensor =
+              inst_scope_->LocalVar(var)->GetMutable<Tensor>();
           CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet";
           target_tensor->Resize(shared_tensor->dims());
           TargetCopy(param_type->type->target(),
@@ -103,7 +112,7 @@ void TestCase::PrepareInputsForInstruction() {
           const auto* shared_tensor_array =
               scope_->FindVar(var)->GetMutable<std::vector<Tensor>>();
           auto* target_tensor_array =
-              inst_scope_->Var(var)->GetMutable<std::vector<Tensor>>();
+              inst_scope_->LocalVar(var)->GetMutable<std::vector<Tensor>>();
           CHECK(!shared_tensor_array->empty())
               << "shared_tensor_array is empty yet";
           target_tensor_array->resize(shared_tensor_array->size());
@@ -142,12 +151,23 @@ bool TestCase::CheckTensorPrecision(const Tensor* a_tensor,
         b_tensor->target() == TARGET(kARM));
 
   const T* a_data{};
+  Tensor a_host_tensor;
+  a_host_tensor.Resize(a_tensor->dims());
   switch (a_tensor->target()) {
     case TARGET(kX86):
     case TARGET(kHost):
     case TARGET(kARM):
       a_data = static_cast<const T*>(a_tensor->raw_data());
       break;
+#ifdef LITE_WITH_XPU
+    case TARGET(kXPU):
+      CopySync<TARGET(kXPU)>(a_host_tensor.mutable_data<T>(),
+                             a_tensor->raw_data(),
+                             sizeof(T) * a_tensor->dims().production(),
+                             IoDirection::DtoH);
+      a_data = a_host_tensor.data<T>();
+      break;
+#endif
 
     default:
       // Before compare, need to copy data from `target` device to host.
diff --git a/lite/core/memory.cc b/lite/core/memory.cc
index 1f2f7fed7d61b67a76f54a092b6d48951bc9fcbd..83e41d2c0960d87a0201b55b943529a9df4f6ab2 100644
--- a/lite/core/memory.cc
+++ b/lite/core/memory.cc
@@ -140,6 +140,11 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
           dst, src, size, IoDirection::HtoD);
       break;
 #endif
+#ifdef LITE_WITH_XPU
+    case TargetType::kXPU:
+      TargetWrapperXPU::MemcpySync(dst, src, size, IoDirection::HtoD);
+      break;
+#endif
 #ifdef LITE_WITH_OPENCL
     case TargetType::kOpenCL:
       TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);
diff --git a/lite/core/memory.h b/lite/core/memory.h
index a1013910019251271ddfccfbc700297c45226fe6..c80c8fb6b6e1356ebfa52920a8ee39f61ed20692 100644
--- a/lite/core/memory.h
+++ b/lite/core/memory.h
@@ -97,6 +97,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
     case TARGET(kBM):
       TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, dir);
       break;
+#endif
+#ifdef LITE_WITH_XPU
+    case TARGET(kXPU):
+      TargetWrapperXPU::MemcpySync(dst, src, size, dir);
+      break;
 #endif
     default:
       LOG(FATAL)
diff --git a/lite/tests/api/CMakeLists.txt b/lite/tests/api/CMakeLists.txt
index 0acc54c270ca73185bab79b07f4afb952c4ab754..7e5ddecb082e17a4a70a41fef0f359c354f2e97e 100644
--- a/lite/tests/api/CMakeLists.txt
+++ b/lite/tests/api/CMakeLists.txt
@@ -3,17 +3,19 @@ if(LITE_WITH_XPU)
       DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
       ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
       ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
-    add_dependencies(test_resnet50_lite_xpu extern_lite_download_resnet50_tar_gz)
     lite_cc_test(test_ernie_lite_xpu SRCS test_ernie_lite_xpu.cc
       DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
       ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
       ARGS --model_dir=${LITE_MODEL_DIR}/ernie)
-    add_dependencies(test_ernie_lite_xpu extern_lite_download_ernie_tar_gz)
     lite_cc_test(test_bert_lite_xpu SRCS test_bert_lite_xpu.cc
       DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
       ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
       ARGS --model_dir=${LITE_MODEL_DIR}/bert)
-    add_dependencies(test_bert_lite_xpu extern_lite_download_bert_tar_gz)
+    if(WITH_TESTING)
+        add_dependencies(test_resnet50_lite_xpu extern_lite_download_resnet50_tar_gz)
+        add_dependencies(test_ernie_lite_xpu extern_lite_download_ernie_tar_gz)
+        add_dependencies(test_bert_lite_xpu extern_lite_download_bert_tar_gz)
+    endif()
 endif()
 
 if(LITE_WITH_RKNPU)