diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index 3b6370e11851ffa8b40a0ec4d74aeea6861bf30d..b63f317aae8932ec639ff072166def4ba3dfed2b 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -261,6 +261,8 @@ void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) {
   cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
 #elif defined(PADDLE_WITH_HIP)
   hipMemcpy(dst, src, size, hipMemcpyHostToDevice);
+#elif defined(PADDLE_WITH_XPU_KP)
+  xpu_memcpy(dst, src, size, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
 #else
   PADDLE_THROW(platform::errors::Unimplemented(
       "Not supported GPU/ROCM, please compile with option WITH_GPU=ON or "
diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
index 975ce696ece826b3ae16758383e1e7485366ae5c..51456457d0606f84f4a914e692029ccd402b0e37 100644
--- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
@@ -30,12 +30,7 @@ IF(WITH_XPU_KP)
   xpu_library(heter_comm_kernel SRCS heter_comm_kernel.h heter_comm_kernel.kps feature_value.h)
   xpu_library(hashtable_kernel SRCS hashtable.h hashtable_kernel.kps)
   cc_library(heter_comm SRCS heter_comm.h heter_resource.cc DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
-  # Change heter_ps.cu file suffix
-  # NOTE(zhangminxu): If we compile with XPU_KP, we directly copy heter_ps.cu to heter_ps.cc
-  file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/heter_ps.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/)
-  file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/heter_ps.cu ${CMAKE_CURRENT_BINARY_DIR}/heter_ps.cc)
   cc_library(heter_ps SRCS heter_ps.cc DEPS heter_comm)
-  # xpu_library(heter_comm SRCS heter_comm.h heter_comm_kernel.kps feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS})
 ENDIF()
 IF(WITH_ROCM)
   hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
index 7ebf7660ee521a8ce53427ad85c375ed8efd7014..098adc2bdeb8814a36265eb8ad858c2a1eb2e8a6 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
@@ -193,7 +193,9 @@ void HeterComm<KeyType, ValType, GradType>::walk_to_dest(int start_index,
     memory_copy(dst_place, node.key_storage, src_place,
                 reinterpret_cast<char*>(src_key + h_left[i]),
                 node.key_bytes_len, node.in_stream);
+#if defined(PADDLE_WITH_CUDA)  // adapt for gpu-graph
     cudaMemsetAsync(node.val_storage, -1, node.val_bytes_len,
                     node.in_stream);
+#endif
     if (need_copy_val) {
       memory_copy(dst_place, node.val_storage, src_place,
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc
new file mode 100644
index 0000000000000000000000000000000000000000..700b43f18fb96764773e5eeeb12ca000c1b80333
--- /dev/null
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc
@@ -0,0 +1,70 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h"
+#include <vector>
+
+#ifdef PADDLE_WITH_HETERPS
+
+namespace paddle {
+namespace framework {
+
+HeterPsBase* HeterPsBase::get_instance(
+    size_t capacity, std::shared_ptr<HeterPsResource> resource) {
+  return new HeterPs(capacity, resource);
+}
+
+HeterPs::HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource) {
+  comm_ =
+      std::make_shared<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>>(
+          capacity, resource);
+}
+
+HeterPs::~HeterPs() {}
+
+void HeterPs::pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals,
+                          size_t len) {
+  comm_->pull_sparse(num, d_keys, d_vals, len);
+}
+
+void HeterPs::build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
+                       size_t len, size_t chunk_size, int stream_num) {
+  comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num);
+}
+
+int HeterPs::get_index_by_devid(int devid) {
+  return comm_->get_index_by_devid(devid);
+}
+
+void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) {
+  comm_->set_sparse_sgd(optimizer_config);
+}
+
+void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) {
+  comm_->set_embedx_sgd(optimizer_config);
+}
+
+void HeterPs::end_pass() { comm_->end_pass(); }
+
+void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); }
+
+void HeterPs::push_sparse(int num, FeatureKey* d_keys,
+                          FeaturePushValue* d_grads, size_t len) {
+  comm_->push_sparse(num, d_keys, d_grads, len);
+  // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_);
+}
+
+}  // end namespace framework
+}  // end namespace paddle
+#endif
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
index 8a877f85076efa7e7bfd3330d7d1489b5be89de7..581b0d511c23ee070b6dc33af315cc420f6ef20a 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
@@ -29,9 +29,7 @@ HeterPs::HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource) {
   comm_ =
       std::make_shared<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>>(
          capacity, resource);
-#if defined(PADDLE_WITH_CUDA)
   opt_ = Optimizer<FeatureValue, FeaturePushValue>();
-#endif
 }
 
 HeterPs::~HeterPs() {}
@@ -50,37 +48,21 @@ int HeterPs::get_index_by_devid(int devid) {
   return comm_->get_index_by_devid(devid);
 }
 
-#if defined(PADDLE_WITH_XPU_KP)
-void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) {
-  comm_->set_sparse_sgd(optimizer_config);
-}
-
-void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) {
-  comm_->set_embedx_sgd(optimizer_config);
-}
-#endif
-
 void HeterPs::end_pass() { comm_->end_pass(); }
 
 void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); }
 
 void HeterPs::push_sparse(int num, FeatureKey* d_keys,
                           FeaturePushValue* d_grads, size_t len) {
-#if defined(PADDLE_WITH_CUDA)
   comm_->push_sparse(num, d_keys, d_grads, len, opt_);
-#elif defined(PADDLE_WITH_XPU_KP)
-  comm_->push_sparse(num, d_keys, d_grads, len);
-#endif
   // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_);
 }
 
-#if defined(PADDLE_WITH_CUDA)
 void HeterPs::set_nccl_comm_and_size(const std::vector<ncclComm_t>& inner_comms,
                                      const std::vector<ncclComm_t>& inter_comms,
                                      int comm_size) {
   comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size);
 }
-#endif
 
 }  // end namespace framework
 }  // end namespace paddle
diff --git a/paddle/fluid/operators/pull_box_sparse_op.cc b/paddle/fluid/operators/pull_box_sparse_op.cc
index 22b43910e6967d5aa8236bfc0b5a952fa4b681e4..8c9b5f8d90fb53dca495b291a5fce4e131411918 100644
--- a/paddle/fluid/operators/pull_box_sparse_op.cc
+++ b/paddle/fluid/operators/pull_box_sparse_op.cc
@@ -132,7 +132,5 @@ REGISTER_OPERATOR(pull_box_sparse, ops::PullBoxSparseOp,
                   ops::PushBoxSparseOpMaker<paddle::framework::OpDesc>,
                   ops::PushBoxSparseOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(push_box_sparse, ops::PushBoxSparseOp);
-REGISTER_OP_CPU_KERNEL(pull_box_sparse, ops::PullBoxSparseCPUKernel<float>);
-REGISTER_OP_CPU_KERNEL(push_box_sparse, ops::PushBoxSparseCPUKernel<float>);
-REGISTER_OP_XPU_KERNEL(pull_box_sparse, ops::PullBoxSparseXPUKernel<float>);
-REGISTER_OP_XPU_KERNEL(push_box_sparse, ops::PushBoxSparseXPUKernel<float>);
+REGISTER_OP_CPU_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel<float>);
+REGISTER_OP_CPU_KERNEL(push_box_sparse, ops::PushBoxSparseKernel<float>);
diff --git a/paddle/fluid/operators/pull_box_sparse_op.cu b/paddle/fluid/operators/pull_box_sparse_op.cu
deleted file mode 100644
index e3407dd3b2e8ba912a71f1cb419b9395b8382504..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/pull_box_sparse_op.cu
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/pull_box_sparse_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; -using LoDTensor = framework::LoDTensor; - -template -class PullBoxSparseCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PullBoxSparseFunctor(ctx); - } -}; - -template -class PushBoxSparseCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PushBoxSparseFunctor(ctx); - } -}; -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseCUDAKernel); -REGISTER_OP_CUDA_KERNEL(push_box_sparse, ops::PushBoxSparseCUDAKernel); diff --git a/paddle/fluid/operators/pull_box_sparse_op.h b/paddle/fluid/operators/pull_box_sparse_op.h index 2bde9725abdca191d9e9891ca37299b4851c02f5..136e91121f8c6c45c4b8b64380ee1558440e4dcb 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.h +++ b/paddle/fluid/operators/pull_box_sparse_op.h @@ -15,8 +15,12 @@ #pragma once #include #include +#ifdef PADDLE_WITH_BOX_PS #include "paddle/fluid/framework/fleet/box_wrapper.h" +#endif +#ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" +#endif #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" @@ -100,7 +104,7 @@ static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) { using LoDTensor = framework::LoDTensor; template -class PullBoxSparseCPUKernel : public framework::OpKernel { +class PullBoxSparseKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PullBoxSparseFunctor(ctx); @@ -108,27 +112,12 @@ class PullBoxSparseCPUKernel : public framework::OpKernel { }; template -class PushBoxSparseCPUKernel : public framework::OpKernel { +class PushBoxSparseKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PushBoxSparseFunctor(ctx); } }; -template -class PullBoxSparseXPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PullBoxSparseFunctor(ctx); - } -}; - -template -class PushBoxSparseXPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PushBoxSparseFunctor(ctx); - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/pull_box_sparse_op.kps b/paddle/fluid/operators/pull_box_sparse_op.kps new file mode 100644 index 0000000000000000000000000000000000000000..6b7c7c8495108ee2327e771b230b5419ab5606d1 --- /dev/null +++ b/paddle/fluid/operators/pull_box_sparse_op.kps @@ -0,0 +1,56 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU_KP
+
+// Please do not modify the following code
+#if defined(__CUDA_ARCH__)
+#undef __CUDA_ARCH__
+#endif
+
+#if defined(__CUDACC__)
+#undef __CUDACC__
+#endif
+
+#if defined(__CUDA__)
+#undef __CUDA__
+#endif
+
+#if defined(__NVCC__)
+#undef __NVCC__
+#endif
+
+#include <xpu/runtime.h>                // NOLINT
+#include "xpu/kernel/cluster_header.h"  // NOLINT
+#include "xpu/kernel/debug.h"           // NOLINT
+#include "xpu/kernel/math.h"            // NOLINT
+#else
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#endif
+
+#include "paddle/fluid/operators/pull_box_sparse_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+#ifdef PADDLE_WITH_XPU_KP
+REGISTER_OP_KERNEL(pull_box_sparse, KP, plat::XPUPlace,
+                   ops::PullBoxSparseKernel<float>);
+REGISTER_OP_KERNEL(push_box_sparse, KP, plat::XPUPlace,
+                   ops::PushBoxSparseKernel<float>);
+#else
+REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel<float>);
+REGISTER_OP_CUDA_KERNEL(push_box_sparse, ops::PushBoxSparseKernel<float>);
+#endif
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
index 43c9e63ac194b12f39cf140fc26bdddbade16e7a..ab68ebf3a5448c429d1466bce30f0f1f55af5121 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
@@ -107,6 +107,8 @@ XPUOpMap& get_kp_ops() {
       {"reduce_prod", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"reduce_all", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})},
       {"reduce_any", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})},
+      {"pull_box_sparse",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"reduce_amax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"reduce_amin", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
   };