Unverified commit 3a2fb4cf authored by zmxdream and committed by GitHub

[cherry-pick]XPUPS add support for kunlun2 (#41916)

* [XPUPS]add support for kunlun2 (#40985)


[XPUPS]add support for kunlun2
Co-authored-by: WorgenZhang <frank08081993@gmail.com>

* [XPUPS]fix hashtable_kernel.kps (#41790)

* refactor heter comm kernel

* update. test=develop

* update calc_shard_offset. test=develop

* update xpu kernel. test=develop

* update args of calc_shard_offset

* update. test=develop

* remove customGradMerger

* update. test=develop

* update. test=develop

* fix. test=develop

* update. test=develop

* update. test=develop

* update optimizer kernel

* update. test=develop

* update. test=develop

* update. test=develop

* update. test=develop

* update. test=develop

* update. test=develop

* update. test=develop

* update. test=develop

* fix. test=develop

* fix. test=develop

* add optimizer kernel. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix kunlun not support size_t. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* update hashtable. test=develop

* update. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* update. test=develop

* update. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* template init. test=develop

* hashtable template init. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix hashtable_kernel. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop
Co-authored-by: WorgenZhang <frank08081993@gmail.com>

* [XPUPS]modify xpu_kp.cmake with HETERPS&PSLIB (#41760)

* modify xpu_kp.cmake with HETERPS&PSLIB

* fix. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop
Co-authored-by: WorgenZhang <frank08081993@gmail.com>
Parent 8ccdb91b
......@@ -122,6 +122,12 @@ macro(compile_kernel COMPILE_ARGS)
string(REPLACE ";" " " XPU_CXX_DEFINES "${XPU_CXX_DEFINES}" )
separate_arguments(XPU_CXX_DEFINES UNIX_COMMAND "${XPU_CXX_DEFINES}")
set(ABI_VERSION "")
if(WITH_HETERPS AND WITH_PSLIB)
set(ABI_VERSION "-D_GLIBCXX_USE_CXX11_ABI=0")
else()
set(ABI_VERSION "-D_GLIBCXX_USE_CXX11_ABI=1")
endif()
add_custom_command(
OUTPUT
kernel_build/${kernel_name}.bin.o
......@@ -130,7 +136,7 @@ macro(compile_kernel COMPILE_ARGS)
COMMAND
${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu
COMMAND
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
-I. -o kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.xpu
--xpu-device-only -c -v
COMMAND
......@@ -153,7 +159,7 @@ macro(compile_kernel COMPILE_ARGS)
COMMAND
${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu
COMMAND
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
-I. -o kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.xpu
--xpu-host-only -c -v
WORKING_DIRECTORY
......
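Note on the ABI_VERSION switch introduced above: prebuilt PSLIB libraries are typically built against the pre-C++11 libstdc++ ABI, so when both WITH_HETERPS and WITH_PSLIB are enabled the .kps kernels are compiled with -D_GLIBCXX_USE_CXX11_ABI=0 to stay link-compatible with them; otherwise the new ABI (=1) is kept as before. A minimal stand-alone check, not part of this patch, that prints which ABI a translation unit was compiled with:

#include <cstdio>
#include <string>

int main() {
#if defined(_GLIBCXX_USE_CXX11_ABI)
  std::printf("_GLIBCXX_USE_CXX11_ABI=%d\n", _GLIBCXX_USE_CXX11_ABI);
#else
  std::printf("_GLIBCXX_USE_CXX11_ABI not defined (not a libstdc++ build)\n");
#endif
  std::string s = "abi check";  // std::string is one of the types whose mangling differs between the two ABIs
  std::printf("%s\n", s.c_str());
  return 0;
}

Mixing objects built with different values of this macro usually surfaces as undefined references to std::__cxx11::basic_string symbols at link time.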
......@@ -22,7 +22,7 @@ limitations under the License. */
#include <vector>
#ifdef PADDLE_WITH_PSLIB
#include "common_value.h" // NOLINT
#include "common/common_value.h" // NOLINT
#endif
#ifdef PADDLE_WITH_PSCORE
......
......@@ -7,7 +7,9 @@ IF(WITH_GPU)
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS})
endif()
nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS})
nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h DEPS ${HETERPS_DEPS})
nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h DEPS ${HETERPS_DEPS})
nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
if(WITH_PSCORE)
......
......@@ -52,18 +52,18 @@ struct FeaturePushValue {
float lr_g;
float mf_g[MF_DIM];
__device__ __forceinline__ FeaturePushValue
operator+(const FeaturePushValue& a) const {
FeaturePushValue out;
out.slot = a.slot;
out.show = a.show + show;
out.clk = a.clk + clk;
out.lr_g = a.lr_g + lr_g;
for (int i = 0; i < MF_DIM; ++i) {
out.mf_g[i] = a.mf_g[i] + mf_g[i];
}
return out;
}
// __device__ __forceinline__ FeaturePushValue
// operator+(const FeaturePushValue& a) const {
// FeaturePushValue out;
// out.slot = a.slot;
// out.show = a.show + show;
// out.clk = a.clk + clk;
// out.lr_g = a.lr_g + lr_g;
// for (int i = 0; i < MF_DIM; ++i) {
// out.mf_g[i] = a.mf_g[i] + mf_g[i];
// }
// return out;
// }
};
} // end namespace framework
......
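The device-side operator+ above is commented out because gradient reduction now goes through an explicit merger functor instead (CustomGradMerger in heter_comm.h, and the gpu_merger/xpu_merger functors later in this diff). A host-only sketch of the same merge arithmetic, with the struct layout abbreviated and an illustrative MF_DIM, just to show what the merger computes:

#include <cstdio>

constexpr int MF_DIM = 8;  // illustrative; the real value comes from feature_value.h

struct FeaturePushValue {
  int slot;
  float show;
  float clk;
  float lr_g;
  float mf_g[MF_DIM];
};

// Same arithmetic as the merger functors added in this diff.
struct GradMerger {
  FeaturePushValue operator()(const FeaturePushValue& a,
                              const FeaturePushValue& b) const {
    FeaturePushValue out;
    out.slot = a.slot;
    out.show = a.show + b.show;
    out.clk = a.clk + b.clk;
    out.lr_g = a.lr_g + b.lr_g;
    for (int i = 0; i < MF_DIM; ++i) out.mf_g[i] = a.mf_g[i] + b.mf_g[i];
    return out;
  }
};

int main() {
  FeaturePushValue a{1, 2.0f, 1.0f, 0.5f, {}};
  FeaturePushValue b{1, 3.0f, 0.0f, 0.25f, {}};
  FeaturePushValue merged = GradMerger()(a, b);
  std::printf("show=%.1f clk=%.1f lr_g=%.2f\n", merged.show, merged.clk,
              merged.lr_g);
  return 0;
}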
......@@ -13,28 +13,38 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HETERPS
#include <glog/logging.h>
#include <limits>
#include <memory>
#include <vector>
#ifdef PADDLE_WITH_PSLIB
#include "common_value.h" // NOLINT
#endif
#ifdef PADDLE_WITH_PSCORE
#if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
#endif
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/phi/core/utils/rw_lock.h"
#include "thrust/pair.h"
// #include "cudf/concurrent_unordered_map.cuh.h"
#if defined(PADDLE_WITH_CUDA)
#include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h"
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h"
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "thrust/pair.h"
#elif defined(__xpu__)
#include <xpu/runtime.h>
#include "xpu/kernel/cluster_header.h"
#include "xpu/kernel/math.h"
#include "xpu/kernel/simd.h"
#endif
namespace paddle {
namespace framework {
#if defined(PADDLE_WITH_CUDA)
template <typename KeyType, typename ValType>
class TableContainer
: public concurrent_unordered_map<KeyType, ValType,
......@@ -45,31 +55,84 @@ class TableContainer
std::numeric_limits<KeyType>::max()>(
capacity, ValType()) {}
};
#elif defined(PADDLE_WITH_XPU_KP)
template <typename KeyType, typename ValType>
class XPUCacheArray {
public:
explicit XPUCacheArray(size_t capacity) : capacity_(capacity), size_(0) {
xpu_malloc(reinterpret_cast<void**>(&keys), capacity_ * sizeof(KeyType));
xpu_malloc(reinterpret_cast<void**>(&vals), capacity_ * sizeof(ValType));
}
virtual ~XPUCacheArray() {
xpu_free(keys);
xpu_free(vals);
}
void print() {}
// ValType* find(const KeyType& key) { return NULL; }
// bool insert(const KeyType& key, const ValType& val) { return true; }
int prefetch(const int dev_id, XPUStream stream = NULL) { return 0; }
size_t size() { return size_; }
private:
long long capacity_;
long long size_;
KeyType* keys;
ValType* vals;
};
#endif
template <typename KeyType, typename ValType>
class HashTable {
public:
HashTable(size_t capacity);
explicit HashTable(size_t capacity);
virtual ~HashTable();
HashTable(const HashTable&) = delete;
HashTable& operator=(const HashTable&) = delete;
template <typename StreamType>
void insert(const KeyType* d_keys, const ValType* d_vals, size_t len,
gpuStream_t stream);
StreamType stream);
template <typename StreamType>
void insert(const KeyType* d_keys, size_t len, char* pool, size_t start_index,
gpuStream_t stream);
StreamType stream);
template <typename StreamType>
void get(const KeyType* d_keys, ValType* d_vals, size_t len,
gpuStream_t stream);
void get(const KeyType* d_keys, char* d_vals, size_t len, gpuStream_t stream);
StreamType stream);
template <typename StreamType>
void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream);
void show();
void dump_to_cpu(int devid, cudaStream_t stream);
template <typename GradType, typename Sgd>
template <typename StreamType>
void dump_to_cpu(int devid, StreamType stream);
#if defined(PADDLE_WITH_CUDA)
template <typename GradType, typename Sgd, typename StreamType>
void update(const KeyType* d_keys, const GradType* d_grads, size_t len,
Sgd sgd, gpuStream_t stream);
Sgd sgd, StreamType stream);
template <typename Sgd>
template <typename Sgd, typename StreamType>
void update(const KeyType* d_keys, const char* d_grads, size_t len, Sgd sgd,
gpuStream_t stream);
StreamType stream);
#elif defined(PADDLE_WITH_XPU_KP)
template <typename GradType, typename StreamType>
void update(const KeyType* d_keys, const GradType* d_grads, size_t len,
StreamType stream);
template <typename StreamType>
void update(const KeyType* d_keys, const char* d_grads, size_t len,
StreamType stream);
#endif
int size() { return container_->size(); }
......@@ -84,7 +147,11 @@ class HashTable {
std::unique_ptr<phi::RWLock> rwlock_{nullptr};
private:
#if defined(PADDLE_WITH_CUDA)
TableContainer<KeyType, ValType>* container_;
#elif defined(PADDLE_WITH_XPU_KP)
XPUCacheArray<KeyType, ValType>* container_;
#endif
int BLOCK_SIZE_{256};
float LOAD_FACTOR{0.75f};
size_t capacity_;
......@@ -94,5 +161,4 @@ class HashTable {
};
} // end namespace framework
} // end namespace paddle
#include "hashtable_inl.h"
#endif
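The HashTable interface above now takes the stream type as a template parameter instead of hard-coding gpuStream_t, and the definitions move out of hashtable_inl.h into hashtable_kernel.cu (CUDA) and hashtable_kernel.kps (XPU), where the needed combinations are explicitly instantiated. A small self-contained sketch of that pattern, with all names hypothetical:

#include <cstddef>
#include <cstdio>

using FakeCudaStream = int*;   // stands in for cudaStream_t
using FakeXpuStream = void*;   // stands in for XPUStream

template <typename KeyType, typename ValType>
class MiniTable {
 public:
  // Stream type is a template parameter, like HashTable::get above.
  template <typename StreamType>
  void get(const KeyType* keys, ValType* vals, std::size_t len,
           StreamType stream) {
    (void)stream;  // a real backend would launch a lookup kernel on `stream`
    for (std::size_t i = 0; i < len; ++i) {
      vals[i] = static_cast<ValType>(keys[i]);
    }
  }
};

// Explicit instantiations, mirroring the `template void HashTable<...>::get<...>`
// lines at the bottom of hashtable_kernel.cu and hashtable_kernel.kps.
template void MiniTable<unsigned long, float>::get<FakeCudaStream>(
    const unsigned long*, float*, std::size_t, FakeCudaStream);
template void MiniTable<unsigned long, float>::get<FakeXpuStream>(
    const unsigned long*, float*, std::size_t, FakeXpuStream);

int main() {
  MiniTable<unsigned long, float> table;
  unsigned long keys[2] = {7, 9};
  float vals[2];
  table.get(keys, vals, 2, static_cast<FakeCudaStream>(nullptr));
  std::printf("%.0f %.0f\n", vals[0], vals[1]);
  return 0;
}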
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -13,10 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_HETERPS
#include <thread>
#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h"
#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
namespace paddle {
namespace framework {
#if defined(PADDLE_WITH_CUDA)
template <typename value_type>
struct ReplaceOp {
__host__ __device__ value_type operator()(value_type new_value,
......@@ -87,6 +92,7 @@ __global__ void dy_mf_search_kernel(Table* table,
}
}
}
template <typename Table, typename GradType, typename Sgd>
__global__ void update_kernel(Table* table,
const typename Table::key_type* const keys,
......@@ -135,8 +141,9 @@ void HashTable<KeyType, ValType>::show() {
}
template <typename KeyType, typename ValType>
template <typename StreamType>
void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ValType* d_vals,
size_t len, gpuStream_t stream) {
size_t len, StreamType stream) {
if (len == 0) {
return;
}
......@@ -146,8 +153,9 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ValType* d_vals,
}
template <typename KeyType, typename ValType>
template <typename StreamType>
void HashTable<KeyType, ValType>::get(const KeyType* d_keys, char* d_vals,
size_t len, gpuStream_t stream) {
size_t len, StreamType stream) {
if (len == 0) {
return;
}
......@@ -157,9 +165,10 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys, char* d_vals,
}
template <typename KeyType, typename ValType>
template <typename StreamType>
void HashTable<KeyType, ValType>::insert(const KeyType* d_keys,
const ValType* d_vals, size_t len,
gpuStream_t stream) {
StreamType stream) {
if (len == 0) {
return;
}
......@@ -169,22 +178,24 @@ void HashTable<KeyType, ValType>::insert(const KeyType* d_keys,
}
template <typename KeyType, typename ValType>
template <typename StreamType>
void HashTable<KeyType, ValType>::insert(const KeyType* d_keys, size_t len,
char* pool, size_t start_index,
gpuStream_t stream) {
StreamType stream) {
if (len == 0) {
return;
}
const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;
if (pool == NULL) {
return;
}
const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;
insert_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(container_, d_keys, len,
pool, start_index);
}
template <typename KeyType, typename ValType>
void HashTable<KeyType, ValType>::dump_to_cpu(int devid, cudaStream_t stream) {
template <typename StreamType>
void HashTable<KeyType, ValType>::dump_to_cpu(int devid, StreamType stream) {
container_->prefetch(cudaCpuDeviceId, stream);
std::vector<std::thread> threads;
size_t num = container_->size();
......@@ -260,10 +271,10 @@ void HashTable<KeyType, ValType>::dump_to_cpu(int devid, cudaStream_t stream) {
}
template <typename KeyType, typename ValType>
template <typename GradType, typename Sgd>
template <typename GradType, typename Sgd, typename StreamType>
void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
const GradType* d_grads, size_t len,
Sgd sgd, gpuStream_t stream) {
Sgd sgd, StreamType stream) {
if (len == 0) {
return;
}
......@@ -273,19 +284,66 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
}
template <typename KeyType, typename ValType>
template <typename Sgd>
template <typename Sgd, typename StreamType>
void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
const char* d_grads, size_t len,
Sgd sgd, gpuStream_t stream) {
Sgd sgd, StreamType stream) {
if (len == 0) {
return;
}
const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;
dy_mf_update_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(
container_, d_keys, d_grads, len, sgd, push_grad_value_size_);
}
template class HashTable<unsigned long, paddle::framework::FeatureValue>;
template void HashTable<unsigned long, paddle::framework::FeatureValue>::get<
cudaStream_t>(const unsigned long* d_keys,
paddle::framework::FeatureValue* d_vals, size_t len,
cudaStream_t stream);
// template void
// HashTable<unsigned long, paddle::framework::FeatureValue>::get<cudaStream_t>(
// const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t
// stream);
template void HashTable<unsigned long, paddle::framework::FeatureValue>::insert<
cudaStream_t>(const unsigned long* d_keys,
const paddle::framework::FeatureValue* d_vals, size_t len,
cudaStream_t stream);
// template void HashTable<unsigned long,
// paddle::framework::FeatureValue>::insert<
// cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool,
// size_t start_index, cudaStream_t stream);
template void HashTable<unsigned long, paddle::framework::FeatureValue>::
dump_to_cpu<cudaStream_t>(int devid, cudaStream_t stream);
template void HashTable<unsigned long, paddle::framework::FeatureValue>::update<
paddle::framework::FeaturePushValue,
Optimizer<paddle::framework::FeatureValue,
paddle::framework::FeaturePushValue>,
cudaStream_t>(const unsigned long* d_keys,
const paddle::framework::FeaturePushValue* d_grads,
size_t len, Optimizer<paddle::framework::FeatureValue,
paddle::framework::FeaturePushValue>
sgd,
cudaStream_t stream);
// template void HashTable<unsigned long,
// paddle::framework::FeatureValue>::update<
// Optimizer<paddle::framework::FeatureValue,
// paddle::framework::FeaturePushValue>,
// cudaStream_t>(const unsigned long* d_keys, const char* d_grads, size_t
// len,
// Optimizer<paddle::framework::FeatureValue,
// paddle::framework::FeaturePushValue>
// sgd,
// cudaStream_t stream);
#endif
} // end namespace framework
} // end namespace paddle
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h"
namespace optimizer_config {
extern _global_ptr_ float* nonclk_coeff;
extern _global_ptr_ float* clk_coeff;
extern _global_ptr_ float* min_bound;
extern _global_ptr_ float* max_bound;
extern _global_ptr_ float* learning_rate;
extern _global_ptr_ float* initial_g2sum;
extern _global_ptr_ float* initial_range;
extern _global_ptr_ float* mf_create_thresholds;
extern _global_ptr_ float* mf_learning_rate;
extern _global_ptr_ float* mf_initial_g2sum;
extern _global_ptr_ float* mf_initial_range;
extern _global_ptr_ float* mf_min_bound;
extern _global_ptr_ float* mf_max_bound;
}
namespace paddle {
namespace framework {
#if defined(PADDLE_WITH_XPU_KP)
__device__ void update_lr(float& w, float& g2sum, float g, // NOLINT
float scale) {
__local__ float local_learning_rate;
__local__ float local_initial_g2sum;
__local__ float local_min_bound;
__local__ float local_max_bound;
GM2LM(optimizer_config::learning_rate, &local_learning_rate, sizeof(float));
GM2LM(optimizer_config::initial_g2sum, &local_initial_g2sum, sizeof(float));
GM2LM(optimizer_config::min_bound, &local_min_bound, sizeof(float));
GM2LM(optimizer_config::max_bound, &local_max_bound, sizeof(float));
double add_g2sum = 0;
double ratio = local_learning_rate *
sqrt(local_initial_g2sum / (local_initial_g2sum + g2sum));
double scaled_grad = g / scale;
w += scaled_grad * ratio;
if (w < local_min_bound) w = local_min_bound;
if (w > local_max_bound) w = local_max_bound;
add_g2sum += scaled_grad * scaled_grad;
g2sum += add_g2sum;
}
__device__ void update_mf(int n, float* w, float& g2sum, const float* g,
float scale) {
__local__ float local_mf_learning_rate;
__local__ float local_mf_initial_g2sum;
__local__ float local_mf_min_bound;
__local__ float local_mf_max_bound;
GM2LM(optimizer_config::mf_learning_rate, &local_mf_learning_rate,
sizeof(float));
GM2LM(optimizer_config::mf_initial_g2sum, &local_mf_initial_g2sum,
sizeof(float));
GM2LM(optimizer_config::mf_min_bound, &local_mf_min_bound, sizeof(float));
GM2LM(optimizer_config::mf_max_bound, &local_mf_max_bound, sizeof(float));
double add_g2sum = 0;
double ratio =
local_mf_learning_rate *
sqrt(local_mf_initial_g2sum / (local_mf_initial_g2sum + g2sum));
for (int i = 0; i < n; ++i) {
double scaled_grad = g[i] / scale;
w[i] += scaled_grad * ratio;
if (w[i] < local_mf_min_bound) w[i] = local_mf_min_bound;
if (w[i] > local_mf_max_bound) w[i] = local_mf_max_bound;
add_g2sum += scaled_grad * scaled_grad;
}
g2sum += add_g2sum / n;
}
__device__ float xpu_rand_uniform() { return 0.1; }
template <typename ValType, typename GradType>
__device__ void update_value(ValType& val, const GradType& grad) { // NOLINT
val.slot = grad.slot;
val.show += grad.show;
val.clk += grad.clk;
__local__ float local_nonclk_coeff;
__local__ float local_clk_coeff;
__local__ float local_mf_create_thresholds;
__local__ float local_mf_initial_range;
GM2LM(optimizer_config::nonclk_coeff, &local_nonclk_coeff, sizeof(float));
GM2LM(optimizer_config::clk_coeff, &local_clk_coeff, sizeof(float));
GM2LM(optimizer_config::mf_create_thresholds, &local_mf_create_thresholds,
sizeof(float));
GM2LM(optimizer_config::mf_initial_range, &local_mf_initial_range,
sizeof(float));
val.delta_score +=
local_nonclk_coeff * (grad.show - grad.clk) + local_clk_coeff * grad.clk;
update_lr(val.lr, val.lr_g2sum, grad.lr_g, grad.show);
if (val.mf_size == 0) {
if (local_mf_create_thresholds <=
local_nonclk_coeff * (val.show - val.clk) + local_clk_coeff * val.clk) {
val.mf_size = MF_DIM + 1;
val.mf[0] = 0;
for (int i = 0; i < MF_DIM; ++i) {
val.mf[i + 1] = (xpu_rand_uniform()) * local_mf_initial_range;
}
}
} else {
update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, grad.show);
}
}
template <typename KeyType, typename ValType, typename Table>
__global__ void insert_kernel(Table* table, const KeyType* const keys,
const ValType* const vals, size_t len) {
int cid = core_id();
int ncores = core_num();
if (cid >= ncores) {
return;
}
int thread_id = ncores * cluster_id() + cid;
int nthreads = ncores * cluster_num();
const int buf_size = 150;
__local__ KeyType local_keys[buf_size];
__local__ ValType local_vals[buf_size];
int len_per_loop = min(buf_size, roundup_div(len, nthreads));
for (int i = thread_id * len_per_loop; i < len;
i += nthreads * len_per_loop) {
int read_len = min(len_per_loop, len - i);
GM2LM(keys, local_keys, read_len * sizeof(KeyType));
GM2LM(vals, local_vals, read_len * sizeof(ValType));
for (int k = 0; k < read_len; k++) {
// auto status = table->insert(local_keys[k], local_vals[k]);
// assert(status != false && "error: insert fails: table is full");
}
}
}
template <typename KeyType, typename ValType, typename Table>
__global__ void search_kernel(Table* table, const KeyType* const keys,
ValType* const vals, size_t len) {
int cid = core_id();
int ncores = core_num();
if (cid >= ncores) {
return;
}
int thread_id = ncores * cluster_id() + cid;
int nthreads = ncores * cluster_num();
const int buf_size = 150;
__local__ KeyType local_keys[buf_size];
__local__ ValType local_vals[buf_size];
int len_per_loop = min(buf_size, roundup_div(len, nthreads));
for (int i = thread_id * len_per_loop; i < len;
i += nthreads * len_per_loop) {
int read_len = min(len_per_loop, len - i);
GM2LM(keys, local_keys, read_len * sizeof(KeyType));
for (int k = 0; k < read_len; k++) {
// ValType* val = table->find(local_keys[k]);
// if (val != NULL) {
// local_vals[k] = *val;
// }
}
LM2GM(local_vals, vals + i, read_len * sizeof(ValType));
}
}
template <typename KeyType, typename ValType, typename Table, typename GradType>
__global__ void update_kernel(Table* table, const KeyType* const keys,
const GradType* const grads, size_t len) {
int cid = core_id();
int ncores = core_num();
if (cid >= ncores) {
return;
}
int thread_id = ncores * cluster_id() + cid;
int nthreads = ncores * cluster_num();
const int buf_size = 250;
__local__ KeyType local_keys[buf_size];
__local__ GradType local_grads[buf_size];
int len_per_loop = min(buf_size, roundup_div(len, nthreads));
for (int i = thread_id * len_per_loop; i < len;
i += nthreads * len_per_loop) {
int read_len = min(len_per_loop, len - i);
GM2LM(keys, local_keys, read_len * sizeof(KeyType));
GM2LM(grads, local_grads, read_len * sizeof(GradType));
for (int k = 0; k < read_len; k++) {
// ValType* val = table->find(local_keys[k]);
// if (val != NULL) {
// update_value(*val, grads[i]);
//}
}
}
}
template <typename KeyType, typename ValType>
HashTable<KeyType, ValType>::HashTable(size_t capacity) {
auto tmp_container = XPUCacheArray<KeyType, ValType>(capacity);
xpu_malloc(reinterpret_cast<void**>(&container_),
sizeof(XPUCacheArray<KeyType, ValType>));
xpu_memcpy(container_, &tmp_container,
sizeof(XPUCacheArray<KeyType, ValType>), XPU_HOST_TO_DEVICE);
rwlock_.reset(new phi::RWLock);
}
template <typename KeyType, typename ValType>
HashTable<KeyType, ValType>::~HashTable() {
xpu_free((void*)container_);
}
template <typename KeyType, typename ValType>
void HashTable<KeyType, ValType>::show() {
container_->print();
}
template <typename KeyType, typename ValType>
template <typename StreamType>
void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ValType* d_vals,
size_t len, StreamType stream) {
if (len == 0) {
return;
}
search_kernel<<<4, 64, stream>>>(container_, d_keys, d_vals, len);
}
template <typename KeyType, typename ValType>
template <typename StreamType>
void HashTable<KeyType, ValType>::get(const KeyType* d_keys, char* d_vals,
size_t len, StreamType stream) {
if (len == 0) {
return;
}
// TODO(zhangminxu): to be implemented
}
template <typename KeyType, typename ValType>
template <typename StreamType>
void HashTable<KeyType, ValType>::insert(const KeyType* d_keys,
const ValType* d_vals, size_t len,
StreamType stream) {
if (len == 0) {
return;
}
insert_kernel<<<4, 64, stream>>>(container_, d_keys, d_vals, len);
}
template <typename KeyType, typename ValType>
template <typename StreamType>
void HashTable<KeyType, ValType>::dump_to_cpu(int devid, StreamType stream) {
// TODO(zhangminxu): to be implemented
}
template <typename KeyType, typename ValType>
template <typename GradType, typename StreamType>
void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
const GradType* d_grads, size_t len,
StreamType stream) {
if (len == 0) {
return;
}
update_kernel<<<4, 64, stream>>>(container_, d_keys, d_grads, len);
}
template <typename KeyType, typename ValType>
template <typename StreamType>
void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
const char* d_grads, size_t len,
StreamType stream) {
if (len == 0) {
return;
}
// TODO(zhangminxu): to be implemented
}
template class HashTable<unsigned long, paddle::framework::FeatureValue>;
template void HashTable<unsigned long, paddle::framework::FeatureValue>::get<
XPUStream>(const unsigned long* d_keys,
paddle::framework::FeatureValue* d_vals, size_t len,
XPUStream stream);
// template void
// HashTable<unsigned long, paddle::framework::FeatureValue>::get<XPUStream>(
// const unsigned long* d_keys, char* d_vals, size_t len, XPUStream stream);
template void HashTable<unsigned long, paddle::framework::FeatureValue>::insert<
XPUStream>(const unsigned long* d_keys,
const paddle::framework::FeatureValue* d_vals, size_t len,
XPUStream stream);
// template void HashTable<unsigned long,
// paddle::framework::FeatureValue>::insert<
// XPUStream>(const unsigned long* d_keys, size_t len, char* pool,
// size_t start_index, XPUStream stream);
template void HashTable<unsigned long, paddle::framework::FeatureValue>::
dump_to_cpu<XPUStream>(int devid, XPUStream stream);
template void HashTable<unsigned long, paddle::framework::FeatureValue>::update<
paddle::framework::FeaturePushValue, XPUStream>(
const unsigned long* d_keys,
const paddle::framework::FeaturePushValue* d_grads, size_t len,
XPUStream stream);
// template void HashTable<unsigned long,
// paddle::framework::FeatureValue>::update<
// XPUStream>(const unsigned long* d_keys, const char* d_grads,
// size_t len, XPUStream stream);
#endif
} // end namespace framework
} // end namespace paddle
#endif
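All of the .kps kernels above follow the same tiling pattern: each XPU core strides over the input in fixed-size chunks, stages a chunk into __local__ memory with GM2LM, processes it, and writes results back with LM2GM. A plain CPU sketch of that loop, with illustrative buffer size and thread count standing in for the real XPU runtime:

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  const int nthreads = 4;  // stands in for ncores * cluster_num()
  const int buf_size = 8;  // stands in for the __local__ buffer size

  std::vector<long long> keys(30);
  for (int i = 0; i < 30; ++i) keys[i] = 100 + i;
  std::vector<long long> out(keys.size());

  const int len = static_cast<int>(keys.size());
  const int len_per_loop = std::min(buf_size, (len + nthreads - 1) / nthreads);

  for (int thread_id = 0; thread_id < nthreads; ++thread_id) {
    long long local[buf_size];  // the "__local__" staging buffer
    for (int i = thread_id * len_per_loop; i < len;
         i += nthreads * len_per_loop) {
      const int read_len = std::min(len_per_loop, len - i);
      std::memcpy(local, keys.data() + i, read_len * sizeof(long long));  // GM2LM
      for (int k = 0; k < read_len; ++k) local[k] += 1;                   // process the chunk
      std::memcpy(out.data() + i, local, read_len * sizeof(long long));   // LM2GM
    }
  }
  std::printf("out[0]=%lld out[29]=%lld\n", out[0], out[29]);
  return 0;
}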
......@@ -15,39 +15,28 @@ limitations under the License. */
#pragma once
#include <thread>
#include <vector>
#include "cub/cub.cuh"
#include "cub/util_allocator.cuh"
#include "hashtable.h" // NOLINT
#include "heter_resource.h" // NOLINT
#if defined(PADDLE_WITH_CUDA)
#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/place.h"
#include "thrust/pair.h"
#elif defined(PADDLE_WITH_XPU_KP)
#include <xpu/runtime.h>
#include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
#endif
#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_HETERPS
namespace paddle {
namespace framework {
struct CustomGradMerger {
template <typename T>
CUB_RUNTIME_FUNCTION __forceinline__ __device__ T
operator()(const T& a, const T& b) const {
T out;
out.slot = a.slot;
out.show = a.show + b.show;
out.clk = a.clk + b.clk;
out.lr_g = a.lr_g + b.lr_g;
for (int i = 0; i < MF_DIM; ++i) {
out.mf_g[i] = a.mf_g[i] + b.mf_g[i];
}
return out;
}
};
template <typename KeyType, typename ValType, typename GradType>
class HeterComm {
public:
......@@ -67,10 +56,21 @@ class HeterComm {
void show_one_table(int gpu_num);
int get_index_by_devid(int devid);
#if defined(PADDLE_WITH_CUDA)
template <typename Sgd>
void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len,
Sgd& sgd); // NOLINT
#elif defined(PADDLE_WITH_XPU_KP)
void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len);
#endif
int log2i(int x);
template <typename DstPlace, typename SrcPlace, typename StreamType>
void memory_copy(DstPlace dst_place, void* dst, SrcPlace src_place,
const void* src, size_t count, StreamType stream = 0);
#if defined(PADDLE_WITH_CUDA)
template <typename Sgd>
void push_sparse_multi_node(int num, KeyType* d_keys, GradType* d_grads,
size_t len, Sgd& sgd); // NOLINT
......@@ -85,8 +85,6 @@ class HeterComm {
int gather_multi_node_grad(int num, KeyType* d_keys, GradType* d_grads,
int len);
int log2i(int x);
void set_nccl_comm_and_size(const std::vector<ncclComm_t>& inner_comms,
const std::vector<ncclComm_t>& inter_comms,
int comm_size) {
......@@ -94,6 +92,7 @@ class HeterComm {
nccl_inter_comms_ = inter_comms;
node_size_ = comm_size;
}
#endif
bool need_transfer(int send_id, int receive_id) {
return ((send_id / 4 != receive_id / 4) && (send_id + 4) % 8 != receive_id);
......@@ -101,19 +100,19 @@ class HeterComm {
// void dump_to_cpu(int index);
void end_pass();
int get_transfer_devid(int send_id) { return (send_id + 4) % 8; }
void end_pass();
struct Node {
cudaStream_t in_stream;
cudaStream_t out_stream;
ppStream in_stream;
ppStream out_stream;
char* key_storage;
char* val_storage;
int sync;
int key_bytes_len;
int val_bytes_len;
int gpu_num;
int dev_num;
};
struct Path {
......@@ -133,7 +132,7 @@ class HeterComm {
alloc(size, true);
}
void alloc(int size, bool force = false) {
void alloc(size_t size, bool force = false) {
if (force || size > all_keys_mem->size()) {
all_keys_mem.reset();
all_grads_mem.reset();
......@@ -152,7 +151,11 @@ class HeterComm {
}
}
#if defined(PADDLE_WITH_CUDA)
platform::CUDAPlace place_;
#elif defined(PADDLE_WITH_XPU_KP)
platform::XPUPlace place_;
#endif
std::shared_ptr<memory::Allocation> all_keys_mem;
std::shared_ptr<memory::Allocation> all_grads_mem;
KeyType* all_keys;
......@@ -166,6 +169,33 @@ class HeterComm {
void init_path();
template <typename StreamType>
void sync_stream(const StreamType& stream) {
#if defined(PADDLE_WITH_CUDA)
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
#elif defined(PADDLE_WITH_XPU_KP)
PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait(stream));
#endif
}
template <typename StreamType>
void create_stream(StreamType* stream) {
#if defined(PADDLE_WITH_CUDA)
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(stream));
#elif defined(PADDLE_WITH_XPU_KP)
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(stream));
#endif
}
template <typename StreamType>
void destroy_stream(StreamType stream) {
#if defined(PADDLE_WITH_CUDA)
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream));
#elif defined(PADDLE_WITH_XPU_KP)
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(stream));
#endif
}
void create_storage(int start_index, int end_index, int keylen, int vallen);
void destroy_storage(int start_index, int end_index);
void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right,
......@@ -182,15 +212,18 @@ class HeterComm {
int block_size_{256};
private:
std::unique_ptr<HeterCommKernel> heter_comm_kernel_;
std::vector<LocalStorage> storage_;
CustomGradMerger merger_;
int topo_aware_{0};
int feanum_{1800 * 2048};
int multi_node_{0};
int node_size_;
#if defined(PADDLE_WITH_CUDA)
std::vector<ncclComm_t> nccl_inner_comms_;
std::vector<ncclComm_t> nccl_inter_comms_;
int node_size_;
std::vector<std::shared_ptr<cub::CachingDeviceAllocator>> allocators_;
#endif
};
} // end namespace framework
......
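The create_stream/sync_stream/destroy_stream helpers above hide the CUDA and XPU runtime calls behind a single templated interface, so the rest of HeterComm can be written once against StreamType. A compile-anywhere sketch of that dispatch pattern; the backend types and functions below are stand-ins, not the real runtimes:

#include <cstdio>

#define DEMO_WITH_BACKEND_A 1  // plays the role of PADDLE_WITH_CUDA / PADDLE_WITH_XPU_KP

#if defined(DEMO_WITH_BACKEND_A)
using ppStream = int;  // stands in for cudaStream_t
static void backend_create(ppStream* s) { *s = 42; }
static void backend_sync(ppStream s) { std::printf("sync stream %d\n", s); }
static void backend_destroy(ppStream s) { (void)s; }
#else
using ppStream = void*;  // stands in for XPUStream
static void backend_create(ppStream* s) { *s = nullptr; }
static void backend_sync(ppStream s) { (void)s; }
static void backend_destroy(ppStream s) { (void)s; }
#endif

// Same shape as the helpers in heter_comm.h: the backend choice is made once,
// the callers stay templated on StreamType.
template <typename StreamType>
void create_stream(StreamType* stream) { backend_create(stream); }
template <typename StreamType>
void sync_stream(const StreamType& stream) { backend_sync(stream); }
template <typename StreamType>
void destroy_stream(StreamType stream) { backend_destroy(stream); }

int main() {
  ppStream s;
  create_stream(&s);
  sync_stream(s);
  destroy_stream(s);
  return 0;
}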
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h"
namespace paddle {
namespace framework {
#ifdef PADDLE_WITH_CUDA
struct GPUCustomGradMerger {
template <typename T>
CUB_RUNTIME_FUNCTION __forceinline__ __device__ T
operator()(const T& a, const T& b) const {
T out;
out.slot = a.slot;
out.show = a.show + b.show;
out.clk = a.clk + b.clk;
out.lr_g = a.lr_g + b.lr_g;
for (int i = 0; i < MF_DIM; ++i) {
out.mf_g[i] = a.mf_g[i] + b.mf_g[i];
}
return out;
}
} gpu_merger;
template <typename T>
__global__ void fill_idx_kernel(T* idx, size_t len) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
idx[i] = i;
}
}
// template <typename T>
// void show_tensor(T* input, size_t len, gpuStream_t stream, std::string
// name)
// {
// T tmp[len]; // NOLINT
// cudaMemcpyAsync(&tmp, input, sizeof(T) * len, cudaMemcpyDeviceToHost,
// stream);
// cudaStreamSynchronize(stream);
// std::cout << name;
// for (int i = 0; i < len; ++i) {
// std::cout << ":" << tmp[i];
// }
// std::cout << std::endl;
//}
template <typename T>
__global__ void calc_shard_offset_kernel(T* idx, T* left, T* right,
size_t len) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len - 1) {
if (idx[i] != idx[i + 1]) {
right[idx[i]] = i;
left[idx[i + 1]] = i + 1;
}
}
if (i == 0) {
left[idx[i]] = i;
}
if (i == (len - 1)) {
right[idx[i]] = i;
}
}
template <typename KeyType, typename T>
__global__ void calc_shard_index_kernel(KeyType* d_keys, size_t len,
T* shard_index, int total_gpu) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
shard_index[i] = d_keys[i] % total_gpu;
}
}
template <typename KeyType, typename T>
__global__ void fill_shard_key_kernel(KeyType* d_shard_keys, KeyType* d_keys,
T* idx, size_t len) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
d_shard_keys[i] = d_keys[idx[i]];
}
}
template <typename KeyType, typename GradType, typename T>
__global__ void fill_shard_grads_kernel(KeyType* d_shard_keys, KeyType* d_keys,
GradType* d_shard_grads,
GradType* d_grads, T* idx, size_t len) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
d_shard_keys[i] = d_keys[idx[i]];
d_shard_grads[i] = d_grads[idx[i]];
}
}
template <typename ValType, typename T>
__global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals,
T* idx, size_t len) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) {
d_vals[idx[i]] = d_shard_vals[i];
}
}
// cuda implemention of heter_comm_kernel.h
template <typename T, typename StreamType>
void HeterCommKernel::fill_idx(T* idx, long long len,
const StreamType& stream) {
int grid_size = (len - 1) / block_size_ + 1;
size_t c_len = (size_t)len;
fill_idx_kernel<<<grid_size, block_size_, 0, stream>>>(idx, c_len);
}
template <typename T, typename StreamType>
void HeterCommKernel::calc_shard_offset(T* idx, T* left, T* right,
long long len, int total_devs,
const StreamType& stream) {
int grid_size = (len - 1) / block_size_ + 1;
size_t c_len = (size_t)len;
calc_shard_offset_kernel<<<grid_size, block_size_, 0, stream>>>(idx, left,
right, c_len);
}
template <typename KeyType, typename T, typename StreamType>
void HeterCommKernel::calc_shard_index(KeyType* d_keys, long long len,
T* shard_index, int total_gpu,
const StreamType& stream) {
int grid_size = (len - 1) / block_size_ + 1;
size_t c_len = (size_t)len;
calc_shard_index_kernel<<<grid_size, block_size_, 0, stream>>>(
d_keys, c_len, shard_index, total_gpu);
}
template <typename KeyType, typename T, typename StreamType>
void HeterCommKernel::fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys,
T* idx, long long len,
const StreamType& stream) {
int grid_size = (len - 1) / block_size_ + 1;
size_t c_len = (size_t)len;
fill_shard_key_kernel<<<grid_size, block_size_, 0, stream>>>(
d_shard_keys, d_keys, idx, c_len);
}
template <typename KeyType, typename GradType, typename T, typename StreamType>
void HeterCommKernel::fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys,
GradType* d_shard_grads,
GradType* d_grads, T* idx, long long len,
const StreamType& stream) {
int grid_size = (len - 1) / block_size_ + 1;
size_t c_len = (size_t)len;
fill_shard_grads_kernel<<<grid_size, block_size_, 0, stream>>>(
d_shard_keys, d_keys, d_shard_grads, d_grads, idx, c_len);
}
template <typename ValType, typename T, typename StreamType>
void HeterCommKernel::fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx,
long long len, const StreamType& stream) {
int grid_size = (len - 1) / block_size_ + 1;
size_t c_len = (size_t)len;
fill_dvals_kernel<<<grid_size, block_size_, 0, stream>>>(d_shard_vals, d_vals,
idx, c_len);
}
template <typename KeyT, typename ValueT, typename StreamType>
void HeterCommKernel::sort_pairs(void* d_temp_storage,
size_t& temp_storage_bytes, // NOLINT
const KeyT* d_keys_in, // NOLINT
KeyT* d_keys_out, const ValueT* d_values_in,
ValueT* d_values_out, int num_items,
int begin_bit, int end_bit, StreamType stream,
bool debug_synchronous) {
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs(
d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in,
d_values_out, num_items, begin_bit, end_bit, stream, debug_synchronous));
}
template <typename KeysInputIteratorT, typename UniqueOutputIteratorT,
typename ValuesInputIteratorT, typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT, typename StreamType>
void HeterCommKernel::reduce_by_key(void* d_temp_storage,
size_t& temp_storage_bytes, // NOLINT
KeysInputIteratorT d_keys_in,
UniqueOutputIteratorT d_unique_out,
ValuesInputIteratorT d_values_in,
AggregatesOutputIteratorT d_aggregates_out,
NumRunsOutputIteratorT d_num_runs_out,
int num_items, StreamType stream,
bool debug_synchronous) {
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey(
d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in,
d_aggregates_out, d_num_runs_out, gpu_merger, num_items, stream,
debug_synchronous));
}
template void HeterCommKernel::fill_idx<int, cudaStream_t>(
int* idx, long long len, const cudaStream_t& stream);
template void HeterCommKernel::calc_shard_offset<int, cudaStream_t>(
int* idx, int* left, int* right, long long len, int total_devs,
const cudaStream_t& stream);
template void HeterCommKernel::calc_shard_index<
unsigned long, int, cudaStream_t>(unsigned long* d_keys, long long len,
int* shard_index, int total_devs,
const cudaStream_t& stream);
template void HeterCommKernel::fill_shard_key<unsigned long, int, cudaStream_t>(
unsigned long* d_shard_keys, unsigned long* d_keys, int* idx, long long len,
const cudaStream_t& stream);
template void HeterCommKernel::fill_shard_grads<
unsigned long, paddle::framework::FeaturePushValue, int, cudaStream_t>(
unsigned long* d_shard_keys, unsigned long* d_keys,
paddle::framework::FeaturePushValue* d_shard_grads,
paddle::framework::FeaturePushValue* d_grads, int* idx, long long len,
const cudaStream_t& stream);
template void
HeterCommKernel::fill_dvals<paddle::framework::FeatureValue, int, cudaStream_t>(
paddle::framework::FeatureValue* d_shard_vals,
paddle::framework::FeatureValue* d_vals, int* idx, long long len,
const cudaStream_t& stream);
template void HeterCommKernel::sort_pairs<
unsigned long, paddle::framework::FeaturePushValue, cudaStream_t>(
void* d_temp_storage,
size_t& temp_storage_bytes, // NOLINT
const unsigned long* d_keys_in, // NOLINT
unsigned long* d_keys_out,
const paddle::framework::FeaturePushValue* d_values_in,
paddle::framework::FeaturePushValue* d_values_out, int num_items,
int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous);
template void HeterCommKernel::sort_pairs<int, int, cudaStream_t>(
void* d_temp_storage,
size_t& temp_storage_bytes, // NOLINT
const int* d_keys_in, // NOLINT
int* d_keys_out, const int* d_values_in, int* d_values_out, int num_items,
int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous);
template void HeterCommKernel::reduce_by_key<
unsigned long*, unsigned long*, paddle::framework::FeaturePushValue*,
paddle::framework::FeaturePushValue*, int*, cudaStream_t>(
void* d_temp_storage,
size_t& temp_storage_bytes, // NOLINT
unsigned long* d_keys_in, unsigned long* d_unique_out,
paddle::framework::FeaturePushValue* d_values_in,
paddle::framework::FeaturePushValue* d_aggregates_out, int* d_num_runs_out,
int num_items, cudaStream_t stream, bool debug_synchronous);
#endif
} // namespace framework
} // namespace paddle
#endif
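sort_pairs and reduce_by_key above wrap cub::DeviceRadixSort::SortPairs and cub::DeviceReduce::ReduceByKey with the gpu_merger functor; this is how push_sparse collapses duplicate keys into a single gradient before touching the hash table. A host-side sketch of the same two steps using the standard library instead of cub:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // (key, gradient) pairs as they arrive from different slots of a mini-batch.
  std::vector<std::pair<unsigned long, float>> grads = {
      {7, 0.5f}, {3, 1.0f}, {7, 0.25f}, {3, 2.0f}, {9, 1.5f}};

  // SortPairs step: bring equal keys next to each other.
  std::sort(grads.begin(), grads.end());

  // ReduceByKey step: the merger functor's job, done here with a running sum.
  std::vector<unsigned long> unique_keys;
  std::vector<float> merged;
  for (const auto& kv : grads) {
    if (unique_keys.empty() || unique_keys.back() != kv.first) {
      unique_keys.push_back(kv.first);
      merged.push_back(kv.second);
    } else {
      merged.back() += kv.second;
    }
  }

  for (std::size_t i = 0; i < unique_keys.size(); ++i) {
    std::printf("key=%lu grad=%.2f\n", unique_keys[i], merged[i]);
  }
  return 0;
}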
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#if defined(PADDLE_WITH_CUDA)
#include "cub/cub.cuh"
#include "cub/util_allocator.cuh"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/enforce.h"
#endif
namespace paddle {
namespace framework {
class HeterCommKernel {
public:
HeterCommKernel() {}
explicit HeterCommKernel(const int block_size) : block_size_(block_size) {}
template <typename T, typename StreamType>
void fill_idx(T* idx, long long len, const StreamType& stream);
template <typename T, typename StreamType>
void calc_shard_offset(T* idx, T* left, T* right, long long len,
int total_devs, const StreamType& stream);
template <typename KeyType, typename T, typename StreamType>
void calc_shard_index(KeyType* d_keys, long long len, T* shard_index,
int total_devs, const StreamType& stream);
template <typename KeyType, typename T, typename StreamType>
void fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, T* idx,
long long len, const StreamType& stream);
template <typename KeyType, typename GradType, typename T,
typename StreamType>
void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys,
GradType* d_shard_grads, GradType* d_grads, T* idx,
long long len, const StreamType& stream);
template <typename ValType, typename T, typename StreamType>
void fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, long long len,
const StreamType& stream);
template <typename KeyT, typename ValueT, typename StreamType>
void sort_pairs(void* d_temp_storage, size_t& temp_storage_bytes, // NOLINT
const KeyT* d_keys_in, KeyT* d_keys_out,
const ValueT* d_values_in, ValueT* d_values_out,
int num_items, int begin_bit = 0,
int end_bit = sizeof(KeyT) * 8, StreamType stream = NULL,
bool debug_synchronous = false);
template <typename KeysInputIteratorT, typename UniqueOutputIteratorT,
typename ValuesInputIteratorT, typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT, typename StreamType>
void reduce_by_key(void* d_temp_storage,
size_t& temp_storage_bytes, // NOLINT
KeysInputIteratorT d_keys_in,
UniqueOutputIteratorT d_unique_out,
ValuesInputIteratorT d_values_in,
AggregatesOutputIteratorT d_aggregates_out,
NumRunsOutputIteratorT d_num_runs_out, int num_items,
StreamType stream = NULL, bool debug_synchronous = false);
private:
int block_size_{256};
};
} // end namespace framework
} // end namespace paddle
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h"
#if defined(PADDLE_WITH_XPU_KP)
#include <xpu/runtime.h>
#include "xpu/kernel/cluster_header.h"
#include "xpu/kernel/math.h"
#include "xpu/kernel/simd.h"
#endif
namespace paddle {
namespace framework {
#if defined(PADDLE_WITH_XPU_KP)
struct XPUCustomGradMerger {
template <typename T>
__device__ T operator()(const T& a, const T& b) const {
T out;
out.slot = a.slot;
out.show = a.show + b.show;
out.clk = a.clk + b.clk;
out.lr_g = a.lr_g + b.lr_g;
for (int i = 0; i < MF_DIM; ++i) {
out.mf_g[i] = a.mf_g[i] + b.mf_g[i];
}
return out;
}
} xpu_merger;
template <typename T>
__global__ void fill_idx_kernel(T* idx, long long len) {
int cid = core_id();
int ncores = core_num();
if (cid >= ncores) {
return;
}
int thread_id = ncores * cluster_id() + cid;
int nthreads = ncores * cluster_num();
const int buf_size = 1024;
__local__ T local_idx[buf_size];
int len_per_loop = min(buf_size, roundup_div(len, nthreads));
for (int i = thread_id * len_per_loop; i < len;
i += nthreads * len_per_loop) {
int read_len = min(len_per_loop, len - i);
for (int k = 0; k < read_len; k++) {
int real_idx = i + k;
local_idx[k] = real_idx;
}
LM2GM(local_idx, idx + i, read_len * sizeof(T));
}
}
template <typename T>
__global__ void calc_shard_offset_kernel(T* idx, T* left, T* right,
long long len, const int total_xpu) {
int cid = core_id();
int ncores = core_num();
if (cid >= ncores) {
return;
}
int thread_id = ncores * cluster_id() + cid;
int nthreads = ncores * cluster_num();
const int buf_size = 1024;
__local__ T local_idx[buf_size];
__local__ T local_left[total_xpu];
__local__ T local_right[total_xpu];
for (int i = 0; i < total_xpu; i++) {
local_left[i] = -1;
local_right[i] = -1;
}
int len_per_loop = min(buf_size, roundup_div(len, nthreads));
for (int i = thread_id * len_per_loop; i < len;
i += nthreads * len_per_loop) {
// read batch from GM will boost performance
int read_len = min(len_per_loop, len - i);
GM2LM(idx + i, local_idx, read_len * sizeof(T));
for (int k = 0; k < read_len; k++) {
if (local_idx[k] != local_idx[k + 1]) {
int real_idx = i + k;
local_right[local_idx[k]] = real_idx;
local_left[local_idx[k + 1]] = real_idx + 1;
}
}
if (i == 0) {
local_left[local_idx[i]] = i;
}
if (i + read_len == len) {
local_right[local_idx[len - 1]] = len - 1;
}
}
// to be optimized: call LM2GM too frequently
// all_reduce between threads to get global left & global right && LM2GM
for (int i = 0; i < total_xpu; i++) {
if (local_left[i] != -1) LM2GM(local_left + i, left + i, sizeof(T));
if (local_right[i] != -1) LM2GM(local_right + i, right + i, sizeof(T));
}
}
template <typename KeyType, typename T>
__global__ void calc_shard_index_kernel(KeyType* d_keys, long long len,
T* shard_index, int total_xpu) {
int cid = core_id();
int ncores = core_num();
if (cid >= ncores) {
return;
}
int thread_id = ncores * cluster_id() + cid;
int nthreads = ncores * cluster_num();
const int buf_size = 512;
__local__ KeyType local_keys[buf_size];
__local__ T local_shard_index[buf_size];
int len_per_loop = min(buf_size, roundup_div(len, nthreads));
for (int i = thread_id * len_per_loop; i < len;
i += nthreads * len_per_loop) {
// read batch from GM will boost performance
int read_len = min(len_per_loop, len - i);
GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType));
for (int k = 0; k < read_len; k++) {
local_shard_index[k] = local_keys[k] % total_xpu;
}
LM2GM(local_shard_index, shard_index + i, read_len * sizeof(T));
}
}
template <typename KeyType, typename T>
__global__ void fill_shard_key_kernel(KeyType* d_shard_keys, KeyType* d_keys,
T* idx, long long len) {
int cid = core_id();
int ncores = core_num();
if (cid >= ncores) {
return;
}
int thread_id = ncores * cluster_id() + cid;
int nthreads = ncores * cluster_num();
const int buf_size = 400;
__local__ KeyType local_keys[buf_size];
__local__ KeyType local_shard_keys[buf_size];
__local__ T local_idx[buf_size];
int len_per_loop = min(buf_size, roundup_div(len, nthreads));
for (int i = thread_id * len_per_loop; i < len;
i += nthreads * len_per_loop) {
// read batch from GM will boost performance
int read_len = min(len_per_loop, len - i);
GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType));
GM2LM(idx + i, local_idx, read_len * sizeof(T));
for (int k = 0; k < read_len; k++) {
local_shard_keys[k] = local_keys[local_idx[k]];
}
LM2GM(local_shard_keys, d_shard_keys + i, read_len * sizeof(KeyType));
}
}
// local mem too large, cause compile error
template <typename KeyType, typename GradType, typename T>
__global__ void fill_shard_grads_kernel(KeyType* d_shard_keys, KeyType* d_keys,
GradType* d_shard_grads,
GradType* d_grads, T* idx,
long long len) {
int cid = core_id();
int ncores = core_num();
if (cid >= ncores) {
return;
}
int thread_id = ncores * cluster_id() + cid;
int nthreads = ncores * cluster_num();
const int buf_size = 100;
__local__ KeyType local_keys[buf_size];
__local__ GradType local_grads[buf_size];
__local__ KeyType local_shard_keys[buf_size];
__local__ GradType local_shard_grads[buf_size];
__local__ T local_idx[buf_size];
int len_per_loop = min(buf_size, roundup_div(len, nthreads));
for (int i = thread_id * len_per_loop; i < len;
i += nthreads * len_per_loop) {
// read batch from GM will boost performance
int read_len = min(len_per_loop, len - i);
GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType));
GM2LM(d_grads + i, local_grads, read_len * sizeof(GradType));
GM2LM(idx + i, local_idx, read_len * sizeof(T));
for (int k = 0; k < read_len; k++) {
local_shard_keys[k] = local_keys[local_idx[k]];
local_shard_grads[k] = local_grads[local_idx[k]];
}
LM2GM(local_shard_keys, d_shard_keys + i, read_len * sizeof(KeyType));
LM2GM(local_shard_grads, d_shard_grads + i, read_len * sizeof(GradType));
}
}
template <typename ValType, typename T>
__global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals,
T* idx, long long len) {
int cid = core_id();
int ncores = core_num();
if (cid >= ncores) {
return;
}
int thread_id = ncores * cluster_id() + cid;
int nthreads = ncores * cluster_num();
const int buf_size = 50;
__local__ ValType local_vals[buf_size];
__local__ ValType local_shard_vals[buf_size];
__local__ T local_idx[buf_size];
int len_per_loop = min(buf_size, roundup_div(len, nthreads));
for (int i = thread_id * len_per_loop; i < len;
i += nthreads * len_per_loop) {
// read batch from GM will boost performance
int read_len = min(len_per_loop, len - i);
GM2LM(idx + i, local_idx, read_len * sizeof(T));
GM2LM(d_shard_vals + i, local_shard_vals, read_len * sizeof(ValType));
for (int k = 0; k < read_len; k++) {
local_vals[local_idx[k]] = local_shard_vals[k];
}
LM2GM(local_vals, d_vals + i, read_len * sizeof(ValType));
}
}
// xpu implementation of heter_comm_kernel.h
template <typename T, typename StreamType>
void HeterCommKernel::fill_idx(T* idx, long long len,
const StreamType& stream) {
fill_idx_kernel<T><<<4, 64, stream>>>(idx, len);
}
template <typename T, typename StreamType>
void HeterCommKernel::calc_shard_offset(T* idx, T* left, T* right,
long long len, int total_devs,
const StreamType& stream) {
calc_shard_offset_kernel<T><<<4, 64, stream>>>(idx, left, right, len,
total_devs);
}
template <typename KeyType, typename T, typename StreamType>
void HeterCommKernel::calc_shard_index(KeyType* d_keys, long long len,
T* shard_index, int total_devs,
const StreamType& stream) {
calc_shard_index_kernel<KeyType, T><<<4, 64, stream>>>(
d_keys, len, shard_index, total_devs);
}
template <typename KeyType, typename T, typename StreamType>
void HeterCommKernel::fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys,
T* idx, long long len,
const StreamType& stream) {
fill_shard_key_kernel<KeyType, T><<<4, 64, stream>>>(d_shard_keys, d_keys,
idx, len);
}
template <typename KeyType, typename GradType, typename T, typename StreamType>
void HeterCommKernel::fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys,
GradType* d_shard_grads,
GradType* d_grads, T* idx, long long len,
const StreamType& stream) {
fill_shard_grads_kernel<KeyType, GradType, T><<<4, 64, stream>>>(
d_shard_keys, d_keys, d_shard_grads, d_grads, idx, len);
}
template <typename ValType, typename T, typename StreamType>
void HeterCommKernel::fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx,
long long len, const StreamType& stream) {
fill_dvals_kernel<ValType, T><<<4, 64, stream>>>(d_shard_vals, d_vals, idx,
len);
}
template <typename KeyT, typename ValueT, typename StreamType>
void HeterCommKernel::sort_pairs(void* d_temp_storage,
size_t& temp_storage_bytes, // NOLINT
const KeyT* d_keys_in, // NOLINT
KeyT* d_keys_out, const ValueT* d_values_in,
ValueT* d_values_out, int num_items,
int begin_bit, int end_bit, StreamType stream,
bool debug_synchronous) {}
template <typename KeysInputIteratorT, typename UniqueOutputIteratorT,
typename ValuesInputIteratorT, typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT, typename StreamType>
void HeterCommKernel::reduce_by_key(
void* d_temp_storage,
size_t& temp_storage_bytes, // NOLINT
KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out,
ValuesInputIteratorT d_values_in,
AggregatesOutputIteratorT d_aggregates_out,
NumRunsOutputIteratorT d_num_runs_out, int num_items,
StreamType stream, bool debug_synchronous) {}
template void HeterCommKernel::fill_idx<int, XPUStream>(
int* idx, long long len, const XPUStream& stream);
template void HeterCommKernel::calc_shard_offset<int, XPUStream>(
int* idx, int* left, int* right, long long len, int total_devs,
const XPUStream& stream);
template void HeterCommKernel::calc_shard_index<unsigned long, int, XPUStream>(
unsigned long* d_keys, long long len, int* shard_index, int total_devs,
const XPUStream& stream);
template void HeterCommKernel::fill_shard_key<unsigned long, int, XPUStream>(
unsigned long* d_shard_keys, unsigned long* d_keys, int* idx, long long len,
const XPUStream& stream);
template void HeterCommKernel::fill_shard_grads<
unsigned long, paddle::framework::FeaturePushValue, int, XPUStream>(
unsigned long* d_shard_keys, unsigned long* d_keys,
paddle::framework::FeaturePushValue* d_shard_grads,
paddle::framework::FeaturePushValue* d_grads, int* idx, long long len,
const XPUStream& stream);
template void
HeterCommKernel::fill_dvals<paddle::framework::FeatureValue, int, XPUStream>(
paddle::framework::FeatureValue* d_shard_vals,
paddle::framework::FeatureValue* d_vals, int* idx, long long len,
const XPUStream& stream);
template void HeterCommKernel::sort_pairs<
unsigned long, paddle::framework::FeaturePushValue, XPUStream>(
void* d_temp_storage,
size_t& temp_storage_bytes, // NOLINT
const unsigned long* d_keys_in, // NOLINT
unsigned long* d_keys_out,
const paddle::framework::FeaturePushValue* d_values_in,
paddle::framework::FeaturePushValue* d_values_out, int num_items,
int begin_bit, int end_bit, XPUStream stream, bool debug_synchronous);
template void HeterCommKernel::sort_pairs<int, int, XPUStream>(
void* d_temp_storage,
size_t& temp_storage_bytes, // NOLINT
const int* d_keys_in, // NOLINT
int* d_keys_out, const int* d_values_in, int* d_values_out, int num_items,
int begin_bit, int end_bit, XPUStream stream, bool debug_synchronous);
template void HeterCommKernel::reduce_by_key<
unsigned long*, unsigned long*, paddle::framework::FeaturePushValue*,
paddle::framework::FeaturePushValue*, int*, XPUStream>(
void* d_temp_storage,
size_t& temp_storage_bytes, // NOLINT
unsigned long* d_keys_in, unsigned long* d_unique_out,
paddle::framework::FeaturePushValue* d_values_in,
paddle::framework::FeaturePushValue* d_aggregates_out,
int* d_num_runs_out, int num_items, XPUStream stream,
bool debug_synchronous);
#endif
} // end namespace framework
} // end namespace paddle
#endif
......@@ -29,7 +29,9 @@ HeterPs::HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource) {
comm_ =
std::make_shared<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>>(
capacity, resource);
#if defined(PADDLE_WITH_CUDA)
opt_ = Optimizer<FeatureValue, FeaturePushValue>();
#endif
}
HeterPs::~HeterPs() {}
......@@ -54,15 +56,21 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); }
void HeterPs::push_sparse(int num, FeatureKey* d_keys,
FeaturePushValue* d_grads, size_t len) {
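  // Dispatch by backend: the CUDA path hands the optimizer to the
  // communicator, while the XPU_KP overload takes only keys, grads and length.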
#if defined(PADDLE_WITH_CUDA)
comm_->push_sparse(num, d_keys, d_grads, len, opt_);
#elif defined(PADDLE_WITH_XPU_KP)
comm_->push_sparse(num, d_keys, d_grads, len);
#endif
// comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_);
}
#if defined(PADDLE_WITH_CUDA)
void HeterPs::set_nccl_comm_and_size(const std::vector<ncclComm_t>& inner_comms,
const std::vector<ncclComm_t>& inter_comms,
int comm_size) {
comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size);
}
#endif
} // end namespace framework
} // end namespace paddle
......
......@@ -16,7 +16,9 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h"
#if defined(PADDLE_WITH_CUDA)
#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
#endif
#ifdef PADDLE_WITH_HETERPS
......@@ -35,9 +37,13 @@ class HeterPs : public HeterPsBase {
size_t len) override;
virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
size_t len, size_t chunk_size, int stream_num) override;
#if defined(PADDLE_WITH_CUDA)
virtual void set_nccl_comm_and_size(
const std::vector<ncclComm_t>& inner_comms,
const std::vector<ncclComm_t>& inter_comms, int comm_size) override;
#endif
virtual void end_pass() override;
virtual int get_index_by_devid(int devid) override;
virtual void show_one_table(int gpu_num) override;
......@@ -46,7 +52,9 @@ class HeterPs : public HeterPsBase {
private:
std::shared_ptr<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>> comm_;
#if defined(PADDLE_WITH_CUDA)
Optimizer<FeatureValue, FeaturePushValue> opt_;
#endif
};
} // end namespace framework
......
......@@ -35,9 +35,11 @@ class HeterPsBase {
virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
size_t len, size_t chunk_size, int stream_num) = 0;
virtual int get_index_by_devid(int devid) = 0;
#if defined(PADDLE_WITH_CUDA)
virtual void set_nccl_comm_and_size(
const std::vector<ncclComm_t>& inner_comms,
const std::vector<ncclComm_t>& inter_comms, int comm_size) = 0;
#endif
virtual void end_pass() = 0;
virtual void show_one_table(int gpu_num) = 0;
virtual void push_sparse(int num, FeatureKey* d_keys,
......
......@@ -13,12 +13,21 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_HETERPS
#include "heter_resource.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_XPU_KP
#include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif
namespace paddle {
namespace framework {
#if defined(PADDLE_WITH_CUDA)
GPUResource::GPUResource(std::vector<int>& dev_ids, int index) {
index_ = index;
dev_ids_ = dev_ids;
......@@ -52,7 +61,41 @@ GPUResource::~GPUResource() {
}
}
#elif defined(PADDLE_WITH_XPU_KP)
XPUResource::XPUResource(std::vector<int>& dev_ids, int index) {
index_ = index;
dev_ids_ = dev_ids;
dev_id_ = dev_ids_[index];
platform::XPUDeviceGuard guard(dev_id_);
local_streams_.resize(dev_ids_.size());
comm_streams_.resize(dev_ids_.size(), NULL);
remote_streams_.resize(dev_ids_.size());
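  // One local stream and one remote stream are created per device; comm
  // streams are reserved (initialized to NULL) but not created yet.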
for (size_t i = 0; i < dev_ids_.size(); ++i) {
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&local_streams_[i]));
// PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&comm_streams_[i]));
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&remote_streams_[i]));
}
}
XPUResource::~XPUResource() {
platform::XPUDeviceGuard guard(dev_id_);
for (size_t i = 0; i < local_streams_.size(); ++i) {
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(local_streams_[i]));
}
// for (size_t i = 0; i < comm_streams_.size(); ++i) {
// PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(comm_streams_[i]));
// }
for (size_t i = 0; i < remote_streams_.size(); ++i) {
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(remote_streams_[i]));
}
}
#endif
void HeterPsResource::enable_p2p() {
#if defined(PADDLE_WITH_CUDA)
for (size_t i = 0; i < dev_ids_.size(); ++i) {
platform::CUDADeviceGuard guard(dev_ids_[i]);
for (size_t j = 0; j < dev_ids_.size(); ++j) {
......@@ -72,28 +115,28 @@ void HeterPsResource::enable_p2p() {
}
}
}
#endif
}
HeterPsResource::HeterPsResource(const std::vector<int>& dev_ids) {
dev_ids_ = dev_ids;
for (size_t i = 0; i < dev_ids_.size(); ++i) {
std::shared_ptr<GPUResource> resource =
std::make_shared<GPUResource>(dev_ids_, i);
std::shared_ptr<DevResource> resource =
std::make_shared<DevResource>(dev_ids_, i);
resources_.push_back(resource);
devid_2_index_[dev_ids_[i]] = i;
}
}
cudaStream_t HeterPsResource::comm_stream(int gpu_num, int stream_num) {
return resources_[gpu_num]->comm_stream(stream_num);
ppStream HeterPsResource::comm_stream(int dev_num, int stream_num) {
return resources_[dev_num]->comm_stream(stream_num);
}
cudaStream_t HeterPsResource::local_stream(int gpu_num, int stream_num) {
return resources_[gpu_num]->local_stream(stream_num);
ppStream HeterPsResource::local_stream(int dev_num, int stream_num) {
return resources_[dev_num]->local_stream(stream_num);
}
cudaStream_t HeterPsResource::remote_stream(int gpu_num, int stream_num) {
return resources_[gpu_num]->remote_stream(stream_num);
ppStream HeterPsResource::remote_stream(int dev_num, int stream_num) {
return resources_[dev_num]->remote_stream(stream_num);
}
int HeterPsResource::dev_id(int num) { return dev_ids_[num]; }
......@@ -102,7 +145,7 @@ int HeterPsResource::get_index_by_devid(int devid) {
return devid_2_index_[devid];
}
int HeterPsResource::total_gpu() { return dev_ids_.size(); }
int HeterPsResource::total_device() { return dev_ids_.size(); }
void HeterPsResource::set_multi_mf(int multi_mf_dim, int max_mf_dim) {
multi_mf_dim_ = multi_mf_dim;
......
......@@ -17,7 +17,16 @@ limitations under the License. */
#include <map>
#include <memory>
#include <vector>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_XPU_KP
#include <xpu/runtime.h> // NOLINT
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_HETERPS
......@@ -25,9 +34,16 @@ limitations under the License. */
namespace paddle {
namespace framework {
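// ppStream is the backend-neutral stream handle: cudaStream_t on CUDA builds,
// XPUStream on Kunlun (XPU_KP) builds.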
#if defined(PADDLE_WITH_CUDA)
using ppStream = cudaStream_t;
#elif defined(PADDLE_WITH_XPU_KP)
using ppStream = XPUStream;
#endif
#if defined(PADDLE_WITH_CUDA)
class GPUResource {
public:
GPUResource(std::vector<int>& device_id, int index);
GPUResource(std::vector<int>& device_id, int index); // NOLINT
virtual ~GPUResource();
GPUResource(const GPUResource&) = delete;
GPUResource& operator=(const GPUResource&) = delete;
......@@ -45,23 +61,55 @@ class GPUResource {
std::vector<gpuStream_t> local_streams_;
std::vector<gpuStream_t> comm_streams_;
};
#elif defined(PADDLE_WITH_XPU_KP)
class XPUResource {
public:
XPUResource(std::vector<int>& device_id, int index); // NOLINT
virtual ~XPUResource();
XPUResource(const XPUResource&) = delete;
XPUResource& operator=(const XPUResource&) = delete;
int dev_id() const { return dev_id_; }
int index() const { return index_; }
XPUStream local_stream(int num) { return local_streams_[num]; }
XPUStream remote_stream(int num) { return remote_streams_[num]; }
XPUStream comm_stream(int num) { return comm_streams_[num]; }
int dev_id_;
int index_;
std::vector<int> dev_ids_;
std::vector<XPUStream> remote_streams_;
std::vector<XPUStream> local_streams_;
std::vector<XPUStream> comm_streams_;
};
#endif
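// Backend-neutral aliases so HeterPsResource can be written once for both
// CUDA and XPU_KP builds.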
#if defined(PADDLE_WITH_CUDA)
using DevResource = GPUResource;
using DevPlace = platform::CUDAPlace;
using AnyDeviceGuard = platform::CUDADeviceGuard;
#elif defined(PADDLE_WITH_XPU_KP)
using DevResource = XPUResource;
using DevPlace = platform::XPUPlace;
using AnyDeviceGuard = platform::XPUDeviceGuard;
#endif
class HeterPsResource {
public:
HeterPsResource(const std::vector<int>& dev_ids);
explicit HeterPsResource(const std::vector<int>& dev_ids);
HeterPsResource(const HeterPsResource&) = delete;
HeterPsResource& operator=(const HeterPsResource&) = delete;
virtual ~HeterPsResource() {}
void enable_p2p();
int total_gpu();
int total_device();
int get_index_by_devid(int devid);
int dev_id(int num);
void set_multi_mf(int multi_mf_dim, int max_mf_dim);
gpuStream_t local_stream(int gpu_num, int stream_num);
gpuStream_t remote_stream(int gpu_num, int stream_num);
gpuStream_t comm_stream(int gpu_num, int stream_num);
ppStream local_stream(int dev_num, int stream_num);
ppStream remote_stream(int dev_num, int stream_num);
ppStream comm_stream(int dev_num, int stream_num);
std::vector<std::shared_ptr<GPUResource>> resources_;
std::vector<std::shared_ptr<DevResource>> resources_;
std::vector<int> dev_ids_;
std::map<int, int> devid_2_index_;
int multi_mf_dim_{0};
......
......@@ -18,6 +18,7 @@ limitations under the License. */
// #include
// "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h"
#include <iostream>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh"
namespace paddle {
......@@ -111,3 +112,4 @@ class HBMMemoryPool : public managed {
} // end namespace framework
} // end namespace paddle
#endif
#endif
......@@ -13,16 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HETERPS
#if defined(PADDLE_WITH_CUDA)
#include <curand_kernel.h>
#endif
#include <vector>
#include "optimizer_conf.h"
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
namespace paddle {
namespace framework {
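// The device-side Optimizer below is currently compiled for CUDA only.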
#if defined(PADDLE_WITH_CUDA)
template <typename ValType, typename GradType>
class Optimizer {
public:
......@@ -32,7 +35,8 @@ class Optimizer {
void initialize() {}
__device__ void update_lr(float& w, float& g2sum, float g, float scale) {
__device__ void update_lr(float& w, float& g2sum, float g, // NOLINT
float scale) {
double add_g2sum = 0;
double ratio = optimizer_config::learning_rate *
sqrt(optimizer_config::initial_g2sum /
......@@ -49,8 +53,8 @@ class Optimizer {
g2sum += add_g2sum;
}
__device__ void update_mf(int n, float* w, float& g2sum, const float* g,
float scale) {
__device__ void update_mf(int n, float* w, float& g2sum, // NOLINT
const float* g, float scale) {
double add_g2sum = 0;
double ratio = optimizer_config::mf_learning_rate *
sqrt(optimizer_config::mf_initial_g2sum /
......@@ -69,7 +73,8 @@ class Optimizer {
g2sum += add_g2sum / n;
}
__device__ void update_value(ValType& val, const GradType& grad) {
__device__ void update_value(ValType& val, const GradType& grad) { // NOLINT
val.slot = grad.slot;
val.show += grad.show;
val.clk += grad.clk;
......@@ -132,6 +137,7 @@ class Optimizer {
}
};
#endif
} // end namespace framework
} // end namespace paddle
#endif
......@@ -14,8 +14,16 @@ limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_XPU_KP)
#include "xpu/kernel/cluster_header.h"
#include "xpu/kernel/debug.h"
#include "xpu/kernel/math.h"
#endif
namespace optimizer_config {
#if defined(PADDLE_WITH_CUDA)
__constant__ float nonclk_coeff = 0.1;
__constant__ float clk_coeff = 1;
......@@ -31,4 +39,24 @@ __constant__ float mf_initial_g2sum = 3.0;
__constant__ float mf_initial_range = 1e-4;
__constant__ float mf_min_bound = -10;
__constant__ float mf_max_bound = 10;
}
#elif defined(PADDLE_WITH_XPU_KP)
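// On XPU_KP the hyper-parameters are exposed as pointers into device global
// memory rather than __constant__ values; they are expected to be set from
// the host before the KP kernels run.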
_global_ptr_ float* nonclk_coeff;
_global_ptr_ float* clk_coeff;
_global_ptr_ float* min_bound;
_global_ptr_ float* max_bound;
_global_ptr_ float* learning_rate;
_global_ptr_ float* initial_g2sum;
_global_ptr_ float* initial_range;
_global_ptr_ float* mf_create_thresholds;
_global_ptr_ float* mf_learning_rate;
_global_ptr_ float* mf_initial_g2sum;
_global_ptr_ float* mf_initial_range;
_global_ptr_ float* mf_min_bound;
_global_ptr_ float* mf_max_bound;
#endif
} // namespace optimizer_config
......@@ -121,7 +121,7 @@ class PSGPUWrapper {
is_initialized_ = true;
resource_ = std::make_shared<HeterPsResource>(dev_ids);
resource_->enable_p2p();
keys_tensor.resize(resource_->total_gpu());
keys_tensor.resize(resource_->total_device());
#ifdef PADDLE_WITH_GLOO
auto gloo = paddle::framework::GlooWrapper::GetInstance();
if (gloo->Size() > 1) {
......@@ -287,8 +287,8 @@ class PSGPUWrapper {
for (size_t i = 0; i < num_of_dim; i++) {
dim_index_map[index_dim_vec_[i]] = i;
}
hbm_pools_.resize(resource_->total_gpu() * num_of_dim);
mem_pools_.resize(resource_->total_gpu() * num_of_dim);
hbm_pools_.resize(resource_->total_device() * num_of_dim);
mem_pools_.resize(resource_->total_device() * num_of_dim);
max_mf_dim_ = index_dim_vec_.back();
multi_mf_dim_ = (dim_index_map.size() >= 1) ? dim_index_map.size() : 0;
resource_->set_multi_mf(multi_mf_dim_, max_mf_dim_);
......