diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
index 939b5e3099a62a8194cf7202e3fe6fe697ff9210..189724a5455200bdfbd0497aee53bc949df412e9 100644
--- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
@@ -7,7 +7,7 @@ IF(WITH_GPU)
         get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
         SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS})
     endif()
-    nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS ${HETERPS_DEPS})
+    nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS})
     nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
     nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
 ENDIF()
diff --git a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfaf96ebbe88d942709cfa56071449121e525c7f
--- /dev/null
+++ b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h
@@ -0,0 +1,113 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_HETERPS
+// #include
+// "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h"
+#include <iostream>
+#include "paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh"
+
+namespace paddle {
+namespace framework {
+
+class MemoryPool {
+ public:
+  MemoryPool(size_t capacity, size_t block_size)
+      : capacity_(capacity), block_size_(block_size) {
+    VLOG(3) << "mem_pool init with block_size: " << block_size
+            << " capacity: " << capacity;
+    mem_ = (char*)malloc(block_size * capacity_);
+  }
+  ~MemoryPool() {
+    VLOG(3) << "mem pool delete";
+    free(mem_);
+  }
+  size_t block_size() { return block_size_; }
+  char* mem() { return mem_; }
+
+  size_t capacity() { return capacity_; }
+  size_t byte_size() { return capacity_ * block_size_; }
+  void* mem_address(const uint32_t& idx) {
+    return (void*)&mem_[(idx)*block_size_];
+  }
+
+ private:
+  char* mem_ = NULL;
+  size_t capacity_;
+  size_t block_size_;
+};
+
+class HBMMemoryPool : public managed {
+ public:
+  HBMMemoryPool(size_t capacity, size_t block_size)
+      : capacity_(capacity), block_size_(block_size) {}
+  HBMMemoryPool(MemoryPool* mem_pool) {
+    capacity_ = mem_pool->capacity();
+    block_size_ = mem_pool->block_size();
+    VLOG(3) << "hbm memory pool with capacity" << capacity_
+            << " bs: " << block_size_;
+    cudaMalloc(&mem_, block_size_ * capacity_);
+    cudaMemcpy(mem_, mem_pool->mem(), mem_pool->byte_size(),
+               cudaMemcpyHostToDevice);
+  }
+
+  ~HBMMemoryPool() {
+    VLOG(3) << "delete hbm memory pool";
+    cudaFree(mem_);
+  }
+
+  size_t block_size() { return block_size_; }
+
+  void clear(void) { cudaMemset(mem_, 0, block_size_ * capacity_); }
+
+  void reset(size_t capacity) {
+    cudaFree(mem_);
+    mem_ = NULL;
+    capacity_ = capacity;
+    cudaMalloc(&mem_, (block_size_ * capacity / 8 + 1) * 8);
+    cudaMemset(mem_, 0, block_size_ * capacity);
+  }
+
+  friend std::ostream& operator<<(std::ostream& out, HBMMemoryPool& p) {
+    for (size_t k = 0; k < 5; k++) {
+      auto x = (FeatureValue*)(p.mem() + k * p.capacity());
+      out << "show: " << x->show << " clk: " << x->clk << " slot: " << x->slot
+          << " lr: " << x->lr << " mf_dim: " << x->mf_size
+          << " mf_size: " << x->mf_size << " mf:";
+      for (int i = 0; i < x->mf_dim + 1; ++i) {
+        out << " " << x->mf[i];
+      }
+      out << "\n";
+    }
+    return out;
+  }
+
+  char* mem() { return mem_; }
+
+  size_t capacity() { return capacity_; }
+  __forceinline__ __device__ void* mem_address(const uint32_t& idx) {
+    return (void*)&mem_[(idx)*block_size_];
+  }
+
+ private:
+  char* mem_ = NULL;
+  size_t capacity_;
+  size_t block_size_;
+};
+
+}  // end namespace framework
+}  // end namespace paddle
+#endif